add support for high-low surrogates and UTF-16 decoding - json2tsv - JSON to TS… | |
git clone git://git.codemadness.org/json2tsv | |
Log | |
Files | |
Refs | |
README | |
LICENSE | |
--- | |
commit 933582372d81a911193fb1da7c86b6b960432535 | |
parent 922491e0343ab6f440024803921daf843b0e9cf5 | |
Author: Hiltjo Posthuma <[email protected]> | |
Date: Sun, 13 Oct 2019 21:31:31 +0200 | |
add support for high-low surrogates and UTF-16 decoding | |
Seen in the wild in a Reddit JSON file that encodes emojis as UTF-16 surrogate pairs. | |
This is also described in RFC 7159, section 7 (Strings). | |
Diffstat: | |
M json2tsv.c | 25 ++++++++++++++++++++++++- | |
1 file changed, 24 insertions(+), 1 deletion(-) | |
--- | |
diff --git a/json2tsv.c b/json2tsv.c | |
@@ -107,7 +107,7 @@ parsejson(void (*cb)(struct json_node *, size_t, const char… | |
{ | |
struct json_node nodes[JSON_MAX_NODE_DEPTH] = { 0 }; | |
size_t depth = 0, v = 0, vz = 0; | |
- long cp; | |
+ long cp, hi, lo; | |
int c, i, escape, ret = -1; | |
char *value = NULL; | |
@@ -164,6 +164,29 @@ parsejson(void (*cb)(struct json_node *, size_t, const cha… | |
} | |
cp |= (hexdigit(c) << … | |
} | |
+ /* See also: | |
+ * RFC7159 - 7. Strings and | |
+ * https://unicode.org/faq/utf… | |
+ * 0xd800 - 0xdb7f - high surr… | |
+ if (cp >= 0xd800 && cp <= 0xdb… | |
+ if (GETNEXT() != '\\' … | |
+ *errstr = "inv… | |
+ goto end; | |
+ } | |
+ for (hi = cp, i = 12, … | |
+ if ((c = GETNE… | |
+ *errst… | |
+ goto e… | |
+ } | |
+ lo |= (hexdigi… | |
+ } | |
+ /* 0xdc00 - 0xdfff - l… | |
+ if (!(lo >= 0xdc00 && … | |
+ *errstr = "inv… | |
+ goto end; | |
+ } | |
+ cp = (hi << 10) + (0xD… | |
+ } | |
if (capacity(&value, &vz, v, 5… | |
goto end; | |
v += codepointtoutf8(cp, &valu… |