Skip to content

Commit 53e0b38

Browse files
gh-153: Fix SER handling of high unicode values.
1 parent 429bcda commit 53e0b38

1 file changed

Lines changed: 65 additions & 16 deletions

File tree

src/builtins.c

Lines changed: 65 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -809,23 +809,72 @@ static void jb_free(JsonBuf* jb) {
809809
static void jb_append_json_string(JsonBuf* jb, const char* s) {
810810
jb_append_char(jb, '"');
811811
if (!s) s = "";
812-
for (const unsigned char* p = (const unsigned char*)s; *p; p++) {
812+
const unsigned char* p = (const unsigned char*)s;
813+
while (*p) {
813814
unsigned char c = *p;
814-
switch (c) {
815-
case '"': jb_append_str(jb, "\\\""); break;
816-
case '\\': jb_append_str(jb, "\\\\"); break;
817-
case '\b': jb_append_str(jb, "\\b"); break;
818-
case '\f': jb_append_str(jb, "\\f"); break;
819-
case '\n': jb_append_str(jb, "\\n"); break;
820-
case '\r': jb_append_str(jb, "\\r"); break;
821-
case '\t': jb_append_str(jb, "\\t"); break;
822-
default:
823-
if (c < 0x20 || c >= 0x7f) {
824-
jb_append_fmt(jb, "\\u%04x", (unsigned int)c);
825-
} else {
826-
jb_append_char(jb, (unsigned char)c);
827-
}
828-
break;
815+
/* Common single-byte escapes */
816+
if (c == '"') { jb_append_str(jb, "\\\""); p++; continue; }
817+
if (c == '\\') { jb_append_str(jb, "\\\\"); p++; continue; }
818+
if (c == '\b') { jb_append_str(jb, "\\b"); p++; continue; }
819+
if (c == '\f') { jb_append_str(jb, "\\f"); p++; continue; }
820+
if (c == '\n') { jb_append_str(jb, "\\n"); p++; continue; }
821+
if (c == '\r') { jb_append_str(jb, "\\r"); p++; continue; }
822+
if (c == '\t') { jb_append_str(jb, "\\t"); p++; continue; }
823+
824+
/* Control characters must be escaped as \u00xx */
825+
if (c < 0x20) {
826+
jb_append_fmt(jb, "\\u%04x", (unsigned int)c);
827+
p++;
828+
continue;
829+
}
830+
831+
/* Decode UTF-8 sequence into a Unicode code point. On invalid
832+
* sequences emit U+FFFD. For BMP code points emit \uHHHH, for
833+
* beyond-BMP emit \UHHHHHHHH per the specification. */
834+
uint32_t codepoint = 0;
835+
size_t seq_len = 0;
836+
if (c < 0x80) {
837+
codepoint = c;
838+
seq_len = 1;
839+
} else if ((c & 0xE0) == 0xC0) {
840+
if (p[1] != '\0' && (p[1] & 0xC0) == 0x80) {
841+
codepoint = ((uint32_t)(c & 0x1F) << 6) | (uint32_t)(p[1] & 0x3F);
842+
seq_len = 2;
843+
if (codepoint < 0x80) seq_len = 0; /* overlong */
844+
}
845+
} else if ((c & 0xF0) == 0xE0) {
846+
if (p[1] != '\0' && p[2] != '\0' && (p[1] & 0xC0) == 0x80 && (p[2] & 0xC0) == 0x80) {
847+
codepoint = ((uint32_t)(c & 0x0F) << 12) | ((uint32_t)(p[1] & 0x3F) << 6) | (uint32_t)(p[2] & 0x3F);
848+
seq_len = 3;
849+
if (codepoint < 0x800) seq_len = 0; /* overlong */
850+
}
851+
} else if ((c & 0xF8) == 0xF0) {
852+
if (p[1] != '\0' && p[2] != '\0' && p[3] != '\0' && (p[1] & 0xC0) == 0x80 && (p[2] & 0xC0) == 0x80 && (p[3] & 0xC0) == 0x80) {
853+
codepoint = ((uint32_t)(c & 0x07) << 18) | ((uint32_t)(p[1] & 0x3F) << 12) | ((uint32_t)(p[2] & 0x3F) << 6) | (uint32_t)(p[3] & 0x3F);
854+
seq_len = 4;
855+
if (codepoint < 0x10000 || codepoint > 0x10FFFF) seq_len = 0; /* overlong or out of range */
856+
}
857+
}
858+
859+
if (seq_len == 0) {
860+
/* invalid UTF-8 -> replacement character */
861+
codepoint = 0xFFFD;
862+
p++;
863+
} else {
864+
/* reject surrogate halves */
865+
if (codepoint >= 0xD800 && codepoint <= 0xDFFF) {
866+
codepoint = 0xFFFD;
867+
} else {
868+
p += seq_len;
869+
}
870+
}
871+
872+
if (codepoint < 0x80) {
873+
jb_append_char(jb, (char)codepoint);
874+
} else if (codepoint <= 0xFFFF) {
875+
jb_append_fmt(jb, "\\u%04x", (unsigned int)codepoint);
876+
} else {
877+
jb_append_fmt(jb, "\\U%08x", (unsigned int)codepoint);
829878
}
830879
}
831880
jb_append_char(jb, '"');

0 commit comments

Comments
 (0)