From 41db97d380140af4191322519e2eab10ffd515e5 Mon Sep 17 00:00:00 2001 From: leitner Date: Mon, 13 Mar 2017 14:45:48 +0000 Subject: [PATCH] improve json escape decoder, add test suite --- test/json.c | 32 ++++++++++++++++++++++++++++++++ textcode/scan_jsonescape.c | 24 ++++++++++++++++-------- 2 files changed, 48 insertions(+), 8 deletions(-) create mode 100644 test/json.c diff --git a/test/json.c b/test/json.c new file mode 100644 index 0000000..af91bba --- /dev/null +++ b/test/json.c @@ -0,0 +1,32 @@ +#include <assert.h> +#include <string.h> +#include <textcode.h> + +int main() { + char buf[100]; + size_t l; + assert(scan_jsonescape("fnord",buf,&l)==5 && l==5 && !memcmp(buf,"fnord",5)); + /* is \n properly unescaped? */ + assert(scan_jsonescape("a\\nb",buf,&l)==4 && l==3 && !memcmp(buf,"a\nb",3)); + assert(scan_jsonescape("a\\rb",buf,&l)==4 && l==3 && !memcmp(buf,"a\rb",3)); + assert(scan_jsonescape("a\\bb",buf,&l)==4 && l==3 && !memcmp(buf,"a\bb",3)); + assert(scan_jsonescape("a\\fb",buf,&l)==4 && l==3 && !memcmp(buf,"a\fb",3)); + assert(scan_jsonescape("a\\tb",buf,&l)==4 && l==3 && !memcmp(buf,"a\tb",3)); + assert(scan_jsonescape("a\\\\b",buf,&l)==4 && l==3 && !memcmp(buf,"a\\b",3)); + assert(scan_jsonescape("a\\/b",buf,&l)==4 && l==3 && !memcmp(buf,"a/b",3)); + assert(scan_jsonescape("a\\\"b",buf,&l)==4 && l==3 && !memcmp(buf,"a\"b",3)); + /* does a double quote end the string? 
*/ + assert(scan_jsonescape("a\"b",buf,&l)==1 && l==1 && !memcmp(buf,"a",1)); + /* how about unicode escape */ + assert(scan_jsonescape("a\\u005cb",buf,&l)==8 && l==3 && !memcmp(buf,"a\\b",3)); + /* a tail surrogate with no lead surrogate before it */ + assert(scan_jsonescape("a\\udead\"",buf,&l)==1); + /* a lead surrogate with no tail surrogate after it */ + assert(scan_jsonescape("a\\udafd\"",buf,&l)==1); + assert(scan_jsonescape("a\\udafd\\udafd",buf,&l)==1); + assert(scan_jsonescape("a\\udafd0",buf,&l)==1); + /* correct surrogate pair */ + assert(scan_jsonescape("a\\ud834\\udd1eb",buf,&l)==14 && l==6 && !memcmp(buf,"a\xf0\x9d\x84\x9e""b",6)); + /* how about some incorrect UTF-8? */ + assert(scan_jsonescape("a\xc0\xaf",buf,&l)==1 && l==1 && !memcmp(buf,"a",1)); +} diff --git a/textcode/scan_jsonescape.c b/textcode/scan_jsonescape.c index 0ba88b8..5383c7c 100644 --- a/textcode/scan_jsonescape.c +++ b/textcode/scan_jsonescape.c @@ -9,15 +9,16 @@ size_t scan_jsonescape(const char *src,char *dest,size_t *destlen) { unsigned int prev,cur,todo; prev=cur=(unsigned int)-1; for (i=0; s[i]; ++i) { + if (s[i]=='"') { + if (prev!=(unsigned int)-1) goto abort; + goto done; + } if ((c=s[i])=='\\') { switch (s[i+1]) { - case '"': - if (prev!=(unsigned int)-1) return 0; // lead surrogate not followed by tail surrogate - goto done; case '\\': - if (prev!=(unsigned int)-1) return 0; // lead surrogate not followed by tail surrogate // c='\\'; // c already is backslash break; + case '"': c='"'; break; case 'n': c='\n'; break; case 'r': c='\r'; break; case 'b': c='\b'; break; @@ -28,18 +29,19 @@ size_t scan_jsonescape(const char *src,char *dest,size_t *destlen) { size_t j; for (cur=j=0; j<4; ++j) { char x=scan_fromhex(s[i+2+j]); - if (x<0) return 0; // not hex -> invalid input + if (x<0) goto abort; // not hex -> invalid input cur=(cur<<4) | x; } if (cur>=0xd800 && cur<=0xdbff) { // utf-16 surrogate pair; needs to be followed by another // surrogate. 
We need to read both and convert to UTF-8 - if (prev!=(unsigned int)-1) return 0; // two lead surrogates + if (prev!=(unsigned int)-1) goto abort; // two lead surrogates prev=cur; i+=5; // we want i to go up by 6, 1 is done by the for loop continue; // write nothing! } else if (cur>=0xdc00 && cur<=0xdfff) { - todo=(cur&0x3ff) | ((prev&0x3ff) << 10) | 0x100000; + if (prev==(unsigned int)-1) goto abort; // no lead surrogate + todo=(cur&0x3ff) | ((prev&0x3ff) << 10) | 0x10000; } else todo=cur; written+=fmt_utf8(dest?dest+written:dest,todo); @@ -48,16 +50,22 @@ size_t scan_jsonescape(const char *src,char *dest,size_t *destlen) { continue; } default: - if (prev!=(unsigned int)-1) return 0; // lead surrogate not followed by tail surrogate c=s[i+1]; break; } ++i; } + if (prev!=(unsigned int)-1) goto abort; + /* We expect utf-8 incoming. Make sure it's valid. */ + if (!scan_utf8(s+i,4,NULL)) goto abort; if (dest) dest[written]=c; ++written; } done: *destlen=written; return i; +abort: + if (prev!=(unsigned int)-1) i-=6; // if we abort and there still was an open surrogate pair, cancel it + *destlen=written; + return i; }