improve json escape decoder, add test suite

master
leitner 8 years ago
parent 3c68ae0dcc
commit 41db97d380

@ -0,0 +1,32 @@
#include <assert.h>
#include <string.h>
#include <textcode.h>
/*
 * Test driver for scan_jsonescape().
 *
 * Each check pins two values: the return value, which the assertions below
 * treat as the number of source bytes consumed, and the value stored through
 * the third argument, treated as the number of bytes written to buf. Where
 * output is expected, the decoded bytes are compared as well.
 *
 * All checks are assert()s: the first failing one aborts the program, and
 * none of them run if this file is compiled with NDEBUG defined.
 */
int main(void) {
  char buf[100];
  size_t l;

  /* plain text without escapes is copied through unchanged */
  assert(scan_jsonescape("fnord",buf,&l)==5 && l==5 && !memcmp(buf,"fnord",5));

  /* two-character escapes: each consumes 4 source bytes and yields 3 */
  assert(scan_jsonescape("a\\nb",buf,&l)==4 && l==3 && !memcmp(buf,"a\nb",3));
  assert(scan_jsonescape("a\\rb",buf,&l)==4 && l==3 && !memcmp(buf,"a\rb",3));
  assert(scan_jsonescape("a\\bb",buf,&l)==4 && l==3 && !memcmp(buf,"a\bb",3));
  assert(scan_jsonescape("a\\fb",buf,&l)==4 && l==3 && !memcmp(buf,"a\fb",3));
  assert(scan_jsonescape("a\\tb",buf,&l)==4 && l==3 && !memcmp(buf,"a\tb",3));
  assert(scan_jsonescape("a\\\\b",buf,&l)==4 && l==3 && !memcmp(buf,"a\\b",3));
  assert(scan_jsonescape("a\\/b",buf,&l)==4 && l==3 && !memcmp(buf,"a/b",3));
  assert(scan_jsonescape("a\\\"b",buf,&l)==4 && l==3 && !memcmp(buf,"a\"b",3));

  /* an unescaped double quote ends the string; only "a" is consumed */
  assert(scan_jsonescape("a\"b",buf,&l)==1 && l==1 && !memcmp(buf,"a",1));

  /* \uXXXX escape: \u005c decodes to a single backslash */
  assert(scan_jsonescape("a\\u005cb",buf,&l)==8 && l==3 && !memcmp(buf,"a\\b",3));

  /* a trail surrogate (U+DEAD) with no lead surrogate before it:
   * scanning must stop in front of the escape */
  assert(scan_jsonescape("a\\udead\"",buf,&l)==1);

  /* a lead surrogate (U+DAFD) that is not followed by a trail surrogate:
   * followed by end of string, by another lead surrogate, and by an
   * ordinary character; scanning must stop in front of the lead escape */
  assert(scan_jsonescape("a\\udafd\"",buf,&l)==1);
  assert(scan_jsonescape("a\\udafd\\udafd",buf,&l)==1);
  assert(scan_jsonescape("a\\udafd0",buf,&l)==1);

  /* a correct surrogate pair: \ud834\udd1e is U+1D11E, which must come out
   * as the four UTF-8 bytes f0 9d 84 9e. The literal is split before "b" so
   * that the 'b' is not parsed as part of the \x9e hex escape. */
  assert(scan_jsonescape("a\\ud834\\udd1eb",buf,&l)==14 && l==6 && !memcmp(buf,"a\xf0\x9d\x84\x9e""b",6));

  /* invalid UTF-8 in the input (c0 af is an overlong encoding of '/'):
   * scanning must stop in front of the bad sequence */
  assert(scan_jsonescape("a\xc0\xaf",buf,&l)==1 && l==1 && !memcmp(buf,"a",1));

  return 0;
}

@ -9,15 +9,16 @@ size_t scan_jsonescape(const char *src,char *dest,size_t *destlen) {
unsigned int prev,cur,todo; unsigned int prev,cur,todo;
prev=cur=(unsigned int)-1; prev=cur=(unsigned int)-1;
for (i=0; s[i]; ++i) { for (i=0; s[i]; ++i) {
if (s[i]=='"') {
if (prev!=(unsigned int)-1) goto abort;
goto done;
}
if ((c=s[i])=='\\') { if ((c=s[i])=='\\') {
switch (s[i+1]) { switch (s[i+1]) {
case '"':
if (prev!=(unsigned int)-1) return 0; // lead surrogate not followed by tail surrogate
goto done;
case '\\': case '\\':
if (prev!=(unsigned int)-1) return 0; // lead surrogate not followed by tail surrogate
// c='\\'; // c already is backslash // c='\\'; // c already is backslash
break; break;
case '"': c='"'; break;
case 'n': c='\n'; break; case 'n': c='\n'; break;
case 'r': c='\r'; break; case 'r': c='\r'; break;
case 'b': c='\b'; break; case 'b': c='\b'; break;
@ -28,18 +29,19 @@ size_t scan_jsonescape(const char *src,char *dest,size_t *destlen) {
size_t j; size_t j;
for (cur=j=0; j<4; ++j) { for (cur=j=0; j<4; ++j) {
char x=scan_fromhex(s[i+2+j]); char x=scan_fromhex(s[i+2+j]);
if (x<0) return 0; // not hex -> invalid input if (x<0) goto abort; // not hex -> invalid input
cur=(cur<<4) | x; cur=(cur<<4) | x;
} }
if (cur>=0xd800 && cur<=0xdbff) { if (cur>=0xd800 && cur<=0xdbff) {
// utf-16 surrogate pair; needs to be followed by another // utf-16 surrogate pair; needs to be followed by another
// surrogate. We need to read both and convert to UTF-8 // surrogate. We need to read both and convert to UTF-8
if (prev!=(unsigned int)-1) return 0; // two lead surrogates if (prev!=(unsigned int)-1) goto abort; // two lead surrogates
prev=cur; prev=cur;
i+=5; // we want i to go up by 6, 1 is done by the for loop i+=5; // we want i to go up by 6, 1 is done by the for loop
continue; // write nothing! continue; // write nothing!
} else if (cur>=0xdc00 && cur<=0xdfff) { } else if (cur>=0xdc00 && cur<=0xdfff) {
todo=(cur&0x3ff) | ((prev&0x3ff) << 10) | 0x100000; if (prev==(unsigned int)-1) goto abort; // no lead surrogate
todo=(cur&0x3ff) | ((prev&0x3ff) << 10) | 0x10000;
} else } else
todo=cur; todo=cur;
written+=fmt_utf8(dest?dest+written:dest,todo); written+=fmt_utf8(dest?dest+written:dest,todo);
@ -48,16 +50,22 @@ size_t scan_jsonescape(const char *src,char *dest,size_t *destlen) {
continue; continue;
} }
default: default:
if (prev!=(unsigned int)-1) return 0; // lead surrogate not followed by tail surrogate
c=s[i+1]; c=s[i+1];
break; break;
} }
++i; ++i;
} }
if (prev!=(unsigned int)-1) goto abort;
/* We expect utf-8 incoming. Make sure it's valid. */
if (!scan_utf8(s+i,4,NULL)) goto abort;
if (dest) dest[written]=c; if (dest) dest[written]=c;
++written; ++written;
} }
done: done:
*destlen=written; *destlen=written;
return i; return i;
abort:
if (prev!=(unsigned int)-1) i-=6; // if we abort and there still was an open surrogate pair, cancel it
*destlen=written;
return i;
} }

Loading…
Cancel
Save