fmt_jsonescape: convert utf-8 >ffff to \u escaped surrogate pair

master
leitner 7 years ago
parent b9eb81c58f
commit 84d3e6a7b4

@ -1,6 +1,7 @@
#include "fmt.h" #include "fmt.h"
#include "textcode.h" #include "textcode.h"
#include "str.h" #include "str.h"
#include "scan.h"
#include "haveinline.h" #include "haveinline.h"
/* src is UTF-8 encoded */ /* src is UTF-8 encoded */
@ -36,6 +37,28 @@ escape:
dest[written+5]=fmt_tohex(s[i]&0xf); dest[written+5]=fmt_tohex(s[i]&0xf);
} }
written+=6; written+=6;
} else if (s[i]>0x7f) {
/* Non-ASCII lead byte: decode one UTF-8 sequence starting at s[i].
 * Code points up to U+FFFF are copied through byte for byte; anything
 * above is written as two \uXXXX escapes forming a UTF-16 surrogate pair. */
uint32_t u;
size_t j=scan_utf8_sem((const char*)s+i,len-i,&u);
if (j==0) /* Invalid UTF-8! Abort! */
return written;
if (u>0xffff) {
if (dest) {
/* UTF-16 splits (u - 0x10000), not u itself, into two 10-bit halves.
 * Without the subtraction U+1F4A9 came out as d87d/dca9 instead of
 * the correct d83d/dca9. The masks keep both values inside the
 * surrogate ranges d800..dbff and dc00..dfff. */
uint32_t v=u-0x10000;
dest[written ]='\\';
dest[written+1]='u';
fmt_xlong(dest+written+2,0xd800 + ((v>>10) & 0x3ff));
dest[written+6]='\\';
dest[written+7]='u';
fmt_xlong(dest+written+8,0xdc00 + (v & 0x3ff));
}
written+=12;
} else {
if (dest) memcpy(dest+written,s+i,j);
written+=j;
}
i+=j-1; /* -1 because the for loop will also add 1 */
break;
} else { } else {
if (dest) dest[written]=s[i]; if (dest) dest[written]=s[i];
++written; ++written;
@ -48,3 +71,17 @@ escape:
return written; return written;
} }
#ifdef UNITTEST
#include <assert.h>
#include <string.h>
int main() {
char buf[100];
/* test utf-8 pass-through and correct encoding of \t */
assert(fmt_jsonescape(buf,"\tfnörd",7)==8 && !memcmp(buf,"\\tfnörd",8));
/* test escaping of unprintable characters */
assert(fmt_jsonescape(buf,"\001x",2)==7 && !memcmp(buf,"\\u0001x",7));
/* test conversion of large UTF-8 chars to UTF-16 surrogate pairs:
 * U+1F4A9 (poop emoji) - 0x10000 = 0xf4a9 -> high d83d, low dca9 */
assert(fmt_jsonescape(buf,"\xf0\x9f\x92\xa9x",5)==13 && !memcmp(buf,"\\ud83d\\udca9x",13));
}
#endif

@ -25,9 +25,15 @@ static const char* lookup(size_t ofs,const char* t) {
return NULL; return NULL;
} }
static size_t scan_html_inner(const char *src,char *dest,size_t *destlen,int flag) { enum htmlmode { /* <a href="http://example.com/&quot;foo">libowfat&lt;home</a> */
OUTSIDE, /* text between tags: scanning stops at '<'; in the example above `libowfat&lt;home` decodes to `libowfat<home` */
TAGARG, /* attribute value: a '"' at offset 0 opens a quoted value, any later '"' ends it, and an unquoted value also ends at space, tab or newline; in the example `http://example.com/&quot;foo` decodes to `http://example.com/"foo` */
};
static size_t scan_html_inner(const char *src,char *dest,size_t *destlen,enum htmlmode mode) {
register const unsigned char* s=(const unsigned char*) src; register const unsigned char* s=(const unsigned char*) src;
size_t written=0,i; size_t written=0,i;
int dq=0;
for (i=0; s[i]; ++i) { for (i=0; s[i]; ++i) {
if (s[i]=='&') { if (s[i]=='&') {
const char* utf8; const char* utf8;
@ -58,7 +64,8 @@ static size_t scan_html_inner(const char *src,char *dest,size_t *destlen,int fla
continue; continue;
} else } else
dest[written]='&'; dest[written]='&';
} else if (flag && s[i]=='<') { } else if (s[i]=='<') {
if (mode == OUTSIDE) break;
if (case_starts((const char*)s+i+1,"br>")) { if (case_starts((const char*)s+i+1,"br>")) {
dest[written]='\n'; dest[written]='\n';
i+=3; i+=3;
@ -68,7 +75,12 @@ static size_t scan_html_inner(const char *src,char *dest,size_t *destlen,int fla
i+=3; i+=3;
} else } else
dest[written]=s[i]; dest[written]=s[i];
} else } else if (s[i]=='"' && mode==TAGARG) {
if (i==0) { dq=1; continue; }
break;
} else if (mode==TAGARG && !dq && (s[i]==' ' || s[i]=='\t' || s[i]=='\n'))
break;
else
dest[written]=s[i]; dest[written]=s[i];
++written; ++written;
} }
@ -77,9 +89,22 @@ static size_t scan_html_inner(const char *src,char *dest,size_t *destlen,int fla
} }
/* Decode an HTML attribute value: forwards src, dest and destlen to
 * scan_html_inner in TAGARG mode (the old column passed the integer flag 1). */
size_t scan_html_tagarg(const char *src,char *dest,size_t *destlen) { size_t scan_html_tagarg(const char *src,char *dest,size_t *destlen) {
return scan_html_inner(src,dest,destlen,1); return scan_html_inner(src,dest,destlen,TAGARG);
} }
/* Decode HTML text outside of tags: forwards src, dest and destlen to
 * scan_html_inner in OUTSIDE mode (the old column passed the integer flag 0). */
size_t scan_html(const char *src,char *dest,size_t *destlen) { size_t scan_html(const char *src,char *dest,size_t *destlen) {
return scan_html_inner(src,dest,destlen,0); return scan_html_inner(src,dest,destlen,OUTSIDE);
}
#ifdef UNITTEST
#include <assert.h>
#include <string.h> /* strchr, memcmp */
int main() {
char* html="<a href=\"http://example.com/&quot;foo\">libowfat&lt;home</a>";
char buf[100];
size_t destlen;
/* input starts with '<': OUTSIDE mode stops at once, nothing consumed or written */
assert(scan_html(html,buf,&destlen)==0 && destlen==0);
/* text after the opening tag: 16 source bytes decode to 13 ("&lt;" becomes '<') */
assert(scan_html(strchr(html,'>')+1,buf,&destlen)==16 && destlen==13 && !memcmp(buf,"libowfat<home",13));
/* attribute value after the opening quote: 28 source bytes decode to 23 ("&quot;" becomes '"') */
assert(scan_html_tagarg(strchr(html,'"')+1,buf,&destlen)==28 && destlen==23 && !memcmp(buf,"http://example.com/\"foo",23));
} }
#endif

@ -99,6 +99,7 @@ int main() {
assert(scan_jsonescape("a\\udafd0",buf,&l)==1); assert(scan_jsonescape("a\\udafd0",buf,&l)==1);
/* correct surrogate pair */ /* correct surrogate pair */
assert(scan_jsonescape("a\\ud834\\udd1eb",buf,&l)==14 && l==6 && !memcmp(buf,"a\xf0\x9d\x84\x9e""b",6)); assert(scan_jsonescape("a\\ud834\\udd1eb",buf,&l)==14 && l==6 && !memcmp(buf,"a\xf0\x9d\x84\x9e""b",6));
/* U+1F4A9 is the surrogate pair d83d/dca9: 0x10000 is subtracted before the code point is split */
assert(scan_jsonescape("\\ud83d\\udca9x",buf,&l)==13 && l==5 && !memcmp(buf,"\xf0\x9f\x92\xa9x",5));
/* how about some incorrect UTF-8? */ /* how about some incorrect UTF-8? */
assert(scan_jsonescape("a\xc0\xaf",buf,&l)==1 && l==1 && !memcmp(buf,"a",1)); assert(scan_jsonescape("a\xc0\xaf",buf,&l)==1 && l==1 && !memcmp(buf,"a",1));
return 0; return 0;

Loading…
Cancel
Save