From 84d3e6a7b493722f3bc2e3ca906c444a1403d955 Mon Sep 17 00:00:00 2001 From: leitner Date: Tue, 25 Jul 2017 18:46:24 +0000 Subject: [PATCH] fmt_jsonescape: convert utf-8 >ffff to \u escaped surrogate pair --- textcode/fmt_jsonescape.c | 37 +++++++++++++++++++++++++++++++++++++ textcode/scan_html.c | 35 ++++++++++++++++++++++++++++++----- textcode/scan_jsonescape.c | 1 + 3 files changed, 68 insertions(+), 5 deletions(-) diff --git a/textcode/fmt_jsonescape.c b/textcode/fmt_jsonescape.c index 89113f2..dde2731 100644 --- a/textcode/fmt_jsonescape.c +++ b/textcode/fmt_jsonescape.c @@ -1,6 +1,7 @@ #include "fmt.h" #include "textcode.h" #include "str.h" +#include "scan.h" #include "haveinline.h" /* src is UTF-8 encoded */ @@ -36,6 +37,28 @@ escape: dest[written+5]=fmt_tohex(s[i]&0xf); } written+=6; + } else if (s[i]>0x7f) { + /* UTF-8! Convert to surrogate pair if needed. */ + uint32_t u; + size_t j=scan_utf8_sem((const char*)s+i,len-i,&u); + if (j==0) /* Invalid UTF-8! Abort! */ + return written; + if (u>0xffff) { + if (dest) { + dest[written ]='\\'; + dest[written+1]='u'; + fmt_xlong(dest+written+2,0xd800 + (((u-0x10000)>>10) & 0x3ff)); /* high surrogate: top 10 bits of u-0x10000 */ + dest[written+6]='\\'; + dest[written+7]='u'; + fmt_xlong(dest+written+8,0xdc00 + (u & 0x3ff)); /* low surrogate: low 10 bits (unaffected by the 0x10000 offset) */ + } + written+=12; + } else { + if (dest) memcpy(dest+written,s+i,j); + written+=j; + } + i+=j-1; /* -1 because the for loop will also add 1 */ + break; } else { if (dest) dest[written]=s[i]; ++written; @@ -48,3 +71,17 @@ escape: return written; } +#ifdef UNITTEST +#include <assert.h> +#include <string.h> + +int main() { + char buf[100]; + /* test utf-8 pass-through and correct encoding of \t */ + assert(fmt_jsonescape(buf,"\tfnörd",7)==8 && !memcmp(buf,"\\tfnörd",8)); + /* test escaping of unprintable characters */ + assert(fmt_jsonescape(buf,"\001x",2)==7 && !memcmp(buf,"\\u0001x",7)); + /* test conversion of large UTF-8 chars to UTF-16 surrogate pairs (poop emoji U+1F4A9 -> D83D DCA9) */ + assert(fmt_jsonescape(buf,"\xf0\x9f\x92\xa9x",5)==13 && !memcmp(buf,"\\ud83d\\udca9x",13)); +} +#endif 
diff --git a/textcode/scan_html.c b/textcode/scan_html.c index 97e9122..5ddb50a 100644 --- a/textcode/scan_html.c +++ b/textcode/scan_html.c @@ -25,9 +25,15 @@ static const char* lookup(size_t ofs,const char* t) { return NULL; } -static size_t scan_html_inner(const char *src,char *dest,size_t *destlen,int flag) { +enum htmlmode { /* libowfat<home */ + OUTSIDE, /* ^^^^^^^^^^^^^^^^ -> `libowfat `http://example.com/"foo´ */ +}; + +static size_t scan_html_inner(const char *src,char *dest,size_t *destlen,enum htmlmode mode) { register const unsigned char* s=(const unsigned char*) src; size_t written=0,i; + int dq=0; for (i=0; s[i]; ++i) { if (s[i]=='&') { const char* utf8; @@ -58,7 +64,8 @@ static size_t scan_html_inner(const char *src,char *dest,size_t *destlen,int fla continue; } else dest[written]='&'; - } else if (flag && s[i]=='<') { + } else if (s[i]=='<') { + if (mode == OUTSIDE) break; if (case_starts((const char*)s+i+1,"br>")) { dest[written]='\n'; i+=3; @@ -68,7 +75,12 @@ static size_t scan_html_inner(const char *src,char *dest,size_t *destlen,int fla i+=3; } else dest[written]=s[i]; - } else + } else if (s[i]=='"' && mode==TAGARG) { + if (i==0) { dq=1; continue; } + break; + } else if (mode==TAGARG && !dq && (s[i]==' ' || s[i]=='\t' || s[i]=='\n')) + break; + else dest[written]=s[i]; ++written; } @@ -77,9 +89,22 @@ static size_t scan_html_inner(const char *src,char *dest,size_t *destlen,int fla } size_t scan_html_tagarg(const char *src,char *dest,size_t *destlen) { - return scan_html_inner(src,dest,destlen,1); + return scan_html_inner(src,dest,destlen,TAGARG); } size_t scan_html(const char *src,char *dest,size_t *destlen) { - return scan_html_inner(src,dest,destlen,0); + return scan_html_inner(src,dest,destlen,OUTSIDE); +} + +#ifdef UNITTEST +#include + +int main() { + char* html="libowfat<home"; + char buf[100]; + size_t destlen; + assert(scan_html(html,buf,&destlen)==0 && destlen==0); + assert(scan_html(strchr(html,'>')+1,buf,&destlen)==16 && destlen==13 && 
!memcmp(buf,"libowfat