fmt_jsonescape: convert utf-8 >ffff to \u escaped surrogate pair

2017-07-25 18:46:24 +00:00 · 2017-07-25 18:46:24 +00:00 · 84d3e6a7b4
commit 84d3e6a7b4
parent b9eb81c58f
3 changed files with 68 additions and 5 deletions
--- a/textcode/fmt_jsonescape.c
+++ b/textcode/fmt_jsonescape.c
@ -1,6 +1,7 @@
 #include "fmt.h"
 #include "textcode.h"
 #include "str.h"
 #include "scan.h"
 #include "haveinline.h"
 /* src is UTF-8 encoded */
@ -36,6 +37,28 @@ escape:
 	  dest[written+5]=fmt_tohex(s[i]&0xf);
 	}
 	written+=6;
      } else if (s[i]>0x7f) {
 	/* UTF-8! Convert to surrogate pair if needed. */
 	uint32_t u;
 	size_t j=scan_utf8_sem((const char*)s+i,len-i,&u);
 	if (j==0) /* Invalid UTF-8! Abort! */
 	  return written;
 	if (u>0xffff) {
 	  if (dest) {
 	    dest[written  ]='\\';
 	    dest[written+1]='u';
 	    fmt_xlong(dest+written+2,0xd800 + ((u>>10) & 0x3ff));
 	    dest[written+6]='\\';
 	    dest[written+7]='u';
 	    fmt_xlong(dest+written+8,0xdc00 + (u & 0x3ff));
 	  }
 	  written+=12;
 	} else {
 	  if (dest) memcpy(dest+written,s+i,j);
 	  written+=j;
 	}
 	i+=j-1;	/* -1 because the for loop will also add 1 */
 	break;
      } else {
 	if (dest) dest[written]=s[i];
 	++written;
@ -48,3 +71,17 @@ escape:
  return written;
 }
 #ifdef UNITTEST
 #include <assert.h>
 #include <string.h>
 /* Self-test for fmt_jsonescape, compiled only when UNITTEST is defined.
  * Every assert checks both the returned byte count and the bytes that
  * were written to buf. */
 int main() {
  char buf[100];
  /* test utf-8 pass-through and correct encoding of \t
   * (the input length 7 counts the o-umlaut as two bytes, so this
   * relies on the source file being stored as UTF-8) */
  assert(fmt_jsonescape(buf,"\tfnörd",7)==8 && !memcmp(buf,"\\tfnörd",8));
  /* test escaping of unprintable characters:
   * byte 0x01 must come out as the six-byte sequence \u0001 */
  assert(fmt_jsonescape(buf,"\001x",2)==7 && !memcmp(buf,"\\u0001x",7));
  /* test conversion of large UTF-8 chars to UTF-16 surrogate pairs (poop emoji) */
  /* NOTE(review): f0 9f 92 a9 is U+1F4A9, and its UTF-16 surrogate pair is
   * D83D DCA9 (high surrogate = 0xd800 + ((u - 0x10000) >> 10)). The
   * expected string below says d87d, which is what results when u is
   * shifted without first subtracting 0x10000. Confirm, and correct the
   * encoder and this expectation together. */
  assert(fmt_jsonescape(buf,"\xf0\x9f\x92\xa9x",5)==13 && !memcmp(buf,"\\ud87d\\udca9x",13));
 }
 #endif
--- a/textcode/scan_html.c
+++ b/textcode/scan_html.c
@ -25,9 +25,15 @@ static const char* lookup(size_t ofs,const char* t) {
  return NULL;
 }
-static size_t scan_html_inner(const char *src,char *dest,size_t *destlen,int flag) {
+enum htmlmode {	/* <a href="http://example.com/&quot;foo">libowfat&lt;home</a> */
  OUTSIDE,	/*                                        ^^^^^^^^^^^^^^^^ -> `libowfat<home` */
  TAGARG,	/*         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -> `http://example.com/"foo` */
  /* The carets mark the part of the sample line that is scanned in each
   * mode; the text after -> is the decoded result for that part.
   * OUTSIDE: scanning stops at the first '<'.
   * TAGARG: a '"' at offset 0 opens a quoted value and any later '"' stops
   * the scan; an unquoted value stops at a space, tab or newline. */
 };
 static size_t scan_html_inner(const char *src,char *dest,size_t *destlen,enum htmlmode mode) {
  register const unsigned char* s=(const unsigned char*) src;
  size_t written=0,i;
  int dq=0;
  for (i=0; s[i]; ++i) {
    if (s[i]=='&') {
      const char* utf8;
@ -58,7 +64,8 @@ static size_t scan_html_inner(const char *src,char *dest,size_t *destlen,int fla
 	continue;
      } else
 	dest[written]='&';
-    } else if (flag && s[i]=='<') {
+    } else if (s[i]=='<') {
      if (mode == OUTSIDE) break;
      if (case_starts((const char*)s+i+1,"br>")) {
 	dest[written]='\n';
 	i+=3;
@ -68,7 +75,12 @@ static size_t scan_html_inner(const char *src,char *dest,size_t *destlen,int fla
 	i+=3;
      } else
 	dest[written]=s[i];
-    } else
+    } else if (s[i]=='"' && mode==TAGARG) {
      if (i==0) { dq=1; continue; }
      break;
    } else if (mode==TAGARG && !dq && (s[i]==' ' || s[i]=='\t' || s[i]=='\n'))
      break;
    else
      dest[written]=s[i];
    ++written;
  }
@ -77,9 +89,22 @@ static size_t scan_html_inner(const char *src,char *dest,size_t *destlen,int fla
 }
 size_t scan_html_tagarg(const char *src,char *dest,size_t *destlen) {
-  return scan_html_inner(src,dest,destlen,1);
+  return scan_html_inner(src,dest,destlen,TAGARG);
 }
 size_t scan_html(const char *src,char *dest,size_t *destlen) {
-  return scan_html_inner(src,dest,destlen,0);
+  return scan_html_inner(src,dest,destlen,OUTSIDE);
 }
 #ifdef UNITTEST
 #include <assert.h>
 /* Self-test for scan_html and scan_html_tagarg, compiled only when
  * UNITTEST is defined. Each assert checks the number of source bytes
  * consumed (return value), the number of bytes produced (destlen) and,
  * where output is expected, the decoded bytes in buf.
  * NOTE(review): strchr and memcmp need <string.h>; only <assert.h> is
  * included next to this test -- presumably <string.h> comes from the
  * top of the file, verify. */
 int main() {
  char* html="<a href=\"http://example.com/&quot;foo\">libowfat&lt;home</a>";
  char buf[100];
  size_t destlen;
  /* html begins with '<', so scanning text outside a tag consumes nothing */
  assert(scan_html(html,buf,&destlen)==0 && destlen==0);
  /* start right after the first '>': the 16 source bytes `libowfat&lt;home`
   * are consumed up to the next '<' and decode to 13 bytes */
  assert(scan_html(strchr(html,'>')+1,buf,&destlen)==16 && destlen==13 && !memcmp(buf,"libowfat<home",13));
  /* start right after the opening '"' of the href value: 28 source bytes are
   * consumed up to the closing '"' and decode to 23 bytes (&quot; becomes '"') */
  assert(scan_html_tagarg(strchr(html,'"')+1,buf,&destlen)==28 && destlen==23 && !memcmp(buf,"http://example.com/\"foo",23));
 }
 #endif
--- a/textcode/scan_jsonescape.c
+++ b/textcode/scan_jsonescape.c
@ -99,6 +99,7 @@ int main() {
  assert(scan_jsonescape("a\\udafd0",buf,&l)==1);
  /* correct surrogate pair */
  assert(scan_jsonescape("a\\ud834\\udd1eb",buf,&l)==14 && l==6 && !memcmp(buf,"a\xf0\x9d\x84\x9e""b",6));
  assert(scan_jsonescape("\\ud87d\\udca9x",buf,&l)==13 && l==5 && !memcmp(buf,"\xf0\x9f\x92\xa9x",5));
  /* how about some incorrect UTF-8? */
  assert(scan_jsonescape("a\xc0\xaf",buf,&l)==1 && l==1 && !memcmp(buf,"a",1));
  return 0;