Don't escape code points above U+FFFF, such as the poop emoji (emit them as UTF-8 instead)

If the input is not valid UTF-8, encode each invalid byte as its own UTF-8 sequence in the output (e.g. the byte 0x81 becomes 0xC2 0x81).
2018-02-02 18:16:09 +00:00 · 2018-02-02 18:16:09 +00:00 · 708e34ab7f
commit 708e34ab7f
parent 112ccf19e7
2 changed files with 15 additions and 5 deletions
--- a/textcode.h
+++ b/textcode.h
@@ -49,7 +49,7 @@ size_t fmt_foldwhitespace(char* dest,const char* src,size_t len);
 size_t fmt_ldapescape(char* dest,const char* src,size_t len);
 size_t fmt_ldapescape2(char* dest,const char* src,size_t len,const char* escapeme);
 /* Encode JSON string from UTF-8; will backslash-escape the bare minimum.
- * Will not verify that the input is valid UTF-8!
+ * Each invalid UTF-8 byte in the input is emitted as its own valid UTF-8 sequence.
 * Worst case: len*6 */
 size_t fmt_jsonescape(char* dest,const char* src,size_t len);

--- a/textcode/fmt_jsonescape.c
+++ b/textcode/fmt_jsonescape.c
@@ -42,8 +42,13 @@ escape:
 	/* UTF-8! Convert to surrogate pair if needed. */
 	uint32_t u;
 	size_t j=scan_utf8_sem((const char*)s+i,len-i,&u);
-	if (j==0) /* Invalid UTF-8! Abort! */
-	  return written;
+	if (j==0) {	/* Invalid UTF-8! Try to limp on! */
+	  written+=fmt_utf8(dest?dest+written:0,s[i]);
+	  break;
+	}
+	/* It turns out we are not required to escape these.
+	 * So we won't. */
+#if 0
 	if (u>0xffff) {
 	  if (dest) {
 	    dest[written  ]='\\';
@@ -54,7 +59,9 @@ escape:
 	    fmt_xlong(dest+written+8,0xdc00 + (u & 0x3ff));
 	  }
 	  written+=12;
-	} else {
+	} else
+#endif
+	       {
 	  if (dest) memcpy(dest+written,s+i,j);
 	  written+=j;
 	}
@@ -83,6 +90,9 @@ int main() {
  /* test escaping of unprintable characters */
  assert(fmt_jsonescape(buf,"\001x",2)==7 && !memcmp(buf,"\\u0001x",7));
  /* test conversion of large UTF-8 chars to UTF-16 surrogate pairs (poop emoji) */
-  assert(fmt_jsonescape(buf,"\xf0\x9f\x92\xa9x",5)==13 && !memcmp(buf,"\\ud83d\\udca9x",13));
+  /* EDIT: this escaping is not actually needed, so we aren't doing it
+   * anymore. This test will fail now:
+  assert(fmt_jsonescape(buf,"\xf0\x9f\x92\xa9x",5)==13 && !memcmp(buf,"\\ud83d\\udca9x",13)); */
+  assert(fmt_jsonescape(buf,"a\x81x",3)==4 && !memcmp(buf,"a\xc2\x81x",4));
 }
 #endif