add json encoding routines to textcode

2014-01-28 16:12:39 +00:00 · 2014-01-28 16:12:39 +00:00 · 1d2add208f
commit 1d2add208f
parent 66cb1c1c82
6 changed files with 151 additions and 17 deletions
--- a/fmt/fmt_utf8.3
+++ b/fmt/fmt_utf8.3
@ -15,5 +15,10 @@ number of bytes it would have written.
 For convenience, fmt.h defines the integer FMT_UTF8 to be big enough to
 contain every possible fmt_utf8 output.
 .SH NOTE
 fmt_utf8 and scan_utf8 implement the encoding from UTF-8, but are meant
 to be able to store integers, not just Unicode code points.  Values
 larger than 0x10ffff are not valid UTF-8 (see RFC 3629) but can be
 represented in the encoding, so fmt_utf8 will allow them.
 .SH "SEE ALSO"
 scan_utf8(3)
--- a/scan/scan_utf8.3
+++ b/scan/scan_utf8.3
@ -17,5 +17,10 @@ sequence, scan_utf8 returns 0 and does not touch \fIdest\fR.
 The length of the longest UTF-8 sequence is 5.  If the buffer is longer
 than that, and scan_utf8 fails, then the data was not a valid UTF-8
 encoded sequence.
 .SH NOTE
 fmt_utf8 and scan_utf8 implement the encoding from UTF-8, but are meant
 to be able to store integers, not just Unicode code points.  Values
 above 0x10ffff are not valid UTF-8.  If you are using this function to
 parse UTF-8, you need to reject them (see RFC 3629).
 .SH "SEE ALSO"
 fmt_utf8(3)
--- a/t.c
+++ b/t.c
@ -29,11 +29,11 @@
 #include "io_internal.h"
 #define rdtscl(low) \
-     __asm__ __volatile__ ("rdtsc" : "=a" (low) : : "edx")
+     __asm__ __volatile__ ("lfence\nrdtsc" : "=a" (low) : : "edx")
 // #define atomic_add(mem,val) asm volatile ("lock; add%z0 %1, %0": "+m" (mem): "ir" (val))
-int64 writecb(int64 fd,const void* buf,uint64 n) {
+static int64 writecb(int64 fd,const void* buf,uint64 n) {
  (void)fd;
  (void)buf;
  (void)n;
@ -48,6 +48,24 @@ int64 writecb(int64 fd,const void* buf,uint64 n) {
 }
 int main(int argc,char* argv[]) {
  char buf[1024];
  size_t l;
  assert(fmt_jsonescape(buf,"foo\nbar\\",8)==14 && byte_equal(buf,14,"foo\\u000abar\\\\"));
  memset(buf,0,sizeof(buf));
  assert(scan_jsonescape("foo\\u000abar\\\\",buf,&l)==14 && l==8 && byte_equal(buf,8,"foo\nbar\\"));
  memset(buf,0,sizeof(buf));
  /* example from the json spec: G clef U+1D11E encoded using UTF-16 surrogates*/
  assert(scan_jsonescape("\\uD834\\uDD1Exyz",buf,&l)==15 && l==7 && byte_equal(buf,7,"\xf4\x8d\x84\x9exyz"));
 /*
 	 1D11E -> 0001 1101 0001 0001 1110
 	       -> ______00 __011101 __000100 __011110
 	 as utf8: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 	          11110000 10011101 10000100 10011110
 		  f   0    9   d    8   4    9   e
 */
 #if 0
  static size_t x;
  x=23;
  atomic_add(&x,3);
@ -55,6 +73,7 @@ int main(int argc,char* argv[]) {
  printf("%u\n",atomic_add_return(&x,-3));
  printf("%u\n",compare_and_swap(&x,26,17));
  printf("%u\n",compare_and_swap(&x,23,17));
 #endif
 #if 0
  atomic_add(&x,3); printf("%u\n",x);
@ -361,7 +380,7 @@ int main(int argc,char* argv[]) {
 #endif
 #if 0
  printf("%d %d\n",strcmp("foo","bar"),str_diff("foo","bar"));
-  printf("%d %d\n",strcmp("foo","üar"),str_diff("foo","üar"));
+  printf("%d %d\n",strcmp("foo","\xfcar"),str_diff("foo","\xfcar"));
 #endif
 #if 0
  {
--- a/textcode.h
+++ b/textcode.h
@ -11,35 +11,39 @@ extern "C" {
 /* These take len bytes from src and write them in encoded form to
 * dest (if dest != NULL), returning the number of bytes written. */
-/* needs len/3*4 bytes */
+/* Needs len/3*4 bytes */
 size_t fmt_uuencoded(char* dest,const char* src,size_t len);
-/* needs len/3*4 bytes */
+/* Needs len/3*4 bytes */
 size_t fmt_base64(char* dest,const char* src,size_t len);
-/* worst case: len*3 */
+/* Worst case: len*3 */
 size_t fmt_quotedprintable(char* dest,const char* src,size_t len);
-/* worst case: len*3 */
+/* Worst case: len*3 */
 size_t fmt_quotedprintable2(char* dest,const char* src,size_t len,const char* escapeme);
-/* worst case: len*3 */
+/* Worst case: len*3 */
 size_t fmt_urlencoded(char* dest,const char* src,size_t len);
-/* worst case: len*3 */
+/* Worst case: len*3 */
 size_t fmt_urlencoded2(char* dest,const char* src,size_t len,const char* escapeme);
-/* worst case: len*2 */
+/* Worst case: len*2 */
 size_t fmt_yenc(char* dest,const char* src,size_t len);
-/* needs len*2 bytes */
+/* Needs len*2 bytes */
 size_t fmt_hexdump(char* dest,const char* src,size_t len);
-/* change '<' to '&lt;' and '&' to '&amp;'; worst case: len*5 */
+/* Change '<' to '&lt;' and '&' to '&amp;'; worst case: len*5 */
 size_t fmt_html(char* dest,const char* src,size_t len);
-/* change '\' to "\\", '\n' to "\n", ^A to "\x01" etc; worst case: len*4 */
+/* Change '\' to "\\", '\n' to "\n", ^A to "\x01" etc; worst case: len*4 */
 size_t fmt_cescape(char* dest,const char* src,size_t len);
-/* worst case: len*4 */
+/* Worst case: len*4 */
 size_t fmt_cescape2(char* dest,const char* src,size_t len,const char* escapeme);
-/* fold awk whitespace to '_'; this is great for writing fields with
+/* Fold whitespace to '_'; this is great for writing fields with
 * white spaces to a log file and still allow awk to do log analysis */
-/* worst case: same size */
+/* Worst case: same size */
 size_t fmt_foldwhitespace(char* dest,const char* src,size_t len);
-/* worst case: len*3 */
+/* Worst case: len*3 */
 size_t fmt_ldapescape(char* dest,const char* src,size_t len);
 size_t fmt_ldapescape2(char* dest,const char* src,size_t len,const char* escapeme);
 /* Encode JSON string from UTF-8; will backslash-escape the bare minimum.
 * Will not verify that the input is valid UTF-8!
 * Worst case: len*6 */
 size_t fmt_jsonescape(char* dest,const char* src,size_t len);
 /* These read one line from src, decode it, and write the result to
 * dest.  The number of decoded bytes is written to destlen.  dest
@ -54,6 +58,7 @@ size_t scan_hexdump(const char *src,char *dest,size_t *destlen);
 size_t scan_html(const char *src,char *dest,size_t *destlen);
 size_t scan_cescape(const char *src,char *dest,size_t *destlen);
 size_t scan_ldapescape(const char* src,char* dest,size_t *destlen);
 size_t scan_jsonescape(const char* src,char* dest,size_t *destlen);
 #ifdef STRALLOC_H
 /* WARNING: these functions _append_ to the stralloc, not overwrite! */
@ -78,6 +83,8 @@ size_t scan_to_sa(size_t (*func)(const char*,char*,size_t*),
 #define fmt_hexdump_sa(sa,src,len) fmt_to_sa(fmt_hexdump,sa,src,len)
 #define fmt_html_sa(sa,src,len) fmt_to_sa(fmt_html,sa,src,len)
 #define fmt_cescape_sa(sa,src,len) fmt_to_sa(fmt_cescape,sa,src,len)
 #define fmt_ldapescape_sa(sa,src,len) fmt_to_sa(fmt_ldapescape,sa,src,len)
 #define fmt_jsonescape_sa(sa,src,len) fmt_to_sa(fmt_jsonescape,sa,src,len)
 #define fmt_quotedprintable2_sa(sa,src,len,escapeme) fmt_to_sa2(fmt_quotedprintable2,sa,src,len,escapeme)
 #define fmt_urlencoded2_sa(sa,src,len,escapeme) fmt_to_sa2(fmt_urlencoded2,sa,src,len,escapeme)
@ -91,6 +98,8 @@ size_t scan_to_sa(size_t (*func)(const char*,char*,size_t*),
 #define scan_hexdump_sa(src,sa) scan_to_sa(scan_hexdump,src,sa)
 #define scan_html_sa(src,sa) scan_to_sa(scan_html,src,sa)
 #define scan_cescape_sa(src,sa) scan_to_sa(scan_cescape,src,sa)
 #define scan_ldapescape_sa(src,sa) scan_to_sa(scan_ldapescape,src,sa)
 #define scan_jsonescape_sa(src,sa) scan_to_sa(scan_jsonescape,src,sa)
 #endif
 #ifdef ARRAY_H
--- a/textcode/fmt_jsonescape.c
+++ b/textcode/fmt_jsonescape.c
@ -0,0 +1,42 @@
 #include "fmt.h"
 #include "textcode.h"
 #include "str.h"
 #include "haveinline.h"
 /* src is UTF-8 encoded */
 /* Write the JSON string escaping of the len bytes at src to dest.
  *
  * '\\' and '"' are emitted with a leading backslash, every byte below
  * 0x20 is emitted as \u00XX, and all other bytes (including anything
  * >= 0x80) are copied through unchanged.
  *
  * If dest is NULL nothing is stored and only the output length is
  * computed.  Returns the number of bytes (that would be) written, or
  * (size_t)-1 once the running total exceeds half the size_t range. */
 size_t fmt_jsonescape(char* dest,const char* src,size_t len) {
   const unsigned char* in=(const unsigned char*) src;
   size_t out=0;
   size_t pos=0;
   while (pos<len) {
     const unsigned char ch=in[pos++];
     if (ch=='\\' || ch=='"') {
       /* two-byte escape: backslash followed by the character itself */
       if (dest) {
         dest[out]='\\';
         dest[out+1]=ch;
       }
       out+=2;
     } else if (ch<' ') {
       /* control character: six-byte \u00XX escape */
       if (dest) {
         char* p=dest+out;
         p[0]='\\';
         p[1]='u';
         p[2]='0';
         p[3]='0';
         p[4]=fmt_tohex(ch>>4);
         p[5]=fmt_tohex(ch&0xf);
       }
       out+=6;
     } else {
       /* everything else is passed through verbatim */
       if (dest) dest[out]=ch;
       ++out;
     }
     /* bail out before the length counter can wrap on hostile input */
     if (out>((size_t)-1)/2) return (size_t)-1;
   }
   return out;
 }
--- a/textcode/scan_jsonescape.c
+++ b/textcode/scan_jsonescape.c
@ -0,0 +1,54 @@
 #include "fmt.h"
 #include "textcode.h"
 #include "scan.h"
 /* Decode the backslash escapes of a JSON string (RFC 8259 section 7)
  * from the NUL-terminated src and store the result, as UTF-8, in dest.
  *
  * Handled escapes: \b \f \n \r \t, \uXXXX (four hex digits), and
  * \<any other char>, which yields that char (covers \\ \" and \/).
  * A \uXXXX lead surrogate (D800..DBFF) must be followed immediately by
  * a \uXXXX tail surrogate (DC00..DFFF); the pair is combined into one
  * code point, e.g. \uD834\uDD1E -> U+1D11E -> f0 9d 84 9e.
  *
  * dest may be NULL, in which case nothing is stored and only the
  * lengths are computed.
  *
  * Returns the number of bytes consumed from src and stores the number
  * of decoded bytes in *destlen.  Returns 0 and leaves *destlen
  * untouched on invalid input: non-hex digits in \u, an unpaired
  * surrogate, or a backslash at the very end of the string. */
 size_t scan_jsonescape(const char *src,char *dest,size_t *destlen) {
   register const unsigned char* s=(const unsigned char*) src;
   const unsigned int none=(unsigned int)-1;	// marker: no lead surrogate pending
   size_t written=0,i;
   unsigned int prev=none,cur,todo;
   for (i=0; s[i]; ++i) {
     unsigned char c=s[i];
     // a pending lead surrogate must be followed directly by another \u escape
     if (prev!=none && !(c=='\\' && s[i+1]=='u')) return 0;
     if (c=='\\') {
       switch (s[i+1]) {
       case 0:
         return 0;	// dangling backslash; do not step over the terminator
       case 'u':
         {
           size_t j;
           for (cur=j=0; j<4; ++j) {
             // keep the result in an int: plain char may be unsigned,
             // which would hide the negative "not a hex digit" return
             int x=scan_fromhex(s[i+2+j]);
             if (x<0) return 0;	// not hex (or premature NUL) -> invalid input
             cur=(cur<<4) | (unsigned int)x;
           }
           if (cur>=0xd800 && cur<=0xdbff) {
             // lead surrogate: remember it and wait for the tail
             if (prev!=none) return 0;	// two lead surrogates in a row
             prev=cur;
             i+=5;	// skip 6 bytes in total, 1 is done by the for loop
             continue;	// write nothing yet
           } else if (cur>=0xdc00 && cur<=0xdfff) {
             if (prev==none) return 0;	// tail surrogate without lead
             // UTF-16 decoding: 10 bits from each half, offset by 0x10000
             todo=0x10000 + (((prev&0x3ff) << 10) | (cur&0x3ff));
           } else {
             if (prev!=none) return 0;	// lead surrogate not followed by tail
             todo=cur;
           }
           written+=fmt_utf8(dest?dest+written:dest,todo);
           i+=5;
           prev=none;
           continue;
         }
       case 'b': c='\b'; break;
       case 'f': c='\f'; break;
       case 'n': c='\n'; break;
       case 'r': c='\r'; break;
       case 't': c='\t'; break;
       default:
         c=s[i+1];	// \\ \" \/ and any unknown escape: take the char literally
         break;
       }
       ++i;	// skip the character after the backslash
     }
     if (dest) dest[written]=(char)c;
     ++written;
   }
   if (prev!=none) return 0;	// input ended right after a lead surrogate
   *destlen=written;
   return i;
 }