libowfat/textcode/scan_jsonescape.c

#include "fmt.h"
#include "textcode.h"
#include "scan.h"

size_t scan_jsonescape(const char *src,char *dest,size_t *destlen) {
  register const unsigned char* s=(const unsigned char*) src;
  size_t written=0,i;
  char c;
  unsigned int prev,cur,todo;
  prev=cur=(unsigned int)-1;
  for (i=0; s[i]; ++i) {
    if (s[i]=='"') {
      if (prev!=(unsigned int)-1) goto abort;
      goto done;
    }
    if ((c=s[i])=='\\') {
      switch (s[i+1]) {
      case '\\':
	// c='\\';	// c already is backslash
	break;
      case '"': c='"'; break;
      case 'n': c='\n'; break;
      case 'r': c='\r'; break;
      case 'b': c='\b'; break;
      case 'f': c='\f'; break;
      case 't': c='\t'; break;
      case 'u':
	{
	  size_t j;
	  for (cur=j=0; j<4; ++j) {
	    char x=scan_fromhex(s[i+2+j]);
	    if (x<0) goto abort;	// not hex -> invalid input
	    cur=(cur<<4) | x;
	  }
	  if (cur>=0xd800 && cur<=0xdbff) {
	    // utf-16 surrogate pair; needs to be followed by another
	    // surrogate. We need to read both and convert to UTF-8
	    if (prev!=(unsigned int)-1) goto abort;	// two lead surrogates
	    prev=cur;
	    i+=5;	// we want i to go up by 6, 1 is done by the for loop
	    continue;	// write nothing!
	  } else if (cur>=0xdc00 && cur<=0xdfff) {
	    if (prev==(unsigned int)-1) goto abort;	// no lead surrogate
	    todo=(cur&0x3ff) | ((prev&0x3ff) << 10) | 0x10000;
	  } else
	    todo=cur;
	  written+=fmt_utf8(dest?dest+written:dest,todo);
	  i+=5;
	  prev=-1;
	  continue;
	}
      default:
	c=s[i+1];
	break;
      }
      ++i;
    }
    if (prev!=(unsigned int)-1) goto abort;
    /* We expect utf-8 incoming. Make sure it's valid. */
    if (!scan_utf8(src+i,4,NULL)) goto abort;
    if (dest) dest[written]=c;
    ++written;
  }
done:
  *destlen=written;
  return i;
abort:
  if (prev!=(unsigned int)-1) i-=6; // if we abort and there still was an open surrogate pair, cancel it
  *destlen=written;
  return i;
}

#ifdef UNITTEST
#include <assert.h>
#include <string.h>
#undef UNITTEST
#include <scan/scan_fromhex.c>
#include <scan/scan_utf8.c>
#include <fmt/fmt_utf8.c>

int main() {
  char buf[100];
  size_t l;
  assert(scan_jsonescape("fnord",buf,&l)==5 && l==5 && !memcmp(buf,"fnord",5));
  /* is \n properly unescaped? */
  assert(scan_jsonescape("a\\nb",buf,&l)==4 && l==3 && !memcmp(buf,"a\nb",3));
  assert(scan_jsonescape("a\\rb",buf,&l)==4 && l==3 && !memcmp(buf,"a\rb",3));
  assert(scan_jsonescape("a\\bb",buf,&l)==4 && l==3 && !memcmp(buf,"a\bb",3));
  assert(scan_jsonescape("a\\fb",buf,&l)==4 && l==3 && !memcmp(buf,"a\fb",3));
  assert(scan_jsonescape("a\\tb",buf,&l)==4 && l==3 && !memcmp(buf,"a\tb",3));
  assert(scan_jsonescape("a\\\\b",buf,&l)==4 && l==3 && !memcmp(buf,"a\\b",3));
  assert(scan_jsonescape("a\\/b",buf,&l)==4 && l==3 && !memcmp(buf,"a/b",3));
  assert(scan_jsonescape("a\\\"b",buf,&l)==4 && l==3 && !memcmp(buf,"a\"b",3));
  /* does a double quote end the string? */
  assert(scan_jsonescape("a\"b",buf,&l)==1 && l==1 && !memcmp(buf,"a",1));
  /* how about unicode escape */
  assert(scan_jsonescape("a\\u005cb",buf,&l)==8 && l==3 && !memcmp(buf,"a\\b",3));
  /* a trailing surrogate pair with no lead before it */
  assert(scan_jsonescape("a\\udead\"",buf,&l)==1);
  /* a lead surrogate pair with no trailer behind it */
  assert(scan_jsonescape("a\\udafd\"",buf,&l)==1);
  assert(scan_jsonescape("a\\udafd\\udafd",buf,&l)==1);
  assert(scan_jsonescape("a\\udafd0",buf,&l)==1);
  /* correct surrogate pair */
  assert(scan_jsonescape("a\\ud834\\udd1eb",buf,&l)==14 && l==6 && !memcmp(buf,"a\xf0\x9d\x84\x9e""b",6));
  assert(scan_jsonescape("\\ud83d\\udca9x",buf,&l)==13 && l==5 && !memcmp(buf,"\xf0\x9f\x92\xa9x",5));
  /* how about some incorrect UTF-8? */
  assert(scan_jsonescape("a\xc0\xaf",buf,&l)==1 && l==1 && !memcmp(buf,"a",1));
  return 0;
}
#endif
add json encoding routines to textcode 11 years ago			`#include "fmt.h"`
			`#include "textcode.h"`
			`#include "scan.h"`

			`size_t scan_jsonescape(const char src,char dest,size_t *destlen) {`
			`register const unsigned char* s=(const unsigned char*) src;`
			`size_t written=0,i;`
			`char c;`
			`unsigned int prev,cur,todo;`
			`prev=cur=(unsigned int)-1;`
			`for (i=0; s[i]; ++i) {`
improve json escape decoder, add test suite 8 years ago			`if (s[i]=='"') {`
			`if (prev!=(unsigned int)-1) goto abort;`
			`goto done;`
			`}`
add json encoding routines to textcode 11 years ago			`if ((c=s[i])=='\\') {`
			`switch (s[i+1]) {`
			`case '\\':`
			`// c='\\'; // c already is backslash`
			`break;`
improve json escape decoder, add test suite 8 years ago			`case '"': c='"'; break;`
handle \n etc 8 years ago			`case 'n': c='\n'; break;`
			`case 'r': c='\r'; break;`
			`case 'b': c='\b'; break;`
			`case 'f': c='\f'; break;`
			`case 't': c='\t'; break;`
add json encoding routines to textcode 11 years ago			`case 'u':`
			`{`
			`size_t j;`
			`for (cur=j=0; j<4; ++j) {`
			`char x=scan_fromhex(s[i+2+j]);`
improve json escape decoder, add test suite 8 years ago			`if (x<0) goto abort; // not hex -> invalid input`
add json encoding routines to textcode 11 years ago			`cur=(cur<<4) \| x;`
			`}`
			`if (cur>=0xd800 && cur<=0xdbff) {`
			`// utf-16 surrogate pair; needs to be followed by another`
			`// surrogate. We need to read both and convert to UTF-8`
improve json escape decoder, add test suite 8 years ago			`if (prev!=(unsigned int)-1) goto abort; // two lead surrogates`
add json encoding routines to textcode 11 years ago			`prev=cur;`
			`i+=5; // we want i to go up by 6, 1 is done by the for loop`
			`continue; // write nothing!`
			`} else if (cur>=0xdc00 && cur<=0xdfff) {`
improve json escape decoder, add test suite 8 years ago			`if (prev==(unsigned int)-1) goto abort; // no lead surrogate`
			`todo=(cur&0x3ff) \| ((prev&0x3ff) << 10) \| 0x10000;`
add json encoding routines to textcode 11 years ago			`} else`
			`todo=cur;`
			`written+=fmt_utf8(dest?dest+written:dest,todo);`
			`i+=5;`
			`prev=-1;`
			`continue;`
			`}`
			`default:`
			`c=s[i+1];`
			`break;`
			`}`
			`++i;`
			`}`
improve json escape decoder, add test suite 8 years ago			`if (prev!=(unsigned int)-1) goto abort;`
			`/* We expect utf-8 incoming. Make sure it's valid. */`
remove signedness warnings 8 years ago			`if (!scan_utf8(src+i,4,NULL)) goto abort;`
add json encoding routines to textcode 11 years ago			`if (dest) dest[written]=c;`
			`++written;`
			`}`
scan_jsonescape ends when it sees an unescaped " 8 years ago			`done:`
add json encoding routines to textcode 11 years ago			`*destlen=written;`
			`return i;`
improve json escape decoder, add test suite 8 years ago			`abort:`
			`if (prev!=(unsigned int)-1) i-=6; // if we abort and there still was an open surrogate pair, cancel it`
			`*destlen=written;`
			`return i;`
add json encoding routines to textcode 11 years ago			`}`
remove signedness warnings 8 years ago
			`#ifdef UNITTEST`
			`#include <assert.h>`
a few more test suites escapecharc now supports non-ascii 8 years ago			`#include <string.h>`
bring scan_cescape up to speed 7 years ago			`#undef UNITTEST`
			`#include <scan/scan_fromhex.c>`
			`#include <scan/scan_utf8.c>`
			`#include <fmt/fmt_utf8.c>`
remove signedness warnings 8 years ago
			`int main() {`
			`char buf[100];`
			`size_t l;`
			`assert(scan_jsonescape("fnord",buf,&l)==5 && l==5 && !memcmp(buf,"fnord",5));`
			`/* is \n properly unescaped? */`
			`assert(scan_jsonescape("a\\nb",buf,&l)==4 && l==3 && !memcmp(buf,"a\nb",3));`
			`assert(scan_jsonescape("a\\rb",buf,&l)==4 && l==3 && !memcmp(buf,"a\rb",3));`
			`assert(scan_jsonescape("a\\bb",buf,&l)==4 && l==3 && !memcmp(buf,"a\bb",3));`
			`assert(scan_jsonescape("a\\fb",buf,&l)==4 && l==3 && !memcmp(buf,"a\fb",3));`
			`assert(scan_jsonescape("a\\tb",buf,&l)==4 && l==3 && !memcmp(buf,"a\tb",3));`
			`assert(scan_jsonescape("a\\\\b",buf,&l)==4 && l==3 && !memcmp(buf,"a\\b",3));`
			`assert(scan_jsonescape("a\\/b",buf,&l)==4 && l==3 && !memcmp(buf,"a/b",3));`
			`assert(scan_jsonescape("a\\\"b",buf,&l)==4 && l==3 && !memcmp(buf,"a\"b",3));`
			`/* does a double quote end the string? */`
			`assert(scan_jsonescape("a\"b",buf,&l)==1 && l==1 && !memcmp(buf,"a",1));`
			`/* how about unicode escape */`
			`assert(scan_jsonescape("a\\u005cb",buf,&l)==8 && l==3 && !memcmp(buf,"a\\b",3));`
			`/* a trailing surrogate pair with no lead before it */`
			`assert(scan_jsonescape("a\\udead\"",buf,&l)==1);`
			`/* a lead surrogate pair with no trailer behind it */`
			`assert(scan_jsonescape("a\\udafd\"",buf,&l)==1);`
			`assert(scan_jsonescape("a\\udafd\\udafd",buf,&l)==1);`
			`assert(scan_jsonescape("a\\udafd0",buf,&l)==1);`
			`/* correct surrogate pair */`
			`assert(scan_jsonescape("a\\ud834\\udd1eb",buf,&l)==14 && l==6 && !memcmp(buf,"a\xf0\x9d\x84\x9e""b",6));`
fix utf-8 json encoding for outside basic multilingual plane 7 years ago			`assert(scan_jsonescape("\\ud83d\\udca9x",buf,&l)==13 && l==5 && !memcmp(buf,"\xf0\x9f\x92\xa9x",5));`
remove signedness warnings 8 years ago			`/* how about some incorrect UTF-8? */`
			`assert(scan_jsonescape("a\xc0\xaf",buf,&l)==1 && l==1 && !memcmp(buf,"a",1));`
move unit tests into each file (compile with -DUNITTEST) 8 years ago			`return 0;`
remove signedness warnings 8 years ago			`}`
			`#endif`
bring scan_cescape up to speed 7 years ago