diff --git a/fmt/fmt_utf8.3 b/fmt/fmt_utf8.3
index ccbab94..9b22d46 100644
--- a/fmt/fmt_utf8.3
+++ b/fmt/fmt_utf8.3
@@ -15,5 +15,10 @@ number of bytes it would have written.
 
 For convenience, fmt.h defines the integer FMT_UTF8 to be big enough to
 contain every possible fmt_utf8 output.
+.SH NOTE
+fmt_utf8 and scan_utf8 implement the encoding from UTF-8, but are meant
+to be able to store integers, not just Unicode code points. Values
+larger than 0x10ffff are not valid UTF-8 (see RFC 3629) but can be
+represented in the encoding, so fmt_utf8 will allow them.
 .SH "SEE ALSO"
 scan_utf8(3)
diff --git a/scan/scan_utf8.3 b/scan/scan_utf8.3
index 5512e24..8c9a346 100644
--- a/scan/scan_utf8.3
+++ b/scan/scan_utf8.3
@@ -17,5 +17,10 @@ sequence, scan_utf8 returns 0 and does not touch \fIdest\fR.
 The length of the longest UTF-8 sequence is 5. If the buffer is longer
 than that, and scan_utf8 fails, then the data was not a valid UTF-8
 encoded sequence.
+.SH NOTE
+fmt_utf8 and scan_utf8 implement the encoding from UTF-8, but are meant
+to be able to store integers, not just Unicode code points. Values
+above 0x10ffff are not valid UTF-8. If you are using this function to
+parse UTF-8, you need to reject them (see RFC 3629).
 .SH "SEE ALSO"
 fmt_utf8(3)
diff --git a/t.c b/t.c
index ebf877f..82206f9 100644
--- a/t.c
+++ b/t.c
@@ -29,11 +29,11 @@
 #include "io_internal.h"
 
 #define rdtscl(low) \
-  __asm__ __volatile__ ("rdtsc" : "=a" (low) : : "edx")
+  __asm__ __volatile__ ("lfence\nrdtsc" : "=a" (low) : : "edx")
 
 // #define atomic_add(mem,val) asm volatile ("lock; add%z0 %1, %0": "+m" (mem): "ir" (val))
 
-int64 writecb(int64 fd,const void* buf,uint64 n) {
+static int64 writecb(int64 fd,const void* buf,uint64 n) {
   (void)fd;
   (void)buf;
   (void)n;
@@ -48,6 +48,26 @@ int64 writecb(int64 fd,const void* buf,uint64 n) {
 }
 
 int main(int argc,char* argv[]) {
+  char buf[1024];
+  size_t l;
+  assert(fmt_jsonescape(buf,"foo\nbar\\",8)==14 && byte_equal(buf,14,"foo\\u000abar\\\\"));
+  memset(buf,0,sizeof(buf));
+  assert(scan_jsonescape("foo\\u000abar\\\\",buf,&l)==14 && l==8 && byte_equal(buf,8,"foo\nbar\\"));
+  memset(buf,0,sizeof(buf));
+  /* example from the json spec: G clef U+1D11E encoded using UTF-16 surrogates*/
+  assert(scan_jsonescape("\\uD834\\uDD1Exyz",buf,&l)==15 && l==7 && byte_equal(buf,7,"\xf0\x9d\x84\x9exyz"));
+  /* a tail surrogate without a preceding lead surrogate is malformed */
+  assert(scan_jsonescape("\\uDD1E",buf,&l)==0);
+  /* the short escapes decode to control characters, not to letters */
+  assert(scan_jsonescape("a\\nb",buf,&l)==4 && l==3 && byte_equal(buf,3,"a\nb"));
+
+/*
+  1D11E -> 0001 1101 0001 0001 1110
+        -> ______00 __011101 __000100 __011110
+  as utf8: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+           11110000 10011101 10000100 10011110
+           f   0    9   d    8   4    9   e
+*/
+
+#if 0
   static size_t x;
   x=23;
   atomic_add(&x,3);
@@ -55,6 +75,7 @@ int main(int argc,char* argv[]) {
   printf("%u\n",atomic_add_return(&x,-3));
   printf("%u\n",compare_and_swap(&x,26,17));
   printf("%u\n",compare_and_swap(&x,23,17));
+#endif
 #if 0
   atomic_add(&x,3);
   printf("%u\n",x);
@@ -361,7 +382,7 @@ int main(int argc,char* argv[]) {
 #endif
 #if 0
   printf("%d %d\n",strcmp("foo","bar"),str_diff("foo","bar"));
-  printf("%d %d\n",strcmp("foo","üar"),str_diff("foo","üar"));
+  printf("%d %d\n",strcmp("foo","\xfcar"),str_diff("foo","\xfcar"));
 #endif
 #if 0
   {
diff --git a/textcode.h b/textcode.h
index 2c26c17..bf6a089 100644
--- a/textcode.h
+++ b/textcode.h
@@ -11,35 +11,39 @@ extern "C" {
 
 /* These take len bytes from src and write them in encoded form to
  * dest (if dest != NULL), returning the number of bytes written. */
-/* needs len/3*4 bytes */
+/* Needs len/3*4 bytes */
 size_t fmt_uuencoded(char* dest,const char* src,size_t len);
-/* needs len/3*4 bytes */
+/* Needs len/3*4 bytes */
 size_t fmt_base64(char* dest,const char* src,size_t len);
-/* worst case: len*3 */
+/* Worst case: len*3 */
 size_t fmt_quotedprintable(char* dest,const char* src,size_t len);
-/* worst case: len*3 */
+/* Worst case: len*3 */
 size_t fmt_quotedprintable2(char* dest,const char* src,size_t len,const char* escapeme);
-/* worst case: len*3 */
+/* Worst case: len*3 */
 size_t fmt_urlencoded(char* dest,const char* src,size_t len);
-/* worst case: len*3 */
+/* Worst case: len*3 */
 size_t fmt_urlencoded2(char* dest,const char* src,size_t len,const char* escapeme);
-/* worst case: len*2 */
+/* Worst case: len*2 */
 size_t fmt_yenc(char* dest,const char* src,size_t len);
-/* needs len*2 bytes */
+/* Needs len*2 bytes */
 size_t fmt_hexdump(char* dest,const char* src,size_t len);
-/* change '<' to '&lt;' and '&' to '&amp;'; worst case: len*5 */
+/* Change '<' to '&lt;' and '&' to '&amp;'; worst case: len*5 */
 size_t fmt_html(char* dest,const char* src,size_t len);
-/* change '\' to "\\", '\n' to "\n", ^A to "\x01" etc; worst case: len*4 */
+/* Change '\' to "\\", '\n' to "\n", ^A to "\x01" etc; worst case: len*4 */
 size_t fmt_cescape(char* dest,const char* src,size_t len);
-/* worst case: len*4 */
+/* Worst case: len*4 */
 size_t fmt_cescape2(char* dest,const char* src,size_t len,const char* escapeme);
-/* fold awk whitespace to '_'; this is great for writing fields with
+/* Fold whitespace to '_'; this is great for writing fields with
  * white spaces to a log file and still allow awk to do log analysis */
-/* worst case: same size */
+/* Worst case: same size */
 size_t fmt_foldwhitespace(char* dest,const char* src,size_t len);
-/* worst case: len*3 */
+/* Worst case: len*3 */
 size_t fmt_ldapescape(char* dest,const char* src,size_t len);
 size_t fmt_ldapescape2(char* dest,const char* src,size_t len,const char* escapeme);
+/* Encode JSON string from UTF-8; will backslash-escape the bare minimum.
+ * Will not verify that the input is valid UTF-8!
+ * Worst case: len*6 */
+size_t fmt_jsonescape(char* dest,const char* src,size_t len);
 
 /* These read one line from src, decoded it, and write the result to
  * dest. The number of decoded bytes is written to destlen. dest
@@ -54,6 +58,7 @@ size_t scan_hexdump(const char *src,char *dest,size_t *destlen);
 size_t scan_html(const char *src,char *dest,size_t *destlen);
 size_t scan_cescape(const char *src,char *dest,size_t *destlen);
 size_t scan_ldapescape(const char* src,char* dest,size_t *destlen);
+size_t scan_jsonescape(const char* src,char* dest,size_t *destlen);
 
 #ifdef STRALLOC_H
 /* WARNING: these functions _append_ to the stralloc, not overwrite! */
@@ -78,6 +83,8 @@ size_t scan_to_sa(size_t (*func)(const char*,char*,size_t*),
 #define fmt_hexdump_sa(sa,src,len) fmt_to_sa(fmt_hexdump,sa,src,len)
 #define fmt_html_sa(sa,src,len) fmt_to_sa(fmt_html,sa,src,len)
 #define fmt_cescape_sa(sa,src,len) fmt_to_sa(fmt_cescape,sa,src,len)
+#define fmt_ldapescape_sa(sa,src,len) fmt_to_sa(fmt_ldapescape,sa,src,len)
+#define fmt_jsonescape_sa(sa,src,len) fmt_to_sa(fmt_jsonescape,sa,src,len)
 
 #define fmt_quotedprintable2_sa(sa,src,len,escapeme) fmt_to_sa2(fmt_quotedprintable2,sa,src,len,escapeme)
 #define fmt_urlencoded2_sa(sa,src,len,escapeme) fmt_to_sa2(fmt_urlencoded2,sa,src,len,escapeme)
@@ -91,6 +98,8 @@ size_t scan_to_sa(size_t (*func)(const char*,char*,size_t*),
 #define scan_hexdump_sa(src,sa) scan_to_sa(scan_hexdump,src,sa)
 #define scan_html_sa(src,sa) scan_to_sa(scan_html,src,sa)
 #define scan_cescape_sa(src,sa) scan_to_sa(scan_cescape,src,sa)
+#define scan_ldapescape_sa(src,sa) scan_to_sa(scan_ldapescape,src,sa)
+#define scan_jsonescape_sa(src,sa) scan_to_sa(scan_jsonescape,src,sa)
 #endif
 
 #ifdef ARRAY_H
diff --git a/textcode/fmt_jsonescape.c b/textcode/fmt_jsonescape.c
new file mode 100644
index 0000000..e919010
--- /dev/null
+++ b/textcode/fmt_jsonescape.c
@@ -0,0 +1,42 @@
+#include "fmt.h"
+#include "textcode.h"
+#include "str.h"
+#include "haveinline.h"
+
+/* src is UTF-8 encoded */
+size_t fmt_jsonescape(char* dest,const char* src,size_t len) {
+  register const unsigned char* s=(const unsigned char*) src;
+  size_t written=0,i;
+  for (i=0; i<len; ++i) {
+    switch (s[i]) {
+    case '\\':
+    case '"':
+      if (dest) {
+        dest[written]='\\';
+        dest[written+1]=s[i];
+      }
+      written+=2;
+      break;
+    default:
+      if (s[i]<' ') {
+        if (dest) {
+          dest[written]='\\';
+          dest[written+1]='u';
+          dest[written+2]='0';
+          dest[written+3]='0';
+          dest[written+4]=fmt_tohex(s[i]>>4);
+          dest[written+5]=fmt_tohex(s[i]&0xf);
+        }
+        written+=6;
+      } else {
+        if (dest) dest[written]=s[i];
+        ++written;
+      }
+      break;
+    }
+    /* in case someone gives us malicious input */
+    if (written>((size_t)-1)/2) return (size_t)-1;
+  }
+  return written;
+}
+
diff --git a/textcode/scan_jsonescape.c b/textcode/scan_jsonescape.c
new file mode 100644
index 0000000..ab6c992
--- /dev/null
+++ b/textcode/scan_jsonescape.c
@@ -0,0 +1,73 @@
+#include "fmt.h"
+#include "textcode.h"
+#include "scan.h"
+
+/* Decode the JSON string escapes in the NUL-terminated string src.
+ * The decoded bytes go to dest (dest may be NULL to only measure) and
+ * their count to *destlen. \uXXXX escapes are written as UTF-8; a
+ * UTF-16 surrogate pair is first combined into one code point.
+ * Returns the number of bytes read from src, or 0 if the input is
+ * malformed: bad hex digits, an unpaired surrogate, or a backslash at
+ * the very end. *destlen is not written on failure. */
+size_t scan_jsonescape(const char *src,char *dest,size_t *destlen) {
+  register const unsigned char* s=(const unsigned char*) src;
+  size_t written=0,i;
+  char c;
+  unsigned int prev,cur,todo;
+  prev=cur=(unsigned int)-1;  // prev: pending lead surrogate, -1 if none
+  for (i=0; s[i]; ++i) {
+    if ((c=s[i])=='\\') {
+      switch (s[i+1]) {
+      case 0:
+        return 0;  // backslash at end of input; never step over the NUL
+      case 'u':
+        {
+          size_t j;
+          for (cur=j=0; j<4; ++j) {
+            // keep this in an int: plain char may be unsigned, and then
+            // the x<0 check below could never fire
+            int x=scan_fromhex(s[i+2+j]);
+            if (x<0) return 0;  // not hex -> invalid input
+            cur=(cur<<4) | (unsigned int)x;
+          }
+          if (cur>=0xd800 && cur<=0xdbff) {
+            // utf-16 lead surrogate; needs to be followed by a tail
+            // surrogate. We need to read both and convert to UTF-8
+            if (prev!=(unsigned int)-1) return 0;  // two lead surrogates
+            prev=cur;
+            i+=5;  // we want i to go up by 6, 1 is done by the for loop
+            continue;  // write nothing!
+          } else if (cur>=0xdc00 && cur<=0xdfff) {
+            if (prev==(unsigned int)-1) return 0;  // tail surrogate without lead surrogate
+            // 10 payload bits from each half, offset by 0x10000 (the
+            // first code point beyond the BMP)
+            todo=0x10000 + ((prev&0x3ff) << 10) + (cur&0x3ff);
+          } else {
+            if (prev!=(unsigned int)-1) return 0;  // lead surrogate not followed by tail surrogate
+            todo=cur;
+          }
+          written+=fmt_utf8(dest?dest+written:dest,todo);
+          i+=5;
+          prev=(unsigned int)-1;
+          continue;
+        }
+      case 'b': c='\b'; break;
+      case 'f': c='\f'; break;
+      case 'n': c='\n'; break;
+      case 'r': c='\r'; break;
+      case 't': c='\t'; break;
+      default:
+        // \\ \" \/ and, leniently, any other escaped character stand for themselves
+        c=s[i+1];
+        break;
+      }
+      ++i;
+    }
+    if (prev!=(unsigned int)-1) return 0;  // lead surrogate not followed by tail surrogate
+    if (dest) dest[written]=c;
+    ++written;
+  }
+  if (prev!=(unsigned int)-1) return 0;  // input ended right after a lead surrogate
+  *destlen=written;
+  return i;
+}