add json encoding routines to textcode

master
leitner 11 years ago
parent 66cb1c1c82
commit 1d2add208f

@ -15,5 +15,10 @@ number of bytes it would have written.
For convenience, fmt.h defines the integer FMT_UTF8 to be big enough to
contain every possible fmt_utf8 output.
.SH NOTE
fmt_utf8 and scan_utf8 implement the UTF-8 encoding scheme, but are meant
to be able to store arbitrary integers, not just Unicode code points. Values
larger than 0x10ffff are not valid UTF-8 (see RFC 3629) but can be
represented in the encoding, so fmt_utf8 will allow them.
.SH "SEE ALSO"
scan_utf8(3)

@ -17,5 +17,10 @@ sequence, scan_utf8 returns 0 and does not touch \fIdest\fR.
The length of the longest UTF-8 sequence is 5. If the buffer is longer
than that, and scan_utf8 fails, then the data was not a valid UTF-8
encoded sequence.
.SH NOTE
fmt_utf8 and scan_utf8 implement the UTF-8 encoding scheme, but are meant
to be able to store arbitrary integers, not just Unicode code points. Values
above 0x10ffff are not valid UTF-8. If you are using this function to
parse UTF-8, you need to reject them yourself (see RFC 3629).
.SH "SEE ALSO"
fmt_utf8(3)

25
t.c

@ -29,11 +29,11 @@
#include "io_internal.h"
#define rdtscl(low) \
__asm__ __volatile__ ("rdtsc" : "=a" (low) : : "edx")
__asm__ __volatile__ ("lfence\nrdtsc" : "=a" (low) : : "edx")
// #define atomic_add(mem,val) asm volatile ("lock; add%z0 %1, %0": "+m" (mem): "ir" (val))
int64 writecb(int64 fd,const void* buf,uint64 n) {
static int64 writecb(int64 fd,const void* buf,uint64 n) {
(void)fd;
(void)buf;
(void)n;
@ -48,6 +48,24 @@ int64 writecb(int64 fd,const void* buf,uint64 n) {
}
int main(int argc,char* argv[]) {
char buf[1024];
size_t l;
assert(fmt_jsonescape(buf,"foo\nbar\\",8)==14 && byte_equal(buf,14,"foo\\u000abar\\\\"));
memset(buf,0,sizeof(buf));
assert(scan_jsonescape("foo\\u000abar\\\\",buf,&l)==14 && l==8 && byte_equal(buf,8,"foo\nbar\\"));
memset(buf,0,sizeof(buf));
/* example from the json spec: G clef U+1D11E encoded using UTF-16 surrogates*/
assert(scan_jsonescape("\\uD834\\uDD1Exyz",buf,&l)==15 && l==7 && byte_equal(buf,7,"\xf4\x8d\x84\x9exyz"));
/*
1D11E -> 0001 1101 0001 0001 1110
-> ______00 __011101 __000100 __011110
as utf8: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
11110000 10011101 10000100 10011110
f 0 9 d 8 4 9 e
*/
#if 0
static size_t x;
x=23;
atomic_add(&x,3);
@ -55,6 +73,7 @@ int main(int argc,char* argv[]) {
printf("%u\n",atomic_add_return(&x,-3));
printf("%u\n",compare_and_swap(&x,26,17));
printf("%u\n",compare_and_swap(&x,23,17));
#endif
#if 0
atomic_add(&x,3); printf("%u\n",x);
@ -361,7 +380,7 @@ int main(int argc,char* argv[]) {
#endif
#if 0
printf("%d %d\n",strcmp("foo","bar"),str_diff("foo","bar"));
printf("%d %d\n",strcmp("foo","üar"),str_diff("foo","üar"));
printf("%d %d\n",strcmp("foo","\xfcar"),str_diff("foo","\xfcar"));
#endif
#if 0
{

@ -11,35 +11,39 @@ extern "C" {
/* These take len bytes from src and write them in encoded form to
* dest (if dest != NULL), returning the number of bytes written. */
/* needs len/3*4 bytes */
/* Needs len/3*4 bytes */
size_t fmt_uuencoded(char* dest,const char* src,size_t len);
/* needs len/3*4 bytes */
/* Needs len/3*4 bytes */
size_t fmt_base64(char* dest,const char* src,size_t len);
/* worst case: len*3 */
/* Worst case: len*3 */
size_t fmt_quotedprintable(char* dest,const char* src,size_t len);
/* worst case: len*3 */
/* Worst case: len*3 */
size_t fmt_quotedprintable2(char* dest,const char* src,size_t len,const char* escapeme);
/* worst case: len*3 */
/* Worst case: len*3 */
size_t fmt_urlencoded(char* dest,const char* src,size_t len);
/* worst case: len*3 */
/* Worst case: len*3 */
size_t fmt_urlencoded2(char* dest,const char* src,size_t len,const char* escapeme);
/* worst case: len*2 */
/* Worst case: len*2 */
size_t fmt_yenc(char* dest,const char* src,size_t len);
/* needs len*2 bytes */
/* Needs len*2 bytes */
size_t fmt_hexdump(char* dest,const char* src,size_t len);
/* change '<' to '&lt;' and '&' to '&amp;'; worst case: len*5 */
/* Change '<' to '&lt;' and '&' to '&amp;'; worst case: len*5 */
size_t fmt_html(char* dest,const char* src,size_t len);
/* change '\' to "\\", '\n' to "\n", ^A to "\x01" etc; worst case: len*4 */
/* Change '\' to "\\", '\n' to "\n", ^A to "\x01" etc; worst case: len*4 */
size_t fmt_cescape(char* dest,const char* src,size_t len);
/* worst case: len*4 */
/* Worst case: len*4 */
size_t fmt_cescape2(char* dest,const char* src,size_t len,const char* escapeme);
/* fold awk whitespace to '_'; this is great for writing fields with
/* Fold whitespace to '_'; this is great for writing fields with
* white spaces to a log file and still allow awk to do log analysis */
/* worst case: same size */
/* Worst case: same size */
size_t fmt_foldwhitespace(char* dest,const char* src,size_t len);
/* worst case: len*3 */
/* Worst case: len*3 */
size_t fmt_ldapescape(char* dest,const char* src,size_t len);
size_t fmt_ldapescape2(char* dest,const char* src,size_t len,const char* escapeme);
/* Encode len bytes from src as the contents of a JSON string.
 * Only the bare minimum is escaped: '"' and '\' get a backslash prefix,
 * and bytes below 0x20 are written as \u00XX. Everything else is copied
 * verbatim; the input is expected to be UTF-8 but this is NOT verified!
 * If dest is NULL, nothing is written and only the length is computed.
 * Returns the number of bytes the encoded form occupies, or (size_t)-1
 * if that length grows beyond half the range of size_t.
 * Worst case: len*6 */
size_t fmt_jsonescape(char* dest,const char* src,size_t len);
/* These read one line from src, decode it, and write the result to
* dest. The number of decoded bytes is written to destlen. dest
@ -54,6 +58,7 @@ size_t scan_hexdump(const char *src,char *dest,size_t *destlen);
size_t scan_html(const char *src,char *dest,size_t *destlen);
size_t scan_cescape(const char *src,char *dest,size_t *destlen);
size_t scan_ldapescape(const char* src,char* dest,size_t *destlen);
/* Decode JSON string escapes from the NUL-terminated src; \uXXXX escapes
 * are written to dest in UTF-8 form. dest may be NULL to only compute
 * the decoded length. Returns the number of bytes read from src, or 0
 * if the input is rejected as invalid. */
size_t scan_jsonescape(const char* src,char* dest,size_t *destlen);
#ifdef STRALLOC_H
/* WARNING: these functions _append_ to the stralloc, not overwrite! */
@ -78,6 +83,8 @@ size_t scan_to_sa(size_t (*func)(const char*,char*,size_t*),
#define fmt_hexdump_sa(sa,src,len) fmt_to_sa(fmt_hexdump,sa,src,len)
#define fmt_html_sa(sa,src,len) fmt_to_sa(fmt_html,sa,src,len)
#define fmt_cescape_sa(sa,src,len) fmt_to_sa(fmt_cescape,sa,src,len)
#define fmt_ldapescape_sa(sa,src,len) fmt_to_sa(fmt_ldapescape,sa,src,len)
#define fmt_jsonescape_sa(sa,src,len) fmt_to_sa(fmt_jsonescape,sa,src,len)
#define fmt_quotedprintable2_sa(sa,src,len,escapeme) fmt_to_sa2(fmt_quotedprintable2,sa,src,len,escapeme)
#define fmt_urlencoded2_sa(sa,src,len,escapeme) fmt_to_sa2(fmt_urlencoded2,sa,src,len,escapeme)
@ -91,6 +98,8 @@ size_t scan_to_sa(size_t (*func)(const char*,char*,size_t*),
#define scan_hexdump_sa(src,sa) scan_to_sa(scan_hexdump,src,sa)
#define scan_html_sa(src,sa) scan_to_sa(scan_html,src,sa)
#define scan_cescape_sa(src,sa) scan_to_sa(scan_cescape,src,sa)
#define scan_ldapescape_sa(src,sa) scan_to_sa(scan_ldapescape,src,sa)
#define scan_jsonescape_sa(src,sa) scan_to_sa(scan_jsonescape,src,sa)
#endif
#ifdef ARRAY_H

@ -0,0 +1,42 @@
#include "fmt.h"
#include "textcode.h"
#include "str.h"
#include "haveinline.h"
/* Write the JSON-escaped form of the len bytes at src to dest.
 *
 * src is expected to hold UTF-8, but the bytes are not validated.
 * A double quote or backslash is prefixed with a backslash, a byte
 * below 0x20 becomes the six-byte sequence \u00XX, and every other
 * byte is copied unchanged.
 *
 * If dest is NULL nothing is stored and only the length is computed.
 * Returns the number of bytes the escaped form occupies, or (size_t)-1
 * once the running length exceeds half the range of size_t. */
size_t fmt_jsonescape(char* dest,const char* src,size_t len) {
  const unsigned char* in=(const unsigned char*) src;
  size_t out=0;
  size_t left;

  for (left=len; left; --left, ++in) {
    const unsigned char ch=*in;

    if (ch=='\\' || ch=='"') {
      /* two-byte escape: backslash followed by the character itself */
      if (dest) {
        dest[out]='\\';
        dest[out+1]=ch;
      }
      out+=2;
    } else if (ch<' ') {
      /* control character: \u00 followed by two hex digits */
      if (dest) {
        char* p=dest+out;
        p[0]='\\';
        p[1]='u';
        p[2]='0';
        p[3]='0';
        p[4]=fmt_tohex(ch>>4);
        p[5]=fmt_tohex(ch&0xf);
      }
      out+=6;
    } else {
      /* everything else passes through untouched */
      if (dest) dest[out]=ch;
      ++out;
    }

    /* guard against the length counter wrapping on hostile input */
    if (out>((size_t)-1)/2) return (size_t)-1;
  }
  return out;
}

@ -0,0 +1,54 @@
#include "fmt.h"
#include "textcode.h"
#include "scan.h"
/* Decode the body of a JSON string (RFC 8259, section 7) into UTF-8.
 *
 * src      NUL-terminated input, without the surrounding double quotes.
 * dest     output buffer, or NULL to only compute the decoded length.
 * destlen  receives the number of decoded bytes; only stored on success.
 *
 * Handled escapes: \uXXXX (a UTF-16 surrogate pair written as two
 * consecutive \u escapes is combined into one code point), and the
 * control escapes \b \f \n \r \t.  For any other character following
 * a backslash (including '\\', '"' and '/') the character itself is
 * emitted.
 *
 * Returns the number of bytes consumed from src.  Returns 0 without
 * touching *destlen if the input is malformed: a \u escape that is not
 * followed by four hex digits, a lead surrogate that is not immediately
 * followed by a tail surrogate, a tail surrogate without a lead, or a
 * backslash as the very last byte.  (An empty src also returns 0, but
 * stores 0 in *destlen.) */
size_t scan_jsonescape(const char *src,char *dest,size_t *destlen) {
  const unsigned char* s=(const unsigned char*) src;
  const unsigned int none=(unsigned int)-1;	/* no lead surrogate pending */
  unsigned int prev=none;
  size_t written=0,i;

  for (i=0; s[i]; ++i) {
    char c=(char)s[i];

    if (c!='\\') {
      /* a lead surrogate must be followed directly by a \uDC00..\uDFFF escape */
      if (prev!=none) return 0;
    } else if (s[i+1]=='u') {
      unsigned int cur=0,todo;
      size_t j;
      for (j=0; j<4; ++j) {
        /* int, not char: plain char may be unsigned and hide the -1 */
        int x=scan_fromhex(s[i+2+j]);
        if (x<0 || x>15) return 0;	/* not hex (or string ended) -> invalid input */
        cur=(cur<<4) | (unsigned int)x;
      }
      if (cur>=0xd800 && cur<=0xdbff) {
        /* lead surrogate: remember it and wait for the tail */
        if (prev!=none) return 0;	/* two lead surrogates in a row */
        prev=cur;
        i+=5;	/* skip 6 bytes in total, 1 is done by the for loop */
        continue;	/* write nothing yet */
      }
      if (cur>=0xdc00 && cur<=0xdfff) {
        if (prev==none) return 0;	/* tail surrogate without a lead */
        /* code point = 0x10000 + (lead's low 10 bits << 10 | tail's low 10 bits) */
        todo=0x10000 + (((prev&0x3ff) << 10) | (cur&0x3ff));
      } else {
        if (prev!=none) return 0;	/* lead surrogate followed by a non-surrogate */
        todo=cur;
      }
      written+=fmt_utf8(dest?dest+written:dest,todo);
      prev=none;
      i+=5;
      continue;
    } else {
      if (prev!=none) return 0;	/* lead surrogate not followed by tail surrogate */
      switch (s[i+1]) {
      case 0:
        /* lone backslash at end of input; stepping over it would run
         * past the terminating NUL */
        return 0;
      case 'b': c='\b'; break;
      case 'f': c='\f'; break;
      case 'n': c='\n'; break;
      case 'r': c='\r'; break;
      case 't': c='\t'; break;
      default:
        /* covers \\ \" \/ and, leniently, any other escaped character */
        c=(char)s[i+1];
        break;
      }
      ++i;	/* consume the character after the backslash */
    }

    if (dest) dest[written]=c;
    ++written;
  }

  if (prev!=none) return 0;	/* input ended in the middle of a surrogate pair */
  *destlen=written;
  return i;
}
Loading…
Cancel
Save