add json encoding routines to textcode
This commit is contained in:
parent
66cb1c1c82
commit
1d2add208f
@ -15,5 +15,10 @@ number of bytes it would have written.
|
||||
|
||||
For convenience, fmt.h defines the integer FMT_UTF8 to be big enough to
|
||||
contain every possible fmt_utf8 output.
|
||||
.SH NOTE
|
||||
fmt_utf8 and scan_utf8 implement the encoding from UTF-8, but are meant
|
||||
to be able to store integers, not just Unicode code points. Values
|
||||
larger than 0x10ffff are not valid UTF-8 (see RFC 3629) but can be
|
||||
represented in the encoding, so fmt_utf8 will allow them.
|
||||
.SH "SEE ALSO"
|
||||
scan_utf8(3)
|
||||
|
@ -17,5 +17,10 @@ sequence, scan_utf8 returns 0 and does not touch \fIdest\fR.
|
||||
The length of the longest UTF-8 sequence is 5. If the buffer is longer
|
||||
than that, and scan_utf8 fails, then the data was not a valid UTF-8
|
||||
encoded sequence.
|
||||
.SH NOTE
|
||||
fmt_utf8 and scan_utf8 implement the encoding from UTF-8, but are meant
|
||||
to be able to store integers, not just Unicode code points. Values
|
||||
above 0x10ffff are not valid UTF-8. If you are using this function to
|
||||
parse UTF-8, you need to reject them (see RFC 3629).
|
||||
.SH "SEE ALSO"
|
||||
fmt_utf8(3)
|
||||
|
25
t.c
25
t.c
@ -29,11 +29,11 @@
|
||||
#include "io_internal.h"
|
||||
|
||||
#define rdtscl(low) \
|
||||
__asm__ __volatile__ ("rdtsc" : "=a" (low) : : "edx")
|
||||
__asm__ __volatile__ ("lfence\nrdtsc" : "=a" (low) : : "edx")
|
||||
|
||||
// #define atomic_add(mem,val) asm volatile ("lock; add%z0 %1, %0": "+m" (mem): "ir" (val))
|
||||
|
||||
int64 writecb(int64 fd,const void* buf,uint64 n) {
|
||||
static int64 writecb(int64 fd,const void* buf,uint64 n) {
|
||||
(void)fd;
|
||||
(void)buf;
|
||||
(void)n;
|
||||
@ -48,6 +48,24 @@ int64 writecb(int64 fd,const void* buf,uint64 n) {
|
||||
}
|
||||
|
||||
int main(int argc,char* argv[]) {
|
||||
char buf[1024];
|
||||
size_t l;
|
||||
assert(fmt_jsonescape(buf,"foo\nbar\\",8)==14 && byte_equal(buf,14,"foo\\u000abar\\\\"));
|
||||
memset(buf,0,sizeof(buf));
|
||||
assert(scan_jsonescape("foo\\u000abar\\\\",buf,&l)==14 && l==8 && byte_equal(buf,8,"foo\nbar\\"));
|
||||
memset(buf,0,sizeof(buf));
|
||||
/* example from the json spec: G clef U+1D11E encoded using UTF-16 surrogates*/
|
||||
assert(scan_jsonescape("\\uD834\\uDD1Exyz",buf,&l)==15 && l==7 && byte_equal(buf,7,"\xf4\x8d\x84\x9exyz"));
|
||||
|
||||
/*
|
||||
1D11E -> 0001 1101 0001 0001 1110
|
||||
-> ______00 __011101 __000100 __011110
|
||||
as utf8: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
|
||||
11110000 10011101 10000100 10011110
|
||||
f 0 9 d 8 4 9 e
|
||||
*/
|
||||
|
||||
#if 0
|
||||
static size_t x;
|
||||
x=23;
|
||||
atomic_add(&x,3);
|
||||
@ -55,6 +73,7 @@ int main(int argc,char* argv[]) {
|
||||
printf("%u\n",atomic_add_return(&x,-3));
|
||||
printf("%u\n",compare_and_swap(&x,26,17));
|
||||
printf("%u\n",compare_and_swap(&x,23,17));
|
||||
#endif
|
||||
|
||||
#if 0
|
||||
atomic_add(&x,3); printf("%u\n",x);
|
||||
@ -361,7 +380,7 @@ int main(int argc,char* argv[]) {
|
||||
#endif
|
||||
#if 0
|
||||
printf("%d %d\n",strcmp("foo","bar"),str_diff("foo","bar"));
|
||||
printf("%d %d\n",strcmp("foo","üar"),str_diff("foo","üar"));
|
||||
printf("%d %d\n",strcmp("foo","\xfcar"),str_diff("foo","\xfcar"));
|
||||
#endif
|
||||
#if 0
|
||||
{
|
||||
|
37
textcode.h
37
textcode.h
@ -11,35 +11,39 @@ extern "C" {
|
||||
/* These take len bytes from src and write them in encoded form to
|
||||
* dest (if dest != NULL), returning the number of bytes written. */
|
||||
|
||||
/* needs len/3*4 bytes */
|
||||
/* Needs len/3*4 bytes */
|
||||
size_t fmt_uuencoded(char* dest,const char* src,size_t len);
|
||||
/* needs len/3*4 bytes */
|
||||
/* Needs len/3*4 bytes */
|
||||
size_t fmt_base64(char* dest,const char* src,size_t len);
|
||||
/* worst case: len*3 */
|
||||
/* Worst case: len*3 */
|
||||
size_t fmt_quotedprintable(char* dest,const char* src,size_t len);
|
||||
/* worst case: len*3 */
|
||||
/* Worst case: len*3 */
|
||||
size_t fmt_quotedprintable2(char* dest,const char* src,size_t len,const char* escapeme);
|
||||
/* worst case: len*3 */
|
||||
/* Worst case: len*3 */
|
||||
size_t fmt_urlencoded(char* dest,const char* src,size_t len);
|
||||
/* worst case: len*3 */
|
||||
/* Worst case: len*3 */
|
||||
size_t fmt_urlencoded2(char* dest,const char* src,size_t len,const char* escapeme);
|
||||
/* worst case: len*2 */
|
||||
/* Worst case: len*2 */
|
||||
size_t fmt_yenc(char* dest,const char* src,size_t len);
|
||||
/* needs len*2 bytes */
|
||||
/* Needs len*2 bytes */
|
||||
size_t fmt_hexdump(char* dest,const char* src,size_t len);
|
||||
/* change '<' to '&lt;' and '&' to '&amp;'; worst case: len*5 */
|
||||
/* Change '<' to '&lt;' and '&' to '&amp;'; worst case: len*5 */
|
||||
size_t fmt_html(char* dest,const char* src,size_t len);
|
||||
/* change '\' to "\\", '\n' to "\n", ^A to "\x01" etc; worst case: len*4 */
|
||||
/* Change '\' to "\\", '\n' to "\n", ^A to "\x01" etc; worst case: len*4 */
|
||||
size_t fmt_cescape(char* dest,const char* src,size_t len);
|
||||
/* worst case: len*4 */
|
||||
/* Worst case: len*4 */
|
||||
size_t fmt_cescape2(char* dest,const char* src,size_t len,const char* escapeme);
|
||||
/* fold awk whitespace to '_'; this is great for writing fields with
|
||||
/* Fold whitespace to '_'; this is great for writing fields with
|
||||
* white spaces to a log file and still allow awk to do log analysis */
|
||||
/* worst case: same size */
|
||||
/* Worst case: same size */
|
||||
size_t fmt_foldwhitespace(char* dest,const char* src,size_t len);
|
||||
/* worst case: len*3 */
|
||||
/* Worst case: len*3 */
|
||||
size_t fmt_ldapescape(char* dest,const char* src,size_t len);
|
||||
size_t fmt_ldapescape2(char* dest,const char* src,size_t len,const char* escapeme);
|
||||
/* Encode JSON string from UTF-8; will backslash-escape the bare minimum.
|
||||
* Will not verify that the input is valid UTF-8!
|
||||
* Worst case: len*6 */
|
||||
size_t fmt_jsonescape(char* dest,const char* src,size_t len);
|
||||
|
||||
/* These read one line from src, decode it, and write the result to
|
||||
* dest. The number of decoded bytes is written to destlen. dest
|
||||
@ -54,6 +58,7 @@ size_t scan_hexdump(const char *src,char *dest,size_t *destlen);
|
||||
size_t scan_html(const char *src,char *dest,size_t *destlen);
|
||||
size_t scan_cescape(const char *src,char *dest,size_t *destlen);
|
||||
size_t scan_ldapescape(const char* src,char* dest,size_t *destlen);
|
||||
size_t scan_jsonescape(const char* src,char* dest,size_t *destlen);
|
||||
|
||||
#ifdef STRALLOC_H
|
||||
/* WARNING: these functions _append_ to the stralloc, not overwrite! */
|
||||
@ -78,6 +83,8 @@ size_t scan_to_sa(size_t (*func)(const char*,char*,size_t*),
|
||||
#define fmt_hexdump_sa(sa,src,len) fmt_to_sa(fmt_hexdump,sa,src,len)
|
||||
#define fmt_html_sa(sa,src,len) fmt_to_sa(fmt_html,sa,src,len)
|
||||
#define fmt_cescape_sa(sa,src,len) fmt_to_sa(fmt_cescape,sa,src,len)
|
||||
#define fmt_ldapescape_sa(sa,src,len) fmt_to_sa(fmt_ldapescape,sa,src,len)
|
||||
#define fmt_jsonescape_sa(sa,src,len) fmt_to_sa(fmt_jsonescape,sa,src,len)
|
||||
|
||||
#define fmt_quotedprintable2_sa(sa,src,len,escapeme) fmt_to_sa2(fmt_quotedprintable2,sa,src,len,escapeme)
|
||||
#define fmt_urlencoded2_sa(sa,src,len,escapeme) fmt_to_sa2(fmt_urlencoded2,sa,src,len,escapeme)
|
||||
@ -91,6 +98,8 @@ size_t scan_to_sa(size_t (*func)(const char*,char*,size_t*),
|
||||
#define scan_hexdump_sa(src,sa) scan_to_sa(scan_hexdump,src,sa)
|
||||
#define scan_html_sa(src,sa) scan_to_sa(scan_html,src,sa)
|
||||
#define scan_cescape_sa(src,sa) scan_to_sa(scan_cescape,src,sa)
|
||||
#define scan_ldapescape_sa(src,sa) scan_to_sa(scan_ldapescape,src,sa)
|
||||
#define scan_jsonescape_sa(src,sa) scan_to_sa(scan_jsonescape,src,sa)
|
||||
#endif
|
||||
|
||||
#ifdef ARRAY_H
|
||||
|
42
textcode/fmt_jsonescape.c
Normal file
42
textcode/fmt_jsonescape.c
Normal file
@ -0,0 +1,42 @@
|
||||
#include "fmt.h"
|
||||
#include "textcode.h"
|
||||
#include "str.h"
|
||||
#include "haveinline.h"
|
||||
|
||||
/* src is UTF-8 encoded */
|
||||
/* Write the JSON-string-escaped form of the len bytes at src to dest.
 *
 * Backslash and double quote are prefixed with a backslash; every byte
 * below 0x20 becomes a six-byte "\u00XX" escape; all other bytes
 * (including bytes >= 0x80) are copied through unchanged.  The input is
 * not checked for being valid UTF-8.
 *
 * dest may be NULL, in which case nothing is stored and only the length
 * is computed.  Returns the number of bytes the encoded form occupies,
 * or (size_t)-1 as soon as the running total exceeds half the size_t
 * range. */
size_t fmt_jsonescape(char* dest,const char* src,size_t len) {
  const unsigned char* in=(const unsigned char*) src;
  const size_t limit=((size_t)-1)/2;
  size_t out=0;

  while (len--) {
    unsigned char ch=*in++;

    if (ch=='\\' || ch=='"') {
      /* two-byte escape: backslash followed by the character itself */
      if (dest) {
        dest[out]='\\';
        dest[out+1]=ch;
      }
      out+=2;
    } else if (ch<' ') {
      /* control character: \u00 followed by two hex digits */
      if (dest) {
        char* p=dest+out;
        p[0]='\\';
        p[1]='u';
        p[2]='0';
        p[3]='0';
        p[4]=fmt_tohex(ch>>4);
        p[5]=fmt_tohex(ch&0xf);
      }
      out+=6;
    } else {
      /* everything else passes through verbatim */
      if (dest) dest[out]=ch;
      ++out;
    }

    /* guard against the length counter wrapping on hostile input */
    if (out>limit) return (size_t)-1;
  }
  return out;
}
|
||||
|
54
textcode/scan_jsonescape.c
Normal file
54
textcode/scan_jsonescape.c
Normal file
@ -0,0 +1,54 @@
|
||||
#include "fmt.h"
|
||||
#include "textcode.h"
|
||||
#include "scan.h"
|
||||
|
||||
/* Decode the JSON string escapes in the NUL-terminated text at src and
 * store the resulting bytes in dest.
 *
 * Handled escapes: \\ \" \/ (and any other unknown escaped character)
 * yield the character itself; \b \f \n \r \t yield the corresponding
 * control character; \uXXXX yields the code point encoded via fmt_utf8.
 * A UTF-16 surrogate pair (\uD800..\uDBFF followed directly by
 * \uDC00..\uDFFF) is combined into one code point first, e.g.
 * "\uD834\uDD1E" is U+1D11E and decodes to f0 9d 84 9e.
 *
 * dest may be NULL, in which case nothing is stored and only the
 * decoded length is computed.  On success the decoded length is written
 * to *destlen and the number of source bytes consumed is returned.
 * Returns 0 (leaving *destlen untouched) on invalid input: a non-hex
 * digit in \uXXXX, a backslash at the very end of the input, or a
 * surrogate that is not part of a correctly ordered lead/tail pair. */
size_t scan_jsonescape(const char *src,char *dest,size_t *destlen) {
  const unsigned char* s=(const unsigned char*) src;
  const unsigned int none=(unsigned int)-1;
  size_t written=0,i;
  unsigned int lead=none;       /* pending UTF-16 lead surrogate, if any */

  for (i=0; s[i]; ++i) {
    unsigned char c=s[i];

    if (c=='\\') {
      unsigned char esc=s[i+1];

      /* A backslash right before the terminator is incomplete; consuming
       * it would step the loop past the end of the string. */
      if (esc==0) return 0;

      if (esc=='u') {
        unsigned int cur=0,cp;
        size_t j;
        for (j=0; j<4; ++j) {
          /* keep the result in an int so the negative "not hex" return
           * stays negative even where plain char is unsigned; this also
           * stops at a premature NUL before reading beyond it */
          int x=scan_fromhex(s[i+2+j]);
          if (x<0) return 0;
          cur=(cur<<4) | (unsigned int)x;
        }
        i+=5;   /* escape is 6 bytes; the for loop adds the last one */

        if (cur>=0xd800 && cur<=0xdbff) {
          /* lead surrogate: remember it and emit nothing yet */
          if (lead!=none) return 0;     /* two lead surrogates in a row */
          lead=cur;
          continue;
        }
        if (cur>=0xdc00 && cur<=0xdfff) {
          if (lead==none) return 0;     /* tail surrogate without lead */
          cp=0x10000u + (((lead&0x3ff)<<10) | (cur&0x3ff));
          lead=none;
        } else {
          if (lead!=none) return 0;     /* lead not followed by tail */
          cp=cur;
        }
        written+=fmt_utf8(dest?dest+written:dest,cp);
        continue;
      }

      if (lead!=none) return 0;         /* lead not followed by tail */
      switch (esc) {
      case 'b': c='\b'; break;
      case 'f': c='\f'; break;
      case 'n': c='\n'; break;
      case 'r': c='\r'; break;
      case 't': c='\t'; break;
      default:  c=esc;  break;          /* \\ \" \/ and anything else */
      }
      ++i;      /* skip the escaped character as well */
    } else if (lead!=none) {
      return 0;                         /* lead not followed by tail */
    }

    if (dest) dest[written]=(char)c;
    ++written;
  }

  if (lead!=none) return 0;             /* input ended after a lead surrogate */
  *destlen=written;
  return i;
}
|
Loading…
x
Reference in New Issue
Block a user