fmt_jsonescape: convert utf-8 >ffff to \u escaped surrogate pair
This commit is contained in:
parent
b9eb81c58f
commit
84d3e6a7b4
@ -1,6 +1,7 @@
|
|||||||
#include "fmt.h"
|
#include "fmt.h"
|
||||||
#include "textcode.h"
|
#include "textcode.h"
|
||||||
#include "str.h"
|
#include "str.h"
|
||||||
|
#include "scan.h"
|
||||||
#include "haveinline.h"
|
#include "haveinline.h"
|
||||||
|
|
||||||
/* src is UTF-8 encoded */
|
/* src is UTF-8 encoded */
|
||||||
@ -36,6 +37,28 @@ escape:
|
|||||||
dest[written+5]=fmt_tohex(s[i]&0xf);
|
dest[written+5]=fmt_tohex(s[i]&0xf);
|
||||||
}
|
}
|
||||||
written+=6;
|
written+=6;
|
||||||
|
} else if (s[i]>0x7f) {
|
||||||
|
/* Non-ASCII UTF-8 sequence. Code points up to 0xffff are copied through unchanged; larger ones are written as a \uXXXX\uXXXX UTF-16 surrogate pair, which is formed from the 20-bit value u-0x10000 (not from u itself). */
|
||||||
|
uint32_t u;
|
||||||
|
size_t j=scan_utf8_sem((const char*)s+i,len-i,&u);
|
||||||
|
if (j==0) /* Invalid UTF-8! Abort and report what has been written so far. */
|
||||||
|
return written;
|
||||||
|
if (u>0xffff) {
|
||||||
|
if (dest) {
|
||||||
|
dest[written ]='\\';
|
||||||
|
dest[written+1]='u';
|
||||||
|
fmt_xlong(dest+written+2,0xd800 + (((u-0x10000)>>10) & 0x3ff)); /* high surrogate: top 10 bits of u-0x10000 */
|
||||||
|
dest[written+6]='\\';
|
||||||
|
dest[written+7]='u';
|
||||||
|
fmt_xlong(dest+written+8,0xdc00 + (u & 0x3ff)); /* low surrogate: bottom 10 bits; subtracting 0x10000 does not change them */
|
||||||
|
}
|
||||||
|
written+=12; /* two escapes of six characters each */
|
||||||
|
} else {
|
||||||
|
if (dest) memcpy(dest+written,s+i,j);
|
||||||
|
written+=j;
|
||||||
|
}
|
||||||
|
i+=j-1; /* -1 because the for loop will also add 1 */
|
||||||
|
break;
|
||||||
} else {
|
} else {
|
||||||
if (dest) dest[written]=s[i];
|
if (dest) dest[written]=s[i];
|
||||||
++written;
|
++written;
|
||||||
@ -48,3 +71,17 @@ escape:
|
|||||||
return written;
|
return written;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#ifdef UNITTEST
|
||||||
|
#include <assert.h>
|
||||||
|
#include <string.h>
|
||||||
|
|
||||||
|
int main() {
|
||||||
|
char buf[100];
|
||||||
|
/* test utf-8 pass-through and correct encoding of \t */
|
||||||
|
assert(fmt_jsonescape(buf,"\tfnörd",7)==8 && !memcmp(buf,"\\tfnörd",8));
|
||||||
|
/* test escaping of unprintable characters */
|
||||||
|
assert(fmt_jsonescape(buf,"\001x",2)==7 && !memcmp(buf,"\\u0001x",7));
|
||||||
|
/* test conversion of large UTF-8 chars to UTF-16 surrogate pairs (poop emoji U+1F4A9: 0x1f4a9-0x10000=0xf4a9, giving D83D DCA9) */
|
||||||
|
assert(fmt_jsonescape(buf,"\xf0\x9f\x92\xa9x",5)==13 && !memcmp(buf,"\\ud83d\\udca9x",13));
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
@ -25,9 +25,15 @@ static const char* lookup(size_t ofs,const char* t) {
|
|||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
static size_t scan_html_inner(const char *src,char *dest,size_t *destlen,int flag) {
|
enum htmlmode { /* what is being decoded; example input: <a href="http://example.com/&quot;foo">libowfat&lt;home</a> */
|
||||||
|
OUTSIDE, /* text outside a tag, ends at '<': `libowfat&lt;home` -> `libowfat<home` */
|
||||||
|
TAGARG, /* attribute value inside a tag, optionally double-quoted: `"http://example.com/&quot;foo"` -> `http://example.com/"foo` */
|
||||||
|
};
|
||||||
|
|
||||||
|
static size_t scan_html_inner(const char *src,char *dest,size_t *destlen,enum htmlmode mode) {
|
||||||
register const unsigned char* s=(const unsigned char*) src;
|
register const unsigned char* s=(const unsigned char*) src;
|
||||||
size_t written=0,i;
|
size_t written=0,i;
|
||||||
|
int dq=0;
|
||||||
for (i=0; s[i]; ++i) {
|
for (i=0; s[i]; ++i) {
|
||||||
if (s[i]=='&') {
|
if (s[i]=='&') {
|
||||||
const char* utf8;
|
const char* utf8;
|
||||||
@ -58,7 +64,8 @@ static size_t scan_html_inner(const char *src,char *dest,size_t *destlen,int fla
|
|||||||
continue;
|
continue;
|
||||||
} else
|
} else
|
||||||
dest[written]='&';
|
dest[written]='&';
|
||||||
} else if (flag && s[i]=='<') {
|
} else if (s[i]=='<') {
|
||||||
|
if (mode == OUTSIDE) break;
|
||||||
if (case_starts((const char*)s+i+1,"br>")) {
|
if (case_starts((const char*)s+i+1,"br>")) {
|
||||||
dest[written]='\n';
|
dest[written]='\n';
|
||||||
i+=3;
|
i+=3;
|
||||||
@ -68,7 +75,12 @@ static size_t scan_html_inner(const char *src,char *dest,size_t *destlen,int fla
|
|||||||
i+=3;
|
i+=3;
|
||||||
} else
|
} else
|
||||||
dest[written]=s[i];
|
dest[written]=s[i];
|
||||||
} else
|
} else if (s[i]=='"' && mode==TAGARG) {
|
||||||
|
if (i==0) { dq=1; continue; }
|
||||||
|
break;
|
||||||
|
} else if (mode==TAGARG && !dq && (s[i]==' ' || s[i]=='\t' || s[i]=='\n'))
|
||||||
|
break;
|
||||||
|
else
|
||||||
dest[written]=s[i];
|
dest[written]=s[i];
|
||||||
++written;
|
++written;
|
||||||
}
|
}
|
||||||
@ -77,9 +89,22 @@ static size_t scan_html_inner(const char *src,char *dest,size_t *destlen,int fla
|
|||||||
}
|
}
|
||||||
|
|
||||||
size_t scan_html_tagarg(const char *src,char *dest,size_t *destlen) {
|
size_t scan_html_tagarg(const char *src,char *dest,size_t *destlen) {
|
||||||
return scan_html_inner(src,dest,destlen,1);
|
return scan_html_inner(src,dest,destlen,TAGARG);
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t scan_html(const char *src,char *dest,size_t *destlen) {
|
size_t scan_html(const char *src,char *dest,size_t *destlen) {
|
||||||
return scan_html_inner(src,dest,destlen,0);
|
return scan_html_inner(src,dest,destlen,OUTSIDE);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#ifdef UNITTEST
|
||||||
|
#include <assert.h>
|
||||||
|
|
||||||
|
int main() { /* unit test for scan_html and scan_html_tagarg */
|
||||||
|
const char* html="<a href=\"http://example.com/&quot;foo\">libowfat&lt;home</a>"; /* &quot; and &lt; stay encoded here so the scanners have entities to decode */
|
||||||
|
char buf[100];
|
||||||
|
size_t destlen;
|
||||||
|
assert(scan_html(html,buf,&destlen)==0 && destlen==0); /* input starts with '<', so nothing is consumed */
|
||||||
|
assert(scan_html(strchr(html,'>')+1,buf,&destlen)==16 && destlen==13 && !memcmp(buf,"libowfat<home",13)); /* 16 input bytes ("libowfat&lt;home") decode to 13 */
|
||||||
|
assert(scan_html_tagarg(strchr(html,'"')+1,buf,&destlen)==28 && destlen==23 && !memcmp(buf,"http://example.com/\"foo",23)); /* 28 input bytes up to the closing quote decode to 23 */
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
@ -99,6 +99,7 @@ int main() {
|
|||||||
assert(scan_jsonescape("a\\udafd0",buf,&l)==1);
|
assert(scan_jsonescape("a\\udafd0",buf,&l)==1);
|
||||||
/* correct surrogate pair */
|
/* correct surrogate pair */
|
||||||
assert(scan_jsonescape("a\\ud834\\udd1eb",buf,&l)==14 && l==6 && !memcmp(buf,"a\xf0\x9d\x84\x9e""b",6));
|
assert(scan_jsonescape("a\\ud834\\udd1eb",buf,&l)==14 && l==6 && !memcmp(buf,"a\xf0\x9d\x84\x9e""b",6));
|
||||||
|
assert(scan_jsonescape("\\ud83d\\udca9x",buf,&l)==13 && l==5 && !memcmp(buf,"\xf0\x9f\x92\xa9x",5)); /* U+1F4A9 is the surrogate pair D83D DCA9 */
|
||||||
/* how about some incorrect UTF-8? */
|
/* how about some incorrect UTF-8? */
|
||||||
assert(scan_jsonescape("a\xc0\xaf",buf,&l)==1 && l==1 && !memcmp(buf,"a",1));
|
assert(scan_jsonescape("a\xc0\xaf",buf,&l)==1 && l==1 && !memcmp(buf,"a",1));
|
||||||
return 0;
|
return 0;
|
||||||
|
Loading…
x
Reference in New Issue
Block a user