From ac2df2bf2087170eda2540c18f7b7ac990e9c3a1 Mon Sep 17 00:00:00 2001 From: leitner Date: Sun, 30 Jul 2017 12:31:12 +0000 Subject: [PATCH] bring scan_cescape up to speed --- textcode/scan_cescape.3 | 35 ++++++++++++++++++ textcode/scan_cescape.c | 73 ++++++++++++++++++++++++++++++++++---- textcode/scan_jsonescape.3 | 31 ++++++++++++++++ textcode/scan_jsonescape.c | 6 ++++ 4 files changed, 138 insertions(+), 7 deletions(-) create mode 100644 textcode/scan_cescape.3 create mode 100644 textcode/scan_jsonescape.3 diff --git a/textcode/scan_cescape.3 b/textcode/scan_cescape.3 new file mode 100644 index 0000000..69dd293 --- /dev/null +++ b/textcode/scan_cescape.3 @@ -0,0 +1,35 @@ +.TH scan_cescape 3 +.SH NAME +scan_cescape \- parse C escaped string +.SH SYNTAX +.B #include + +size_t \fBscan_cescape\fP(const char *\fIsrc\fR,char *\fIdest\fR,size_t* \fIdestlen\fR); + +.SH DESCRIPTION +scan_cescape parses C escaped text from src into dest. +Parsing stops at the 0 terminator, invalid input characters or a double +quote that was not escaped. + +C escape sequences like \\n and \\x0a and \\012 are translated into +their binary counterparts. The C99 escape sequences \\u and \\U are +supported and lead to UTF-8 sequences being output. + +scan_cescape will then write the number of bytes in dest into *destlen, +and return the number of bytes decoded from src. + +dest can be NULL. destlen can be NULL. + +To make sure dest is large enough, either allocate strlen(src)+1 bytes +or call scan_cescape twice, the first time with dest == NULL (*destlen +will still be written). + +.SH "RETURN VALUE" +scan_cescape returns the number of bytes successfully parsed +from src. + +.SH EXAMPLES +scan_cescape("test\\n\");",buf,&i) -> return 6, i=5, buf="test\n". 
+ +.SH "SEE ALSO" +fmt_jsonescape(3), fmt_cescape(3), scan_ldapescape(3) diff --git a/textcode/scan_cescape.c b/textcode/scan_cescape.c index 314ba3a..00afb90 100644 --- a/textcode/scan_cescape.c +++ b/textcode/scan_cescape.c @@ -18,25 +18,84 @@ size_t scan_cescape(const char *src,char *dest,size_t *destlen) { case 't': c='\t'; break; case 'v': c='\v'; case '\\': break; - case 'x': + case 'x': // hex escape; \x0a -> 10 { unsigned char a,b; a=scan_fromhex(s[i+2]); b=scan_fromhex(s[i+3]); - if (a<16 && b<16) { - c=(a<<4)+b; - i+=2; + if (a<16) { + if (b<16) { + c=(a<<4)+b; + i+=2; + } else { + c=a; + i+=1; + } } } break; + case 'u': // C99 unicode escape: \u000a -> 10 + case 'U': // C99 unicode escape: \U0000000a -> 10 + { + unsigned int j,k=0,l=(s[i+1]=='U'?10:6); + for (j=2; j<l; ++j) { + c=scan_fromhex(s[i+j]); + if (c>=16) // error + goto error; // don't allow short sequences + k=k*16+c; + } + written+=fmt_utf8(dest?dest+written:0,k); + i+=j-1; + continue; + } default: + if (s[i+1]>='0' && s[i+1]<='7') { // octal escape; \012 -> 10 + unsigned int j,k; + for (k=0,j=1; j<4; ++j) { + unsigned int l=s[i+j]-'0'; + if (l<8) + k=k*8+l; + else + break; + } + if (dest) dest[written++]=k; + i+=j-1; + continue; + } --i; } ++i; - } - dest[written]=c; + } else if (c=='"') + break; + if (dest) dest[written]=c; ++written; } - *destlen=written; +error: + if (destlen) *destlen=written; return i; } + +#ifdef UNITTEST +#include +#undef UNITTEST +#include +#include + +int main() { + size_t dl; + char buf[100]; + assert(scan_cescape("test\\n\");",buf,&dl)==6 && dl==5 && !memcmp(buf,"test\n",5)); + /* check hex and octal escaping */ + assert(scan_cescape("test\\x0a\");",buf,&dl)==8 && dl==5 && !memcmp(buf,"test\n",5)); + assert(scan_cescape("test\\012\");",buf,&dl)==8 && dl==5 && !memcmp(buf,"test\n",5)); + /* check short escape sequences */ + assert(scan_cescape("test\\xa\");",buf,&dl)==7 && dl==5 && !memcmp(buf,"test\n",5)); + assert(scan_cescape("test\\12\");",buf,&dl)==7 && dl==5 && !memcmp(buf,"test\n",5)); + 
assert(scan_cescape("test\\1\");",buf,&dl)==6 && dl==5 && !memcmp(buf,"test\1",5)); + /* check unicode */ + assert(scan_cescape("test\\u000a\");",buf,&dl)==10 && dl==5 && !memcmp(buf,"test\n",5)); + assert(scan_cescape("test\\U0000000a\");",buf,&dl)==14 && dl==5 && !memcmp(buf,"test\n",5)); + /* check that short sequences are rejected */ + assert(scan_cescape("test\\Ua\");",buf,&dl)==4 && dl==4 && !memcmp(buf,"test",4)); +} +#endif diff --git a/textcode/scan_jsonescape.3 b/textcode/scan_jsonescape.3 new file mode 100644 index 0000000..d7716b4 --- /dev/null +++ b/textcode/scan_jsonescape.3 @@ -0,0 +1,31 @@ +.TH scan_jsonescape 3 +.SH NAME +scan_jsonescape \- parse JSON escaped string +.SH SYNTAX +.B #include + +size_t \fBscan_jsonescape\fP(const char *\fIsrc\fR,char *\fIdest\fR,size_t* \fIdestlen\fR); + +.SH DESCRIPTION +scan_jsonescape parses JSON escaped text from src into dest, leaving a +UTF-8 string in dest. Parsing stops at the 0 terminator, invalid input +characters or a double quote that was not escaped. + +It will then write the number of bytes in dest into *destlen, +and return the number of bytes decoded from src. + +dest can be NULL. destlen can be NULL. + +To make sure dest is large enough, either allocate strlen(src)+1 bytes +or call scan_jsonescape twice, the first time with dest == NULL (*destlen +will still be written). + +.SH "RETURN VALUE" +scan_jsonescape returns the number of bytes successfully parsed +from src. + +.SH EXAMPLES +scan_jsonescape("test\n\");",buf,&i) -> return 6, i=5, buf="test\n". + +.SH "SEE ALSO" +fmt_jsonescape(3), scan_cescape(3), scan_ldapescape(3) diff --git a/textcode/scan_jsonescape.c b/textcode/scan_jsonescape.c index 30ac6fb..585e87d 100644 --- a/textcode/scan_jsonescape.c +++ b/textcode/scan_jsonescape.c @@ -73,6 +73,10 @@ abort: #ifdef UNITTEST #include #include +#undef UNITTEST +#include +#include +#include int main() { char buf[100]; @@ -105,3 +109,5 @@ int main() { return 0; } #endif + +