From 14f26729e9d194c1b4482bd65f0f3638794f1130 Mon Sep 17 00:00:00 2001 From: leitner Date: Mon, 13 Mar 2017 15:44:48 +0000 Subject: [PATCH] add scan_utf8_sem add utf8 test suite --- fmt/fmt_utf8.c | 3 +- scan.h | 1 + scan/scan_utf8.3 | 11 ++++--- scan/scan_utf8_sem.c | 11 +++++++ test/utf8.c | 74 ++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 95 insertions(+), 5 deletions(-) create mode 100644 scan/scan_utf8_sem.c create mode 100644 test/utf8.c diff --git a/fmt/fmt_utf8.c b/fmt/fmt_utf8.c index 2ffdab6..9e26eb7 100644 --- a/fmt/fmt_utf8.c +++ b/fmt/fmt_utf8.c @@ -6,7 +6,7 @@ size_t fmt_utf8(char *dest,uint32_t n) { if (dest) *dest=(char)n; return 1; } - for (i=0x3f,j=1; i<0x7fffffff; i=(i<<5)|0x1f, ++j) { + for (i=0x3f,j=1; i<=0x7fffffff; i=(i<<5)|0x1f, ++j) { if (i>=n) { --j; if (dest) { @@ -19,6 +19,7 @@ size_t fmt_utf8(char *dest,uint32_t n) { } return j+1; } + if (i==0x7fffffff) return 0; } /* we were asked to encode a value that cannot be encoded */ return 0; diff --git a/scan.h b/scan.h index 13becb9..bcd4365 100644 --- a/scan.h +++ b/scan.h @@ -111,6 +111,7 @@ size_t scan_iso8601(const char* in,struct timespec* t) __pure__; /* some variable length encodings for integers */ size_t scan_utf8(const char* in,size_t len,uint32_t* n) __pure__; +size_t scan_utf8_sem(const char* in,size_t len,uint32_t* n) __pure__; size_t scan_asn1derlength(const char* in,size_t len,unsigned long long* n) __pure__; size_t scan_asn1dertag(const char* in,size_t len,unsigned long long* n) __pure__; diff --git a/scan/scan_utf8.3 b/scan/scan_utf8.3 index 8c9a346..999c33b 100644 --- a/scan/scan_utf8.3 +++ b/scan/scan_utf8.3 @@ -5,6 +5,7 @@ scan_utf8 \- decode an unsigned integer from UTF-8 encoding .B #include size_t \fBscan_utf8\fP(const char *\fIsrc\fR,size_t \fIlen\fR,uint32_t *\fIdest\fR); +size_t \fBscan_utf8_sem\fP(const char *\fIsrc\fR,size_t \fIlen\fR,uint32_t *\fIdest\fR); .SH DESCRIPTION scan_utf8 decodes an unsigned integer in UTF-8 encoding from a memory area holding binary data. It writes the decode value in \fIdest\fR and @@ -14,13 +15,15 @@ scan_utf8 never reads more than \fIlen\fR bytes from \fIsrc\fR. If the sequence is longer than that, or the memory area contains an invalid sequence, scan_utf8 returns 0 and does not touch \fIdest\fR. -The length of the longest UTF-8 sequence is 5. If the buffer is longer -than that, and scan_utf8 fails, then the data was not a valid UTF-8 -encoded sequence. +The length of the longest valid UTF-8 sequence is 6. + +scan_utf8 will reject syntactically invalid encodings, but not +semantically invalid ones. scan_utf8_sem will additionally reject +surrogates. .SH NOTE fmt_utf8 and scan_utf8 implement the encoding from UTF-8, but are meant to be able to store integers, not just Unicode code points. Values above 0x10ffff are not valid UTF-8. If you are using this function to parse UTF-8, you need to reject them (see RFC 3629). .SH "SEE ALSO" -fmt_utf8(3) +fmt_utf8(3), scan_utf8_sem(3) diff --git a/scan/scan_utf8_sem.c b/scan/scan_utf8_sem.c new file mode 100644 index 0000000..451166e --- /dev/null +++ b/scan/scan_utf8_sem.c @@ -0,0 +1,11 @@ +#include "scan.h" + +size_t scan_utf8_sem(const char* in,size_t len,uint32_t* num) { + size_t r=scan_utf8(in,len,num); + if (r>0) { + if (*num>=0xd800 && *num<=0xdfff) return 0; + if ((*num&0xfffe)==0xfffe) return 0; + if (*num>=0xfdd0 && *num<=0xfdef) return 0; + } + return r; +} diff --git a/test/utf8.c b/test/utf8.c new file mode 100644 index 0000000..20047aa --- /dev/null +++ b/test/utf8.c @@ -0,0 +1,74 @@ +#include +#include "scan.h" +#include "fmt.h" + +/* http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt */ + +int main() { + char buf[100]; + uint32_t l; + unsigned int i; + /* first positive testing for the various lengths */ + l=fmt_utf8(buf,0); assert(l == 1 && scan_utf8(buf,l+1,&l)==1 && l==0); + l=fmt_utf8(buf,0x80); assert(l == 2 && scan_utf8(buf,l+1,&l)==2 && l==0x80); + l=fmt_utf8(buf,0x800); assert(l == 3 && scan_utf8(buf,l+1,&l)==3 && l==0x800); + l=fmt_utf8(buf,0x10000); assert(l == 4 && scan_utf8(buf,l+1,&l)==4 && l==0x10000); + l=fmt_utf8(buf,0x200000); assert(l == 5 && scan_utf8(buf,l+1,&l)==5 && l==0x200000); + l=fmt_utf8(buf,0x4000000); assert(l == 6 && scan_utf8(buf,l+1,&l)==6 && l==0x4000000); + /* corner cases */ + l=fmt_utf8(buf,0x7f); assert(l == 1 && scan_utf8(buf,l+1,&l)==1 && l==0x7f); + l=fmt_utf8(buf,0x7ff); assert(l == 2 && scan_utf8(buf,l+1,&l)==2 && l==0x7ff); + l=fmt_utf8(buf,0xffff); assert(l == 3 && scan_utf8(buf,l+1,&l)==3 && l==0xffff); + l=fmt_utf8(buf,0x1fffff); assert(l == 4 && scan_utf8(buf,l+1,&l)==4 && l==0x1fffff); + l=fmt_utf8(buf,0x3ffffff); assert(l == 5 && scan_utf8(buf,l+1,&l)==5 && l==0x3ffffff); + l=fmt_utf8(buf,0x7fffffff); assert(l == 6 && scan_utf8(buf,l+1,&l)==6 && l==0x7fffffff); + /* more corner cases */ + l=fmt_utf8(buf,0xd7ff); assert(l == 3 && scan_utf8(buf,l+1,&l)==3 && l==0xd7ff); + l=fmt_utf8(buf,0xe000); assert(l == 3 && scan_utf8(buf,l+1,&l)==3 && l==0xe000); + l=fmt_utf8(buf,0xfffd); assert(l == 3 && scan_utf8(buf,l+1,&l)==3 && l==0xfffd); + l=fmt_utf8(buf,0x10ffff); assert(l == 4 && scan_utf8(buf,l+1,&l)==4 && l==0x10ffff); + l=fmt_utf8(buf,0x110000); assert(l == 4 && scan_utf8(buf,l+1,&l)==4 && l==0x110000); + + /* now negative testing */ + /* start off with some continuation bytes outside a sequence */ + for (i=0x80; i<=0xbf; ++i) { + buf[0]=i; + assert(scan_utf8(buf,2,&l)==0); + } + + /* now check lonely sequence start characters */ + buf[1]=' '; + for (i=0xc0; i<=0xfd; ++i) { + buf[0]=i; + assert(scan_utf8(buf,2,&l)==0); + } + + /* FE and FF are reserved for UTF-16 endianness detection*/ + assert(scan_utf8("\xfe\xff",3,&l)==0); + assert(scan_utf8("\xff\xfe",3,&l)==0); + + /* now check some truncated sequences */ + l=fmt_utf8(buf,0); assert(l == 1 && scan_utf8(buf,l-1,&l)==0); + l=fmt_utf8(buf,0x80); assert(l == 2 && scan_utf8(buf,l-1,&l)==0); + l=fmt_utf8(buf,0x800); assert(l == 3 && scan_utf8(buf,l-1,&l)==0); + l=fmt_utf8(buf,0x10000); assert(l == 4 && scan_utf8(buf,l-1,&l)==0); + l=fmt_utf8(buf,0x200000); assert(l == 5 && scan_utf8(buf,l-1,&l)==0); + l=fmt_utf8(buf,0x4000000); assert(l == 6 && scan_utf8(buf,l-1,&l)==0); + + /* now truncate in another way */ + l=fmt_utf8(buf,0x80); buf[l-1]=' '; assert(l == 2 && scan_utf8(buf,l+1,&l)==0); + l=fmt_utf8(buf,0x800); buf[l-1]=' '; assert(l == 3 && scan_utf8(buf,l+1,&l)==0); + l=fmt_utf8(buf,0x10000); buf[l-1]=' '; assert(l == 4 && scan_utf8(buf,l+1,&l)==0); + l=fmt_utf8(buf,0x200000); buf[l-1]=' '; assert(l == 5 && scan_utf8(buf,l+1,&l)==0); + l=fmt_utf8(buf,0x4000000); buf[l-1]=' '; assert(l == 6 && scan_utf8(buf,l+1,&l)==0); + + /* now check that scan_utf8_sem rejects surrogate pairs */ + assert(scan_utf8_sem("\xed\xa0\x80",4,&l)==0); + + /* now some not minimally encoded utf-8 sequences */ + assert(scan_utf8("\xc0\x80",3,&l)==0); + assert(scan_utf8("\xe0\x80\x80",4,&l)==0); + assert(scan_utf8("\xf0\x80\x80\x80",5,&l)==0); + assert(scan_utf8("\xf8\x80\x80\x80\x80",6,&l)==0); + assert(scan_utf8("\xfc\x80\x80\x80\x80\x80",7,&l)==0); +}