add scan_utf8_sem

add utf8 test suite
master
leitner 8 years ago
parent 41db97d380
commit 14f26729e9

@ -6,7 +6,7 @@ size_t fmt_utf8(char *dest,uint32_t n) {
if (dest) *dest=(char)n; if (dest) *dest=(char)n;
return 1; return 1;
} }
for (i=0x3f,j=1; i<0x7fffffff; i=(i<<5)|0x1f, ++j) { for (i=0x3f,j=1; i<=0x7fffffff; i=(i<<5)|0x1f, ++j) {
if (i>=n) { if (i>=n) {
--j; --j;
if (dest) { if (dest) {
@ -19,6 +19,7 @@ size_t fmt_utf8(char *dest,uint32_t n) {
} }
return j+1; return j+1;
} }
if (i==0x7fffffff) return 0;
} }
/* we were asked to encode a value that cannot be encoded */ /* we were asked to encode a value that cannot be encoded */
return 0; return 0;

@ -111,6 +111,7 @@ size_t scan_iso8601(const char* in,struct timespec* t) __pure__;
/* some variable length encodings for integers */ /* some variable length encodings for integers */
size_t scan_utf8(const char* in,size_t len,uint32_t* n) __pure__; size_t scan_utf8(const char* in,size_t len,uint32_t* n) __pure__;
size_t scan_utf8_sem(const char* in,size_t len,uint32_t* n) __pure__;
size_t scan_asn1derlength(const char* in,size_t len,unsigned long long* n) __pure__; size_t scan_asn1derlength(const char* in,size_t len,unsigned long long* n) __pure__;
size_t scan_asn1dertag(const char* in,size_t len,unsigned long long* n) __pure__; size_t scan_asn1dertag(const char* in,size_t len,unsigned long long* n) __pure__;

@ -5,6 +5,7 @@ scan_utf8 \- decode an unsigned integer from UTF-8 encoding
.B #include <scan.h> .B #include <scan.h>
size_t \fBscan_utf8\fP(const char *\fIsrc\fR,size_t \fIlen\fR,uint32_t *\fIdest\fR); size_t \fBscan_utf8\fP(const char *\fIsrc\fR,size_t \fIlen\fR,uint32_t *\fIdest\fR);
size_t \fBscan_utf8_sem\fP(const char *\fIsrc\fR,size_t \fIlen\fR,uint32_t *\fIdest\fR);
.SH DESCRIPTION .SH DESCRIPTION
scan_utf8 decodes an unsigned integer in UTF-8 encoding from a memory scan_utf8 decodes an unsigned integer in UTF-8 encoding from a memory
area holding binary data. It writes the decoded value in \fIdest\fR and area holding binary data. It writes the decoded value in \fIdest\fR and
@ -14,13 +15,15 @@ scan_utf8 never reads more than \fIlen\fR bytes from \fIsrc\fR. If the
sequence is longer than that, or the memory area contains an invalid sequence is longer than that, or the memory area contains an invalid
sequence, scan_utf8 returns 0 and does not touch \fIdest\fR. sequence, scan_utf8 returns 0 and does not touch \fIdest\fR.
The length of the longest UTF-8 sequence is 5. If the buffer is longer The length of the longest valid UTF-8 sequence is 6.
than that, and scan_utf8 fails, then the data was not a valid UTF-8
encoded sequence. scan_utf8 will reject syntactically invalid encodings, but not
semantically invalid ones. scan_utf8_sem will additionally reject
surrogates.
.SH NOTE .SH NOTE
fmt_utf8 and scan_utf8 implement the encoding from UTF-8, but are meant fmt_utf8 and scan_utf8 implement the encoding from UTF-8, but are meant
to be able to store integers, not just Unicode code points. Values to be able to store integers, not just Unicode code points. Values
above 0x10ffff are not valid UTF-8. If you are using this function to above 0x10ffff are not valid UTF-8. If you are using this function to
parse UTF-8, you need to reject them (see RFC 3629). parse UTF-8, you need to reject them (see RFC 3629).
.SH "SEE ALSO" .SH "SEE ALSO"
fmt_utf8(3) fmt_utf8(3), scan_utf8_sem(3)

@ -0,0 +1,11 @@
#include "scan.h"
/*
 * Decode one UTF-8 sequence via scan_utf8 and additionally reject these
 * decoded values:
 *   - 0xd800..0xdfff (UTF-16 surrogates),
 *   - any value whose low 16 bits are 0xfffe or 0xffff,
 *   - 0xfdd0..0xfdef.
 * Values above 0x10ffff are NOT rejected here unless they hit one of the
 * rules above.
 *
 * in   start of the encoded data
 * len  number of bytes available at in (handed to scan_utf8 unchanged)
 * num  receives the decoded value on success; may be NULL if the caller
 *      only wants to validate
 *
 * Returns the byte count reported by scan_utf8, or 0 if scan_utf8 failed
 * or the value was rejected.  The value is decoded into a local first so
 * that *num is only written when the function succeeds.
 */
size_t scan_utf8_sem(const char* in,size_t len,uint32_t* num) {
  uint32_t c;
  size_t r=scan_utf8(in,len,&c);
  if (r==0) return 0;		/* c is not read on this path */
  if (c>=0xd800 && c<=0xdfff) return 0;
  if ((c&0xfffe)==0xfffe) return 0;
  if (c>=0xfdd0 && c<=0xfdef) return 0;
  if (num) *num=c;
  return r;
}

@ -0,0 +1,74 @@
#include <assert.h>
#include "scan.h"
#include "fmt.h"
/* http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt */
/*
 * Round-trip and rejection tests for fmt_utf8 / scan_utf8 / scan_utf8_sem.
 *
 * Every check is an assert(), and the calls under test live inside the
 * assert expressions, so building with NDEBUG turns this program into a
 * no-op.  A failing check aborts; reaching the end means all passed.
 */
int main(void) {
  /* values scan_utf8_sem has to refuse: both ends of each surrogate half,
   * both ends of 0xfdd0..0xfdef, and xxFFFE/xxFFFF in several planes */
  static const uint32_t sem_bad[] = {
    0xd800, 0xdbff, 0xdc00, 0xdfff,
    0xfdd0, 0xfdef,
    0xfffe, 0xffff, 0x1fffe, 0x1ffff, 0x10fffe, 0x10ffff
  };
  /* direct neighbours of the rejected ranges plus a few ordinary values;
   * scan_utf8_sem has to pass these through unchanged */
  static const uint32_t sem_ok[] = {
    0, 0x7f, 0x80, 0x7ff, 0x800,
    0xd7ff, 0xe000, 0xfdcf, 0xfdf0, 0xfffd,
    0x10000, 0x1fffd, 0x10fffd
  };
  char buf[100];
  uint32_t l;
  unsigned int i;
  size_t n;

  /* first positive testing for the various lengths; l holds the encoded
   * length going in and the decoded value coming out */
  l=fmt_utf8(buf,0); assert(l == 1 && scan_utf8(buf,l+1,&l)==1 && l==0);
  l=fmt_utf8(buf,0x80); assert(l == 2 && scan_utf8(buf,l+1,&l)==2 && l==0x80);
  l=fmt_utf8(buf,0x800); assert(l == 3 && scan_utf8(buf,l+1,&l)==3 && l==0x800);
  l=fmt_utf8(buf,0x10000); assert(l == 4 && scan_utf8(buf,l+1,&l)==4 && l==0x10000);
  l=fmt_utf8(buf,0x200000); assert(l == 5 && scan_utf8(buf,l+1,&l)==5 && l==0x200000);
  l=fmt_utf8(buf,0x4000000); assert(l == 6 && scan_utf8(buf,l+1,&l)==6 && l==0x4000000);

  /* corner cases: the largest value for each encoded length */
  l=fmt_utf8(buf,0x7f); assert(l == 1 && scan_utf8(buf,l+1,&l)==1 && l==0x7f);
  l=fmt_utf8(buf,0x7ff); assert(l == 2 && scan_utf8(buf,l+1,&l)==2 && l==0x7ff);
  l=fmt_utf8(buf,0xffff); assert(l == 3 && scan_utf8(buf,l+1,&l)==3 && l==0xffff);
  l=fmt_utf8(buf,0x1fffff); assert(l == 4 && scan_utf8(buf,l+1,&l)==4 && l==0x1fffff);
  l=fmt_utf8(buf,0x3ffffff); assert(l == 5 && scan_utf8(buf,l+1,&l)==5 && l==0x3ffffff);
  l=fmt_utf8(buf,0x7fffffff); assert(l == 6 && scan_utf8(buf,l+1,&l)==6 && l==0x7fffffff);

  /* more corner cases: around the surrogate block and the Unicode limit */
  l=fmt_utf8(buf,0xd7ff); assert(l == 3 && scan_utf8(buf,l+1,&l)==3 && l==0xd7ff);
  l=fmt_utf8(buf,0xe000); assert(l == 3 && scan_utf8(buf,l+1,&l)==3 && l==0xe000);
  l=fmt_utf8(buf,0xfffd); assert(l == 3 && scan_utf8(buf,l+1,&l)==3 && l==0xfffd);
  l=fmt_utf8(buf,0x10ffff); assert(l == 4 && scan_utf8(buf,l+1,&l)==4 && l==0x10ffff);
  l=fmt_utf8(buf,0x110000); assert(l == 4 && scan_utf8(buf,l+1,&l)==4 && l==0x110000);

  /* now negative testing */
  /* start off with some continuation bytes outside a sequence */
  for (i=0x80; i<=0xbf; ++i) {
    buf[0]=(char)i;
    assert(scan_utf8(buf,2,&l)==0);
  }
  /* now check lonely sequence start characters (followed by a space
   * instead of a continuation byte) */
  buf[1]=' ';
  for (i=0xc0; i<=0xfd; ++i) {
    buf[0]=(char)i;
    assert(scan_utf8(buf,2,&l)==0);
  }
  /* FE and FF are reserved for UTF-16 endianness detection */
  assert(scan_utf8("\xfe\xff",3,&l)==0);
  assert(scan_utf8("\xff\xfe",3,&l)==0);

  /* now check some truncated sequences: one byte fewer than encoded */
  l=fmt_utf8(buf,0); assert(l == 1 && scan_utf8(buf,l-1,&l)==0);
  l=fmt_utf8(buf,0x80); assert(l == 2 && scan_utf8(buf,l-1,&l)==0);
  l=fmt_utf8(buf,0x800); assert(l == 3 && scan_utf8(buf,l-1,&l)==0);
  l=fmt_utf8(buf,0x10000); assert(l == 4 && scan_utf8(buf,l-1,&l)==0);
  l=fmt_utf8(buf,0x200000); assert(l == 5 && scan_utf8(buf,l-1,&l)==0);
  l=fmt_utf8(buf,0x4000000); assert(l == 6 && scan_utf8(buf,l-1,&l)==0);

  /* now truncate in another way: overwrite the last continuation byte */
  l=fmt_utf8(buf,0x80); buf[l-1]=' '; assert(l == 2 && scan_utf8(buf,l+1,&l)==0);
  l=fmt_utf8(buf,0x800); buf[l-1]=' '; assert(l == 3 && scan_utf8(buf,l+1,&l)==0);
  l=fmt_utf8(buf,0x10000); buf[l-1]=' '; assert(l == 4 && scan_utf8(buf,l+1,&l)==0);
  l=fmt_utf8(buf,0x200000); buf[l-1]=' '; assert(l == 5 && scan_utf8(buf,l+1,&l)==0);
  l=fmt_utf8(buf,0x4000000); buf[l-1]=' '; assert(l == 6 && scan_utf8(buf,l+1,&l)==0);

  /* now check that scan_utf8_sem rejects an encoded surrogate (U+D800) */
  assert(scan_utf8_sem("\xed\xa0\x80",4,&l)==0);

  /* every value in sem_bad is syntactically fine (scan_utf8 decodes it)
   * but has to be refused by scan_utf8_sem */
  for (i=0; i<sizeof(sem_bad)/sizeof(sem_bad[0]); ++i) {
    n=fmt_utf8(buf,sem_bad[i]);
    assert(n>0 && scan_utf8(buf,n,&l)==n && l==sem_bad[i]);
    assert(scan_utf8_sem(buf,n,&l)==0);
  }
  /* ... while the neighbouring values have to decode unchanged */
  for (i=0; i<sizeof(sem_ok)/sizeof(sem_ok[0]); ++i) {
    n=fmt_utf8(buf,sem_ok[i]);
    assert(n>0 && scan_utf8_sem(buf,n,&l)==n && l==sem_ok[i]);
  }

  /* now some not minimally encoded utf-8 sequences */
  assert(scan_utf8("\xc0\x80",3,&l)==0);
  assert(scan_utf8("\xe0\x80\x80",4,&l)==0);
  assert(scan_utf8("\xf0\x80\x80\x80",5,&l)==0);
  assert(scan_utf8("\xf8\x80\x80\x80\x80",6,&l)==0);
  assert(scan_utf8("\xfc\x80\x80\x80\x80\x80",7,&l)==0);

  return 0;
}
Loading…
Cancel
Save