add scan_utf8_sem

add utf8 test suite
2017-03-13 15:44:48 +00:00 · 2017-03-13 15:44:48 +00:00 · 14f26729e9
commit 14f26729e9
parent 41db97d380
5 changed files with 95 additions and 5 deletions
--- a/fmt/fmt_utf8.c
+++ b/fmt/fmt_utf8.c
@ -6,7 +6,7 @@ size_t fmt_utf8(char *dest,uint32_t n) {
    if (dest) *dest=(char)n;
    return 1;
  }
-  for (i=0x3f,j=1; i<0x7fffffff; i=(i<<5)|0x1f, ++j) {
+  for (i=0x3f,j=1; i<=0x7fffffff; i=(i<<5)|0x1f, ++j) {
    if (i>=n) {
      --j;
      if (dest) {
@ -19,6 +19,7 @@ size_t fmt_utf8(char *dest,uint32_t n) {
      }
      return j+1;
    }
+    if (i==0x7fffffff) return 0;
  }
  /* we were asked to encode a value that cannot be encoded */
  return 0;
--- a/scan.h
+++ b/scan.h
@ -111,6 +111,7 @@ size_t scan_iso8601(const char* in,struct timespec* t) __pure__;

 /* some variable length encodings for integers */
 size_t scan_utf8(const char* in,size_t len,uint32_t* n) __pure__;
+size_t scan_utf8_sem(const char* in,size_t len,uint32_t* n) __pure__;
 size_t scan_asn1derlength(const char* in,size_t len,unsigned long long* n) __pure__;
 size_t scan_asn1dertag(const char* in,size_t len,unsigned long long* n) __pure__;

--- a/scan/scan_utf8.3
+++ b/scan/scan_utf8.3
@ -5,6 +5,7 @@ scan_utf8 \- decode an unsigned integer from UTF-8 encoding
 .B #include <scan.h>

 size_t \fBscan_utf8\fP(const char *\fIsrc\fR,size_t \fIlen\fR,uint32_t *\fIdest\fR);
+size_t \fBscan_utf8_sem\fP(const char *\fIsrc\fR,size_t \fIlen\fR,uint32_t *\fIdest\fR);
 .SH DESCRIPTION
 scan_utf8 decodes an unsigned integer in UTF-8 encoding from a memory
 area holding binary data.  It writes the decoded value in \fIdest\fR and
@ -14,13 +15,15 @@ scan_utf8 never reads more than \fIlen\fR bytes from \fIsrc\fR.  If the
 sequence is longer than that, or the memory area contains an invalid
 sequence, scan_utf8 returns 0 and does not touch \fIdest\fR.

-The length of the longest UTF-8 sequence is 5.  If the buffer is longer
-than that, and scan_utf8 fails, then the data was not a valid UTF-8
-encoded sequence.
+The longest sequence that scan_utf8 accepts is 6 bytes long.
+
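+scan_utf8 will reject syntactically invalid encodings, but not
+semantically invalid ones.  scan_utf8_sem will additionally reject
+surrogates (0xd800 to 0xdfff), the values 0xfdd0 to 0xfdef, and any
+value whose low 16 bits are 0xfffe or 0xffff.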
 .SH NOTE
 fmt_utf8 and scan_utf8 implement the encoding from UTF-8, but are meant
 to be able to store integers, not just Unicode code points.  Values
 above 0x10ffff are not valid UTF-8.  If you are using this function to
 parse UTF-8, you need to reject them (see RFC 3629).
 .SH "SEE ALSO"
-fmt_utf8(3)
+fmt_utf8(3), scan_utf8_sem(3)
--- a/scan/scan_utf8_sem.c
+++ b/scan/scan_utf8_sem.c
@ -0,0 +1,11 @@
+#include "scan.h"
+
+/*
+ * Decode one UTF-8 sequence from in (at most len bytes) via scan_utf8 and
+ * additionally reject these decoded values:
+ *   - 0xd800..0xdfff (surrogates),
+ *   - any value whose low 16 bits are 0xfffe or 0xffff,
+ *   - 0xfdd0..0xfdef.
+ *
+ * Returns the number of bytes consumed, or 0 if scan_utf8 failed or the
+ * decoded value is one of the rejected ones.  On a 0 return *num is not
+ * written.  num may be NULL if only the length is of interest.
+ */
+size_t scan_utf8_sem(const char* in,size_t len,uint32_t* num) {
+  uint32_t cp;
+  /* decode into a local so the caller's *num stays untouched on rejection */
+  size_t r=scan_utf8(in,len,&cp);
+  if (r==0) return 0;
+  if (cp>=0xd800 && cp<=0xdfff) return 0;
+  if ((cp&0xfffe)==0xfffe) return 0;
+  if (cp>=0xfdd0 && cp<=0xfdef) return 0;
+  if (num) *num=cp;
+  return r;
+}
--- a/test/utf8.c
+++ b/test/utf8.c
@ -0,0 +1,74 @@
+#include <assert.h>
+#include "scan.h"
+#include "fmt.h"
+
+/* http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt */
+
+/*
+ * Self-test for fmt_utf8, scan_utf8 and scan_utf8_sem.  Every expectation is
+ * an assert(), so the program aborts at the first one that does not hold.
+ * The fmt/scan calls sit inside the assert expressions, so building with
+ * NDEBUG would compile most of the test away.
+ */
+int main() {
+  char buf[100];
+  /* l doubles as the length returned by fmt_utf8 and as the output of
+   * scan_utf8; the && chains below rely on left-to-right evaluation */
+  uint32_t l;
+  unsigned int i;
+  /* first positive testing for the various lengths: encode the smallest
+   * value of each sequence length, then decode it again (offering one byte
+   * more than needed) and expect the same length and value back */
+  l=fmt_utf8(buf,0); assert(l == 1 && scan_utf8(buf,l+1,&l)==1 && l==0);
+  l=fmt_utf8(buf,0x80); assert(l == 2 && scan_utf8(buf,l+1,&l)==2 && l==0x80);
+  l=fmt_utf8(buf,0x800); assert(l == 3 && scan_utf8(buf,l+1,&l)==3 && l==0x800);
+  l=fmt_utf8(buf,0x10000); assert(l == 4 && scan_utf8(buf,l+1,&l)==4 && l==0x10000);
+  l=fmt_utf8(buf,0x200000); assert(l == 5 && scan_utf8(buf,l+1,&l)==5 && l==0x200000);
+  l=fmt_utf8(buf,0x4000000); assert(l == 6 && scan_utf8(buf,l+1,&l)==6 && l==0x4000000);
+  /* corner cases: the largest value of each sequence length */
+  l=fmt_utf8(buf,0x7f); assert(l == 1 && scan_utf8(buf,l+1,&l)==1 && l==0x7f);
+  l=fmt_utf8(buf,0x7ff); assert(l == 2 && scan_utf8(buf,l+1,&l)==2 && l==0x7ff);
+  l=fmt_utf8(buf,0xffff); assert(l == 3 && scan_utf8(buf,l+1,&l)==3 && l==0xffff);
+  l=fmt_utf8(buf,0x1fffff); assert(l == 4 && scan_utf8(buf,l+1,&l)==4 && l==0x1fffff);
+  l=fmt_utf8(buf,0x3ffffff); assert(l == 5 && scan_utf8(buf,l+1,&l)==5 && l==0x3ffffff);
+  l=fmt_utf8(buf,0x7fffffff); assert(l == 6 && scan_utf8(buf,l+1,&l)==6 && l==0x7fffffff);
+  /* more corner cases: values next to the surrogate range (0xd7ff, 0xe000),
+   * 0xfffd, and both sides of 0x10ffff; plain scan_utf8 is expected to
+   * accept all of them, including 0x110000 */
+  l=fmt_utf8(buf,0xd7ff); assert(l == 3 && scan_utf8(buf,l+1,&l)==3 && l==0xd7ff);
+  l=fmt_utf8(buf,0xe000); assert(l == 3 && scan_utf8(buf,l+1,&l)==3 && l==0xe000);
+  l=fmt_utf8(buf,0xfffd); assert(l == 3 && scan_utf8(buf,l+1,&l)==3 && l==0xfffd);
+  l=fmt_utf8(buf,0x10ffff); assert(l == 4 && scan_utf8(buf,l+1,&l)==4 && l==0x10ffff);
+  l=fmt_utf8(buf,0x110000); assert(l == 4 && scan_utf8(buf,l+1,&l)==4 && l==0x110000);
+
+  /* now negative testing */
+  /* start off with some continuation bytes outside a sequence;
+   * buf[1] still holds whatever the last fmt_utf8 call above left there */
+  for (i=0x80; i<=0xbf; ++i) {
+    buf[0]=i;
+    assert(scan_utf8(buf,2,&l)==0);
+  }
+
+  /* now check lonely sequence start characters: every lead byte from 0xc0
+   * to 0xfd followed by a space instead of a continuation byte */
+  buf[1]=' ';
+  for (i=0xc0; i<=0xfd; ++i) {
+    buf[0]=i;
+    assert(scan_utf8(buf,2,&l)==0);
+  }
+
+  /* FE and FF are reserved for UTF-16 endianness detection; the length 3
+   * includes the string literal's terminating NUL */
+  assert(scan_utf8("\xfe\xff",3,&l)==0);
+  assert(scan_utf8("\xff\xfe",3,&l)==0);
+
+  /* now check some truncated sequences: offer one byte fewer than the
+   * encoded length (for the one-byte case that is a length of 0) */
+  l=fmt_utf8(buf,0); assert(l == 1 && scan_utf8(buf,l-1,&l)==0);
+  l=fmt_utf8(buf,0x80); assert(l == 2 && scan_utf8(buf,l-1,&l)==0);
+  l=fmt_utf8(buf,0x800); assert(l == 3 && scan_utf8(buf,l-1,&l)==0);
+  l=fmt_utf8(buf,0x10000); assert(l == 4 && scan_utf8(buf,l-1,&l)==0);
+  l=fmt_utf8(buf,0x200000); assert(l == 5 && scan_utf8(buf,l-1,&l)==0);
+  l=fmt_utf8(buf,0x4000000); assert(l == 6 && scan_utf8(buf,l-1,&l)==0);
+
+  /* now truncate in another way: the buffer is long enough, but the last
+   * byte of the sequence is overwritten with a space */
+  l=fmt_utf8(buf,0x80); buf[l-1]=' '; assert(l == 2 && scan_utf8(buf,l+1,&l)==0);
+  l=fmt_utf8(buf,0x800); buf[l-1]=' '; assert(l == 3 && scan_utf8(buf,l+1,&l)==0);
+  l=fmt_utf8(buf,0x10000); buf[l-1]=' '; assert(l == 4 && scan_utf8(buf,l+1,&l)==0);
+  l=fmt_utf8(buf,0x200000); buf[l-1]=' '; assert(l == 5 && scan_utf8(buf,l+1,&l)==0);
+  l=fmt_utf8(buf,0x4000000); buf[l-1]=' '; assert(l == 6 && scan_utf8(buf,l+1,&l)==0);
+
+  /* now check that scan_utf8_sem rejects a surrogate: ED A0 80 is the
+   * three-byte encoding of 0xd800 */
+  assert(scan_utf8_sem("\xed\xa0\x80",4,&l)==0);
+
+  /* now some not minimally encoded utf-8 sequences: the value 0 spelled
+   * with 2, 3, 4, 5 and 6 bytes */
+  assert(scan_utf8("\xc0\x80",3,&l)==0);
+  assert(scan_utf8("\xe0\x80\x80",4,&l)==0);
+  assert(scan_utf8("\xf0\x80\x80\x80",5,&l)==0);
+  assert(scan_utf8("\xf8\x80\x80\x80\x80",6,&l)==0);
+  assert(scan_utf8("\xfc\x80\x80\x80\x80\x80",7,&l)==0);
+}