add man page and unit tests for scan_base64url

2017-07-30 13:41:43 +00:00 · 2017-07-30 13:41:43 +00:00 · c8156a9841
commit c8156a9841
parent 8d449d442b
5 changed files with 103 additions and 35 deletions
--- a/textcode/scan_base64.3
+++ b/textcode/scan_base64.3
@ -16,10 +16,6 @@ Note that real world base64 encoded data is sometimes permitted to
 contain whitespace characters or new lines. This function does not allow
 those; it stops there and returns the data decoded up to that point.
 base64 works by taking 3 bytes of binary input and converting them into
 4 bytes of printable ASCII. If the input ends in the middle of a base64
 4-byte-tuple, scan_base64 will disregard the whole tuple.
 Many base64 variants demand padding in the last block. Some don't. This
 implementation will consume padding if it is there, but will not
 complain if it is not.
@ -30,7 +26,7 @@ dest can be NULL. destlen can be NULL.
 scan_base64 returns the number of bytes successfully scanned and
 processed from src.
 .SH EXAMPLES
-scan_base64("%9FYO<F0`",buf,&i) -> return 8, i=5, buf="fnord"
+scan_base64("Zm5vcmQ=",buf,&i) -> return 8, i=5, buf="fnord"
 .SH "SEE ALSO"
-scan_xlong(3), scan_8long(3), fmt_ulong(3)
+scan_base64url(3), scan_xlong(3), scan_8long(3), fmt_ulong(3)
--- a/textcode/scan_base64.c
+++ b/textcode/scan_base64.c
@ -48,8 +48,10 @@ int main() {
  char buf[100];
  size_t i,l;
  memset(buf,0,10); assert(scan_base64("Zm5vcmQ=",buf,&l)==8 && l==5 && !memcmp(buf,"fnord",6));
  /* check that we don't insist on the padding */
  memset(buf,0,10); assert(scan_base64("Zm5vcmQ",buf,&l)==7 && l==5 && !memcmp(buf,"fnord",6));
-  memset(buf,0,10); assert(scan_base64("//8=",buf,&l)==4 && l==2 && !memcmp(buf,"\xff\xff",3));
+  /* check the special non-isalnum chars :) */
  memset(buf,0,10); assert(scan_base64("/+8=",buf,&l)==4 && l==2 && !memcmp(buf,"\xff\xef",3));
  return 0;
 }
 #endif
--- a/textcode/scan_base64url.3
+++ b/textcode/scan_base64url.3
@ -0,0 +1,32 @@
 .TH scan_base64url 3
 .SH NAME
 scan_base64url \- decode base64url encoded data
 .SH SYNTAX
 .B #include <libowfat/textcode.h>
 size_t \fBscan_base64url\fP(const char *\fIsrc\fR,char *\fIdest\fR,size_t* \fIdestlen\fR);
 .SH DESCRIPTION
 base64url is a variant of base64 for use in URLs (standard base64 uses /
 and +, which can cause problems in URLs, so base64url uses - and _
 instead; also base64url does not use = padding at the end).
 scan_base64url decodes base64url encoded data from src into dest.
 It will stop when it encounters any invalid input characters.
 It will then write the number of decoded bytes in dest into *destlen,
 and return the number of bytes decoded from src.
 Many base64 variants demand padding in the last block. base64url does
 not. This implementation does not consume = padding: it stops scanning
 at the first = character, just as it does at any other invalid character.
 dest can be NULL. destlen can be NULL.
 .SH "RETURN VALUE"
 scan_base64url returns the number of bytes successfully scanned and
 processed from src.
 .SH EXAMPLES
 scan_base64url("Zm5vcmQ",buf,&i) -> return 7, i=5, buf="fnord"
 .SH "SEE ALSO"
 scan_base64(3), scan_xlong(3), scan_8long(3), fmt_ulong(3)
--- a/textcode/scan_base64url.c
+++ b/textcode/scan_base64url.c
@ -15,17 +15,36 @@ static inline int dec(unsigned char x) {
 size_t scan_base64url(const char *src,char *dest,size_t *destlen) {
  unsigned short tmp=0,bits=0;
  register const unsigned char* s=(const unsigned char*) src;
-  const char* orig=dest;
+  size_t i,j=0;
-  for (;;) {
+  for (i=0;;) {
    int a=dec(*s);
-    if (a<0) break;
+    if (a<0) break;	/* base64url does not have padding */
    tmp=(tmp<<6)|a; bits+=6;
    ++s;
    if (bits>=8) {
-      *dest=(tmp>>(bits-=8));
+      bits-=8;
-      ++dest;
+      if (dest) dest[i]=(tmp>>bits);
      ++i;
    }
  }
-  *destlen=dest-orig;
+  if (destlen) *destlen=i;
  return (const char*)s-src;
 }
 #ifdef UNITTEST
 #include <assert.h>
 #include <string.h>
 #include <stdio.h>
 int main() {
  char buf[100];
  size_t i,l;
  /* check that we don't consume padding */
  memset(buf,0,10); assert(scan_base64url("Zm5vcmQ=",buf,&l)==7 && l==5 && !memcmp(buf,"fnord",6));
  /* check that we don't insist on the padding */
  memset(buf,0,10); assert(scan_base64url("Zm5vcmQ",buf,&l)==7 && l==5 && !memcmp(buf,"fnord",6));
  /* check the special non-isalnum chars :) */
  memset(buf,0,10); assert(scan_base64url("_-8=",buf,&l)==3 && l==2 && !memcmp(buf,"\xff\xef",3));
  return 0;
 }
 #endif
--- a/textcode/scan_html.c
+++ b/textcode/scan_html.c
@ -26,8 +26,8 @@ static const char* lookup(size_t ofs,const char* t) {
 }
 enum htmlmode {	/* <a href="http://example.com/&quot;foo">libowfat&lt;home</a> */
-  OUTSIDE,	/*                                        ^^^^^^^^^^^^^^^^ -> `libowfat<home` */
+  OUTSIDE,	/*                                        ^^^^^^^^^^^^^^^^ -> libowfat<home */
-  TAGARG,	/*         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -> `http://example.com/"foo´ */
+  TAGARG,	/*         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -> http://example.com/"foo */
 };
 static size_t scan_html_inner(const char *src,char *dest,size_t *destlen,enum htmlmode mode) {
@ -42,49 +42,41 @@ static size_t scan_html_inner(const char *src,char *dest,size_t *destlen,enum ht
 	size_t j;
 	if ((s[i+2]&~32)=='X') {
 	  j=scan_xlong(src+i+3,&l);
-	  if (!j) j+=3;
+	  if (j) j+=3;
 	} else {
 	  j=scan_ulong(src+i+2,&l);
-	  if (!j) j+=3;
+	  if (j) j+=2;
 	}
 	if (s[i+j]==';') {
 	  i+=j;
-	  written+=fmt_utf8(dest+written,l);
+	  written+=fmt_utf8(dest?dest+written:0,l);
 	} else {
-	  dest[written++]='&';
+	  if (dest) dest[written]='&';
 	  ++written;
 	}
 	continue;
      }
      utf8=lookup(1,src+i+1);
      if (utf8) {
 	size_t l=strlen(utf8);
-	memcpy(dest+written,utf8,l);
+	if (dest) memcpy(dest+written,utf8,l);
 	written+=l;
 	i+=2+str_chr(src+i+2,';');
 	continue;
      } else
-	dest[written]='&';
+	if (dest) dest[written]='&';
    } else if (s[i]=='<') {
-      if (mode == OUTSIDE) break;
+      break;
      if (case_starts((const char*)s+i+1,"br>")) {
 	dest[written]='\n';
 	i+=3;
      } else if (case_starts((const char*)s+i+1,"p>")) {
 	dest[written]='\n'; ++written;
 	dest[written]='\n';
 	i+=3;
      } else
 	dest[written]=s[i];
    } else if (s[i]=='"' && mode==TAGARG) {
      if (i==0) { dq=1; continue; }
      break;
    } else if (mode==TAGARG && !dq && (s[i]==' ' || s[i]=='\t' || s[i]=='\n'))
      break;
    else
-      dest[written]=s[i];
+      if (dest) dest[written]=s[i];
    ++written;
  }
-  *destlen=written;
+  if (destlen) *destlen=written;
  return i;
 }
@ -98,13 +90,40 @@ size_t scan_html(const char *src,char *dest,size_t *destlen) {
 #ifdef UNITTEST
 #include <assert.h>
 #undef UNITTEST
 #include <scan/scan_fromhex.c>
 #include <scan/scan_xlongn.c>
 #include <scan/scan_xlong.c>
 #include <scan/scan_ulongn.c>
 #include <scan/scan_ulong.c>
 #include <str/str_chr.c>
 #include <fmt/fmt_utf8.c>
 #include <stdio.h>
 int main() {
  char* html="<a href=\"http://example.com/&quot;foo\">libowfat&lt;home</a>";
  char buf[100];
  size_t destlen;
  /* check that we stop at < */
  assert(scan_html(html,buf,&destlen)==0 && destlen==0);
-  assert(scan_html(strchr(html,'>')+1,buf,&destlen)==16 && destlen==13 && !memcmp(buf,"libowfat<home",13));
+  /* check that we properly decode &lt; */
-  assert(scan_html_tagarg(strchr(html,'"')+1,buf,&destlen)==28 && destlen==23 && !memcmp(buf,"http://example.com/\"foo",23));
+  memset(buf,'?',sizeof(buf));
  assert(scan_html(strchr(html,'>')+1,buf,&destlen)==16 && destlen==13 && !memcmp(buf,"libowfat<home?",14));
  /* check that we stop at " and properly decode &quot; */
  memset(buf,'?',sizeof(buf));
  assert(scan_html_tagarg(strchr(html,'"')+1,buf,&destlen)==28 && destlen==23 && !memcmp(buf,"http://example.com/\"foo?",24));
  /* check that we pass through invalid escapes */
  memset(buf,'?',sizeof(buf));
  assert(scan_html("&fnord;",buf,&destlen)==7 && destlen==7 && !memcmp(buf,"&fnord;?",8));
  memset(buf,'?',sizeof(buf));
  assert(scan_html("&#x;",buf,&destlen)==4 && destlen==4 && !memcmp(buf,"&#x;?",5));
  memset(buf,'?',sizeof(buf));
  assert(scan_html("&#;",buf,&destlen)==3 && destlen==3 && !memcmp(buf,"&#;?",4));
  /* check that &#x[hex]; is decoded properly */
  memset(buf,'?',sizeof(buf));
  assert(scan_html("&#x1;",buf,&destlen)==5 && destlen==1 && buf[0]==1 && buf[1]=='?');
  /* check that &#[decimal]; is decoded properly */
  memset(buf,'?',sizeof(buf));
  assert(scan_html("&#1;",buf,&destlen)==4 && destlen==1 && buf[0]==1 && buf[1]=='?');
 }
 #endif