add man page and unit tests for scan_base64url

2017-07-30 13:41:43 +00:00 · 2017-07-30 13:41:43 +00:00 · c8156a9841
commit c8156a9841
parent 8d449d442b
5 changed files with 103 additions and 35 deletions
--- a/textcode/scan_base64.3
+++ b/textcode/scan_base64.3
@ -16,10 +16,6 @@ Note that real world base64 encoded data is sometimes permitted to
 contain whitespace characters or new lines. This function will not allow
 those and return the decoded data until then.

-base64 works by taking 3 bytes of binary input and converting them into
-4 bytes of printable ASCII. If the input ends in the middle of a base64
-4-byte-tuple, scan_base64 will disregard the whole tuple.
-
 Many base64 variants demand padding in the last block. Some don't. This
 implementation will consume padding if it is there, but will not
 complain if it is not.
@ -30,7 +26,7 @@ dest can be NULL. destlen can be NULL.
 scan_base64 returns the number of bytes successfully scanned and
 processed from src.
 .SH EXAMPLES
-scan_base64("%9FYO<F0`",buf,&i) -> return 8, i=5, buf="fnord"
+scan_base64("Zm5vcmQ=",buf,&i) -> return 8, i=5, buf="fnord"

 .SH "SEE ALSO"
-scan_xlong(3), scan_8long(3), fmt_ulong(3)
+scan_base64url(3), scan_xlong(3), scan_8long(3), fmt_ulong(3)
--- a/textcode/scan_base64.c
+++ b/textcode/scan_base64.c
@ -48,8 +48,10 @@ int main() {
  char buf[100];
  size_t i,l;
  memset(buf,0,10); assert(scan_base64("Zm5vcmQ=",buf,&l)==8 && l==5 && !memcmp(buf,"fnord",6));
+  /* check that we don't insist on the padding */
  memset(buf,0,10); assert(scan_base64("Zm5vcmQ",buf,&l)==7 && l==5 && !memcmp(buf,"fnord",6));
-  memset(buf,0,10); assert(scan_base64("//8=",buf,&l)==4 && l==2 && !memcmp(buf,"\xff\xff",3));
+  /* check the special non-isalnum chars :) */
+  memset(buf,0,10); assert(scan_base64("/+8=",buf,&l)==4 && l==2 && !memcmp(buf,"\xff\xef",3));
  return 0;
 }
 #endif
--- a/textcode/scan_base64url.3
+++ b/textcode/scan_base64url.3
@ -0,0 +1,32 @@
+.TH scan_base64url 3
+.SH NAME
+scan_base64url \- decode base64url encoded data
+.SH SYNTAX
+.B #include <libowfat/textcode.h>
+
+size_t \fBscan_base64url\fP(const char *\fIsrc\fR,char *\fIdest\fR,size_t* \fIdestlen\fR);
+
+.SH DESCRIPTION
+base64url is a variant of base64 for use in URLs (standard base64 uses /
+and +, which can cause problems in URLs, so base64url uses - and _
+instead; also base64url does not use = padding at the end).
+
+scan_base64url decodes base64url encoded data from src into dest.
+It will stop when it encounters any invalid input character.
+It will then write the number of decoded bytes in dest into *destlen,
+and return the number of bytes decoded from src.
+
+Unlike scan_base64, scan_base64url does not consume = padding. A
+trailing = is treated like any other invalid character: scanning
+stops in front of it and it is not counted in the return value.
+
+dest can be NULL. destlen can be NULL.
+
+.SH "RETURN VALUE"
+scan_base64url returns the number of bytes successfully scanned and
+processed from src.
+.SH EXAMPLES
+scan_base64url("Zm5vcmQ",buf,&i) -> return 7, i=5, buf="fnord"
+
+.SH "SEE ALSO"
+scan_base64(3), scan_xlong(3), scan_8long(3), fmt_ulong(3)
--- a/textcode/scan_base64url.c
+++ b/textcode/scan_base64url.c
@ -15,17 +15,36 @@ static inline int dec(unsigned char x) {
 size_t scan_base64url(const char *src,char *dest,size_t *destlen) {
  unsigned short tmp=0,bits=0;
  register const unsigned char* s=(const unsigned char*) src;
-  const char* orig=dest;
-  for (;;) {
+  size_t i,j=0;
+  for (i=0;;) {
    int a=dec(*s);
-    if (a<0) break;
+    if (a<0) break;	/* base64url does not have padding */
    tmp=(tmp<<6)|a; bits+=6;
    ++s;
    if (bits>=8) {
-      *dest=(tmp>>(bits-=8));
-      ++dest;
+      bits-=8;
+      if (dest) dest[i]=(tmp>>bits);
+      ++i;
    }
  }
-  *destlen=dest-orig;
+  if (destlen) *destlen=i;
  return (const char*)s-src;
 }
+
+#ifdef UNITTEST
+#include <assert.h>
+#include <string.h>
+#include <stdio.h>
+
+int main() {
+  char buf[100];
+  size_t i,l;
+  /* check that we don't consume padding */
+  memset(buf,0,10); assert(scan_base64url("Zm5vcmQ=",buf,&l)==7 && l==5 && !memcmp(buf,"fnord",6));
+  /* check that we don't insist on the padding */
+  memset(buf,0,10); assert(scan_base64url("Zm5vcmQ",buf,&l)==7 && l==5 && !memcmp(buf,"fnord",6));
+  /* check the special non-isalnum chars :) */
+  memset(buf,0,10); assert(scan_base64url("_-8=",buf,&l)==3 && l==2 && !memcmp(buf,"\xff\xef",3));
+  return 0;
+}
+#endif
--- a/textcode/scan_html.c
+++ b/textcode/scan_html.c
@ -26,8 +26,8 @@ static const char* lookup(size_t ofs,const char* t) {
 }

 enum htmlmode {	/* <a href="http://example.com/&quot;foo">libowfat&lt;home</a> */
-  OUTSIDE,	/*                                        ^^^^^^^^^^^^^^^^ -> `libowfat<home` */
-  TAGARG,	/*         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -> `http://example.com/"foo´ */
+  OUTSIDE,	/*                                        ^^^^^^^^^^^^^^^^ -> libowfat<home */
+  TAGARG,	/*         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -> http://example.com/"foo */
 };

 static size_t scan_html_inner(const char *src,char *dest,size_t *destlen,enum htmlmode mode) {
@ -42,49 +42,41 @@ static size_t scan_html_inner(const char *src,char *dest,size_t *destlen,enum ht
 	size_t j;
 	if ((s[i+2]&~32)=='X') {
 	  j=scan_xlong(src+i+3,&l);
-	  if (!j) j+=3;
+	  if (j) j+=3;
 	} else {
 	  j=scan_ulong(src+i+2,&l);
-	  if (!j) j+=3;
+	  if (j) j+=2;
 	}
 	if (s[i+j]==';') {
 	  i+=j;
-	  written+=fmt_utf8(dest+written,l);
+	  written+=fmt_utf8(dest?dest+written:0,l);
 	} else {
-	  dest[written++]='&';
+	  if (dest) dest[written]='&';
+	  ++written;
 	}
 	continue;
      }
      utf8=lookup(1,src+i+1);
      if (utf8) {
 	size_t l=strlen(utf8);
-	memcpy(dest+written,utf8,l);
+	if (dest) memcpy(dest+written,utf8,l);
 	written+=l;
 	i+=2+str_chr(src+i+2,';');
 	continue;
      } else
-	dest[written]='&';
+	if (dest) dest[written]='&';
    } else if (s[i]=='<') {
-      if (mode == OUTSIDE) break;
-      if (case_starts((const char*)s+i+1,"br>")) {
-	dest[written]='\n';
-	i+=3;
-      } else if (case_starts((const char*)s+i+1,"p>")) {
-	dest[written]='\n'; ++written;
-	dest[written]='\n';
-	i+=3;
-      } else
-	dest[written]=s[i];
+      break;
    } else if (s[i]=='"' && mode==TAGARG) {
      if (i==0) { dq=1; continue; }
      break;
    } else if (mode==TAGARG && !dq && (s[i]==' ' || s[i]=='\t' || s[i]=='\n'))
      break;
    else
-      dest[written]=s[i];
+      if (dest) dest[written]=s[i];
    ++written;
  }
-  *destlen=written;
+  if (destlen) *destlen=written;
  return i;
 }

@ -98,13 +90,40 @@ size_t scan_html(const char *src,char *dest,size_t *destlen) {

 #ifdef UNITTEST
 #include <assert.h>
+#undef UNITTEST
+#include <scan/scan_fromhex.c>
+#include <scan/scan_xlongn.c>
+#include <scan/scan_xlong.c>
+#include <scan/scan_ulongn.c>
+#include <scan/scan_ulong.c>
+#include <str/str_chr.c>
+#include <fmt/fmt_utf8.c>
+#include <stdio.h>

 int main() {
  char* html="<a href=\"http://example.com/&quot;foo\">libowfat&lt;home</a>";
  char buf[100];
  size_t destlen;
+  /* check that we stop at < */
  assert(scan_html(html,buf,&destlen)==0 && destlen==0);
-  assert(scan_html(strchr(html,'>')+1,buf,&destlen)==16 && destlen==13 && !memcmp(buf,"libowfat<home",13));
-  assert(scan_html_tagarg(strchr(html,'"')+1,buf,&destlen)==28 && destlen==23 && !memcmp(buf,"http://example.com/\"foo",23));
+  /* check that we properly decode &lt; */
+  memset(buf,'?',sizeof(buf));
+  assert(scan_html(strchr(html,'>')+1,buf,&destlen)==16 && destlen==13 && !memcmp(buf,"libowfat<home?",14));
+  /* check that we stop at " and properly decode &quot; */
+  memset(buf,'?',sizeof(buf));
+  assert(scan_html_tagarg(strchr(html,'"')+1,buf,&destlen)==28 && destlen==23 && !memcmp(buf,"http://example.com/\"foo?",24));
+  /* check that we pass through invalid escapes */
+  memset(buf,'?',sizeof(buf));
+  assert(scan_html("&fnord;",buf,&destlen)==7 && destlen==7 && !memcmp(buf,"&fnord;?",8));
+  memset(buf,'?',sizeof(buf));
+  assert(scan_html("&#x;",buf,&destlen)==4 && destlen==4 && !memcmp(buf,"&#x;?",5));
+  memset(buf,'?',sizeof(buf));
+  assert(scan_html("&#;",buf,&destlen)==3 && destlen==3 && !memcmp(buf,"&#;?",4));
+  /* check that &#x[hex]; is decoded properly */
+  memset(buf,'?',sizeof(buf));
+  assert(scan_html("&#x1;",buf,&destlen)==5 && destlen==1 && buf[0]==1 && buf[1]=='?');
+  /* check that &#[decimal]; is decoded properly */
+  memset(buf,'?',sizeof(buf));
+  assert(scan_html("&#1;",buf,&destlen)==4 && destlen==1 && buf[0]==1 && buf[1]=='?');
 }
 #endif