From d17c2d1012e9ce2350f345d2c8e66d1a886df6b6 Mon Sep 17 00:00:00 2001
From: leitner <leitner>
Date: Mon, 6 Feb 2012 07:59:56 +0000
Subject: [PATCH] add functions to encode and decode integers in variable
 length binary formats

---
 fmt/fmt_asn1derlength.3   | 18 ++++++++++++++
 fmt/fmt_asn1derlength.c   | 25 +++++++++++++++++++
 fmt/fmt_utf8.3            | 19 +++++++++++++++
 fmt/fmt_utf8.c            | 26 ++++++++++++++++++++
 scan/scan_asn1derlength.3 | 20 +++++++++++++++
 scan/scan_asn1derlength.c | 25 +++++++++++++++++++
 scan/scan_utf8.3          | 21 ++++++++++++++++
 scan/scan_utf8.c          | 51 +++++++++++++++++++++++++++++++++++++++
 8 files changed, 205 insertions(+)
 create mode 100644 fmt/fmt_asn1derlength.3
 create mode 100644 fmt/fmt_asn1derlength.c
 create mode 100644 fmt/fmt_utf8.3
 create mode 100644 fmt/fmt_utf8.c
 create mode 100644 scan/scan_asn1derlength.3
 create mode 100644 scan/scan_asn1derlength.c
 create mode 100644 scan/scan_utf8.3
 create mode 100644 scan/scan_utf8.c
diff --git a/fmt/fmt_asn1derlength.3 b/fmt/fmt_asn1derlength.3
new file mode 100644
index 0000000..cf0e445
--- /dev/null
+++ b/fmt/fmt_asn1derlength.3
@@ -0,0 +1,18 @@
+.TH fmt_asn1derlength 3
+.SH NAME
+fmt_asn1derlength \- encode unsigned integer like ASN.1 DER length
+.SH SYNTAX
+.B #include <fmt.h>
+
+size_t \fBfmt_asn1derlength\fP(char *\fIdest\fR,unsigned long long \fIsource\fR);
+.SH DESCRIPTION
+fmt_asn1derlength encodes an unsigned integer using the ASN.1 DER length
+rules.  This can take from 1 byte (0-0x7f) up to sizeof(source)+1 bytes.
+
+If \fIdest\fR equals FMT_LEN (i.e. is NULL), fmt_asn1derlength returns the
+number of bytes it would have written.
+
+For convenience, fmt.h defines the integer FMT_ASN1LENGTH to be big
+enough to contain every possible fmt_asn1derlength output.
+.SH "SEE ALSO"
+scan_asn1derlength(3)
diff --git a/fmt/fmt_asn1derlength.c b/fmt/fmt_asn1derlength.c
new file mode 100644
index 0000000..2ab3c91
--- /dev/null
+++ b/fmt/fmt_asn1derlength.c
@@ -0,0 +1,25 @@
+#include "fmt.h"
+
+/* Encode l the way ASN.1 DER encodes a length and return the number of
+ * bytes of the encoding.  Nothing is written if dest is NULL.
+ * Values below 128 are stored in a single byte; anything larger becomes
+ * 0x80+count followed by count bytes, most significant byte first. */
+size_t fmt_asn1derlength(char* dest,unsigned long long l) {
+  size_t count=1;
+  unsigned int shift;
+  if (l<128) {
+    if (dest) *dest=l&0x7f;
+    return 1;
+  }
+  /* count ends up as the number of bytes needed to hold l */
+  while (count<sizeof l && (l>>(count*8)))
+    ++count;
+  if (!dest) return count+1;
+  *dest++=0x80+count;
+  shift=count*8;
+  do {
+    shift-=8;
+    *dest++=((l>>shift)&0xff);
+  } while (shift);
+  return count+1;
+}
diff --git a/fmt/fmt_utf8.3 b/fmt/fmt_utf8.3
new file mode 100644
index 0000000..ccbab94
--- /dev/null
+++ b/fmt/fmt_utf8.3
@@ -0,0 +1,19 @@
+.TH fmt_utf8 3
+.SH NAME
+fmt_utf8 \- encode 31-bit unsigned integer using UTF-8 rules
+.SH SYNTAX
+.B #include <fmt.h>
+
+size_t \fBfmt_utf8\fP(char *\fIdest\fR,uint32_t \fIsource\fR);
+.SH DESCRIPTION
+fmt_utf8 encodes a 31-bit unsigned integer using the UTF-8 rules.  This
+can take from 1 byte (0-0x7f) up to 6 bytes (0x4000000-0x7fffffff).
+Values larger than 0x7fffffff cannot be represented in this encoding.
+
+If \fIdest\fR equals FMT_LEN (i.e. is NULL), fmt_utf8 returns the
+number of bytes it would have written.
+
+For convenience, fmt.h defines the integer FMT_UTF8 to be big enough to
+contain every possible fmt_utf8 output.
+.SH "SEE ALSO"
+scan_utf8(3)
diff --git a/fmt/fmt_utf8.c b/fmt/fmt_utf8.c
new file mode 100644
index 0000000..74051eb
--- /dev/null
+++ b/fmt/fmt_utf8.c
@@ -0,0 +1,26 @@
+#include "fmt.h"
+
+/* Encode n (0..0x7fffffff) with the UTF-8 rules and return the number of
+ * bytes of the encoding (1-6), or 0 if n does not fit in 31 bits.  Nothing
+ * is written if dest is NULL.  NOTE(review): the largest values need 6
+ * bytes; confirm that FMT_UTF8 in fmt.h allows for that. */
+size_t fmt_utf8(char *dest,uint32_t n) {
+  uint32_t limit;
+  size_t len,shift;
+  if (n<=0x7f) {
+    if (dest) *dest=(char)n;
+    return 1;
+  }
+  if (n>0x7fffffff) return 0;	/* cannot be encoded */
+  for (len=2,limit=0x7ff; n>limit; ++len)	/* limit: largest value that fits in len bytes */
+    limit=(limit<<5)|0x1f;
+  if (dest) {
+    /* lead byte: len 1-bits, a 0-bit, top payload bits (no reliance on char signedness) */
+    shift=(len-1)*6;
+    *dest++=(char)(((0xff00u>>len)&0xff) | (n>>shift));
+    for (; shift; shift-=6)
+      *dest++=(char)(0x80 | ((n>>(shift-6))&0x3f));
+  }
+  return len;
+}
+
diff --git a/scan/scan_asn1derlength.3 b/scan/scan_asn1derlength.3
new file mode 100644
index 0000000..9f5dcb6
--- /dev/null
+++ b/scan/scan_asn1derlength.3
@@ -0,0 +1,20 @@
+.TH scan_asn1derlength 3
+.SH NAME
+scan_asn1derlength \- decode an unsigned integer from ASN.1 DER length encoding
+.SH SYNTAX
+.B #include <scan.h>
+
+size_t \fBscan_asn1derlength\fP(const char *\fIsrc\fR,size_t \fIlen\fR,unsigned long long *\fIdest\fR);
+.SH DESCRIPTION
+scan_asn1derlength decodes an unsigned integer in ASN.1 DER length encoding
+from a memory area holding binary data.  It writes the decoded value to
+\fIdest\fR and returns the number of bytes it read from \fIsrc\fR.
+
+scan_asn1derlength never reads more than \fIlen\fR bytes from \fIsrc\fR.  If the
+sequence is longer than that, or the memory area contains an invalid
+sequence, scan_asn1derlength returns 0 and does not touch \fIdest\fR.
+
+The length of the longest ASN.1 DER length sequence is 128 bytes.  In
+practice the largest sequence is sizeof(*dest)+1.
+.SH "SEE ALSO"
+fmt_asn1derlength(3)
diff --git a/scan/scan_asn1derlength.c b/scan/scan_asn1derlength.c
new file mode 100644
index 0000000..e546cd0
--- /dev/null
+++ b/scan/scan_asn1derlength.c
@@ -0,0 +1,25 @@
+#include "scan.h"
+
+/* Decode an ASN.1 length from the len bytes at src.  Returns the number of
+ * bytes the length field occupies, or 0 if the field is truncated, overflows
+ * or announces more payload than remains in the buffer.  *length is written
+ * only on success. */
+size_t scan_asn1derlength(const char* src,size_t len,unsigned long long* length) {
+  const unsigned char* p=(const unsigned char*)src;
+  unsigned long long l;
+  size_t used=1,n;
+  if (len==0) return 0;
+  l=p[0]&0x7f;
+  if (p[0]&0x80) {
+    /* long form: the low 7 bits count the big-endian length bytes that follow */
+    n=l;
+    if (n>len-1) return 0;	/* length field is truncated */
+    for (l=0; used<=n; ++used) {
+      if (l>(((unsigned long long)-1)>>8)) return 0;	/* catch integer overflow */
+      l=l*256+p[used];
+    }
+  }
+  if (l>len-used) return 0;	/* compare sizes; pointer arithmetic could wrap */
+  *length=l;
+  return used;
+}
diff --git a/scan/scan_utf8.3 b/scan/scan_utf8.3
new file mode 100644
index 0000000..5512e24
--- /dev/null
+++ b/scan/scan_utf8.3
@@ -0,0 +1,21 @@
+.TH scan_utf8 3
+.SH NAME
+scan_utf8 \- decode an unsigned integer from UTF-8 encoding
+.SH SYNTAX
+.B #include <scan.h>
+
+size_t \fBscan_utf8\fP(const char *\fIsrc\fR,size_t \fIlen\fR,uint32_t *\fIdest\fR);
+.SH DESCRIPTION
+scan_utf8 decodes an unsigned integer in UTF-8 encoding from a memory
+area holding binary data.  It writes the decoded value to \fIdest\fR and
+returns the number of bytes it read from \fIsrc\fR.
+
+scan_utf8 never reads more than \fIlen\fR bytes from \fIsrc\fR.  If the
+sequence is longer than that, or the memory area contains an invalid
+sequence, scan_utf8 returns 0 and does not touch \fIdest\fR.
+
+The length of the longest UTF-8 sequence is 6.  If the buffer is at least
+that long, and scan_utf8 fails, then the data was not a valid UTF-8
+encoded sequence.
+.SH "SEE ALSO"
+fmt_utf8(3)
diff --git a/scan/scan_utf8.c b/scan/scan_utf8.c
new file mode 100644
index 0000000..caa96a6
--- /dev/null
+++ b/scan/scan_utf8.c
@@ -0,0 +1,51 @@
+#include "fmt.h"
+
+/* Decode one UTF-8 sequence (1 to 6 bytes, values up to 0x7fffffff) from
+ * the len bytes at in.  Returns the number of bytes consumed and stores the
+ * value in *num (unless num is NULL).  Returns 0 and leaves *num alone if
+ * the sequence is truncated, malformed or not the shortest encoding. */
+size_t scan_utf8(const char* in,size_t len,uint32_t* num) {
+  const unsigned char* p=(const unsigned char*)in;
+  uint32_t i,k,m,j;
+  if (len==0) return 0;
+  i=p[0];
+  /* 0xfe and 0xff never occur in UTF-8, and a byte of the form 10xxxxxx
+   * is a continuation byte, which cannot start a sequence */
+  if (i>=0xfe || (i&0xc0)==0x80) return 0;
+  for (k=0; i&0x80; i<<=1, ++k);	/* count leading 1 bits */
+  if (!k) {
+    if (num) *num=i;
+    return 1;
+  }
+  if (k>len) return 0;		/* sequence is longer than the buffer */
+  i=(i&0xff)>>k;		/* mask the leading 1 bits */
+  /* UTF-8 says that the encoder has to choose the most efficient
+   * encoding, and the decoder has to reject other encodings.  The
+   * background is that attackers encoded '/' not as 0x2f but as 0xc0
+   * 0xaf, and that evaded bad security checks that just scan for the
+   * '/' byte in pathnames.
+   * At this point k is the total number of bytes in the sequence.  The
+   * minimum value for k bytes is the maximum value for k-1 bytes plus 1.
+   * One byte holds 7 bits and two bytes hold 11 bits; after that each
+   * additional byte gains 6 bits of payload but costs one bit in the
+   * initial byte, for a net gain of 5 bits.  So for k>2 the minimum is
+   *   1 << (11+(k-3)*5)
+   * or optimized to get rid of the -3
+   *   1 << (k*5-4)
+   * but for k==2 the previous encoding has only 7 bits, so we want
+   *   1 << 7
+   * Since a boolean expression evaluates to 0 or 1, both cases can be
+   * written as
+   *   1 << (k*5-4+(k==2))
+   */
+  m=((uint32_t)1<<(k*5-4+(k==2)));
+  for (j=1; j<k; ++j) {
+    /* every remaining byte must have the form 10xxxxxx */
+    if ((p[j]&0xc0)!=0x80) return 0;
+    i=(i<<6) | (p[j]&0x3f);
+  }
+  if (i<m) return 0;	/* if the encoded value was less than m, reject */
+  if (num) *num=i;
+  return k;
+}
+