From d17c2d1012e9ce2350f345d2c8e66d1a886df6b6 Mon Sep 17 00:00:00 2001 From: leitner Date: Mon, 6 Feb 2012 07:59:56 +0000 Subject: [PATCH] add functions to encode and decode integers in variable length binary formats --- fmt/fmt_asn1derlength.3 | 18 ++++++++++++++ fmt/fmt_asn1derlength.c | 25 +++++++++++++++++++ fmt/fmt_utf8.3 | 19 +++++++++++++++ fmt/fmt_utf8.c | 26 ++++++++++++++++++++ scan/scan_asn1derlength.3 | 20 +++++++++++++++ scan/scan_asn1derlength.c | 25 +++++++++++++++++++ scan/scan_utf8.3 | 21 ++++++++++++++++ scan/scan_utf8.c | 51 +++++++++++++++++++++++++++++++++++++++ 8 files changed, 205 insertions(+) create mode 100644 fmt/fmt_asn1derlength.3 create mode 100644 fmt/fmt_asn1derlength.c create mode 100644 fmt/fmt_utf8.3 create mode 100644 fmt/fmt_utf8.c create mode 100644 scan/scan_asn1derlength.3 create mode 100644 scan/scan_asn1derlength.c create mode 100644 scan/scan_utf8.3 create mode 100644 scan/scan_utf8.c diff --git a/fmt/fmt_asn1derlength.3 b/fmt/fmt_asn1derlength.3 new file mode 100644 index 0000000..cf0e445 --- /dev/null +++ b/fmt/fmt_asn1derlength.3 @@ -0,0 +1,18 @@ +.TH fmt_asn1derlength 3 +.SH NAME +fmt_asn1derlength \- encode unsigned integer like ASN.1 DER length +.SH SYNTAX +.B #include <fmt.h> + +size_t \fBfmt_asn1derlength\fP(char *\fIdest\fR,unsigned long long \fIsource\fR); +.SH DESCRIPTION +fmt_asn1derlength encodes an unsigned integer using the ASN.1 DER length rules. This +can take from 1 byte (0-0x7f) up to sizeof(source)+1 bytes. + +If \fIdest\fR equals FMT_LEN (i.e. is NULL), fmt_asn1derlength returns the +number of bytes it would have written. + +For convenience, fmt.h defines the integer FMT_ASN1LENGTH to be big +enough to contain every possible fmt_asn1derlength output. 
/*
 * Tail of fmt/fmt_asn1derlength.3 (manual page) that shares this line:
 *
 *   .SH "SEE ALSO"
 *   scan_asn1derlength(3)
 */

/* ---- fmt/fmt_asn1derlength.c (the prototype is expected from "fmt.h") ---- */

/*
 * Encode l the way ASN.1 DER encodes a length and return the number of
 * bytes the encoding occupies.
 *
 *   l < 128:   a single byte holding l itself.
 *   l >= 128:  one byte 0x80+k followed by the k low-order bytes of l,
 *              most significant first, where k is the smallest byte count
 *              (at most sizeof l) that holds every set bit of l.
 *
 * dest: output buffer, or NULL to only compute the length.  When non-NULL
 *       it must have room for the returned number of bytes, which is at
 *       most sizeof(l)+1.
 * Returns the number of bytes written (or that would be written).
 */
size_t fmt_asn1derlength(char* dest,unsigned long long l) {
  /* Raw bytes go through unsigned char so values above 0x7f are stored
   * without relying on how the platform converts them to plain char. */
  unsigned char* out=(unsigned char*)dest;
  size_t nbytes,pos;

  if (l<128) {
    if (out) *out=(unsigned char)l;
    return 1;
  }

  /* Find the smallest number of bytes that covers all set bits of l.
   * nbytes stays below sizeof l inside the loop, so the shift count is
   * always smaller than the width of l. */
  for (nbytes=1; nbytes<sizeof l; ++nbytes)
    if (!(l>>(nbytes*8)))
      break;

  if (out) {
    *out++=(unsigned char)(0x80+nbytes);
    /* emit the value big-endian: highest used byte first */
    for (pos=nbytes; pos>0; --pos)
      *out++=(unsigned char)((l>>((pos-1)*8))&0xff);
  }
  return nbytes+1;
}

/*
 * ---- fmt/fmt_utf8.3 (manual page; its "SEE ALSO" tail is on the next line) ----
 *
 *   .TH fmt_utf8 3
 *   .SH NAME
 *   fmt_utf8 \- encode 31-bit unsigned integer using UTF-8 rules
 *   .SH SYNTAX
 *   .B #include <fmt.h>
 *
 *   size_t \fBfmt_utf8\fP(char *\fIdest\fR,uint32_t \fIsource\fR);
 *   .SH DESCRIPTION
 *   fmt_utf8 encodes a 31-bit unsigned integer using the UTF-8 rules. This
 *   can take from 1 byte (0-0x7f) up to 5 bytes (0x4000000-0x7fffffff).
 *   Values larger than 0x7fffffff cannot be represented in this encoding.
 *
 *   If \fIdest\fR equals FMT_LEN (i.e. is NULL), fmt_utf8 returns the
 *   number of bytes it would have written.
 *
 *   For convenience, fmt.h defines the integer FMT_UTF8 to be big enough to
 *   contain every possible fmt_utf8 output.
 *
 * NOTE(review): five UTF-8 bytes carry 2+4*6 = 26 payload bits (maximum
 * 0x3ffffff), so the range 0x4000000-0x7fffffff quoted above would need a
 * sixth byte -- confirm the page against fmt/fmt_utf8.c.
 */
/*
 * Tail of fmt/fmt_utf8.3 (manual page) that shares this line:
 *
 *   .SH "SEE ALSO"
 *   scan_utf8(3)
 */

/* ---- fmt/fmt_utf8.c (the prototype is expected from "fmt.h") ---- */

/*
 * Encode n using the UTF-8 bit layout and return the number of bytes used.
 *
 *   0x00      - 0x7f       1 byte
 *   0x80      - 0x7ff      2 bytes
 *   0x800     - 0xffff     3 bytes
 *   0x10000   - 0x1fffff   4 bytes
 *   0x200000  - 0x3ffffff  5 bytes
 *
 * dest: output buffer, or NULL to only compute the length.
 * Returns the byte count, or 0 (writing nothing) when n is larger than
 * 0x3ffffff, i.e. would not fit in five bytes.
 */
size_t fmt_utf8(char *dest,uint32_t n) {
  /* Bytes are assembled as unsigned values: building the lead byte by
   * right-shifting a (possibly negative) plain char only works where char
   * is signed and the shift is arithmetic. */
  unsigned char *out=(unsigned char*)dest;
  uint32_t limit;
  size_t total;

  if (n<=0x7f) {
    if (out) *out=(unsigned char)n;
    return 1;
  }

  /* limit is the largest value a sequence of `total` bytes can carry:
   * 11 bits for two bytes, then 5 more bits per additional byte. */
  for (limit=0x7ff,total=2; total<=5; limit=(limit<<5)|0x1f,++total) {
    if (n<=limit) {
      if (out) {
        size_t shift=(total-1)*6;
        /* lead byte: `total` one-bits, a zero bit, then the top payload bits */
        *out++=(unsigned char)(((0xff00u>>total)&0xff) | (n>>shift));
        /* continuation bytes: 10xxxxxx, six payload bits each */
        while (shift) {
          shift-=6;
          *out++=(unsigned char)(0x80 | ((n>>shift)&0x3f));
        }
      }
      return total;
    }
  }
  /* we were asked to encode a value that cannot be encoded */
  return 0;
}

/*
 * ---- scan/scan_asn1derlength.3 (manual page; its "SEE ALSO" tail is on the next line) ----
 *
 *   .TH scan_asn1derlength 3
 *   .SH NAME
 *   scan_asn1derlength \- decode an unsigned integer from ASN.1 DER length encoding
 *   .SH SYNTAX
 *   .B #include <scan.h>
 *
 *   size_t \fBscan_asn1derlength\fP(const char *\fIsrc\fR,size_t \fIlen\fR,unsigned long long *\fIdest\fR);
 *   .SH DESCRIPTION
 *   scan_asn1derlength decodes an unsigned integer in ASN.1 DER length encoding
 *   from a memory area holding binary data. It writes the decoded value in
 *   \fIdest\fR and returns the number of bytes it read from \fIsrc\fR.
 *
 *   scan_asn1derlength never reads more than \fIlen\fR bytes from \fIsrc\fR. If the
 *   sequence is longer than that, or the memory area contains an invalid
 *   sequence, scan_asn1derlength returns 0 and does not touch \fIdest\fR.
 *
 *   The length of the longest ASN.1 DER length sequence is 128 bytes. In
 *   practice the largest sequence is sizeof(*dest)+1.
 */
/*
 * Tail of scan/scan_asn1derlength.3 (manual page) that shares this line:
 *
 *   .SH "SEE ALSO"
 *   fmt_asn1derlength(3)
 */

/* ---- scan/scan_asn1derlength.c (the prototype is expected from "scan.h") ---- */

/*
 * Decode an ASN.1 length header from the len bytes at src.
 *
 * If the highest bit of the first byte is clear, that byte is the length.
 * Otherwise its low 7 bits give the number of following bytes that hold
 * the length, most significant byte first.
 *
 * On success the decoded value is stored in *length and the number of
 * header bytes consumed is returned.  On failure 0 is returned and *length
 * is left untouched.  Failure cases:
 *   - len is 0,
 *   - the header is truncated (fewer bytes available than announced),
 *   - the value does not fit in unsigned long long,
 *   - the decoded length is larger than the number of bytes that remain
 *     in the buffer after the header.
 *
 * No minimal-encoding check is made: a first byte of 0x80 decodes as
 * length 0, and long forms with leading zero bytes are accepted.
 */
size_t scan_asn1derlength(const char* src,size_t len,unsigned long long* length) {
  const unsigned char* in=(const unsigned char*)src;
  unsigned long long value;
  size_t used;

  if (len==0) return 0;

  if (in[0]&0x80) {
    size_t count=in[0]&0x7f;
    size_t i;
    if (count>len-1) return 0;                /* header runs past the buffer */
    value=0;
    for (i=1; i<=count; ++i) {
      if (value>(~0ULL>>8)) return 0;         /* next shift would overflow */
      value=(value<<8)|in[i];
    }
    used=count+1;
  } else {
    value=in[0];
    used=1;
  }

  /* The announced payload must fit in what is left of the buffer.  Compare
   * sizes rather than forming src+value, which could point far outside
   * the object. */
  if (value>len-used) return 0;

  *length=value;                              /* store only after every check passed */
  return used;
}

/*
 * ---- scan/scan_utf8.3 (manual page; its "SEE ALSO" tail is on the next line) ----
 *
 *   .TH scan_utf8 3
 *   .SH NAME
 *   scan_utf8 \- decode an unsigned integer from UTF-8 encoding
 *   .SH SYNTAX
 *   .B #include
 *
 *   size_t \fBscan_utf8\fP(const char *\fIsrc\fR,size_t \fIlen\fR,uint32_t *\fIdest\fR);
 *   .SH DESCRIPTION
 *   scan_utf8 decodes an unsigned integer in UTF-8 encoding from a memory
 *   area holding binary data. It writes the decode value in \fIdest\fR and
 *   returns the number of bytes it read from \fIsrc\fR.
 *
 *   scan_utf8 never reads more than \fIlen\fR bytes from \fIsrc\fR. If the
 *   sequence is longer than that, or the memory area contains an invalid
 *   sequence, scan_utf8 returns 0 and does not touch \fIdest\fR.
 *
 *   The length of the longest UTF-8 sequence is 5. If the buffer is longer
 *   than that, and scan_utf8 fails, then the data was not a valid UTF-8
 *   encoded sequence.
 *
 * NOTE(review): the header name after "#include" is missing from the text
 * above; scan/scan_utf8.c itself includes "fmt.h" -- confirm which header
 * declares scan_utf8.
 */
+.SH "SEE ALSO" +fmt_utf8(3) diff --git a/scan/scan_utf8.c b/scan/scan_utf8.c new file mode 100644 index 0000000..caa96a6 --- /dev/null +++ b/scan/scan_utf8.c @@ -0,0 +1,51 @@ +#include "fmt.h" + +size_t scan_utf8(const char* in,size_t len,uint32_t* num) { + uint32_t i,k,m; + const char* orig=in; + if (len==0) return 0; + i=(*(unsigned char*)in++); /* grab first byte */ + if (i>=0xfe || /* 0xfe and 0xff are invalid encodings in utf-8 for the first byte */ + (i&0xc0)==0x80) return 0; /* first bits being 10 marks continuation chars, invalid sequence for first byte */ + for (k=0; i&0x80; i<<=1, ++k); /* count leading 1 bits */ + if (!k) { + if (num) *num=i; + return 1; + } + if (k>len) return 0; + i=(i&0xff)>>k; /* mask the leading 1 bits */ + /* The next part is a little tricky. + * UTF-8 says that the encoder has to choose the most efficient + * encoding, and the decoder has to reject other encodings. The + * background is that attackers encoded '/' not as 0x2f but as 0xc0 + * 0xaf, and that evaded bad security checks just scan for the '/' + * byte in pathnames. + * At this point k contains the number of bytes, so k-1 is the number + * of continuation bytes. For each additional continuation byte, we + * gain 6 bits of storage space, but we lose one in the signalling in + * the initial byte. So we have 6 + (k-1) * 5 bits total storage + * space for this encoding. The minimum value for k bytes is the + * maximum number for k-1 bytes plus 1. If the previous encoding has + * 11 bits, its maximum value is 11 1-bits or 0x7ff, and the minimum + * value we are looking for is 0x800 or 1<<11. For 2 bytes, UTF-8 can + * encode 11 bits, after that each additional byte gains 5 more bits. 
+ * So for k>2, we want + * 1 << (11+(k-3)*5) + * or optimized to get rid of the -3 + * 1 << (k*5-4) + * but for k==2 the delta is 4 bits (not 5), so we want + * 1 << 7 + * abusing the fact that a boolean expression evaluates to 0 or 1, the + * expression can be written as + * 1 << (k*5-4+(k==2)) + */ + m=(1<<(k*5-4+(k==2))); + while (k>1) { + i=(i<<6) | ((*in++)&0x3f); + --k; + } + if (i