/*
 * HTML text decoder: scan_html() and scan_html_tagarg() copy text from an
 * HTML fragment into a destination buffer, replacing character references
 * ("&...;") on the way.
 *
 * NOTE(review): this copy of the file looks extraction-damaged.  In several
 * places everything from a '<' up to the next '>' appears to have been
 * stripped (header names of the system includes, the middle of lookup(),
 * part of enum htmlmode, part of the test string in main()).  The code
 * tokens below are kept exactly as found; restore the missing pieces from
 * the upstream source before trying to build this.
 */

/* NOTE(review): the next two includes have lost their header names. */
#include
#include
#include "entities.h"
#include "fmt.h"
#include "textcode.h"
#include "haveinline.h"
#include "scan.h"
#include "case.h"
#include "str.h"

/*
 * lookup: resolve the entity name starting at t, beginning the search at
 * offset ofs of entities.tab (declared in entities.h, layout not visible
 * here).
 *
 * What the surviving text shows:
 *   - returns 0 right away when ofs is greater than entities.tab[0];
 *   - recurses with entities.tab[ofs]>>8 as the new offset and t+1 as the
 *     remaining name;
 *   - returns NULL when the loop ends without a match.
 *
 * NOTE(review): the body is NOT valid C as it stands.  Text is missing
 * between "while (ofs" and ">8);", `ch` is used but its declaration is
 * gone, and the braces no longer balance.  Do not try to infer the lost
 * logic from this copy.
 */
static const char* lookup(size_t ofs,const char* t) {
  if (ofs>entities.tab[0]) return 0;
  while (ofs>8);
  else return lookup(entities.tab[ofs]>>8,t+1);
  } else ++ofs;
  if (!ch) break;
  }
  return NULL;
}

/*
 * htmlmode: selects which part of an HTML document scan_html_inner() is
 * decoding.
 *
 * NOTE(review): only OUTSIDE survives in this copy, but the code below also
 * uses TAGARG, so at least one enumerator was lost.  The example comments
 * inside the enum are garbled by the same damage and are kept verbatim.
 */
enum htmlmode {
  /* libowfat<home */
  OUTSIDE,
  /* ^^^^^^^^^^^^^^^^ -> libowfat http://example.com/"foo */
};

/*
 * scan_html_inner: decode HTML text from src.
 *
 *   src      NUL-terminated input; scanning stops at the NUL at the latest.
 *   dest     output buffer, or NULL to only count the output length.  No
 *            capacity is passed in, so the caller has to provide a buffer
 *            that is large enough (a first pass with dest==NULL yields the
 *            needed length through destlen).
 *   destlen  if non-NULL, receives the number of bytes produced.
 *   mode     OUTSIDE or TAGARG, see below.
 *
 * Returns the index in src at which scanning stopped, i.e. the number of
 * input bytes consumed.
 *
 * Stop conditions:
 *   - a '<' in either mode;
 *   - TAGARG only: a '"' that is not the first byte;
 *   - TAGARG only, value not opened with '"': space, tab or newline.
 * The byte that caused the stop is not consumed and not written.
 */
static size_t scan_html_inner(const char *src,char *dest,size_t *destlen,enum htmlmode mode) {
  /* unsigned view of src so that byte comparisons do not depend on the
   * signedness of plain char */
  register const unsigned char* s=(const unsigned char*) src;
  size_t written=0,i;
  int dq=0;       /* set to 1 when a TAGARG value started with '"' */
  for (i=0; s[i]; ++i) {
    if (s[i]=='&') {
      const char* utf8;
      if (s[i+1]=='#') {
        /* numeric character reference: "&#123;" or "&#x7b;" */
        unsigned long l;
        size_t j;   /* offset from the '&' to where the ';' is expected;
                     * stays 0 if no digits were parsed */
        if ((s[i+2]&~32)=='X') {
          /* clearing bit 5 folds 'x' to 'X'; scan_xlong is presumably the
           * hex parser from scan.h returning the number of bytes read */
          j=scan_xlong(src+i+3,&l);
          if (j) j+=3;
        } else {
          /* scan_ulong is presumably the decimal counterpart */
          j=scan_ulong(src+i+2,&l);
          if (j) j+=2;
        }
        if (s[i+j]==';') {
          /* with j==0 this tests the '&' itself and therefore fails */
          i+=j;   /* now on the ';'; the loop's ++i steps past it */
          /* fmt_utf8 presumably encodes l as UTF-8 and returns the length,
           * only counting when given a null destination */
          written+=fmt_utf8(dest?dest+written:0,l);
        } else {
          /* malformed reference: emit the '&' literally; the following
           * bytes are handled by the next loop iterations */
          if (dest) dest[written]='&';
          ++written;
        }
        /* both paths skip the shared ++written at the end of the loop */
        continue;
      }
      /* named character reference: look up the name after the '&' */
      utf8=lookup(1,src+i+1);
      if (utf8) {
        size_t l=strlen(utf8);
        if (dest) memcpy(dest+written,utf8,l);
        written+=l;
        /* advance to the terminating ';' (str_chr presumably returns the
         * offset of the first ';'); the loop's ++i steps past it */
        i+=2+str_chr(src+i+2,';');
        continue;
      } else
        /* unknown name: keep the '&'; falls through to ++written below */
        if (dest) dest[written]='&';
    } else if (s[i]=='<') {
      break;
    } else if (s[i]=='"' && mode==TAGARG) {
      if (i==0) {
        /* opening quote of the value: remember it and drop the quote */
        dq=1;
        continue;
      }
      /* any later '"' ends the value; note that dq is not checked here */
      break;
    } else if (mode==TAGARG && !dq && (s[i]==' ' || s[i]=='\t' || s[i]=='\n'))
      break;
    else
      /* ordinary byte: copy through */
      if (dest) dest[written]=s[i];
    /* counts the '&' of an unknown entity or the ordinary byte above */
    ++written;
  }
  if (destlen) *destlen=written;
  return i;
}

/*
 * scan_html_tagarg: decode an attribute value; forwards to
 * scan_html_inner() with mode TAGARG.  Parameters and return value are
 * those of scan_html_inner().
 */
size_t scan_html_tagarg(const char *src,char *dest,size_t *destlen) {
  return scan_html_inner(src,dest,destlen,TAGARG);
}

/*
 * scan_html: decode text outside of tags; forwards to scan_html_inner()
 * with mode OUTSIDE.  Parameters and return value are those of
 * scan_html_inner().
 */
size_t scan_html(const char *src,char *dest,size_t *destlen) {
  return scan_html_inner(src,dest,destlen,OUTSIDE);
}

#ifdef UNITTEST
/* NOTE(review): every include in this section has lost its header name;
 * assert() below presumably came from one of them. */
#include
#undef UNITTEST
#include
#include
#include
#include
#include
#include
#include
#include

/*
 * Self-test, only compiled when UNITTEST is defined.
 *
 * NOTE(review): the html literal looks truncated by the same damage.  As
 * written it contains no '>' (so the strchr() below would return NULL) and
 * does not start with '<' (so the first assert would not hold).  The
 * function also runs past the end of this view, inside a string literal.
 */
int main() {
  char* html="libowfat<home";
  char buf[100];
  size_t destlen;
  /* check that we stop at < */
  assert(scan_html(html,buf,&destlen)==0 && destlen==0);
  /* check that we properly decode < */
  memset(buf,'?',sizeof(buf));
  assert(scan_html(strchr(html,'>')+1,buf,&destlen)==16 && destlen==13 && !memcmp(buf,"libowfat