#define _BSD_SOURCE #define _DEFAULT_SOURCE #include #include #include #include #include "scan.h" #include #define INTERNAL #include "scan/scan_ulong.c" #include "scan/scan_ulongn.c" #include "fmt/fmt_utf8.c" #include "fmt/fmt_tohex.c" #include "fmt/fmt_escapecharc.c" /* Generator tool: main (further down) reads entities.json, inserts each entity name with its UTF-8 expansion into a trie via addword, has marshal pack the trie into one heap blob, and opens entities.h for output. NOTE(review): this copy of the file looks extraction-damaged - the header names after the include directives and several spans of code further down are missing - restore from the pristine file before building. */ char tmp[20]; char tmp2[20]; size_t n,m; unsigned long l; /* One parsed entity. main stores a strdup copy of the name (text after the leading ampersand, trailing semicolon cut off) in entity, the bytes produced by fmt_utf8 in utf8, and 0 in next. root and cur are the list head and the pointer to the slot main allocates into. */ struct entity { const char* entity; char utf8[10]; struct entity* next; }* root,** cur=&root; /* One slot of a trie node. addword claims slot number i by storing the byte value i in c, so a slot is in use exactly when c equals its own index. weiter is the child node for the following key byte; in the slot for the terminating NUL it instead holds the payload pointer handed to addword. */ struct letter { unsigned char c; struct letters* weiter; uint32_t marshaled; /* lower 8 bits: char. rest: ofs from start of marshaled blob */ }; /* A trie node: liste has one slot per possible byte value and n counts the slots addword has claimed in this node. */ struct letters { size_t n; struct letter liste[256]; }; /* Root of the trie that main fills through addword. */ struct letters* d; /* Running totals kept by addword: nodes counts claimed slots over the whole trie, datasize sums strlen+1 of every payload stored. marshal sizes its blob from them. */ size_t nodes,datasize; /* Print an allocation-failure message to stderr and terminate the process with exit status 1. */ void nomem() { fprintf(stderr, "memory allocation failure!\n"); exit(1); } /* Insert the NUL-terminated key t into the trie rooted at *s and attach pointer as its payload. s: address of the node pointer for the current key position; a NULL node is allocated here. t: remaining key bytes. pointer: payload; it is measured with strlen, so it has to be a NUL-terminated string. Recurses once per key byte and ends at the NUL terminator. Calls nomem, which exits, if malloc fails. */ void addword(struct letters** s,const char* t, void* pointer) { size_t i; if (!*s) { *s=malloc(sizeof(**s)); if (!*s) nomem(); memset(*s,0,sizeof(**s)); /* zero-filled memory would make slot 0 look claimed (c of 0 equals index 0), so give it a non-matching value */ (*s)->liste[0].c='?'; } i=(unsigned char)*t; /* slot already claimed: update the payload at the terminator, else descend. NOTE(review): c is unsigned char while *t is plain char; if char is signed, a byte of 0x80 or above could never compare equal here - confirm keys are ASCII only */ if ((*s)->liste[i].c==*t) { if (!*t) { datasize+=strlen((char*)pointer)+1; (*s)->liste[i].weiter=pointer; } else addword(&(*s)->liste[i].weiter,t+1,pointer); return; } /* first use of this slot: count it and claim it */ ++nodes; (*s)->n++; (*s)->liste[i].c=*t; if (!*t) { datasize+=strlen((char*)pointer)+1; (*s)->liste[i].weiter=pointer; } else { (*s)->liste[i].weiter=0; addword(&(*s)->liste[i].weiter,t+1,pointer); } } /* Appears to be a debug printer that walks the claimed slots of s recursively, with depth as the nesting level. NOTE(review): from here on parts of the text are missing in this copy (the tail of dump and the head of marshalhelper run together), so nothing below is annotated. */ void dump(struct letters* s,size_t depth) { size_t i,j; if (!s) return; for (i=0; i<256; ++i) { if (s->liste[i].c!=i) continue; for (j=0; j {\n",s->liste[i].c); if (s->liste[i].c) dump(s->liste[i].weiter,depth+1); for (j=0; jn; assert(usedliste[i].c!=i) { if (i==0) return; continue; } /* printf("marshalhelper: %c\n",i); */ x=(unsigned char)s->liste[i].c; if (!x) { size_t l=strlen((char*)s->liste[i].weiter)+1; /* puts((char*)s->liste[i].weiter); */ x|=useddata<<8; assert(useddata+l<=datasize); memcpy(data+useddata,s->liste[i].weiter,l); useddata+=l; marshaled[++myindex]=x; return; } else { x|=(used+1)<<8; marshalhelper(s->liste[i].weiter); } marshaled[++myindex]=x; } /* 
printf("return\n"); */ } void marshal(struct letters* s) { fprintf(stderr,"nodes=%zu, datasize=%zu\n",nodes,datasize); { size_t l; heap=malloc(l=(nodes+1)*sizeof(uint32_t)+datasize); if (!heap) nomem(); memset(heap,0,l); } marshaled=(uint32_t*)heap; marshaled[0]=nodes+1; data=heap+(nodes+1)*sizeof(uint32_t); marshalhelper(s); fprintf(stderr,"actually used: %zu nodes, %zu bytes data\n",used,useddata); } char* lookup(char* ds,size_t ofs,const char* t) { uint32_t* tab=(uint32_t*)ds; if (ofs>tab[0]) return 0; while (ofs>8); else return lookup(ds,tab[ofs]>>8,t+1); } else ++ofs; if (!ch) break; } return NULL; } int main() { FILE* f=fopen("entities.json","r"); char buf[256]; if (!f) return 1; #if 0 puts("struct { const char* entity; const char* utf8; } codepoints[] = {"); #endif while (fgets(buf,sizeof(buf),f)) { char* s,* entity; size_t ul; if (!isspace(buf[0])) continue; for (s=buf; *s && *s!='"'; ++s) ; /* skip whitespace */ if (!(*s=='"')) continue; ++s; entity=s; if (*entity!='&') continue; ++entity; ++s; for (; *s && *s!='"'; ++s) ; /* skip to end of entity */ if (!(*s=='"')) continue; if (s[-1]!=';') continue; s[-1]=0; ++s; s=strchr(s,'['); if (!s) continue; n=0; #if 0 printf(" { \"%s\", \"",entity); #endif ++s; *cur=malloc(sizeof(**cur)); if (!*cur) nomem(); (*cur)->next=0; if (!((*cur)->entity=strdup(entity))) return 1; ul=0; do { while (isspace(*s)) ++s; m=scan_ulong(s,&l); if (!m) return 2; s+=n; n=fmt_utf8(tmp,l); if (ul+n>sizeof((*cur)->utf8)) return 3; memcpy((*cur)->utf8+ul,tmp,n); ul+=n; #if 0 { size_t i; for (i=0; iutf8[ul]=0; #if 0 puts("\" },"); #endif addword(&d,(*cur)->entity,(*cur)->utf8); } fclose(f); /* dump(d,0); */ marshal(d); { FILE* f=fopen("entities.h","w"); size_t i; fprintf(f,"struct {\n uint32_t tab[%u];\n char data[%zu];\n} entities = {\n {",marshaled[0],datasize); for (i=0; i