#include <inttypes.h>
#include <string.h>
#include "entities.h"

#include "fmt.h"
#include "textcode.h"
#include "haveinline.h"
#include "scan.h"
#include "case.h"
#include "str.h"

/*
 * Resolve an entity name against the table in `entities`.
 *
 * ofs  index into entities.tab at which to start scanning; entities.tab[0]
 *      is used as the upper bound for valid indices.
 * t    points at the next unmatched character of the name.
 *
 * Each table entry carries a key character in its low 8 bits and a value in
 * the remaining upper bits.  Entries are scanned sequentially from ofs until
 * one whose key equals *t is found (an entry with key 0 also matches ';').
 * If the matching key is 0 or *t is ';', the upper bits are used as an
 * offset into entities.data and that pointer is returned.  Otherwise the
 * upper bits are taken as the table index to continue from, with t advanced
 * by one character.  An entry with key 0 that does not match ends the scan.
 *
 * Returns NULL when no entry matches or ofs is beyond entities.tab[0].
 * NOTE(review): the exact table layout is defined by entities.h, which is
 * not visible here; the description above is derived from this code only.
 */
static const char* lookup(size_t ofs,const char* t) {
  while (ofs<=entities.tab[0]) {
    const unsigned char want=(unsigned char)*t;
    int descended=0;

    for (; ofs<entities.tab[0]; ++ofs) {
      const unsigned char have=entities.tab[ofs]&0xff;

      if (have==want || (have==0 && want==';')) {
        /* End of the name reached: the upper bits locate the replacement. */
        if (have==0 || want==';')
          return entities.data+(entities.tab[ofs]>>8);
        /* Interior match: follow the link and look at the next character. */
        ofs=entities.tab[ofs]>>8;
        ++t;
        descended=1;
        break;
      }
      /* A zero key that did not match terminates this run of entries. */
      if (have==0)
        return NULL;
    }

    if (!descended)
      return NULL;
  }
  return NULL;
}

/*
 * Decode a NUL-terminated HTML text fragment into plain UTF-8.
 *
 * src      NUL-terminated input.
 * dest     output buffer; no size is passed in, so the caller must provide
 *          a buffer large enough for the decoded text.  No terminating NUL
 *          is written.
 * destlen  receives the number of bytes stored in dest.
 *
 * Conversions performed:
 *   - "&#NNN;" and "&#xHHH;" (x in either case) are replaced by the UTF-8
 *     encoding of the number, as produced by fmt_utf8().
 *   - "&name;" is replaced by the string lookup() returns for the name.
 *   - "<br>" becomes one newline, "<p>" becomes two (matched with
 *     case_starts()).
 *   - Everything else, including a '&' or '<' that does not begin one of
 *     the forms above, is copied through unchanged.
 *
 * Returns the number of input bytes consumed, i.e. the index of the
 * terminating NUL in src.
 */
size_t scan_html(const char *src,char *dest,size_t *destlen) {
  const unsigned char* s=(const unsigned char*) src;
  size_t written=0,i;
  for (i=0; s[i]; ++i) {
    if (s[i]=='&') {
      const char* utf8;
      if (s[i+1]=='#') {
        unsigned long l=0;
        size_t j;
        /* j ends up as the distance from '&' to the expected ';', or 0 if
         * no digits were parsed. */
        if ((s[i+2]&~32)=='X') {
          j=scan_xlong(src+i+3,&l);
          if (j) j+=3;          /* skip "&#x" */
        } else {
          j=scan_ulong(src+i+2,&l);
          if (j) j+=2;          /* skip "&#" */
        }
        if (j && s[i+j]==';') {
          i+=j;                 /* now on ';', the loop's ++i steps past it */
          written+=fmt_utf8(dest+written,l);
        } else {
          /* Not a well-formed numeric reference: emit '&' literally and
           * let the following characters be copied as ordinary text. */
          dest[written++]='&';
        }
        continue;
      }
      utf8=lookup(1,src+i+1);
      if (utf8) {
        size_t l=strlen(utf8);
        size_t end=i+1;
        memcpy(dest+written,utf8,l);
        written+=l;
        /* lookup() stops at the first ';' or at the end of the string, so
         * that is where the reference ends. */
        while (s[end] && s[end]!=';') ++end;
        /* Land on ';' if present; otherwise stay one before the NUL so the
         * loop's ++i stops exactly on it instead of running past it. */
        i=s[end] ? end : end-1;
        continue;
      }
      dest[written]='&';
    } else if (s[i]=='<') {
      if (case_starts((const char*)s+i+1,"br>")) {
        dest[written]='\n';
        i+=3;                   /* "<br>" is 4 bytes: 3 here + loop's ++i */
      } else if (case_starts((const char*)s+i+1,"p>")) {
        dest[written]='\n'; ++written;
        dest[written]='\n';
        i+=2;                   /* "<p>" is 3 bytes: 2 here + loop's ++i */
      } else {
        /* Any other tag is not interpreted; copy the '<' through. */
        dest[written]='<';
      }
    } else {
      dest[written]=s[i];
    }
    ++written;
  }
  *destlen=written;
  return i;
}