X-Git-Url: http://www.chiark.greenend.org.uk/ucgi/~yarrgweb/git?a=blobdiff_plain;f=pctb%2Focr.c;fp=pctb%2Focr.c;h=0000000000000000000000000000000000000000;hb=c68fb80a6bbf7acbcac4b2cb2143f5fea745cd2b;hp=92b09031c5c53ccde0029c0b7a26343db68645c6;hpb=b9cce976550d000f15e5a8f2b690740bdae1e468;p=ypp-sc-tools.db-live.git diff --git a/pctb/ocr.c b/pctb/ocr.c deleted file mode 100644 index 92b0903..0000000 --- a/pctb/ocr.c +++ /dev/null @@ -1,730 +0,0 @@ -/* - * Core OCR algorithm (first exact bitmap match) - */ -/* - * This is part of ypp-sc-tools, a set of third-party tools for assisting - * players of Yohoho Puzzle Pirates. - * - * Copyright (C) 2009 Ian Jackson - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - * - * Yohoho and Puzzle Pirates are probably trademarks of Three Rings and - * are used without permission. This program is not endorsed or - * sponsored by Three Rings. 
- */ - -#include "ocr.h" -#include "convert.h" -/* One edge of a match tree: taken when the next pixel column compares equal to col, leading to node then. */ -typedef struct { - Pixcol col; - struct DatabaseNode *then; -} DatabaseLink; -/* Match-tree node. str is the stored text (null when none); links has nlinks entries in use out of alinks allocated; the bit flags record whether the node has been defined, whether it is a match, whether that match ends a word, and whether it was loaded from the local database. */ -typedef struct DatabaseNode { - char *str; - int nlinks, alinks; - unsigned match:1, defined:1, endsword:1, local:1; - DatabaseLink *links; -} DatabaseNode; -/* Inputs to one findchar() call: the reader, the cell type, the cols array and its width w, the starting column x, and the bitmap of contexts to try. */ -typedef struct { - OcrReader *rd; - OcrCellType ct; - int w; - const Pixcol *cols; - int x; - unsigned ctxmap; -} FindCharArgs; -/* Per-context outcome: the matched node (null if none) and rx, the column index recorded for that match. */typedef struct { - DatabaseNode *match; - int rx; -} FindCharResults; -/* X-macro listing the matching contexts. It is expanded below to produce the ct_* indices, the ctf_* bit flags (1 shifted left by ct_*), and the context_names strings. */ -#define FOR_EACH_CONTEXT(EACH) \ - EACH(Word) \ - EACH(Lower) \ - EACH(Upper) \ - EACH(Digit) - -#define FEC_ENUM(Context) ct_##Context, -#define FEC_BIT(Context) ctf_##Context = 1 << ct_##Context, -enum { - FOR_EACH_CONTEXT(FEC_ENUM) - FOR_EACH_CONTEXT(FEC_BIT) -}; - -#define FEC_STRINGS(Context) #Context, -static const char *context_names[]= { FOR_EACH_CONTEXT(FEC_STRINGS) }; -/* Per-cell-type behaviour: which contexts apply at the start of a cell, after a space, and mid-word; how many blank columns count as a space; a name; and an optional callback used by findchar() when the match is not unique. */ -struct OcrCellTypeInfo { - /* bitmaps of indices into context_names: */ - unsigned initial, nextword, midword; - int space_spaces; - const char *name; - int (*findchar_select)(const FindCharArgs *fca, - const FindCharResults results[]); -}; - -#define NCONTEXTS (sizeof(context_names)/sizeof(context_names[0])) -/* Reader state for one glyph height h: one match-tree root per context, the accumulated result string (lresult bytes used, aresult allocated) and the per-glyph results array (nresults used, aresults allocated). */ -struct OcrReader { - int h; - DatabaseNode contexts[NCONTEXTS]; - char *result; - int lresult, aresult; - OcrResultGlyph *results; - int aresults, nresults; -}; - -DEBUG_DEFINE_DEBUGF(ocr) - -#define FGETSLINE (dbfile_getsline(lbuf,sizeof(lbuf),__FILE__,__LINE__)) -/* Recursively resets a match tree: frees each node string and clears its defined, match and endsword flags. Nothing here frees the nodes or their link arrays. NOTE(review): the loop condition below looks truncated in this copy (presumably a comparison of i against n->nlinks). */ -static void cleardb_node(DatabaseNode *n) { - int i; - free(n->str); n->str=0; - n->defined=n->match=n->endsword= 0; - for (i=0; inlinks; i++) - cleardb_node(n->links[i].then); -} - -static void readdb1(OcrReader *rd, const char *which, int local); -/* Loads the master database and then the local one into rd. NOTE(review): the loop below is damaged in this copy; it presumably resets each context tree first -- confirm against the real file. */ -static void readdb(OcrReader *rd) { - int ctxi; - - for (ctxi=0; ctxicontexts[ctxi]); - - readdb1(rd, "master", 0); - readdb1(rd, "local", 1); -} -/* Loads one character database file, named from get_vardir(), which and rd->h, into the context trees of rd. If the file cannot be opened nothing is loaded. The local argument is recorded on every node this call defines, and when nonzero also sets ff_charset_havelocal in o_flags. */ -static void readdb1(OcrReader *rd, const char *which, int local) { - int nchrs; - DatabaseNode *current, *additional; - char chrs[100]; - 
Pixcol cv; - int j,ctxi; - int h, endsword; - char lbuf[100]; - - char *dbfname= masprintf("%s/_%s-char%d.txt", - get_vardir(), which, rd->h); - - if (!dbfile_open(dbfname)) - goto x; - - if (local) - o_flags |= ff_charset_havelocal; -/* The first line must be exactly the expected format header, and the height stored in the file must equal rd->h. */ - FGETSLINE; - dbassert(!strcmp(lbuf,"# ypp-sc-tools pctb font v3 depth=" STRING(AADEPTH))); - - dbassert( dbfile_scanf("%d", &h) == 1); - dbassert(h==rd->h); - - for (;;) { - FGETSLINE; - if (!lbuf[0] || lbuf[0]=='#') continue; - if (!strcmp(lbuf,".")) break; -/* NOTE(review): text is missing from this copy between the start of the next loop and the trailing-space test (the context lookup and the reading of chrs), so that part cannot be documented here. What remains shows that a trailing space in chrs is removed and sets endsword. */ - for (ctxi=0; ctxi0 && chrs[nchrs-1]==' ') { - endsword= 1; - nchrs--; - } -/* Walk down from the context root, one database line per step; an empty line ends the entry and must not occur before at least one step has been taken. */ - current= &rd->contexts[ctxi]; - for (;;) { - FGETSLINE; - if (!lbuf[0]) { dbassert(current != &rd->contexts[ctxi]); break; } - dbassert( strlen(lbuf) == rd->h ); - FILLZERO(cv); - int y; - for (y=0; ynlinks; j++) - if (!pixcol_cmp(&cv, ¤t->links[j].col)) { - current= current->links[j].then; - goto found_link; - } -/* No existing link carries this column: append a fresh, empty node, growing the link array first when it is full. */ - additional= mmalloc(sizeof(*additional)); - additional->str= 0; - additional->defined= 0; - additional->match= 0; - additional->endsword= 0; - additional->nlinks= additional->alinks= 0; - additional->links= 0; - if (current->nlinks==current->alinks) { - current->alinks++; - current->alinks<<=1; - current->links= mrealloc(current->links, - sizeof(*current->links) * current->alinks); - } - current->links[current->nlinks].col= cv; - current->links[current->nlinks].then= additional; - current->nlinks++; - current= additional; - - found_link:; - } -/* Only the first definition of a node is kept; a later entry that reaches an already defined node is ignored. */ - if (!current->defined) { - free(current->str); - current->str= 0; - current->defined= 1; - current->match= 0; - current->local= local; -/* With no characters the node stays defined but non-matching, and str remains null. */ - if (nchrs) { - current->str= mmalloc(nchrs+1); - memcpy(current->str, chrs, nchrs); - current->str[nchrs]= 0; - current->match= 1; - current->endsword= endsword; - } - } - } - x: - dbfile_close(); - free(dbfname); -} -/* One compiled rejection pattern together with the file name and line number it was read from; entries form a singly linked list. */ -typedef struct Rejection Rejection; -struct Rejection { - struct Rejection *next; - const char *fname; - int lno; - pcre *re; -}; -/* Head of that list; load_rejections() pushes each new entry at the front. */ -Rejection *rejections; -/* Reads the reject file for which (named from get_vardir() and which) and compiles each accepted line as a PCRE pattern onto the rejections list. A file that cannot be opened is silently skipped; a pattern that fails to compile is reported through dbfile_assertfail(). */ -static void 
load_rejections(const char *which) { - char lbuf[1000]; - char *fname= masprintf("%s/_%s-reject.txt", get_vardir(), which); - int c, lno=0; - Rejection *rej; - - if (!dbfile_open(fname)) { free(fname); return; } - - while ((c= fgetc(dbfile))!=EOF) { - ungetc(c,dbfile); - lno++; - dbfile_getsline(lbuf,sizeof(lbuf),fname,lno); -/* NOTE(review): the isspace() call encloses the whole logical-or expression, so it is applied to a 0/1 value rather than to lbuf[0]. As written only empty lines are skipped here, and lines starting with whitespace or a hash sign go on to be compiled as patterns. The closing parenthesis was presumably meant to follow the first lbuf[0] -- confirm. */ - if (!lbuf[0] || isspace(lbuf[0] || lbuf[0]=='#')) - continue; -/* fname is stored in each entry and is not freed on this path. */ - rej= mmalloc(sizeof(*rej)); - rej->next= rejections; - rej->fname= fname; - rej->lno= lno; - - const char *err; - int erroffset; - rej->re= pcre_compile(lbuf, PCRE_NO_AUTO_CAPTURE|PCRE_UTF8, - &err, &erroffset, 0); - if (!rej->re) { - char *what= masprintf("invalid regexp at offset %d: %s\n", - erroffset, err); - dbfile_assertfail(fname, lno, what); - } - debugf("OCR LOADED REJECTION %s:%d `%s' %p\n", fname,lno, lbuf, rej->re); - - rejections= rej; - } - sysassert(feof(dbfile)); - dbfile_close(); -} -/* Returns 1 if rd->result matches any loaded rejection pattern (after reporting it on stderr), otherwise 0. On the first call only, the reject data is fetched and the master and local pattern files are loaded. */ -static int should_reject(OcrReader *rd) { - static int rejections_loaded; - int ovector[30]; - - if (!rejections_loaded) { - fetch_with_rsync("reject"); - load_rejections("master"); - load_rejections("local"); - rejections_loaded=1; - } - - debugf("[OCR REJECTION `%s'%d", rd->result, rd->lresult); - Rejection *rej; - for (rej=rejections; rej; rej=rej->next) { - debugf(" (%p)", rej); - - int res= pcre_exec(rej->re, 0, rd->result,rd->lresult, 0, - 0, ovector, ARRAYSIZE(ovector)); - if (res==PCRE_ERROR_NOMATCH) continue; - sysassert(res>0); - - debugf(" MATCH]\n"); - fprintf(stderr,"Rejecting OCR result `%s' (due to %s:%d)\n", - rd->result, rej->fname,rej->lno); - return 1; - } - debugf(" OK]"); - return 0; -} -/* Starts writing ctxmap to resolver with an opening brace. NOTE(review): the remainder of this function and the head of the following one are missing from this copy, so neither can be documented further. */ -static void cu_pr_ctxmap(FILE *resolver, unsigned ctxmap) { - fprintf(resolver,"{"); - const char *spc=""; - int ctxi; - for (ctxi=0; ctxiresults; inresults; i++, s++) { - if (!strcmp(s->s," ")) continue; - fprintf(resolver," %d %d ",s->l,s->r); - cu_pr_ctxmap(resolver, 1u << s->match); - fprintf(resolver," "); - cu_pr_ctxmap(resolver, s->ctxmap); - fprintf(resolver," "); 
- for (p=s->s; (c= *p); p++) { - if (c=='\\') fprintf(resolver,"\\%c",c); - else if (c>=33 && c<=126) fputc(c,resolver); - else fprintf(resolver,"\\x%02x",(unsigned char)c); - } - } - fputc('\n',resolver); -/* Image header written to the resolver: the P2 tag followed by w, rd->h and AAMAXVAL. NOTE(review): the pixel loop that follows and the head of the next function are damaged in this copy. */ - fprintf(resolver, - "P2\n%d %d %d\n", w, rd->h, AAMAXVAL); - for (y=0; yh; y++) { - for (x=0; xnresults >= rd->aresults) { - rd->aresults++; rd->aresults<<=1; - rd->results= mrealloc(rd->results, sizeof(*rd->results)*rd->aresults); - } - rd->results[rd->nresults].s= s; - rd->results[rd->nresults].l= l; - rd->results[rd->nresults].r= r; - rd->results[rd->nresults].match= match; - rd->results[rd->nresults].ctxmap= ctxmap; - rd->nresults++; - - if (!s) return; /* just the sentinel for the caller */ -/* Append s to rd->result, growing the buffer when needed and keeping it NUL-terminated. */ - int sl= strlen(s); - int newlresult= rd->lresult + sl; - if (newlresult >= rd->aresult) { - rd->aresult= (newlresult << 1) + 1; - rd->result= mrealloc(rd->result, rd->aresult); - } - memcpy(rd->result + rd->lresult, s, sl); - rd->lresult= newlresult; - rd->result[rd->lresult]= 0; -} - -/* Follows links from start one pixel column at a time, beginning at fca->x, until no link matches the column or x passes fca->w. Returns the last matching node reached (null if none) and, for that node, stores its column index in *matchx_r; *matchx_r is left untouched when nothing matched. */ -static DatabaseNode *findchar_1ctx(const FindCharArgs *fca, - DatabaseNode *start, int *matchx_r) { - DatabaseNode *current= start; - DatabaseNode *bestmatch= 0; - int i; - int x= fca->x; - - for (;;) { - if (DEBUGP(ocr)) debug_flush(); - debugf(" | x=%d",x); -/* NOTE(review): with a strict greater-than test the loop still reads fca->cols[x] when x equals fca->w; confirm that the array supplied by the caller has an entry at index w, otherwise this is a read one past the end. */ if (x > fca->w) break; - Pixcol cv= fca->cols[x]; - debugf(" cv="PIXCOL_PRFMT, PIXCOL_PRVAL(cv)); - for (i=0; inlinks; i++) - if (!pixcol_cmp(&cv, ¤t->links[i].col)) - goto found; - /* not found */ - debugf(" ?"); - break; - - found: - current= current->links[i].then; - if (current->match) { - debugf(" \"%s\"%s",current->str,current->endsword?"_":""); - bestmatch= current; - *matchx_r= x; - } else { - debugf(" ..."); - } - - x++; - } - return bestmatch; -} -/* Runs findchar_1ctx() for the contexts selected by fca->ctxmap. A single matching context is used directly; otherwise the findchar_select callback of the cell type chooses, or nothing is selected when there is no callback. On success stores the match column in *match_rx and, if match_rctxi is non-null, the context index there; returns the matched node, or null. NOTE(review): the callback is also invoked when no context matched at all, in which case the rx fields it may compare were never written. */ -static DatabaseNode *findchar(const FindCharArgs *fca, - int *match_rx, int *match_rctxi) { - FindCharResults results[NCONTEXTS]; - int ctxi, match=-1, nmatches=0; - - debugf("OCR lx=%d ct_state=%x ", fca->x, fca->ctxmap); - for (ctxi=0; ctxictxmap & (1u << ctxi))) 
continue; - debugf(" || %s",context_names[ctxi]); - - results[ctxi].match= findchar_1ctx(fca, &fca->rd->contexts[ctxi], - &results[ctxi].rx); - if (!results[ctxi].match) continue; -/* Remember the most recent context that matched and count the matches. */ - match= ctxi; - nmatches++; - } -/* Exactly one match is taken as it is; any other count goes to the selector, or yields -1 when the cell type has none. */ if (nmatches==1) { - debugf(" unique"); - } else { - debugf(" ambiguous"); - match= !fca->ct->findchar_select ? -1 : - fca->ct->findchar_select(fca,results); - debugf(" resolved %s", match<0 ? "" : context_names[match]); - } - if (match<0) - return 0; - - *match_rx= results[match].rx; - if (match_rctxi) *match_rctxi= match; - return results[match].match; -} -/* Selector used by the text cell type: picks a context according to which context bitmap was in force, as described at each case below. Returns a ct_* index, or -1 when it cannot decide. */ -static int findchar_select_text(const FindCharArgs *fca, - const FindCharResults results[]) { - - dbassert(! results[ct_Digit].match ); /* digits are supposedly unambiguous */ - - switch (fca->ctxmap) { - -#define RETURN_IF_LONGER(this,that) do{ \ - if (results[ct_##this].rx > results[ct_##that].rx) \ - return ct_##this; \ - }while(0) - - case ctf_Digit | ctf_Upper | ctf_Lower | ctf_Word: - /* Start of word. Prefer Word match; failing that, take the longest */ - if (results[ct_Word].match) return ct_Word; - RETURN_IF_LONGER(Lower,Upper); - RETURN_IF_LONGER(Upper,Lower); - break; - - case ctf_Digit | ctf_Upper | ctf_Lower: - /* Mid-word. Prefer longer match; failing that, match lower. 
*/ - RETURN_IF_LONGER(Upper,Lower); - return ct_Lower; - } - - /* oh well */ - return -1; -} -/* Cell type for numeric cells: the three leading positional initialisers fill initial, nextword and midword, so only the Digit context is ever tried; there is no selector callback. */ -const struct OcrCellTypeInfo ocr_celltype_number= { - ctf_Digit, ctf_Digit, ctf_Digit, - .space_spaces= 5, - .name= "number", - .findchar_select= 0 -}; -/* Cell type for text cells: a cell starts with Digit or Upper, a new word may use any context, and mid-word excludes Word; ambiguity is resolved by findchar_select_text(). */const struct OcrCellTypeInfo ocr_celltype_text= { - .initial= ctf_Digit | ctf_Upper, - .nextword= ctf_Digit | ctf_Upper | ctf_Lower | ctf_Word, - .midword= ctf_Digit | ctf_Upper | ctf_Lower, - .space_spaces= 4, - .name= "text", - .findchar_select= findchar_select_text -}; - -/* Returns the name string stored in the cell type. */ -const char *ocr_celltype_name(OcrCellType ct) { return ct->name; } -/* Recognises the w pixel columns in cols for cell type ct and returns rd->results. Whenever should_reject() rejects the text accumulated so far, callout_unknown() is invoked and recognition restarts from the beginning. NOTE(review): parts of the main loop, including its unknown-glyph branch, are damaged in this copy. */ -OcrResultGlyph *ocr(OcrReader *rd, OcrCellType ct, int w, - const Pixcol cols[]) { - int nspaces; - int x; - - FindCharArgs fca; - fca.rd= rd; - fca.ct= ct; - fca.w= w; - fca.cols= cols; - fca.x= -1; -/* Every (re)start clears the accumulated results and returns to the initial context set. */ - restart: - - nspaces=- w; - fca.ctxmap= ct->initial; - rd->nresults=0; - rd->lresult=0; - rd->result[0]=0; - debugf("OCR h=%d w=%d",rd->h,w); - for (x=0; x=w) - break; - - if (!pixcol_nonzero(&cols[x])) { - nspaces++; - x++; - if (nspaces == ct->space_spaces) { - debugf("OCR x=%x nspaces=%d space\n",x,nspaces); - fca.ctxmap= ct->nextword; - } - continue; - } - - /* something here, so we need to add the spaces */ - if (nspaces >= ct->space_spaces) - add_result(rd," ",x-nspaces,x+1,-1,0); - nspaces=0; - - fca.x= x; - - int match_rx=-1; - int match_ctxi=-1; - DatabaseNode *match= findchar(&fca, &match_rx, &match_ctxi); - - if (match) { - debugf(" || YES"); - add_result(rd, match->str, x, match_rx, match_ctxi, fca.ctxmap); - if (should_reject(rd)) { - callout_unknown(rd, w,cols, match_rx+1,match_rx, 0); - goto restart; - } - x= match_rx+1; - if (match->match) fca.ctxmap= ct->midword; - else debugf(" (empty)"); - if (match->endsword) { - nspaces= ct->space_spaces; - debugf("_"); - fca.ctxmap= ct->nextword; - } - debugf("\n"); - } else { - int rx; - debugf(" || UNKNOWN"); - for (rx=x; rxnresults); - debug_flush(); - return rd->results; -} -/* Creates a reader for glyph height h, which must not exceed OCR_MAX_H. When ff_dict_fetch is set in o_flags the character data for that height is fetched first; the databases are then loaded with readdb(). */ -OcrReader *ocr_init(int h) { - OcrReader 
*rd; - - assert(h <= OCR_MAX_H); - - if (o_flags & ff_dict_fetch) { - char *fetchfile= masprintf("char%d",h); - fetch_with_rsync(fetchfile); - free(fetchfile); - } -/* Start from an all-zero reader, then give the result buffer an initial 10 bytes. */ - rd= mmalloc(sizeof(*rd)); - memset(rd,0,sizeof(*rd)); - - rd->h= h; - - rd->aresult= 10; - rd->result= mmalloc(rd->aresult); - - readdb(rd); - return rd; -} - -/*---------- character set dump ----------*/ -/* Counts, through *count, the defined nodes at and below t; when store_ary is non-null each such node is also stored at its running index. */ -static void show_recurse(const DatabaseNode *t, int *count, - const DatabaseNode **store_ary) { - if (t->defined) { - if (store_ary) store_ary[*count]= t; - (*count)++; - } - int l; - for (l=0; lnlinks; l++) - show_recurse(t->links[l].then, count,store_ary); -} -/* qsort comparator over pointers to DatabaseNode: orders by str, then by the match, endsword and local flags. NOTE(review): str is passed to strcmp() unchecked, while readdb1() leaves str null for defined nodes that have no characters -- confirm such nodes cannot reach this comparison. */ -static int show_char_compar(const void *av, const void *bv) { - const DatabaseNode *const *ap= av; const DatabaseNode *a= *ap; - const DatabaseNode *const *bp= bv; const DatabaseNode *b= *bp; - return strcmp(a->str, b->str) ?: - ((int)a->match - (int)b->match) ?: - ((int)a->endsword - (int)b->endsword) ?: - ((int)a->local - (int)b->local) ?: - 0; -} -/* Scans the directory returned by get_vardir() for master and local character files, records in found[] which heights are present, and then prints the character sets. NOTE(review): the code between the directory scan and the printing loop is damaged in this copy. */ -void ocr_showcharsets(void) { - DIR *d; - struct dirent *de; - char found[OCR_MAX_H]; - pcre *fnpat; - int matchvec[10]; - char hbuf[10]; - const char *pcre_err; - int pcre_erroffset; - - memset(found,0,sizeof(found)); - - fnpat= pcre_compile("_(?:master|local)\\-char([1-9]\\d{0,2})\\.txt$", - PCRE_ANCHORED|PCRE_DOLLAR_ENDONLY, - &pcre_err,&pcre_erroffset, 0); - debugf("pcre_compile %p %s\n",fnpat,pcre_err); - assert(fnpat); - - sysassert( d= opendir(get_vardir()) ); - for (;;) { - errno=0; de= readdir(d); if (!de) break; - - int rer= pcre_exec(fnpat,0, de->d_name,strlen(de->d_name), 0,0, - matchvec,ARRAYSIZE(matchvec)); - debugf("pcre_exec `%s' => %d\n", de->d_name,rer); - - if (rer==PCRE_ERROR_NOMATCH || rer==PCRE_ERROR_BADUTF8) continue; - assert(rer==2); - - rer= pcre_copy_substring(de->d_name,matchvec,rer, 1, hbuf,sizeof(hbuf)); - debugf("pcre_copy_substring => %d\n", rer); - assert(rer>0); -/* Heights too large to index found[] are ignored. */ - int h= atoi(hbuf); - if (h >= ARRAYSIZE(found)) continue; - - found[h]= 1; - } - - int h; 
- for (h=0; hcontexts[ctxi], &nchars, 0); - const DatabaseNode **chars= mmalloc(sizeof(*chars) * nchars); - int chari= 0; - show_recurse(&rd->contexts[ctxi], &chari, chars); - assert(chari==nchars); - qsort(chars, nchars, sizeof(*chars), show_char_compar); -/* Print the sorted entries in two passes: master entries (local equal to 0) first, then local ones. */ - int local; - for (local=0; local<2; local++) { - printf("%2d %-6s %-6s ", h, context_names[ctxi], - local?"local":"master"); - for (chari=0; chari"; - - if (t->local != local) continue; - - if (!t->match) - printf(" [nomatch]"); - else if (!t->endsword && strspn(t->str, accept) == strlen(t->str)) - printf(" %s",t->str); - else { - printf(" \""); - char *p= t->str; - int c; - while ((c=*p++)) { - if (c=='"' || c=='\\') printf("\\%c",c); - else if (c>=' ' && c<=126) putchar(c); - else printf("\\x%02x", (unsigned char)c); - } - if (t->endsword) putchar(' '); - putchar('"'); - } - } - putchar('\n'); - } - free(chars); - } - } -}