X-Git-Url: http://www.chiark.greenend.org.uk/ucgi/~yarrgweb/git?p=ypp-sc-tools.db-test.git;a=blobdiff_plain;f=pctb%2Focr.c;h=ecfac976b2d025176bd4ee82675c31a108c3ba18;hp=2c3a16772ec8ac0a90f5a9be420c9db188c1e6da;hb=3a820e9871e6c4607b34939cc4d55c6925a7de29;hpb=3063e05a93fb97a5eca7f26c38da94fa4000406e;ds=sidebyside diff --git a/pctb/ocr.c b/pctb/ocr.c index 2c3a167..ecfac97 100644 --- a/pctb/ocr.c +++ b/pctb/ocr.c @@ -277,10 +277,44 @@ static void add_result(OcrReader *rd, const char *s, int l, int r, const char *ocr_celltype_name(OcrCellType ct) { return ct->name; } +static DatabaseNode *findchar(OcrReader *rd, int w, Pixcol cols[], + int x, int ctxi, int *matchx_r) { + DatabaseNode *current= &rd->contexts[ctxi]; + DatabaseNode *bestmatch= 0; + int i; + + for (;;) { + debug_flush(); + debugf(" | x=%d",x); + if (x>w) break; + Pixcol cv= cols[x]; + debugf(" cv=%"PSPIXCOL(PRIx),cv); + for (i=0; inlinks; i++) + if (current->links[i].col == cv) + goto found; + /* not found */ + debugf(" ?"); + break; + + found: + current= current->links[i].then; + if (current->match) { + debugf(" \"%s\"%s",current->str,current->endsword?"_":""); + bestmatch= current; + *matchx_r= x; + } else { + debugf(" ..."); + } + + x++; + } + return bestmatch; +} + OcrResultGlyph *ocr(OcrReader *rd, OcrCellType ct, int w, Pixcol cols[]) { int nspaces; unsigned ctxmap; - int ctxi, i, x; + int ctxi, x; restart: @@ -314,63 +348,30 @@ OcrResultGlyph *ocr(OcrReader *rd, OcrCellType ct, int w, Pixcol cols[]) { add_result(rd," ",x-nspaces,x+1,0); nspaces=0; - /* find character */ - int lx=x; - DatabaseNode *uniquematch= 0; int uniquematch_rx=-1; - debugf("OCR lx=%d ctxmap=%x ",lx,ctxmap); + debugf("OCR lx=%d ctxmap=%x ",x,ctxmap); for (ctxi=0; ctxicontexts[ctxi];; - DatabaseNode *bestmatch= 0; - int bestmatch_rx=-1; - - x= lx; if (!(ctxmap & (1u << ctxi))) continue; debugf(" || %s",context_names[ctxi]); - for (;;) { - debug_flush(); - debugf(" | x=%d",x); - if (x>w) break; - Pixcol cv= cols[x]; - debugf(" cv=%"PSPIXCOL(PRIx),cv); - for (i=0; inlinks; i++) - if (current->links[i].col == cv) - goto found; - /* not found */ - debugf(" ?"); - break; - - found: - current= current->links[i].then; - if (current->match) { - debugf(" \"%s\"%s",current->str,current->endsword?"_":""); - bestmatch= current; - bestmatch_rx= x; - } else { - debugf(" ..."); - } + DatabaseNode *match= + findchar(rd,w,cols, x,ctxi, &uniquematch_rx); + if (!match) continue; - x++; - } - - if (bestmatch) { - if (uniquematch && strcmp(bestmatch->str, uniquematch->str)) { - debugf( " ambiguous"); - uniquematch= 0; - break; - } - uniquematch= bestmatch; - uniquematch_rx= bestmatch_rx; + if (uniquematch && strcmp(match->str, uniquematch->str)) { + debugf( " ambiguous"); + uniquematch= 0; + } else { + uniquematch= match; } } if (uniquematch) { debugf(" || YES"); - add_result(rd, uniquematch->str, lx, uniquematch_rx, ctxmap); + add_result(rd, uniquematch->str, x, uniquematch_rx, ctxmap); x= uniquematch_rx+1; if (uniquematch->match) ctxmap= ct->midword; else debugf(" (empty)"); @@ -383,12 +384,13 @@ OcrResultGlyph *ocr(OcrReader *rd, OcrCellType ct, int w, Pixcol cols[]) { } else { int rx; debugf(" || UNKNOWN"); - for (rx=lx; rxnresults);