X-Git-Url: http://www.chiark.greenend.org.uk/ucgi/~yarrgweb/git?a=blobdiff_plain;f=pctb%2Focr.c;h=f6104452dc1c1854e6ccd19aac77882caae0aba9;hb=ac65228e40fa375c829b46607fb4941ff11376e9;hp=fe264e8110ce8ffce6cbb6be0282c004151dec31;hpb=cf8e9b2a56d12305cd61f8d4a9284d4319265215;p=ypp-sc-tools.web-live.git

diff --git a/pctb/ocr.c b/pctb/ocr.c
index fe264e8..f610445 100644
--- a/pctb/ocr.c
+++ b/pctb/ocr.c
@@ -42,6 +42,7 @@ typedef struct DatabaseNode {
 
 typedef struct {
   OcrReader *rd;
+  OcrCellType ct;
   int w;
   Pixcol *cols;
   int x;
@@ -52,30 +53,21 @@ typedef struct {
   int rx;
 } FindCharResults;
 
+enum { ct_Lower, ct_Upper, ct_Word, ct_Digit };
 static const char *context_names[]= {
-  "Lower",
-  "Upper",
-  "Digit"
+  "Lower", /* bit 0, value 001 */
+  "Upper", /* bit 1, value 002 */
+  "Word", /* bit 2, value 004 */
+  "Digit", /* bit 3, value 010 */
 };
 struct OcrCellTypeInfo {
   /* bitmaps of indices into context_names: */
   unsigned initial, nextword, midword;
   int space_spaces;
   const char *name;
+  int (*findchar_select)(const FindCharArgs *fca,
+                         const FindCharResults results[]);
 };
-const struct OcrCellTypeInfo ocr_celltype_number= {
-  4,4,4,
-  .space_spaces= 5,
-  .name= "number"
-};
-const struct OcrCellTypeInfo ocr_celltype_text= {
-  .initial=2, /* Uppercase */
-  .nextword=3, /* Either */
-  .midword=1, /* Lower only */
-  .space_spaces= 4,
-  .name= "text"
-};
-
 #define NCONTEXTS (sizeof(context_names)/sizeof(context_names[0]))
 
 
@@ -340,9 +332,12 @@ static DatabaseNode *findchar(const FindCharArgs *fca, int *match_rx) {
     nmatches++;
   }
   if (nmatches==1) {
-    debugf( " unambiguous");
+    debugf(" unique");
   } else {
-    match=-1;
+    debugf(" ambiguous");
+    match= !fca->ct->findchar_select ? -1 :
+      fca->ct->findchar_select(fca,results);
+    debugf(" resolved %s", match<0 ? "" : context_names[match]);
   }
   if (match<0) return 0;
 
@@ -351,6 +346,33 @@ static DatabaseNode *findchar(const FindCharArgs *fca, int *match_rx) {
   return results[ctxi].match;
 }
 
+static int findchar_select_text(const FindCharArgs *fca,
+                                const FindCharResults results[]) {
+  if (fca->ctxmap != 017) return -1;
+
+  dbassert(! results[ct_Digit].match );
+  if (results[ct_Word].match) return ct_Word;
+  if (results[ct_Lower].rx > results[ct_Upper].rx) return ct_Lower;
+  if (results[ct_Upper].rx > results[ct_Lower].rx) return ct_Upper;
+  return -1;
+}
+
+const struct OcrCellTypeInfo ocr_celltype_number= {
+  010,010,010,
+  .space_spaces= 5,
+  .name= "number",
+  .findchar_select= 0
+};
+const struct OcrCellTypeInfo ocr_celltype_text= {
+  .initial= 012, /* Digit|Upper */
+  .nextword= 017, /* Digit|Upper|Lower|Word */
+  .midword= 014, /* Digit|Lower */
+  .space_spaces= 4,
+  .name= "text",
+  .findchar_select= findchar_select_text
+};
+
+
 const char *ocr_celltype_name(OcrCellType ct) { return ct->name; }
 
 OcrResultGlyph *ocr(OcrReader *rd, OcrCellType ct, int w, Pixcol cols[]) {
@@ -359,6 +381,7 @@ OcrResultGlyph *ocr(OcrReader *rd, OcrCellType ct, int w, Pixcol cols[]) {
   FindCharArgs fca;
 
   fca.rd= rd;
+  fca.ct= ct;
   fca.w= w;
   fca.cols= cols;
   fca.x= -1;