From 8787ee59f6840de63bac432b516a30d0dfe22c84 Mon Sep 17 00:00:00 2001 From: Ian Jackson Date: Fri, 5 Jun 2009 23:25:40 +0100 Subject: [PATCH] new context arrangements --- pctb/{database => charset-15.txt} | 29 +-- pctb/charset.charsource | 113 ------------ pctb/convert.c | 14 +- pctb/ocr.c | 290 ++++++++++++++++++++---------- pctb/ocr.h | 16 +- pctb/show-thing.tcl | 149 +++++++++------ 6 files changed, 328 insertions(+), 283 deletions(-) rename pctb/{database => charset-15.txt} (56%) delete mode 100644 pctb/charset.charsource diff --git a/pctb/database b/pctb/charset-15.txt similarity index 56% rename from pctb/database rename to pctb/charset-15.txt index 847b618..873cf51 100644 --- a/pctb/database +++ b/pctb/charset-15.txt @@ -1,36 +1,45 @@ -0 1 61 -6 +# ypp-sc-tools pctb font v1 +15 + +Lower +a 640 920 920 920 fc0 800 -0 1 67 -5 + +Lower +g 27c0 2820 2820 2420 1fe0 -0 1 72 -4 + +Lower +r fe0 40 20 20 -0 1 75 -5 + +Lower +u 7e0 800 800 400 fe0 -1 1 53 -6 + +Upper +S 830 848 888 888 708 8 + +. diff --git a/pctb/charset.charsource b/pctb/charset.charsource deleted file mode 100644 index 59bddfc..0000000 --- a/pctb/charset.charsource +++ /dev/null @@ -1,113 +0,0 @@ -a -++++++ - - - - - - ooo -o o - o - oooo -o o -o o - ooooo - - - -++++++ - -b -++++++ - - - -o -o -o ooo -oo o -o o -o o -o o -oo o -o ooo - - - -++++++ - -e -+++++ - - - - - - ooo -o o -o o -ooooo -o -o - oooo - - - -+++++ - -m -+++++++++ - - - - - -o oo oo -oo oo o -o o o -o o o -o o o -o o o -o o o - - - -+++++++++ - -n -++++++ - - - - - -o oo -oo o -o o -o o -o o -o o -o o - - - -++++++ - -o -+++++ - - - - - - ooo -o o -o o -o o -o o -o o - ooo - - - -+++++ diff --git a/pctb/convert.c b/pctb/convert.c index 163f814..93c7137 100644 --- a/pctb/convert.c +++ b/pctb/convert.c @@ -35,7 +35,8 @@ static inline char get_p(Point p) { return get(p.x,p.y); } static Rect mainr = { START_MAIN,START_MAIN }; static int commbasey, comminty; static int colrightx[INTERESTING_COLUMNS]; - +static int text_h; +static OcrReader *rd; static const CanonColourInfo canoncolourinfos[]= { { 0x475A5E, '*' }, /* edge */ @@ -174,6 +175,8 @@ static void find_structure(void) { across.x++; } eassert(colno >= MIN_COLUMNS); + + text_h = comminty - 1; } static void find_commodity(int offset, Rect *rr) { @@ -239,12 +242,11 @@ static void ocr_rectangle(Rect r) { OcrResultGlyph *results, *res; int w= r.br.x - r.tl.x + 1; - int h= r.br.y - r.tl.y + 1; Pixcol cols[w+1]; int x,y; for (x=0; xs; res++) printf("%s",res->s); @@ -268,9 +270,9 @@ int main(void) { Rect thisr, entryr; int tryrect, colno; - ocr_init(); load_image_and_canonify(); find_structure(); + rd= ocr_init(text_h); for (tryrect= +height; tryrect >= -height; tryrect--) { find_commodity(tryrect, &thisr); @@ -285,3 +287,5 @@ int main(void) { } return 0; } + +const char *get_vardir(void) { return "."; } diff --git a/pctb/ocr.c b/pctb/ocr.c index be479f8..fd4fcf4 100644 --- a/pctb/ocr.c +++ b/pctb/ocr.c @@ -5,58 +5,115 @@ typedef struct { Pixcol col; - struct OCRDatabaseNode *then; -} OCRDatabaseLink; + struct DatabaseNode *then; +} DatabaseLink; #define MAXGLYPHCHRS 3 -typedef struct OCRDatabaseNode { +typedef struct DatabaseNode { char s[MAXGLYPHCHRS+1]; /* null-terminated; "" means no match here */ int nlinks, alinks; - OCRDatabaseLink *links; -} OCRDatabaseNode; + DatabaseLink *links; +} DatabaseNode; -#define N_OCR_CONTEXTS 2 +static const char *context_names[]= { + "Lower", + "Upper", +/* "Digit"*/ +}; -static OCRDatabaseNode ocr_contexts[N_OCR_CONTEXTS]; -static FILE *db; -static OcrResultGlyph *results; -static int aresults, nresults; +#define NCONTEXTS (sizeof(context_names)/sizeof(context_names[0])) + +struct OcrReader { + int h; + DatabaseNode contexts[NCONTEXTS]; + OcrResultGlyph *results; + int aresults, nresults; +}; static FILE *resolver; static pid_t resolver_pid; static int resolver_done; -static void ocr_readdb(void) { - int ctx,nchrs; - OCRDatabaseNode *current, *additional; +static void fgetsline(FILE *f, char *lbuf, size_t lbufsz) { + char *s= fgets(lbuf,lbufsz,f); + eassert(s); + int l= strlen(lbuf); + eassert(l>0); eassert(lbuf[--l]='\n'); + lbuf[l]= 0; +} +#define FGETSLINE(f,buf) (fgetsline(f,buf,sizeof(buf))) + +static void cleardb_node(DatabaseNode *n) { + int i; + n->s[0]= 0; + for (i=0; inlinks; i++) + cleardb_node(n->links[i].then); +} + +static void readdb(OcrReader *rd) { + int nchrs; + DatabaseNode *current, *additional; char chrs[MAXGLYPHCHRS+1]; Pixcol cv; - int r,i,j; + int r,j,ctxi; + int h; + char lbuf[100]; + FILE *db; + + char *dbfname=0; + asprintf(&dbfname,"%s/charset-%d.txt",get_vardir(),rd->h); + eassert(dbfname); + + db= fopen(dbfname,"r"); eassert(db); + free(dbfname); + + FGETSLINE(db,lbuf); + eassert(!strcmp(lbuf,"# ypp-sc-tools pctb font v1")); + + r= fscanf(db, "%d", &h); + eassert(r==1); + eassert(h==rd->h); - assert(!db); - db= fopen("database","r"); eassert(db); + for (ctxi=0; ctxicontexts[ctxi]); for (;;) { - r= fscanf(db, "%d %d", &ctx, &nchrs); - if (r==EOF) break; - eassert(r==2); - eassert(ctx>=0 && ctx0 && nchrs<=MAXGLYPHCHRS); - - for (i=0; i0 && c<=255); - chrs[i]= c; + FGETSLINE(db,lbuf); + if (!lbuf || lbuf[0]=='#') continue; + if (!strcmp(lbuf,".")) break; + + for (ctxi=0; ctxi0 && cr<=255); + c= cr; + } + chrs[nchrs++]= c; } chrs[nchrs]= 0; - int twidth; - r= fscanf(db, "%d", &twidth); eassert(r==1); - current= &ocr_contexts[ctx]; - for (i=0; icontexts[ctxi]; + for (;;) { + FGETSLINE(db,lbuf); + if (!lbuf[0]) { eassert(current != &rd->contexts[ctxi]); break; } + char *ep; + cv= strtoul(lbuf,&ep,16); eassert(!*ep); + eassert(!(cv & ~((1UL << rd->h)-1))); + for (j=0; jnlinks; j++) if (current->links[j].col == cv) { current= current->links[j].then; @@ -71,7 +128,7 @@ static void ocr_readdb(void) { current->alinks++; current->alinks<<=1; current->links= realloc(current->links, - sizeof(*current->links) * current->alinks); + sizeof(*current->links) * current->alinks); eassert(current->links); } current->links[current->nlinks].col= cv; @@ -86,12 +143,11 @@ static void ocr_readdb(void) { strcpy(current->s, chrs); } eassert(!ferror(db)); - eassert(feof(db)); + eassert(!fclose(db)); } -static void callout_unknown(int w, int h, Pixcol cols[], - int unk_l, int unk_r, int unk_ctx, - const OcrResultGlyph *sofar, int nsofar) { +static void callout_unknown(OcrReader *rd, int w, Pixcol cols[], + int unk_l, int unk_r, unsigned unk_ctxmap) { int jobpipe[2],donepipe[2], c, r,i, x,y; const OcrResultGlyph *s; const char *p; @@ -119,10 +175,18 @@ static void callout_unknown(int w, int h, Pixcol cols[], resolver= fdopen(jobpipe[1],"w"); eassert(resolver); resolver_done= donepipe[0]; } - fprintf(resolver,"%d %d %d",unk_l,unk_r,unk_ctx); - for (i=0, s=sofar; iresults; inresults; i++, s++) { if (!strcmp(s->s," ")) continue; - fprintf(resolver," %d %d %d ",s->l,s->r,s->ctx); + fprintf(resolver," %d %d %s ",s->l,s->r,context_names[s->ctx]); for (p=s->s; (c= *p); p++) { if (c=='\\') fprintf(resolver,"\\%c",c); else if (c>=33 && c<=126) fputc(c,resolver); @@ -138,8 +202,8 @@ static void callout_unknown(int w, int h, Pixcol cols[], "\"%d %d 2 1\",\n" "\" c black\",\n" "\"o c white\",\n", - w,h); - for (y=0, pv=1; yh); + for (y=0, pv=1; yh; y++, pv<<=1) { fputc('"',resolver); for (x=0; x= aresults) { - aresults++; aresults<<=1; - results= realloc(results,sizeof(*results)*aresults); - eassert(results); +static void add_result(OcrReader *rd, const char *s, int l, int r, int ctx) { + if (rd->nresults >= rd->aresults) { + rd->aresults++; rd->aresults<<=1; + rd->results= realloc(rd->results,sizeof(*rd->results)*rd->aresults); + eassert(rd->results); } - results[nresults].s= s; - results[nresults].l= l; - results[nresults].r= r; - results[nresults].ctx= ctx; - nresults++; + rd->results[rd->nresults].s= s; + rd->results[rd->nresults].l= l; + rd->results[rd->nresults].r= r; + rd->results[rd->nresults].ctx= ctx; + rd->nresults++; } -OcrResultGlyph *ocr(int w, int h, Pixcol cols[]) { +OcrResultGlyph *ocr(OcrReader *rd, int w, Pixcol cols[]) { int nspaces=-w; - int ctx=1,i, x; + unsigned ctxmap=2; /* uppercase */ + int ctxi, i, x; - nresults=0; - assert(db); + rd->nresults=0; - fprintf(debug,"OCR h=%d w=%d",w,h); + fprintf(debug,"OCR h=%d w=%d",rd->h,w); for (x=0; xw) break; - Pixcol cv= cols[x]; - fprintf(debug," cv=%"PSPIXCOL(PRIx),x); - for (i=0; inlinks; i++) - if (current->links[i].col == cv) - goto found; - /* not found */ - fprintf(debug," ?"); - break; + DatabaseNode *uniquematch= 0; + int uniquematch_rx=-1, uniquematch_ctxi=-1; + + fprintf(debug,"OCR lx=%d ctxmap=%x ",lx,ctxmap); + + for (ctxi=0; ctxicontexts[ctxi];; + DatabaseNode *bestmatch= 0; + int bestmatch_rx=-1; + + x= lx; + if (!(ctxmap & (1u << ctxi))) continue; + fprintf(debug," || %s",context_names[ctxi]); + + for (;;) { + debug_flush(); + fprintf(debug," | x=%d",x); + if (x>w) break; + Pixcol cv= cols[x]; + fprintf(debug," cv=%"PSPIXCOL(PRIx),cv); + for (i=0; inlinks; i++) + if (current->links[i].col == cv) + goto found; + /* not found */ + fprintf(debug," ?"); + break; + + found: + current= current->links[i].then; + if (current->s[0]) { + fprintf(debug," \"%s\"",current->s); + bestmatch= current; + bestmatch_rx= x; + } else { + fprintf(debug," ..."); + } - found: - current= current->links[i].then; - if (current->s[0]) { - fprintf(debug," \"%s\"",current->s); - bestmatch=current; bestmatch_rx=x; - } else { - fprintf(debug," ..."); + x++; + } + + if (bestmatch) { + if (uniquematch) { + fprintf(debug, " ambiguous"); + uniquematch= 0; + break; + } + uniquematch= bestmatch; + uniquematch_rx= bestmatch_rx; + uniquematch_ctxi= ctxi; } - x++; } - if (bestmatch) { - fprintf(debug," YES\n"); - add_result(bestmatch->s, lx, bestmatch_rx, ctx); - x= bestmatch_rx+1; - ctx= 0; + if (uniquematch) { + fprintf(debug," || YES\n"); + add_result(rd, uniquematch->s, lx, uniquematch_rx, uniquematch_ctxi); + x= uniquematch_rx+1; + ctxmap= 1; /* Lower only */ } else { int rx; - fprintf(debug," UNKNOWN"); + fprintf(debug," || UNKNOWN"); for (rx=lx; rxnresults); debug_flush(); - return results; + return rd->results; } -void ocr_init(void) { - ocr_readdb(); +OcrReader *ocr_init(int h) { + OcrReader *rd; + + rd= malloc(sizeof(*rd)); eassert(rd); + memset(rd,0,sizeof(*rd)); + rd->h= h; + readdb(rd); + return rd; } diff --git a/pctb/ocr.h b/pctb/ocr.h index 26bccbc..d08ae15 100644 --- a/pctb/ocr.h +++ b/pctb/ocr.h @@ -1,12 +1,15 @@ #ifndef OCR_H #define OCR_H +#define _GNU_SOURCE + #include #include #include #include #include #include +#include #include #include @@ -19,16 +22,23 @@ typedef struct { int ctx; /* match context index */ } OcrResultGlyph; -OcrResultGlyph *ocr(int w, int h, Pixcol cols[]); +typedef const struct OcrGlyphContextDeveloperInfo *OcrCellContext; +extern const struct OcrGlyphContextDeveloperInfo *ocr_celltype_text; +extern const struct OcrGlyphContextDeveloperInfo *ocr_celltype_number; + +typedef struct OcrReader OcrReader; +OcrReader *ocr_init(int h); + +OcrResultGlyph *ocr(OcrReader *rd, int w, Pixcol cols[]); /* return value is array terminated by {0,-1,-1} * array is valid until next call to ocr() */ -void ocr_init(void); - void debug_flush(void); #define eassert assert #define debug stdout +const char *get_vardir(void); + #endif /*OCR_H*/ diff --git a/pctb/show-thing.tcl b/pctb/show-thing.tcl index 0d2ec0c..440f9c6 100755 --- a/pctb/show-thing.tcl +++ b/pctb/show-thing.tcl @@ -3,7 +3,7 @@ # usage: # run show-thing without args # then on stdin write -# one line which is a Tcl list for glyphsdone +# one line which is a Tcl list for unk_{l,r} unk_contexts glyphsdone # the xpm in the format expected # then expect child to raise SIGSTOP or exit 0 or exit nonzero # if child raised SIGSTOP, check database was updated @@ -24,10 +24,11 @@ set inter 1 set gotsh 20 set csrh 20 +set ctxh 20 proc init_widgets {} { # idempotent - global csrh gotsh + global csrh gotsh ctxh if {[winfo exists .d]} return @@ -38,6 +39,7 @@ proc init_widgets {} { frame .d.csr -bg black -height $csrh frame .d.got -bg black -height $gotsh + frame .d.ctx -bg black image create bitmap image/cursor -data \ {#define csr_width 11 @@ -52,22 +54,54 @@ static unsigned char csr_bits[] = { entry .d.csr.csr.e -bd 0 pack .d.csr.csr.l -side left - frame .d.csr_0 -bg white -width 1 - frame .d.csr_1 -bg white -width 1 + frame .d.mi.csr_0 -bg white -width 1 + frame .d.mi.csr_1 -bg white -width 1 - place .d.csr -x 0 -y 0 - place .d.mi -x 0 -y $csrh + pack .d.csr .d.mi .d.got .d.ctx -side top pack .d frame .help pack .help } +proc show_context {maxhv x ctxs} { + global mul + upvar 1 $maxhv maxh + set w .d.ctx.at$x + if {[llength $ctxs]==1} { set fg blue } { set fg yellow } + label $w -bg black -fg $fg -text [join $ctxs "/\n "] + place $w -x [expr {$x*$mul}] -y 0 + set wh [winfo reqheight $w] + if {$wh > $maxh} { set maxh $wh } +} + +proc resize_widgets {} { + global mulcols mulrows csrh gotsh ctxh glyphsdone + global unk_l unk_contexts + + foreach w {.d.csr .d.got .d.ctx} { + $w configure -width $mulcols + } + #.d configure -height [expr {$csrh+$mulrows+$gotsh+$ctxh}] + foreach w {0 1} { + .d.mi.csr_$w configure -height $mulrows + } + + eval destroy [winfo children .d.ctx] + + set maxh 0 + foreach {min max context got} $glyphsdone { + show_context maxh $min [list $context] + } + show_context maxh $unk_l $unk_contexts + .d.ctx configure -height $maxh +} + #---------- xpm input processor ---------- proc read_xpm {f} { - global glyphsdone mul inter rhsmost_max unk_l unk_r gotsh csrh + global glyphsdone mul inter rhsmost_max unk_l unk_r mulcols mulrows global cols rows wordmap set o {} @@ -144,15 +178,6 @@ proc read_xpm {f} { } set data [exec xpmtoppm << $o] image create photo image/main -data $data - - foreach w {.d .d.csr .d.got} { - $w configure -width $mulcols - } - .d configure -height [expr {$csrh+$mulrows+$gotsh}] - foreach w {0 1} { - .d.csr_$w configure -height $mulrows - } - place .d.got -x 0 -y [expr {$csrh+$mulrows}] } @@ -245,9 +270,10 @@ proc recursor/text {} { pack .d.csr.csr.e -side left focus .d.csr.csr.e bind_key Return { - binary scan [.d.csr.csr.e get] H* hex - if {[string length $hex]} { - RETURN_RESULT DEFINE "$cur_0 $cur_1 $hex" + set strq [.d.csr.csr.e get] + if {[regexp {^(?:[!-[]|[]-~]|\\\\|\\x[0-9a-f]{2})+} $strq]} { + .d.csr.csr.e delete 0 end + RETURN_RESULT DEFINE "$cur_0 $cur_1 $strq" } } bind_key Escape { @@ -308,7 +334,7 @@ proc leftright {var min max inc} { proc recursor {} { global csrh cur_mode cur_0 cur_1 mul foreach z1 {0 1} { - place .d.csr_$z1 -y $csrh -x [expr {[set cur_$z1] * $mul}] + place .d.mi.csr_$z1 -y 0 -x [expr {[set cur_$z1] * $mul}] } recursor/$cur_mode } @@ -324,43 +350,55 @@ proc recursor {} { # $database($context 0x 0x...) = $hex +set database_header {# ypp-sc-tools pctb font v1} + +proc db_getsl {f} { + if {[gets $f l] < 0} { error "unexpected db eof" } + return $l +} + proc read_database {} { - global database - set f [open database r] - while {[gets $f l] >= 0} { - if {![regexp {^(\w+) (\d+) ((?:[0-9a-f]{2})+)$} $l \ - dummy context strl strh]} { - error "bad syntax" - } - if {[string length $strh] != $strl*2} { error "$strh $strl" } - gets $f l; set width [format %d $l] + global database database_header rows database_fn + catch { unset database } + set database_fn ./charset-$rows.txt + set f [open $database_fn r] + if {[string compare [db_getsl $f] $database_header]} { error "$l ?" } + if {([db_getsl $f])+0 != $rows} { error "wrong h ?" } + while 1 { + set context [db_getsl $f] + if {![string length $context]} continue + if {[regexp {^\#} $context]} continue + if {![string compare . $context]} break + set bm $context - for {set x 0} {$x < $width} {incr x} { - gets $f l; lappend bm [format %x 0x$l] + set strq [db_getsl $f] + while 1 { + set l [db_getsl $f] + if {![string length $l]} break + lappend bm [format %x 0x$l] } - set database($bm) $strh + set database($bm) $strq } close $f } proc write_database {} { - global database + global database rows database_fn database_header set ol {} foreach bm [array names database] { - set strh $database($bm) - set strs [binary format H* $strh] - set strdo [format "%d %s" [expr {[string length $strh]/2}] $strh] - set o "[lindex $bm 0] $strdo\n" - append o [format "%d\n" [expr {[llength $bm]-1}]] + set strq $database($bm) + set o "[lindex $bm 0]\n$strq\n" foreach x [lrange $bm 1 end] { append o "$x\n" } + lappend ol $o } - set f [open database.new w] + set f [open $database_fn.new w] + puts $f "$database_header\n$rows\n" foreach o [lsort $ol] { - puts -nonewline $f $o + puts $f $o } close $f - file rename -force database.new database + file rename -force $database_fn.new $database_fn } proc dbkey {ctx l r} { @@ -372,23 +410,25 @@ proc dbkey {ctx l r} { return $bm } -proc update_database/DEFINE {c0 c1 strh} { - global glyphsdone unk_l unk_context wordmap database +proc update_database/DEFINE {c0 c1 strq} { + global glyphsdone unk_l unk_contexts wordmap database if {$c0 > $c1} { manyset [list $c0 $c1] c1 c0 } if {$c0 == $unk_l} { - set ncontext $unk_context + set ncontexts $unk_contexts } else { foreach {l r context got} $glyphsdone { - if {$l==$c0} { set ncontext $context; break } + if {$l==$c0} { set ncontexts [list $context]; break } } - if {![info exists ncontext]} { + if {![info exists ncontexts]} { puts stderr "must start at letter LHS!" return } } incr c1 -1 - set bm [dbkey $ncontext $c0 $c1] - set database($bm) $strh + foreach c $ncontexts { + set bm [dbkey $c $c0 $c1] + set database($bm) $strq + } write_database } @@ -415,7 +455,7 @@ proc RETURN_RESULT {how what} { #---------- main progrm ---------- proc main/test {} { - global glyphsdone unk_l unk_r unk_context + global glyphsdone unk_l unk_r unk_contexts set glyphsdone { 7 11 1 M @@ -424,12 +464,14 @@ proc main/test {} { } set unk_l 25 set unk_r 29 - set unk_context 0 + set unk_contexts Test set f [open text.xpm] read_xpm $f close $f + read_database + resize_widgets draw_glyphsdone startup_cursor } @@ -437,20 +479,22 @@ proc done/test {} { } proc required {} { - global glyphsdone unk_l unk_r unk_context + global glyphsdone unk_l unk_r unk_contexts if {[gets stdin l]<0} { if {[eof stdin]} { fconfigure stdin -blocking yes; exit 0 } return } init_widgets - manyset [lrange $l 0 3] unk_l unk_r unk_context + manyset [lrange $l 0 3] unk_l unk_r unk_contexts set glyphsdone [lrange $l 3 end] puts stderr "SHOW-THING GOT $l" fileevent stdin readable {} read_xpm stdin + resize_widgets + read_database draw_glyphsdone startup_cursor } @@ -471,5 +515,4 @@ switch -exact -- $argv { default { error "huh $argv ?" } } -read_database main/$mainkind -- 2.30.2