typedef struct {
Pixcol col;
- struct OCRDatabaseNode *then;
-} OCRDatabaseLink;
+ struct DatabaseNode *then;
+} DatabaseLink;
#define MAXGLYPHCHRS 3
-typedef struct OCRDatabaseNode {
+typedef struct DatabaseNode {
char s[MAXGLYPHCHRS+1]; /* null-terminated; "" means no match here */
int nlinks, alinks;
- OCRDatabaseLink *links;
-} OCRDatabaseNode;
+ DatabaseLink *links;
+} DatabaseNode;
-#define N_OCR_CONTEXTS 2
+static const char *context_names[]= {
+ "Lower",
+ "Upper",
+/* "Digit"*/
+};
-static OCRDatabaseNode ocr_contexts[N_OCR_CONTEXTS];
-static FILE *db;
-static OcrResultGlyph *results;
-static int aresults, nresults;
+#define NCONTEXTS (sizeof(context_names)/sizeof(context_names[0]))
+
+struct OcrReader { /* per-reader OCR state; allocated and zeroed by ocr_init() */
+ int h; /* glyph height; selects the charset-<h>.txt database file */
+ DatabaseNode contexts[NCONTEXTS]; /* one root node per entry of context_names[] */
+ OcrResultGlyph *results; /* array grown with realloc by add_result() */
+ int aresults, nresults; /* allocated / used entries in results */
+};
static FILE *resolver;
static pid_t resolver_pid;
static int resolver_done;
-static void ocr_readdb(void) {
- int ctx,nchrs;
- OCRDatabaseNode *current, *additional;
+/* Reads one line from f into lbuf (capacity lbufsz) and strips the
+ * trailing newline.  EOF, a read error, an empty read, or a line that
+ * does not end in '\n' (for instance one too long for lbuf) fails an
+ * eassert. */
+static void fgetsline(FILE *f, char *lbuf, size_t lbufsz) {
+ char *s= fgets(lbuf,lbufsz,f);
+ eassert(s);
+ int l= strlen(lbuf);
+ eassert(l>0);
+ /* Compare, don't assign: "lbuf[--l]='\n'" always passed and silently
+ * overwrote the last character of an unterminated line.  The decrement
+ * is kept outside the check so it never depends on how eassert
+ * evaluates its argument. */
+ l--;
+ eassert(lbuf[l]=='\n');
+ lbuf[l]= 0;
+}
+#define FGETSLINE(f,buf) (fgetsline(f,buf,sizeof(buf)))
+
+/* Marks n and every node reachable through its links as "no match" by
+ * emptying the glyph string; the link structure itself is left alone. */
+static void cleardb_node(DatabaseNode *n) {
+ DatabaseLink *lk= n->links;
+ int remaining= n->nlinks;
+
+ n->s[0]= 0;
+ while (remaining-- > 0)
+ cleardb_node((lk++)->then);
+}
+
+/* Loads (or reloads) the glyph database for rd from
+ * get_vardir()/charset-<h>.txt into rd->contexts.  Nodes already in the
+ * tries are first marked non-matching with cleardb_node().  The file
+ * must start with the "pctb font v1" header line and the height, then
+ * hold entries (context name, glyph string, one hex column per line,
+ * blank line) and end with a line containing only ".".  Any I/O or
+ * syntax problem fails an eassert. */
+static void readdb(OcrReader *rd) {
+ int nchrs;
+ DatabaseNode *current, *additional;
char chrs[MAXGLYPHCHRS+1];
Pixcol cv;
- int r,i,j;
+ int r,j,ctxi;
+ int h;
+ char lbuf[100];
+ FILE *db;
+
+ char *dbfname=0;
+ asprintf(&dbfname,"%s/charset-%d.txt",get_vardir(),rd->h);
+ eassert(dbfname);
+
+ db= fopen(dbfname,"r"); eassert(db);
+ free(dbfname);
+
+ FGETSLINE(db,lbuf);
+ eassert(!strcmp(lbuf,"# ypp-sc-tools pctb font v1"));
+
+ r= fscanf(db, "%d", &h);
+ eassert(r==1);
+ eassert(h==rd->h);
- assert(!db);
- db= fopen("database","r"); eassert(db);
+ for (ctxi=0; ctxi<NCONTEXTS; ctxi++)
+ cleardb_node(&rd->contexts[ctxi]);
for (;;) {
- r= fscanf(db, "%d %d", &ctx, &nchrs);
- if (r==EOF) break;
- eassert(r==2);
- eassert(ctx>=0 && ctx<N_OCR_CONTEXTS);
- eassert(nchrs>0 && nchrs<=MAXGLYPHCHRS);
-
- for (i=0; i<nchrs; i++) {
- int c;
- r= fscanf(db, "%x", &c); eassert(r==1);
- eassert(c>0 && c<=255);
- chrs[i]= c;
+ FGETSLINE(db,lbuf);
+ /* Skip blank lines and comments.  This must test the first character:
+ * "!lbuf" tested the array's address, was always false, and let a stray
+ * blank line reach the skip loop below, which then ate the next entry. */
+ if (!lbuf[0] || lbuf[0]=='#') continue;
+ if (!strcmp(lbuf,".")) break;
+
+ for (ctxi=0; ctxi<NCONTEXTS; ctxi++)
+ if (!strcmp(lbuf,context_names[ctxi]))
+ goto found_ctx;
+ /* not found, just skip */
+ for (;;) { FGETSLINE(db,lbuf); if (!lbuf[0]) break; }
+ continue;
+
+ found_ctx:
+ for (nchrs=0;;) {
+ int c= fgetc(db); eassert(c!=EOF);
+ if (c=='\n') { eassert(nchrs); break; }
+ eassert(nchrs<MAXGLYPHCHRS);
+ if (c=='\\') {
+ unsigned cr;
+ c= fgetc(db); eassert(c=='x');
+ r= fscanf(db, "%2x", &cr); eassert(r==1);
+ eassert(cr>0 && cr<=255); /* eassert like every other input check here */
+ c= cr;
+ }
+ chrs[nchrs++]= c;
}
chrs[nchrs]= 0;
- int twidth;
- r= fscanf(db, "%d", &twidth); eassert(r==1);
- current= &ocr_contexts[ctx];
- for (i=0; i<twidth; i++) {
- r= fscanf(db, "%"PSPIXCOL(SCNx), &cv); eassert(r==1);
+ current= &rd->contexts[ctxi];
+ for (;;) {
+ FGETSLINE(db,lbuf);
+ if (!lbuf[0]) { eassert(current != &rd->contexts[ctxi]); break; }
+ char *ep;
+ cv= strtoul(lbuf,&ep,16); eassert(!*ep);
+ eassert(!(cv & ~((1UL << rd->h)-1)));
+
for (j=0; j<current->nlinks; j++)
if (current->links[j].col == cv) {
current= current->links[j].then;
current->alinks++;
current->alinks<<=1;
current->links= realloc(current->links,
- sizeof(*current->links) * current->alinks);
+ sizeof(*current->links) * current->alinks);
eassert(current->links);
}
current->links[current->nlinks].col= cv;
strcpy(current->s, chrs);
}
eassert(!ferror(db));
- eassert(feof(db));
+ eassert(!fclose(db));
}
-static void callout_unknown(int w, int h, Pixcol cols[],
- int unk_l, int unk_r, int unk_ctx,
- const OcrResultGlyph *sofar, int nsofar) {
+static void callout_unknown(OcrReader *rd, int w, Pixcol cols[],
+ int unk_l, int unk_r, unsigned unk_ctxmap) {
int jobpipe[2],donepipe[2], c, r,i, x,y;
const OcrResultGlyph *s;
const char *p;
resolver= fdopen(jobpipe[1],"w"); eassert(resolver);
resolver_done= donepipe[0];
}
- fprintf(resolver,"%d %d %d",unk_l,unk_r,unk_ctx);
- for (i=0, s=sofar; i<nsofar; i++, s++) {
+ fprintf(resolver,"%d %d {",unk_l,unk_r);
+ const char *spc="";
+ int ctxi;
+ for (ctxi=0; ctxi<NCONTEXTS; ctxi++) {
+ if (!(unk_ctxmap & (1u << ctxi))) continue;
+ fprintf(resolver,"%s%s",spc,context_names[ctxi]);
+ spc=" ";
+ }
+ fprintf(resolver,"}");
+ for (i=0, s=rd->results; i<rd->nresults; i++, s++) {
if (!strcmp(s->s," ")) continue;
- fprintf(resolver," %d %d %d ",s->l,s->r,s->ctx);
+ fprintf(resolver," %d %d %s ",s->l,s->r,context_names[s->ctx]);
for (p=s->s; (c= *p); p++) {
if (c=='\\') fprintf(resolver,"\\%c",c);
else if (c>=33 && c<=126) fputc(c,resolver);
"\"%d %d 2 1\",\n"
"\" c black\",\n"
"\"o c white\",\n",
- w,h);
- for (y=0, pv=1; y<h; y++, pv<<=1) {
+ w,rd->h);
+ for (y=0, pv=1; y<rd->h; y++, pv<<=1) {
fputc('"',resolver);
for (x=0; x<w; x++)
fputc(cols[x] & pv ? 'o' : ' ', resolver);
eassert(cb==0);
}
- fclose(db);
- db= 0;
- ocr_readdb();
+ readdb(rd);
}
-static void add_result(const char *s, int l, int r, int ctx) {
- if (nresults >= aresults) {
- aresults++; aresults<<=1;
- results= realloc(results,sizeof(*results)*aresults);
- eassert(results);
+static void add_result(OcrReader *rd, const char *s, int l, int r, int ctx) { /* appends one glyph record to rd->results */
+ if (rd->nresults >= rd->aresults) { /* array full: grow before storing */
+ rd->aresults++; rd->aresults<<=1; /* new capacity is (old+1)*2, so 0 becomes 2 */
+ rd->results= realloc(rd->results,sizeof(*rd->results)*rd->aresults);
+ eassert(rd->results);
}
- results[nresults].s= s;
- results[nresults].l= l;
- results[nresults].r= r;
- results[nresults].ctx= ctx;
- nresults++;
+ rd->results[rd->nresults].s= s; /* pointer is stored, the string is not copied */
+ rd->results[rd->nresults].l= l;
+ rd->results[rd->nresults].r= r;
+ rd->results[rd->nresults].ctx= ctx;
+ rd->nresults++;
}
-OcrResultGlyph *ocr(int w, int h, Pixcol cols[]) {
+OcrResultGlyph *ocr(OcrReader *rd, int w, Pixcol cols[]) {
int nspaces=-w;
- int ctx=1,i, x;
+ unsigned ctxmap=2; /* uppercase */
+ int ctxi, i, x;
- nresults=0;
- assert(db);
+ rd->nresults=0;
- fprintf(debug,"OCR h=%d w=%d",w,h);
+ fprintf(debug,"OCR h=%d w=%d",rd->h,w);
for (x=0; x<w; x++) fprintf(debug," %"PSPIXCOL(PRIx),cols[x]);
fprintf(debug,"\n");
debug_flush();
x++;
if (nspaces==3) {
fprintf(debug,"OCR x=%x nspaces=%d space\n",x,nspaces);
- add_result(" ",x-nspaces,x+1,0);
- ctx=1;
+ add_result(rd," ",x-nspaces,x+1,0);
+ ctxmap=3; /* either */
}
continue;
}
nspaces=0;
/* find character */
- OCRDatabaseNode *current=0, *bestmatch=0;
int lx=x;
- int bestmatch_rx=-1;
- current= &ocr_contexts[ctx];
- fprintf(debug,"OCR lx=%d ctx=%d ",lx,ctx);
- for (;;) {
- debug_flush();
- fprintf(debug,"| x=%d",x);
- if (x>w) break;
- Pixcol cv= cols[x];
- fprintf(debug," cv=%"PSPIXCOL(PRIx),x);
- for (i=0; i<current->nlinks; i++)
- if (current->links[i].col == cv)
- goto found;
- /* not found */
- fprintf(debug," ?");
- break;
+ DatabaseNode *uniquematch= 0;
+ int uniquematch_rx=-1, uniquematch_ctxi=-1;
+
+ fprintf(debug,"OCR lx=%d ctxmap=%x ",lx,ctxmap);
+
+ for (ctxi=0; ctxi<NCONTEXTS; ctxi++) {
+ DatabaseNode *current= &rd->contexts[ctxi];;
+ DatabaseNode *bestmatch= 0;
+ int bestmatch_rx=-1;
+
+ x= lx;
+ if (!(ctxmap & (1u << ctxi))) continue;
+ fprintf(debug," || %s",context_names[ctxi]);
+
+ for (;;) {
+ debug_flush();
+ fprintf(debug," | x=%d",x);
+ if (x>w) break;
+ Pixcol cv= cols[x];
+ fprintf(debug," cv=%"PSPIXCOL(PRIx),cv);
+ for (i=0; i<current->nlinks; i++)
+ if (current->links[i].col == cv)
+ goto found;
+ /* not found */
+ fprintf(debug," ?");
+ break;
+
+ found:
+ current= current->links[i].then;
+ if (current->s[0]) {
+ fprintf(debug," \"%s\"",current->s);
+ bestmatch= current;
+ bestmatch_rx= x;
+ } else {
+ fprintf(debug," ...");
+ }
- found:
- current= current->links[i].then;
- if (current->s[0]) {
- fprintf(debug," \"%s\"",current->s);
- bestmatch=current; bestmatch_rx=x;
- } else {
- fprintf(debug," ...");
+ x++;
+ }
+
+ if (bestmatch) {
+ if (uniquematch) {
+ fprintf(debug, " ambiguous");
+ uniquematch= 0;
+ break;
+ }
+ uniquematch= bestmatch;
+ uniquematch_rx= bestmatch_rx;
+ uniquematch_ctxi= ctxi;
}
- x++;
}
- if (bestmatch) {
- fprintf(debug," YES\n");
- add_result(bestmatch->s, lx, bestmatch_rx, ctx);
- x= bestmatch_rx+1;
- ctx= 0;
+ if (uniquematch) {
+ fprintf(debug," || YES\n");
+ add_result(rd, uniquematch->s, lx, uniquematch_rx, uniquematch_ctxi);
+ x= uniquematch_rx+1;
+ ctxmap= 1; /* Lower only */
} else {
int rx;
- fprintf(debug," UNKNOWN");
+ fprintf(debug," || UNKNOWN");
for (rx=lx; rx<w && cols[rx]; rx++);
- fprintf(debug," x=%d ctx=%d %d..%d\n",x, ctx, lx,rx);
+ fprintf(debug," x=%d ctxmap=%x %d..%d\n",x, ctxmap, lx,rx);
debug_flush();
- callout_unknown(w,h,cols, lx,rx-1,ctx, results,nresults);
+ callout_unknown(rd, w,cols, lx,rx-1, ctxmap);
goto restart;
}
}
- add_result(0,-1,-1,0);
- fprintf(debug,"OCR finished %d glyphs\n",nresults);
+ add_result(rd, 0,-1,-1,0);
+ fprintf(debug,"OCR finished %d glyphs\n",rd->nresults);
debug_flush();
- return results;
+ return rd->results;
}
-void ocr_init(void) {
- ocr_readdb();
+OcrReader *ocr_init(int h) { /* allocates a reader for glyph height h and loads its database */
+ OcrReader *rd;
+
+ rd= malloc(sizeof(*rd)); eassert(rd);
+ memset(rd,0,sizeof(*rd)); /* zero link counts and result array before readdb() walks the contexts */
+ rd->h= h;
+ readdb(rd);
+ return rd;
}
# usage:
# run show-thing without args
# then on stdin write
-# one line which is a Tcl list for glyphsdone
+# one line which is a Tcl list for unk_{l,r} unk_contexts glyphsdone
# the xpm in the format expected
# then expect child to raise SIGSTOP or exit 0 or exit nonzero
# if child raised SIGSTOP, check database was updated
set gotsh 20
set csrh 20
+set ctxh 20
proc init_widgets {} {
# idempotent
- global csrh gotsh
+ global csrh gotsh ctxh
if {[winfo exists .d]} return
frame .d.csr -bg black -height $csrh
frame .d.got -bg black -height $gotsh
+ frame .d.ctx -bg black
image create bitmap image/cursor -data \
{#define csr_width 11
entry .d.csr.csr.e -bd 0
pack .d.csr.csr.l -side left
- frame .d.csr_0 -bg white -width 1
- frame .d.csr_1 -bg white -width 1
+ frame .d.mi.csr_0 -bg white -width 1
+ frame .d.mi.csr_1 -bg white -width 1
- place .d.csr -x 0 -y 0
- place .d.mi -x 0 -y $csrh
+ pack .d.csr .d.mi .d.got .d.ctx -side top
pack .d
frame .help
pack .help
}
+# Creates a label in .d.ctx at column $x listing the context names in
+# $ctxs (blue when there is exactly one, yellow otherwise), and raises
+# the caller's variable named by $maxhv to the label's requested height
+# when that is larger.
+proc show_context {maxhv x ctxs} {
+ global mul
+ upvar 1 $maxhv tallest
+ set colour [expr {[llength $ctxs]==1 ? "blue" : "yellow"}]
+ set lbl .d.ctx.at$x
+ label $lbl -bg black -fg $colour -text [join $ctxs "/\n "]
+ place $lbl -x [expr {$x*$mul}] -y 0
+ set need [winfo reqheight $lbl]
+ if {$need > $tallest} { set tallest $need }
+}
+
+# Sets the width of the cursor, result and context strips to $mulcols
+# and the height of both cursor bars to $mulrows, then rebuilds the
+# context labels: one per entry of $glyphsdone plus one for the unknown
+# glyph at $unk_l.  .d.ctx ends up as tall as its tallest label.
+proc resize_widgets {} {
+ global mulcols mulrows csrh gotsh ctxh glyphsdone
+ global unk_l unk_contexts
+
+ foreach strip {.d.csr .d.got .d.ctx} {
+ $strip configure -width $mulcols
+ }
+ #.d configure -height [expr {$csrh+$mulrows+$gotsh+$ctxh}]
+ foreach side {0 1} {
+ .d.mi.csr_$side configure -height $mulrows
+ }
+
+ eval destroy [winfo children .d.ctx]
+
+ set tallest 0
+ foreach {lhs rhs ctx str} $glyphsdone {
+ show_context tallest $lhs [list $ctx]
+ }
+ show_context tallest $unk_l $unk_contexts
+ .d.ctx configure -height $tallest
+}
+
#---------- xpm input processor ----------
proc read_xpm {f} {
- global glyphsdone mul inter rhsmost_max unk_l unk_r gotsh csrh
+ global glyphsdone mul inter rhsmost_max unk_l unk_r mulcols mulrows
global cols rows wordmap
set o {}
}
set data [exec xpmtoppm << $o]
image create photo image/main -data $data
-
- foreach w {.d .d.csr .d.got} {
- $w configure -width $mulcols
- }
- .d configure -height [expr {$csrh+$mulrows+$gotsh}]
- foreach w {0 1} {
- .d.csr_$w configure -height $mulrows
- }
- place .d.got -x 0 -y [expr {$csrh+$mulrows}]
}
pack .d.csr.csr.e -side left
focus .d.csr.csr.e
bind_key Return {
- binary scan [.d.csr.csr.e get] H* hex
- if {[string length $hex]} {
- RETURN_RESULT DEFINE "$cur_0 $cur_1 $hex"
+ set strq [.d.csr.csr.e get]
+ # The whole entry must be printable non-space characters or escapes.
+ # Without the trailing anchor a valid prefix was enough, so text with
+ # spaces or control characters got through into the result list.
+ if {[regexp {^(?:[!-[]|[]-~]|\\\\|\\x[0-9a-f]{2})+$} $strq]} {
+ .d.csr.csr.e delete 0 end
+ RETURN_RESULT DEFINE "$cur_0 $cur_1 $strq"
}
}
bind_key Escape {
proc recursor {} {
global csrh cur_mode cur_0 cur_1 mul
foreach z1 {0 1} {
- place .d.csr_$z1 -y $csrh -x [expr {[set cur_$z1] * $mul}]
+ place .d.mi.csr_$z1 -y 0 -x [expr {[set cur_$z1] * $mul}]
}
recursor/$cur_mode
}
# $database($context 0x<bits> 0x<bits>...) = $hex
+set database_header {# ypp-sc-tools pctb font v1}
+
+# Returns the next line of channel $f; raises "unexpected db eof" when
+# gets reports that no line could be read.
+proc db_getsl {f} {
+ set got [gets $f line]
+ if {$got >= 0} { return $line }
+ error "unexpected db eof"
+}
+
+# Replaces the global database array with the contents of
+# ./charset-$rows.txt.  The file must begin with $database_header and a
+# line equal to $rows; each entry is a context line, a glyph-string
+# line, then hex column lines up to a blank line.  A line containing
+# only "." ends the file; hitting EOF first is an error.
proc read_database {} {
- global database
- set f [open database r]
- while {[gets $f l] >= 0} {
- if {![regexp {^(\w+) (\d+) ((?:[0-9a-f]{2})+)$} $l \
- dummy context strl strh]} {
- error "bad syntax"
- }
- if {[string length $strh] != $strl*2} { error "$strh $strl" }
- gets $f l; set width [format %d $l]
+ global database database_header rows database_fn
+ catch { unset database }
+ set database_fn ./charset-$rows.txt
+ set f [open $database_fn r]
+ # Keep the header line in a variable: the error message below used $l
+ # before anything had set it, which hid the real problem behind
+ # a "can't read l" error.
+ set l [db_getsl $f]
+ if {[string compare $l $database_header]} { error "$l ?" }
+ if {([db_getsl $f])+0 != $rows} { error "wrong h ?" }
+ while 1 {
+ set context [db_getsl $f]
+ if {![string length $context]} continue
+ if {[regexp {^\#} $context]} continue
+ if {![string compare . $context]} break
+
set bm $context
- for {set x 0} {$x < $width} {incr x} {
- gets $f l; lappend bm [format %x 0x$l]
+ set strq [db_getsl $f]
+ while 1 {
+ set l [db_getsl $f]
+ if {![string length $l]} break
+ lappend bm [format %x 0x$l]
}
- set database($bm) $strh
+ set database($bm) $strq
}
close $f
}
+# Writes the whole database array to $database_fn.new in the text format
+# read_database parses (header, $rows, sorted entries each followed by a
+# blank line, then a "." terminator) and renames it over $database_fn.
proc write_database {} {
- global database
+ global database rows database_fn database_header
set ol {}
foreach bm [array names database] {
- set strh $database($bm)
- set strs [binary format H* $strh]
- set strdo [format "%d %s" [expr {[string length $strh]/2}] $strh]
- set o "[lindex $bm 0] $strdo\n"
- append o [format "%d\n" [expr {[llength $bm]-1}]]
+ set strq $database($bm)
+ set o "[lindex $bm 0]\n$strq\n"
foreach x [lrange $bm 1 end] { append o "$x\n" }
+
lappend ol $o
}
- set f [open database.new w]
+ set f [open $database_fn.new w]
+ puts $f "$database_header\n$rows\n"
foreach o [lsort $ol] {
- puts -nonewline $f $o
+ puts $f $o
}
+ # Both readers stop only at a line holding just "." and treat EOF as
+ # an error, so the file has to end with that terminator.
+ puts $f .
close $f
- file rename -force database.new database
+ file rename -force $database_fn.new $database_fn
}
proc dbkey {ctx l r} {
return $bm
}
-proc update_database/DEFINE {c0 c1 strh} {
- global glyphsdone unk_l unk_context wordmap database
+proc update_database/DEFINE {c0 c1 strq} {
+ global glyphsdone unk_l unk_contexts wordmap database
if {$c0 > $c1} { manyset [list $c0 $c1] c1 c0 }
if {$c0 == $unk_l} {
- set ncontext $unk_context
+ set ncontexts $unk_contexts
} else {
foreach {l r context got} $glyphsdone {
- if {$l==$c0} { set ncontext $context; break }
+ if {$l==$c0} { set ncontexts [list $context]; break }
}
- if {![info exists ncontext]} {
+ if {![info exists ncontexts]} {
puts stderr "must start at letter LHS!"
return
}
}
incr c1 -1
- set bm [dbkey $ncontext $c0 $c1]
- set database($bm) $strh
+ foreach c $ncontexts {
+ set bm [dbkey $c $c0 $c1]
+ set database($bm) $strq
+ }
write_database
}
#---------- main progrm ----------
proc main/test {} {
- global glyphsdone unk_l unk_r unk_context
+ global glyphsdone unk_l unk_r unk_contexts
set glyphsdone {
7 11 1 M
}
set unk_l 25
set unk_r 29
- set unk_context 0
+ set unk_contexts Test
set f [open text.xpm]
read_xpm $f
close $f
+ read_database
+ resize_widgets
draw_glyphsdone
startup_cursor
}
}
proc required {} {
- global glyphsdone unk_l unk_r unk_context
+ global glyphsdone unk_l unk_r unk_contexts
if {[gets stdin l]<0} {
if {[eof stdin]} { fconfigure stdin -blocking yes; exit 0 }
return
}
init_widgets
- manyset [lrange $l 0 3] unk_l unk_r unk_context
+ manyset [lrange $l 0 3] unk_l unk_r unk_contexts
set glyphsdone [lrange $l 3 end]
puts stderr "SHOW-THING GOT $l"
fileevent stdin readable {}
read_xpm stdin
+ resize_widgets
+ read_database
draw_glyphsdone
startup_cursor
}
default { error "huh $argv ?" }
}
-read_database
main/$mainkind