From: Ian Jackson Date: Tue, 2 Jun 2009 01:47:02 +0000 (+0100) Subject: wip ocr X-Git-Tag: 1.9.2~195 X-Git-Url: https://www.chiark.greenend.org.uk/ucgi/~yarrgweb/git?p=ypp-sc-tools.web-live.git;a=commitdiff_plain;h=59acfe7ae6e8c81462dce00da5006166efb63d22 wip ocr --- diff --git a/pctb/Makefile b/pctb/Makefile index bff47f5..10c0be9 100644 --- a/pctb/Makefile +++ b/pctb/Makefile @@ -1,5 +1,5 @@ LDLIBS += -lnetpbm CFLAGS += -Wall -Wwrite-strings -Wpointer-arith -Wmissing-prototypes \ - -Wstrict-prototypes + -Wstrict-prototypes -g all: convert diff --git a/pctb/convert.c b/pctb/convert.c index 598d446..a0ecca1 100644 --- a/pctb/convert.c +++ b/pctb/convert.c @@ -3,6 +3,7 @@ #include #include #include +#include #define eassert assert #define debug stdout @@ -241,10 +242,155 @@ static void load_image_and_canonify(void) { debug_flush(); } +typedef uint32_t Pixcol; +#define PSPIXCOL(priscan) priscan##32 + +typedef struct { + Pixcol col; + struct OCRDatabaseNode *then; +} OCRDatabaseLink; + +#define MAXGLYPHCHRS 3 + +typedef struct OCRDatabaseNode { + char s[MAXGLYPHCHRS+1]; /* null-terminated; "" means no match here */ + int nlinks, alinks; + OCRDatabaseLink *links; +} OCRDatabaseNode; + +#define N_OCR_CONTEXTS 2 +static OCRDatabaseNode ocr_contexts[N_OCR_CONTEXTS]; + +static void load_ocr_database(void) { + int ctx,nchrs; + OCRDatabaseNode *current, *additional; + char chrs[MAXGLYPHCHRS+1]; + Pixcol cv; + int r,i,j; + + FILE *db= fopen("database","r"); eassert(db); + + for (;;) { + r= fscanf(db, "%d %d", &ctx, &nchrs); + if (r==EOF) break; + eassert(r==2); + eassert(ctx>=0 && ctx0 && nchrs<=MAXGLYPHCHRS); + + for (i=0; i0 && c<=255); + chrs[i]= c; + } + chrs[nchrs]= 0; + + int twidth; + r= fscanf(db, "%d", &twidth); eassert(r==1); + current= &ocr_contexts[ctx]; + for (i=0; inlinks; j++) + if (current->links[j].col == cv) { + current= current->links[j].then; + goto found_link; + } + + additional= malloc(sizeof(*additional)); eassert(additional); + additional->s[0]= 0; + additional->nlinks= additional->alinks= 0; + additional->links= 0; + if (current->nlinks==current->alinks) { + current->alinks++; + current->alinks<<=1; + current->links= realloc(current->links, + sizeof(*current->links) * current->alinks); + eassert(current->links); + } + current->links[current->nlinks].col= cv; + current->links[current->nlinks].then= additional; + current->nlinks++; + current= additional; + + found_link:; + } + + eassert(!current->s[0]); + strcpy(current->s, chrs); + } + eassert(!ferror(db)); + eassert(feof(db)); + fclose(db); +} + +static void ocr_rectangle(Rect r) { + int w= r.br.x - r.tl.x + 1; + int h= r.br.y - r.tl.y + 1; + Pixcol cols[w+1]; + int x,y; + for (x=0; xw) break; + + if (!cols[x]) { + nspaces++; + x++; + if (nspaces>3) ctx=1; + continue; + } + + OCRDatabaseNode *current=0, *lastmatch=0; + int startx=x; + int afterlastmatchx=-1; + current= &ocr_contexts[ctx]; + for (;;) { + if (x>w) break; + Pixcol cv= cols[x]; + for (i=0; inlinks; i++) + if (current->links[i].col == cv) + goto found; + /* not found */ + break; + found: + x++; + current= current->links[i].then; + if (current->s[0]) { lastmatch=current; afterlastmatchx=x; } + } + + if (!lastmatch) { + int x2; + for (x2=x+1; x2s); + x= afterlastmatchx; + ctx= 0; + } + } +} + int main(void) { Rect thisr, entryr; int tryrect, colno; - + + load_ocr_database(); load_image_and_canonify(); find_structure(); @@ -256,7 +402,7 @@ int main(void) { for (colno=0; colno= 0} { + if {![regexp {^"(.*)",$} $l dummy l]} { + append o "$l\n" + continue + } + if {$y==-3} { + manyset $l cols rows colours cpp + #assert {$colours==2} + #assert {$cpp==1} + set mulcols [expr {$cols*$mul+$inter}] + set mulrows [expr {$rows*$mul+$inter}] + append o "\"$mulcols $mulrows 9 1\",\n" + for {set x 0} {$x<$cols} {incr x} { set wordmap($x) 0 } + } elseif {$y==-2} { # first pixel + append o \ +"\"+ c #111\", +\"a c #800\", +\"A c #fcc\", +\"b c #00c\", +\"B c #fff\", +\"u c #000\", +\"U c #ff0\", +\"q c #000\", +\"Q c #ff0\",\n" + } elseif {$y==-1} { # 2nd pixel but we've already printed ours + } else { + set ybit [expr {1<<$y}] + set x 0 + set ol "\"+" + set olh $ol + foreach c [split $l ""] { + set how "u" + if {$x >= $unk_l && $x <= $unk_r} { + set how q + } else { + set ab 0 + foreach {min max context got} $foolist { + set rhsmost_max $max + if {$x >= $min && $x <= $max} { + set how [lindex {a b} $ab] + break + } + set ab [expr {!$ab}] + } + } + switch -exact $c { + " " { set p $how } + "o" { + set p [string toupper $how] + incr wordmap($x) $ybit + } + default { error "$c ?" } + } + append ol "[string repeat $p [expr {$mul-$inter}]][ + string repeat + $inter]" + append olh [string repeat + $mul] + incr x + } + set ole "\",\n" + append ol $ole + append olh $ole + set olhn [string repeat $olh $inter] + if {!$y} { append o $olhn } + append o [string repeat $ol [expr {$mul-1}]] + append o $olhn + } + incr y +} + +#puts $o + +set xpm [exec xpmtoppm << $o] + +set gotsh 20 +set csrh 20 + +frame .d -width $mulcols -height [expr {$csrh+$mulrows+$gotsh}] + +set mi [image create photo -data $xpm] +label .d.mi -image $mi -borderwidth 0 + +frame .d.csr -bg black -width $mulcols -height $csrh +frame .d.got -bg black -width $mulcols -height $gotsh + +foreach {min max context got} $foolist { + frame .d.got.m$min -bd 0 -background \#888 + label .d.got.m$min.l -text "$got" -fg white -bg black -bd 0 + pack .d.got.m$min.l -padx 1 -pady 1 + place .d.got.m$min -x [expr {$min*$mul+$inter}] -y 0 +} + +set imcsr [image create bitmap -data \ +{#define csr_width 11 +#define csr_height 11 +static unsigned char csr_bits[] = { + 0x20, 0x00, 0x20, 0x00, 0x20, 0x00, 0x21, 0x04, 0x22, 0x02, 0x25, 0x05, + 0xaa, 0x02, 0x74, 0x01, 0xa8, 0x00, 0x70, 0x00, 0x20, 0x00}; +}] + +frame .d.csr.csr +label .d.csr.csr.l -image $imcsr -compound left +entry .d.csr.csr.e -bd 0 +pack .d.csr.csr.l -side left + +frame .d.csr_0 -bg white -height $mulrows -width 1 +frame .d.csr_1 -bg white -height $mulrows -width 1 + +place .d.csr -x 0 -y 0 +place .d.mi -x 0 -y $csrh +place .d.got -x 0 -y [expr {$csrh+$mulrows}] +pack .d + +frame .help +pack .help + +set cur_already [expr {[llength $foolist]/4-1}] +set cur_mode 1 ;# one of: 0 1 already text + +set cur_0 $unk_l +set cur_1 [expr {$unk_r+1}] +set last_ht {} + +proc helptext {t} { + global last_ht + if {![string compare $t $last_ht]} return + eval destroy [grid slaves .help] + set y 0; foreach l $t { + set x 0; foreach c $l { + set w .help.at${x}x${y} + label $w -text $c + grid $w -row $y -column $x -padx 5 + incr x + } + incr y + } + set last_ht $t +} + +proc recursor/0 {} { recursor//01 0 } +proc recursor/1 {} { recursor//01 1 } +proc recursor//01 {z1} { + global mul rhsmost_max cols foolist + upvar #0 cur_$z1 cur + .d.csr.csr.l configure -text {adjust} + place .d.csr.csr -x [expr {$cur*$mul - 7}] + bind_key space { othercursor } + bind_leftright cur_$z1 0 [expr {$cols-1}] + if {[llength $foolist]} { + bind_key Tab { set cur_mode already; recursor } + } else { + bind_key Tab {} + } + bind_key Return { + if {$cur_0 != $cur_1} { + set cur_mode text + recursor + } + } + helptext { + {{<- ->} {move cursor, adjusting area to define}} + {Space {switch to moving other cursor}} + {Return {confirm location, enter letter(s)}} + {Tab {switch to correcting earlier ocr}} + } +} +proc othercursor {} { + global cur_mode + set cur_mode [expr {!$cur_mode}] + recursor +} + +proc recursor/text {} { + helptext { + {Return {confirm entry of new glyph}} + {Escape {abandon entry}} + } + unbind_all_keys + .d.csr.csr.l configure -text {define:} + pack .d.csr.csr.e -side left + focus .d.csr.csr.e + bind_key Return { + binary scan [.d.csr.csr.e get] h* hex + if {[string length $hex]} { + RETURN_RESULT DEFINE "$cur_0 $cur_1 $hex" + } + } + bind_key Escape { + bind_key Escape {} + pack forget .d.csr.csr.e + set cur_mode 1 + recursor + } +} + +proc recursor/already {} { + global mul + global foolist + global cur_already mul + global foolist cur_already mul + .d.csr.csr.l configure -text {correct} + set rmax [lindex $foolist [expr {$cur_already*4}]] + place .d.csr.csr -x [expr {$rmax*$mul-3}] + bind_key Return {} + bind_key space {} + bind_leftright cur_already 0 [expr {[llength $foolist]/4-1}] + bind_key Tab { bind_key Delete {}; set cur_mode 1; recursor } + bind_key Delete { + RETURN_RESULT DELETE [lrange $foolist \ + [expr $cur_already*4] \ + [expr $cur_already*4+1]] + } + helptext { + {{<- ->} {move cursor, selecting glyph to correct}} + {Del {clear this glyph from the recognition database}} + {Tab {switch to selecting area to define as new glyph}} + } +} + +proc bind_key {k proc} { + global keybindings + bind . $proc + set keybindings($k) [expr {!![string length $proc]}] +} +proc unbind_all_keys {} { + global keybindings + foreach k [array names keybindings] { bind_key $k {} } +} + +proc bind_leftright {var min max} { + bind_key Left [list leftright $var $min $max -1] + bind_key Right [list leftright $var $min $max +1] +} +proc leftright {var min max inc} { + upvar #0 $var v + set vnew $v + incr vnew $inc + if {$vnew < $min || $vnew > $max} return + set v $vnew + recursor +} + +proc recursor {} { + global csrh cur_mode cur_0 cur_1 mul + foreach z1 {0 1} { + place .d.csr_$z1 -y $csrh -x [expr {[set cur_$z1] * $mul}] + } + recursor/$cur_mode +} + + +# database format: +# series of glyphs: +# ... +# width +# + +# $database($context 0x 0x...) = $hex + +proc read_database {} { + global database + set f [open database r] + while {[gets $f l] >= 0} { + if {![regexp {^(\w+) (\d+) ([0-9a-f]{2}+)$} $l \ + dummy context strl strh]} { + error "bad syntax" + } + binary scan $strw h* strh + if {[string length $strh] != $strl*2} { error "$strh $strl" } + gets $f l; set width [format %d $l] + set bm $context + for {set x 0} {$x < $width} {incr x} { + gets $f l; lappend bm [format %x 0x$l] + } + set database($bm) $strh + } +} + +proc write_database {} { + global database + set ol {} + foreach bm [array names database] { + set strh $database($bm) + set strs [binary format h* $strh] + set strdo [format "%d %s" [expr {[string length $strh]/2}] $strh] + set o "[lindex $bm 0] $strdo\n" + append o [format "%d\n" [expr {[llength $bm]-1}]] + foreach x [lrange $bm 1 end] { append o "$x\n" } + lappend ol $o + } + foreach o [lsort $ol] { + puts -nonewline $o + } +} + +proc update_database/DEFINE {c0 c1 strh} { + global foolist unk_l unk_context wordmap database + if {$c0 > $c1} { manyset [list $c0 $c1] c1 c0 } + if {$c0 == $unk_l} { + set ncontext $unk_context + } else { + foreach {l r context got} $foolist { + if {$l==$c0} { set ncontext $context; break } + } + if {![exists ncontext]} { + puts stderr "must start at letter LHS!" + return + } + } + set bm $ncontext + for {set x $c0} {$x < $c1} {incr x} { + lappend bm [format %x $wordmap($x)] + } + set database($bm) $strh + write_database +} + + +proc RETURN_RESULT {how what} { + place forget .d.csr.csr + pack forget .d.csr.csr.e + helptext {{{ Processing }}} + unbind_all_keys + update idletasks + puts "$how $what" + eval update_database/$how $what +} + +# bind . {} + +read_database +recursor