chiark / gitweb /
made its first tsv!
author Ian Jackson <ian@liberator.relativity.greenend.org.uk>
Sat, 6 Jun 2009 20:33:13 +0000 (21:33 +0100)
committer Ian Jackson <ian@liberator.relativity.greenend.org.uk>
Sat, 6 Jun 2009 20:33:13 +0000 (21:33 +0100)
pctb/charset-15.txt
pctb/ocr.c

index 2a59415..d12325b 100644 (file)
@@ -231,6 +231,16 @@ ff8
 ff8
 
 Lower
+O
+3e0
+410
+808
+808
+808
+410
+3e0
+
+Lower
 P
 ff8
 88
@@ -359,6 +369,123 @@ l
 ff8
 
 Lower
+laven
+ff8
+0
+0
+640
+920
+920
+920
+fc0
+800
+20
+c0
+700
+800
+700
+c0
+20
+7c0
+920
+920
+920
+9c0
+0
+0
+fe0
+40
+20
+20
+fc0
+
+Lower
+lemon
+ff8
+0
+0
+7c0
+920
+920
+920
+9c0
+0
+0
+fe0
+40
+20
+20
+fc0
+40
+20
+20
+fc0
+0
+0
+7c0
+820
+820
+820
+7c0
+0
+0
+fe0
+40
+20
+20
+fc0
+
+Lower
+light
+ff8
+0
+0
+fe8
+0
+0
+27c0
+2820
+2820
+2420
+1fe0
+0
+0
+ff8
+40
+20
+20
+fc0
+0
+20
+7f0
+820
+
+Lower
+lime
+ff8
+0
+0
+fe8
+0
+0
+fe0
+40
+20
+20
+fc0
+40
+20
+20
+fc0
+0
+0
+7c0
+920
+920
+920
+9c0
+
+Lower
 m
 fe0
 40
@@ -396,6 +523,15 @@ p
 7c0
 
 Lower
+q
+7c0
+820
+820
+820
+440
+3fe0
+
+Lower
 r
 fe0
 40
@@ -480,6 +616,19 @@ y
 60
 
 Lower
+y 
+60
+180
+2600
+1800
+600
+180
+60
+0
+0
+0
+
+Lower
 yw
 60
 180
@@ -660,6 +809,16 @@ ff8
 ff8
 
 Upper
+O
+3e0
+410
+808
+808
+808
+410
+3e0
+
+Upper
 P
 ff8
 88
@@ -738,6 +897,14 @@ f80
 8
 
 Upper
+Z
+c08
+b08
+888
+868
+818
+
+Upper
 b
 ff8
 440
@@ -754,6 +921,132 @@ c
 820
 
 Upper
+d
+7c0
+820
+820
+820
+440
+ff8
+
+Upper
+laven
+ff8
+0
+0
+640
+920
+920
+920
+fc0
+800
+20
+c0
+700
+800
+700
+c0
+20
+7c0
+920
+920
+920
+9c0
+0
+0
+fe0
+40
+20
+20
+fc0
+
+Upper
+lemon
+ff8
+0
+0
+7c0
+920
+920
+920
+9c0
+0
+0
+fe0
+40
+20
+20
+fc0
+40
+20
+20
+fc0
+0
+0
+7c0
+820
+820
+820
+7c0
+0
+0
+fe0
+40
+20
+20
+fc0
+
+Upper
+light
+ff8
+0
+0
+fe8
+0
+0
+27c0
+2820
+2820
+2420
+1fe0
+0
+0
+ff8
+40
+20
+20
+fc0
+0
+20
+7f0
+820
+
+Upper
+lime
+ff8
+0
+0
+fe8
+0
+0
+fe0
+40
+20
+20
+fc0
+40
+20
+20
+fc0
+0
+0
+7c0
+920
+920
+920
+9c0
+
+Upper
 o
 7c0
 820
index 266ede2..712f90d 100644 (file)
@@ -13,6 +13,7 @@ typedef struct {
 typedef struct DatabaseNode {
   char s[MAXGLYPHCHRS+1]; /* null-terminated; "" means no match here */
   int nlinks, alinks;
+  unsigned endsword:1;
   DatabaseLink *links;
 } DatabaseNode;
 
@@ -61,7 +62,7 @@ static void readdb(OcrReader *rd) {
   char chrs[MAXGLYPHCHRS+1];
   Pixcol cv;
   int r,j,ctxi;
-  int h;
+  int h, endsword;
   char lbuf[100];
   FILE *db;
 
@@ -112,6 +113,11 @@ static void readdb(OcrReader *rd) {
       }
       chrs[nchrs++]= c;
     }
+    endsword= 0;
+    if (nchrs>1 && chrs[nchrs-1]==' ') {
+      endsword= 1;
+      nchrs--;
+    }
     chrs[nchrs]= 0;
 
     current= &rd->contexts[ctxi];
@@ -149,6 +155,7 @@ static void readdb(OcrReader *rd) {
 
     eassert(!current->s[0]);
     strcpy(current->s, chrs);
+    current->endsword= endsword;
   }
   eassert(!ferror(db));
   eassert(!fclose(db));
@@ -360,7 +367,7 @@ OcrResultGlyph *ocr(OcrReader *rd, OcrCellType ct, int w, Pixcol cols[]) {
       found:
        current= current->links[i].then;
        if (current->s[0]) {
-         debugf(" \"%s\"",current->s);
+         debugf(" \"%s\"%s",current->s,current->endsword?"_":"");
          bestmatch= current;
          bestmatch_rx= x;
        } else {
@@ -382,10 +389,17 @@ OcrResultGlyph *ocr(OcrReader *rd, OcrCellType ct, int w, Pixcol cols[]) {
     }
 
     if (uniquematch) {
-      debugf(" || YES\n");
+      debugf(" || YES");
       add_result(rd, uniquematch->s, lx, uniquematch_rx, ctxmap);
       x= uniquematch_rx+1;
-      ctxmap= ct->midword;
+      if (uniquematch->s[0]) ctxmap= ct->midword;
+      else debugf(" (empty)");
+      if (uniquematch->endsword) {
+       nspaces= SPACE_SPACES;
+       debugf("_");
+       ctxmap= ct->nextword;
+      }
+      debugf("\n");
     } else {
       int rx;
       debugf(" || UNKNOWN");