chiark / gitweb /
made its first tsv!
author Ian Jackson <ian@liberator.relativity.greenend.org.uk>
Sat, 6 Jun 2009 20:33:13 +0000 (21:33 +0100)
committer Ian Jackson <ian@liberator.relativity.greenend.org.uk>
Sat, 6 Jun 2009 20:33:13 +0000 (21:33 +0100)
pctb/charset-15.txt
pctb/ocr.c

index 2a5941569cf6439128e21212b4a9ea8dc25f62f6..d12325b2701ab4b0dfff4c5319662e8718abd31b 100644 (file)
@@ -230,6 +230,16 @@ ff8
 400
 ff8
 
+Lower
+O
+3e0
+410
+808
+808
+808
+410
+3e0
+
 Lower
 P
 ff8
@@ -358,6 +368,123 @@ Lower
 l
 ff8
 
+Lower
+laven
+ff8
+0
+0
+640
+920
+920
+920
+fc0
+800
+20
+c0
+700
+800
+700
+c0
+20
+7c0
+920
+920
+920
+9c0
+0
+0
+fe0
+40
+20
+20
+fc0
+
+Lower
+lemon
+ff8
+0
+0
+7c0
+920
+920
+920
+9c0
+0
+0
+fe0
+40
+20
+20
+fc0
+40
+20
+20
+fc0
+0
+0
+7c0
+820
+820
+820
+7c0
+0
+0
+fe0
+40
+20
+20
+fc0
+
+Lower
+light
+ff8
+0
+0
+fe8
+0
+0
+27c0
+2820
+2820
+2420
+1fe0
+0
+0
+ff8
+40
+20
+20
+fc0
+0
+20
+7f0
+820
+
+Lower
+lime
+ff8
+0
+0
+fe8
+0
+0
+fe0
+40
+20
+20
+fc0
+40
+20
+20
+fc0
+0
+0
+7c0
+920
+920
+920
+9c0
+
 Lower
 m
 fe0
@@ -395,6 +522,15 @@ p
 820
 7c0
 
+Lower
+q
+7c0
+820
+820
+820
+440
+3fe0
+
 Lower
 r
 fe0
@@ -479,6 +615,19 @@ y
 180
 60
 
+Lower
+y 
+60
+180
+2600
+1800
+600
+180
+60
+0
+0
+0
+
 Lower
 yw
 60
@@ -659,6 +808,16 @@ ff8
 400
 ff8
 
+Upper
+O
+3e0
+410
+808
+808
+808
+410
+3e0
+
 Upper
 P
 ff8
@@ -737,6 +896,14 @@ f80
 10
 8
 
+Upper
+Z
+c08
+b08
+888
+868
+818
+
 Upper
 b
 ff8
@@ -753,6 +920,132 @@ c
 820
 820
 
+Upper
+d
+7c0
+820
+820
+820
+440
+ff8
+
+Upper
+laven
+ff8
+0
+0
+640
+920
+920
+920
+fc0
+800
+20
+c0
+700
+800
+700
+c0
+20
+7c0
+920
+920
+920
+9c0
+0
+0
+fe0
+40
+20
+20
+fc0
+
+Upper
+lemon
+ff8
+0
+0
+7c0
+920
+920
+920
+9c0
+0
+0
+fe0
+40
+20
+20
+fc0
+40
+20
+20
+fc0
+0
+0
+7c0
+820
+820
+820
+7c0
+0
+0
+fe0
+40
+20
+20
+fc0
+
+Upper
+light
+ff8
+0
+0
+fe8
+0
+0
+27c0
+2820
+2820
+2420
+1fe0
+0
+0
+ff8
+40
+20
+20
+fc0
+0
+20
+7f0
+820
+
+Upper
+lime
+ff8
+0
+0
+fe8
+0
+0
+fe0
+40
+20
+20
+fc0
+40
+20
+20
+fc0
+0
+0
+7c0
+920
+920
+920
+9c0
+
 Upper
 o
 7c0
index 266ede2a32fa197299aaff620e974e6a3cc4c834..712f90df77c21e239b8f47c256564aae512acaf4 100644 (file)
@@ -13,6 +13,7 @@ typedef struct {
 typedef struct DatabaseNode {
   char s[MAXGLYPHCHRS+1]; /* null-terminated; "" means no match here */
   int nlinks, alinks;
+  unsigned endsword:1;
   DatabaseLink *links;
 } DatabaseNode;
 
@@ -61,7 +62,7 @@ static void readdb(OcrReader *rd) {
   char chrs[MAXGLYPHCHRS+1];
   Pixcol cv;
   int r,j,ctxi;
-  int h;
+  int h, endsword;
   char lbuf[100];
   FILE *db;
 
@@ -112,6 +113,11 @@ static void readdb(OcrReader *rd) {
       }
       chrs[nchrs++]= c;
     }
+    endsword= 0;
+    if (nchrs>1 && chrs[nchrs-1]==' ') {
+      endsword= 1;
+      nchrs--;
+    }
     chrs[nchrs]= 0;
 
     current= &rd->contexts[ctxi];
@@ -149,6 +155,7 @@ static void readdb(OcrReader *rd) {
 
     eassert(!current->s[0]);
     strcpy(current->s, chrs);
+    current->endsword= endsword;
   }
   eassert(!ferror(db));
   eassert(!fclose(db));
@@ -360,7 +367,7 @@ OcrResultGlyph *ocr(OcrReader *rd, OcrCellType ct, int w, Pixcol cols[]) {
       found:
        current= current->links[i].then;
        if (current->s[0]) {
-         debugf(" \"%s\"",current->s);
+         debugf(" \"%s\"%s",current->s,current->endsword?"_":"");
          bestmatch= current;
          bestmatch_rx= x;
        } else {
@@ -382,10 +389,17 @@ OcrResultGlyph *ocr(OcrReader *rd, OcrCellType ct, int w, Pixcol cols[]) {
     }
 
     if (uniquematch) {
-      debugf(" || YES\n");
+      debugf(" || YES");
       add_result(rd, uniquematch->s, lx, uniquematch_rx, ctxmap);
       x= uniquematch_rx+1;
-      ctxmap= ct->midword;
+      if (uniquematch->s[0]) ctxmap= ct->midword;
+      else debugf(" (empty)");
+      if (uniquematch->endsword) {
+       nspaces= SPACE_SPACES;
+       debugf("_");
+       ctxmap= ct->nextword;
+      }
+      debugf("\n");
     } else {
       int rx;
       debugf(" || UNKNOWN");