chiark / gitweb /
seems to be able to do most scanning
author Ian Jackson <ian@liberator.relativity.greenend.org.uk>
Fri, 5 Jun 2009 23:13:15 +0000 (00:13 +0100)
committer Ian Jackson <ian@liberator.relativity.greenend.org.uk>
Fri, 5 Jun 2009 23:13:15 +0000 (00:13 +0100)
pctb/charset-15.txt
pctb/ocr.c

index 8c7b13a..0a82f9b 100644 (file)
@@ -585,6 +585,15 @@ f80
 8
 
 Upper
+b
+ff8
+440
+820
+820
+820
+7c0
+
+Upper
 c
 7c0
 820
index 04aecc0..48c25c0 100644 (file)
@@ -24,6 +24,8 @@ static const char *context_names[]= {
 
 #define NCONTEXTS (sizeof(context_names)/sizeof(context_names[0]))
 
+#define SPACE_SPACES 3
+
 struct OcrReader {
   int h;
   DatabaseNode contexts[NCONTEXTS];
@@ -305,13 +307,16 @@ OcrResultGlyph *ocr(OcrReader *rd, OcrCellType ct, int w, Pixcol cols[]) {
     if (!cols[x]) {
       nspaces++;
       x++;
-      if (nspaces==3) {
+      if (nspaces==SPACE_SPACES) {
        fprintf(debug,"OCR  x=%x nspaces=%d space\n",x,nspaces);
-       add_result(rd," ",x-nspaces,x+1,0);
        ctxmap= ct->nextword;
       }
       continue;
     }
+
+    /* something here, so we need to add the spaces */
+    if (nspaces>=SPACE_SPACES)
+      add_result(rd," ",x-nspaces,x+1,0);
     nspaces=0;
 
     /* find character */