Merge branch 'origin'

author Ian Jackson <ijackson@chiark.greenend.org.uk>

Tue, 30 Jun 2009 17:19:08 +0000 (18:19 +0100)

committer Ian Jackson <Ian.Jackson@eu.citrix.com>

Tue, 30 Jun 2009 17:19:08 +0000 (18:19 +0100)
author Ian Jackson <ijackson@chiark.greenend.org.uk>
Tue, 30 Jun 2009 17:19:08 +0000 (18:19 +0100)
committer Ian Jackson <Ian.Jackson@eu.citrix.com>
Tue, 30 Jun 2009 17:19:08 +0000 (18:19 +0100)
diff --git a/pctb/README.charset b/pctb/README.charset

index e1fd3ff5f86ba8f41e9843d2255e11edefe120b2..c57f2f5187a537b85c77f793493eefd9f7f0a3e3 100644 (file)
--- a/pctb/README.charset
+++ b/pctb/README.charset
@@ -38,7 +38,8 @@ the entry you have just made.
  Upper vs lower case - important note regarding `l' and `I'
  ----------------------------------------------------------
  
-We maintain separate dictionaries for upper and lower case.  At the
+We maintain separate dictionaries for upper case (Upper), lower case
+(Lower), and (initial portions of) mid-phrase words (Word).  At the
  beginning of each cell in the table, we expect uppercase; in the
  middle of a word we expect lowercase; and, unfortunately, after an
  inter-word gap, we are not sure.
@@ -47,10 +48,16 @@ This is troublesome because `l' and `I' look identical on the screen.
  So any time we see a word starting with `l' or `I', the program has to
  ask about it.
  
+After an inter-word gap, we first search for a Word entry in the
+dictionary.  If there is a match we use it.  Otherwise we search both
+the uppercase and lowercase dictionaries; if one matches and the other
+doesn't, or one matches a wider character than the other, we use it.
+If that fails to resolve the ambiguity we must ask.
+
  *Do not* make an entry in the character set dictionary mapping `vertical
  stick' to `l' or `I'.  Instead, select enough of the whole word in
  question that no word would start with the other letter, and enter the
-whole word or part of it as a new glyph.
+whole word or part of it as a new glyph in the Word dictionary.
  
  For example, in the supplied dictionary there is already a glyph for
  `Iron'; this is OK because there are no words which start `lron'.
@@ -72,12 +79,11 @@ for the uppercase letter in the lowercase dictionary.
  
  Instead, make a new glyph for the last letter of the previous word
  plus the (unusually narrow) inter-word space, and end that entry with
-\x20 (yes, type \ x 20).
+a literal space ` '.
  
  For example, you might find that `y<space>G' is treated as
  `y<??lowercase>' and the G doesn't get matched.  Select the `y<space>'
-region of the bitmap and type `y\x20' into the string box.
-Sorry for this rather poor UI!
+region of the bitmap and type `y ' into the string box.
  
  
  Overlapping characters - ligatures
@@ -101,9 +107,10 @@ However since you are not guaranteed to actually get an OCR query at
  all if the dictionary contains errors, you shouldn't rely on this.
  
  If you think you have made mistakes answering OCR queries (for
-example, the recognised data is wrong), you should download a fresh
-copy of charset-15.txt from
- http://www.chiark.greenend.org.uk/~ijackson/ypp-sc-tools/master/pctb/charset-15.txt
+example, the recognised data is wrong), you should delete the file
+#local-char*#.txt, which contains your local updates.  The program will
+then use only the centrally provided (and vetted) master file (which is,
+by default, automatically updated when you run the PCTB client).
  
  
  Enabling interactive character set update
diff --git a/pctb/TODO b/pctb/TODO

index 935554064b0ed54448b4ceb73b1e10299b9574d3..6a603c61c61989ea6b94a3042f3bc785a14fffdb 100644 (file)
--- a/pctb/TODO
+++ b/pctb/TODO
@@ -1,11 +1,6 @@
-make dict char update able to only add to one of upper/lower so that
-we can have
-               upper           lower
-                               in
-               I  err
-               Iron
-  need to rethink all this
-
+add UI option to dictionary-manager to make user specify which dictionary
+  to add multi-context entries to
+install/test dictionary upload/approval
  write real uploader
  test real uploader
  further speedups
diff --git a/pctb/common.c b/pctb/common.c

index 3cb4323151c86cbc46a0c80cfe73241537e253ac..bc1cb8df749cb63ec2ef392300551c37262c6e16 100644 (file)
--- a/pctb/common.c
+++ b/pctb/common.c
@@ -88,8 +88,14 @@ int dbfile_scanf(const char *fmt, ...) {
  }
  
  void dbfile_assertfail(const char *file, int line, const char *m) {
-  fatal("Error in database file %s at byte %ld:\n"
-       " Requirement not met at %s:%d:\n"
-       " %s",
-       path,(long)ftell(dbfile), file,line, m);
+  if (dbfile) /* a file is open: report its path and current byte offset */
+    fatal("Error in dictionary file %s at byte %ld:\n"
+         " Requirement not met at %s:%d:\n"
+         " %s",
+         path,(long)ftell(dbfile), file,line, m);
+  else /* no file open, so there is no path or offset to report */
+    fatal("Semantic error in dictionaries:\n"
+         " Requirement not met at %s:%d:\n"
+         " %s",
+         file,line, m);
  }
diff --git a/pctb/ocr.c b/pctb/ocr.c

index 2c3a16772ec8ac0a90f5a9be420c9db188c1e6da..f6104452dc1c1854e6ccd19aac77882caae0aba9 100644 (file)
--- a/pctb/ocr.c
+++ b/pctb/ocr.c
@@ -40,30 +40,34 @@ typedef struct DatabaseNode {
    DatabaseLink *links;
  } DatabaseNode;
  
+typedef struct {
+  OcrReader *rd;      /* reader whose contexts[] dictionaries are searched */
+  OcrCellType ct;     /* cell type; supplies the findchar_select callback */
+  int w;              /* width of the cell in pixel columns, as given to ocr() */
+  Pixcol *cols;       /* pixel columns of the cell being read */
+  int x;              /* column at which the glyph search starts */
+  unsigned ctxmap;    /* bitmap of context_names indices to try */
+} FindCharArgs;
+typedef struct {
+  DatabaseNode *match; /* best node found in one context, or 0 if none */
+  int rx;              /* last column of that match; written only on a match */
+} FindCharResults;
+
+enum { ct_Lower, ct_Upper, ct_Word, ct_Digit }; /* indices into context_names */
  static const char *context_names[]= {
-  "Lower",
-  "Upper",
-  "Digit"
+  "Lower",  /*  bit 0, value 001 */
+  "Upper",  /*  bit 1, value 002 */
+  "Word",   /*  bit 2, value 004 */
+  "Digit",  /*  bit 3, value 010 */
  };
  struct OcrCellTypeInfo {
    /* bitmaps of indices into context_names: */
    unsigned initial, nextword, midword;
    int space_spaces;
    const char *name;
+  int (*findchar_select)(const FindCharArgs *fca,
+                        const FindCharResults results[]); /* context index or -1; may be 0 */
  };
-const struct OcrCellTypeInfo ocr_celltype_number= {
-  4,4,4,
-  .space_spaces= 5,
-  .name= "number"
-};
-const struct OcrCellTypeInfo ocr_celltype_text= {
-  .initial=2, /* Uppercase */
-  .nextword=3, /* Either */
-  .midword=1, /* Lower only */
-  .space_spaces= 4,
-  .name= "text"
-};
-
  
  #define NCONTEXTS (sizeof(context_names)/sizeof(context_names[0]))
  
@@ -275,17 +279,117 @@ static void add_result(OcrReader *rd, const char *s, int l, int r,
  }
  
  
+static DatabaseNode *findchar_1ctx(const FindCharArgs *fca,
+                                  DatabaseNode *start, int *matchx_r) {
+  DatabaseNode *current= start;
+  DatabaseNode *bestmatch= 0;
+  int i;
+  int x= fca->x;
+  /* Follow links from start, one pixel column per step, beginning at fca->x. */
+  for (;;) {
+    debug_flush();
+    debugf(" | x=%d",x);
+    if (x > fca->w) break; /* NOTE(review): x==w passes, so cols[w] is read - confirm it exists */
+    Pixcol cv= fca->cols[x];
+    debugf(" cv=%"PSPIXCOL(PRIx),cv);
+    for (i=0; i<current->nlinks; i++)
+      if (current->links[i].col == cv)
+       goto found;
+    /* not found */
+    debugf(" ?");
+    break;
+    /* Reached only via goto: links[i] is the link whose col equals cv. */
+  found:
+    current= current->links[i].then;
+    if (current->match) {
+      debugf(" \"%s\"%s",current->str,current->endsword?"_":"");
+      bestmatch= current; /* keep the deepest (widest) glyph seen so far */
+      *matchx_r= x;
+    } else {
+      debugf(" ...");
+    }
+
+    x++;
+  }
+  return bestmatch; /* 0 if no glyph matched; *matchx_r is then left unwritten */
+}  
+
+static DatabaseNode *findchar(const FindCharArgs *fca, int *match_rx) {
+  /* Looks up the glyph at column fca->x in every context enabled in
+   * fca->ctxmap.  Returns the chosen node, storing its last column in
+   * *match_rx, or 0 if nothing matched or the choice is ambiguous. */
+  FindCharResults results[NCONTEXTS];
+  int ctxi, match=-1, nmatches=0;
+  debugf("OCR  lx=%d ct_state=%x  ", fca->x, fca->ctxmap);
+  for (ctxi=0; ctxi<NCONTEXTS; ctxi++) {
+    results[ctxi].match= 0;  results[ctxi].rx= -1; /* selector compares rx */
+    if (!(fca->ctxmap & (1u << ctxi))) continue;
+    debugf(" || %s",context_names[ctxi]);
+    results[ctxi].match= findchar_1ctx(fca, &fca->rd->contexts[ctxi],
+                                      &results[ctxi].rx);
+    if (!results[ctxi].match) continue;
+    match= ctxi;
+    nmatches++;
+  }
+  if (nmatches==1) {
+    debugf(" unique");
+  } else {
+    debugf(" ambiguous");
+    match= !fca->ct->findchar_select ? -1 :
+      fca->ct->findchar_select(fca,results);
+    debugf(" resolved %s", match<0 ? "<none>" : context_names[match]);
+  }
+  if (match<0)
+    return 0;
+  /* ctxi == NCONTEXTS once the loop has finished, so index by match. */
+  *match_rx= results[match].rx;
+  return results[match].match;
+}
+/* Picks a context for text cells when findchar was not unique; -1 = none. */
+static int findchar_select_text(const FindCharArgs *fca,
+                               const FindCharResults results[]) {
+  /* rx is written only where .match is non-null: count no match as -1. */
+  int lrx= results[ct_Lower].match ? results[ct_Lower].rx : -1;
+  int urx= results[ct_Upper].match ? results[ct_Upper].rx : -1;
+  if (fca->ctxmap != 017) return -1; /* all four contexts must be enabled */
+  dbassert(! results[ct_Digit].match );
+  if (results[ct_Word].match) return ct_Word; /* Word entries take priority */
+  return lrx > urx ? ct_Lower : urx > lrx ? ct_Upper : -1; /* wider wins */
+}
+
+const struct OcrCellTypeInfo ocr_celltype_number= {
+  .initial= 010, .nextword= 010, .midword= 010, /* Digit only, everywhere */
+  .name= "number",
+  .space_spaces= 5,
+  .findchar_select= 0 /* none: non-unique results stay unresolved */
+};
+const struct OcrCellTypeInfo ocr_celltype_text= {
+  .initial=  012, /* Digit|Upper */
+  .nextword= 017, /* Digit|Upper|Lower|Word */
+  .midword=  011, /* Digit|Lower (014 would be Digit|Word) */
+  .space_spaces= 4, /* blank columns that count as an inter-word space */
+  .name= "text",
+  .findchar_select= findchar_select_text /* resolves the nextword case */
+};
+
+
  const char *ocr_celltype_name(OcrCellType ct) { return ct->name; }
  
  OcrResultGlyph *ocr(OcrReader *rd, OcrCellType ct, int w, Pixcol cols[]) {
    int nspaces;
-  unsigned ctxmap;
-  int ctxi, i, x;
+  int x;
+
+  FindCharArgs fca;
+  fca.rd= rd;
+  fca.ct= ct;
+  fca.w= w;
+  fca.cols= cols;
+  fca.x= -1;
  
   restart:
  
    nspaces=- w;
-  ctxmap= ct->initial;
+  fca.ctxmap= ct->initial;
    rd->nresults=0;
    debugf("OCR h=%d w=%d",rd->h,w);
    for (x=0; x<w; x++) debugf(" %"PSPIXCOL(PRIx),cols[x]);
@@ -304,7 +408,7 @@ OcrResultGlyph *ocr(OcrReader *rd, OcrCellType ct, int w, Pixcol cols[]) {
        x++;
        if (nspaces == ct->space_spaces) {
         debugf("OCR  x=%x nspaces=%d space\n",x,nspaces);
-       ctxmap= ct->nextword;
+       fca.ctxmap= ct->nextword;
        }
        continue;
      }
@@ -314,81 +418,33 @@ OcrResultGlyph *ocr(OcrReader *rd, OcrCellType ct, int w, Pixcol cols[]) {
        add_result(rd," ",x-nspaces,x+1,0);
      nspaces=0;
  
-    /* find character */
-    int lx=x;
+    fca.x= x;
  
-    DatabaseNode *uniquematch= 0;
-    int uniquematch_rx=-1;
+    int match_rx=-1;
+    DatabaseNode *match= findchar(&fca, &match_rx);
      
-    debugf("OCR  lx=%d ctxmap=%x  ",lx,ctxmap);
-
-    for (ctxi=0; ctxi<NCONTEXTS; ctxi++) {
-      DatabaseNode *current= &rd->contexts[ctxi];;
-      DatabaseNode *bestmatch= 0;
-      int bestmatch_rx=-1;
-
-      x= lx;
-      if (!(ctxmap & (1u << ctxi))) continue;
-      debugf(" || %s",context_names[ctxi]);
-
-      for (;;) {
-       debug_flush();
-       debugf(" | x=%d",x);
-       if (x>w) break;
-       Pixcol cv= cols[x];
-       debugf(" cv=%"PSPIXCOL(PRIx),cv);
-       for (i=0; i<current->nlinks; i++)
-         if (current->links[i].col == cv)
-           goto found;
-       /* not found */
-       debugf(" ?");
-       break;
-
-      found:
-       current= current->links[i].then;
-       if (current->match) {
-         debugf(" \"%s\"%s",current->str,current->endsword?"_":"");
-         bestmatch= current;
-         bestmatch_rx= x;
-       } else {
-         debugf(" ...");
-       }
-
-       x++;
-      }
-      
-      if (bestmatch) {
-       if (uniquematch && strcmp(bestmatch->str, uniquematch->str)) {
-         debugf( " ambiguous");
-         uniquematch= 0;
-         break;
-       }
-       uniquematch= bestmatch;
-       uniquematch_rx= bestmatch_rx;
-      }
-    }
-
-    if (uniquematch) {
+    if (match) {
        debugf(" || YES");
-      add_result(rd, uniquematch->str, lx, uniquematch_rx, ctxmap);
-      x= uniquematch_rx+1;
-      if (uniquematch->match) ctxmap= ct->midword;
+      add_result(rd, match->str, x, match_rx, fca.ctxmap);
+      x= match_rx+1;
+      if (match->match) fca.ctxmap= ct->midword;
        else debugf(" (empty)");
-      if (uniquematch->endsword) {
+      if (match->endsword) {
         nspaces= ct->space_spaces;
         debugf("_");
-       ctxmap= ct->nextword;
+       fca.ctxmap= ct->nextword;
        }
        debugf("\n");
      } else {
        int rx;
        debugf(" || UNKNOWN");
-      for (rx=lx; rx<w && cols[rx]; rx++);
-      debugf(" x=%d ctxmap=%x %d..%d\n",x, ctxmap, lx,rx);
+      for (rx=x; rx<w && cols[rx]; rx++);
+      debugf(" x=%d ctxmap=%x %d..%d\n",x, fca.ctxmap, x,rx);
        debug_flush();
-      callout_unknown(rd, w,cols, lx,rx-1, ctxmap);
+      callout_unknown(rd, w,cols, x,rx-1, fca.ctxmap);
        goto restart;
      }
+
    }
    add_result(rd, 0,-1,-1,0);
    debugf("OCR  finished %d glyphs\n",rd->nresults);
author	Ian Jackson <ijackson@chiark.greenend.org.uk>
	Tue, 30 Jun 2009 17:19:08 +0000 (18:19 +0100)
committer	Ian Jackson <Ian.Jackson@eu.citrix.com>
	Tue, 30 Jun 2009 17:19:08 +0000 (18:19 +0100)
pctb/README.charset		patch \| blob \| history
pctb/TODO		patch \| blob \| history
pctb/common.c		patch \| blob \| history
pctb/ocr.c		patch \| blob \| history