chiark
/
gitweb
/
~yarrgweb
/
ypp-sc-tools.web-live.git
/ blobdiff
commit
grep
author
committer
pickaxe
?
search:
re
summary
|
shortlog
|
log
|
commit
|
commitdiff
|
tree
raw
|
inline
| side by side
made its first tsv!
[ypp-sc-tools.web-live.git]
/
pctb
/
ocr.c
diff --git
a/pctb/ocr.c
b/pctb/ocr.c
index bb9576769e10a1d633ed6ebb7786d75e3bfcf585..712f90df77c21e239b8f47c256564aae512acaf4 100644
(file)
--- a/
pctb/ocr.c
+++ b/
pctb/ocr.c
@@
-8,11
+8,12
@@
typedef struct {
struct DatabaseNode *then;
} DatabaseLink;
struct DatabaseNode *then;
} DatabaseLink;
-#define MAXGLYPHCHRS
3
+#define MAXGLYPHCHRS
7
typedef struct DatabaseNode {
char s[MAXGLYPHCHRS+1]; /* null-terminated; "" means no match here */
int nlinks, alinks;
typedef struct DatabaseNode {
char s[MAXGLYPHCHRS+1]; /* null-terminated; "" means no match here */
int nlinks, alinks;
+ unsigned endsword:1;
DatabaseLink *links;
} DatabaseNode;
DatabaseLink *links;
} DatabaseNode;
@@
-24,7
+25,7
@@
static const char *context_names[]= {
#define NCONTEXTS (sizeof(context_names)/sizeof(context_names[0]))
#define NCONTEXTS (sizeof(context_names)/sizeof(context_names[0]))
-#define SPACE_SPACES
3
+#define SPACE_SPACES
4
struct OcrReader {
int h;
struct OcrReader {
int h;
@@
-37,6
+38,8
@@
static FILE *resolver;
static pid_t resolver_pid;
static int resolver_done;
static pid_t resolver_pid;
static int resolver_done;
+DEBUG_DEFINE_DEBUGF(ocr)
+
static void fgetsline(FILE *f, char *lbuf, size_t lbufsz) {
char *s= fgets(lbuf,lbufsz,f);
eassert(s);
static void fgetsline(FILE *f, char *lbuf, size_t lbufsz) {
char *s= fgets(lbuf,lbufsz,f);
eassert(s);
@@
-59,7
+62,7
@@
static void readdb(OcrReader *rd) {
char chrs[MAXGLYPHCHRS+1];
Pixcol cv;
int r,j,ctxi;
char chrs[MAXGLYPHCHRS+1];
Pixcol cv;
int r,j,ctxi;
- int h;
+ int h
, endsword
;
char lbuf[100];
FILE *db;
char lbuf[100];
FILE *db;
@@
-110,6
+113,11
@@
static void readdb(OcrReader *rd) {
}
chrs[nchrs++]= c;
}
}
chrs[nchrs++]= c;
}
+ endsword= 0;
+ if (nchrs>1 && chrs[nchrs-1]==' ') {
+ endsword= 1;
+ nchrs--;
+ }
chrs[nchrs]= 0;
current= &rd->contexts[ctxi];
chrs[nchrs]= 0;
current= &rd->contexts[ctxi];
@@
-147,6
+155,7
@@
static void readdb(OcrReader *rd) {
eassert(!current->s[0]);
strcpy(current->s, chrs);
eassert(!current->s[0]);
strcpy(current->s, chrs);
+ current->endsword= endsword;
}
eassert(!ferror(db));
eassert(!fclose(db));
}
eassert(!ferror(db));
eassert(!fclose(db));
@@
-185,7
+194,9
@@
static void callout_unknown(OcrReader *rd, int w, Pixcol cols[],
* so we aren't in any danger of overwriting some other fd 4: */
r= dup2(donepipe[1],4); eassert(r==4);
execlp("./show-thing.tcl", "./show-thing.tcl",
* so we aren't in any danger of overwriting some other fd 4: */
r= dup2(donepipe[1],4); eassert(r==4);
execlp("./show-thing.tcl", "./show-thing.tcl",
- "--automatic","1",(char*)0);
+ DEBUGP(callout) ? "--debug" : "--noop-arg",
+ "--automatic-1",
+ (char*)0);
eassert(!"execlp failed");
}
r= close(jobpipe[0]); eassert(!r);
eassert(!"execlp failed");
}
r= close(jobpipe[0]); eassert(!r);
@@
-296,9
+307,9
@@
OcrResultGlyph *ocr(OcrReader *rd, OcrCellType ct, int w, Pixcol cols[]) {
nspaces=- w;
ctxmap= ct->initial;
rd->nresults=0;
nspaces=- w;
ctxmap= ct->initial;
rd->nresults=0;
-
fprintf(debug,
"OCR h=%d w=%d",rd->h,w);
- for (x=0; x<w; x++)
fprintf(debug,
" %"PSPIXCOL(PRIx),cols[x]);
-
fprintf(debug,
"\n");
+
debugf(
"OCR h=%d w=%d",rd->h,w);
+ for (x=0; x<w; x++)
debugf(
" %"PSPIXCOL(PRIx),cols[x]);
+
debugf(
"\n");
debug_flush();
x=0;
debug_flush();
x=0;
@@
-312,7
+323,7
@@
OcrResultGlyph *ocr(OcrReader *rd, OcrCellType ct, int w, Pixcol cols[]) {
nspaces++;
x++;
if (nspaces==SPACE_SPACES) {
nspaces++;
x++;
if (nspaces==SPACE_SPACES) {
-
fprintf(debug,
"OCR x=%x nspaces=%d space\n",x,nspaces);
+
debugf(
"OCR x=%x nspaces=%d space\n",x,nspaces);
ctxmap= ct->nextword;
}
continue;
ctxmap= ct->nextword;
}
continue;
@@
-329,7
+340,7
@@
OcrResultGlyph *ocr(OcrReader *rd, OcrCellType ct, int w, Pixcol cols[]) {
DatabaseNode *uniquematch= 0;
int uniquematch_rx=-1;
DatabaseNode *uniquematch= 0;
int uniquematch_rx=-1;
-
fprintf(debug,
"OCR lx=%d ctxmap=%x ",lx,ctxmap);
+
debugf(
"OCR lx=%d ctxmap=%x ",lx,ctxmap);
for (ctxi=0; ctxi<NCONTEXTS; ctxi++) {
DatabaseNode *current= &rd->contexts[ctxi];;
for (ctxi=0; ctxi<NCONTEXTS; ctxi++) {
DatabaseNode *current= &rd->contexts[ctxi];;
@@
-338,29
+349,29
@@
OcrResultGlyph *ocr(OcrReader *rd, OcrCellType ct, int w, Pixcol cols[]) {
x= lx;
if (!(ctxmap & (1u << ctxi))) continue;
x= lx;
if (!(ctxmap & (1u << ctxi))) continue;
-
fprintf(debug,
" || %s",context_names[ctxi]);
+
debugf(
" || %s",context_names[ctxi]);
for (;;) {
debug_flush();
for (;;) {
debug_flush();
-
fprintf(debug,
" | x=%d",x);
+
debugf(
" | x=%d",x);
if (x>w) break;
Pixcol cv= cols[x];
if (x>w) break;
Pixcol cv= cols[x];
-
fprintf(debug,
" cv=%"PSPIXCOL(PRIx),cv);
+
debugf(
" cv=%"PSPIXCOL(PRIx),cv);
for (i=0; i<current->nlinks; i++)
if (current->links[i].col == cv)
goto found;
/* not found */
for (i=0; i<current->nlinks; i++)
if (current->links[i].col == cv)
goto found;
/* not found */
-
fprintf(debug,
" ?");
+
debugf(
" ?");
break;
found:
current= current->links[i].then;
if (current->s[0]) {
break;
found:
current= current->links[i].then;
if (current->s[0]) {
-
fprintf(debug," \"%s\"",current->s
);
+
debugf(" \"%s\"%s",current->s,current->endsword?"_":""
);
bestmatch= current;
bestmatch_rx= x;
} else {
bestmatch= current;
bestmatch_rx= x;
} else {
-
fprintf(debug,
" ...");
+
debugf(
" ...");
}
x++;
}
x++;
@@
-368,7
+379,7
@@
OcrResultGlyph *ocr(OcrReader *rd, OcrCellType ct, int w, Pixcol cols[]) {
if (bestmatch) {
if (uniquematch && strcmp(bestmatch->s, uniquematch->s)) {
if (bestmatch) {
if (uniquematch && strcmp(bestmatch->s, uniquematch->s)) {
-
fprintf(debug,
" ambiguous");
+
debugf(
" ambiguous");
uniquematch= 0;
break;
}
uniquematch= 0;
break;
}
@@
-378,22
+389,29
@@
OcrResultGlyph *ocr(OcrReader *rd, OcrCellType ct, int w, Pixcol cols[]) {
}
if (uniquematch) {
}
if (uniquematch) {
-
fprintf(debug," || YES\n
");
+
debugf(" || YES
");
add_result(rd, uniquematch->s, lx, uniquematch_rx, ctxmap);
x= uniquematch_rx+1;
add_result(rd, uniquematch->s, lx, uniquematch_rx, ctxmap);
x= uniquematch_rx+1;
- ctxmap= ct->midword;
+ if (uniquematch->s[0]) ctxmap= ct->midword;
+ else debugf(" (empty)");
+ if (uniquematch->endsword) {
+ nspaces= SPACE_SPACES;
+ debugf("_");
+ ctxmap= ct->nextword;
+ }
+ debugf("\n");
} else {
int rx;
} else {
int rx;
-
fprintf(debug,
" || UNKNOWN");
+
debugf(
" || UNKNOWN");
for (rx=lx; rx<w && cols[rx]; rx++);
for (rx=lx; rx<w && cols[rx]; rx++);
-
fprintf(debug,
" x=%d ctxmap=%x %d..%d\n",x, ctxmap, lx,rx);
+
debugf(
" x=%d ctxmap=%x %d..%d\n",x, ctxmap, lx,rx);
debug_flush();
callout_unknown(rd, w,cols, lx,rx-1, ctxmap);
goto restart;
}
}
add_result(rd, 0,-1,-1,0);
debug_flush();
callout_unknown(rd, w,cols, lx,rx-1, ctxmap);
goto restart;
}
}
add_result(rd, 0,-1,-1,0);
-
fprintf(debug,
"OCR finished %d glyphs\n",rd->nresults);
+
debugf(
"OCR finished %d glyphs\n",rd->nresults);
debug_flush();
return rd->results;
}
debug_flush();
return rd->results;
}