X-Git-Url: http://www.chiark.greenend.org.uk/ucgi/~yarrgweb/git?a=blobdiff_plain;f=pctb%2Focr.c;h=f4bc77d56118ae8b4caebfce168a3336cd2eecc2;hb=74e4e249f2c3e848592984cb193aded6a77a341d;hp=fd4cba9fe841e1c6a439f34edc741eae31d44c5c;hpb=f2c5d45ada202f2ad2640b260cac2fd7bb83eb3e;p=ypp-sc-tools.db-live.git

diff --git a/pctb/ocr.c b/pctb/ocr.c
index fd4cba9..f4bc77d 100644
--- a/pctb/ocr.c
+++ b/pctb/ocr.c
@@ -46,10 +46,27 @@ static const char *context_names[]= {
   "Upper",
   "Digit"
 };
+struct OcrCellTypeInfo {
+  /* bitmaps of indices into context_names: */
+  unsigned initial, nextword, midword;
+  int space_spaces;
+  const char *name;
+};
+const struct OcrCellTypeInfo ocr_celltype_number= {
+  4,4,4,
+  .space_spaces= 5,
+  .name= "number"
+};
+const struct OcrCellTypeInfo ocr_celltype_text= {
+  .initial=2, /* Uppercase */
+  .nextword=3, /* Either */
+  .midword=1, /* Lower only */
+  .space_spaces= 4,
+  .name= "text"
+};
 
 
-#define NCONTEXTS (sizeof(context_names)/sizeof(context_names[0]))
-#define SPACE_SPACES 4
+#define NCONTEXTS (sizeof(context_names)/sizeof(context_names[0]))
 
 struct OcrReader {
   int h;
@@ -69,17 +86,7 @@ DEBUG_DEFINE_DEBUGF(ocr)
    fatal("Error in character set database.\n" \
    " Requirement not met: %s:%d: %s", __FILE__,__LINE__, #x))
 
-static void fgetsline(FILE *f, char *lbuf, size_t lbufsz) {
-  errno=0;
-  char *s= fgets(lbuf,lbufsz,f);
-  sysassert(!ferror(f));
-  dbassert(!feof(f));
-  assert(s);
-  int l= strlen(lbuf);
-  dbassert(l>0); dbassert(lbuf[--l]='\n');
-  lbuf[l]= 0;
-}
-#define FGETSLINE(f,buf) (fgetsline(f,buf,sizeof(buf)))
+#define FGETSLINE (fgetsline(db,lbuf,sizeof(lbuf)))
 
 static void cleardb_node(DatabaseNode *n) {
   int i;
@@ -112,7 +119,7 @@ static void readdb(OcrReader *rd) {
     return;
   }
 
-  FGETSLINE(db,lbuf);
+  FGETSLINE;
   dbassert(!strcmp(lbuf,"# ypp-sc-tools pctb font v1"));
 
   r= fscanf(db, "%d", &h);
@@ -120,7 +127,7 @@ static void readdb(OcrReader *rd) {
   dbassert(h==rd->h);
 
   for (;;) {
-    FGETSLINE(db,lbuf);
+    FGETSLINE;
     if (!lbuf || lbuf[0]=='#') continue;
     if (!strcmp(lbuf,".")) break;
 
@@ -128,7 +135,7 @@ static void readdb(OcrReader *rd) {
       if (!strcmp(lbuf,context_names[ctxi]))
         goto found_ctx;
     /* not found, just skip */
-    for (;;) { FGETSLINE(db,lbuf); if (!lbuf[0]) break; }
+    for (;;) { FGETSLINE; if (!lbuf[0]) break; }
     continue;
 
   found_ctx:
@@ -154,7 +161,7 @@ static void readdb(OcrReader *rd) {
 
     current= &rd->contexts[ctxi];
     for (;;) {
-      FGETSLINE(db,lbuf);
+      FGETSLINE;
       if (!lbuf[0]) { dbassert(current != &rd->contexts[ctxi]); break; }
       char *ep;
       cv= strtoul(lbuf,&ep,16); dbassert(!*ep);
@@ -211,6 +218,11 @@ static void callout_unknown(OcrReader *rd, int w, Pixcol cols[],
   const char *p;
   char cb;
   Pixcol pv;
+
+  if (!o_resolver)
+    fatal("OCR failed - unrecognised characters or ligatures.\n"
+          "Character set database needs to be updated or augmented.\n"
+          "See README.charset.\n");
 
   if (!resolver) {
     sysassert(! pipe(jobpipe) );
@@ -224,11 +236,11 @@ static void callout_unknown(OcrReader *rd, int w, Pixcol cols[],
       /* we know donepipe[1] is >= 4 and we have dealt with all the others
        * so we aren't in any danger of overwriting some other fd 4: */
       sysassert( dup2(donepipe[1],4) ==4 );
-      execlp("./yppsc-ocr-resolver", "yppsc-ocr-resolver",
+      execlp(o_resolver, o_resolver,
              DEBUGP(callout) ? "--debug" : "--noop-arg",
              "--automatic-1",
              (char*)0);
-      sysassert(!"execlp failed");
+      sysassert(!"execlp ocr-resolver failed");
     }
     sysassert(! close(jobpipe[0]) );
     sysassert(! close(donepipe[1]) );
@@ -278,27 +290,10 @@ static void callout_unknown(OcrReader *rd, int w, Pixcol cols[],
     }
 
     if (r==0) {
-      pid_t pid;
-      int st;
-      for (;;) {
-        pid= waitpid(resolver_pid, &st, 0);
-        if (pid==-1) { sysassert(errno==EINTR); continue; }
-        break;
-      }
-      sysassert(pid==resolver_pid);
-      if (WIFEXITED(st)) {
-        if (WEXITSTATUS(st))
-          fatal("character resolver failed with nonzero exit status %d",
-                WEXITSTATUS(st));
-        fclose(resolver);
-        close(resolver_done);
-        resolver= 0;
-      } else if (WIFSIGNALED(st)) {
-        fatal("character resolver died due to signal %s%s",
-              strsignal(WTERMSIG(st)), WCOREDUMP(st)?" (core dumped)":"");
-      } else {
-        fatal("character resolver gave strange wait status %d",st);
-      }
+      waitpid_check_exitstatus(resolver_pid, "character resolver");
+      fclose(resolver);
+      close(resolver_done);
+      resolver= 0;
     } else {
       assert(r==1);
       sysassert(cb==0);
@@ -320,20 +315,6 @@ static void add_result(OcrReader *rd, const char *s, int l, int r,
   rd->nresults++;
 }
 
-struct OcrCellTypeInfo {
-  unsigned initial, nextword, midword;
-  const char *name;
-};
-const struct OcrCellTypeInfo ocr_celltype_number= {
-  4,4,4,
-  .name= "number"
-};
-const struct OcrCellTypeInfo ocr_celltype_text= {
-  .initial=2, /* Uppercase */
-  .nextword=3, /* Either */
-  .midword=1, /* Lower only */
-  .name= "text"
-};
 
 const char *ocr_celltype_name(OcrCellType ct) { return ct->name; }
 
@@ -362,7 +343,7 @@ OcrResultGlyph *ocr(OcrReader *rd, OcrCellType ct, int w, Pixcol cols[]) {
     if (!cols[x]) {
       nspaces++;
       x++;
-      if (nspaces==SPACE_SPACES) {
+      if (nspaces == ct->space_spaces) {
         debugf("OCR x=%x nspaces=%d space\n",x,nspaces);
         ctxmap= ct->nextword;
       }
@@ -370,7 +351,7 @@ OcrResultGlyph *ocr(OcrReader *rd, OcrCellType ct, int w, Pixcol cols[]) {
     }
 
     /* something here, so we need to add the spaces */
-    if (nspaces>=SPACE_SPACES)
+    if (nspaces >= ct->space_spaces)
       add_result(rd," ",x-nspaces,x+1,0);
     nspaces=0;
 
@@ -435,7 +416,7 @@ OcrResultGlyph *ocr(OcrReader *rd, OcrCellType ct, int w, Pixcol cols[]) {
       if (uniquematch->s[0]) ctxmap= ct->midword;
       else debugf(" (empty)");
       if (uniquematch->endsword) {
-        nspaces= SPACE_SPACES;
+        nspaces= ct->space_spaces;
         debugf("_");
         ctxmap= ct->nextword;
       }