From 6e70a6f2ac5765379239252d771f9218a80b9c71 Mon Sep 17 00:00:00 2001 From: Ian Jackson Date: Tue, 9 Jun 2009 13:04:43 +0100 Subject: [PATCH] Do not run ocr resolver by default. --- pctb/README | 8 +------ pctb/README.charset | 53 +++++++++++++++++++++++++++++++++------------ pctb/convert.c | 3 +++ pctb/ocr.c | 7 +++++- pctb/ocr.h | 2 ++ 5 files changed, 51 insertions(+), 22 deletions(-) diff --git a/pctb/README b/pctb/README index 0d83e39..9402240 100644 --- a/pctb/README +++ b/pctb/README @@ -12,12 +12,6 @@ To run it, change to this directory, type `make', and then: While it is capturing the screenshots, do not move the mouse or use the keyboard. Keyboard focus must stay in the YPP client window. -*IMPORTANT* -It may put up a window asking about characters it does not understand. -It is important to get these inputs right or your client may -misrecognise text in future. You *must* read the documentation in -README.charset before answering these questions. - Command-line options -------------------- @@ -33,6 +27,7 @@ Options to vary the processing: --quiet Suppress progress messages --screenshot-file F Store or read screenshots in F rather than #pages#.pnm --window-id ID Specified X window is the YPP client - do not search + --edit-charset Enable character set editing. See README.charset. Controlling what happens to the results: --upload (default) Upload to the PCTB server @@ -78,7 +73,6 @@ The program reads and writes the following files: Map from commodity names to the numbers required by the PCTB server. This is fetched and updated automatically as necessary. It can safely be deleted as it will then be refetched. 
- * .new When any of these tools overwrite one of the persistent database diff --git a/pctb/README.charset b/pctb/README.charset index bbabb05..f2bfc7d 100644 --- a/pctb/README.charset +++ b/pctb/README.charset @@ -1,19 +1,14 @@ -Character set query tool, and semantics of the glyphs ------------------------------------------------------ +Handling OCR failures +--------------------- -Sometimes the OCR will not be able to recognise some text and you will -have to help it out. It will display the part it is having trouble -with, showing where it has got to, and allow you to edit the character -set database it uses for recognising the text. +Sometimes the OCR will not be able to recognise some text. By +default, when this happens, the program will stop with a fatal error +and refer you to this document. -*This is subtle* and it is important to understand the way the -machinery works, and the possible mistakes you can make, before -answering the program. *Please read this documentation* - -If you need help please ask me (ijackson@chiark.greenend.org.uk, or -Aristarchus on Midnight in game if I'm on line, or ask any pirate of -the crew Special Circumstances if they happen to know where I am -and/or can get in touch). +It is possible to fix this by editing the character set database used +by the OCR algorithm. But, it is important to get these inputs right +or your client may misrecognise text in future. You *must* read the +documentation here first. Recognition algorithm @@ -115,6 +110,33 @@ copy of charset-15.txt from http://www.chiark.greenend.org.uk/~ijackson/ypp-sc-tools/master/pctb/charset-15.txt +Enabling interactive character set update +----------------------------------------- + +Now that you have read this document, you should rerun your OCR job +with the --edit-charset option. You probably want to supply --same as +well, to avoid having to wait for it to page through and recapture all +the screenshots. 
So, this time, + ./ypp-commodities --edit-charset --same +and in future, just always run it with the --edit-charset option. + +With --edit-charset, when the OCR finds characters it does not +understand, it will put up an OCR resolution query window. This will +display the part of the text it is having trouble with, showing where +it has got to, and allow you to edit the character set database it +uses for recognising the text. + +*This is subtle* and it is important to understand the way the +machinery works, and the possible mistakes you can make, before +answering the program. *Please read this documentation*, which +explains the meaning of the entries you make. + +If you need help please ask me (ijackson@chiark.greenend.org.uk, or +Aristarchus on Midnight in game if I'm on line, or ask any pirate of +the crew Special Circumstances if they happen to know where I am +and/or can get in touch). + + Send me your updates -------------------- @@ -123,3 +145,6 @@ characters, they are added there. If you do this, please email me your charset file (ijackson@chiark.greenend.org.uk) so that I can include your contributions in future versions. This will also let me check that they seem right :-). + +In future I may have the program phone home automatically so that I +can double-check your answers and distribute them in the next version. 
diff --git a/pctb/convert.c b/pctb/convert.c index 0114ea4..75c7080 100644 --- a/pctb/convert.c +++ b/pctb/convert.c @@ -51,6 +51,7 @@ static char *o_screenshot_fn; static int o_single_page, o_quiet; static const char *o_outputmode= "upload"; +const char *o_resolver; FILE *screenshot_file; @@ -120,6 +121,8 @@ int main(int argc, char **argv) { o_single_page= 1; else if (!strcmp(arg,"--quiet")) o_quiet= 1; + else if (!strcmp(arg,"--edit-charset")) + o_resolver= "./yppsc-ocr-resolver"; else if (!strcmp(arg,"--raw-tsv")) o_outputmode= 0; else if (!strcmp(arg,"--upload") || diff --git a/pctb/ocr.c b/pctb/ocr.c index 6a91935..7854239 100644 --- a/pctb/ocr.c +++ b/pctb/ocr.c @@ -228,6 +228,11 @@ static void callout_unknown(OcrReader *rd, int w, Pixcol cols[], const char *p; char cb; Pixcol pv; + + if (!o_resolver) + fatal("OCR failed - unrecognised characters or ligatures.\n" + "Character set database needs to be updated or augmented.\n" + "See README.charset.\n"); if (!resolver) { sysassert(! pipe(jobpipe) ); @@ -241,7 +246,7 @@ static void callout_unknown(OcrReader *rd, int w, Pixcol cols[], /* we know donepipe[1] is >= 4 and we have dealt with all the others * so we aren't in any danger of overwriting some other fd 4: */ sysassert( dup2(donepipe[1],4) ==4 ); - execlp("./yppsc-ocr-resolver", "yppsc-ocr-resolver", + execlp(o_resolver, o_resolver, DEBUGP(callout) ? "--debug" : "--noop-arg", "--automatic-1", (char*)0); diff --git a/pctb/ocr.h b/pctb/ocr.h index df6aaf8..69ccbe9 100644 --- a/pctb/ocr.h +++ b/pctb/ocr.h @@ -67,5 +67,7 @@ OcrResultGlyph *ocr(OcrReader *rd, OcrCellType, int w, Pixcol cols[]); * array is valid until next call to ocr() */ +extern const char *o_resolver; + #endif /*OCR_H*/ -- 2.30.2