From: Ian Jackson Date: Tue, 9 Jun 2009 18:50:36 +0000 (+0100) Subject: Merge branch 'master' of chiark:/home/ijackson/things/ypp-sc-tools X-Git-Tag: 1.9.2~124^2~16 X-Git-Url: http://www.chiark.greenend.org.uk/ucgi/~yarrgweb/git?p=ypp-sc-tools.main.git;a=commitdiff_plain;h=472536cd52cfa06045b511ff7c9ac35a2d28b359;hp=4fac35d62ce44312ff9f36b279495fd22fb731b2 Merge branch 'master' of chiark:/home/ijackson/things/ypp-sc-tools --- diff --git a/pctb/README b/pctb/README index 5f1cea2..9402240 100644 --- a/pctb/README +++ b/pctb/README @@ -1,3 +1,6 @@ +Overview +-------- + This tool can: - screenscrape the commodities trading screen - produce the results as a tab separated values file @@ -6,35 +9,147 @@ This tool can: To run it, change to this directory, type `make', and then: ./ypp-commodities --tsv >commods.tsv -It may put up a window asking about characters it does not understand. -It is important to get these inputs right or it may misrecognise -things in future. **TODO** write actual useful instructuions to cover the -subtleties. The results are stored in the file `charset-15.txt'. - -If you need to report a bug, please be sure to remember the exact -error message and circumstances. Also, for recognition problems there -will probably be a very useful screenshot file called `#pages#.pnm'. -This is likely to be very large so don't just email it to me, but if -you can put it up on a webpage for me to download that will help. - -Options available: - - Setting the operation mode: - --find-window-only Just check that we can find the YPP client window. 
- --screenshot-only Page through and take screenshots, do not OCR - --analyse-only | --same Process previously taken screenshots - --everything (default) Take screenshots and process them - - Options to vary the processing: - --single-page One screenful, no paging - results will be incomplete - --quiet Suppress progress messages - --screenshot-file F Store or read screenshots in F rather than #pages#.pnm - --window-id ID Specified X window is the YPP client - do not search - - Setting the output processing: - --raw-tsv Dump the raw not deduped unsorted OCR'd data - --upload (default) Upload to the PCTB server - --tsv Print data as clean tab-separated-values file - --best-prices Print best buy and sell price for each commodity - --arbitrage Print arbitrage opportunityes +While it is capturing the screenshots, do not move the mouse or use +the keyboard. Keyboard focus must stay in the YPP client window. + + +Command-line options +-------------------- + +Setting the operation mode: + --find-window-only Just check that we can find the YPP client window. + --screenshot-only Page through and take screenshots, do not OCR + --analyse-only | --same Process previously taken screenshots + --everything (default) Take screenshots and process them + +Options to vary the processing: + --single-page One screenful, no paging - results will be incomplete + --quiet Suppress progress messages + --screenshot-file F Store or read screenshots in F rather than #pages#.pnm + --window-id ID Specified X window is the YPP client - do not search + --edit-charset Enable character set editing. See README.charset. 
+ +Controlling what happens to the results: + --upload (default) Upload to the PCTB server + --tsv Print data as clean tab-separated-values file + --raw-tsv Dump the raw (not deduped, unsorted) OCR'd data + --best-prices Print best buy and sell price for each commodity + --arbitrage Print arbitrage opportunities + + +Files we use and update +----------------------- + +The program reads and writes the following files: + + * #pages#.pnm + + Contains one or more images (as raw ppms, end-to-end) which are the + screenshots taken in the last run. This is (over)written whenever + we take screenshots from the YPP client. You can reprocess an + existing set of screenshots with the --same (aka --analyse-only) + option; in that case we just read the screenshots file. + + You can specify a different file with --screenshot-file. + + If you want to display the contents of this file, `display' can do + it. Don't try `display vid:#pages#.pnm' as this will consume + truly stupendous quantities of RAM - it wedged my laptop. + + * charset-15.txt + + Character set database. For the semantics of the contents of this + file see README.charset. There is not currently any accurate + documentation of this database format. + + If you delete this file you'll have to re-enter a lot of glyph data + (and probably get it wrong and make the program misrecognise + things). If you want to undo any mistakes you may have made + answering OCR questions you can safely revert this to the version + I've supplied. + + * #commodmap#.tsv + + Map from commodity names to the numbers required by the PCTB + server. This is fetched and updated automatically as necessary. + It can safely be deleted as it will then be refetched. + * <file>.new + + When any of these tools overwrite one of the persistent database + files, they temporarily write to <file>.new. + +These files are all in the current working directory. There is not +yet any feature to have them be somewhere else. 
The helper programs + yppsc-ocr-resolver + yppsc-commod-processor +must (currently) also be in the current directory. + +Future versions may have more helpers and more data files. + + +Installation requirements +------------------------- + +This program has quite a few dependencies: + Package (Debian etch) + + - For building, C compiler and build environment build-essential + - pnm library, including dev files for building libnetpbm10-dev + - pnm command line utilities for image manipulation netpbm + - X11 libraries, including dev files for building libx11-dev + - XTEST library, including dev files for building libxtst-dev + - Tk interpreter /usr/bin/wish tk8.4 + - Perl module XML::Parser libxml-parser-perl + - Perl module JSON::Parser libjson-perl + - XTEST extension in the X server (part of X package) + - Perl interpreter and basic modules perl (usu.installed) + +On other Linux distros the packages may have different names, but +these should be roughly right for Debian and its derivatives. + + +Reporting problems +------------------ + +If you need to report a bug, for example an inability to recognise, +please be sure to remember the exact error message and circumstances. +Also, for recognition problems there will probably be a very useful +screenshot file called `#pages#.pnm'. This is likely to be very large +so don't just email it to me, but if you can put it up on a webpage +for me to download that will help. At least keep a copy of it. + +If the problem is a failure to cope with some particular YPP client +display and is reproducible, try running: + ./ypp-commodities --raw-tsv --single-page +If this reproduces the problem, please email me the screenshot file +#pages#.pnm, which will consist only of the single screen, plus the +error message. I'll then be able to understand what's wrong, +hopefully. + + +Phoning home - privacy +---------------------- + +The main purpose of this program is to connect to the PCTB server and +upload data. 
The program does not currently phone home at all in +modes other than --upload, and when it does it connects to the +PCTB server not to a system of mine. + +However, there are some improvements which I may introduce in the +future which may change this. I am considering: + + * Having the ocr character resolver talk to a server run by me + to look for missing glyphs, and/or upload those glyphs back + to that server so that they can be shared. + + * Having the upload client upload a copy of the data to a server run + by me, when run in --upload mode. + +If I do do this these new functions may be enabled by default, but it +will be possible to turn them off, or direct them to different +servers, with command-line options, and they will be documented here. + + - Ian Jackson + ijackson@chiark.greenend.org.uk + Aristarchus on the Midnight ocean diff --git a/pctb/README.charset b/pctb/README.charset new file mode 100644 index 0000000..f2bfc7d --- /dev/null +++ b/pctb/README.charset @@ -0,0 +1,150 @@ +Handling OCR failures +--------------------- + +Sometimes the OCR will not be able to recognise some text. By +default, when this happens, the program will stop with a fatal error +and refer you to this document. + +It is possible to fix this by editing the character set database used +by the OCR algorithm. But, it is important to get these inputs right +or your client may misrecognise text in future. You *must* read the +documentation here first. + + +Recognition algorithm +--------------------- + +We recognise the text in the commodity screen by doing exact matching +of `glyph' bitmaps, against the bitmap in each cell in the commodity +table. We match from left to right. + +We do not insist that each glyph is followed by whitespace, and nor do +we insist that glyphs do not contain whitespace. Our glyph database +can contain entries which are strict prefixes of other entries - that +is, a glyph for (say) `v' which is the leftmost part of another glyph +for (say) `w'. 
We resolve these ambiguities by taking the longest +(widest) glyph which matches. + +So you should not be surprised if the program has matched the +left-hand half of some letter and thinks it is a different letter. If +the part that it did recognise does look like the letter in question, +that isn't wrong. All you need to do is insert the whole of the +actual letter in the database - move the LH cursor to the start of the +letter, and the RH cursor to its end, and hit `return' and enter the +correct character. The longest match rule will mean it will prefer +the entry you have just made. + + +Upper vs lower case - important note regarding `l' and `I' +---------------------------------------------------------- + +We maintain separate databases for upper and lower case. At the +beginning of each cell in the table, we expect uppercase; in the +middle of a word we expect lowercase; and, unfortunately, after an +inter-word gap, we are not sure. + +This is troublesome because `l' and `I' look identical on the screen. +So any time we see a word starting with `l' or `I', the program has to +ask about it. + +*Do not* make an entry in the character set database mapping `vertical +stick' to `l' or `I'. Instead, select enough of the whole word in +question that no word would start with the other letter, and enter the +whole word or part of it as a new glyph. + +For example, in the supplied database there is already a glyph for +`Iron'; this is OK because there are no words which start `lron'. + +Do not make an entry for a string more than 7 characters long; +currently we cannot cope (and you'll have to remove it manually from +the charset-15.txt file). + + +Short inter-word gaps +--------------------- + +It can happen that the problem you are being asked about is caused by +the program failing to spot an inter-word gap and mistakenly thinks +that the next word is necessarily in lowercase, so fails to recognise +an uppercase letter. 
The context in which each glyph was recognised +is shown on the screen, underneath the text which shows what it was +recognised as. + +*You should check the alleged context before entering a character*. +If it is wrong, you should fix it, rather than just making an entry +for the uppercase letter in the lowercase database. + +Instead, make a new glyph for the last letter of the previous word +plus the (unusually narrow) inter-word space, and end that entry with +\x20 (yes, type \ x 20). + +For example, you might find that `yG' is treated as +`y' and the G doesn't get matched. Select the `y' +region of the bitmap and type `y\x20' into the string box. +Sorry for this rather poor UI! + + +Overlapping characters - ligatures +---------------------------------- + +Some of the characters in the font used overlap with the next +character. When this happens, select both the characters and enter +them together as one glyph with a multi-character definition. + +For example `yw' is rendered with the top right corner of the `y' and +the top left corner of the `w' overlapping. This is dealt with by +matching the whole merged thing - select the region of the screen +containing `yw' and define it as `yw'. + + +Fixing mistakes +--------------- + +The OCR query UI allows you to delete things from the glyph database. +However since you are not guaranteed to actually get an OCR query at +all if the database contains errors, you shouldn't rely on this. + +If you think you have made mistakes answering OCR queries (for +example, the recognised data is wrong), you should download a fresh +copy of charset-15.txt from + http://www.chiark.greenend.org.uk/~ijackson/ypp-sc-tools/master/pctb/charset-15.txt + + +Enabling interactive character set update +----------------------------------------- + +Now that you have read this document, you should rerun your OCR job +with the --edit-charset option. 
You probably want to supply --same as +well, to avoid having to wait for it to page through and recapture all +the screenshots. So, this time, + ./ypp-commodities --edit-charset --same +and in future, just always run it with the --edit-charset option. + +With --edit-charset, when the OCR finds characters it does not +understand, it will put up an OCR resolution query window. This will +display the part of the text it is having trouble with, showing where +it has got to, and allow you to edit the character set database it +uses for recognising the text. + +*This is subtle* and it is important to understand the way the +machinery works, and the possible mistakes you can make, before +answering the program. *Please read this documentation*, which +explains the meaning of the entries you make. + +If you need help please ask me (ijackson@chiark.greenend.org.uk, or +Aristarchus on Midnight in game if I'm on line, or ask any pirate of +the crew Special Circumstances if they happen to know where I am +and/or can get in touch). + + +Send me your updates +-------------------- + +The character set is in the file `charset-15.txt'. When you enter new +characters, they are added there. If you do this, please email me +your charset file (ijackson@chiark.greenend.org.uk) so that I can +include your contributions in future versions. This will also let me +check that they seem right :-). + +In future I may have the program phone home automatically so that I +can double-check your answers and distribute them in the next version. 
diff --git a/pctb/convert.c b/pctb/convert.c index eb7536d..75c7080 100644 --- a/pctb/convert.c +++ b/pctb/convert.c @@ -51,6 +51,7 @@ static char *o_screenshot_fn; static int o_single_page, o_quiet; static const char *o_outputmode= "upload"; +const char *o_resolver; FILE *screenshot_file; @@ -77,6 +78,10 @@ static void run_analysis(void) { progress("running recognition..."); analyse(tf); + if (o_single_page && !strcmp(o_outputmode,"upload")) + fatal("Recognition successful, but refusing to upload partial data\n" + " (--single-page specified). Specify an output mode?"); + sysassert( fseek(tf,0,SEEK_SET) == 0); progress_log("processing results (--%s)...", o_outputmode); @@ -116,6 +121,8 @@ int main(int argc, char **argv) { o_single_page= 1; else if (!strcmp(arg,"--quiet")) o_quiet= 1; + else if (!strcmp(arg,"--edit-charset")) + o_resolver= "./yppsc-ocr-resolver"; else if (!strcmp(arg,"--raw-tsv")) o_outputmode= 0; else if (!strcmp(arg,"--upload") || diff --git a/pctb/ocr.c b/pctb/ocr.c index 6a91935..7854239 100644 --- a/pctb/ocr.c +++ b/pctb/ocr.c @@ -228,6 +228,11 @@ static void callout_unknown(OcrReader *rd, int w, Pixcol cols[], const char *p; char cb; Pixcol pv; + + if (!o_resolver) + fatal("OCR failed - unrecognised characters or ligatures.\n" + "Character set database needs to be updated or augmented.\n" + "See README.charset.\n"); if (!resolver) { sysassert(! pipe(jobpipe) ); @@ -241,7 +246,7 @@ static void callout_unknown(OcrReader *rd, int w, Pixcol cols[], /* we know donepipe[1] is >= 4 and we have dealt with all the others * so we aren't in any danger of overwriting some other fd 4: */ sysassert( dup2(donepipe[1],4) ==4 ); - execlp("./yppsc-ocr-resolver", "yppsc-ocr-resolver", + execlp(o_resolver, o_resolver, DEBUGP(callout) ? 
"--debug" : "--noop-arg", "--automatic-1", (char*)0); diff --git a/pctb/ocr.h b/pctb/ocr.h index df6aaf8..69ccbe9 100644 --- a/pctb/ocr.h +++ b/pctb/ocr.h @@ -67,5 +67,7 @@ OcrResultGlyph *ocr(OcrReader *rd, OcrCellType, int w, Pixcol cols[]); * array is valid until next call to ocr() */ +extern const char *o_resolver; + #endif /*OCR_H*/ diff --git a/pctb/yppsc-decode-marketdata b/pctb/yppsc-decode-marketdata new file mode 100755 index 0000000..969ada0 --- /dev/null +++ b/pctb/yppsc-decode-marketdata @@ -0,0 +1,81 @@ +#!/usr/bin/perl + +use IO::Handle; + +open CM, "commodmap" or die $!; +while (<CM>) { + m/^(\S.*\S)\t(\d+)$/ or die; + $commodmap[$2]= $1; +} +die $! if CM->error; + +%stallkinds= qw(A Apothecary + D Distilling + F Furnishing + I Ironworking + S Shipbuilding + T Tailor + W Weaving); + +sub getline() { + $!=0; my $l= <STDIN>; die $! unless defined $l; + die $! if STDIN->error; + die unless chomp $l; +#print STDERR "GOT LINE [$l]\n"; + return $l; +} + +sub getint() { + my $b; + my $r= read STDIN,$b,2; die $! if STDIN->error; + die unless $r==2; + my $v= scalar unpack "v", $b; +#printf STDERR "GOT INT %d 0x%x\n", $v, $v; + return $v; +} + +sub inmap($\@$) { + my ($what,$ary,$ix) = @_; + my $got= $ary->[$ix]; + return $got if defined $got; + die "$what $ix ?"; +} + + +printf "# Version: \"%s\"\n", getline(); +$nstalls= getline()+0; + +while (@stalls < $nstalls) { + $_= getline(); + if (s/\^[A-Z]$//) { + $kind= $1; + $sk= $stallkinds{$kind}; + die "kind $kind in $_ ?" 
unless defined $sk; + $_ .= "'s $sk Stall"; + } + push @stalls, $_; +} +unshift @stalls, undef; + +$|=1; + +foreach $bs qw(Buy Sell) { + $ncommods= getint(); + for ($commodnum=0; $commodnum<$ncommods; $commodnum++) { + $commodix= getint(); + $offers= getint(); + for ($offernum=0; $offernum<$offers; $offernum++) { + $stallix= getint(); + $price= getint(); + $qty= getint(); + printf("%s\t%s\t%s", + $bs, + inmap('commod',@commodmap,$commodix), + inmap('stall',@stalls,$stallix)) or die $!; + if ($bs eq 'Sell') { print "\t\t" or die $!; } + printf("\t%d\t%d", $price, $qty) or die $!; + if ($bs eq 'Buy') { print "\t\t" or die $!; } + print "\n" or die $!; + } + } +} diff --git a/pctb/yppsc-ocr-resolver b/pctb/yppsc-ocr-resolver index 694adc3..bb55d3d 100755 --- a/pctb/yppsc-ocr-resolver +++ b/pctb/yppsc-ocr-resolver @@ -26,6 +26,7 @@ # invocation: +# OUT OF DATE # run this without args # then on stdin write # one line which is a Tcl list for unk_{l,r} unk_contexts glyphsdone etc. @@ -399,6 +400,7 @@ proc recursor {} { #---------- database read and write ---------- +# OUT OF DATE # database format: # series of glyphs: # ...