From: Ian Jackson <ian@liberator.relativity.greenend.org.uk>
Date: Tue, 9 Jun 2009 18:50:36 +0000 (+0100)
Subject: Merge branch 'master' of chiark:/home/ijackson/things/ypp-sc-tools
X-Git-Tag: 1.9.2~124^2~16
X-Git-Url: http://www.chiark.greenend.org.uk/ucgi/~yarrgweb/git?p=ypp-sc-tools.main.git;a=commitdiff_plain;h=472536cd52cfa06045b511ff7c9ac35a2d28b359;hp=4fac35d62ce44312ff9f36b279495fd22fb731b2

Merge branch 'master' of chiark:/home/ijackson/things/ypp-sc-tools
---

diff --git a/pctb/README b/pctb/README
index 5f1cea2..9402240 100644
--- a/pctb/README
+++ b/pctb/README
@@ -1,3 +1,6 @@
+Overview
+--------
+
 This tool can:
   - screenscrape the commodities trading screen
   - produce the results as a tab separated values file
@@ -6,35 +9,147 @@ This tool can:
 To run it, change to this directory, type `make', and then:
   ./ypp-commodities --tsv >commods.tsv
 
-It may put up a window asking about characters it does not understand.
-It is important to get these inputs right or it may misrecognise
-things in future.  **TODO** write actual useful instructuions to cover the
-subtleties.  The results are stored in the file `charset-15.txt'.
-
-If you need to report a bug, please be sure to remember the exact
-error message and circumstances.  Also, for recognition problems there
-will probably be a very useful screenshot file called `#pages#.pnm'.
-This is likely to be very large so don't just email it to me, but if
-you can put it up on a webpage for me to download that will help.
-
-Options available:
-
- Setting the operation mode:
-   --find-window-only       Just check that we can find the YPP client window.
-   --screenshot-only        Page through and take screenshots, do not OCR
-   --analyse-only | --same  Process previously taken screenshots
-   --everything (default)   Take screenshots and process them
-
- Options to vary the processing:
-   --single-page         One screenful, no paging - results will be incomplete
-   --quiet               Suppress progress messages
-   --screenshot-file F   Store or read screenshots in F rather than #pages#.pnm
-   --window-id ID        Specified X window is the YPP client - do not search
-
- Setting the output processing:
-   --raw-tsv          Dump the raw not deduped unsorted OCR'd data
-   --upload (default) Upload to the PCTB server
-   --tsv              Print data as clean tab-separated-values file
-   --best-prices      Print best buy and sell price for each commodity
-   --arbitrage        Print arbitrage opportunityes
+While it is capturing the screenshots, do not move the mouse or use
+the keyboard.  Keyboard focus must stay in the YPP client window.
+
+
+Command-line options
+--------------------
+
+Setting the operation mode:
+  --find-window-only       Just check that we can find the YPP client window.
+  --screenshot-only        Page through and take screenshots, do not OCR
+  --analyse-only | --same  Process previously taken screenshots
+  --everything (default)   Take screenshots and process them
+
+Options to vary the processing:
+  --single-page         One screenful, no paging - results will be incomplete
+  --quiet               Suppress progress messages
+  --screenshot-file F   Store or read screenshots in F rather than #pages#.pnm
+  --window-id ID        Specified X window is the YPP client - do not search
+  --edit-charset        Enable character set editing.  See README.charset.
+
+Controlling what happens to the results:
+  --upload (default) Upload to the PCTB server
+  --tsv              Print data as clean tab-separated-values file
+  --raw-tsv          Dump the raw (not deduped, unsorted) OCR'd data
+  --best-prices      Print best buy and sell price for each commodity
+  --arbitrage        Print arbitrage opportunities
+
+
+Files we use and update
+-----------------------
+
+The program reads and writes the following files:
+
+ * #pages#.pnm
+
+   Contains one or more images (as raw ppms, end-to-end) which are the
+   screenshots taken in the last run.  This is (over)written whenever
+   we take screenshots from the YPP client.  You can reprocess an
+   existing set of screenshots with the --same (aka --analyse-only)
+   option; in that case we just read the screenshots file.
+
+   You can specify a different file with --screenshot-file.
+
+   If you want to display the contents of this file, `display' can do
+   it.  Don't try `display vid:#pages#.pnm' as this will consume
+   truly stupendous quantities of RAM - it wedged my laptop.
+
+ * charset-15.txt
+
+   Character set database.  For the semantics of the contents of this
+   file see README.charset.  There is not currently any accurate
+   documentation of this database format.
+
+   If you delete this file you'll have to re-enter a lot of glyph data
+   (and probably get it wrong and make the program misrecognise
+   things).  If you want to undo any mistakes you may have made
+   answering OCR questions you can safely revert this to the version
+   I've supplied.
+
+ * #commodmap#.tsv
+
+   Map from commodity names to the numbers required by the PCTB
+   server.  This is fetched and updated automatically as necessary.
+   It can safely be deleted as it will then be refetched.
+ * <file>.new
+
+   When any of these tools overwrite one of the persistent database
+   files, they temporarily write to <file>.new.
+
+These files are all in the current working directory.  There is not
+yet any feature to have them be somewhere else.  The helper programs
+  yppsc-ocr-resolver
+  yppsc-commod-processor
+must (currently) also be in the current directory.
+
+Future versions may have more helpers and more data files.
+
+
+Installation requirements
+-------------------------
+
+This program has quite a few dependencies:
+							Package (Debian etch)
+
+ - For building, C compiler and build environment	build-essential
+ - pnm library, including dev files for building	libnetpbm10-dev
+ - pnm command line utilities for image manipulation	netpbm
+ - X11 libraries, including dev files for building	libx11-dev
+ - XTEST library, including dev files for building	libxtst-dev
+ - Tk interpreter /usr/bin/wish				tk8.4
+ - Perl module XML::Parser				libxml-parser-perl
+ - Perl module JSON::Parser				libjson-perl
+ - XTEST extension in the X server			(part of X package)
+ - Perl interpreter and basic modules			perl (usu.installed)
+
+On other Linux distros the packages may have different names, but
+these should be roughly right for Debian and its derivatives.
+
+
+Reporting problems
+------------------
+
+If you need to report a bug, for example an inability to recognise,
+please be sure to remember the exact error message and circumstances.
+Also, for recognition problems there will probably be a very useful
+screenshot file called `#pages#.pnm'.  This is likely to be very large
+so don't just email it to me, but if you can put it up on a webpage
+for me to download that will help.  At least keep a copy of it.
+
+If the problem is a failure to cope with some particular YPP client
+display and is reproducible, try running:
+   ./ypp-commodities --raw-tsv --single-page
+If this reproduces the problem, please email me the screenshot file
+#pages#.pnm, which will consist only of the single screen, plus the
+error message.  I'll then be able to understand what's wrong,
+hopefully.
+
+
+Phoning home - privacy
+----------------------
+
+The main purpose of this program is to connect to the PCTB server and
+upload data.  The program does not currently phone home at all in
+modes other than --upload, and when it does it connects to the
+PCTB server not to a system of mine.
+
+However, there are some improvements which I may introduce in the
+future which may change this.  I am considering:
+
+ * Having the ocr character resolver talk to a server run by me
+   to look for missing glyphs, and/or upload those glyphs back
+   to that server so that they can be shared.
+
+ * Having the upload client upload a copy of the data to a server run
+   by me, when run in --upload mode.
+
+If I do do this these new functions may be enabled by default, but it
+will be possible to turn them off, or direct them to different
+servers, with command-line options, and they will be documented here.
+
 
+ - Ian Jackson
+   ijackson@chiark.greenend.org.uk
+   Aristarchus on the Midnight ocean
diff --git a/pctb/README.charset b/pctb/README.charset
new file mode 100644
index 0000000..f2bfc7d
--- /dev/null
+++ b/pctb/README.charset
@@ -0,0 +1,150 @@
+Handling OCR failures
+---------------------
+
+Sometimes the OCR will not be able to recognise some text.  By
+default, when this happens, the program will stop with a fatal error
+and refer you to this document.
+
+It is possible to fix this by editing the character set database used
+by the OCR algorithm.  But, it is important to get these inputs right
+or your client may misrecognise text in future.  You *must* read the
+documentation here first.
+
+
+Recognition algorithm
+---------------------
+
+We recognise the text in the commodity screen by doing exact matching
+of `glyph' bitmaps, against the bitmap in each cell in the commodity
+table.  We match from left to right.
+
+We do not insist that each glyph is followed by whitespace, and nor do
+we insist that glyphs do not contain whitespace.  Our glyph database
+can contain entries which are strict prefixes of other entries - that
+is, a glyph for (say) `v' which is the leftmost part of another glyph
+for (say) `w'.  We resolve these ambiguities by taking the longest
+(widest) glyph which matches.
+
+So you should not be surprised if the program has matched the
+left-hand half of some letter and thinks it is a different letter.  If
+the part that it did recognise does look like the letter in question,
+that isn't wrong.  All you need to do is insert the whole of the
+actual letter in the database - move the LH cursor to the start of the
+letter, and the RH cursor to its end, and hit `return' and enter the
+correct character.  The longest match rule will mean it will prefer
+the entry you have just made.
+
+
+Upper vs lower case - important note regarding `l' and `I'
+----------------------------------------------------------
+
+We maintain separate databases for upper and lower case.  At the
+beginning of each cell in the table, we expect uppercase; in the
+middle of a word we expect lowercase; and, unfortunately, after an
+inter-word gap, we are not sure.
+
+This is troublesome because `l' and `I' look identical on the screen.
+So any time we see a word starting with `l' or `I', the program has to
+ask about it.
+
+*Do not* make an entry in the character set database mapping `vertical
+stick' to `l' or `I'.  Instead, select enough of the whole word in
+question that no word would start with the other letter, and enter the
+whole word or part of it as a new glyph.
+
+For example, in the supplied database there is already a glyph for
+`Iron'; this is OK because there are no words which start `lron'.
+
+Do not make an entry for a string more than 7 characters long;
+currently we cannot cope (and you'll have to remove it manually from
+the charset-15.txt file).
+
+
+Short inter-word gaps
+---------------------
+
+It can happen that the problem you are being asked about is caused by
+the program failing to spot an inter-word gap and mistakenly thinking
+that the next word is necessarily in lowercase, so it fails to recognise
+an uppercase letter.  The context in which each glyph was recognised
+is shown on the screen, underneath the text which shows what it was
+recognised as.
+
+*You should check the alleged context before entering a character*.
+If it is wrong, you should fix it, rather than just making an entry
+for the uppercase letter in the lowercase database.
+
+Instead, make a new glyph for the last letter of the previous word
+plus the (unusually narrow) inter-word space, and end that entry with
+\x20 (yes, type \ x 20).
+
+For example, you might find that `y<space>G' is treated as
+`y<??lowercase>' and the G doesn't get matched.  Select the `y<space>'
+region of the bitmap and type `y\x20' into the string box.
+Sorry for this rather poor UI!
+
+
+Overlapping characters - ligatures
+----------------------------------
+
+Some of the characters in the font used overlap with the next
+character.  When this happens, select both the characters and enter
+them together as one glyph with a multi-character definition.
+
+For example `yw' is rendered with the top right corner of the `y' and
+the top left corner of the `w' overlapping.  This is dealt with by
+matching the whole merged thing - select the region of the screen
+containing `yw' and define it as `yw'.
+
+
+Fixing mistakes
+---------------
+
+The OCR query UI allows you to delete things from the glyph database.
+However since you are not guaranteed to actually get an OCR query at
+all if the database contains errors, you shouldn't rely on this.
+
+If you think you have made mistakes answering OCR queries (for
+example, the recognised data is wrong), you should download a fresh
+copy of charset-15.txt from
+ http://www.chiark.greenend.org.uk/~ijackson/ypp-sc-tools/master/pctb/charset-15.txt
+
+
+Enabling interactive character set update
+-----------------------------------------
+
+Now that you have read this document, you should rerun your OCR job
+with the --edit-charset option.  You probably want to supply --same as
+well, to avoid having to wait for it to page through and recapture all
+the screenshots.  So, this time,
+   ./ypp-commodities --edit-charset --same
+and in future, just always run it with the --edit-charset option.
+
+With --edit-charset, when the OCR finds characters it does not
+understand, it will put up an OCR resolution query window.  This will
+display the part of the text it is having trouble with, showing where
+it has got to, and allow you to edit the character set database it
+uses for recognising the text.
+
+*This is subtle* and it is important to understand the way the
+machinery works, and the possible mistakes you can make, before
+answering the program.  *Please read this documentation*, which
+explains the meaning of the entries you make.
+
+If you need help please ask me (ijackson@chiark.greenend.org.uk, or
+Aristarchus on Midnight in game if I'm on line, or ask any pirate of
+the crew Special Circumstances if they happen to know where I am
+and/or can get in touch).
+
+
+Send me your updates
+--------------------
+
+The character set is in the file `charset-15.txt'.  When you enter new
+characters, they are added there.  If you do this, please email me
+your charset file (ijackson@chiark.greenend.org.uk) so that I can
+include your contributions in future versions.  This will also let me
+check that they seem right :-).
+
+In future I may have the program phone home automatically so that I
+can double-check your answers and distribute them in the next version.
diff --git a/pctb/convert.c b/pctb/convert.c
index eb7536d..75c7080 100644
--- a/pctb/convert.c
+++ b/pctb/convert.c
@@ -51,6 +51,7 @@ static char *o_screenshot_fn;
 static int o_single_page, o_quiet;
 static const char *o_outputmode= "upload";
 
+const char *o_resolver;
 FILE *screenshot_file;
 
 
@@ -77,6 +78,10 @@ static void run_analysis(void) {
   progress("running recognition...");
   analyse(tf);
 
+  if (o_single_page && !strcmp(o_outputmode,"upload"))
+    fatal("Recognition successful, but refusing to upload partial data\n"
+	  " (--single-page specified).  Specify an output mode?");
+
   sysassert( fseek(tf,0,SEEK_SET) == 0);
 
   progress_log("processing results (--%s)...", o_outputmode);
@@ -116,6 +121,8 @@ int main(int argc, char **argv) {
       o_single_page= 1;
     else if (!strcmp(arg,"--quiet"))
       o_quiet= 1;
+    else if (!strcmp(arg,"--edit-charset"))
+      o_resolver= "./yppsc-ocr-resolver";
     else if (!strcmp(arg,"--raw-tsv"))
       o_outputmode= 0;
     else if (!strcmp(arg,"--upload") ||
diff --git a/pctb/ocr.c b/pctb/ocr.c
index 6a91935..7854239 100644
--- a/pctb/ocr.c
+++ b/pctb/ocr.c
@@ -228,6 +228,11 @@ static void callout_unknown(OcrReader *rd, int w, Pixcol cols[],
   const char *p;
   char cb;
   Pixcol pv;
+
+  if (!o_resolver)
+    fatal("OCR failed - unrecognised characters or ligatures.\n"
+	  "Character set database needs to be updated or augmented.\n"
+	  "See README.charset.\n");
   
   if (!resolver) {
     sysassert(! pipe(jobpipe) );
@@ -241,7 +246,7 @@ static void callout_unknown(OcrReader *rd, int w, Pixcol cols[],
       /* we know donepipe[1] is >= 4 and we have dealt with all the others
        * so we aren't in any danger of overwriting some other fd 4: */
       sysassert( dup2(donepipe[1],4) ==4 );
-      execlp("./yppsc-ocr-resolver", "yppsc-ocr-resolver",
+      execlp(o_resolver, o_resolver,
 	     DEBUGP(callout) ? "--debug" : "--noop-arg",
 	     "--automatic-1",
 	     (char*)0);
diff --git a/pctb/ocr.h b/pctb/ocr.h
index df6aaf8..69ccbe9 100644
--- a/pctb/ocr.h
+++ b/pctb/ocr.h
@@ -67,5 +67,7 @@ OcrResultGlyph *ocr(OcrReader *rd, OcrCellType, int w, Pixcol cols[]);
    * array is valid until next call to ocr()
    */
 
+extern const char *o_resolver;
+
 
 #endif /*OCR_H*/
diff --git a/pctb/yppsc-decode-marketdata b/pctb/yppsc-decode-marketdata
new file mode 100755
index 0000000..969ada0
--- /dev/null
+++ b/pctb/yppsc-decode-marketdata
@@ -0,0 +1,81 @@
+#!/usr/bin/perl
+
+use IO::Handle;
+
+open CM, "commodmap" or die $!;  # relative path: looked up in the current directory
+while (<CM>) {  # one mapping per line
+    m/^(\S.*\S)\t(\d+)$/ or die;  # expected format: <name> TAB <decimal number>
+    $commodmap[$2]= $1;  # indexed by the number, value is the name
+}
+die $! if CM->error;
+# Single-letter stall kind codes and the kind names they stand for.
+%stallkinds= qw(A Apothecary
+		D Distilling
+		F Furnishing
+		I Ironworking
+		S Shipbuilding
+		T Tailor
+		W Weaving);
+
+sub getline() {  # Read one newline-terminated line from STDIN; dies on EOF or read error.
+    $!=0; my $l= <STDIN>; die $! unless defined $l;  # undef means EOF or error
+    die $! if STDIN->error;
+    die unless chomp $l;  # chomp returns 0 when there was no trailing newline to remove
+#print STDERR "GOT LINE [$l]\n";
+    return $l;  # the line without its newline
+}
+
+sub getint() {  # Read two bytes from STDIN and decode them as one integer; dies on short read.
+    my $b;
+    my $r= read STDIN,$b,2; die $! if STDIN->error;
+    die unless $r==2;  # fewer than two bytes were available
+    my $v= scalar unpack "v", $b;  # "v": unsigned 16-bit, little-endian
+#printf STDERR "GOT INT %d 0x%x\n", $v, $v;
+    return $v;
+}
+
+sub inmap($\@$) {  # Look up index $ix in an array; the \@ prototype passes the caller's array by reference.
+    my ($what,$ary,$ix) = @_;  # $what is only a label used in the error message
+    my $got= $ary->[$ix];
+    return $got if defined $got;
+    die "$what $ix ?";  # no defined entry at that index
+}
+    
+
+printf "# Version: \"%s\"\n", getline();  # the first input line is echoed as the version
+$nstalls= getline()+0;  # the second input line gives the number of stall lines that follow
+# Read the stall lines, expanding a trailing "^<letter>" stall kind suffix.
+while (@stalls < $nstalls) {
+    $_= getline();
+    if (s/\^([A-Z])$//) {  # the capture group is required so that $1 holds the kind letter
+	$kind= $1;
+	$sk= $stallkinds{$kind};
+	die "kind $kind in $_ ?" unless defined $sk;
+	$_ .= "'s $sk Stall";  # e.g. "Fred^I" becomes "Fred's Ironworking Stall"
+    }
+    push @stalls, $_;
+}
+unshift @stalls, undef;  # the first stall read ends up at index 1; index 0 is left undefined
+
+$|=1;  # flush the selected output handle after every write
+# Decode the offer tables that follow the stall list: all Buy offers first, then all Sell offers.
+foreach $bs (qw(Buy Sell)) {  # explicit parentheses: a bare qw() list here is a syntax error from perl 5.18
+    $ncommods= getint();
+    for ($commodnum=0; $commodnum<$ncommods; $commodnum++) {
+	$commodix= getint();
+	$offers= getint();
+	for ($offernum=0; $offernum<$offers; $offernum++) {
+	    $stallix= getint();
+	    $price= getint();
+	    $qty= getint();
+	    printf("%s\t%s\t%s",
+		   $bs,
+		   inmap('commod',@commodmap,$commodix),
+		   inmap('stall',@stalls,$stallix)) or die $!;
+	    if ($bs eq 'Sell') { print "\t\t" or die $!; }  # Sell rows: two empty columns before price and quantity
+	    printf("\t%d\t%d", $price, $qty) or die $!;
+	    if ($bs eq 'Buy') { print "\t\t" or die $!; }  # Buy rows: two empty columns after price and quantity
+	    print "\n" or die $!;
+	}
+    }
+}
diff --git a/pctb/yppsc-ocr-resolver b/pctb/yppsc-ocr-resolver
index 694adc3..bb55d3d 100755
--- a/pctb/yppsc-ocr-resolver
+++ b/pctb/yppsc-ocr-resolver
@@ -26,6 +26,7 @@
 
 
 # invocation:
+# OUT OF DATE
 #  run this without args
 #  then on stdin write
 #     one line which is a Tcl list for unk_{l,r} unk_contexts glyphsdone etc.
@@ -399,6 +400,7 @@ proc recursor {} {
 
 #---------- database read and write ----------
 
+# OUT OF DATE
 # database format:
 # series of glyphs:
 #   <context> <ncharacters> <hex>...