From 6a3c0962283d32bc6e5f6c47c929baf37ddc642f Mon Sep 17 00:00:00 2001 From: Ian Jackson Date: Sun, 21 Jun 2009 12:50:15 +0100 Subject: [PATCH] WIP overhaul of plumbing, databases, etc. --- pctb/README | 7 +- pctb/{README.dictionary => README.charset} | 21 +-- pctb/README.privacy | 14 +- pctb/common.h | 13 ++ pctb/convert.c | 172 ++++++++++++++++----- pctb/convert.h | 16 ++ pctb/dictionary-manager | 2 +- pctb/ocr.c | 85 ++++++---- pctb/resolve.c | 12 +- pctb/yppsc-commod-processor | 3 +- pctb/yppsc-resolver-pixoptions | 2 +- 11 files changed, 238 insertions(+), 109 deletions(-) rename pctb/{README.dictionary => README.charset} (89%) diff --git a/pctb/README b/pctb/README index 986338a..c747534 100644 --- a/pctb/README +++ b/pctb/README @@ -44,9 +44,10 @@ Privacy options, which control conversations with the dictionary server: Please do not use options marked * with --upload. See README.privacy. Options to override which servers we talk to: - --pctb-url HOST|URL Talk to the PCTB server at HOST or URL. - --dict-submit-url URL Submit dictionary entries with HTTP POST under URL. - --dict-update-url URL Fetch updated master dictionary with rsync from URL. + --pctb-server HOST|URL Talk to the PCTB server at HOST or URL. + --dict-submit-url URL Submit dictionary entries with HTTP POST under URL. + --dict-update-from SRC Fetch updated master dictionary with rsync from SRC. +Or set the environment variables YPPSC_PCTB{_PCTB, _DICT_UPDATE, _DICT_SUBMIT} Files we use and update diff --git a/pctb/README.dictionary b/pctb/README.charset similarity index 89% rename from pctb/README.dictionary rename to pctb/README.charset index 4fdc37d..e1fd3ff 100644 --- a/pctb/README.dictionary +++ b/pctb/README.charset @@ -55,10 +55,6 @@ whole word or part of it as a new glyph. For example, in the supplied dictionary there is already a glyph for `Iron'; this is OK because there are no words which start `lron'. 
-Do not make an entry for a string more than 7 characters long; -currently we cannot cope (and you'll have to remove it manually from -the charset-15.txt file). - Short inter-word gaps --------------------- @@ -131,20 +127,11 @@ machinery works, and the possible mistakes you can make, before answering the program. *Please read this documentation*, which explains the meaning of the entries you make. +Also, the character set updates you make will by default be submitted +to my server so that they can be checked by me and shared with other +users. See README.privacy. + If you need help please ask me (ijackson@chiark.greenend.org.uk, or Aristarchus on Midnight in game if I'm on line, or ask any pirate of the crew Special Circumstances if they happen to know where I am and/or can get in touch). - - -Send me your updates --------------------- - -The character set is in the file `charset-15.txt'. When you enter new -characters, they are added there. If you do this, please email me -your charset file (ijackson@chiark.greenend.org.uk) so that I can -include your contributions in future versions. This will also let me -check that they seem right :-). - -In future I may have the program phone home automatically so that I -can double-check your answers and distribute them in the next version. diff --git a/pctb/README.privacy b/pctb/README.privacy index aa98ae8..70cd93a 100644 --- a/pctb/README.privacy +++ b/pctb/README.privacy @@ -91,15 +91,15 @@ Records kept ------------ I keep a permanent log of all the submissions, including date, time, -submitting pirate or IP address, and YPP SC PCTB client version.. -This is so that I have enough information to go back and fix things if -anything goes badly wrong (for example, if a particular client is -broken). +submitting pirate or IP address, and YPP SC PCTB client version. +This is so that I have enough information to go back and fix the +dictionary if anything goes badly wrong (for example, if a particular +client is broken). 
-My system probably also records your IP address when your client -fetches new master dictionaries; those logs are used only for +My rsync server (file server) also records your IP address when your +client fetches new master dictionaries; those logs are used only for debugging the rsync server (which also serves many other files), and -they are routinely expired. +they are routinely expired, currently after about two months. The information about the source of a submissions doesn't appear in the dictionaries as available for download, so other people won't diff --git a/pctb/common.h b/pctb/common.h index 47f898f..3fa8a81 100644 --- a/pctb/common.h +++ b/pctb/common.h @@ -30,6 +30,7 @@ #define _GNU_SOURCE +#include #include #include #include @@ -72,6 +73,7 @@ typedef struct { /* both inclusive */ DF(pixmap) \ DF(struct) \ DF(ocr) \ + DF(rsync) \ DF(callout) enum { @@ -93,6 +95,7 @@ void debug_flush(void); #define debug stderr const char *get_vardir(void); +const char *get_libdir(void); #define FMT(f,a) __attribute__((format(printf,f,a))) #define SCANFMT(f,a) __attribute__((format(scanf,f,a))) @@ -144,4 +147,14 @@ int dbfile_scanf(const char *fmt, ...) SCANFMT(1,2); int dbfile_vscanf(const char *fmt, va_list al) SCANFMT(1,0); +char *masprintf(const char *fmt, ...) FMT(1,2); + +#define EXECLP_HELPER(helper, ...) 
do{ \ + char *helper_path= masprintf("%s/%s",get_libdir(),helper); \ + execlp(helper_path,helper, __VA_ARGS__); \ + sysassert(errno==ENOENT); \ + fatal("Failed to find helper program %s.\n" \ + "(Are you in the correct directory?)", helper); \ + }while(0) + #endif /*COMMON_H*/ diff --git a/pctb/convert.c b/pctb/convert.c index 4cc45b4..b9325e8 100644 --- a/pctb/convert.c +++ b/pctb/convert.c @@ -33,8 +33,10 @@ void debug_flush(void) { } const char *get_vardir(void) { return "."; } +const char *get_libdir(void) { return "."; } -static enum { + +enum mode { mf_findwindow= 0001, mf_screenshot= 0010, mf_readscreenshot= 0020, @@ -45,15 +47,18 @@ static enum { mode_analyse= 0120, mode_all= 0111, -} o_mode= mode_all; +}; +static enum mode o_mode= mode_all; static char *o_screenshot_fn; -static int o_single_page, o_quiet; +static int o_quiet; static const char *o_outputmode= "upload"; +static const char *o_serv_pctb, *o_serv_dict_fetch, *o_serv_dict_submit; -const char *o_resolver; +const char *o_resolver= "./dictionary-manager"; FILE *screenshot_file; +enum flags o_flags= ff_dict_fetch|ff_dict_submit|ff_dict_pirate; static void vbadusage(const char *fmt, va_list) FMT(1,0) NORET; static void vbadusage(const char *fmt, va_list al) { @@ -78,9 +83,11 @@ static void run_analysis(void) { progress("running recognition..."); analyse(tf); - if (o_single_page && !strcmp(o_outputmode,"upload")) - fatal("Recognition successful, but refusing to upload partial data\n" - " (--single-page specified). Specify an output mode?"); + if (o_flags & ff_upload) { + if (o_flags & ff_singlepage) + fatal("Recognition successful, but refusing to upload partial data\n" + " (--single-page specified). 
Specify an output mode?"); + } sysassert( fseek(tf,0,SEEK_SET) == 0); @@ -98,46 +105,100 @@ static void run_analysis(void) { waitpid_check_exitstatus(processor, "output processor/uploader"); fclose(tf); progress_log("all complete."); -} +} + +void fetch_with_rsync(const char *stem) { /* copy $YPPSC_PCTB_DICT_UPDATE/master-STEM.txt to #master-STEM#.txt using rsync (program overridable via $YPPSC_PCTB_RSYNC), then wait for the child */ + pid_t fetcher; + + sysassert( (fetcher= fork()) != -1 ); + if (!fetcher) { + const char *rsync= getenv("YPPSC_PCTB_RSYNC"); + if (!rsync) rsync= "rsync"; + + const char *src= getenv("YPPSC_PCTB_DICT_UPDATE"); + char *remote= masprintf("%s/master-%s.txt", src, stem); + char *local= masprintf("#master-%s#.txt", stem); + execlp(rsync, "rsync", + DEBUGP(rsync) ? "-vLt" : "-Lt", + "--",remote,local,(char*)0); + sysassert(!"exec rsync failed"); + } + + waitpid_check_exitstatus(fetcher, "rsync"); /* label must name the program actually run, so failures are reported against rsync */ +} + +static void set_server(const char *envname, const char *defprotocol, + const char *defvalue, const char *userspecified, + int enable) { /* exports envname: "0" when !enable; otherwise userspecified, else the existing environment value, else defvalue - prefixed with defprotocol unless it is a pathname or has a colon before any slash */ + const char *value; + + if (!enable) { value= "0"; goto ok; } + + if (userspecified) + value= userspecified; + else if ((value= getenv(envname))) + ; + else + value= defvalue; + + if (value[0]=='/' || (value[0]=='.' && value[1]=='/')) + /* absolute or relative pathname - or anyway, something with no hostname */ + goto ok; + + const char *colon= strchr(value, ':'); + const char *slash= strchr(value, '/'); + + if (colon && (!slash || colon < slash)) + /* colon before the first slash, if any */ + /* rsync :: protocol specification - anyway, adding scheme:// won't help */ + goto ok; + + value= masprintf("%s%s", defprotocol, value); + + ok: + sysassert(! setenv(envname,value,1) ); +} int main(int argc, char **argv) { const char *arg; - int r; + + sysassert( setlocale(LC_MESSAGES,"") ); + sysassert( setlocale(LC_CTYPE,"en_GB.UTF-8") || + setlocale(LC_CTYPE,"en.UTF-8") ); #define ARGVAL ((*++argv) ? 
*argv : \ (badusage("missing value for option %s",arg),(char*)0)) +#define IS(s) (!strcmp(arg,(s))) + while ((arg=*++argv)) { - if (!strcmp(arg,"--find-window-only")) - o_mode= mode_findwindow; - else if (!strcmp(arg,"--screenshot-only")) - o_mode= mode_screenshot; - else if (!strcmp(arg,"--analyse-only") || - !strcmp(arg,"--same")) - o_mode= mode_analyse; - else if (!strcmp(arg,"--everything")) - o_mode= mode_all; - else if (!strcmp(arg,"--single-page")) - o_single_page= 1; - else if (!strcmp(arg,"--quiet")) - o_quiet= 1; - else if (!strcmp(arg,"--edit-dictionary")) - o_resolver= "./dictionary-manager"; - else if (!strcmp(arg,"--raw-tsv")) - o_outputmode= 0; - else if (!strcmp(arg,"--upload") || - !strcmp(arg,"--arbitrage") || - !strcmp(arg,"--tsv") || - !strcmp(arg,"--best-prices")) - o_outputmode= arg+2; - else if (!strcmp(arg,"--screenshot-file")) - o_screenshot_fn= ARGVAL; + if (IS("--find-window-only")) o_mode= mode_findwindow; + else if (IS("--screenshot-only")) o_mode= mode_screenshot; + else if (IS("--analyse-only") || + IS("--same")) o_mode= mode_analyse; + else if (IS("--everything")) o_mode= mode_all; + else if (IS("--single-page")) o_flags |= ff_singlepage; + else if (IS("--quiet")) o_quiet= 1; + else if (IS("--edit-charset")) o_flags |= ff_editcharset; + else if (IS("--dict-local-only")) o_flags &= ~ffs_dict; + else if (IS("--dict-read-only")) o_flags &= (~ffs_dict | ff_dict_fetch); + else if (IS("--dict-anon")) o_flags &= ~ff_dict_pirate; + else if (IS("--dict-submit")) o_flags |= ff_dict_fetch|ff_dict_submit; + else if (IS("--upload") || + IS("--arbitrage") || + IS("--tsv") || + IS("--best-prices")) o_outputmode= arg+2; + else if (IS("--raw-tsv")) o_outputmode= 0; + else if (IS("--screenshot-file")) o_screenshot_fn= ARGVAL; + else if (IS("--pctb-server")) o_serv_pctb= ARGVAL; + else if (IS("--dict-submit-server")) o_serv_dict_submit= ARGVAL; + else if (IS("--dict-update-server")) o_serv_dict_fetch= ARGVAL; #define DF(f) \ - else if (!strcmp(arg,"-D" 
#f)) \ + else if (IS("-D" #f)) \ debug_flags |= dbg_##f; DEBUG_FLAG_LIST #undef DF - else if (!strcmp(arg,"--window-id")) { + else if (IS("--window-id")) { char *ep; unsigned long windowid= strtoul(ARGVAL,&ep,0); if (*ep) badusage("invalid window id"); @@ -145,24 +206,43 @@ int main(int argc, char **argv) { } else badusage("unknown option `%s'",arg); } + + /* Consequential changes to options */ - if (!o_screenshot_fn) { - r= asprintf(&o_screenshot_fn,"%s/#pages#.ppm",get_vardir()); - sysassert(r>=0); - } + if (o_outputmode && !strcmp("upload",o_outputmode)) /* o_outputmode is 0 after --raw-tsv; must not reach strcmp */ + o_flags |= ffs_upload; + /* Defaults */ + + set_server("YPPSC_PCTB_PCTB", + "http://", "pctb.ilk.org", + o_serv_pctb, o_flags & (ff_needisland|ff_upload)); + + set_server("YPPSC_PCTB_DICT_UPDATE", + "rsync://", "rsync.pctb.chiark.greenend.org.uk/pctb", + o_serv_dict_fetch, o_flags & ff_dict_fetch); + + set_server("YPPSC_PCTB_DICT_SUBMIT", + "http://", "dictup.pctb.chiark.greenend.org.uk", + o_serv_dict_submit, o_flags & ff_dict_submit); + + if (!o_screenshot_fn) + o_screenshot_fn= masprintf("%s/#pages#.ppm",get_vardir()); + + /* Actually do the work */ + if (o_mode & mf_findwindow) { screenshot_startup(); find_yppclient_window(); } if (o_mode & mf_screenshot) { open_screenshot_file("w"); - if (o_single_page) take_one_screenshot(); + if (o_flags & ff_singlepage) take_one_screenshot(); else take_screenshots(); } if (o_mode & mf_readscreenshot) { open_screenshot_file("r"); - if (o_single_page) read_one_screenshot(); + if (o_flags & ff_singlepage) read_one_screenshot(); else read_screenshots(); } if (o_mode & mf_analyse) { @@ -280,3 +360,13 @@ void waitpid_check_exitstatus(pid_t pid, const char *what) { fatal("%s gave strange wait status %d", what, st); } } + +char *masprintf(const char *fmt, ...) 
{ + char *r; + va_list al; + va_start(al,fmt); + sysassert( vasprintf(&r,fmt,al) >= 0); /* vasprintf allocates the formatted string into r */ + sysassert(r); + va_end(al); + return r; /* caller owns the string (ocr_init frees its result) */ +} diff --git a/pctb/convert.h b/pctb/convert.h index b5e9230..354a6f1 100644 --- a/pctb/convert.h +++ b/pctb/convert.h @@ -70,6 +70,7 @@ void analyse(FILE *tsv_output); /*----- from convert.c -----*/ extern FILE *screenshot_file; +extern void fetch_with_rsync(const char *stem); /* fetches master-STEM.txt into #master-STEM#.txt */ void vwarning(const char *fmt, va_list) FMT(1,0); void warning(const char *fmt, ...) FMT(1,2); @@ -83,6 +84,21 @@ void progress_log(const char *fmt, ...) FMT(1,2); void vprogress_spinner(const char *fmt, va_list) FMT(1,0); void progress_spinner(const char *fmt, ...) FMT(1,2); +enum flags { + ff_editcharset= 00001, /* set by --edit-charset */ + ff_singlepage= 00002, /* set by --single-page */ + + ff_dict_fetch= 00010, /* on by default; gates the rsync fetch in ocr_init */ + ff_dict_submit= 00020, /* on by default; cleared by --dict-local-only and --dict-read-only */ + ff_dict_pirate= 00040, /* on by default; cleared by --dict-anon */ + ffs_dict= 00070, /* mask of the three ff_dict_* bits */ + + ff_needisland= 00100, + ff_upload= 00200, + ffs_upload= 00300, /* ff_needisland|ff_upload; set by main when the output mode is "upload" */ +}; +extern enum flags o_flags; + /*----- from pages.c -----*/ void screenshot_startup(void); diff --git a/pctb/dictionary-manager b/pctb/dictionary-manager index a2a82a4..aaa9e13 100755 --- a/pctb/dictionary-manager +++ b/pctb/dictionary-manager @@ -1051,7 +1051,7 @@ foreach arg $argv { {--debug-server} { proc debug {m} { puts stderr "DICT-MGR-SVR $m" }} {--noop-arg} { } {--approve-updates} { set mainkind approve; break } - {--automatic-1} { set mainkind automatic } + {--automatic-1} { set mainkind automatic; break } {--remote-server-1} { set mainkind remoteserv; break } {--automatic*} - {--remote-server} { error "incompatible versions - install problem" } diff --git a/pctb/ocr.c b/pctb/ocr.c index 927b93a..5828bd0 100644 --- a/pctb/ocr.c +++ b/pctb/ocr.c @@ -26,18 +26,17 @@ */ #include "ocr.h" +#include "convert.h" typedef struct { Pixcol col; struct DatabaseNode *then; } DatabaseLink; -#define MAXGLYPHCHRS 7 - typedef struct DatabaseNode { - char s[MAXGLYPHCHRS+1]; /* null-terminated; "" means no match here */ + char *str; int nlinks, alinks; - unsigned endsword:1; + 
unsigned match:1, defined:1, endsword:1; DatabaseLink *links; } DatabaseNode; @@ -81,26 +80,35 @@ DEBUG_DEFINE_DEBUGF(ocr) static void cleardb_node(DatabaseNode *n) { int i; - n->s[0]= 0; + free(n->str); n->str=0; + n->defined=n->match=n->endsword= 0; for (i=0; inlinks; i++) cleardb_node(n->links[i].then); } +static void readdb1(OcrReader *rd, const char *which); + static void readdb(OcrReader *rd) { + int ctxi; + + for (ctxi=0; ctxicontexts[ctxi]); + + readdb1(rd, "master"); + readdb1(rd, "local"); +} + +static void readdb1(OcrReader *rd, const char *which) { int nchrs; DatabaseNode *current, *additional; - char chrs[MAXGLYPHCHRS+1]; + char chrs[100]; Pixcol cv; int j,ctxi; int h, endsword; char lbuf[100]; - for (ctxi=0; ctxicontexts[ctxi]); - - char *dbfname=0; - asprintf(&dbfname,"%s/charset-%d.txt",get_vardir(),rd->h); - sysassert(dbfname); + char *dbfname= masprintf("%s/#%s-char%d#.txt", + get_vardir(), which, rd->h); if (!dbfile_open(dbfname)) goto x; @@ -126,23 +134,15 @@ static void readdb(OcrReader *rd) { found_ctx: for (nchrs=0;;) { int c= fgetc(dbfile); sysassert(!ferror(dbfile)); dbassert(c!=EOF); - if (c=='\n') { dbassert(nchrs); break; } - dbassert(nchrs0 && cr<=255); - c= cr; - } + if (c=='\n') break; /* forces no match */ + dbassert(nchrs1 && chrs[nchrs-1]==' ') { + if (nchrs>0 && chrs[nchrs-1]==' ') { endsword= 1; nchrs--; } - chrs[nchrs]= 0; current= &rd->contexts[ctxi]; for (;;) { @@ -159,7 +159,10 @@ static void readdb(OcrReader *rd) { } additional= mmalloc(sizeof(*additional)); - additional->s[0]= 0; + additional->str= 0; + additional->defined= 0; + additional->match= 0; + additional->endsword= 0; additional->nlinks= additional->alinks= 0; additional->links= 0; if (current->nlinks==current->alinks) { @@ -176,9 +179,20 @@ static void readdb(OcrReader *rd) { found_link:; } - dbassert(!current->s[0]); - strcpy(current->s, chrs); - current->endsword= endsword; + if (!current->defined) { + free(current->str); + current->str= 0; + current->defined= 
1; + current->match= 0; + + if (nchrs) { + current->str= mmalloc(nchrs+1); + memcpy(current->str, chrs, nchrs); + current->str[nchrs]= 0; + current->match= 1; + current->endsword= endsword; + } + } } x: dbfile_close(); @@ -332,8 +346,8 @@ OcrResultGlyph *ocr(OcrReader *rd, OcrCellType ct, int w, Pixcol cols[]) { found: current= current->links[i].then; - if (current->s[0]) { - debugf(" \"%s\"%s",current->s,current->endsword?"_":""); + if (current->match) { + debugf(" \"%s\"%s",current->str,current->endsword?"_":""); bestmatch= current; bestmatch_rx= x; } else { @@ -344,7 +358,7 @@ OcrResultGlyph *ocr(OcrReader *rd, OcrCellType ct, int w, Pixcol cols[]) { } if (bestmatch) { - if (uniquematch && strcmp(bestmatch->s, uniquematch->s)) { + if (uniquematch && strcmp(bestmatch->str, uniquematch->str)) { debugf( " ambiguous"); uniquematch= 0; break; @@ -356,9 +370,9 @@ OcrResultGlyph *ocr(OcrReader *rd, OcrCellType ct, int w, Pixcol cols[]) { if (uniquematch) { debugf(" || YES"); - add_result(rd, uniquematch->s, lx, uniquematch_rx, ctxmap); + add_result(rd, uniquematch->str, lx, uniquematch_rx, ctxmap); x= uniquematch_rx+1; - if (uniquematch->s[0]) ctxmap= ct->midword; + if (uniquematch->match) ctxmap= ct->midword; else debugf(" (empty)"); if (uniquematch->endsword) { nspaces= ct->space_spaces; @@ -385,6 +399,13 @@ OcrResultGlyph *ocr(OcrReader *rd, OcrCellType ct, int w, Pixcol cols[]) { OcrReader *ocr_init(int h) { OcrReader *rd; + if (o_flags & ff_dict_fetch) { + char *fetchfile= masprintf("char%d",h); /* use h: rd is not allocated yet; fetch_with_rsync adds the master- prefix itself, matching the #master-charN#.txt that readdb1 opens */ + progress("Updating master-%s",fetchfile); + fetch_with_rsync(fetchfile); + free(fetchfile); + } + rd= mmalloc(sizeof(*rd)); memset(rd,0,sizeof(*rd)); rd->h= h; diff --git a/pctb/resolve.c b/pctb/resolve.c index 2c61e3b..556c54b 100644 --- a/pctb/resolve.c +++ b/pctb/resolve.c @@ -48,11 +48,11 @@ FILE *resolve_start(void) { /* we know donepipe[1] is >= 4 and we have dealt with all the others * so we aren't in any danger of overwriting some other fd 4: */ sysassert( 
dup2(donepipe[1],4) ==4 ); - execlp(o_resolver, o_resolver, - DEBUGP(callout) ? "--debug" : "--noop-arg", - "--automatic-1", - (char*)0); - sysassert(!"execlp dictionary-manager failed"); + EXECLP_HELPER("dictionary-manager", + DEBUGP(callout) ? "--debug" : "--noop-arg", + "--automatic-1", + (char*)0); + sysassert(!"execlp dictionary-manager --automatic failed"); } sysassert(! close(jobpipe[0]) ); sysassert(! close(donepipe[1]) ); @@ -78,7 +78,7 @@ void resolve_finish(void) { } if (r==0) { - waitpid_check_exitstatus(resolver_pid, "dictionary manager"); + waitpid_check_exitstatus(resolver_pid, "dictionary-manager"); fclose(resolver); close(resolver_done); resolver= 0; diff --git a/pctb/yppsc-commod-processor b/pctb/yppsc-commod-processor index 01db528..3276329 100755 --- a/pctb/yppsc-commod-processor +++ b/pctb/yppsc-commod-processor @@ -218,7 +218,8 @@ sub main__tsv () { our (%commodmap); -our ($pctb) = 'http://pctb.ilk.org/'; +our ($pctb) = $ENV{'YPPSC_PCTB_PCTB'}; die unless $pctb; + our ($ua)= LWP::UserAgent->new; sub load_commodmap() { diff --git a/pctb/yppsc-resolver-pixoptions b/pctb/yppsc-resolver-pixoptions index 7bbcb19..5b7a825 100755 --- a/pctb/yppsc-resolver-pixoptions +++ b/pctb/yppsc-resolver-pixoptions @@ -34,7 +34,7 @@ our ($which) = shift @ARGV; $which =~ s/\W//g; -our ($pctb)= 'http://pctb.ilk.org/'; +our ($pctb) = $ENV{'YPPSC_PCTB_PCTB'}; die unless $pctb; our ($ua)= LWP::UserAgent->new; our $jsonresp; -- 2.30.2