From f9635e06947ec6bc61c7977e7a3f9dba2c43d784 Mon Sep 17 00:00:00 2001 Message-Id: From: Mark Wooding Date: Tue, 20 Nov 2007 12:31:03 +0000 Subject: [PATCH] Convert track names and input lines to NFC. This is a database format change so a new read-only global preference _dbversion is introduced to record the current database version. Upgrade will implyy renormalizing all keys and regenerating the search/tags databases. Organization: Straylight/Edgeware From: Richard Kettlewell Fiddled with tests to put both composed and decomposed non-ASCII names through. Document limitations on line syntax. This really needs improving before release since the same thing applies to the config file! Make 'nothing' test quicker. --- .bzrignore | 1 + doc/disorder_config.5.in | 8 ++ doc/disorder_protocol.5.in | 65 ++++++++++++++++- lib/split.c | 5 ++ server/rescan.c | 6 ++ server/server.c | 10 +++ server/trackdb-int.h | 7 ++ server/trackdb.c | 146 ++++++++++++++++++++++++++++++------- server/trackdb.h | 3 + tests/dtest.py | 108 ++++++++++++++++----------- tests/nothing.py | 2 +- 11 files changed, 286 insertions(+), 75 deletions(-) diff --git a/.bzrignore b/.bzrignore index 12932f5..259f5aa 100644 --- a/.bzrignore +++ b/.bzrignore @@ -126,3 +126,4 @@ lib/WordBreakProperty.txt lib/SentenceBreakProperty.txt lib/WordBreakTest.txt lib/DerivedNormalizationProps.txt +tests/*.log diff --git a/doc/disorder_config.5.in b/doc/disorder_config.5.in index 0b1535f..f73f0bf 100644 --- a/doc/disorder_config.5.in +++ b/doc/disorder_config.5.in @@ -558,6 +558,14 @@ If unset or \fByes\fR then play is enabled. Otherwise it is disabled. Use .B random-play If unset or \fByes\fR then random play is enabled. Otherwise it is disabled. Use \fBdisable\fR rather than setting it directly. +.PP +Global preferences starting '_' are read-only (in the sense that you cannot +modify them; the server may modify them as part of its normal operation). They +are: +.TP +.B _dbversion +The database version string. This is used by DisOrder to detect when it must +modify the database after an upgrade. .SH "LIBAO DRIVER" .SS "Raw Protocol Players" Raw protocol players are expected to use the \fBdisorder\fR libao driver. diff --git a/doc/disorder_protocol.5.in b/doc/disorder_protocol.5.in index d20edcf..bf293e9 100644 --- a/doc/disorder_protocol.5.in +++ b/doc/disorder_protocol.5.in @@ -26,10 +26,12 @@ in this man page. The protocol is liable to change without notice. You are recommended to check the implementation before believing this document. .SH "GENERAL SYNTAX" -Everything is encoded using UTF-8. +Everything is encoded using UTF-8. See +.B "CHARACTER ENCODING" +below for more detail on character encoding issues. .PP -Commands and responses consist of a line followed (depending on the -command or response) by a message. +Commands and responses consist of a line perhaps followed (depending on the +command or response) by a body. .PP The line syntax is the same as described in \fBdisorder_config\fR(5) except that comments are prohibited. @@ -455,6 +457,63 @@ The volume changed. is as defined in .B "TRACK INFORMATION" above. +.SH "CHARACTER ENCODING" +All data sent by both server and client is encoded using UTF-8. Moreover it +must be valid UTF-8, i.e. non-minimal sequences are not permitted, nor are +surrogates, nor are code points outside the Unicode code space. +.PP +There are no particular normalization requirements on either side of the +protocol. The server currently converts internally to NFC, the client must +normalize the responses returned if it needs some normalized form for further +processing. +.PP +The various characters which divide up lines may not be followed by combining +characters. For instance all of the following are prohibited: +.TP +.B o +LINE FEED followed by a combining character. For example the sequence +LINE FEED, COMBINING GRAVE ACCENT is never permitted. +.TP +.B o +APOSTROPHE or QUOTATION MARK followed by a combining character when used to +delimit fields. For instance a line starting APOSTROPHE, COMBINING CEDILLA +is prohibited. +.IP +Note that such sequences are not prohibited when the quote character cannot be +interpreted as a field delimiter. For instance APOSTROPHE, REVERSE SOLIDUS, +APOSTROPHE, COMBINING CEDILLA, APOSTROPHE would be permitted. +.TP +.B o +REVERSE SOLIDUS (BACKSLASH) followed by a combining character in a quoted +string when it is the first character of an escape sequence. For instance a +line starting APOSTROPHE, REVERSE SOLIDUS, COMBINING TILDE is prohibited. +.IP +As above such sequences are not prohibited when the character is not being used +to start an escape sequence. For instance APOSTROPHE, REVERSE SOLIDUS, +REVERSE SOLIDS, COMBINING TILDER, APOSTROPHE is permitted. +.TP +.B o +Any of the field-splitting whitespace characters followed by a combining +character when not part of a quoted field. For instance a line starting COLON, +SPACE, COMBINING CANDRABINDU is prohibited. +.IP +As above non-delimiter uses are fine. +.TP +.B o +The FULL STOP characters used to quote or delimit a body. +.PP +Furthermore none of these characters are permitted to appear in the context of +a canonical decomposition (i.e. they must still be present when converted to +NFC). In practice however this is not an issue in Unicode 5.0. +.PP +These rules are consistent with the observation that the split() function is +essentially a naive ASCII parser. The implication is not that these sequences +never actually appear in the protocol, merely that the server is not required +to honor them in any useful way nor be consistent between versions: in current +versions the result will be lines and fields that start with combining +characters and are not necessarily split where you expect, but future versions +may remove them, reject them or ignore some or all of the delimiters that have +following combining characters, and no notice will be given of any change. .SH "SEE ALSO" \fBdisorder\fR(1), \fBtime\fR(2), diff --git a/lib/split.c b/lib/split.c index 110e096..1bb19a5 100644 --- a/lib/split.c +++ b/lib/split.c @@ -42,6 +42,9 @@ static void no_error_handler(const char attribute((unused)) *msg, void attribute((unused)) *u) { } +/* TODO: handle combining characters attached to delimiters in some + * sane way (might include reporting an error) */ + char **split(const char *p, int *np, unsigned flags, @@ -110,6 +113,8 @@ char **split(const char *p, return v.vec; } +/* TODO handle initial combining characters sanely */ + const char *quoteutf8(const char *s) { size_t len = 3 + strlen(s); const char *t; diff --git a/server/rescan.c b/server/rescan.c index 890e3fe..b9be72d 100644 --- a/server/rescan.c +++ b/server/rescan.c @@ -50,6 +50,7 @@ #include "trackdb.h" #include "trackdb-int.h" #include "trackname.h" +#include "unicode.h" static DB_TXN *global_tid; @@ -151,6 +152,11 @@ static void rescan_collection(const struct collection *c) { error(0, "cannot convert track path to UTF-8: %s", path); continue; } + /* We use NFC track names */ + if(!(track = utf8_compose_canon(track, strlen(track), 0))) { + error(0, "cannot convert track path to NFC: %s", path); + continue; + } D(("track %s", track)); /* only tracks with a known player are admitted */ for(n = 0; (n < config->player.n diff --git a/server/server.c b/server/server.c index 603b7db..64502c0 100644 --- a/server/server.c +++ b/server/server.c @@ -63,6 +63,7 @@ #include "eventlog.h" #include "defs.h" #include "cache.h" +#include "unicode.h" #ifndef NONCE_SIZE # define NONCE_SIZE 16 @@ -921,6 +922,10 @@ static int c_tags(struct conn *c, static int c_set_global(struct conn *c, char **vec, int attribute((unused)) nvec) { + if(vec[0][0] == '_') { + sink_writes(ev_writer_sink(c->w), "550 cannot set internal global preferences\n"); + return 1; + } trackdb_set_global(vec[0], vec[1], c->who); sink_printf(ev_writer_sink(c->w), "250 OK\n"); return 1; @@ -1040,6 +1045,11 @@ static int command(struct conn *c, char *line) { int nvec, n; D(("server command %s", line)); + /* We force everything into NFC as early as possible */ + if(!(line = utf8_compose_canon(line, strlen(line), 0))) { + sink_writes(ev_writer_sink(c->w), "500 cannot normalize command\n"); + return 1; + } if(!(vec = split(line, &nvec, SPLIT_QUOTES, command_error, c))) { sink_writes(ev_writer_sink(c->w), "500 cannot parse command\n"); return 1; diff --git a/server/trackdb-int.h b/server/trackdb-int.h index 2b12c81..f728e0a 100644 --- a/server/trackdb-int.h +++ b/server/trackdb-int.h @@ -113,6 +113,13 @@ static inline DBT *encode_data(DBT *data, const struct kvp *k) { return data; } +int trackdb_set_global_tid(const char *name, + const char *value, + DB_TXN *tid); +int trackdb_get_global_tid(const char *name, + DB_TXN *tid, + const char **rp); + #endif /* TRACKDB_INT_H */ /* diff --git a/server/trackdb.c b/server/trackdb.c index 8cfd71f..e1848c4 100644 --- a/server/trackdb.c +++ b/server/trackdb.c @@ -63,9 +63,6 @@ static const char *getpart(const char *track, const struct kvp *p, int *used_db); static int trackdb_alltags_tid(DB_TXN *tid, char ***taglistp); -static int trackdb_get_global_tid(const char *name, - DB_TXN *tid, - const char **rp); static char **trackdb_new_tid(int *ntracksp, int maxtracks, DB_TXN *tid); @@ -78,11 +75,59 @@ unsigned long cache_files_hits, cache_files_misses; static const char *home; /* home had better not change */ DB_ENV *trackdb_env; /* db environment */ -DB *trackdb_tracksdb; /* the db itself */ -DB *trackdb_prefsdb; /* preferences */ -DB *trackdb_searchdb; /* the search database */ + +/** @brief The tracks database + * - Keys are UTF-8(NFC(unicode(path name))) + * - Values are encoded key-value pairs + * - Data is reconstructable data about tracks that currently exist + */ +DB *trackdb_tracksdb; + +/** @brief The preferences database + * + * - Keys are UTF-8(NFC(unicode(path name))) + * - Values are encoded key-value pairs + * - Data is user data about tracks (that might not exist any more) + * and cannot be reconstructed + */ +DB *trackdb_prefsdb; + +/** @brief The search database + * + * - Keys are UTF-8(NFKC(casefold(search term))) + * - Values are UTF-8(NFC(unicode(path name))) + * - There can be more than one value per key + * - Presence of key,value means that path matches the search terms + * - Only tracks fond in @ref tracks_tracksdb are represented here + * - This database can be reconstructed, it contains no user data + */ +DB *trackdb_searchdb; + +/** @brief The tags database + * + * - Keys are UTF-8(NFKC(casefold(tag))) + * - Values are UTF-8(NFC(unicode(path name))) + * - There can be more than one value per key + * - Presence of key,value means that path matches the tag + * - This is always in sync with the tags preference + * - This database can be reconstructed, it contains no user data + */ DB *trackdb_tagsdb; /* the tags database */ + +/** @brief The global preferences database + * - Keys are UTF-8(NFC(preference)) + * - Values are global preference values + * - Data is user data and cannot be reconstructed + */ DB *trackdb_globaldb; /* global preferences */ + +/** @brief The noticed database + * - Keys are 64-bit big-endian timestamps + * - Values are UTF-8(NFC(unicode(path name))) + * - There can be more than one value per key + * - Presence of key,value means that path was added at the given time + * - Data cannot be reconstructed (but isn't THAT important) + */ DB *trackdb_noticeddb; /* when track noticed */ static pid_t db_deadlock_pid = -1; /* deadlock manager PID */ static pid_t rescan_pid = -1; /* rescanner PID */ @@ -230,16 +275,45 @@ static DB *open_db(const char *path, if((err = db->set_bt_compare(db, compare))) fatal(0, "db->set_bt_compare %s: %s", path, db_strerror(err)); if((err = db->open(db, 0, path, 0, dbtype, - openflags | DB_AUTO_COMMIT, mode))) - fatal(0, "db->open %s: %s", path, db_strerror(err)); + openflags | DB_AUTO_COMMIT, mode))) { + if((openflags & DB_CREATE) || errno != ENOENT) + fatal(0, "db->open %s: %s", path, db_strerror(err)); + db->close(db, 0); + db = 0; + } return db; } /* open track databases */ void trackdb_open(void) { + int newdb, err; + /* sanity checks */ assert(opened == 0); ++opened; + /* check the database version first */ + trackdb_globaldb = open_db("global.db", 0, DB_HASH, 0, 0666); + if(trackdb_globaldb) { + /* This is an existing database */ + const char *oldversion; + + oldversion = trackdb_get_global("_dbversion"); + if(!oldversion) + oldversion = "1.x"; + if(strcmp(oldversion, DBVERSION)) { + /* This database needs upgrading. This isn't implemented yet so we just + * fail. */ + fatal(0, "database needs upgrading from %s to %s", oldversion, DBVERSION); + } + newdb = 0; + /* Close the database again, we'll open it property below */ + if((err = trackdb_globaldb->close(trackdb_globaldb, 0))) + fatal(0, "error closing global.db: %s", db_strerror(err)); + trackdb_globaldb = 0; + } else { + /* This is a brand new database */ + newdb = 1; + } /* open the databases */ trackdb_tracksdb = open_db("tracks.db", DB_RECNUM, DB_BTREE, DB_CREATE, 0666); @@ -251,6 +325,9 @@ void trackdb_open(void) { trackdb_globaldb = open_db("global.db", 0, DB_HASH, DB_CREATE, 0666); trackdb_noticeddb = open_db("noticed.db", DB_DUPSORT, DB_BTREE, DB_CREATE, 0666); + /* Stash the database version */ + if(newdb) + trackdb_set_global("_dbversion", DBVERSION, 0); D(("opened databases")); } @@ -700,6 +777,9 @@ int trackdb_notice(const char *track, } /** @brief notice a possibly new track + * @param track NFC UTF-8 track name + * @param path Raw path name + * @param tid Transaction ID * @return @c DB_NOTFOUND if new, 0 if already known, @c DB_LOCK_DEADLOCK also */ int trackdb_notice_tid(const char *track, @@ -1850,27 +1930,13 @@ void trackdb_set_global(const char *name, const char *value, const char *who) { DB_TXN *tid; - DBT k, d; int err; int state; - memset(&k, 0, sizeof k); - memset(&d, 0, sizeof d); - k.data = (void *)name; - k.size = strlen(name); - if(value) { - d.data = (void *)value; - d.size = strlen(value); - } for(;;) { tid = trackdb_begin_transaction(); - if(value) - err = trackdb_globaldb->put(trackdb_globaldb, tid, &k, &d, 0); - else - err = trackdb_globaldb->del(trackdb_globaldb, tid, &k, 0); - if(!err || err == DB_NOTFOUND) break; - if(err != DB_LOCK_DEADLOCK) - fatal(0, "error updating database: %s", db_strerror(err)); + if(!(err = trackdb_set_global_tid(name, value, tid))) + break; trackdb_abort_transaction(tid); } trackdb_commit_transaction(tid); @@ -1893,6 +1959,30 @@ void trackdb_set_global(const char *name, reqtracks = 0; } +int trackdb_set_global_tid(const char *name, + const char *value, + DB_TXN *tid) { + DBT k, d; + int err; + + memset(&k, 0, sizeof k); + memset(&d, 0, sizeof d); + k.data = (void *)name; + k.size = strlen(name); + if(value) { + d.data = (void *)value; + d.size = strlen(value); + } + if(value) + err = trackdb_globaldb->put(trackdb_globaldb, tid, &k, &d, 0); + else + err = trackdb_globaldb->del(trackdb_globaldb, tid, &k, 0); + if(err == DB_LOCK_DEADLOCK) return err; + if(err) + fatal(0, "error updating database: %s", db_strerror(err)); + return 0; +} + const char *trackdb_get_global(const char *name) { DB_TXN *tid; int err; @@ -1908,9 +1998,9 @@ const char *trackdb_get_global(const char *name) { return r; } -static int trackdb_get_global_tid(const char *name, - DB_TXN *tid, - const char **rp) { +int trackdb_get_global_tid(const char *name, + DB_TXN *tid, + const char **rp) { DBT k, d; int err; @@ -1928,7 +2018,7 @@ static int trackdb_get_global_tid(const char *name, case DB_LOCK_DEADLOCK: return err; default: - fatal(0, "error updating database: %s", db_strerror(err)); + fatal(0, "error reading database: %s", db_strerror(err)); } } diff --git a/server/trackdb.h b/server/trackdb.h index fe43474..854f63c 100644 --- a/server/trackdb.h +++ b/server/trackdb.h @@ -23,6 +23,9 @@ struct ev_source; +/* Database version string */ +#define DBVERSION "2.0" + extern const struct cache_type cache_files_type; extern unsigned long cache_files_hits, cache_files_misses; /* Cache entry type and tracking for regexp-based lookups */ diff --git a/tests/dtest.py b/tests/dtest.py index 2bc834a..a6db3e3 100644 --- a/tests/dtest.py +++ b/tests/dtest.py @@ -20,9 +20,29 @@ Make track with relative path S exist""" copyfile("%s/sounds/slap.ogg" % topsrcdir, trackpath) def stdtracks(): - maketrack("Joe Bloggs/First Album/01:First track.ogg") + # We create some tracks with non-ASCII characters in the name and + # we (currently) force UTF-8. + # + # On a traditional UNIX filesystem, that treats filenames as byte strings + # with special significant for '/', this should just work, though the + # names will look wrong to ls(1) in a non UTF-8 locale. + # + # On Apple HFS+ filenames normalized to a decomposed form that isn't quite + # NFD, so our attempts to have both normalized and denormalized filenames + # is frustrated. Provided we test on traditional filesytsems too this + # shouldn't be a problem. + # (See http://developer.apple.com/qa/qa2001/qa1173.html) + + # C3 8C = 00CC LATIN CAPITAL LETTER I WITH GRAVE + # (in NFC) + maketrack("Joe Bloggs/First Album/01:F\xC3\x8Crst track.ogg") + maketrack("Joe Bloggs/First Album/02:Second track.ogg") - maketrack("Joe Bloggs/First Album/03:Third track.ogg") + + # CC 81 = 0301 COMBINING ACUTE ACCENT + # (giving an NFD i-acute) + maketrack("Joe Bloggs/First Album/03:ThI\xCC\x81rd track.ogg") + # ...hopefuly giving C3 8D = 00CD LATIN CAPITAL LETTER I WITH ACUTE maketrack("Joe Bloggs/First Album/04:Fourth track.ogg") maketrack("Joe Bloggs/First Album/05:Fifth track.ogg") maketrack("Joe Bloggs/First Album/05:Fifth track.ogg") @@ -32,11 +52,11 @@ def stdtracks(): maketrack("Joe Bloggs/Second Album/04:Fourth track.ogg") maketrack("Joe Bloggs/Second Album/05:Fifth track.ogg") maketrack("Joe Bloggs/Second Album/05:Fifth track.ogg") - maketrack("Joe Bloggs/First Album/01:First track.ogg") - maketrack("Joe Bloggs/First Album/02:Second track.ogg") - maketrack("Joe Bloggs/First Album/03:Third track.ogg") - maketrack("Joe Bloggs/First Album/04:Fourth track.ogg") - maketrack("Joe Bloggs/First Album/05:Fifth track.ogg") + maketrack("Joe Bloggs/Third Album/01:First track.ogg") + maketrack("Joe Bloggs/Third Album/02:Second track.ogg") + maketrack("Joe Bloggs/Third Album/03:Third track.ogg") + maketrack("Joe Bloggs/Third Album/04:Fourth track.ogg") + maketrack("Joe Bloggs/Third Album/05:Fifth track.ogg") maketrack("Fred Smith/Boring/01:Dull.ogg") maketrack("Fred Smith/Boring/02:Tedious.ogg") maketrack("Fred Smith/Boring/03:Drum Solo.ogg") @@ -48,16 +68,45 @@ def stdtracks(): def notracks(): pass -def start(test): - """start(TEST) - +def common_setup(): + remove_dir(testroot) + os.mkdir(testroot) + open("%s/config" % testroot, "w").write( + """player *.ogg shell 'echo "$TRACK" >> %s/played.log' + home %s + collection fs UTF-8 %s/tracks + scratch %s/scratch.ogg + gap 0 + stopword 01 02 03 04 05 06 07 08 09 10 + stopword 1 2 3 4 5 6 7 8 9 + stopword 11 12 13 14 15 16 17 18 19 20 + stopword 21 22 23 24 25 26 27 28 29 30 + stopword the a an and to too in on of we i am as im for is + username fred + password fredpass + allow fred fredpass + plugins ../plugins + player *.mp3 execraw disorder-decode + player *.ogg execraw disorder-decode + player *.wav execraw disorder-decode + player *.flac execraw disorder-decode + tracklength *.mp3 disorder-tracklength + tracklength *.ogg disorder-tracklength + tracklength *.wav disorder-tracklength + tracklength *.flac disorder-tracklength + """ % (testroot, testroot, testroot, testroot)) + copyfile("%s/sounds/scratch.ogg" % topsrcdir, + "%s/scratch.ogg" % testroot) + +def start_daemon(test): + """start_daemon(TEST) Start the daemon for test called TEST.""" global daemon assert daemon == None if test == None: errs = sys.stderr else: - errs = open("%s/%s.log" % (testroot, test), "w") + errs = open("%s.log" % test, "w") server = None print " starting daemon" daemon = subprocess.Popen(["disorderd", @@ -67,8 +116,8 @@ Start the daemon for test called TEST.""" disorder._configfile = "%s/config" % testroot disorder._userconf = False -def stop(): - """stop() +def stop_daemon(): + """stop_daemon() Stop the daemon if it has not stopped already""" global daemon @@ -85,8 +134,9 @@ def run(test, setup=None, report=True, name=None): tests += 1 if setup == None: setup = stdtracks + common_setup() setup() - start(name) + start_daemon(name) try: try: test() @@ -95,7 +145,7 @@ def run(test, setup=None, report=True, name=None): failures += 1 print e finally: - stop() + stop_daemon() if report: if failures: print " FAILED" @@ -123,31 +173,3 @@ failures = 0 daemon = None testroot = "%s/testroot" % os.getcwd() topsrcdir = os.path.abspath(os.getenv("topsrcdir")) -remove_dir(testroot) -os.mkdir(testroot) -open("%s/config" % testroot, "w").write( -"""player *.ogg shell 'echo "$TRACK" >> %s/played.log' -home %s -collection fs ASCII %s/tracks -scratch %s/scratch.ogg -gap 0 -stopword 01 02 03 04 05 06 07 08 09 10 -stopword 1 2 3 4 5 6 7 8 9 -stopword 11 12 13 14 15 16 17 18 19 20 -stopword 21 22 23 24 25 26 27 28 29 30 -stopword the a an and to too in on of we i am as im for is -username fred -password fredpass -allow fred fredpass -plugins ../plugins -player *.mp3 execraw disorder-decode -player *.ogg execraw disorder-decode -player *.wav execraw disorder-decode -player *.flac execraw disorder-decode -tracklength *.mp3 disorder-tracklength -tracklength *.ogg disorder-tracklength -tracklength *.wav disorder-tracklength -tracklength *.flac disorder-tracklength -""" % (testroot, testroot, testroot, testroot)) -copyfile("%s/sounds/scratch.ogg" % topsrcdir, - "%s/scratch.ogg" % testroot) diff --git a/tests/nothing.py b/tests/nothing.py index f946bf9..7a8e55d 100755 --- a/tests/nothing.py +++ b/tests/nothing.py @@ -3,7 +3,7 @@ import dtest,time def test(): """Just start the server and then stop it a few seconds later""" - time.sleep(5) + time.sleep(2) if __name__ == '__main__': dtest.run(test) -- [mdw]