chiark / gitweb /
libtests: Include the Unicode test files directly.
author Mark Wooding
Sat, 2 Dec 2017 21:44:38 +0000 (21:44 +0000)
committer Mark Wooding
Sat, 2 Dec 2017 22:18:21 +0000 (22:18 +0000)
Rather than fetch the files using `wget' at test time, fire up `gzip' to
decompress them from local copies.  The files compress really rather
well, so this is an overall saving in disk space relative to the
previous version -- especially since we now share the test files among
all build trees rather than having a separate copy in each.  On the
other hand, they're moderately large things to have in the source
distribution, though small compared to the `images/' tree.

Of course, the main reason for doing this is to completely eliminate the
need for external network connectivity during a build.

The copyright notice, at http://www.unicode.org/copyright.html, appears
to be compatible with the GPL (which is good, because I think we'd have
had a problem using these files even if we didn't distribute them).
I've included the copyright notice as COPYING.unicode-tests, in order to
comply with requirement (a).

Should it be necessary to update the copies of the test files, there's a
(slightly hairy) make target `update-unicode-tests' which can be invoked
by hand to do this.

libtests/COPYING.unicode-tests [new file with mode: 0644]
libtests/GraphemeBreakTest.txt.gz [new file with mode: 0644]
libtests/NormalizationTest.txt.gz [new file with mode: 0644]
libtests/WordBreakTest.txt.gz [new file with mode: 0644]

diff --git a/README b/README
index 5daf7b7..80e8d17 100644 (file)
--- a/README
+++ b/README
@@ -282,6 +282,8 @@ Portions extracted from MPG321,
   Copyright (C) 2000-2001 Robert Leslie
 Portions Copyright (C) 1997-2006 Free Software Foundation, Inc.
 Portions Copyright (C) 2000 Red Hat, Inc., Jonathan Blandford <>
+Unicode test files Copyright (C) 1991-2017 Unicode Inc.; see
+  libtests/COPYING.unicode-tests for details.
 Binaries may derive extra copyright owners through linkage (binary distributors
 are expected to do their own legwork)
diff --git a/libtests/COPYING.unicode-tests b/libtests/COPYING.unicode-tests
new file mode 100644 (file)
index 0000000..b456a04
--- /dev/null
@@ -0,0 +1,38 @@
+[The Unicode test files GraphemeBreakTest.txt, NormalizationTest.txt,
+and WordBreakTest.txt, included in this directory, are copyright (c)
+1991--2017 Unicode Inc., and subject to the license conditions below, as
+published at http://www.unicode.org/copyright.html.  These files are
+used for testing, but are not required at runtime.  In particular, they
+are not included in binary packages.  -- [mdw]]
+Copyright © 1991-2017 Unicode, Inc. All rights reserved.
+Distributed under the Terms of Use in http://www.unicode.org/copyright.html.
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of the Unicode data files and any associated documentation
+(the "Data Files") or Unicode software and any associated documentation
+(the "Software") to deal in the Data Files or Software
+without restriction, including without limitation the rights to use,
+copy, modify, merge, publish, distribute, and/or sell copies of
+the Data Files or Software, and to permit persons to whom the Data Files
+or Software are furnished to do so, provided that either
+(a) this copyright and permission notice appear with all copies
+of the Data Files or Software, or
+(b) this copyright and permission notice appear in associated
+Documentation.
+Except as contained in this notice, the name of a copyright holder
+shall not be used in advertising or otherwise to promote the sale,
+use or other dealings in these Data Files or Software without prior
+written authorization of the copyright holder.
diff --git a/libtests/GraphemeBreakTest.txt.gz b/libtests/GraphemeBreakTest.txt.gz
new file mode 100644 (file)
index 0000000..128d91f
Binary files /dev/null and b/libtests/GraphemeBreakTest.txt.gz differ
index e4a9eb0..83a7f97 100644 (file)
@@ -56,6 +56,7 @@ t_split_SOURCES=t-split.c test.c test.h
 t_syscalls_SOURCES=t-syscalls.c test.c test.h
 t_trackname_SOURCES=t-trackname.c test.c test.h
 t_unicode_SOURCES=t-unicode.c test.c test.h
+t_unicode_CFLAGS=$(AM_CFLAGS) -DSRCDIR=\"$(srcdir)\"
 t_url_SOURCES=t-url.c test.c test.h
 t_utf8_SOURCES=t-utf8.c test.c test.h
 t_vector_SOURCES=t-vector.c test.c test.h
@@ -74,9 +75,22 @@ before-check:
 make-coverage-reports: check
        cd ../lib && ${GCOV} *.c | ${PYTHON} ../scripts/format-gcov-report --html . *.c
-EXTRA_DIST=t-macros-1.tmpl t-macros-2
+UNICODE_TEST_FILES=GraphemeBreakTest NormalizationTest WordBreakTest
+       set -e; \
+       for t in $(foreach t,$(UNICODE_TEST_FILES),$t:$($t_URL)); do \
+         f=$${t%%:*} u=$${t#*:}; \
+         echo $$f $$u; \
+         rm -f $$ $$; wget -O$$ $$u; \
+         gzip -9cv $$ >$$; \
+         mv -f $$ $(srcdir)/$$f.txt.gz; rm -f $$; \
+       done
-CLEANFILES=*.gcda *.gcov *.gcno *.c.html index.html
+EXTRA_DIST=t-macros-1.tmpl t-macros-2 \
+       COPYING.unicode-tests $(addsuffix .txt.gz, $(UNICODE_TEST_FILES))
-DISTCLEANFILES=GraphemeBreakTest.txt NormalizationTest.txt     \
-              WordBreakTest.txt
+CLEANFILES=*.gcda *.gcov *.gcno *.c.html index.html
diff --git a/libtests/NormalizationTest.txt.gz b/libtests/NormalizationTest.txt.gz
new file mode 100644 (file)
index 0000000..6524991
Binary files /dev/null and b/libtests/NormalizationTest.txt.gz differ
diff --git a/libtests/WordBreakTest.txt.gz b/libtests/WordBreakTest.txt.gz
new file mode 100644 (file)
index 0000000..72c0193
Binary files /dev/null and b/libtests/WordBreakTest.txt.gz differ
index 2a199e3..6ab20f5 100644 (file)
 #include "test.h"
+#ifndef SRCDIR
+# define SRCDIR "."
 /** @brief Open a Unicode test file */
 static FILE *open_unicode_test(const char *path) {
-  const char *base;
   FILE *fp;
   char buffer[1024];
-  int w;
-  if((base = strrchr(path, '/')))
-    ++base;
-  else
-    base = path;
-  if(!(fp = fopen(base, "r"))) {
-    snprintf(buffer, sizeof buffer,
-             "wget", path);
-    if((w = system(buffer)))
-      disorder_fatal(0, "%s: %s", buffer, wstat(w));
-    if(chmod(base, 0444) < 0)
-      disorder_fatal(errno, "chmod %s", base);
-    if(!(fp = fopen(base, "r")))
-      disorder_fatal(errno, "%s", base);
-  }
+  snprintf(buffer, sizeof buffer, "gzip -dc " SRCDIR "/%s.gz", path);
+  if(!(fp = popen(buffer, "r")))
+    disorder_fatal(errno, "decompressing %s", path);
   return fp;
+/** @brief Close a Unicode test file */
+static void close_unicode_test(const char *path, FILE *fp)
+  int w;
+  if((w = pclose(fp)))
+    disorder_fatal(0, "decompressing %s: %s", path, wstat(w));
 /** @brief Run breaking tests for utf32_grapheme_boundary() etc */
 static void breaktest(const char *path,
                       int (*breakfn)(const uint32_t *, size_t, size_t)) {
@@ -94,7 +94,7 @@ static void breaktest(const char *path,
-  fclose(fp);
+  close_unicode_test(path, fp);
 /** @brief Tests for @ref lib/unicode.h */
@@ -179,9 +179,9 @@ static void test_unicode(void) {
-  fclose(fp);
-  breaktest("auxiliary/GraphemeBreakTest.txt", utf32_is_grapheme_boundary);
-  breaktest("auxiliary/WordBreakTest.txt", utf32_is_word_boundary);
+  close_unicode_test("NormalizationTest.txt", fp);
+  breaktest("GraphemeBreakTest.txt", utf32_is_grapheme_boundary);
+  breaktest("WordBreakTest.txt", utf32_is_word_boundary);
   insist(utf32_combining_class(0x40000) == 0);
   insist(utf32_combining_class(0xE0000) == 0);