X-Git-Url: http://www.chiark.greenend.org.uk/ucgi/~mdw/git/disorder/blobdiff_plain/99695df91e7db54281bf397baa6989836c4ce9e3..263ed9c13181822dc2f641da2ea0ccb8e949f360:/scripts/make-unidata

diff --git a/scripts/make-unidata b/scripts/make-unidata
index bf4b324..d43533e 100755
--- a/scripts/make-unidata
+++ b/scripts/make-unidata
@@ -3,20 +3,18 @@
 # This file is part of DisOrder.
 # Copyright (C) 2007 Richard Kettlewell
 #
-# This program is free software; you can redistribute it and/or modify
+# This program is free software: you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 2 of the License, or
+# the Free Software Foundation, either version 3 of the License, or
 # (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful, but
-# WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-# General Public License for more details.
-#
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
 # You should have received a copy of the GNU General Public License
-# along with this program; if not, write to the Free Software
-# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
-# USA
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
 #
 #
 # Generate Unicode support tables
@@ -94,7 +92,7 @@ sub input {
     my $path = shift;
     my $lpath = basename($path);
     if(!-e $lpath) {
-	system("wget http://www.unicode.org/Public/5.0.0/ucd/$path");
+	system("wget http://www.unicode.org/Public/5.1.0/ucd/$path");
 	chmod(0444, $lpath) or die "$lpath: $!\n";
     }
     open(STDIN, "<$lpath") or die "$lpath: $!\n";
@@ -105,6 +103,10 @@ sub input {
 # Read the main data file
 input("UnicodeData.txt");
 my ($start, $end);
+my $maxcompat = 0;
+my $maxcanon = 0;
+my $hangul_syllable_decomps = 0;
+my $hangul_choseong_decomps = 0;
 while(<>) {
     my @f = split(/;/, $_);
     my $c = hex($f[0]);		# codepoint
@@ -147,12 +149,26 @@ while(<>) {
 	    "ld" => $ld,
 	};
 	if($dm ne '') {
+	    my $maxref;
 	    if($dm =~ /</) {
 		# This is a compatibility decomposition
 		$dm =~ s/^<.*>\s*//;
 		$d->{compat} = 1;
+		$maxref = \$maxcompat;
+	    } else {
+		$maxref = \$maxcanon;
 	    }
 	    $d->{decomp} = [map(hex($_), split(/\s+/, $dm))];
+	    my $len = scalar @{$d->{decomp}};
+	    $$maxref = $len if $len > $$maxref;
+	    if(!$d->{compat}) {
+		if(${$d->{decomp}}[0] >= 0xAC00 && ${$d->{decomp}}[0] <= 0xD7A3) {
+		    ++$hangul_syllable_decomps;
+		}
+		if(${$d->{decomp}}[0] >= 0x1100 && ${$d->{decomp}}[0] <= 0x115F) {
+		    ++$hangul_choseong_decomps;
+		}
+	    }
 	}
 	$data{$c} = $d;
     }
@@ -320,7 +336,13 @@ while(<>) {
 print STDERR "Generating unidata.h...\n";
 open(STDOUT, ">unidata.h") or die "unidata.h: $!\n";
 
-out("/* Automatically generated file, see scripts/make-unidata */\n",
+out("/** \@file lib/unidata.h\n",
+    " * \@brief Unicode tables\n",
+    " *\n",
+    " * Automatically generated file, see scripts/make-unidata\n",
+    " *\n",
+    " * DO NOT EDIT.\n",
+    " */\n",
     "#ifndef UNIDATA_H\n",
     "#define UNIDATA_H\n");
 
@@ -420,9 +442,14 @@ close STDOUT or die "unidata.h: $!\n";
 print STDERR "Generating unidata.c...\n";
 open(STDOUT, ">unidata.c") or die "unidata.c: $!\n";
 
-out("/* Automatically generated file, see scripts/make-unidata */\n",
-    "#include <config.h>\n",
-    "#include \"types.h\"\n",
+out("/** \@file lib/unidata.c\n",
+    " * \@brief Unicode tables\n",
+    " *\n",
+    " * Automatically generated file, see scripts/make-unidata\n",
+    " *\n",
+    " * DO NOT EDIT.\n",
+    " */\n",
+    "#include \"common.h\"\n",
     "#include \"unidata.h\"\n");
 
 # Short aliases to keep .c file small
@@ -473,9 +500,7 @@ sub dedupe {
     return "dd$ddnums{$s}";
 }
 
-# Generate the decomposition mapping tables.  We look out for duplicates
-# in order to save space and report this as decompsaved at the end.  In
-# Unicode 5.0.0 this saves 1795 entries, which is at least 14Kbytes.
+# Generate the decomposition mapping tables.
 print STDERR "> decomposition mappings\n";
 for(my $c = 0; $c <= $max; ++$c) {
     if(exists $data{$c} && exists $data{$c}->{decomp}) {
@@ -485,7 +510,7 @@ for(my $c = 0; $c <= $max; ++$c) {
 
 print STDERR "> composition mappings\n";
 # First we must generate the mapping of each code point to possible
-# compositions
+# compositions.
 for(my $c = 0; $c <= $max; ++$c) {
     if(exists $data{$c}
        && exists $data{$c}->{decomp}
@@ -495,21 +520,20 @@ for(my $c = 0; $c <= $max; ++$c) {
         # a primary composite.  Find the first code point of the decomposition
 	my $first = ${$data{$c}->{decomp}}[0];
 	if(!exists $data{$first}->{compose}) {
-	    $data{$first}->{compose} = [$first];
+	    $data{$first}->{compose} = [$c];
 	} else {
-	    push(@{$data{$first}->{compose}}, $first);
+	    push(@{$data{$first}->{compose}}, $c);
 	}
     }
 }
+# Then we can generate the tables.
 for(my $c = 0; $c <= $max; ++$c) {
     if(exists $data{$c} && exists $data{$c}->{compose}) {
 	$data{$c}->{compsym} = dedupe(@{$data{$c}->{compose}}, 0);
     }
 }
 
-# ...and the case folding table.  Again we compress equal entries to save
-# space.  In Unicode 5.0.0 this saves 51 entries or at least 408 bytes.
-# This doesns't seem as worthwhile as the decomposition mapping saving above.
+# The case folding table.
 print STDERR "> case-fold mappings\n";
 for(my $c = 0; $c <= $max; ++$c) {
     if(exists $data{$c} && exists $data{$c}->{casefold}) {
@@ -631,3 +655,12 @@ printf STDERR "modulus=%d\n", $modulus;
 printf STDERR "max=%04X\n", $max;
 print STDERR "subtables=$subtablecounter, subtablessaved=$subtablessaved\n";
 print STDERR "ddsaved=$ddsaved\n";
+print STDERR "maxcompat=$maxcompat maxcanon=$maxcanon\n";
+print STDERR "$hangul_syllable_decomps canonical decompositions to Hangul syllables\n";
+print STDERR "$hangul_choseong_decomps canonical decompositions to Hangul Choseong\n";
+
+die "We assumed that canonical decompositions were never more than 2 long!\n"
+    if $maxcanon > 2;
+
+die "We assumed no canonical decompositions to Hangul syllables/Choseong!\n"
+    if $hangul_syllable_decomps || $hangul_choseong_decomps;