X-Git-Url: http://www.chiark.greenend.org.uk/ucgi/~mdw/git/disorder/blobdiff_plain/99695df91e7db54281bf397baa6989836c4ce9e3..263ed9c13181822dc2f641da2ea0ccb8e949f360:/scripts/make-unidata diff --git a/scripts/make-unidata b/scripts/make-unidata index bf4b324..d43533e 100755 --- a/scripts/make-unidata +++ b/scripts/make-unidata @@ -3,20 +3,18 @@ # This file is part of DisOrder. # Copyright (C) 2007 Richard Kettlewell # -# This program is free software; you can redistribute it and/or modify +# This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 2 of the License, or +# the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, but -# WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -# General Public License for more details. -# +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# # You should have received a copy of the GNU General Public License -# along with this program; if not, write to the Free Software -# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 -# USA +# along with this program. If not, see <http://www.gnu.org/licenses/>. 
# # # Generate Unicode support tables @@ -94,7 +92,7 @@ sub input { my $path = shift; my $lpath = basename($path); if(!-e $lpath) { - system("wget http://www.unicode.org/Public/5.0.0/ucd/$path"); + system("wget http://www.unicode.org/Public/5.1.0/ucd/$path"); chmod(0444, $lpath) or die "$lpath: $!\n"; } open(STDIN, "<$lpath") or die "$lpath: $!\n"; @@ -105,6 +103,10 @@ sub input { # Read the main data file input("UnicodeData.txt"); my ($start, $end); +my $maxcompat = 0; +my $maxcanon = 0; +my $hangul_syllable_decomps = 0; +my $hangul_choseong_decomps = 0; while(<>) { my @f = split(/;/, $_); my $c = hex($f[0]); # codepoint @@ -147,12 +149,26 @@ while(<>) { "ld" => $ld, }; if($dm ne '') { + my $maxref; if($dm =~ /\s*//; $d->{compat} = 1; + $maxref = \$maxcompat; + } else { + $maxref = \$maxcanon; } $d->{decomp} = [map(hex($_), split(/\s+/, $dm))]; + my $len = scalar @{$d->{decomp}}; + $$maxref = $len if $len > $$maxref; + if(!$d->{compat}) { + if(${$d->{decomp}}[0] >= 0xAC00 && ${$d->{decomp}}[0] <= 0xD7A3) { + ++$hangul_syllable_decomps; + } + if(${$d->{decomp}}[0] >= 0x1100 && ${$d->{decomp}}[0] <= 0x115F) { + ++$hangul_choseong_decomps; + } + } } $data{$c} = $d; } @@ -320,7 +336,13 @@ while(<>) { print STDERR "Generating unidata.h...\n"; open(STDOUT, ">unidata.h") or die "unidata.h: $!\n"; -out("/* Automatically generated file, see scripts/make-unidata */\n", +out("/** \@file lib/unidata.h\n", + " * \@brief Unicode tables\n", + " *\n", + " * Automatically generated file, see scripts/make-unidata\n", + " *\n", + " * DO NOT EDIT.\n", + " */\n", "#ifndef UNIDATA_H\n", "#define UNIDATA_H\n"); @@ -420,9 +442,14 @@ close STDOUT or die "unidata.h: $!\n"; print STDERR "Generating unidata.c...\n"; open(STDOUT, ">unidata.c") or die "unidata.c: $!\n"; -out("/* Automatically generated file, see scripts/make-unidata */\n", - "#include \n", - "#include \"types.h\"\n", +out("/** \@file lib/unidata.c\n", + " * \@brief Unicode tables\n", + " *\n", + " * Automatically generated 
file, see scripts/make-unidata\n", + " *\n", + " * DO NOT EDIT.\n", + " */\n", + "#include \"common.h\"\n", "#include \"unidata.h\"\n"); # Short aliases to keep .c file small @@ -473,9 +500,7 @@ sub dedupe { return "dd$ddnums{$s}"; } -# Generate the decomposition mapping tables. We look out for duplicates -# in order to save space and report this as decompsaved at the end. In -# Unicode 5.0.0 this saves 1795 entries, which is at least 14Kbytes. +# Generate the decomposition mapping tables. print STDERR "> decomposition mappings\n"; for(my $c = 0; $c <= $max; ++$c) { if(exists $data{$c} && exists $data{$c}->{decomp}) { @@ -485,7 +510,7 @@ for(my $c = 0; $c <= $max; ++$c) { print STDERR "> composition mappings\n"; # First we must generate the mapping of each code point to possible -# compositions +# compositions. for(my $c = 0; $c <= $max; ++$c) { if(exists $data{$c} && exists $data{$c}->{decomp} @@ -495,21 +520,20 @@ for(my $c = 0; $c <= $max; ++$c) { # a primary composite. Find the first code point of the decomposition my $first = ${$data{$c}->{decomp}}[0]; if(!exists $data{$first}->{compose}) { - $data{$first}->{compose} = [$first]; + $data{$first}->{compose} = [$c]; } else { - push(@{$data{$first}->{compose}}, $first); + push(@{$data{$first}->{compose}}, $c); } } } +# Then we can generate the tables. for(my $c = 0; $c <= $max; ++$c) { if(exists $data{$c} && exists $data{$c}->{compose}) { $data{$c}->{compsym} = dedupe(@{$data{$c}->{compose}}, 0); } } -# ...and the case folding table. Again we compress equal entries to save -# space. In Unicode 5.0.0 this saves 51 entries or at least 408 bytes. -# This doesns't seem as worthwhile as the decomposition mapping saving above. +# The case folding table. 
print STDERR "> case-fold mappings\n"; for(my $c = 0; $c <= $max; ++$c) { if(exists $data{$c} && exists $data{$c}->{casefold}) { @@ -631,3 +655,12 @@ printf STDERR "modulus=%d\n", $modulus; printf STDERR "max=%04X\n", $max; print STDERR "subtables=$subtablecounter, subtablessaved=$subtablessaved\n"; print STDERR "ddsaved=$ddsaved\n"; +print STDERR "maxcompat=$maxcompat maxcanon=$maxcanon\n"; +print STDERR "$hangul_syllable_decomps canonical decompositions to Hangul syllables\n"; +print STDERR "$hangul_choseong_decomps canonical decompositions to Hangul Choseong\n"; + +die "We assumed that canonical decompositions were never more than 2 long!\n" + if $maxcanon > 2; + +die "We assumed no canonical decompositions to Hangul syllables/Choseong!\n" + if $hangul_syllable_decomps || $hangul_choseong_decomps;