X-Git-Url: http://www.chiark.greenend.org.uk/ucgi/~mdw/git/disorder/blobdiff_plain/99695df91e7db54281bf397baa6989836c4ce9e3..8b15f02d9c4041d45a310dce13f67b5a8bf11ca8:/scripts/make-unidata

diff --git a/scripts/make-unidata b/scripts/make-unidata
index bf4b324..5611b0d 100755
--- a/scripts/make-unidata
+++ b/scripts/make-unidata
@@ -105,6 +105,10 @@ sub input {
 # Read the main data file
 input("UnicodeData.txt");
 my ($start, $end);
+my $maxcompat = 0;
+my $maxcanon = 0;
+my $hangul_syllable_decomps = 0;
+my $hangul_choseong_decomps = 0;
 while(<>) {
     my @f = split(/;/, $_);
     my $c = hex($f[0]);        # codepoint
@@ -147,12 +151,26 @@ while(<>) {
                  "ld" => $ld,
                 };
         if($dm ne '') {
+            my $maxref;
             if($dm =~ /</) {
                 # This is a compatibility decomposition
                 $dm =~ s/^<.*>\s*//;
                 $d->{compat} = 1;
+                $maxref = \$maxcompat;
+            } else {
+                $maxref = \$maxcanon;
             }
             $d->{decomp} = [map(hex($_), split(/\s+/, $dm))];
+            my $len = scalar @{$d->{decomp}};
+            $$maxref = $len if $len > $$maxref;
+            if(!$d->{compat}) {
+                if(${$d->{decomp}}[0] >= 0xAC00 && ${$d->{decomp}}[0] <= 0xD7A3) {
+                    ++$hangul_syllable_decomps;
+                }
+                if(${$d->{decomp}}[0] >= 0x1100 && ${$d->{decomp}}[0] <= 0x115F) {
+                    ++$hangul_choseong_decomps;
+                }
+            }
         }
         $data{$c} = $d;
     }
@@ -421,8 +439,7 @@ print STDERR "Generating unidata.c...\n";
 open(STDOUT, ">unidata.c") or die "unidata.c: $!\n";
 
 out("/* Automatically generated file, see scripts/make-unidata */\n",
-    "#include <config.h>\n",
-    "#include \"types.h\"\n",
+    "#include \"common.h\"\n",
     "#include \"unidata.h\"\n");
 
 # Short aliases to keep .c file small
@@ -473,9 +490,7 @@ sub dedupe {
     return "dd$ddnums{$s}";
 }
 
-# Generate the decomposition mapping tables.  We look out for duplicates
-# in order to save space and report this as decompsaved at the end.  In
-# Unicode 5.0.0 this saves 1795 entries, which is at least 14Kbytes.
+# Generate the decomposition mapping tables.
 print STDERR "> decomposition mappings\n";
 for(my $c = 0; $c <= $max; ++$c) {
     if(exists $data{$c} && exists $data{$c}->{decomp}) {
@@ -485,7 +500,7 @@ for(my $c = 0; $c <= $max; ++$c) {
 
 print STDERR "> composition mappings\n";
 # First we must generate the mapping of each code point to possible
-# compositions
+# compositions.
 for(my $c = 0; $c <= $max; ++$c) {
     if(exists $data{$c}
        && exists $data{$c}->{decomp}
@@ -495,21 +510,20 @@ for(my $c = 0; $c <= $max; ++$c) {
         # a primary composite.  Find the first code point of the decomposition
         my $first = ${$data{$c}->{decomp}}[0];
         if(!exists $data{$first}->{compose}) {
-            $data{$first}->{compose} = [$first];
+            $data{$first}->{compose} = [$c];
         } else {
-            push(@{$data{$first}->{compose}}, $first);
+            push(@{$data{$first}->{compose}}, $c);
         }
     }
 }
+# Then we can generate the tables.
 for(my $c = 0; $c <= $max; ++$c) {
     if(exists $data{$c} && exists $data{$c}->{compose}) {
         $data{$c}->{compsym} = dedupe(@{$data{$c}->{compose}}, 0);
     }
 }
 
-# ...and the case folding table.  Again we compress equal entries to save
-# space.  In Unicode 5.0.0 this saves 51 entries or at least 408 bytes.
-# This doesns't seem as worthwhile as the decomposition mapping saving above.
+# The case folding table.
 print STDERR "> case-fold mappings\n";
 for(my $c = 0; $c <= $max; ++$c) {
     if(exists $data{$c} && exists $data{$c}->{casefold}) {
@@ -631,3 +645,12 @@ printf STDERR "modulus=%d\n", $modulus;
 printf STDERR "max=%04X\n", $max;
 print STDERR "subtables=$subtablecounter, subtablessaved=$subtablessaved\n";
 print STDERR "ddsaved=$ddsaved\n";
+print STDERR "maxcompat=$maxcompat maxcanon=$maxcanon\n";
+print STDERR "$hangul_syllable_decomps canonical decompositions to Hangul syllables\n";
+print STDERR "$hangul_choseong_decomps canonical decompositions to Hangul Choseong\n";
+
+die "We assumed that canonical decompositions were never more than 2 long!\n"
+    if $maxcanon > 2;
+
+die "We assumed no canonical decompositions to Hangul syllables/Choseong!\n"
+    if $hangul_syllable_decomps || $hangul_choseong_decomps;
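
For context on the two new sanity checks: UnicodeData.txt deliberately carries no decomposition mappings for the precomposed Hangul syllable block; those decompositions are defined arithmetically by the Unicode standard (chapter 3, "Hangul Syllable Decomposition"). The counters added above verify that no canonical decomposition in the data file begins with a precomposed Hangul syllable or a Choseong jamo, and the maxcanon check verifies that table-driven canonical decompositions never exceed 2 code points, so code consuming the generated tables can presumably handle Hangul purely arithmetically. Below is a minimal Perl sketch of that standard arithmetic; the constants come from the Unicode standard, not from this patch, and hangul_decompose is an illustrative name, not a function in make-unidata.

#!/usr/bin/perl -w
use strict;

# Constants from the Unicode standard's Hangul decomposition algorithm.
my ($SBase, $LBase, $VBase, $TBase) = (0xAC00, 0x1100, 0x1161, 0x11A7);
my ($LCount, $VCount, $TCount) = (19, 21, 28);
my $NCount = $VCount * $TCount;         # 588 LV/LVT combinations per L
my $SCount = $LCount * $NCount;         # 11172 precomposed syllables

# Full canonical decomposition of a precomposed Hangul syllable into
# two or three conjoining jamo; other code points pass through unchanged.
sub hangul_decompose {
    my $c = shift;
    my $SIndex = $c - $SBase;
    return ($c) if $SIndex < 0 || $SIndex >= $SCount;
    my $L = $LBase + int($SIndex / $NCount);
    my $V = $VBase + int($SIndex % $NCount / $TCount);
    my $T = $TBase + $SIndex % $TCount;
    return $T == $TBase ? ($L, $V) : ($L, $V, $T);
}

# U+AC01 HANGUL SYLLABLE GAG decomposes to 1100 1161 11A8.
print join(" ", map(sprintf("%04X", $_), hangul_decompose(0xAC01))), "\n";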