# Read the main data file
input("UnicodeData.txt");
my ($start, $end);
+my $maxcompat = 0; # longest compatibility decomposition seen, in code points
+my $maxcanon = 0; # longest canonical decomposition seen, in code points
+my $hangul_syllable_decomps = 0; # canonical decompositions whose first code point is in U+AC00..U+D7A3
+my $hangul_choseong_decomps = 0; # canonical decompositions whose first code point is in U+1100..U+115F
while(<>) {
my @f = split(/;/, $_);
my $c = hex($f[0]); # codepoint
"ld" => $ld,
};
if($dm ne '') {
+ my $maxref; # reference to whichever maximum-length counter applies to this entry
if($dm =~ /</) {
# This is a compatibility decomposition
$dm =~ s/^<.*>\s*//;
$d->{compat} = 1;
+ $maxref = \$maxcompat;
+ } else {
+ $maxref = \$maxcanon;
}
$d->{decomp} = [map(hex($_), split(/\s+/, $dm))];
+ my $len = scalar @{$d->{decomp}}; # number of code points in the decomposition
+ $$maxref = $len if $len > $$maxref; # track the longest decomposition of this kind
+ if(!$d->{compat}) { # canonical decompositions only
+ if(${$d->{decomp}}[0] >= 0xAC00 && ${$d->{decomp}}[0] <= 0xD7A3) { # first code point is a Hangul syllable
+ ++$hangul_syllable_decomps;
+ }
+ if(${$d->{decomp}}[0] >= 0x1100 && ${$d->{decomp}}[0] <= 0x115F) { # first code point is in the Choseong range
+ ++$hangul_choseong_decomps;
+ }
+ }
}
$data{$c} = $d;
}
return "dd$ddnums{$s}";
}
-# Generate the decomposition mapping tables. We look out for duplicates
-# in order to save space and report this as decompsaved at the end. In
-# Unicode 5.0.0 this saves 1795 entries, which is at least 14Kbytes.
+# Generate the decomposition mapping tables.
print STDERR "> decomposition mappings\n";
for(my $c = 0; $c <= $max; ++$c) {
if(exists $data{$c} && exists $data{$c}->{decomp}) {
print STDERR "> composition mappings\n";
# First we must generate the mapping of each code point to possible
-# compositions
+# compositions.
for(my $c = 0; $c <= $max; ++$c) {
if(exists $data{$c}
&& exists $data{$c}->{decomp}
# a primary composite. Find the first code point of the decomposition
my $first = ${$data{$c}->{decomp}}[0];
if(!exists $data{$first}->{compose}) {
- $data{$first}->{compose} = [$first];
+ $data{$first}->{compose} = [$c];
} else {
- push(@{$data{$first}->{compose}}, $first);
+ push(@{$data{$first}->{compose}}, $c);
}
}
}
+# Then we can generate the tables: each code point's compose list, followed by a 0, is passed to dedupe() and the result is stored as compsym.
for(my $c = 0; $c <= $max; ++$c) {
if(exists $data{$c} && exists $data{$c}->{compose}) {
$data{$c}->{compsym} = dedupe(@{$data{$c}->{compose}}, 0);
}
}
-# ...and the case folding table. Again we compress equal entries to save
-# space. In Unicode 5.0.0 this saves 51 entries or at least 408 bytes.
-# This doesns't seem as worthwhile as the decomposition mapping saving above.
+# The case folding table.
print STDERR "> case-fold mappings\n";
for(my $c = 0; $c <= $max; ++$c) {
if(exists $data{$c} && exists $data{$c}->{casefold}) {
printf STDERR "max=%04X\n", $max;
print STDERR "subtables=$subtablecounter, subtablessaved=$subtablessaved\n";
print STDERR "ddsaved=$ddsaved\n";
+print STDERR "maxcompat=$maxcompat maxcanon=$maxcanon\n"; # report the longest decomposition of each kind
+print STDERR "$hangul_syllable_decomps canonical decompositions to Hangul syllables\n";
+print STDERR "$hangul_choseong_decomps canonical decompositions to Hangul Choseong\n";
+
+die "We assumed that canonical decompositions were never more than 2 long!\n"
+ if $maxcanon > 2; # abort if the input data breaks the length assumption
+
+die "We assumed no canonical decompositions to Hangul syllables/Choseong!\n"
+ if $hangul_syllable_decomps || $hangul_choseong_decomps; # abort if either counter is non-zero