| 1 | #! /usr/bin/perl -w |
| 2 | # |
| 3 | # This file is part of DisOrder. |
| 4 | # Copyright (C) 2007 Richard Kettlewell |
| 5 | # |
| 6 | # This program is free software: you can redistribute it and/or modify |
| 7 | # it under the terms of the GNU General Public License as published by |
| 8 | # the Free Software Foundation, either version 3 of the License, or |
| 9 | # (at your option) any later version. |
| 10 | # |
| 11 | # This program is distributed in the hope that it will be useful, |
| 12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| 14 | # GNU General Public License for more details. |
| 15 | # |
| 16 | # You should have received a copy of the GNU General Public License |
| 17 | # along with this program. If not, see <http://www.gnu.org/licenses/>. |
| 18 | # |
| 19 | # |
| 20 | # Generate Unicode support tables |
| 21 | # |
| 22 | # This script will download data from unicode.org if the required files |
| 23 | # aren't in the current directory. |
| 24 | # |
| 25 | # After modifying this script you should run: |
| 26 | # make -C lib rebuild-unicode check |
| 27 | # |
| 28 | # Things not supported yet: |
| 29 | # - SpecialCasing.txt data for case mapping |
| 30 | # - Title case offsets |
| 31 | # - Some kind of hinting for composition |
| 32 | # - ... |
| 33 | # |
| 34 | # NB the generated files DO NOT offer a stable ABI and so are not immediately |
| 35 | # suitable for use in a general-purpose library. Things that would need to |
| 36 | # be done: |
| 37 | # - Hide unidata.h from applications; it will never be ABI- or even API-stable. |
| 38 | # - Stabilized General_Category values |
| 39 | # - Extend the unicode.h API to general utility rather than just what |
| 40 | # DisOrder needs. |
| 41 | # - ... |
| 42 | # |
| 43 | use strict; |
| 44 | use File::Basename; |
| 45 | |
# Print all arguments to the default output handle (the script re-opens
# STDOUT onto each generated file before calling this).  Dies with the OS
# error message if the write fails.
sub out {
  return 1 if print(@_);
  die "$!\n";
}
| 49 | |
# Build a string summarising a hash: the values of %$d, taken in sorted-key
# order and joined with "-".
sub key {
  my $d = shift;
  my @values = ();

  for my $k (sort keys %$d) {
    push(@values, $d->{$k});
  }
  return join("-", @values);
}
| 56 | |
# Size of a subtable
#
# This can be varied to trade off the number of subtables against their size.
# 16 gave the smallest results last time I checked (on a Mac with a 32-bit
# build).  The first command-line argument, if present, overrides it.
our $modulus = 16;

if(@ARGV) {
  $modulus = shift;
  # Reject unusable values immediately.  Without this check a modulus of 0
  # (or a non-number) is only detected by the rounding arithmetic further
  # down, after every input file has already been read.
  die "modulus must be a positive integer, not '$modulus'\n"
    unless $modulus =~ /\A[1-9][0-9]*\z/;
}

# Where to break the table.  There is a huge empty section of the Unicode
# code space and we deal with this by simply leaving it out of the table.
# This complicates the lookup function a little but should not affect
# performance in the cases we care about.
our $break_start = 0x30000;
our $break_end = 0xE0000;

# Similarly we simply omit the very top of the table and sort it out in the
# lookup function.
our $break_top = 0xE0200;

my %cats = ();                  # known general categories
my %data = ();                  # mapping of codepoints to information
my $max = 0;                    # maximum codepoint
my $maxccc = 0;                 # maximum combining class
my $maxud = 0;                  # maximum upper case offset
my $minud = 0;                  # minimum upper case offset
my $maxld = 0;                  # maximum lower case offset
my $minld = 0;                  # minimum lower case offset
| 87 | |
# Make sure we have our desired input files.  We explicitly specify a
# Unicode standard version to make sure that a given version of DisOrder
# supports a given version of Unicode.
#
# input(PATH) re-opens STDIN onto the local copy of PATH, where PATH is
# relative to the UCD directory on unicode.org.  The local copy lives in the
# current directory under PATH's basename; if it does not exist it is
# fetched with wget and made read-only.  Dies if the download, the chmod or
# the open fails.
sub input {
  my $path = shift;
  my $lpath = basename($path);
  if(!-e $lpath) {
    my $url = "http://www.unicode.org/Public/6.0.0/ucd/$path";
    # List-form system() does not involve the shell.  The exit status is
    # checked, and anything wget left behind is removed, so that a failed or
    # partial download is never mistaken for valid data on a later run.
    my $status = system("wget", $url);
    if($status != 0) {
      unlink($lpath);
      die "wget $url failed\n";
    }
    chmod(0444, $lpath) or die "$lpath: $!\n";
  }
  open(STDIN, "<", $lpath) or die "$lpath: $!\n";
  print STDERR "Reading $lpath...\n";
}
| 101 | |
| 102 | |
# Read the main data file
#
# Each record either describes a single code point or, as a pair of
# "<..., First>" / "<..., Last>" records, a range of code points that share
# the properties given on the "Last" record.  Every code point covered gets
# its own entry in %data.
input("UnicodeData.txt");
my ($start, $end);
my $maxcompat = 0;              # longest compatibility decomposition
my $maxcanon = 0;               # longest canonical decomposition
my $hangul_syllable_decomps = 0;
my $hangul_choseong_decomps = 0;
while(<>) {
  my @f = split(/;/, $_);
  my $c = hex($f[0]);           # codepoint
  my $name = $f[1];
  die "$f[0] $name is in the break\n"
    if $c >= $break_start && $c < $break_end;
  my $gc = $f[2];               # General_Category
  $cats{$gc} = 1;               # always record all GCs
  if($name =~ /first>/i) {
    # Start of a range; wait for the matching "Last" record
    $start = $c;
    next;
  } elsif($name =~ /last>/i) {
    $end = $c;
  } else {
    $start = $end = $c;
  }
  # Cn is a GC we don't expect to see in UnicodeData.txt
  die "unexpected Cn" if $gc eq 'Cn';
  my $ccc = $f[3];              # Canonical_Combining_Class
  my $sum = hex($f[12]) || $c;  # Simple_Uppercase_Mapping
  my $slm = hex($f[13]) || $c;  # Simple_Lowercase_Mapping
  # recalculate the upper/lower case mappings as offsets
  my $ud = $sum - $c;
  my $ld = $slm - $c;
  # update bounds on various values
  $maxccc = $ccc if $ccc > $maxccc; # assumed never to be -ve
  $minud = $ud if $ud < $minud;
  $maxud = $ud if $ud > $maxud;
  $minld = $ld if $ld < $minld;
  $maxld = $ld if $ld > $maxld;

  # Parse Decomposition_Type + Decomposition_Mapping once per record.  This
  # must not be done inside the per-code-point loop below: stripping the
  # "<type>" tag there would make every code point after the first in a
  # range look as if it had a canonical decomposition.
  my $dm = $f[5];
  my $has_decomp = ($dm ne '');
  my $compat = 0;
  my @decomp = ();
  if($has_decomp) {
    if($dm =~ /</) {
      # This is a compatibility decomposition
      $dm =~ s/^<.*>\s*//;
      $compat = 1;
    }
    @decomp = map(hex($_), split(/\s+/, $dm));
    my $maxref = $compat ? \$maxcompat : \$maxcanon;
    my $len = scalar @decomp;
    $$maxref = $len if $len > $$maxref;
  }

  if($start != $end) {
    printf STDERR "> range %04X-%04X is %s\n", $start, $end, $gc;
  }
  for my $cp ($start .. $end) {
    my $d = {
      "gc" => $gc,
      "ccc" => $ccc,
      "ud" => $ud,
      "ld" => $ld,
    };
    if($has_decomp) {
      $d->{compat} = 1 if $compat;
      $d->{decomp} = [@decomp];  # private copy for each code point
      if(!$compat) {
        # Count canonical decompositions that start with a Hangul syllable
        # or Choseong; the script checks at the very end that there are none.
        if($decomp[0] >= 0xAC00 && $decomp[0] <= 0xD7A3) {
          ++$hangul_syllable_decomps;
        }
        if($decomp[0] >= 0x1100 && $decomp[0] <= 0x115F) {
          ++$hangul_choseong_decomps;
        }
      }
    }
    $data{$cp} = $d;
  }
  $max = $end if $end > $max;
}
| 178 | |
# Load a property file into %data.
#
# $path is the file to read (passed to input()) and $propkey is the key
# under which each code point's value is stored.  After "#" comments are
# stripped, each remaining line is "CODEPOINT ; VALUE" or
# "FIRST..LAST ; VALUE".
sub read_prop_with_ranges {
  my ($path, $propkey) = @_;
  input($path);
  while(<>) {
    chomp;
    s/\s*\#.*//;
    next if $_ eq '';
    my ($range, $propval) = split(/\s*;\s*/, $_);
    # Treat a single code point as a range of length one
    my ($first, $last);
    if($range =~ /(.*)\.\.(.*)/) {
      ($first, $last) = (hex($1), hex($2));
    } else {
      $first = $last = hex($range);
    }
    for my $c ($first .. $last) {
      $data{$c}->{$propkey} = $propval;
    }
  }
}
| 198 | |
# Grapheme_Break etc
read_prop_with_ranges("auxiliary/GraphemeBreakProperty.txt", "gbreak");
read_prop_with_ranges("auxiliary/WordBreakProperty.txt", "wbreak");
read_prop_with_ranges("auxiliary/SentenceBreakProperty.txt", "sbreak");

# Compute the full list of values in use for each property, filling in
# defaults as we go.  A code point with no Grapheme_Break value gets 'Other'.
# A code point with no Word_Break or Sentence_Break value gets 'Extend' if
# its Grapheme_Break is 'Extend' and 'Other' otherwise.
my %gbreak = ();
my %wbreak = ();
my %sbreak = ();
for my $d (values %data) {
  $d->{gbreak} = 'Other' if !exists $d->{gbreak};
  $gbreak{$d->{gbreak}} = 1;

  my $fallback = $d->{gbreak} eq 'Extend' ? 'Extend' : 'Other';

  $d->{wbreak} = $fallback if !exists $d->{wbreak};
  $wbreak{$d->{wbreak}} = 1;

  $d->{sbreak} = $fallback if !exists $d->{sbreak};
  $sbreak{$d->{sbreak}} = 1;
}
| 232 | |
# Various derived properties
#
# After comments are stripped each line is "RANGE ; PROPERTY" or
# "RANGE ; PROPERTY ; VALUE"; the two-field form is stored with value 1.
input("DerivedNormalizationProps.txt");
while(<>) {
  chomp;
  s/\s*\#.*//;
  next if $_ eq '';
  my @f = split(/\s*;\s*/, $_);
  my ($range, $propkey, $propval) = @f;
  $propval = 1 if @f == 2;
  # Treat a single code point as a range of length one
  my ($first, $last);
  if($range =~ /(.*)\.\.(.*)/) {
    ($first, $last) = (hex($1), hex($2));
  } else {
    $first = $last = hex($range);
  }
  for my $c ($first .. $last) {
    $data{$c}->{$propkey} = $propval;
  }
}
| 253 | |
# Round up the maximum value to a whole number of subtables
$max = $max - ($max % $modulus) + ($modulus - 1);

# Private use characters
# We only fill in values up to $max.  NOTE(review): the original comment
# mentions utf32__unidata() here but is cut short; presumably that lookup
# function deals with anything above $max - confirm.
# All private use code points share this one hash.
my $Co = {
  "gc" => "Co",
  "ccc" => 0,
  "ud" => 0,
  "ld" => 0
};
for my $range ([0xE000, 0xF8FF], [0xF0000, 0xFFFFD], [0x100000, 0x10FFFD]) {
  my ($first, $last) = @$range;
  $last = $max if $last > $max;
  for my $c ($first .. $last) {
    $data{$c} = $Co;
  }
}

# Anything left is not assigned; all such code points share this one hash.
my $Cn = {
  "gc" => "Cn",                 # not assigned
  "ccc" => 0,
  "ud" => 0,
  "ld" => 0
};
for my $c (0 .. $max) {
  $data{$c} = $Cn if !exists $data{$c};
  # Every entry must end up with all three *_Break properties
  for my $prop (qw(wbreak gbreak sbreak)) {
    $data{$c}->{$prop} = 'Other' if !exists $data{$c}->{$prop};
  }
}
$cats{'Cn'} = 1;
| 297 | |
# Read the casefolding data too
input("CaseFolding.txt");
while(<>) {
  chomp;
  next if /^\#/ or $_ eq '';
  my @f = split(/\s*;\s*/, $_);
  my ($code, $status, $mapping) = @f;
  # Full case folding means use status C and F.
  # We discard status T, Turkish users may wish to change this.
  next unless $status eq 'C' || $status eq 'F';
  my $c = hex($code);
  $data{$c}->{casefold} = $mapping;
  # We are particularly interested in combining characters that
  # case-fold to non-combining characters, or characters that
  # case-fold to sequences with combining characters in non-initial
  # positions, as these require decomposition before case-folding.
  my @folded = map(hex($_), split(/\s+/, $data{$c}->{casefold}));
  my $normalize_first;
  if($data{$c}->{ccc} != 0) {
    # This is a combining character.  Flag it if the first character of
    # its case-folded form is NOT a combining character.
    $normalize_first = ($data{$folded[0]}->{ccc} == 0);
  } else {
    # This is a non-combining character.  Flag it if some non-initial code
    # point of the case-folded form IS a combining character.
    my @rest = @folded[1 .. $#folded];
    $normalize_first = grep($data{$_}->{ccc} != 0, @rest);
  }
  # The field name is the example explicitly mentioned in the spec.
  $data{$c}->{ypogegrammeni} = 1 if $normalize_first;
}
| 334 | |
# Generate the header file
print STDERR "Generating unidata.h...\n";
open(STDOUT, ">unidata.h") or die "unidata.h: $!\n";

# File comment and include guard
out(map("$_\n",
        '/** @file lib/unidata.h',
        ' * @brief Unicode tables',
        ' *',
        ' * Automatically generated file, see scripts/make-unidata',
        ' *',
        ' * DO NOT EDIT.',
        ' */',
        '#ifndef UNIDATA_H',
        '#define UNIDATA_H'));

# One enum per property, with members in sorted order.  The *_Break enums
# are each followed by the declaration of a matching table of names;
# General_Category has no such table.
# TODO choose stable values for General_Category
{
  my @enums = (
    ["General_Category", \%cats, 0],
    ["Grapheme_Break", \%gbreak, 1],
    ["Word_Break", \%wbreak, 1],
    ["Sentence_Break", \%sbreak, 1],
  );
  for my $enum (@enums) {
    my ($prop, $values, $has_names) = @$enum;
    out("enum unicode_${prop} {\n",
        join(",\n", map(" unicode_${prop}_$_", sort keys %$values)),
        "\n};\n");
    if($has_names) {
      out("extern const char *const unicode_${prop}_names[];\n");
    }
  }
}

# Bits for the flags member of struct unidata
out("enum unicode_flags {\n",
    " unicode_normalize_before_casefold = 1,\n",
    " unicode_compatibility_decomposition = 2\n",
    "};\n",
    "\n");
| 377 | |
# Choose the narrowest type that will fit the required values
#
# choosetype(MIN, MAX) returns the name of a C integer type able to hold
# every value from MIN to MAX inclusive.
#
# Non-negative ranges use "char" up to 127 (which fits whether plain char is
# signed or unsigned), then "unsigned char", "int16_t", "uint16_t" and
# finally "int32_t".  Ranges that include negative values always get an
# explicitly signed type, because the signedness of plain char is
# implementation-defined in C.
sub choosetype {
  my ($min, $max) = @_;
  if($min >= 0) {
    return "char" if $max <= 127;
    return "unsigned char" if $max <= 255;
    # The limits themselves are representable, so compare with <=
    return "int16_t" if $max <= 32767;
    return "uint16_t" if $max <= 65535;
    return "int32_t";
  } else {
    return "signed char" if $min >= -127 && $max <= 127;
    return "int16_t" if $min >= -32767 && $max <= 32767;
    return "int32_t";
  }
}
| 393 | |
# The per-code-point record.  Members are listed here without the leading
# space and trailing ";\n" that are added when the struct is written out.
{
  my @members = (
    # decomposition (canonical or compatibility;
    # unicode_compatibility_decomposition distinguishes) or NULL
    "const uint32_t *decomp",

    # case-folded string or NULL
    "const uint32_t *casefold",

    # composed characters that start with this code point.  This only
    # includes primary composites, i.e. the decomposition mapping is
    # canonical and this code point is not in the exclusion table.
    "const uint32_t *composed",

    # (upper_offset and lower_offset members, typed with
    # choosetype($minud, $maxud) and choosetype($minld, $maxld), are
    # currently disabled)

    # canonical combining class
    choosetype(0, $maxccc)." ccc",
    "char general_category",

    # see unicode_flags enum
    "uint8_t flags",
    "char grapheme_break",
    "char word_break",
    "char sentence_break",
  );
  out("struct unidata {\n",
      map(" $_;\n", @members),
      "};\n");
}
# decomp and casefold do have non-BMP characters, so we
# can't use a simple 16-bit table.  We could use UTF-8 or UTF-16
# though, saving a bit of space (probably not that much...) at the
# cost of marginally reduced performance and additional complexity

out("extern const struct unidata *const unidata[];\n");

out("extern const struct unicode_utf8_row {\n",
    " uint8_t count;\n",
    " uint8_t min2, max2;\n",
    "} unicode_utf8_valid[];\n");

# Table geometry, as needed by whatever indexes unidata[]
for my $def (["NCHARS", $max + 1],
             ["MODULUS", $modulus],
             ["BREAK_START", $break_start],
             ["BREAK_END", $break_end],
             ["BREAK_TOP", $break_top]) {
  my ($suffix, $value) = @$def;
  out("#define UNICODE_${suffix} ${value}\n");
}

out("#endif\n");

close(STDOUT) or die "unidata.h: $!\n";
| 441 | |
print STDERR "Generating unidata.c...\n";
open(STDOUT, ">unidata.c") or die "unidata.c: $!\n";

# File comment and includes
out(map("$_\n",
        '/** @file lib/unidata.c',
        ' * @brief Unicode tables',
        ' *',
        ' * Automatically generated file, see scripts/make-unidata',
        ' *',
        ' * DO NOT EDIT.',
        ' */',
        '#include "common.h"',
        '#include "unidata.h"'));

{
  # [alias prefix, property name, set of values]
  my @breaks = (
    ["GB", "Grapheme_Break", \%gbreak],
    ["WB", "Word_Break", \%wbreak],
    ["SB", "Sentence_Break", \%sbreak],
  );

  # Short aliases to keep .c file small.  General categories are aliased
  # under their bare names, the *_Break values under a two-letter prefix.
  for my $alias (["", "General_Category", \%cats], @breaks) {
    my ($prefix, $prop, $values) = @$alias;
    out(map("#define ${prefix}$_ unicode_${prop}_$_\n", sort keys %$values));
  }
  out("#define NBC unicode_normalize_before_casefold\n");
  out("#define CD unicode_compatibility_decomposition\n");

  # Names for *_Break properties, in the same (sorted) order as the enums
  for my $break (@breaks) {
    my (undef, $prop, $values) = @$break;
    out("const char *const unicode_${prop}_names[] = {\n",
        join(",\n", map(" \"$_\"", sort keys %$values)),
        "\n};\n");
  }
}
| 481 | |
our $ddnum = 0;                 # number of the next array to be emitted
our $ddsaved = 0;               # number of duplicate arrays avoided
our %ddnums = ();               # array contents -> array number
my $ddfirst = 1;                # true until the first array is emitted

# All the arrays are emitted as a single comma-separated declaration, which
# is opened here and closed once the last array has been generated.
out("static const uint32_t ");

# dedupe(VALUES...) returns the name ("ddN") of a C array containing exactly
# VALUES.  The array definition is written out the first time a particular
# sequence of values is seen; later requests for the same sequence reuse the
# existing name and are counted in $ddsaved.
sub dedupe {
  my $s = join(",", @_);
  if(exists $ddnums{$s}) {
    ++$ddsaved;
    return "dd$ddnums{$s}";
  }
  # Separate this declarator from the previous one, if there was one
  out(",\n") unless $ddfirst;
  $ddfirst = 0;
  out("dd$ddnum\[]={$s}");
  $ddnums{$s} = $ddnum++;
  return "dd$ddnums{$s}";
}
| 502 | |
# Generate the decomposition mapping tables.  Each array is terminated
# with a 0.
print STDERR "> decomposition mappings\n";
for my $c (0 .. $max) {
  next unless exists $data{$c} && exists $data{$c}->{decomp};
  $data{$c}->{decompsym} = dedupe(@{$data{$c}->{decomp}}, 0);
}

print STDERR "> composition mappings\n";
# First we must generate the mapping of each code point to possible
# compositions.
for my $c (0 .. $max) {
  next unless exists $data{$c};
  my $d = $data{$c};
  # Only primary composites count: $c must have a decomposition, it must
  # be canonical, and $c must not be excluded from composition.
  next unless exists $d->{decomp};
  next if exists $d->{compat};
  next if $d->{Full_Composition_Exclusion};
  # File $c under the first code point of its decomposition
  my $first = $d->{decomp}->[0];
  push(@{$data{$first}->{compose}}, $c);
}
# Then we can generate the tables.
for my $c (0 .. $max) {
  next unless exists $data{$c} && exists $data{$c}->{compose};
  $data{$c}->{compsym} = dedupe(@{$data{$c}->{compose}}, 0);
}

# The case folding table.
print STDERR "> case-fold mappings\n";
for my $c (0 .. $max) {
  next unless exists $data{$c} && exists $data{$c}->{casefold};
  my @folded = map(hex($_), split(/\s+/, $data{$c}->{casefold}));
  $data{$c}->{cfsym} = dedupe(@folded, 0);
}

# End of de-dupable arrays
out(";\n");
| 548 | |
# Visit all the $modulus-character blocks in turn and generate the
# required subtables.  As above we spot duplicates to save space.  In
# Unicode 5.0.0 with $modulus=128 and current table data this saves
# 1372 subtables or at least three and a half megabytes on 32-bit
# platforms.
print STDERR "> subtables\n";
my %subtable = ();              # subtable content -> subtable number
my %subtableno = ();            # base code point -> subtable number
my $subtablecounter = 0;        # counter for subtable numbers
my $subtablessaved = 0;         # number of tables saved
for(my $base = 0; $base <= $max; $base += $modulus) {
  # Blocks inside the break, or at or above the top, get no subtable
  next if ($base >= $break_start && $base < $break_end)
    || $base >= $break_top;
  my @rows = ();
  for my $c ($base .. $base + $modulus - 1) {
    my $d = $data{$c};
    # Absent values become 0 (or Cn for the general category)
    my $decompsym = $d->{decompsym} || "0";
    my $cfsym = $d->{cfsym} || "0";
    my $compsym = $d->{compsym} || "0";
    my $ccc = $d->{ccc} || "0";
    my $gc = $d->{gc} || "Cn";
    my @flags = ();
    push(@flags, "NBC") if $data{$c}->{ypogegrammeni};
    push(@flags, "CD") if $data{$c}->{compat};
    my $flags = @flags ? join("|", @flags) : 0;
    # Initializers appear in the order the members of struct unidata are
    # written to unidata.h ($d->{ud} and $d->{ld} are not emitted).
    my @fields = (
      $decompsym,
      $cfsym,
      $compsym,
      $ccc,
      $gc,
      $flags,
      "GB$d->{gbreak}",
      "WB$d->{wbreak}",
      "SB$d->{sbreak}",
    );
    push(@rows, "{".join(",", @fields)."}");
  }
  my $t = join(",\n", @rows);
  if(exists $subtable{$t}) {
    # Identical to a subtable we have already written out
    ++$subtablessaved;
  } else {
    out(sprintf("/* %04X-%04X */\n", $base, $base + $modulus - 1));
    out("static const struct unidata st$subtablecounter\[] = {\n",
        "$t\n",
        "};\n");
    $subtable{$t} = $subtablecounter++;
  }
  $subtableno{$base} = $subtable{$t};
}
| 605 | |
# The top-level table: one pointer per block that has a subtable, in code
# point order.  Blocks inside the break or at/above the top are left out.
print STDERR "> main table\n";
out("const struct unidata *const unidata[]={\n");
for(my $base = 0; $base <= $max; $base += $modulus) {
  my $in_break = ($base >= $break_start && $base < $break_end);
  my $above_top = ($base >= $break_top);
  out("st$subtableno{$base},\n") unless $in_break || $above_top;
}
out("};\n");
| 615 | |
# One row per possible first byte of a UTF-8 sequence, in byte order, each
# annotated with the byte value in decimal.  The row initializers fill the
# count, min2 and max2 members of struct unicode_utf8_row.
# NOTE(review): count looks like the sequence length (0 for bytes that
# cannot start a sequence) and min2..max2 the permitted range of the second
# byte - confirm against the code that consumes the table.
print STDERR "> UTF-8 table\n";
out("const struct unicode_utf8_row unicode_utf8_valid[] = {\n");
{
  # [first byte, last byte, initializer shared by every byte in the range]
  my @ranges = (
    [0x00, 0x7F, "1, 0, 0"],
    [0x80, 0xC1, "0, 0, 0"],
    [0xC2, 0xDF, "2, 0x80, 0xBF"],
    [0xE0, 0xE0, "3, 0xA0, 0xBF"],
    [0xE1, 0xEC, "3, 0x80, 0xBF"],
    [0xED, 0xED, "3, 0x80, 0x9F"],
    [0xEE, 0xEF, "3, 0x80, 0xBF"],
    [0xF0, 0xF0, "4, 0x90, 0xBF"],
    [0xF1, 0xF3, "4, 0x80, 0xBF"],
    [0xF4, 0xF4, "4, 0x80, 0x8F"],
    [0xF5, 0xFF, "0, 0, 0"],
  );
  for my $range (@ranges) {
    my ($first, $last, $init) = @$range;
    for my $c ($first .. $last) {
      out(" { ${init} }, /* ${c} */\n");
    }
  }
}
out("};\n");
| 652 | |
close(STDOUT) or die "unidata.c: $!\n";

# Report some statistics about what was generated
print STDERR
  "Done.\n\n",
  sprintf("modulus=%d\n", $modulus),
  sprintf("max=%04X\n", $max),
  "subtables=$subtablecounter, subtablessaved=$subtablessaved\n",
  "ddsaved=$ddsaved\n",
  "maxcompat=$maxcompat maxcanon=$maxcanon\n",
  "$hangul_syllable_decomps canonical decompositions to Hangul syllables\n",
  "$hangul_choseong_decomps canonical decompositions to Hangul Choseong\n";

# Finally check the assumptions about the data that are stated in the
# messages below.  Note that both output files have already been written by
# the time these checks run.
if($maxcanon > 2) {
  die "We assumed that canonical decompositions were never more than 2 long!\n";
}

if($hangul_syllable_decomps || $hangul_choseong_decomps) {
  die "We assumed no canonical decompositions to Hangul syllables/Choseong!\n";
}