chiark - git - mdw - disorder/blob - scripts/make-unidata

   1 #! /usr/bin/perl -w
   2 #
   3 # This file is part of DisOrder.
   4 # Copyright (C) 2007 Richard Kettlewell
   5 #
   6 # This program is free software; you can redistribute it and/or modify
   7 # it under the terms of the GNU General Public License as published by
   8 # the Free Software Foundation; either version 2 of the License, or
   9 # (at your option) any later version.
  10 #
  11 # This program is distributed in the hope that it will be useful, but
  12 # WITHOUT ANY WARRANTY; without even the implied warranty of
  13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14 # General Public License for more details.
  15 #
  16 # You should have received a copy of the GNU General Public License
  17 # along with this program; if not, write to the Free Software
  18 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
  19 # USA
  20 #
  21 #
  22 # Generate Unicode support tables
  23 #
  24 # This script will download data from unicode.org if the required files
  25 # aren't in the current directory.
  26 #
  27 # After modifying this script you should run:
  28 #  make -C lib rebuild-unicode check
  29 #
  30 # Things not supported yet:
  31 #  - SpecialCasing.txt data for case mapping
  32 #  - Title case offsets
  33 #  - Some kind of hinting for composition
  34 #  - ...
  35 #
  36 # NB the generated files DO NOT offer a stable ABI and so are not immediately
  37 # suitable for use in a general-purpose library.  Things that would need to
  38 # be done:
  39 #  - Hide unidata.h from applications; it will never be ABI- or even API-stable.
  40 #  - Stabilized General_Category values
  41 #  - Extend the unicode.h API to general utility rather than just what
  42 #    DisOrder needs.
  43 #  - ...
  44 #
  45 use strict;
  46 use File::Basename;
  47
  # Print the arguments to the currently selected output handle, dying
  # with the system error text if the write fails.
  sub out {
      my $ok = print @_;
      die "$!\n" unless $ok;
      return $ok;
  }
  51
  # Build a string from a hash reference: its values, taken in sorted key
  # order, joined with "-".
  sub key {
      my ($d) = @_;
      my @values;
      for my $k (sort keys %$d) {
          push(@values, $d->{$k});
      }
      return join("-", @values);
  }
  58
  # Number of code points covered by one subtable.
  #
  # Changing this trades the number of subtables off against their size.
  our $modulus = 128;

  # Limits of the hole in the table.  A huge section of the Unicode code
  # space is empty, so it is simply left out of the table.  The lookup
  # function becomes a little more complicated as a result, but this
  # should not affect performance in the cases we care about.
  our ($break_start, $break_end) = (0x30000, 0xE0000);

  # The very top of the table is omitted in the same way and sorted out in
  # the lookup function.
  our $break_top = 0xE0200;

  my %cats;                       # known general categories
  my %data;                       # mapping of codepoints to information
  my $max = 0;                    # maximum codepoint
  my $maxccc = 0;                 # maximum combining class
  my ($minud, $maxud) = (0, 0);   # min/max upper case offset
  my ($minld, $maxld) = (0, 0);   # min/max lower case offset
  83
  # Make sure we have our desired input files.  We explicitly specify a
  # Unicode standard version to make sure that a given version of DisOrder
  # supports a given version of Unicode.
  #
  # $path is the file's path below the UCD directory on unicode.org; the
  # local copy lives in the current directory under its basename and is
  # downloaded with wget if it is not already there.  On return STDIN has
  # been reopened onto the local copy.  Dies if the download, the chmod or
  # the open fails.
  sub input {
      my $path = shift;
      my $lpath = basename($path);
      if(!-e $lpath) {
          # List form of system(): the URL is passed to wget directly
          # rather than being interpreted by a shell.
          my $status = system("wget",
                              "http://www.unicode.org/Public/5.0.0/ucd/$path");
          if($status != 0) {
              # Don't leave a partial download behind; a later run would
              # otherwise take it for a complete file.
              unlink($lpath);
              die "wget $path: failed (status $status)\n";
          }
          chmod(0444, $lpath) or die "$lpath: $!\n";
      }
      open(STDIN, "<", $lpath) or die "$lpath: $!\n";
      print STDERR "Reading $lpath...\n";
  }
  97
  98
  # Read the main data file
  #
  # Most lines describe a single code point.  A line whose name matches
  # "first>" only records the start of a range; the following "last>" line
  # supplies the end, and its properties are applied to every code point
  # from the remembered start to that end.
  input("UnicodeData.txt");
  my ($start, $end);
  while(<>) {
      my @f = split(/;/, $_);
      my $c = hex($f[0]);         # codepoint
      my $name = $f[1];
      die "$f[0] $name is in the break\n"
          if $c >= $break_start && $c < $break_end;
      my $gc = $f[2];             # General_Category
      $cats{$gc} = 1;             # always record all GCs
      if($name =~ /first>/i) {
          $start = $c;
          next;
      } elsif($name =~ /last>/i) {
          $end = $c;
      } else {
          $start = $end = $c;
      }
      # Cn is a GC we don't expect to see in UnicodeData.txt
      die "unexpected Cn" if $gc eq 'Cn';
      my $ccc = $f[3];            # Canonical_Combining_Class
      my $dm = $f[5];             # Decomposition_Type + Decomposition_Mapping
      my $sum = hex($f[12]) || $c; # Simple_Uppercase_Mapping
      my $slm = hex($f[13]) || $c; # Simple_Lowercase_Mapping
      # recalculate the upper/lower case mappings as offsets
      my $ud = $sum - $c;
      my $ld = $slm - $c;
      # update bounds on various values
      $maxccc = $ccc if $ccc > $maxccc; # assumed never to be -ve
      $minud = $ud if $ud < $minud;
      $maxud = $ud if $ud > $maxud;
      $minld = $ld if $ld < $minld;
      $maxld = $ld if $ld > $maxld;
      if($start != $end) {
          printf STDERR "> range %04X-%04X is %s\n", $start, $end, $gc;
      }
      # Classify the decomposition once per input line, working on a copy
      # of the field.  Stripping the <tag> from $dm itself inside the
      # per-code-point loop would make every code point of a range after
      # the first look like a canonical decomposition.
      my %decomp = ();
      if($dm ne '') {
          if($dm !~ /</) {
              # This is a canonical decomposition
              %decomp = ("canon" => $dm, "compat" => $dm);
          } else {
              # This is only a compatibility decomposition
              (my $mapping = $dm) =~ s/^<.*>\s*//;
              %decomp = ("compat" => $mapping);
          }
      }
      # Each code point gets a hash of its own.
      for my $cp ($start .. $end) {
          $data{$cp} = {
              "gc" => $gc,
              "ccc" => $ccc,
              "ud" => $ud,
              "ld" => $ld,
              %decomp,
          };
      }
      $max = $end if $end > $max;
  }
 159
 # Load one property from a UCD property file into %data.
 #
 # $path is handed to input(); $propkey is the key under which each
 # value is stored in the per-code-point hash.  Comments and blank lines
 # are skipped.  Every remaining line is "CODE ; VALUE" or
 # "FIRST..LAST ; VALUE" with the code points in hex.
 sub read_prop_with_ranges {
     my ($path, $propkey) = @_;
     input($path);
     while(my $line = <>) {
         chomp $line;
         $line =~ s/\s*\#.*//;
         next if $line eq '';
         my ($range, $propval) = split(/\s*;\s*/, $line);
         if($range =~ /(.*)\.\.(.*)/) {
             my ($first, $last) = (hex($1), hex($2));
             for my $c ($first .. $last) {
                 # NOTE(review): looks like a sanity check that U+AC00
                 # never appears inside a Grapheme_Break range -- confirm
                 # whether it is still wanted.
                 die "($range)\n" if($c == 0xAC00 and $propkey eq 'gbreak');
                 $data{$c}->{$propkey} = $propval;
             }
         } else {
             $data{hex($range)}->{$propkey} = $propval;
         }
     }
 }
 180
 # Grapheme_Break etc
 read_prop_with_ranges("auxiliary/GraphemeBreakProperty.txt", "gbreak");
 read_prop_with_ranges("auxiliary/WordBreakProperty.txt", "wbreak");
 read_prop_with_ranges("auxiliary/SentenceBreakProperty.txt", "sbreak");

 # Collect the set of values seen for each property and fill in the ones
 # the files left out.  A missing Grapheme_Break is 'Other'.  A missing
 # Word_Break or Sentence_Break is 'Extend' when the code point's
 # Grapheme_Break is 'Extend', and 'Other' if not.
 my (%gbreak, %wbreak, %sbreak);
 for my $d (values %data) {
     $d->{gbreak} = 'Other' unless exists $d->{gbreak};
     $gbreak{$d->{gbreak}} = 1;

     my $fallback = $d->{gbreak} eq 'Extend' ? 'Extend' : 'Other';

     $d->{wbreak} = $fallback unless exists $d->{wbreak};
     $wbreak{$d->{wbreak}} = 1;

     $d->{sbreak} = $fallback unless exists $d->{sbreak};
     $sbreak{$d->{sbreak}} = 1;
 }
 214
 # Round up the maximum value to a whole number of subtables
 $max += ($modulus - 1) - ($max % $modulus);

 # Private use characters
 # Only code points up to $max are filled in.  All of them share this
 # one hash.
 my $Co = {
     "gc" => "Co",
     "ccc" => 0,
     "ud" => 0,
     "ld" => 0
 };
 for my $range ([0xE000, 0xF8FF],
                [0xF0000, 0xFFFFD],
                [0x100000, 0x10FFFD]) {
     my ($first, $last) = @$range;
     $last = $max if $last > $max;
     for my $c ($first .. $last) {
         $data{$c} = $Co;
     }
 }

 # Anything left is not assigned; these code points share one hash too.
 # Any break property still missing at this point defaults to 'Other'.
 my $Cn = {
     "gc" => "Cn",               # not assigned
     "ccc" => 0,
     "ud" => 0,
     "ld" => 0
 };
 for my $c (0 .. $max) {
     $data{$c} = $Cn unless exists $data{$c};
     my $d = $data{$c};
     for my $prop (qw(wbreak gbreak sbreak)) {
         $d->{$prop} = 'Other' unless exists $d->{$prop};
     }
 }
 $cats{'Cn'} = 1;
 258
 # Read the casefolding data too
 input("CaseFolding.txt");
 while(my $line = <>) {
     chomp $line;
     next if $line =~ /^\#/ or $line eq '';
     my ($code, $status, $mapping) = split(/\s*;\s*/, $line);
     # Full case folding means use status C and F.
     # Status T is discarded; Turkish users may wish to change this.
     next unless $status eq 'C' or $status eq 'F';
     my $c = hex($code);
     $data{$c}->{casefold} = $mapping;
     my $info = $data{$c};
     # We are particularly interested in combining characters that
     # case-fold to non-combining characters, and in characters that
     # case-fold to sequences with combining characters in non-initial
     # positions, as these require decomposition before case-folding.
     # The field name is the example explicitly mentioned in the spec.
     my ($head, @rest) = map(hex($_), split(/\s+/, $mapping));
     my $flag;
     if($info->{ccc} != 0) {
         # A combining character: flag it when the first code point of
         # its case-folded form is NOT a combining character.
         $flag = ($data{$head}->{ccc} == 0);
     } else {
         # A non-combining character: flag it when some non-initial code
         # point of its case-folded form IS a combining character.
         $flag = grep($data{$_}->{ccc} != 0, @rest);
     }
     $info->{ypogegrammeni} = 1 if $flag;
 }
 295
 # Generate the header file
 print STDERR "Generating unidata.h...\n";
 open(STDOUT, ">unidata.h") or die "unidata.h: $!\n";

 out("/* Automatically generated file, see scripts/make-unidata */\n",
     "#ifndef UNIDATA_H\n",
     "#define UNIDATA_H\n");

 # One enum per property, with members in sorted order of the values
 # seen.  Every property except General_Category also gets a declaration
 # of its table of names.
 # TODO choose stable values for General_Category
 for my $enum (["General_Category", \%cats, 0],
               ["Grapheme_Break", \%gbreak, 1],
               ["Word_Break", \%wbreak, 1],
               ["Sentence_Break", \%sbreak, 1]) {
     my ($prop, $values, $has_names) = @$enum;
     my @members = map("  unicode_${prop}_$_", sort keys %$values);
     out("enum unicode_$prop {\n", join(",\n", @members), "\n};\n");
     out("extern const char *const unicode_${prop}_names[];\n")
         if $has_names;
 }

 out("enum unicode_flags {\n",
     "  unicode_normalize_before_casefold = 1\n",
     "};\n",
     "\n");
 331
 # Choose the narrowest type that will fit the required values
 #
 # Takes the smallest and largest values that must be representable and
 # returns the name of a C type as a string.  Each limit is inclusive:
 # 32767 still fits int16_t and 65535 still fits uint16_t.
 sub choosetype {
     my ($min, $max) = @_;
     if($min >= 0) {
         return "char" if $max <= 127;
         return "unsigned char" if $max <= 255;
         return "int16_t" if $max <= 32767;
         return "uint16_t" if $max <= 65535;
         return "int32_t" if $max <= 2147483647;
         return "uint32_t";
     } else {
         # Plain char may be unsigned, so negative values need an
         # explicitly signed type.  The signed char lower bound stays
         # at -127.
         return "signed char" if $min >= -127 && $max <= 127;
         return "int16_t" if $min >= -32768 && $max <= 32767;
         return "int32_t";
     }
 }
 347
 # The per-code-point record.  The ccc member's type is picked by
 # choosetype() from the largest combining class seen.  Members for the
 # upper and lower case offsets (typed from $minud/$maxud and
 # $minld/$maxld) are currently disabled.
 #
 # compat, canon and casefold do have non-BMP characters, so we can't
 # use a simple 16-bit table.  We could use UTF-8 or UTF-16 though,
 # saving a bit of space (probably not that much...) at the cost of
 # marginally reduced performance and additional complexity.
 out("struct unidata {\n",
     map("  $_;\n",
         "const uint32_t *compat",
         "const uint32_t *canon",
         "const uint32_t *casefold",
         choosetype(0, $maxccc)." ccc",
         "char general_category",
         "uint8_t flags",
         "char grapheme_break",
         "char word_break",
         "char sentence_break"),
     "};\n");

 out("extern const struct unidata *const unidata[];\n");

 # Table geometry
 for my $macro (["NCHARS", $max + 1],
                ["MODULUS", $modulus],
                ["BREAK_START", $break_start],
                ["BREAK_END", $break_end],
                ["BREAK_TOP", $break_top]) {
     my ($suffix, $value) = @$macro;
     out("#define UNICODE_$suffix $value\n");
 }

 out("#endif\n");

 close STDOUT or die "unidata.h: $!\n";
 377
 print STDERR "Generating unidata.c...\n";
 open(STDOUT, ">unidata.c") or die "unidata.c: $!\n";

 out("/* Automatically generated file, see scripts/make-unidata */\n",
     "#include <config.h>\n",
     "#include \"types.h\"\n",
     "#include \"unidata.h\"\n");

 # Short aliases to keep .c file small: the bare name for a general
 # category, and a GB/WB/SB prefix for the break properties.
 for my $alias (["", "General_Category", \%cats],
                ["GB", "Grapheme_Break", \%gbreak],
                ["WB", "Word_Break", \%wbreak],
                ["SB", "Sentence_Break", \%sbreak]) {
     my ($prefix, $prop, $values) = @$alias;
     out(map("#define $prefix$_ unicode_${prop}_$_\n", sort keys %$values));
 }

 # Names for *_Break properties, in the same sorted order as the enums
 for my $table (["Grapheme_Break", \%gbreak],
                ["Word_Break", \%wbreak],
                ["Sentence_Break", \%sbreak]) {
     my ($prop, $values) = @$table;
     out("const char *const unicode_${prop}_names[] = {\n",
         join(",\n", map("  \"$_\"", sort keys %$values)),
         "\n};\n");
 }
 410
 # Generate the decomposition mapping tables.  Identical mappings are
 # emitted only once to save space, and the number of duplicates avoided
 # is reported as decompsaved at the end.  In Unicode 5.0.0 this saves
 # 1795 entries, which is at least 14Kbytes.
 #
 # If canon is set then compat will be too and will be identical; compat
 # may be set with canon clear.  So the arrays are built from the compat
 # form, and canonsym is pointed at the same array when canon exists.
 my $decompnum = 0;
 my %decompnums = ();
 my $decompsaved = 0;
 {
     my @defs;                   # array definitions, in order of first use
     for my $c (0 .. $max) {
         next unless exists $data{$c} && exists $data{$c}->{compat};
         my $d = $data{$c};
         # Decimal code points followed by a 0 terminator
         my $s = join(",", map(hex($_), split(/\s+/, $d->{compat})), 0);
         if(exists $decompnums{$s}) {
             ++$decompsaved;
         } else {
             push(@defs, "cd$decompnum\[]={$s}");
             $decompnums{$s} = $decompnum++;
         }
         my $sym = "cd$decompnums{$s}";
         $d->{compatsym} = $sym;
         $d->{canonsym} = $sym if exists $d->{canon};
     }
     out("static const uint32_t ", join(",\n", @defs), ";\n");
 }
 439
 # ...and the case folding table.  Equal entries are again emitted only
 # once to save space.  In Unicode 5.0.0 this saves 51 entries or at
 # least 408 bytes.  This doesn't seem as worthwhile as the decomposition
 # mapping saving above.
 my $cfnum = 0;
 my %cfnums = ();
 my $cfsaved = 0;
 {
     my @defs;                   # array definitions, in order of first use
     for my $c (0 .. $max) {
         next unless exists $data{$c} && exists $data{$c}->{casefold};
         my $d = $data{$c};
         # Decimal code points followed by a 0 terminator
         my $s = join(",", map(hex($_), split(/\s+/, $d->{casefold})), 0);
         if(exists $cfnums{$s}) {
             ++$cfsaved;
         } else {
             push(@defs, "cf$cfnum\[]={$s}");
             $cfnums{$s} = $cfnum++;
         }
         $d->{cfsym} = "cf$cfnums{$s}";
     }
     out("static const uint32_t ", join(",\n", @defs), ";\n");
 }
 462
 # Visit all the $modulus-character blocks in turn and generate the
 # required subtables.  Blocks inside the break, or at or above
 # $break_top, are skipped.  Duplicate subtables are emitted only once to
 # save space.  In Unicode 5.0.0 with $modulus=128 and current table data
 # this saves 1372 subtables or at least three and a half megabytes on
 # 32-bit platforms.

 my %subtable = ();              # subtable content -> subtable number
 my %subtableno = ();            # base -> subtable number
 my $subtablecounter = 0;        # counter for subtable numbers
 my $subtablessaved = 0;         # number of tables saved
 for(my $base = 0; $base <= $max; $base += $modulus) {
     my $in_break = $base >= $break_start && $base < $break_end;
     next if $in_break or $base >= $break_top;
     # One initializer per code point, members in struct unidata order
     my @t;
     for my $c ($base .. $base + $modulus - 1) {
         my $d = $data{$c};
         my $flags = $d->{ypogegrammeni}
             ? "unicode_normalize_before_casefold"
             : 0;
         my @fields = (
             $d->{compatsym} || "0",
             $d->{canonsym} || "0",
             $d->{cfsym} || "0",
             # the ud and ld offsets are not emitted at present
             $d->{ccc},
             $d->{gc},
             $flags,
             "GB$d->{gbreak}",
             "WB$d->{wbreak}",
             "SB$d->{sbreak}",
         );
         push(@t, "{".join(",", @fields)."}");
     }
     my $t = join(",\n", @t);
     if(exists $subtable{$t}) {
         ++$subtablessaved;
     } else {
         out(sprintf("/* %04X-%04X */\n", $base, $base + $modulus - 1),
             "static const struct unidata st$subtablecounter\[] = {\n",
             "$t\n",
             "};\n");
         $subtable{$t} = $subtablecounter++;
     }
     $subtableno{$base} = $subtable{$t};
 }
 514
 # The top-level index: one pointer per block that has a subtable, in
 # ascending order of base.  Blocks inside the break, or at or above
 # $break_top, have no entry.
 out("const struct unidata *const unidata[]={\n");
 for my $index (0 .. int($max / $modulus)) {
     my $base = $index * $modulus;
     my $in_break = $base >= $break_start && $base < $break_end;
     next if $in_break or $base >= $break_top;
     out("st$subtableno{$base},\n");
 }
 out("};\n");

 close STDOUT or die "unidata.c: $!\n";

 # Summary statistics
 printf STDERR "max=%04X\n", $max;
 print STDERR "subtables=$subtablecounter, subtablessaved=$subtablessaved\n",
              "decompsaved=$decompsaved cfsaved=$cfsaved\n";