chiark - git - mdw - disorder/blob - scripts/make-unidata

   1 #! /usr/bin/perl -w
   2 #
   3 # Generate a two-level table describing (some of) the fields of UnicodeData.txt
   4 use strict;
   5
   # Print all of the arguments to the default output handle.  Dies with the
   # system error text if print reports failure; otherwise returns print's
   # (true) result.
   sub out {
       my $ok = print(@_);
       die "$!\n" unless $ok;
       return $ok;
   }
   9
  # Build a string key for a record: the values of the hash referenced by the
  # argument, taken in sorted-key order and joined with "-".  An empty hash
  # gives the empty string.  (Nothing in the code shown here calls this.)
  sub key {
      my ($d) = @_;
      my @fields = sort keys %$d;

      # A hash slice fetches the values in the same order as @fields
      return join("-", @{$d}{@fields});
  }
  16
  my %cats = ();                  # known general categories
  my %data = ();                  # mapping of codepoints to information
  my %comp = ();                  # not referenced by any code shown in this script
  my $max = 0;                    # maximum codepoint

  # Read UnicodeData.txt records and store the fields we care about.
  #
  # Arguments:
  #   $next - code ref returning the next input line, or undef at end of input
  #   $data - hash ref, filled in as codepoint => { gc, ccc, ud, ld }
  #   $cats - hash ref, filled in as general category => 1
  #
  # Each stored record holds:
  #   gc  - general category (field 2)
  #   ccc - canonical combining class (field 3)
  #   ud  - simple upper case mapping (field 12) as an offset from the codepoint
  #   ld  - simple lower case mapping (field 13) as an offset from the codepoint
  #
  # UnicodeData.txt describes some large blocks with a pair of lines named
  # "<..., First>" and "<..., Last>" instead of one line per character; every
  # codepoint from First to Last is given the record found on the Last line.
  # Lines that do not start with a hex codepoint field (blank lines, comments)
  # are skipped.  A data line with fewer than 14 fields is a fatal error.
  # Codepoints at or above 0xE0000 are ignored.
  #
  # Returns the highest codepoint stored, or 0 if nothing was stored.
  sub read_unidata {
      my ($next, $data, $cats) = @_;
      my $highest = 0;
      my $range_start;            # codepoint of a pending "<..., First>" line

      while(defined(my $line = $next->())) {
          chomp $line;
          # Without this a blank line would parse as codepoint 0 and clobber
          # the record for U+0000.
          next unless $line =~ /^[0-9A-Fa-f]+;/;
          # A limit of -1 keeps trailing empty fields, so the mapping fields
          # are always defined even when the last fields of the line are empty.
          my @f = split(/;/, $line, -1);
          die "malformed UnicodeData.txt line: $line\n" if @f < 14;
          my $c = hex($f[0]);         # codepoint
          next if $c >= 0xE0000;      # ignore various high-numbered stuff
          my $name = $f[1];
          my $gc = $f[2];             # general category
          my $ccc = $f[3];            # canonical combining class
          # An empty mapping field gives hex("") == 0, meaning "maps to itself"
          my $sum = hex($f[12]) || $c; # simple upper case mapping
          my $slm = hex($f[13]) || $c; # simple lower case mapping
          # recalculate the upper/lower case mappings as offsets
          my $ud = $sum - $c;
          my $ld = $slm - $c;

          # Work out the first codepoint that this line describes: normally
          # just $c, but the start of the range when this is a Last line.
          my $first = $c;
          if($name =~ /^<.*, First>$/) {
              $range_start = $c;
          } else {
              $first = $range_start
                  if(defined $range_start
                     && $range_start < $c
                     && $name =~ /^<.*, Last>$/);
              $range_start = undef;
          }
          for my $cp ($first .. $c) {
              $data->{$cp} = {
                  "gc" => $gc,
                  "ccc" => $ccc,
                  "ud" => $ud,
                  "ld" => $ld
                  };
          }
          $cats->{$gc} = 1;
          $highest = $c if $c > $highest;
      }
      return $highest;
  }

  # Input comes from the files named on the command line, or standard input
  $max = read_unidata(sub { return scalar(<>); }, \%data, \%cats);
  43
  # Round $max up to the last codepoint of its 256-character block.  Setting
  # the low eight bits gives the same result as adding 255 - ($max % 256).
  $max |= 0xFF;

  # Every codepoint from 0 to $max needs a record.  Anything the input did not
  # define is recorded as unassigned (Cn) with combining class 0 and no case
  # mapping offsets.
  $cats{'Cn'} = 1;
  foreach my $codepoint (0 .. $max) {
      next if exists $data{$codepoint};
      $data{$codepoint} = {
          "gc"  => "Cn",          # not assigned
          "ccc" => 0,
          "ud"  => 0,
          "ld"  => 0,
      };
  }
  58
  # Write unidata.h: the general category enum, the per-character record
  # structure, the declaration of the top-level table and the number of
  # codepoints covered.  STDOUT is redirected so that out() writes to the file.
  open(STDOUT, ">unidata.h") or die "unidata.h: $!\n";
  {
      # One enumerator per general category seen, in sorted order
      my $enumerators = join(",\n",
                             map("  unicode_gc_$_", sort keys %cats));

      out("#ifndef UNIDATA_H\n");
      out("#define UNIDATA_H\n");

      out("enum unicode_gc_cat {\n");
      out($enumerators);
      out("\n};\n");

      out("struct unidata {\n");
      out("  enum unicode_gc_cat gc;\n");
      out("  int ccc;\n");
      out("  int upper_offset;\n");
      out("  int lower_offset;\n");
      out("};\n");

      out("extern const struct unidata *const unidata[];\n");

      # Codepoints run from 0 to $max inclusive
      out("#define UNICODE_NCHARS ", ($max + 1), "\n");

      out("#endif\n");
  }
  close(STDOUT) or die "unidata.h: $!\n";
  82
  # Write unidata.c.  STDOUT is redirected so that out() writes to the file.
  open(STDOUT, ">unidata.c") or die "unidata.c: $!\n";

  out("#include \"unidata.h\"\n");

  # Walk the codepoint space one 256-character block at a time.  Each block is
  # rendered as the text of its C initializer; blocks whose text is identical
  # share one subtable, which is written out the first time it is seen.
  my %subtable = ();              # subtable text -> subtable number
  my %subtableno = ();            # block base codepoint -> subtable number
  my $subtablecounter = 0;        # number of distinct subtables written so far
  foreach my $blockno (0 .. int($max / 256)) {
      my $base = $blockno * 256;
      my @rows = map {
          my $info = $data{$_};
          "  { unicode_gc_$info->{gc}, $info->{ccc}, $info->{ud}, $info->{ld} }"
      } ($base .. $base + 255);
      my $text = join(",\n", @rows);
      unless(exists $subtable{$text}) {
          out("static const struct unidata subtable$subtablecounter\[] = {\n",
              "$text\n",
              "};\n");
          $subtable{$text} = $subtablecounter;
          ++$subtablecounter;
      }
      $subtableno{$base} = $subtable{$text};
  }

  # The top-level table has one pointer per block, in codepoint order
  out("const struct unidata *const unidata[] = {\n");
  foreach my $blockno (0 .. int($max / 256)) {
      my $number = $subtableno{$blockno * 256};
      out("  subtable$number,\n");
  }
  out("};\n");

  close(STDOUT) or die "unidata.c: $!\n";

  # Report the table size on standard error
  print STDERR "max=$max, subtables=$subtablecounter\n";