| 1 | #! /usr/bin/perl -w |
| 2 | # |
| 3 | # Generate a two-level table describing (some of) the fields of UnicodeData.txt |
| 4 | use strict; |
| 5 | |
# Print every argument to the currently selected output handle (STDOUT unless
# something has select()ed another).  A failed write aborts the script with
# the system error text.
sub out {
    my @pieces = @_;
    print(@pieces) or die "$!\n";
}
| 9 | |
# Build a string identifying a record.
#
# Takes a hash reference and returns its values, ordered by sorted key name,
# joined with "-".  An empty hash yields the empty string.
sub key {
    my ($d) = @_;
    my @names = sort keys %{$d};

    # A hash slice fetches the values in @names order without touching $_.
    return join("-", @{$d}{@names});
}
| 16 | |
my %cats = ();          # set of general categories seen (category -> 1)
my %data = ();          # codepoint -> { gc, ccc, ud, ld } record
my %comp = ();          # not referenced by the code visible in this file
my $max = 0;            # highest codepoint recorded so far
my $range_first;        # codepoint of a pending "<..., First>" record, if any

# Read the semicolon-separated records from the files named on the command
# line (or from standard input) and fill in %data, %cats and $max.
#
# For each record the fields used are: 0 codepoint, 1 name, 2 general
# category, 3 canonical combining class, 12 simple upper case mapping and
# 13 simple lower case mapping.  Case mappings are stored as offsets from the
# codepoint itself, so "no mapping" becomes 0.
while(my $line = <>) {
    chomp $line;
    # Anything that does not begin with a hex codepoint field (blank lines,
    # comments) would otherwise be parsed as codepoint 0 and clobber it.
    next unless $line =~ /^[0-9A-Fa-f]+;/;
    # A limit of -1 keeps trailing empty fields now that the newline, which
    # used to make the last field non-empty, has been removed.
    my @f = split(/;/, $line, -1);
    my $c = hex($f[0]);                 # codepoint
    next if $c >= 0xE0000;              # ignore various high-numbered stuff
    my $name = $f[1];
    my $gc = $f[2];                     # general category
    my $ccc = $f[3];                    # canonical combining class
    my $sum = hex($f[12]) || $c;        # simple upper case mapping
    my $slm = hex($f[13]) || $c;        # simple lower case mapping
    # recalculate the upper/lower case mappings as offsets
    my %record = (
        "gc"  => $gc,
        "ccc" => $ccc,
        "ud"  => $sum - $c,
        "ld"  => $slm - $c,
    );
    $data{$c} = { %record };
    if($name =~ /, First>$/) {
        # Start of a "<Name, First>" / "<Name, Last>" pair describing a whole
        # range of codepoints; remember it until the matching Last arrives.
        $range_first = $c;
    } else {
        if(defined $range_first and $name =~ /, Last>$/) {
            # Every codepoint strictly inside the range gets its own copy of
            # the Last record's properties instead of being left as a gap.
            # NOTE(review): the offsets are copied as-is; range records are
            # expected to have empty case-mapping fields, making them 0.
            for my $inner ($range_first + 1 .. $c - 1) {
                $data{$inner} = { %record };
            }
        }
        undef $range_first;
    }
    $cats{$gc} = 1;
    $max = $c if $c > $max;
}
| 43 | |
# Move $max up to the last codepoint of the 256-entry block that contains it,
# so that the table is made of whole blocks only.
$max = $max - ($max % 256) + 255;

# Give every codepoint from 0 to $max a record: those the input did not
# mention are marked as general category Cn with no combining class and no
# case offsets.
foreach my $codepoint (0 .. $max) {
    next if exists $data{$codepoint};
    $data{$codepoint} = {
        "gc"  => "Cn",          # not assigned
        "ccc" => 0,
        "ud"  => 0,
        "ld"  => 0,
    };
}
# Cn is now in use even if the input never named it.
$cats{'Cn'} = 1;
| 58 | |
# Write unidata.h by redirecting STDOUT to it.  The header declares the
# general category enum (one member per key of %cats, in sorted order), the
# per-codepoint struct, the top-level table and the number of codepoints
# covered ($max + 1).
#
# The three-argument form of open keeps the mode separate from the file name.
open(STDOUT, '>', 'unidata.h') or die "unidata.h: $!\n";

out("#ifndef UNIDATA_H\n",
    "#define UNIDATA_H\n");

out("enum unicode_gc_cat {\n",
    join(",\n",
         map(" unicode_gc_$_", sort keys %cats)), "\n};\n");

out("struct unidata {\n",
    " enum unicode_gc_cat gc;\n",
    " int ccc;\n",
    " int upper_offset;\n",
    " int lower_offset;\n",
    "};\n");

out("extern const struct unidata *const unidata[];\n");

out("#define UNICODE_NCHARS ", ($max + 1), "\n");

out("#endif\n");

# Buffered write errors only show up when the handle is closed.
close STDOUT or die "unidata.h: $!\n";
| 82 | |
# Write unidata.c by redirecting STDOUT to it.  The file holds one static
# subtable per distinct 256-codepoint block, followed by the top-level
# unidata[] array that points each block at its subtable.
#
# The three-argument form of open keeps the mode separate from the file name.
open(STDOUT, '>', 'unidata.c') or die "unidata.c: $!\n";

out("#include \"unidata.h\"\n");

# Visit all the 256-character blocks in turn and generate the required
# subtables
my %subtable = ();              # subtable text -> subtable number
my %subtableno = ();            # block base codepoint -> subtable number
my $subtablecounter = 0;        # counter for subtable numbers
for(my $base = 0; $base <= $max; $base += 256) {
    my @t;
    for my $c ($base .. $base + 255) {
        my $d = $data{$c};
        push(@t,
             " { unicode_gc_$d->{gc}, $d->{ccc}, $d->{ud}, $d->{ld} }");
    }
    my $t = join(",\n", @t);
    # Blocks whose rendered text is identical share one subtable, so a
    # subtable is only emitted the first time its text is seen.
    if(!exists $subtable{$t}) {
        out("static const struct unidata subtable$subtablecounter\[] = {\n",
            "$t\n",
            "};\n");
        $subtable{$t} = $subtablecounter++;
    }
    $subtableno{$base} = $subtable{$t};
}

# Top-level table: one pointer per 256-codepoint block, in codepoint order.
out("const struct unidata *const unidata[] = {\n");
for(my $base = 0; $base <= $max; $base += 256) {
    out(" subtable$subtableno{$base},\n");
}
out("};\n");

# Buffered write errors only show up when the handle is closed.
close STDOUT or die "unidata.c: $!\n";

# Report the covered range and how many distinct subtables were emitted.
print STDERR "max=$max, subtables=$subtablecounter\n";