Commit | Line | Data |
---|---|---|
61507e3c RK |
1 | #! /usr/bin/perl -w |
2 | # | |
3 | # Generate a two-level table describing (some of) the fields of UnicodeData.txt | |
4 | use strict; | |
5 | ||
# Write all arguments to the currently selected output handle (STDOUT unless
# something else has been select()ed).  Dies with the OS error text if the
# print fails.
sub out {
    my @chunks = @_;
    unless(print @chunks) {
        die "$!\n";
    }
    return 1;
}
9 | ||
# Build a string key from a hash reference: the values of %$d, taken in
# sorted key order, joined with "-".
#
#   $d - reference to a hash of scalar values
#
# Returns the joined string (the empty string for an empty hash).
#
# A hash slice is used instead of map, so $_ is never touched and there is
# no need to localise it.
sub key {
    my ($d) = @_;

    return join("-", @{$d}{sort keys %{$d}});
}
16 | ||
my %cats = ();                  # known general categories
my %data = ();                  # mapping of codepoints to information
my %comp = ();                  #
my $max = 0;                    # maximum codepoint

# Read UnicodeData.txt-format lines from a filehandle.
#
#   $fh   - filehandle to read from
#   $data - hash ref; gains codepoint => { gc, ccc, ud, ld } entries, where
#           ud/ld are the simple upper/lower case mappings expressed as
#           offsets from the codepoint (0 when there is no mapping)
#   $cats - hash ref; gains general category => 1 entries
#
# Returns the highest codepoint recorded (0 if there were none).
#
# Codepoints from 0xE0000 upwards are ignored.  Blank lines are skipped and a
# line with too few fields is fatal.
#
# Large ranges appear in UnicodeData.txt as a pair of lines whose names end
# ", First>" and ", Last>".  Every codepoint from the first to the last is
# given the properties on the "Last" line; without this only the two
# endpoints would be recorded and the interior of the range would later be
# treated as unassigned.
sub read_unidata {
    my ($fh, $data, $cats) = @_;
    my $highest = 0;
    my $range_start;            # codepoint of a pending "First" line, if any

    while(defined(my $line = <$fh>)) {
        chomp $line;
        next if $line eq '';
        # Limit of -1 keeps trailing empty fields, so $f[12] and $f[13] are
        # defined even when the case mappings are empty
        my @f = split(/;/, $line, -1);
        die "malformed UnicodeData line $.: $line\n" if @f < 14;
        my $c = hex($f[0]);     # codepoint
        next if $c >= 0xE0000;  # ignore various high-numbered stuff
        my $name = $f[1];
        my $gc = $f[2];         # general category
        my $ccc = $f[3];        # canonical combining class
        my $sum = hex($f[12]) || $c; # simple upper case mapping
        my $slm = hex($f[13]) || $c; # simple lower case mapping
        # recalculate the upper/lower case mappings as offsets
        my %info = ("gc" => $gc,
                    "ccc" => $ccc,
                    "ud" => $sum - $c,
                    "ld" => $slm - $c);

        # A "Last" line directly after a "First" line covers the whole range
        my $first = $c;
        if(defined $range_start and $name =~ /, Last>\z/) {
            $first = $range_start;
        }
        $range_start = ($name =~ /, First>\z/) ? $c : undef;

        for my $cp ($first .. $c) {
            $data->{$cp} = { %info };   # separate copy per codepoint
        }
        $cats->{$gc} = 1;
        $highest = $c if $c > $highest;
    }
    return $highest;
}

# Input comes from the files named on the command line, or stdin
$max = read_unidata(\*ARGV, \%data, \%cats);
43 | ||
# Round $max up to the last codepoint of its 256-entry block, so that the
# table is made of whole blocks only.
$max |= 0xFF;

# Give every codepoint that has no entry yet a default "not assigned" record,
# so that there are no gaps anywhere in 0..$max.
foreach my $cp (0 .. $max) {
    next if exists $data{$cp};
    $data{$cp} = {
        "gc" => "Cn",           # not assigned
        "ccc" => 0,
        "ud" => 0,
        "ld" => 0
    };
}
# Cn is recorded as a known category whether or not any gap was filled
$cats{'Cn'} = 1;
58 | ||
# ---- unidata.h ----
# STDOUT is redirected to the header file, so everything out() prints from
# here until the close below ends up in unidata.h.
open(STDOUT, ">unidata.h") or die "unidata.h: $!\n";

# Include guard
out("#ifndef UNIDATA_H\n");
out("#define UNIDATA_H\n");

# One enumerator per known general category, in sorted order
{
    my @enumerators = map(" unicode_gc_$_", sort keys %cats);
    out("enum unicode_gc_cat {\n");
    out(join(",\n", @enumerators));
    out("\n};\n");
}

# Per-codepoint record; the field order matches the initialisers written
# to unidata.c
out("struct unidata {\n");
out(" enum unicode_gc_cat gc;\n");
out(" int ccc;\n");
out(" int upper_offset;\n");
out(" int lower_offset;\n");
out("};\n");

# Top-level table of pointers to the 256-entry subtables
out("extern const struct unidata *const unidata[];\n");

# Number of codepoints covered by the table
out("#define UNICODE_NCHARS ", ($max + 1), "\n");

out("#endif\n");

close STDOUT or die "unidata.h: $!\n";
82 | ||
# ---- unidata.c ----
# STDOUT is redirected again, this time to the C source file.
open(STDOUT, ">unidata.c") or die "unidata.c: $!\n";

out("#include \"unidata.h\"\n");

# Walk the 256-codepoint blocks in order.  Each block is rendered to its C
# initialiser text; a subtable is only written out the first time a given
# text is seen, and identical blocks share it.
my %subtable = ();              # rendered block text -> subtable number
my %subtableno = ();            # block base codepoint -> subtable number
my $subtablecounter = 0;        # number of subtables written so far
my @block_bases = map { $_ * 256 } 0 .. int($max / 256);

foreach my $base (@block_bases) {
    my $t = join(",\n",
                 map {
                     my $d = $data{$_};
                     " { unicode_gc_$d->{gc}, $d->{ccc}, $d->{ud}, $d->{ld} }"
                 } $base .. $base + 255);
    unless(exists $subtable{$t}) {
        out("static const struct unidata subtable$subtablecounter\[] = {\n",
            "$t\n",
            "};\n");
        $subtable{$t} = $subtablecounter;
        ++$subtablecounter;
    }
    $subtableno{$base} = $subtable{$t};
}

# Top-level table: one subtable pointer per block, in codepoint order
out("const struct unidata *const unidata[] = {\n");
foreach my $base (@block_bases) {
    out(" subtable$subtableno{$base},\n");
}
out("};\n");

close STDOUT or die "unidata.c: $!\n";

# Summary for whoever ran the script
print STDERR "max=$max, subtables=$subtablecounter\n";