3 # Generate a two-level table describing (some of) the fields of UnicodeData.txt
7 print @_ or die "$!\n";
14 return join("-", map($d->{$_}, sort keys %$d));
# State accumulated while the input records are processed.
17 my %cats = (); # known general categories
18 my %data = (); # mapping of codepoints to information
20 my $max = 0; # maximum codepoint
# Split the current record ($_) into its semicolon-separated fields.
23 my @f = split(/;/, $_);
24 my $c = hex($f[0]); # codepoint (field 0 is hexadecimal)
25 next if $c >= 0xE0000; # ignore various high-numbered stuff (everything from U+E0000 up)
27 my $gc = $f[2]; # general category
28 my $ccc = $f[3]; # canonical combining class
29 my $sum = hex($f[12]) || $c; # simple upper case mapping (the codepoint itself when field 12 yields 0, e.g. is empty)
30 my $slm = hex($f[13]) || $c; # simple lower case mapping (same fallback, using field 13)
31 # recalculate the upper/lower case mappings as offsets
41 $max = $c if $c > $max; # track the highest codepoint seen so far
44 $max += 255 - ($max % 256); # round up to the last codepoint of its 256-entry block, so $max + 1 is a multiple of 256
46 # Make sure there are no gaps: every codepoint in 0..$max is checked for an entry in %data
47 for(my $c = 0; $c <= $max; ++$c) {
48 if(!exists $data{$c}) {
50 "gc" => "Cn", # not assigned
# Generate the C header: STDOUT is reopened onto unidata.h, so anything
# printed to STDOUT from here until the close below lands in that file.
# NOTE(review): out() is defined outside this span; presumably it prints its
# arguments to STDOUT -- confirm.
59 open(STDOUT, ">unidata.h") or die "unidata.h: $!\n";
# Include guard.
61 out("#ifndef UNIDATA_H\n",
62 "#define UNIDATA_H\n");
# One enumerator, unicode_gc_<category>, per key of %cats in sorted order.
64 out("enum unicode_gc_cat {\n",
66 map(" unicode_gc_$_", sort keys %cats)), "\n};\n");
# Record type holding the per-codepoint information.
68 out("struct unidata {\n",
69 " enum unicode_gc_cat gc;\n",
71 " int upper_offset;\n",
72 " int lower_offset;\n",
# Declaration of the top-level table of pointers to struct unidata.
75 out("extern const struct unidata *const unidata[];\n");
# Number of codepoints covered, i.e. 0..$max inclusive.
77 out("#define UNICODE_NCHARS ", ($max + 1), "\n");
# The close is checked so that a failure while finishing the file is fatal.
81 close STDOUT or die "unidata.h: $!\n";
# Generate the C source: STDOUT is reopened onto unidata.c.
83 open(STDOUT, ">unidata.c") or die "unidata.c: $!\n";
85 out("#include \"unidata.h\"\n");
87 # Visit all the 256-character blocks in turn and generate the required
89 my %subtable = (); # rendered subtable content -> subtable number
90 my %subtableno = (); # base -> subtable number
91 my $subtablecounter = 0; # counter for subtable numbers (next unused number)
92 for(my $base = 0; $base <= $max; $base += 256) {
94 for(my $c = $base; $c <= $base + 255; ++$c) {
97 " { unicode_gc_$d->{gc}, $d->{ccc}, $d->{ud}, $d->{ld} }");
99 my $t = join(",\n", @t); # rendered initializer text for this block; also used as the de-duplication key
100 if(!exists $subtable{$t}) { # only the first block with this exact content emits a new subtable
101 out("static const struct unidata subtable$subtablecounter\[] = {\n",
104 $subtable{$t} = $subtablecounter++; # remember the number assigned to this content
106 $subtableno{$base} = $subtable{$t}; # record which (possibly shared) subtable this block uses
# Top-level table: one subtable reference per 256-codepoint block.
109 out("const struct unidata *const unidata[] = {\n");
110 for(my $base = 0; $base <= $max; $base += 256) {
111 out(" subtable$subtableno{$base},\n");
# The close is checked so that a failure while finishing the file is fatal.
115 close STDOUT or die "unidata.c: $!\n";
# Summary goes to STDERR, since STDOUT has been closed above.
117 print STDERR "max=$max, subtables=$subtablecounter\n";