chiark / gitweb /
it's plugins not plugin
[disorder] / scripts / make-unidata
CommitLineData
61507e3c
RK
1#! /usr/bin/perl -w
2#
3# Generate a two-level table describing (some of) the fields of UnicodeData.txt
4use strict;
5
# Print all arguments to the currently selected output handle.
#
# Dies with the system error message if the print fails; otherwise returns
# the (true) result of print.
sub out {
    my $ok = print @_;
    die "$!\n" unless $ok;
    return $ok;
}
9
# Build a string key from a hash reference: the hash's values, taken in
# sorted-key order, joined with "-".
sub key {
    my $d = shift;
    my @ordered = sort keys %$d;

    # A hash slice fetches the values in exactly the order of @ordered.
    return join("-", @{$d}{@ordered});
}
16
my %cats = ();                  # known general categories
my %data = ();                  # mapping of codepoints to information
my %comp = ();                  # not referenced anywhere else in this script
my $max = 0;                    # maximum codepoint
my $range_first;                # codepoint of a pending "<..., First>" line

# Record one line of UnicodeData.txt.
#
# For every codepoint the line describes, stores a hash in %data with keys
#   gc  - general category (field 2)
#   ccc - canonical combining class (field 3)
#   ud  - simple upper case mapping (field 12) as an offset from the codepoint
#   ld  - simple lower case mapping (field 13) as an offset from the codepoint
# (an offset of 0 means "no mapping"), records the general category in %cats
# and raises $max when the codepoint exceeds it.
#
# UnicodeData.txt abbreviates large uniform ranges (CJK ideographs, Hangul
# syllables, surrogates, private use) as two lines whose names end in
# ", First>" and ", Last>".  When a Last line follows its First line, every
# codepoint from First to Last is recorded with that line's properties;
# otherwise the interior of the range would later be mistaken for
# unassigned codepoints.
#
# Blank lines and codepoints at or above 0xE0000 are ignored.
sub record_line {
    my $line = shift;

    chomp $line;
    return if $line =~ /^\s*$/;         # a blank line must not clobber U+0000

    # A limit of -1 keeps trailing empty fields, so the case mapping columns
    # are defined even when the final line lacks a newline.
    my @f = split(/;/, $line, -1);
    my $c = hex($f[0]);                 # codepoint
    return if $c >= 0xE0000;            # ignore various high-numbered stuff
    my $name = defined $f[1] ? $f[1] : '';
    my $gc = $f[2];                     # general category
    my $ccc = $f[3];                    # canonical combining class
    my $sum = hex($f[12]) || $c;        # simple upper case mapping
    my $slm = hex($f[13]) || $c;        # simple lower case mapping
    # recalculate the upper/lower case mappings as offsets
    my $ud = $sum - $c;
    my $ld = $slm - $c;

    # A Last line closes the range opened by the preceding First line.  An
    # unpaired (or backwards) Last line is treated as a single codepoint.
    my $first = $c;
    if($name =~ /, Last>$/ and defined $range_first and $range_first <= $c) {
        $first = $range_first;
    }
    # Only the line immediately after a First line may close its range.
    $range_first = ($name =~ /, First>$/) ? $c : undef;

    for my $cp ($first .. $c) {
        $data{$cp} = {
            "gc" => $gc,
            "ccc" => $ccc,
            "ud" => $ud,
            "ld" => $ld
        };
    }
    $cats{$gc} = 1;
    $max = $c if $c > $max;
    return;
}

# Read UnicodeData.txt from the files named on the command line (or stdin).
while(<>) {
    record_line($_);
}
43
# Round $max up to the last codepoint of its 256-codepoint block, i.e. force
# the low eight bits to 1.
$max |= 0xFF;

# Make sure there are no gaps: every codepoint up to $max that the input did
# not mention is recorded as unassigned, with no combining class and no case
# mappings.
for my $c (0 .. $max) {
    next if exists $data{$c};
    $data{$c} = {
        "gc" => "Cn",           # not assigned
        "ccc" => 0,
        "ud" => 0,
        "ld" => 0
    };
}
$cats{'Cn'} = 1;
58
# Write unidata.h by redirecting STDOUT to it.  The header declares the
# general category enumeration (one member per key of %cats, in sorted
# order), struct unidata, the unidata[] table and UNICODE_NCHARS ($max + 1).
{
    open(STDOUT, '>', "unidata.h") or die "unidata.h: $!\n";

    my @members;
    for my $cat (sort keys %cats) {
        push(@members, " unicode_gc_$cat");
    }
    my $nchars = $max + 1;

    out("#ifndef UNIDATA_H\n" .
        "#define UNIDATA_H\n");

    out("enum unicode_gc_cat {\n" . join(",\n", @members) . "\n};\n");

    out("struct unidata {\n" .
        " enum unicode_gc_cat gc;\n" .
        " int ccc;\n" .
        " int upper_offset;\n" .
        " int lower_offset;\n" .
        "};\n");

    out("extern const struct unidata *const unidata[];\n");

    out("#define UNICODE_NCHARS $nchars\n");

    out("#endif\n");

    # Buffered write errors only surface when the handle is closed.
    close STDOUT or die "unidata.h: $!\n";
}
82
# Write unidata.c by redirecting STDOUT to it, then report a summary on
# STDERR.
open(STDOUT, '>', "unidata.c") or die "unidata.c: $!\n";

out("#include \"unidata.h\"\n");

# Visit all the 256-character blocks in turn and generate the required
# subtables.  Blocks whose rendered contents are identical share a single
# subtable, which is emitted the first time it is seen.
my %number_of_body = ();        # rendered subtable text -> subtable number
my @block_subtable = ();        # subtable number of each block, in order
my $nsubtables = 0;             # number of distinct subtables emitted so far
my $base = 0;
while($base <= $max) {
    my @rows;
    for my $c ($base .. $base + 255) {
        my $info = $data{$c};
        push(@rows,
             " { unicode_gc_$info->{gc}, $info->{ccc}, $info->{ud}, $info->{ld} }");
    }
    my $body = join(",\n", @rows);
    unless(exists $number_of_body{$body}) {
        out("static const struct unidata subtable$nsubtables\[] = {\n",
            "$body\n",
            "};\n");
        $number_of_body{$body} = $nsubtables++;
    }
    push(@block_subtable, $number_of_body{$body});
    $base += 256;
}

# The top-level table has one pointer per 256-codepoint block.
out("const struct unidata *const unidata[] = {\n");
for my $n (@block_subtable) {
    out(" subtable$n,\n");
}
out("};\n");

# Buffered write errors only surface when the handle is closed.
close STDOUT or die "unidata.c: $!\n";

print STDERR "max=$max, subtables=$nsubtables\n";