#! /usr/bin/perl -w
#
# Generate a two-level table describing (some of) the fields of
# UnicodeData.txt
#
# Input:  UnicodeData.txt, named on the command line or supplied on
#         standard input (read via <>).
# Output: unidata.h and unidata.c in the current directory, plus a
#         one-line summary on standard error.
#
# The first level (unidata[]) has one pointer per 256-codepoint block;
# the second level is a set of 256-entry subtables.  Blocks with
# identical contents share a single subtable.
use strict;

# Print the arguments to STDOUT, dying with the system error if the
# write fails.
sub out {
  print @_ or die "$!\n";
}

# Return the values of the hash referenced by the argument, taken in
# sorted key order and joined with "-".  Not called anywhere in the
# visible script; retained unchanged.
sub key {
  my $d = shift;
  local $_;

  return join("-", map($d->{$_}, sort keys %$d));
}

my %cats = ();                  # known general categories
my %data = ();                  # mapping of codepoints to information
my %comp = ();                  # (not referenced in the visible code)
my $max = 0;                    # maximum codepoint
my $range_first;                # start of a pending <..., First> range

while(<>) {
  next if /^\s*(?:#|$)/;        # tolerate blank and comment lines
  my @f = split(/;/, $_);
  my $c = hex($f[0]);           # codepoint
  next if $c >= 0xE0000;        # ignore various high-numbered stuff
  my $name = $f[1];
  my $gc = $f[2];               # general category
  my $ccc = $f[3];              # canonical combining class
  my $sum = hex($f[12]) || $c;  # simple upper case mapping
  my $slm = hex($f[13]) || $c;  # simple lower case mapping
  # recalculate the upper/lower case mappings as offsets
  my $ud = $sum - $c;
  my $ld = $slm - $c;
  my $info = {
    "gc" => $gc,
    "ccc" => $ccc,
    "ud" => $ud,
    "ld" => $ld
  };

  # UnicodeData.txt abbreviates large uniform blocks (CJK ideographs,
  # Hangul syllables, surrogates, private use) as a pair of lines named
  # "<..., First>" and "<..., Last>".  Every codepoint in between has
  # the same properties as the endpoints, so fill them in here rather
  # than letting the gap-filling pass below mark them as unassigned.
  my $is_last = ($name =~ /^<.*, Last>$/);
  if(defined $range_first) {
    die sprintf("%s:%d: expected end of range starting at U+%04X\n",
                $ARGV, $., $range_first)
      unless $is_last;
    # The entries are never modified after this point, so the whole
    # range can share a single record.
    for(my $r = $range_first + 1; $r < $c; ++$r) {
      $data{$r} = $info;
    }
    undef $range_first;
  } elsif($is_last) {
    die sprintf("%s:%d: end of range U+%04X without a start\n",
                $ARGV, $., $c);
  } elsif($name =~ /^<.*, First>$/) {
    $range_first = $c;
  }

  $data{$c} = $info;
  $cats{$gc} = 1;
  $max = $c if $c > $max;
}

# A range whose end was never seen (including one whose end lies at or
# above the 0xE0000 cutoff) cannot be filled in correctly.
die sprintf("unterminated range starting at U+%04X\n", $range_first)
  if defined $range_first;

$max += 255 - ($max % 256);     # round up

# Make sure there are no gaps
for(my $c = 0; $c <= $max; ++$c) {
  if(!exists $data{$c}) {
    $data{$c} = {
      "gc" => "Cn",             # not assigned
      "ccc" => 0,
      "ud" => 0,
      "ld" => 0
    };
  }
}
$cats{'Cn'} = 1;

# Header file: category enum, record layout, table declaration and the
# number of codepoints covered.
open(STDOUT, ">", "unidata.h") or die "unidata.h: $!\n";

out("#ifndef UNIDATA_H\n",
    "#define UNIDATA_H\n");

out("enum unicode_gc_cat {\n",
    join(",\n",
         map(" unicode_gc_$_", sort keys %cats)),
    "\n};\n");

out("struct unidata {\n",
    " enum unicode_gc_cat gc;\n",
    " int ccc;\n",
    " int upper_offset;\n",
    " int lower_offset;\n",
    "};\n");

out("extern const struct unidata *const unidata[];\n");

out("#define UNICODE_NCHARS ", ($max + 1), "\n");

out("#endif\n");

close STDOUT or die "unidata.h: $!\n";

# Source file: the subtables followed by the top-level pointer table.
open(STDOUT, ">", "unidata.c") or die "unidata.c: $!\n";

out("#include \"unidata.h\"\n");

# Visit all the 256-character blocks in turn and generate the required
# subtables
my %subtable = ();              # subtable content -> subtable number
my %subtableno = ();            # block base -> subtable number
my $subtablecounter = 0;        # counter for subtable numbers
for(my $base = 0; $base <= $max; $base += 256) {
  my @t;
  for(my $c = $base; $c <= $base + 255; ++$c) {
    my $d = $data{$c};
    push(@t,
         " { unicode_gc_$d->{gc}, $d->{ccc}, $d->{ud}, $d->{ld} }");
  }
  my $t = join(",\n", @t);
  # Only emit a subtable the first time its exact text is seen; later
  # blocks with the same contents reuse its number.
  if(!exists $subtable{$t}) {
    out("static const struct unidata subtable$subtablecounter\[] = {\n",
        "$t\n",
        "};\n");
    $subtable{$t} = $subtablecounter++;
  }
  $subtableno{$base} = $subtable{$t};
}

out("const struct unidata *const unidata[] = {\n");
for(my $base = 0; $base <= $max; $base += 256) {
  out(" subtable$subtableno{$base},\n");
}
out("};\n");

close STDOUT or die "unidata.c: $!\n";

print STDERR "max=$max, subtables=$subtablecounter\n";