[disorder] / scripts / make-unidata

#! /usr/bin/perl -w
#
# Generate a two-level table describing (some of) the fields of UnicodeData.txt
use strict;

# Print all arguments to the currently selected output handle.
# Dies with the system error message if the write fails.
sub out {
    return 1 if print(@_);
    die "$!\n";
}

# Build a string identifying a record: the values of the hash referenced
# by the argument, taken in string-sorted key order and joined with "-".
sub key {
    my ($rec) = @_;
    my @fields = sort keys %$rec;

    # A hash slice fetches the values in @fields order without touching $_.
    return join("-", @{$rec}{@fields});
}

my %cats = ();			# known general categories
my %data = ();			# mapping of codepoints to information
my %comp = ();			# not referenced elsewhere in the visible script
my $max = 0;			# maximum codepoint

# Read UnicodeData.txt-format records (semicolon-separated fields) from the
# files named on the command line, or from standard input if there are none.
# For each codepoint below 0xE0000 record its general category, canonical
# combining class and simple upper/lower case mappings (stored as offsets
# from the codepoint) in %data, note the category in %cats and track the
# highest codepoint seen in $max.
#
# Large blocks are not listed one codepoint at a time: the file gives one
# line whose name ends ", First>" and another whose name ends ", Last>", and
# every codepoint between the two has the same properties.  Such pairs are
# expanded here so that the codepoints inside the range are not later
# mistaken for unassigned ones.
my $first;			# codepoint of a pending "<..., First>" line
while(<>) {
    next unless /\S/;		# skip blank lines
    my @f = split(/;/, $_);
    my $c = hex($f[0]);		# codepoint
    next if $c >= 0xE0000;	# ignore various high-numbered stuff
    my $name = defined $f[1] ? $f[1] : "";
    if($name =~ /, First>$/) {
	die "line $.: range start inside another range\n"
	    if defined $first;
	$first = $c;
	next;			# properties are taken from the Last line
    }
    my $lo = $c;		# lowest codepoint described by this line
    if($name =~ /, Last>$/) {
	die "line $.: range end without a range start\n"
	    unless defined $first;
	$lo = $first;
    } elsif(defined $first) {
	die "line $.: range start not followed by a range end\n";
    }
    undef $first;
    my $gc = $f[2];		# general category
    my $ccc = $f[3];		# canonical combining class
    my $sum = hex($f[12]) || $c; # simple upper case mapping
    my $slm = hex($f[13]) || $c; # simple lower case mapping
    # recalculate the upper/lower case mappings as offsets
    my $ud = $sum - $c;
    my $ld = $slm - $c;
    # Each codepoint gets its own record; for an ordinary line $lo == $c
    # and the loop runs exactly once.
    for(my $cp = $lo; $cp <= $c; ++$cp) {
	$data{$cp} = {
	    "gc" => $gc,
	    "ccc" => $ccc,
	    "ud" => $ud,
	    "ld" => $ld
	    };
    }
    $cats{$gc} = 1;
    $max = $c if $c > $max;
}
die "input ended inside a First/Last range\n" if defined $first;

# Round $max up to the last codepoint of its 256-codepoint block by
# setting the low eight bits.
$max |= 0xFF;

# Every codepoint from 0 to $max must have a record; any that the input
# did not mention is entered as unassigned (Cn) with no combining class
# and no case offsets.
foreach my $c (0 .. $max) {
    next if exists $data{$c};
    $data{$c} = { "gc" => "Cn", "ccc" => 0, "ud" => 0, "ld" => 0 };
}
$cats{'Cn'} = 1;

# Write unidata.h: redirect STDOUT to the file, then emit the include
# guard, one enumerator per known general category (in sorted order), the
# per-codepoint struct, the declaration of the top-level table and the
# number of codepoints covered.
open(STDOUT, ">unidata.h") or die "unidata.h: $!\n";

out("#ifndef UNIDATA_H\n",
    "#define UNIDATA_H\n",
    "enum unicode_gc_cat {\n",
    join(",\n", map("  unicode_gc_$_", sort keys %cats)),
    "\n};\n",
    "struct unidata {\n",
    "  enum unicode_gc_cat gc;\n",
    "  int ccc;\n",
    "  int upper_offset;\n",
    "  int lower_offset;\n",
    "};\n",
    "extern const struct unidata *const unidata[];\n",
    "#define UNICODE_NCHARS ", ($max + 1), "\n",
    "#endif\n");

# Buffered write errors only show up when the handle is closed.
close STDOUT or die "unidata.h: $!\n";

# Write unidata.c: redirect STDOUT to the file and emit a two-level table.
open(STDOUT, ">unidata.c") or die "unidata.c: $!\n";

out("#include \"unidata.h\"\n");

# Visit each 256-codepoint block in turn.  The text of a block's initializer
# doubles as its identity, so blocks with identical contents share a single
# emitted subtable.
my %subtable = ();		# subtable text -> subtable number
my %subtableno = ();		# block base codepoint -> subtable number
my $subtablecounter = 0;	# number of distinct subtables emitted so far
for(my $base = 0; $base <= $max; $base += 256) {
    my @rows = map {
	my $d = $data{$_};
	"  { unicode_gc_$d->{gc}, $d->{ccc}, $d->{ud}, $d->{ld} }";
    } $base .. $base + 255;
    my $t = join(",\n", @rows);
    unless(exists $subtable{$t}) {
	# First time this content has been seen: emit it and number it.
	out("static const struct unidata subtable$subtablecounter\[] = {\n",
	    "$t\n",
	    "};\n");
	$subtable{$t} = $subtablecounter++;
    }
    $subtableno{$base} = $subtable{$t};
}

# Top-level table: one pointer per block, in codepoint order.
out("const struct unidata *const unidata[] = {\n",
    map("  subtable$subtableno{$_ * 256},\n", 0 .. int($max / 256)),
    "};\n");

close STDOUT or die "unidata.c: $!\n";

# Report the table size on standard error.
print STDERR "max=$max, subtables=$subtablecounter\n";
Commit	Line	Data
61507e3c RK	1	#! /usr/bin/perl -w
	2	#
	3	# Generate a two-level table describing (some of) the fields of UnicodeData.txt
	4	use strict;
	5
	6	sub out {
	7	print @_ or die "$!\n";
	8	}
	9
	10	sub key {
	11	my $d = shift;
	12	local $_;
	13
	14	return join("-", map($d->{$_}, sort keys %$d));
	15	}
	16
	17	my %cats = (); # known general categories
	18	my %data = (); # mapping of codepoints to information
	19	my %comp = (); #
	20	my $max = 0; # maximum codepoint
	21
	22	while(<>) {
	23	my @f = split(/;/, $_);
	24	my $c = hex($f[0]); # codepoint
	25	next if $c >= 0xE0000; # ignore various high-numbered stuff
	26	my $name = $f[1];
	27	my $gc = $f[2]; # general category
	28	my $ccc = $f[3]; # canonical combining class
	29	my $sum = hex($f[12]) \|\| $c; # simple upper case mapping
	30	my $slm = hex($f[13]) \|\| $c; # simple lower case mapping
	31	# recalculate the upper/lower case mappings as offsets
	32	my $ud = $sum - $c;
	33	my $ld = $slm - $c;
	34	$data{$c} = {
	35	"gc" => $gc,
	36	"ccc" => $ccc,
	37	"ud" => $ud,
	38	"ld" => $ld
	39	};
	40	$cats{$gc} = 1;
	41	$max = $c if $c > $max;
	42	}
	43
	44	$max += 255 - ($max % 256); # round up
	45
	46	# Make sure there are no gaps
	47	for(my $c = 0; $c <= $max; ++$c) {
	48	if(!exists $data{$c}) {
	49	$data{$c} = {
	50	"gc" => "Cn", # not assigned
	51	"ccc" => 0,
	52	"ud" => 0,
	53	"ld" => 0
	54	};
	55	}
	56	}
	57	$cats{'Cn'} = 1;
	58
	59	open(STDOUT, ">unidata.h") or die "unidata.h: $!\n";
	60
	61	out("#ifndef UNIDATA_H\n",
	62	"#define UNIDATA_H\n");
	63
	64	out("enum unicode_gc_cat {\n",
65	join(",\n",
66	map(" unicode_gc_$_", sort keys %cats)), "\n};\n");
67
68	out("struct unidata {\n",
69	" enum unicode_gc_cat gc;\n",
70	" int ccc;\n",
71	" int upper_offset;\n",
72	" int lower_offset;\n",
73	"};\n");
74
75	out("extern const struct unidata *const unidata[];\n");
76
77	out("#define UNICODE_NCHARS ", ($max + 1), "\n");
78
79	out("#endif\n");
80
81	close STDOUT or die "unidata.h: $!\n";
82
83	open(STDOUT, ">unidata.c") or die "unidata.c: $!\n";
84
85	out("#include \"unidata.h\"\n");
86
87	# Visit all the 256-character blocks in turn and generate the required
88	# subtables
89	my %subtable = (); # base->subtable number
90	my %subtableno = (); # subtable number -> content
91	my $subtablecounter = 0; # counter for subtable numbers
92	for(my $base = 0; $base <= $max; $base += 256) {
93	my @t;
94	for(my $c = $base; $c <= $base + 255; ++$c) {
95	my $d = $data{$c};
96	push(@t,
97	" { unicode_gc_$d->{gc}, $d->{ccc}, $d->{ud}, $d->{ld} }");
98	}
99	my $t = join(",\n", @t);
100	if(!exists $subtable{$t}) {
101	out("static const struct unidata subtable$subtablecounter\[] = {\n",
102	"$t\n",
103	"};\n");
104	$subtable{$t} = $subtablecounter++;
105	}
106	$subtableno{$base} = $subtable{$t};
107	}
108
109	out("const struct unidata *const unidata[] = {\n");
110	for(my $base = 0; $base <= $max; $base += 256) {
111	out(" subtable$subtableno{$base},\n");
112	}
113	out("};\n");
114
115	close STDOUT or die "unidata.c: $!\n";
116
117	print STDERR "max=$max, subtables=$subtablecounter\n";