chiark - git - mdw - disorder/blame_incremental

... / ...

Commit	Line	Data
	1	#! /usr/bin/perl -w
	2	#
	3	# Generate a two-level table describing (some of) the fields of UnicodeData.txt
	4	use strict;
	5
	6	sub out {
	7	print @_ or die "$!\n";
	8	}
	9
	10	sub key {
	11	my $d = shift;
	12	local $_;
	13
	14	return join("-", map($d->{$_}, sort keys %$d));
	15	}
	16
	17	my %cats = (); # known general categories
	18	my %data = (); # mapping of codepoints to information
	19	my %comp = (); #
	20	my $max = 0; # maximum codepoint
	21
	22	while(<>) {
	23	my @f = split(/;/, $_);
	24	my $c = hex($f[0]); # codepoint
	25	next if $c >= 0xE0000; # ignore various high-numbered stuff
	26	my $name = $f[1];
	27	my $gc = $f[2]; # general category
	28	my $ccc = $f[3]; # canonical combining class
	29	my $sum = hex($f[12]) \|\| $c; # simple upper case mapping
	30	my $slm = hex($f[13]) \|\| $c; # simple lower case mapping
	31	# recalculate the upper/lower case mappings as offsets
	32	my $ud = $sum - $c;
	33	my $ld = $slm - $c;
	34	$data{$c} = {
	35	"gc" => $gc,
	36	"ccc" => $ccc,
	37	"ud" => $ud,
	38	"ld" => $ld
	39	};
	40	$cats{$gc} = 1;
	41	$max = $c if $c > $max;
	42	}
	43
	44	$max += 255 - ($max % 256); # round up
	45
	46	# Make sure there are no gaps
	47	for(my $c = 0; $c <= $max; ++$c) {
	48	if(!exists $data{$c}) {
	49	$data{$c} = {
	50	"gc" => "Cn", # not assigned
	51	"ccc" => 0,
	52	"ud" => 0,
	53	"ld" => 0
	54	};
	55	}
	56	}
	57	$cats{'Cn'} = 1;
	58
	59	open(STDOUT, ">unidata.h") or die "unidata.h: $!\n";
	60
	61	out("#ifndef UNIDATA_H\n",
	62	"#define UNIDATA_H\n");
	63
	64	out("enum unicode_gc_cat {\n",
	65	join(",\n",
	66	map(" unicode_gc_$_", sort keys %cats)), "\n};\n");
	67
	68	out("struct unidata {\n",
	69	" enum unicode_gc_cat gc;\n",
	70	" int ccc;\n",
	71	" int upper_offset;\n",
	72	" int lower_offset;\n",
	73	"};\n");
	74
	75	out("extern const struct unidata *const unidata[];\n");
	76
	77	out("#define UNICODE_NCHARS ", ($max + 1), "\n");
	78
	79	out("#endif\n");
	80
	81	close STDOUT or die "unidata.h: $!\n";
	82
	83	open(STDOUT, ">unidata.c") or die "unidata.c: $!\n";
	84
	85	out("#include \"unidata.h\"\n");
	86
	87	# Visit all the 256-character blocks in turn and generate the required
	88	# subtables
	89	my %subtable = (); # base->subtable number
	90	my %subtableno = (); # subtable number -> content
	91	my $subtablecounter = 0; # counter for subtable numbers
	92	for(my $base = 0; $base <= $max; $base += 256) {
	93	my @t;
	94	for(my $c = $base; $c <= $base + 255; ++$c) {
	95	my $d = $data{$c};
	96	push(@t,
	97	" { unicode_gc_$d->{gc}, $d->{ccc}, $d->{ud}, $d->{ld} }");
	98	}
	99	my $t = join(",\n", @t);
	100	if(!exists $subtable{$t}) {
	101	out("static const struct unidata subtable$subtablecounter\[] = {\n",
	102	"$t\n",
	103	"};\n");
	104	$subtable{$t} = $subtablecounter++;
	105	}
	106	$subtableno{$base} = $subtable{$t};
	107	}
	108
	109	out("const struct unidata *const unidata[] = {\n");
	110	for(my $base = 0; $base <= $max; $base += 256) {
	111	out(" subtable$subtableno{$base},\n");
	112	}
	113	out("};\n");
	114
	115	close STDOUT or die "unidata.c: $!\n";
	116
	117	print STDERR "max=$max, subtables=$subtablecounter\n";