From 1452363583a176aafcb00a17bf76c223e3a1f31c Mon Sep 17 00:00:00 2001 Message-Id: <1452363583a176aafcb00a17bf76c223e3a1f31c.1713906264.git.mdw@distorted.org.uk> From: Mark Wooding Date: Sun, 18 Nov 2007 12:14:24 +0000 Subject: [PATCH] unicode_gc_cat -> unicode_General_Category Organization: Straylight/Edgeware From: Richard Kettlewell --- lib/charset.c | 2 +- lib/unicode.c | 14 +++++----- lib/unidata.c | 60 ++++++++++++++++++++-------------------- lib/unidata.h | 64 +++++++++++++++++++++--------------------- lib/words.c | 66 ++++++++++++++++++++++---------------------- scripts/make-unidata | 18 +++++++----- 6 files changed, 114 insertions(+), 110 deletions(-) diff --git a/lib/charset.c b/lib/charset.c index 9d77adc..c763d10 100644 --- a/lib/charset.c +++ b/lib/charset.c @@ -183,7 +183,7 @@ static int combining(int c) { if(c < UNICODE_NCHARS) { const struct unidata *const ud = &unidata[c / UNICODE_MODULUS][c % UNICODE_MODULUS]; - return ud->gc == unicode_gc_Mn || ud->ccc != 0; + return ud->general_category == unicode_General_Category_Mn || ud->ccc != 0; } /* Assume unknown characters are noncombining */ return 0; diff --git a/lib/unicode.c b/lib/unicode.c index 032e36e..618ff06 100644 --- a/lib/unicode.c +++ b/lib/unicode.c @@ -577,12 +577,12 @@ int utf32_cmp(const uint32_t *a, const uint32_t *b) { * @param Code point * @return General_Category property value */ -static inline enum unicode_gc_cat utf32__general_category(uint32_t c) { +static inline enum unicode_General_Category utf32__general_category(uint32_t c) { if(c < UNICODE_NCHARS) { const struct unidata *const ud = &unidata[c / UNICODE_MODULUS][c % UNICODE_MODULUS]; - return ud->gc; + return ud->general_category; } else - return unicode_gc_Cn; + return unicode_General_Category_Cn; } /** @brief Check Grapheme_Cluster_Break property @@ -593,11 +593,11 @@ static int utf32__is_control_or_cr_or_lf(uint32_t c) { switch(utf32__general_category(c)) { default: return 0; - case unicode_gc_Zl: - case unicode_gc_Zp: - case unicode_gc_Cc: + case unicode_General_Category_Zl: + case unicode_General_Category_Zp: + case unicode_General_Category_Cc: return 1; - case unicode_gc_Cf: + case unicode_General_Category_Cf: if(c == 0x200C || c == 0x200D) return 0; return 1; diff --git a/lib/unidata.c b/lib/unidata.c index 5ee07ee..09bb353 100644 --- a/lib/unidata.c +++ b/lib/unidata.c @@ -2,36 +2,36 @@ #include #include "types.h" #include "unidata.h" -#define Cc unicode_gc_Cc -#define Cf unicode_gc_Cf -#define Cn unicode_gc_Cn -#define Co unicode_gc_Co -#define Cs unicode_gc_Cs -#define Ll unicode_gc_Ll -#define Lm unicode_gc_Lm -#define Lo unicode_gc_Lo -#define Lt unicode_gc_Lt -#define Lu unicode_gc_Lu -#define Mc unicode_gc_Mc -#define Me unicode_gc_Me -#define Mn unicode_gc_Mn -#define Nd unicode_gc_Nd -#define Nl unicode_gc_Nl -#define No unicode_gc_No -#define Pc unicode_gc_Pc -#define Pd unicode_gc_Pd -#define Pe unicode_gc_Pe -#define Pf unicode_gc_Pf -#define Pi unicode_gc_Pi -#define Po unicode_gc_Po -#define Ps unicode_gc_Ps -#define Sc unicode_gc_Sc -#define Sk unicode_gc_Sk -#define Sm unicode_gc_Sm -#define So unicode_gc_So -#define Zl unicode_gc_Zl -#define Zp unicode_gc_Zp -#define Zs unicode_gc_Zs +#define Cc unicode_General_Category_Cc +#define Cf unicode_General_Category_Cf +#define Cn unicode_General_Category_Cn +#define Co unicode_General_Category_Co +#define Cs unicode_General_Category_Cs +#define Ll unicode_General_Category_Ll +#define Lm unicode_General_Category_Lm +#define Lo unicode_General_Category_Lo +#define Lt unicode_General_Category_Lt +#define Lu unicode_General_Category_Lu +#define Mc unicode_General_Category_Mc +#define Me unicode_General_Category_Me +#define Mn unicode_General_Category_Mn +#define Nd unicode_General_Category_Nd +#define Nl unicode_General_Category_Nl +#define No unicode_General_Category_No +#define Pc unicode_General_Category_Pc +#define Pd unicode_General_Category_Pd +#define Pe unicode_General_Category_Pe +#define Pf unicode_General_Category_Pf +#define Pi unicode_General_Category_Pi +#define Po unicode_General_Category_Po +#define Ps unicode_General_Category_Ps +#define Sc unicode_General_Category_Sc +#define Sk unicode_General_Category_Sk +#define Sm unicode_General_Category_Sm +#define So unicode_General_Category_So +#define Zl unicode_General_Category_Zl +#define Zp unicode_General_Category_Zp +#define Zs unicode_General_Category_Zs #define GBCR unicode_Grapheme_Break_CR #define GBControl unicode_Grapheme_Break_Control #define GBExtend unicode_Grapheme_Break_Extend diff --git a/lib/unidata.h b/lib/unidata.h index 5f22127..3688a76 100644 --- a/lib/unidata.h +++ b/lib/unidata.h @@ -1,37 +1,37 @@ /* Automatically generated file, see scripts/make-unidata */ #ifndef UNIDATA_H #define UNIDATA_H -enum unicode_gc_cat { - unicode_gc_Cc, - unicode_gc_Cf, - unicode_gc_Cn, - unicode_gc_Co, - unicode_gc_Cs, - unicode_gc_Ll, - unicode_gc_Lm, - unicode_gc_Lo, - unicode_gc_Lt, - unicode_gc_Lu, - unicode_gc_Mc, - unicode_gc_Me, - unicode_gc_Mn, - unicode_gc_Nd, - unicode_gc_Nl, - unicode_gc_No, - unicode_gc_Pc, - unicode_gc_Pd, - unicode_gc_Pe, - unicode_gc_Pf, - unicode_gc_Pi, - unicode_gc_Po, - unicode_gc_Ps, - unicode_gc_Sc, - unicode_gc_Sk, - unicode_gc_Sm, - unicode_gc_So, - unicode_gc_Zl, - unicode_gc_Zp, - unicode_gc_Zs +enum unicode_General_Category { + unicode_General_Category_Cc, + unicode_General_Category_Cf, + unicode_General_Category_Cn, + unicode_General_Category_Co, + unicode_General_Category_Cs, + unicode_General_Category_Ll, + unicode_General_Category_Lm, + unicode_General_Category_Lo, + unicode_General_Category_Lt, + unicode_General_Category_Lu, + unicode_General_Category_Mc, + unicode_General_Category_Me, + unicode_General_Category_Mn, + unicode_General_Category_Nd, + unicode_General_Category_Nl, + unicode_General_Category_No, + unicode_General_Category_Pc, + unicode_General_Category_Pd, + unicode_General_Category_Pe, + unicode_General_Category_Pf, + unicode_General_Category_Pi, + unicode_General_Category_Po, + unicode_General_Category_Ps, + unicode_General_Category_Sc, + unicode_General_Category_Sk, + unicode_General_Category_Sm, + unicode_General_Category_So, + unicode_General_Category_Zl, + unicode_General_Category_Zp, + unicode_General_Category_Zs }; enum unicode_Grapheme_Break { unicode_Grapheme_Break_CR, @@ -84,7 +84,7 @@ struct unidata { int16_t upper_offset; int16_t lower_offset; unsigned char ccc; - char gc; + char general_category; uint8_t flags; char grapheme_break; char word_break; diff --git a/lib/words.c b/lib/words.c index 01c9db2..2638ea6 100644 --- a/lib/words.c +++ b/lib/words.c @@ -39,12 +39,12 @@ const char *casefold(const char *ptr) { return utf8_casefold_canon(ptr, strlen(ptr), 0); } -static enum unicode_gc_cat cat(uint32_t c) { +static enum unicode_General_Category cat(uint32_t c) { if(c < UNICODE_NCHARS) { const struct unidata *const ud = &unidata[c / UNICODE_MODULUS][c % UNICODE_MODULUS]; - return ud->gc; + return ud->general_category; } else - return unicode_gc_Cn; + return unicode_General_Category_Cn; } /* XXX this is a bit kludgy */ @@ -73,18 +73,18 @@ char **words(const char *s, int *nvecp) { } /* do the rest on category */ switch(cat(c)) { - case unicode_gc_Ll: - case unicode_gc_Lm: - case unicode_gc_Lo: - case unicode_gc_Lt: - case unicode_gc_Lu: - case unicode_gc_Nd: - case unicode_gc_Nl: - case unicode_gc_No: - case unicode_gc_Sc: - case unicode_gc_Sk: - case unicode_gc_Sm: - case unicode_gc_So: + case unicode_General_Category_Ll: + case unicode_General_Category_Lm: + case unicode_General_Category_Lo: + case unicode_General_Category_Lt: + case unicode_General_Category_Lu: + case unicode_General_Category_Nd: + case unicode_General_Category_Nl: + case unicode_General_Category_No: + case unicode_General_Category_Sc: + case unicode_General_Category_Sk: + case unicode_General_Category_Sm: + case unicode_General_Category_So: /* letters, digits and symbols are considered to be part of * words */ if(!in_word) { @@ -94,15 +94,15 @@ char **words(const char *s, int *nvecp) { dynstr_append_bytes(&d, start, s - start); break; - case unicode_gc_Cc: - case unicode_gc_Cf: - case unicode_gc_Co: - case unicode_gc_Cs: - case unicode_gc_Zl: - case unicode_gc_Zp: - case unicode_gc_Zs: - case unicode_gc_Pe: - case unicode_gc_Ps: + case unicode_General_Category_Cc: + case unicode_General_Category_Cf: + case unicode_General_Category_Co: + case unicode_General_Category_Cs: + case unicode_General_Category_Zl: + case unicode_General_Category_Zp: + case unicode_General_Category_Zs: + case unicode_General_Category_Pe: + case unicode_General_Category_Ps: separator: if(in_word) { dynstr_terminate(&d); @@ -111,15 +111,15 @@ char **words(const char *s, int *nvecp) { } break; - case unicode_gc_Mc: - case unicode_gc_Me: - case unicode_gc_Mn: - case unicode_gc_Pc: - case unicode_gc_Pd: - case unicode_gc_Pf: - case unicode_gc_Pi: - case unicode_gc_Po: - case unicode_gc_Cn: + case unicode_General_Category_Mc: + case unicode_General_Category_Me: + case unicode_General_Category_Mn: + case unicode_General_Category_Pc: + case unicode_General_Category_Pd: + case unicode_General_Category_Pf: + case unicode_General_Category_Pi: + case unicode_General_Category_Po: + case unicode_General_Category_Cn: /* control and punctuation is completely ignored */ break; diff --git a/scripts/make-unidata b/scripts/make-unidata index bbb4aff..81f347d 100755 --- a/scripts/make-unidata +++ b/scripts/make-unidata @@ -252,9 +252,9 @@ out("/* Automatically generated file, see scripts/make-unidata */\n", "#define UNIDATA_H\n"); # TODO choose stable values for General_Category -out("enum unicode_gc_cat {\n", +out("enum unicode_General_Category {\n", join(",\n", - map(" unicode_gc_$_", sort keys %cats)), "\n};\n"); + map(" unicode_General_Category_$_", sort keys %cats)), "\n};\n"); out("enum unicode_Grapheme_Break {\n", join(",\n", @@ -302,7 +302,7 @@ out("struct unidata {\n", " ".choosetype($minud, $maxud)." upper_offset;\n", " ".choosetype($minld, $maxld)." lower_offset;\n", " ".choosetype(0, $maxccc)." ccc;\n", - " char gc;\n", + " char general_category;\n", " uint8_t flags;\n", " char grapheme_break;\n", " char word_break;\n", @@ -331,10 +331,14 @@ out("/* Automatically generated file, see scripts/make-unidata */\n", # Short aliases to keep .c file small -out(map(sprintf("#define %s unicode_gc_%s\n", $_, $_), sort keys %cats)); -out(map(sprintf("#define GB%s unicode_Grapheme_Break_%s\n", $_, $_), sort keys %gbreak)); -out(map(sprintf("#define WB%s unicode_Word_Break_%s\n", $_, $_), sort keys %wbreak)); -out(map(sprintf("#define SB%s unicode_Sentence_Break_%s\n", $_, $_), sort keys %sbreak)); +out(map(sprintf("#define %s unicode_General_Category_%s\n", $_, $_), + sort keys %cats)); +out(map(sprintf("#define GB%s unicode_Grapheme_Break_%s\n", $_, $_), + sort keys %gbreak)); +out(map(sprintf("#define WB%s unicode_Word_Break_%s\n", $_, $_), + sort keys %wbreak)); +out(map(sprintf("#define SB%s unicode_Sentence_Break_%s\n", $_, $_), + sort keys %sbreak)); # Names for *_Break properties out("const char *const unicode_Grapheme_Break_names[] = {\n", -- [mdw]