if(c < UNICODE_NCHARS) {
const struct unidata *const ud = &unidata[c / UNICODE_MODULUS][c % UNICODE_MODULUS];
- return ud->gc == unicode_gc_Mn || ud->ccc != 0;
+ return ud->general_category == unicode_General_Category_Mn || ud->ccc != 0;
}
/* Assume unknown characters are noncombining */
return 0;
* @param c Code point
* @return General_Category property value
*/
-static inline enum unicode_gc_cat utf32__general_category(uint32_t c) {
+static inline enum unicode_General_Category utf32__general_category(uint32_t c) {
if(c < UNICODE_NCHARS) {
const struct unidata *const ud = &unidata[c / UNICODE_MODULUS][c % UNICODE_MODULUS];
- return ud->gc;
+ return ud->general_category;
} else
- return unicode_gc_Cn;
+ return unicode_General_Category_Cn;
}
/** @brief Check Grapheme_Cluster_Break property
switch(utf32__general_category(c)) {
default:
return 0;
- case unicode_gc_Zl:
- case unicode_gc_Zp:
- case unicode_gc_Cc:
+ case unicode_General_Category_Zl:
+ case unicode_General_Category_Zp:
+ case unicode_General_Category_Cc:
return 1;
- case unicode_gc_Cf:
+ case unicode_General_Category_Cf:
if(c == 0x200C || c == 0x200D)
return 0;
return 1;
#include <config.h>
#include "types.h"
#include "unidata.h"
-#define Cc unicode_gc_Cc
-#define Cf unicode_gc_Cf
-#define Cn unicode_gc_Cn
-#define Co unicode_gc_Co
-#define Cs unicode_gc_Cs
-#define Ll unicode_gc_Ll
-#define Lm unicode_gc_Lm
-#define Lo unicode_gc_Lo
-#define Lt unicode_gc_Lt
-#define Lu unicode_gc_Lu
-#define Mc unicode_gc_Mc
-#define Me unicode_gc_Me
-#define Mn unicode_gc_Mn
-#define Nd unicode_gc_Nd
-#define Nl unicode_gc_Nl
-#define No unicode_gc_No
-#define Pc unicode_gc_Pc
-#define Pd unicode_gc_Pd
-#define Pe unicode_gc_Pe
-#define Pf unicode_gc_Pf
-#define Pi unicode_gc_Pi
-#define Po unicode_gc_Po
-#define Ps unicode_gc_Ps
-#define Sc unicode_gc_Sc
-#define Sk unicode_gc_Sk
-#define Sm unicode_gc_Sm
-#define So unicode_gc_So
-#define Zl unicode_gc_Zl
-#define Zp unicode_gc_Zp
-#define Zs unicode_gc_Zs
+#define Cc unicode_General_Category_Cc
+#define Cf unicode_General_Category_Cf
+#define Cn unicode_General_Category_Cn
+#define Co unicode_General_Category_Co
+#define Cs unicode_General_Category_Cs
+#define Ll unicode_General_Category_Ll
+#define Lm unicode_General_Category_Lm
+#define Lo unicode_General_Category_Lo
+#define Lt unicode_General_Category_Lt
+#define Lu unicode_General_Category_Lu
+#define Mc unicode_General_Category_Mc
+#define Me unicode_General_Category_Me
+#define Mn unicode_General_Category_Mn
+#define Nd unicode_General_Category_Nd
+#define Nl unicode_General_Category_Nl
+#define No unicode_General_Category_No
+#define Pc unicode_General_Category_Pc
+#define Pd unicode_General_Category_Pd
+#define Pe unicode_General_Category_Pe
+#define Pf unicode_General_Category_Pf
+#define Pi unicode_General_Category_Pi
+#define Po unicode_General_Category_Po
+#define Ps unicode_General_Category_Ps
+#define Sc unicode_General_Category_Sc
+#define Sk unicode_General_Category_Sk
+#define Sm unicode_General_Category_Sm
+#define So unicode_General_Category_So
+#define Zl unicode_General_Category_Zl
+#define Zp unicode_General_Category_Zp
+#define Zs unicode_General_Category_Zs
#define GBCR unicode_Grapheme_Break_CR
#define GBControl unicode_Grapheme_Break_Control
#define GBExtend unicode_Grapheme_Break_Extend
/* Automatically generated file, see scripts/make-unidata */
#ifndef UNIDATA_H
#define UNIDATA_H
-enum unicode_gc_cat {
- unicode_gc_Cc,
- unicode_gc_Cf,
- unicode_gc_Cn,
- unicode_gc_Co,
- unicode_gc_Cs,
- unicode_gc_Ll,
- unicode_gc_Lm,
- unicode_gc_Lo,
- unicode_gc_Lt,
- unicode_gc_Lu,
- unicode_gc_Mc,
- unicode_gc_Me,
- unicode_gc_Mn,
- unicode_gc_Nd,
- unicode_gc_Nl,
- unicode_gc_No,
- unicode_gc_Pc,
- unicode_gc_Pd,
- unicode_gc_Pe,
- unicode_gc_Pf,
- unicode_gc_Pi,
- unicode_gc_Po,
- unicode_gc_Ps,
- unicode_gc_Sc,
- unicode_gc_Sk,
- unicode_gc_Sm,
- unicode_gc_So,
- unicode_gc_Zl,
- unicode_gc_Zp,
- unicode_gc_Zs
+enum unicode_General_Category {
+ unicode_General_Category_Cc,
+ unicode_General_Category_Cf,
+ unicode_General_Category_Cn,
+ unicode_General_Category_Co,
+ unicode_General_Category_Cs,
+ unicode_General_Category_Ll,
+ unicode_General_Category_Lm,
+ unicode_General_Category_Lo,
+ unicode_General_Category_Lt,
+ unicode_General_Category_Lu,
+ unicode_General_Category_Mc,
+ unicode_General_Category_Me,
+ unicode_General_Category_Mn,
+ unicode_General_Category_Nd,
+ unicode_General_Category_Nl,
+ unicode_General_Category_No,
+ unicode_General_Category_Pc,
+ unicode_General_Category_Pd,
+ unicode_General_Category_Pe,
+ unicode_General_Category_Pf,
+ unicode_General_Category_Pi,
+ unicode_General_Category_Po,
+ unicode_General_Category_Ps,
+ unicode_General_Category_Sc,
+ unicode_General_Category_Sk,
+ unicode_General_Category_Sm,
+ unicode_General_Category_So,
+ unicode_General_Category_Zl,
+ unicode_General_Category_Zp,
+ unicode_General_Category_Zs
};
enum unicode_Grapheme_Break {
unicode_Grapheme_Break_CR,
int16_t upper_offset;
int16_t lower_offset;
unsigned char ccc;
- char gc;
+ char general_category;
uint8_t flags;
char grapheme_break;
char word_break;
return utf8_casefold_canon(ptr, strlen(ptr), 0);
}
-static enum unicode_gc_cat cat(uint32_t c) {
+static enum unicode_General_Category cat(uint32_t c) {
if(c < UNICODE_NCHARS) {
const struct unidata *const ud = &unidata[c / UNICODE_MODULUS][c % UNICODE_MODULUS];
- return ud->gc;
+ return ud->general_category;
} else
- return unicode_gc_Cn;
+ return unicode_General_Category_Cn;
}
/* XXX this is a bit kludgy */
}
/* do the rest on category */
switch(cat(c)) {
- case unicode_gc_Ll:
- case unicode_gc_Lm:
- case unicode_gc_Lo:
- case unicode_gc_Lt:
- case unicode_gc_Lu:
- case unicode_gc_Nd:
- case unicode_gc_Nl:
- case unicode_gc_No:
- case unicode_gc_Sc:
- case unicode_gc_Sk:
- case unicode_gc_Sm:
- case unicode_gc_So:
+ case unicode_General_Category_Ll:
+ case unicode_General_Category_Lm:
+ case unicode_General_Category_Lo:
+ case unicode_General_Category_Lt:
+ case unicode_General_Category_Lu:
+ case unicode_General_Category_Nd:
+ case unicode_General_Category_Nl:
+ case unicode_General_Category_No:
+ case unicode_General_Category_Sc:
+ case unicode_General_Category_Sk:
+ case unicode_General_Category_Sm:
+ case unicode_General_Category_So:
/* letters, digits and symbols are considered to be part of
* words */
if(!in_word) {
dynstr_append_bytes(&d, start, s - start);
break;
- case unicode_gc_Cc:
- case unicode_gc_Cf:
- case unicode_gc_Co:
- case unicode_gc_Cs:
- case unicode_gc_Zl:
- case unicode_gc_Zp:
- case unicode_gc_Zs:
- case unicode_gc_Pe:
- case unicode_gc_Ps:
+ case unicode_General_Category_Cc:
+ case unicode_General_Category_Cf:
+ case unicode_General_Category_Co:
+ case unicode_General_Category_Cs:
+ case unicode_General_Category_Zl:
+ case unicode_General_Category_Zp:
+ case unicode_General_Category_Zs:
+ case unicode_General_Category_Pe:
+ case unicode_General_Category_Ps:
separator:
if(in_word) {
dynstr_terminate(&d);
}
break;
- case unicode_gc_Mc:
- case unicode_gc_Me:
- case unicode_gc_Mn:
- case unicode_gc_Pc:
- case unicode_gc_Pd:
- case unicode_gc_Pf:
- case unicode_gc_Pi:
- case unicode_gc_Po:
- case unicode_gc_Cn:
+ case unicode_General_Category_Mc:
+ case unicode_General_Category_Me:
+ case unicode_General_Category_Mn:
+ case unicode_General_Category_Pc:
+ case unicode_General_Category_Pd:
+ case unicode_General_Category_Pf:
+ case unicode_General_Category_Pi:
+ case unicode_General_Category_Po:
+ case unicode_General_Category_Cn:
/* marks, remaining punctuation and unassigned characters are ignored */
break;
"#define UNIDATA_H\n");
# TODO choose stable values for General_Category
-out("enum unicode_gc_cat {\n",
+out("enum unicode_General_Category {\n",
join(",\n",
- map(" unicode_gc_$_", sort keys %cats)), "\n};\n");
+ map(" unicode_General_Category_$_", sort keys %cats)), "\n};\n");
out("enum unicode_Grapheme_Break {\n",
join(",\n",
" ".choosetype($minud, $maxud)." upper_offset;\n",
" ".choosetype($minld, $maxld)." lower_offset;\n",
" ".choosetype(0, $maxccc)." ccc;\n",
- " char gc;\n",
+ " char general_category;\n",
" uint8_t flags;\n",
" char grapheme_break;\n",
" char word_break;\n",
# Short aliases to keep .c file small
-out(map(sprintf("#define %s unicode_gc_%s\n", $_, $_), sort keys %cats));
-out(map(sprintf("#define GB%s unicode_Grapheme_Break_%s\n", $_, $_), sort keys %gbreak));
-out(map(sprintf("#define WB%s unicode_Word_Break_%s\n", $_, $_), sort keys %wbreak));
-out(map(sprintf("#define SB%s unicode_Sentence_Break_%s\n", $_, $_), sort keys %sbreak));
+out(map(sprintf("#define %s unicode_General_Category_%s\n", $_, $_),
+ sort keys %cats));
+out(map(sprintf("#define GB%s unicode_Grapheme_Break_%s\n", $_, $_),
+ sort keys %gbreak));
+out(map(sprintf("#define WB%s unicode_Word_Break_%s\n", $_, $_),
+ sort keys %wbreak));
+out(map(sprintf("#define SB%s unicode_Sentence_Break_%s\n", $_, $_),
+ sort keys %sbreak));
# Names for *_Break properties
out("const char *const unicode_Grapheme_Break_names[] = {\n",