#include "charset.h"
#include "unidata.h"
+#include "unicode.h"
const char *casefold(const char *ptr) {
- struct dynstr d;
- uint32_t c;
- const char *s = ptr;
-
- dynstr_init(&d);
- while(*s) {
- /* Convert UTF-8 to UCS-32 */
- PARSE_UTF8(s, c, return ptr);
- /* Normalize */
- if(c < UNICODE_NCHARS) {
- /* If this a known character, convert it to lower case */
- const struct unidata *const ud = &unidata[c / 256][c % 256];
- c += ud->lower_offset;
- }
- /* Convert UCS-4 back to UTF-8 */
- one_ucs42utf8(c, &d);
- }
- dynstr_terminate(&d);
- return d.vec;
+ return utf8_casefold_canon(ptr, strlen(ptr), 0); /* hand the whole NUL-terminated string to utf8_casefold_canon() instead of the per-character lower_offset loop removed above; the meaning of the final 0 argument and the result on invalid UTF-8 are not visible here - TODO confirm */
}
-static enum unicode_gc_cat cat(uint32_t c) {
+static enum unicode_General_Category cat(uint32_t c) { /* return the General_Category recorded for code point c */
if(c < UNICODE_NCHARS) {
- /* If this a known character, convert it to lower case */
- const struct unidata *const ud = &unidata[c / 256][c % 256];
- return ud->gc;
+ const struct unidata *const ud = &unidata[c / UNICODE_MODULUS][c % UNICODE_MODULUS]; /* two-level lookup: c / UNICODE_MODULUS picks the row, c % UNICODE_MODULUS the entry within it */
+ return ud->general_category;
} else
- return unicode_gc_Cn;
+ return unicode_General_Category_Cn; /* c is outside the table (c >= UNICODE_NCHARS): report it as Cn */
}
/* XXX this is a bit kludgy */
}
/* do the rest on category */
switch(cat(c)) {
- case unicode_gc_Ll:
- case unicode_gc_Lm:
- case unicode_gc_Lo:
- case unicode_gc_Lt:
- case unicode_gc_Lu:
- case unicode_gc_Nd:
- case unicode_gc_Nl:
- case unicode_gc_No:
- case unicode_gc_Sc:
- case unicode_gc_Sk:
- case unicode_gc_Sm:
- case unicode_gc_So:
+ case unicode_General_Category_Ll:
+ case unicode_General_Category_Lm:
+ case unicode_General_Category_Lo:
+ case unicode_General_Category_Lt:
+ case unicode_General_Category_Lu:
+ case unicode_General_Category_Nd:
+ case unicode_General_Category_Nl:
+ case unicode_General_Category_No:
+ case unicode_General_Category_Sc:
+ case unicode_General_Category_Sk:
+ case unicode_General_Category_Sm:
+ case unicode_General_Category_So:
/* letters, digits and symbols are considered to be part of
* words */
if(!in_word) {
dynstr_append_bytes(&d, start, s - start);
break;
- case unicode_gc_Cc:
- case unicode_gc_Cf:
- case unicode_gc_Co:
- case unicode_gc_Cs:
- case unicode_gc_Zl:
- case unicode_gc_Zp:
- case unicode_gc_Zs:
- case unicode_gc_Pe:
- case unicode_gc_Ps:
+ case unicode_General_Category_Cc:
+ case unicode_General_Category_Cf:
+ case unicode_General_Category_Co:
+ case unicode_General_Category_Cs:
+ case unicode_General_Category_Zl:
+ case unicode_General_Category_Zp:
+ case unicode_General_Category_Zs:
+ case unicode_General_Category_Pe:
+ case unicode_General_Category_Ps:
separator:
if(in_word) {
dynstr_terminate(&d);
}
break;
- case unicode_gc_Mc:
- case unicode_gc_Me:
- case unicode_gc_Mn:
- case unicode_gc_Pc:
- case unicode_gc_Pd:
- case unicode_gc_Pf:
- case unicode_gc_Pi:
- case unicode_gc_Po:
- case unicode_gc_Cn:
+ case unicode_General_Category_Mc:
+ case unicode_General_Category_Me:
+ case unicode_General_Category_Mn:
+ case unicode_General_Category_Pc:
+ case unicode_General_Category_Pd:
+ case unicode_General_Category_Pf:
+ case unicode_General_Category_Pi:
+ case unicode_General_Category_Po:
+ case unicode_General_Category_Cn:
/* marks, the remaining punctuation categories and unassigned code
 * points are completely ignored */
break;