From 18cda350fe80c2226684f9660f2956869552b317 Mon Sep 17 00:00:00 2001 Message-Id: <18cda350fe80c2226684f9660f2956869552b317.1715121354.git.mdw@distorted.org.uk> From: Mark Wooding Date: Sun, 18 Nov 2007 22:03:51 +0000 Subject: [PATCH] table-drive UTF-8 validity checker Organization: Straylight/Edgeware From: Richard Kettlewell --- lib/test.c | 1 + lib/unicode.c | 45 ++++++++ lib/unicode.h | 1 + lib/unidata.c | 258 +++++++++++++++++++++++++++++++++++++++++++ lib/unidata.h | 4 + lib/utf8.c | 8 -- lib/utf8.h | 3 - scripts/make-unidata | 41 +++++++ server/cgi.c | 6 +- 9 files changed, 353 insertions(+), 14 deletions(-) diff --git a/lib/test.c b/lib/test.c index d3e8282..b0b5395 100644 --- a/lib/test.c +++ b/lib/test.c @@ -148,6 +148,7 @@ static void test_utf8(void) { } while(0) fprintf(stderr, "test_utf8\n"); +#define validutf8(S) utf8_valid((S), strlen(S)) /* empty string */ diff --git a/lib/unicode.c b/lib/unicode.c index 5d37cad..8fe20a6 100644 --- a/lib/unicode.c +++ b/lib/unicode.c @@ -295,6 +295,51 @@ error: return 0; } +/** @brief Test whether [s,s+ns) is valid UTF-8 + * @param s Start of string (only the first @p ns bytes are read) + * @param ns Length of string in bytes + * @return non-0 if @p s is valid UTF-8 (an empty string is valid), 0 if not + * + * Table-driven via unicode_utf8_valid[]; this is intended to be much faster + * than calling utf8_to_utf32() and throwing away the result. 
+ */ +int utf8_valid(const char *s, size_t ns) { + const uint8_t *ss = (const uint8_t *)s; + while(ns > 0) { + const struct unicode_utf8_row *const r = &unicode_utf8_valid[*ss]; /* row for this lead byte */ + if(r->count <= ns) { /* whole sequence fits in the remaining input */ + switch(r->count) { + case 1: /* single byte, nothing more to check */ + break; + case 2: + if(ss[1] < r->min2 || ss[1] > r->max2) /* 2nd-byte range depends on the lead byte */ + return 0; + break; + case 3: + if(ss[1] < r->min2 || ss[1] > r->max2) + return 0; + if(ss[2] < 0x80 || ss[2] > 0xBF) /* plain continuation byte */ + return 0; + break; + case 4: + if(ss[1] < r->min2 || ss[1] > r->max2) + return 0; + if(ss[2] < 0x80 || ss[2] > 0xBF) + return 0; + if(ss[3] < 0x80 || ss[3] > 0xBF) + return 0; + break; + default: /* the table gives count 0 to bytes that cannot begin a sequence */ + return 0; + } + } else /* sequence would run past the end of the input */ + return 0; + ss += r->count; /* step over the sequence just checked */ + ns -= r->count; + } + return 1; +} + /*@}*/ /** @defgroup utf32iterator UTF-32 string iterators */ /*@{*/ diff --git a/lib/unicode.h b/lib/unicode.h index 0f156c4..1eb8c68 100644 --- a/lib/unicode.h +++ b/lib/unicode.h @@ -38,6 +38,7 @@ typedef struct utf32_iterator_data *utf32_iterator; char *utf32_to_utf8(const uint32_t *s, size_t ns, size_t *nd); uint32_t *utf8_to_utf32(const char *s, size_t ns, size_t *nd); +int utf8_valid(const char *s, size_t ns); size_t utf32_len(const uint32_t *s); int utf32_cmp(const uint32_t *a, const uint32_t *b); diff --git a/lib/unidata.c b/lib/unidata.c index 7baf4ba..8799172 100644 --- a/lib/unidata.c +++ b/lib/unidata.c @@ -30516,3 +30516,261 @@ st560, st560, st76, }; +const struct unicode_utf8_row unicode_utf8_valid[] = { + { 1, 0, 0 }, /* 0 */ + { 1, 0, 0 }, /* 1 */ + { 1, 0, 0 }, /* 2 */ + { 1, 0, 0 }, /* 3 */ + { 1, 0, 0 }, /* 4 */ + { 1, 0, 0 }, /* 5 */ + { 1, 0, 0 }, /* 6 */ + { 1, 0, 0 }, /* 7 */ + { 1, 0, 0 }, /* 8 */ + { 1, 0, 0 }, /* 9 */ + { 1, 0, 0 }, /* 10 */ + { 1, 0, 0 }, /* 11 */ + { 1, 0, 0 }, /* 12 */ + { 1, 0, 0 }, /* 13 */ + { 1, 0, 0 }, /* 14 */ + { 1, 0, 0 }, /* 15 */ + { 1, 0, 0 }, /* 16 */ + { 1, 0, 0 }, /* 17 */ + { 1, 0, 0 }, /* 18 */ + { 1, 0, 0 }, /* 19 */ + { 1, 0, 0 }, /* 20 */ + { 1, 0, 0 }, /* 21 */ + { 1, 0, 0 }, /* 22 */ + { 1, 0, 0 }, /* 23 */ + 
{ 1, 0, 0 }, /* 24 */ + { 1, 0, 0 }, /* 25 */ + { 1, 0, 0 }, /* 26 */ + { 1, 0, 0 }, /* 27 */ + { 1, 0, 0 }, /* 28 */ + { 1, 0, 0 }, /* 29 */ + { 1, 0, 0 }, /* 30 */ + { 1, 0, 0 }, /* 31 */ + { 1, 0, 0 }, /* 32 */ + { 1, 0, 0 }, /* 33 */ + { 1, 0, 0 }, /* 34 */ + { 1, 0, 0 }, /* 35 */ + { 1, 0, 0 }, /* 36 */ + { 1, 0, 0 }, /* 37 */ + { 1, 0, 0 }, /* 38 */ + { 1, 0, 0 }, /* 39 */ + { 1, 0, 0 }, /* 40 */ + { 1, 0, 0 }, /* 41 */ + { 1, 0, 0 }, /* 42 */ + { 1, 0, 0 }, /* 43 */ + { 1, 0, 0 }, /* 44 */ + { 1, 0, 0 }, /* 45 */ + { 1, 0, 0 }, /* 46 */ + { 1, 0, 0 }, /* 47 */ + { 1, 0, 0 }, /* 48 */ + { 1, 0, 0 }, /* 49 */ + { 1, 0, 0 }, /* 50 */ + { 1, 0, 0 }, /* 51 */ + { 1, 0, 0 }, /* 52 */ + { 1, 0, 0 }, /* 53 */ + { 1, 0, 0 }, /* 54 */ + { 1, 0, 0 }, /* 55 */ + { 1, 0, 0 }, /* 56 */ + { 1, 0, 0 }, /* 57 */ + { 1, 0, 0 }, /* 58 */ + { 1, 0, 0 }, /* 59 */ + { 1, 0, 0 }, /* 60 */ + { 1, 0, 0 }, /* 61 */ + { 1, 0, 0 }, /* 62 */ + { 1, 0, 0 }, /* 63 */ + { 1, 0, 0 }, /* 64 */ + { 1, 0, 0 }, /* 65 */ + { 1, 0, 0 }, /* 66 */ + { 1, 0, 0 }, /* 67 */ + { 1, 0, 0 }, /* 68 */ + { 1, 0, 0 }, /* 69 */ + { 1, 0, 0 }, /* 70 */ + { 1, 0, 0 }, /* 71 */ + { 1, 0, 0 }, /* 72 */ + { 1, 0, 0 }, /* 73 */ + { 1, 0, 0 }, /* 74 */ + { 1, 0, 0 }, /* 75 */ + { 1, 0, 0 }, /* 76 */ + { 1, 0, 0 }, /* 77 */ + { 1, 0, 0 }, /* 78 */ + { 1, 0, 0 }, /* 79 */ + { 1, 0, 0 }, /* 80 */ + { 1, 0, 0 }, /* 81 */ + { 1, 0, 0 }, /* 82 */ + { 1, 0, 0 }, /* 83 */ + { 1, 0, 0 }, /* 84 */ + { 1, 0, 0 }, /* 85 */ + { 1, 0, 0 }, /* 86 */ + { 1, 0, 0 }, /* 87 */ + { 1, 0, 0 }, /* 88 */ + { 1, 0, 0 }, /* 89 */ + { 1, 0, 0 }, /* 90 */ + { 1, 0, 0 }, /* 91 */ + { 1, 0, 0 }, /* 92 */ + { 1, 0, 0 }, /* 93 */ + { 1, 0, 0 }, /* 94 */ + { 1, 0, 0 }, /* 95 */ + { 1, 0, 0 }, /* 96 */ + { 1, 0, 0 }, /* 97 */ + { 1, 0, 0 }, /* 98 */ + { 1, 0, 0 }, /* 99 */ + { 1, 0, 0 }, /* 100 */ + { 1, 0, 0 }, /* 101 */ + { 1, 0, 0 }, /* 102 */ + { 1, 0, 0 }, /* 103 */ + { 1, 0, 0 }, /* 104 */ + { 1, 0, 0 }, /* 105 */ + { 1, 0, 0 }, /* 106 */ + 
{ 1, 0, 0 }, /* 107 */ + { 1, 0, 0 }, /* 108 */ + { 1, 0, 0 }, /* 109 */ + { 1, 0, 0 }, /* 110 */ + { 1, 0, 0 }, /* 111 */ + { 1, 0, 0 }, /* 112 */ + { 1, 0, 0 }, /* 113 */ + { 1, 0, 0 }, /* 114 */ + { 1, 0, 0 }, /* 115 */ + { 1, 0, 0 }, /* 116 */ + { 1, 0, 0 }, /* 117 */ + { 1, 0, 0 }, /* 118 */ + { 1, 0, 0 }, /* 119 */ + { 1, 0, 0 }, /* 120 */ + { 1, 0, 0 }, /* 121 */ + { 1, 0, 0 }, /* 122 */ + { 1, 0, 0 }, /* 123 */ + { 1, 0, 0 }, /* 124 */ + { 1, 0, 0 }, /* 125 */ + { 1, 0, 0 }, /* 126 */ + { 1, 0, 0 }, /* 127 */ + { 0, 0, 0 }, /* 128 */ + { 0, 0, 0 }, /* 129 */ + { 0, 0, 0 }, /* 130 */ + { 0, 0, 0 }, /* 131 */ + { 0, 0, 0 }, /* 132 */ + { 0, 0, 0 }, /* 133 */ + { 0, 0, 0 }, /* 134 */ + { 0, 0, 0 }, /* 135 */ + { 0, 0, 0 }, /* 136 */ + { 0, 0, 0 }, /* 137 */ + { 0, 0, 0 }, /* 138 */ + { 0, 0, 0 }, /* 139 */ + { 0, 0, 0 }, /* 140 */ + { 0, 0, 0 }, /* 141 */ + { 0, 0, 0 }, /* 142 */ + { 0, 0, 0 }, /* 143 */ + { 0, 0, 0 }, /* 144 */ + { 0, 0, 0 }, /* 145 */ + { 0, 0, 0 }, /* 146 */ + { 0, 0, 0 }, /* 147 */ + { 0, 0, 0 }, /* 148 */ + { 0, 0, 0 }, /* 149 */ + { 0, 0, 0 }, /* 150 */ + { 0, 0, 0 }, /* 151 */ + { 0, 0, 0 }, /* 152 */ + { 0, 0, 0 }, /* 153 */ + { 0, 0, 0 }, /* 154 */ + { 0, 0, 0 }, /* 155 */ + { 0, 0, 0 }, /* 156 */ + { 0, 0, 0 }, /* 157 */ + { 0, 0, 0 }, /* 158 */ + { 0, 0, 0 }, /* 159 */ + { 0, 0, 0 }, /* 160 */ + { 0, 0, 0 }, /* 161 */ + { 0, 0, 0 }, /* 162 */ + { 0, 0, 0 }, /* 163 */ + { 0, 0, 0 }, /* 164 */ + { 0, 0, 0 }, /* 165 */ + { 0, 0, 0 }, /* 166 */ + { 0, 0, 0 }, /* 167 */ + { 0, 0, 0 }, /* 168 */ + { 0, 0, 0 }, /* 169 */ + { 0, 0, 0 }, /* 170 */ + { 0, 0, 0 }, /* 171 */ + { 0, 0, 0 }, /* 172 */ + { 0, 0, 0 }, /* 173 */ + { 0, 0, 0 }, /* 174 */ + { 0, 0, 0 }, /* 175 */ + { 0, 0, 0 }, /* 176 */ + { 0, 0, 0 }, /* 177 */ + { 0, 0, 0 }, /* 178 */ + { 0, 0, 0 }, /* 179 */ + { 0, 0, 0 }, /* 180 */ + { 0, 0, 0 }, /* 181 */ + { 0, 0, 0 }, /* 182 */ + { 0, 0, 0 }, /* 183 */ + { 0, 0, 0 }, /* 184 */ + { 0, 0, 0 }, /* 185 */ + { 0, 0, 0 }, /* 186 */ + 
{ 0, 0, 0 }, /* 187 */ + { 0, 0, 0 }, /* 188 */ + { 0, 0, 0 }, /* 189 */ + { 0, 0, 0 }, /* 190 */ + { 0, 0, 0 }, /* 191 */ + { 0, 0, 0 }, /* 192 */ + { 0, 0, 0 }, /* 193 */ + { 2, 0x80, 0xBF }, /* 194 */ + { 2, 0x80, 0xBF }, /* 195 */ + { 2, 0x80, 0xBF }, /* 196 */ + { 2, 0x80, 0xBF }, /* 197 */ + { 2, 0x80, 0xBF }, /* 198 */ + { 2, 0x80, 0xBF }, /* 199 */ + { 2, 0x80, 0xBF }, /* 200 */ + { 2, 0x80, 0xBF }, /* 201 */ + { 2, 0x80, 0xBF }, /* 202 */ + { 2, 0x80, 0xBF }, /* 203 */ + { 2, 0x80, 0xBF }, /* 204 */ + { 2, 0x80, 0xBF }, /* 205 */ + { 2, 0x80, 0xBF }, /* 206 */ + { 2, 0x80, 0xBF }, /* 207 */ + { 2, 0x80, 0xBF }, /* 208 */ + { 2, 0x80, 0xBF }, /* 209 */ + { 2, 0x80, 0xBF }, /* 210 */ + { 2, 0x80, 0xBF }, /* 211 */ + { 2, 0x80, 0xBF }, /* 212 */ + { 2, 0x80, 0xBF }, /* 213 */ + { 2, 0x80, 0xBF }, /* 214 */ + { 2, 0x80, 0xBF }, /* 215 */ + { 2, 0x80, 0xBF }, /* 216 */ + { 2, 0x80, 0xBF }, /* 217 */ + { 2, 0x80, 0xBF }, /* 218 */ + { 2, 0x80, 0xBF }, /* 219 */ + { 2, 0x80, 0xBF }, /* 220 */ + { 2, 0x80, 0xBF }, /* 221 */ + { 2, 0x80, 0xBF }, /* 222 */ + { 2, 0x80, 0xBF }, /* 223 */ + { 3, 0xA0, 0xBF }, /* 224 */ + { 3, 0x80, 0xBF }, /* 225 */ + { 3, 0x80, 0xBF }, /* 226 */ + { 3, 0x80, 0xBF }, /* 227 */ + { 3, 0x80, 0xBF }, /* 228 */ + { 3, 0x80, 0xBF }, /* 229 */ + { 3, 0x80, 0xBF }, /* 230 */ + { 3, 0x80, 0xBF }, /* 231 */ + { 3, 0x80, 0xBF }, /* 232 */ + { 3, 0x80, 0xBF }, /* 233 */ + { 3, 0x80, 0xBF }, /* 234 */ + { 3, 0x80, 0xBF }, /* 235 */ + { 3, 0x80, 0xBF }, /* 236 */ + { 3, 0x80, 0x9F }, /* 237 */ + { 3, 0x80, 0xBF }, /* 238 */ + { 3, 0x80, 0xBF }, /* 239 */ + { 4, 0x90, 0xBF }, /* 240 */ + { 4, 0x80, 0xBF }, /* 241 */ + { 4, 0x80, 0xBF }, /* 242 */ + { 4, 0x80, 0xBF }, /* 243 */ + { 4, 0x80, 0x8F }, /* 244 */ + { 0, 0, 0 }, /* 245 */ + { 0, 0, 0 }, /* 246 */ + { 0, 0, 0 }, /* 247 */ + { 0, 0, 0 }, /* 248 */ + { 0, 0, 0 }, /* 249 */ + { 0, 0, 0 }, /* 250 */ + { 0, 0, 0 }, /* 251 */ + { 0, 0, 0 }, /* 252 */ + { 0, 0, 0 }, /* 253 */ + { 0, 0, 0 }, /* 
254 */ + { 0, 0, 0 }, /* 255 */ +}; diff --git a/lib/unidata.h b/lib/unidata.h index dbd15e6..c2daccb 100644 --- a/lib/unidata.h +++ b/lib/unidata.h @@ -89,6 +89,10 @@ struct unidata { char sentence_break; }; extern const struct unidata *const unidata[]; +extern const struct unicode_utf8_row { + uint8_t count; + uint8_t min2, max2; +} unicode_utf8_valid[]; #define UNICODE_NCHARS 1114112 #define UNICODE_MODULUS 16 #define UNICODE_BREAK_START 196608 diff --git a/lib/utf8.c b/lib/utf8.c index da78f00..9ca228c 100644 --- a/lib/utf8.c +++ b/lib/utf8.c @@ -23,14 +23,6 @@ #include "utf8.h" -int validutf8(const char *s) { - unsigned long c; - - while(*s) - PARSE_UTF8(s, c, return 0); - return 1; -} - /* Local Variables: c-basic-offset:2 diff --git a/lib/utf8.h b/lib/utf8.h index d6b9d59..683a8a9 100644 --- a/lib/utf8.h +++ b/lib/utf8.h @@ -52,9 +52,6 @@ } \ } while(0) -int validutf8(const char *s); -/* return nonzero if S is a valid UTF-8 sequence, else false */ - #endif /* UTF8_h */ /* diff --git a/scripts/make-unidata b/scripts/make-unidata index 9fa0dec..4339fee 100755 --- a/scripts/make-unidata +++ b/scripts/make-unidata @@ -371,6 +371,11 @@ out("struct unidata {\n", out("extern const struct unidata *const unidata[];\n"); +out("extern const struct unicode_utf8_row {\n", + " uint8_t count;\n", + " uint8_t min2, max2;\n", + "} unicode_utf8_valid[];\n"); + out("#define UNICODE_NCHARS ", ($max + 1), "\n"); out("#define UNICODE_MODULUS $modulus\n"); out("#define UNICODE_BREAK_START $break_start\n"); @@ -527,6 +532,42 @@ for(my $base = 0; $base <= $max; $base += $modulus) { } out("};\n"); +out("const struct unicode_utf8_row unicode_utf8_valid[] = {\n"); +for(my $c = 0; $c <= 0x7F; ++$c) { + out(" { 1, 0, 0 }, /* $c */\n"); +} +for(my $c = 0x80; $c < 0xC2; ++$c) { + out(" { 0, 0, 0 }, /* $c */\n"); +} +for(my $c = 0xC2; $c <= 0xDF; ++$c) { + out(" { 2, 0x80, 0xBF }, /* $c */\n"); +} +for(my $c = 0xE0; $c <= 0xE0; ++$c) { + out(" { 3, 0xA0, 0xBF }, /* $c */\n"); +} +for(my $c 
= 0xE1; $c <= 0xEC; ++$c) { + out(" { 3, 0x80, 0xBF }, /* $c */\n"); +} +for(my $c = 0xED; $c <= 0xED; ++$c) { + out(" { 3, 0x80, 0x9F }, /* $c */\n"); +} +for(my $c = 0xEE; $c <= 0xEF; ++$c) { + out(" { 3, 0x80, 0xBF }, /* $c */\n"); +} +for(my $c = 0xF0; $c <= 0xF0; ++$c) { + out(" { 4, 0x90, 0xBF }, /* $c */\n"); +} +for(my $c = 0xF1; $c <= 0xF3; ++$c) { + out(" { 4, 0x80, 0xBF }, /* $c */\n"); +} +for(my $c = 0xF4; $c <= 0xF4; ++$c) { + out(" { 4, 0x80, 0x8F }, /* $c */\n"); +} +for(my $c = 0xF5; $c <= 0xFF; ++$c) { + out(" { 0, 0, 0 }, /* $c */\n"); +} +out("};\n"); + close STDOUT or die "unidata.c: $!\n"; printf STDERR "modulus=%d\n", $modulus; diff --git a/server/cgi.c b/server/cgi.c index ce04563..37ac724 100644 --- a/server/cgi.c +++ b/server/cgi.c @@ -52,7 +52,7 @@ #include "cgi.h" #include "printf.h" #include "mime.h" -#include "utf8.h" +#include "unicode.h" struct kvp *cgi_args; @@ -187,8 +187,8 @@ void cgi_parse(void) { else fatal(0, "unknown request method %s", p); for(k = cgi_args; k; k = k->next) - if(!validutf8(k->name) - || !validutf8(k->value)) + if(!utf8_valid(k->name, strlen(k->name)) + || !utf8_valid(k->value, strlen(k->value))) fatal(0, "invalid UTF-8 sequence in cgi argument"); } -- [mdw]