- c = *ss++;
- --ns;
- /* Acceptable UTF-8 is that which codes for Unicode Scalar Values
- * (Unicode 5.0.0 s3.9 D76)
- *
- * 0xxxxxxx
- * 7 data bits gives 0x00 - 0x7F and all are acceptable
- *
- * 110xxxxx 10xxxxxx
- * 11 data bits gives 0x0000 - 0x07FF but only 0x0080 - 0x07FF acceptable
- *
- * 1110xxxx 10xxxxxx 10xxxxxx
- * 16 data bits gives 0x0000 - 0xFFFF but only 0x0800 - 0xFFFF acceptable
- * (and UTF-16 surrogates are not acceptable)
- *
- * 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
- * 21 data bits gives 0x00000000 - 0x001FFFFF
- * but only 0x00010000 - 0x0010FFFF are acceptable
- *
- * It is NOT always the case that the data bits in the first byte are
- * always non-0 for the acceptable values, so we do a separate check after
- * decoding.
- */
- if(c < 0x80)
- c32 = c;
- else if(c <= 0xDF) {
- if(ns < 1) goto error;
- c32 = c & 0x1F;
- c = *ss++;
- if((c & 0xC0) != 0x80) goto error;
- c32 = (c32 << 6) | (c & 0x3F);
- if(c32 < 0x80) goto error;
- } else if(c <= 0xEF) {
- if(ns < 2) goto error;
- c32 = c & 0x0F;
- c = *ss++;
- if((c & 0xC0) != 0x80) goto error;
- c32 = (c32 << 6) | (c & 0x3F);
- c = *ss++;
- if((c & 0xC0) != 0x80) goto error;
- c32 = (c32 << 6) | (c & 0x3F);
- if(c32 < 0x0800 || (c32 >= 0xD800 && c32 <= 0xDFFF)) goto error;
- } else if(c <= 0xF7) {
- if(ns < 3) goto error;
- c32 = c & 0x07;
- c = *ss++;
- if((c & 0xC0) != 0x80) goto error;
- c32 = (c32 << 6) | (c & 0x3F);
- c = *ss++;
- if((c & 0xC0) != 0x80) goto error;
- c32 = (c32 << 6) | (c & 0x3F);
- c = *ss++;
- if((c & 0xC0) != 0x80) goto error;
- c32 = (c32 << 6) | (c & 0x3F);
- if(c32 < 0x00010000 || c32 > 0x0010FFFF) goto error;
+ const struct unicode_utf8_row *const r = &unicode_utf8_valid[*ss];
+ if(r->count <= ns) {
+ switch(r->count) {
+ case 1:
+ c32 = *ss;
+ break;
+ case 2:
+ if(ss[1] < r->min2 || ss[1] > r->max2)
+ goto error;
+ c32 = *ss & 0x1F;
+ break;
+ case 3:
+ if(ss[1] < r->min2 || ss[1] > r->max2)
+ goto error;
+ c32 = *ss & 0x0F;
+ break;
+ case 4:
+ if(ss[1] < r->min2 || ss[1] > r->max2)
+ goto error;
+ c32 = *ss & 0x07;
+ break;
+ default:
+ goto error;
+ }