/*
 * This file is part of DisOrder.
 * Copyright (C) 2005, 2007, 2008 Richard Kettlewell
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */
| 18 | #include "test.h" |
| 19 | |
static void test_utf8(void) {
  /* Validity shorthand for a NUL-terminated string: passes the string and
   * its strlen() to utf8_valid.  Defined ahead of U8, whose expansion uses
   * it. */
#define validutf8(S) utf8_valid((S), strlen(S))

  /* Round-trip check for one well-formed sequence.
   *
   * UTF8 is the encoded string; UCS4 is the textual list of code points it
   * is expected to decode to (handed to ucs4parse).  In order, the macro:
   *   - passes validutf8(UTF8) to insist();
   *   - converts UTF8 with utf8_to_utf32 and compares the result against
   *     the parsed UCS4 list using utf32_cmp;
   *   - converts back with utf32_to_utf8 and hands the result, together
   *     with the original string, to check_string().
   *
   * The expressions given to insist() and check_string() are deliberately
   * left spelled as they are: both come from outside this file and may echo
   * their arguments in diagnostics. */
#define U8(UTF8, UCS4) do {                                     \
    uint32_t *w, *ucs;                                          \
    char *u8;                                                   \
                                                                \
    w = ucs4parse(UCS4);                                        \
    insist(validutf8(UTF8));                                    \
    ucs = utf8_to_utf32(UTF8, strlen(UTF8), 0);                 \
    insist(ucs != 0);                                           \
    insist(!utf32_cmp(w, ucs));                                 \
    u8 = utf32_to_utf8(ucs, utf32_len(ucs), 0);                 \
    insist(u8 != 0);                                            \
    check_string(u8, UTF8);                                     \
  } while (0)

  /* The empty string round-trips to an empty code point list. */
  U8("", "");

  /* Single-byte (ASCII) characters: first the printable range 0x20-0x7e,
   * then the control characters 0x01-0x1f plus DEL (0x7f). */
  U8(" !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~",
     "0x20 0x21 0x22 0x23 0x24 0x25 0x26 0x27 0x28 0x29 0x2a 0x2b 0x2c 0x2d "
     "0x2e 0x2f 0x30 0x31 0x32 0x33 0x34 0x35 0x36 0x37 0x38 0x39 0x3a "
     "0x3b 0x3c 0x3d 0x3e 0x3f 0x40 0x41 0x42 0x43 0x44 0x45 0x46 0x47 "
     "0x48 0x49 0x4a 0x4b 0x4c 0x4d 0x4e 0x4f 0x50 0x51 0x52 0x53 0x54 "
     "0x55 0x56 0x57 0x58 0x59 0x5a 0x5b 0x5c 0x5d 0x5e 0x5f 0x60 0x61 "
     "0x62 0x63 0x64 0x65 0x66 0x67 0x68 0x69 0x6a 0x6b 0x6c 0x6d 0x6e "
     "0x6f 0x70 0x71 0x72 0x73 0x74 0x75 0x76 0x77 0x78 0x79 0x7a 0x7b "
     "0x7c 0x7d 0x7e");
  U8("\001\002\003\004\005\006\007\010\011\012\013\014\015\016\017\020\021\022\023\024\025\026\027\030\031\032\033\034\035\036\037\177",
     "0x1 0x2 0x3 0x4 0x5 0x6 0x7 0x8 0x9 0xa 0xb 0xc 0xd 0xe 0xf 0x10 "
     "0x11 0x12 0x13 0x14 0x15 0x16 0x17 0x18 0x19 0x1a 0x1b 0x1c 0x1d "
     "0x1e 0x1f 0x7f");

  /* The remaining positive and negative cases sit either side of the range
   * boundaries in the RFC3629 grammar, quoted before each group. */

  /* Two-byte forms:
   *   UTF8-2 = %xC2-DF UTF8-tail
   */
  insist(!validutf8("\xC0\x80"));
  insist(!validutf8("\xC1\x80"));
  insist(!validutf8("\xC2\x7F"));
  U8("\xC2\x80", "0x80");
  U8("\xDF\xBF", "0x7FF");
  insist(!validutf8("\xDF\xC0"));

  /* Three-byte forms:
   *   UTF8-3 = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) /
   *            %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail )
   */

  /* ... lead byte E0 */
  insist(!validutf8("\xE0\x9F\x80"));
  U8("\xE0\xA0\x80", "0x800");
  U8("\xE0\xBF\xBF", "0xFFF");
  insist(!validutf8("\xE0\xC0\xBF"));

  /* ... lead bytes E1-EC */
  insist(!validutf8("\xE1\x80\x7F"));
  U8("\xE1\x80\x80", "0x1000");
  U8("\xEC\xBF\xBF", "0xCFFF");
  insist(!validutf8("\xEC\xC0\xBF"));

  /* ... lead byte ED */
  U8("\xED\x80\x80", "0xD000");
  U8("\xED\x9F\xBF", "0xD7FF");
  insist(!validutf8("\xED\xA0\xBF"));

  /* ... lead bytes EE-EF */
  insist(!validutf8("\xEE\x7f\x80"));
  U8("\xEE\x80\x80", "0xE000");
  U8("\xEF\xBF\xBF", "0xFFFF");
  insist(!validutf8("\xEF\xC0\xBF"));

  /* Four-byte forms:
   *   UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) /
   *            %xF4 %x80-8F 2( UTF8-tail )
   */

  /* ... lead byte F0 */
  insist(!validutf8("\xF0\x8F\x80\x80"));
  U8("\xF0\x90\x80\x80", "0x10000");
  U8("\xF0\xBF\xBF\xBF", "0x3FFFF");
  insist(!validutf8("\xF0\xC0\x80\x80"));

  /* ... lead bytes F1-F3 */
  insist(!validutf8("\xF1\x80\x80\x7F"));
  U8("\xF1\x80\x80\x80", "0x40000");
  U8("\xF3\xBF\xBF\xBF", "0xFFFFF");
  insist(!validutf8("\xF3\xC0\x80\x80"));

  /* ... lead byte F4 */
  insist(!validutf8("\xF4\x80\x80\x7F"));
  U8("\xF4\x80\x80\x80", "0x100000");
  U8("\xF4\x8F\xBF\xBF", "0x10FFFF");
  insist(!validutf8("\xF4\x90\x80\x80"));
  insist(!validutf8("\xF4\x80\xFF\x80"));

  /* Assorted byte strings that must all be rejected: lone tail bytes,
   * truncated sequences, bad second/third/fourth bytes, and the lead bytes
   * F5 and F8. */
  insist(!validutf8("\x80"));
  insist(!validutf8("\xBF"));
  insist(!validutf8("\xC0"));
  insist(!validutf8("\xC0\x7F"));
  insist(!validutf8("\xC0\xC0"));
  insist(!validutf8("\xE0"));
  insist(!validutf8("\xE0\x7F"));
  insist(!validutf8("\xE0\xC0"));
  insist(!validutf8("\xE0\x80"));
  insist(!validutf8("\xE0\x80\x7f"));
  insist(!validutf8("\xE0\x80\xC0"));
  insist(!validutf8("\xF0"));
  insist(!validutf8("\xF0\x7F"));
  insist(!validutf8("\xF0\xC0"));
  insist(!validutf8("\xF0\x80"));
  insist(!validutf8("\xF0\x80\x7f"));
  insist(!validutf8("\xF0\x80\xC0"));
  insist(!validutf8("\xF0\x80\x80\x7f"));
  insist(!validutf8("\xF0\x80\x80\xC0"));
  insist(!validutf8("\xF5\x80\x80\x80"));
  insist(!validutf8("\xF8"));
}
| 133 | |
/* TEST is not defined in this file; presumably it comes from test.h and
 * wires up test_utf8 above -- confirm against that header. */
TEST(utf8);
| 135 | |
/*
Local Variables:
c-basic-offset:2
comment-column:40
fill-column:79
indent-tabs-mode:nil
End:
*/