/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/

/***
  This file is part of systemd.

  Copyright 2012 Lennart Poettering

  systemd is free software; you can redistribute it and/or modify it
  under the terms of the GNU Lesser General Public License as published by
  the Free Software Foundation; either version 2.1 of the License, or
  (at your option) any later version.

  systemd is distributed in the hope that it will be useful, but
  WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  Lesser General Public License for more details.

  You should have received a copy of the GNU Lesser General Public License
  along with systemd; If not, see <http://www.gnu.org/licenses/>.
***/
/* This file is based on the GLIB utf8 validation functions. The
 * original license text follows. */

/* gutf8.c - Operations on UTF-8 strings.
 *
 * Copyright (C) 1999 Tom Tromey
 * Copyright (C) 2000 Red Hat, Inc.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Library General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Library General Public License for more details.
 *
 * You should have received a copy of the GNU Library General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
/* Substitute character emitted by the filtering code in place of input
 * bytes it rejects. */
#define FILTER_CHAR '_'
/* Returns true if ch is a code point that may appear in interchanged
 * text: inside the Unicode range, not a UTF-16 surrogate and not one of
 * the permanently reserved noncharacters. */
static inline bool is_unicode_valid(uint32_t ch) {

        if (ch >= 0x110000) /* End of unicode space */
                return false;
        if ((ch & 0xFFFFF800) == 0xD800) /* Reserved area for UTF-16 (surrogates) */
                return false;
        if ((ch >= 0xFDD0) && (ch <= 0xFDEF)) /* Reserved (noncharacters) */
                return false;
        if ((ch & 0xFFFE) == 0xFFFE) /* U+xxFFFE and U+xxFFFF: noncharacters in every plane */
                return false;

        return true;
}
/* Returns true if ch is a UTF-8 continuation byte, i.e. has the bit
 * pattern 10xxxxxx. */
static inline bool is_continuation_char(uint8_t ch) {
        if ((ch & 0xc0) != 0x80) /* 10xxxxxx */
                return false;
        return true;
}
/* Appends the six payload bits of the continuation byte ch to the code
 * point being accumulated in *u_ch. The caller is expected to have
 * checked ch with is_continuation_char() first; the two marker bits are
 * masked off here. */
static inline void merge_continuation_char(uint32_t *u_ch, uint8_t ch) {
        *u_ch <<= 6;
        *u_ch |= ch & 0x3f;
}
/* Returns true if ch is a control character that should not be printed:
 * anything in the C0 range except tab and newline, DEL, or anything in
 * the C1 range. */
static bool is_unicode_control(uint32_t ch) {

        /*
          0 to ' '-1 is the C0 range.
          DEL=0x7F, and DEL+1 to 0x9F is C1 range.
          '\t' and '\n' are in C0 range, but more or less harmless and
          commonly used, so they are not treated as control characters.
        */

        return (ch < ' ' && ch != '\t' && ch != '\n') ||
                (0x7F <= ch && ch <= 0x9F);
}
/* Checks whether the first length bytes of str consist only of
 * well-formed, shortest-form UTF-8 sequences (one to four bytes) whose
 * code points are not control characters as defined by
 * is_unicode_control().
 *
 * Returns false on a malformed or overlong sequence, on a sequence that
 * is cut off by the end of the buffer, and on any control character
 * (which includes an embedded NUL byte). An empty buffer is printable.
 *
 * Note that the decoded value is not passed through is_unicode_valid(),
 * so surrogates and noncharacters are not rejected here. */
bool utf8_is_printable(const char* str, size_t length) {
        const uint8_t *p;

        assert(str);

        p = (const uint8_t*) str;
        while (length > 0) {
                uint32_t val, min;
                size_t n_cont, i;

                if (*p < 128) {
                        val = *p;
                        min = 0;
                        n_cont = 0;
                } else if ((*p & 0xe0) == 0xc0) { /* 110xxxxx two-char seq. */
                        /* All five payload bits are needed: masking with 0x1e
                         * would drop the lowest one and decode e.g. U+00C5 as
                         * the C1 control 0x85. */
                        val = (uint32_t) (*p & 0x1f);
                        min = 128;
                        n_cont = 1;
                } else if ((*p & 0xf0) == 0xe0) { /* 1110xxxx three-char seq.*/
                        val = (uint32_t) (*p & 0x0f);
                        min = (1 << 11);
                        n_cont = 2;
                } else if ((*p & 0xf8) == 0xf0) { /* 11110xxx four-char seq */
                        val = (uint32_t) (*p & 0x07);
                        min = (1 << 16);
                        n_cont = 3;
                } else
                        return false;

                /* The whole sequence must lie inside the buffer, otherwise
                 * we would read past its end. */
                if (length <= n_cont)
                        return false;

                for (i = 1; i <= n_cont; i++) {
                        if (!is_continuation_char(p[i]))
                                return false;
                        merge_continuation_char(&val, p[i]);
                }

                /* Reject overlong encodings */
                if (val < min)
                        return false;

                if (is_unicode_control(val))
                        return false;

                p += n_cont + 1;
                length -= n_cont + 1;
        }

        return true;
}
149 static char* utf8_validate(const char *str, char *output) {
152 const uint8_t *p, *last;
158 o = (uint8_t*) output;
159 for (p = (const uint8_t*) str; *p; p++) {
166 if ((*p & 0xe0) == 0xc0) { /* 110xxxxx two-char seq. */
169 val = (uint32_t) (*p & 0x1e);
171 } else if ((*p & 0xf0) == 0xe0) { /* 1110xxxx three-char seq.*/
174 val = (uint32_t) (*p & 0x0f);
176 } else if ((*p & 0xf8) == 0xf0) { /* 11110xxx four-char seq */
179 val = (uint32_t) (*p & 0x07);
184 if (!is_continuation_char(*p))
186 merge_continuation_char(&val, *p);
190 if (!is_continuation_char(*p))
192 merge_continuation_char(&val, *p);
196 if (!is_continuation_char(*p))
198 merge_continuation_char(&val, *p);
203 if (!is_unicode_valid(val))
207 memcpy(o, last, (size_t) size);
216 p = last; /* We retry at the next character */
/* Returns str itself if the NUL-terminated string is accepted by
 * utf8_validate(), NULL otherwise. Nothing is allocated or modified. */
char* utf8_is_valid (const char *str) {
        return utf8_validate(str, NULL);
}
/* Returns a newly allocated copy of str as produced by utf8_validate()
 * in its output mode, i.e. with input it rejects replaced rather than
 * reported. The copy is never longer than str. Returns NULL if the
 * allocation fails; the caller frees the result with free(). */
char* utf8_filter (const char *str) {
        char *new_str;

        assert(str);

        new_str = malloc(strlen(str) + 1);
        if (!new_str)
                return NULL;

        return utf8_validate(str, new_str);
}
/* Returns str itself if every byte of the NUL-terminated string is 7-bit
 * ASCII (< 128), NULL as soon as a byte with the high bit set is found. */
char *ascii_is_valid(const char *str) {
        const char *p;

        assert(str);

        for (p = str; *p; p++)
                if ((unsigned char) *p >= 128)
                        return NULL;

        return (char*) str;
}
/* Returns a newly allocated copy of str from which every byte with the
 * high bit set (>= 128) has been dropped. Returns NULL if the allocation
 * fails; the caller frees the result with free(). */
char *ascii_filter(const char *str) {
        const char *s;
        char *r, *d;
        size_t l;

        assert(str);

        /* The result can only be shorter than the input */
        l = strlen(str);
        r = malloc(l + 1);
        if (!r)
                return NULL;

        for (s = str, d = r; *s; s++)
                if ((unsigned char) *s < 128)
                        *(d++) = *s;

        *d = 0;

        return r;
}
/* Converts the little-endian UTF-16 data at s, length bytes long, into a
 * newly allocated NUL-terminated UTF-8 string.
 *
 * Conversion stops at the first 16-bit zero unit or at the end of the
 * buffer, whichever comes first. A trailing odd byte is ignored.
 * Surrogate pairs are combined into a single four-byte sequence;
 * unpaired surrogates are dropped, since they have no valid UTF-8 form.
 *
 * Returns NULL if the allocation fails; the caller frees the result with
 * free(). */
char *utf16_to_utf8(const void *s, size_t length) {
        const uint8_t *f, *end;
        uint8_t *t;
        char *r;

        /* Each 16-bit unit yields at most three bytes (a surrogate pair is
         * two units and yields four), plus the trailing NUL. */
        r = malloc((length / 2) * 3 + 1);
        if (!r)
                return NULL;

        t = (uint8_t*) r;
        f = s;

        /* Only look at complete 16-bit units */
        end = f + (length & ~(size_t) 1);

        while (f < end) {
                uint32_t c;

                c = ((uint32_t) f[1] << 8) | f[0];
                f += 2;

                if (c == 0)
                        break;

                if (c >= 0xd800 && c <= 0xdbff) {
                        uint32_t lo;

                        /* Leading surrogate, needs a trailing one */
                        if (f >= end)
                                break;

                        lo = ((uint32_t) f[1] << 8) | f[0];
                        if (lo < 0xdc00 || lo > 0xdfff)
                                continue; /* unpaired; re-examine the next unit on its own */

                        f += 2;
                        c = 0x10000 + ((c - 0xd800) << 10) + (lo - 0xdc00);
                } else if (c >= 0xdc00 && c <= 0xdfff)
                        continue; /* stray trailing surrogate */

                if (c < 0x80) {
                        *(t++) = (uint8_t) c;
                } else if (c < 0x800) {
                        *(t++) = (uint8_t) (0xc0 | (c >> 6));
                        *(t++) = (uint8_t) (0x80 | (c & 0x3f));
                } else if (c < 0x10000) {
                        *(t++) = (uint8_t) (0xe0 | (c >> 12));
                        *(t++) = (uint8_t) (0x80 | ((c >> 6) & 0x3f));
                        *(t++) = (uint8_t) (0x80 | (c & 0x3f));
                } else {
                        *(t++) = (uint8_t) (0xf0 | (c >> 18));
                        *(t++) = (uint8_t) (0x80 | ((c >> 12) & 0x3f));
                        *(t++) = (uint8_t) (0x80 | ((c >> 6) & 0x3f));
                        *(t++) = (uint8_t) (0x80 | (c & 0x3f));
                }
        }

        *t = 0;

        return r;
}
/* Number of bytes the UTF-8 sequence starting at str claims to occupy,
 * judged from its first byte alone: 1 for ASCII (including NUL), 2 to 6
 * for the respective lead-byte patterns, and 0 if the byte cannot start
 * a sequence (a continuation byte, 0xfe or 0xff). */
static int utf8_encoded_expected_len(const char *str) {
        unsigned char c = (unsigned char)str[0];

        if (c < 0x80)
                return 1;
        if ((c & 0xe0) == 0xc0)
                return 2;
        if ((c & 0xf0) == 0xe0)
                return 3;
        if ((c & 0xf8) == 0xf0)
                return 4;
        if ((c & 0xfc) == 0xf8)
                return 5;
        if ((c & 0xfe) == 0xfc)
                return 6;
        return 0;
}
/* Decodes the single UTF-8 sequence starting at str and returns its
 * value. Returns -1 if the first byte cannot start a sequence or if one
 * of the following bytes is not a continuation byte. Overlong encodings
 * and out-of-range values are not detected here. */
static int utf8_encoded_to_unichar(const char *str) {
        int unichar;
        int len;
        int i;

        len = utf8_encoded_expected_len(str);
        switch (len) {
        case 1:
                return (int)str[0];
        case 2:
                unichar = str[0] & 0x1f;
                break;
        case 3:
                unichar = (int)str[0] & 0x0f;
                break;
        case 4:
                unichar = (int)str[0] & 0x07;
                break;
        case 5:
                unichar = (int)str[0] & 0x03;
                break;
        case 6:
                unichar = (int)str[0] & 0x01;
                break;
        default:
                return -1;
        }

        /* A NUL byte fails the continuation test, so this never runs past
         * the end of a NUL-terminated string. */
        for (i = 1; i < len; i++) {
                if (((int)str[i] & 0xc0) != 0x80)
                        return -1;
                unichar <<= 6;
                unichar |= (int)str[i] & 0x3f;
        }

        return unichar;
}
/* Number of bytes (1 to 6) the shortest UTF-8 encoding of unichar
 * occupies. Negative values fall into the first bucket and yield 1. */
static int utf8_unichar_to_encoded_len(int unichar) {
        if (unichar < 0x80)
                return 1;
        if (unichar < 0x800)
                return 2;
        if (unichar < 0x10000)
                return 3;
        if (unichar < 0x200000)
                return 4;
        if (unichar < 0x4000000)
                return 5;
        return 6;
}
/* Validates the single UTF-8 sequence starting at str, which must be
 * NUL-terminated. Returns the number of bytes the sequence occupies
 * (1 for ASCII, including the NUL byte itself), or -1 if the sequence is
 * truncated, malformed, overlong, or decodes to a value rejected by
 * is_unicode_valid(). */
int utf8_encoded_valid_unichar(const char *str) {
        int len;
        int unichar;
        int i;

        len = utf8_encoded_expected_len(str);
        if (len == 0)
                return -1;

        /* ascii is valid */
        if (len == 1)
                return 1;

        /* check if expected encoded chars are available; every byte of a
         * multi-byte sequence has the high bit set, so a NUL terminator
         * inside the expected span is caught here */
        for (i = 0; i < len; i++)
                if ((str[i] & 0x80) != 0x80)
                        return -1;

        unichar = utf8_encoded_to_unichar(str);
        if (unichar < 0)
                return -1;

        /* check if encoded length matches encoded value */
        if (utf8_unichar_to_encoded_len(unichar) != len)
                return -1;

        /* check if value has valid range */
        if (!is_unicode_valid(unichar))
                return -1;

        return len;
}
/* Returns 1 if c may be passed through unescaped: an ASCII letter or
 * digit, one of "#+-.:=@_", or any character listed in the optional
 * NUL-terminated string white (which may be NULL). Returns 0 otherwise,
 * including for the NUL character. */
int is_utf8_encoding_whitelisted(char c, const char *white) {
        /* strchr() treats the terminating NUL as part of the string, so
         * without this check '\0' would match both lists. */
        if (c == '\0')
                return 0;

        if ((c >= '0' && c <= '9') ||
            (c >= 'A' && c <= 'Z') ||
            (c >= 'a' && c <= 'z') ||
            strchr("#+-.:=@_", c) != NULL ||
            (white != NULL && strchr(white, c) != NULL))
                return 1;
        return 0;
}
436 int udev_encode_string(const char *str, char *str_enc, size_t len) {
439 if (str == NULL || str_enc == NULL)
442 for (i = 0, j = 0; str[i] != '\0'; i++) {
445 seqlen = utf8_encoded_valid_unichar(&str[i]);
447 if (len-j < (size_t)seqlen)
449 memcpy(&str_enc[j], &str[i], seqlen);
452 } else if (str[i] == '\\' || !is_utf8_encoding_whitelisted(str[i], NULL)) {
455 sprintf(&str_enc[j], "\\x%02x", (unsigned char) str[i]);