src/shared/utf8.c

   1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
   2
   3 /***
   4   This file is part of systemd.
   5
   6   Copyright 2012 Lennart Poettering
   7
   8   systemd is free software; you can redistribute it and/or modify it
   9   under the terms of the GNU Lesser General Public License as published by
  10   the Free Software Foundation; either version 2.1 of the License, or
  11   (at your option) any later version.
  12
  13   systemd is distributed in the hope that it will be useful, but
  14   WITHOUT ANY WARRANTY; without even the implied warranty of
  15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  16   Lesser General Public License for more details.
  17
  18   You should have received a copy of the GNU Lesser General Public License
  19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
  20 ***/
  21
  22 /* This file is based on the GLIB utf8 validation functions. The
  23  * original license text follows. */
  24
  25 /* gutf8.c - Operations on UTF-8 strings.
  26  *
  27  * Copyright (C) 1999 Tom Tromey
  28  * Copyright (C) 2000 Red Hat, Inc.
  29  *
  30  * This library is free software; you can redistribute it and/or
  31  * modify it under the terms of the GNU Library General Public
  32  * License as published by the Free Software Foundation; either
  33  * version 2 of the License, or (at your option) any later version.
  34  *
  35  * This library is distributed in the hope that it will be useful,
  36  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  37  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  38  * Library General Public License for more details.
  39  *
  40  * You should have received a copy of the GNU Library General Public
  41  * License along with this library; if not, write to the Free Software
  42  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
  43  */
  44
  45 #include <errno.h>
  46 #include <stdlib.h>
  47 #include <inttypes.h>
  48 #include <string.h>
  49 #include <stdbool.h>
  50
  51 #include "utf8.h"
  52 #include "util.h"
  53
  54 static inline bool is_unicode_valid(uint32_t ch) {
  55
  56         if (ch >= 0x110000) /* End of unicode space */
  57                 return false;
  58         if ((ch & 0xFFFFF800) == 0xD800) /* Reserved area for UTF-16 */
  59                 return false;
  60         if ((ch >= 0xFDD0) && (ch <= 0xFDEF)) /* Reserved */
  61                 return false;
  62         if ((ch & 0xFFFE) == 0xFFFE) /* BOM (Byte Order Mark) */
  63                 return false;
  64
  65         return true;
  66 }
  67
  68 static bool is_unicode_control(uint32_t ch) {
  69
  70         /*
  71           0 to ' '-1 is the C0 range.
  72           DEL=0x7F, and DEL+1 to 0x9F is C1 range.
  73           '\t' is in C0 range, but more or less harmless and commonly used.
  74         */
  75
  76         return (ch < ' ' && ch != '\t' && ch != '\n') ||
  77                 (0x7F <= ch && ch <= 0x9F);
  78 }
  79
  80 /* count of characters used to encode one unicode char */
  81 static int utf8_encoded_expected_len(const char *str) {
  82         unsigned char c = (unsigned char)str[0];
  83
  84         if (c < 0x80)
  85                 return 1;
  86         if ((c & 0xe0) == 0xc0)
  87                 return 2;
  88         if ((c & 0xf0) == 0xe0)
  89                 return 3;
  90         if ((c & 0xf8) == 0xf0)
  91                 return 4;
  92         if ((c & 0xfc) == 0xf8)
  93                 return 5;
  94         if ((c & 0xfe) == 0xfc)
  95                 return 6;
  96         return 0;
  97 }
  98
  99 /* decode one unicode char */
 100 static int utf8_encoded_to_unichar(const char *str) {
 101         int unichar;
 102         int len;
 103         int i;
 104
 105         len = utf8_encoded_expected_len(str);
 106         switch (len) {
 107         case 1:
 108                 return (int)str[0];
 109         case 2:
 110                 unichar = str[0] & 0x1f;
 111                 break;
 112         case 3:
 113                 unichar = (int)str[0] & 0x0f;
 114                 break;
 115         case 4:
 116                 unichar = (int)str[0] & 0x07;
 117                 break;
 118         case 5:
 119                 unichar = (int)str[0] & 0x03;
 120                 break;
 121         case 6:
 122                 unichar = (int)str[0] & 0x01;
 123                 break;
 124         default:
 125                 return -1;
 126         }
 127
 128         for (i = 1; i < len; i++) {
 129                 if (((int)str[i] & 0xc0) != 0x80)
 130                         return -1;
 131                 unichar <<= 6;
 132                 unichar |= (int)str[i] & 0x3f;
 133         }
 134
 135         return unichar;
 136 }
 137
 138 bool utf8_is_printable(const char* str, size_t length) {
 139         const uint8_t *p;
 140
 141         assert(str);
 142
 143         for (p = (const uint8_t*) str; length; p++) {
 144                 int encoded_len = utf8_encoded_valid_unichar((const char *)p);
 145                 int32_t val = utf8_encoded_to_unichar((const char*)p);
 146
 147                 if (encoded_len < 0 || val < 0 || is_unicode_control(val))
 148                         return false;
 149
 150                 length -= encoded_len;
 151         }
 152
 153         return true;
 154 }
 155
 156 const char *utf8_is_valid(const char *str) {
 157         const uint8_t *p;
 158
 159         assert(str);
 160
 161         for (p = (const uint8_t*) str; *p; ) {
 162                 int len = utf8_encoded_valid_unichar((const char *)p);
 163
 164                 if (len < 0)
 165                         return NULL;
 166
 167                 p += len;
 168         }
 169
 170         return str;
 171 }
 172
 173 char *ascii_is_valid(const char *str) {
 174         const char *p;
 175
 176         assert(str);
 177
 178         for (p = str; *p; p++)
 179                 if ((unsigned char) *p >= 128)
 180                         return NULL;
 181
 182         return (char*) str;
 183 }
 184
 185 char *ascii_filter(const char *str) {
 186         const char *s;
 187         char *r, *d;
 188         size_t l;
 189
 190         assert(str);
 191
 192         l = strlen(str);
 193         r = malloc(l + 1);
 194         if (!r)
 195                 return NULL;
 196
 197         for (s = str, d = r; *s; s++)
 198                 if ((unsigned char) *s < 128)
 199                         *(d++) = *s;
 200
 201         *d = 0;
 202
 203         return r;
 204 }
 205
 206 char *utf16_to_utf8(const void *s, size_t length) {
 207         char *r;
 208         const uint8_t *f;
 209         uint8_t *t;
 210
 211         r = new(char, (length*3+1)/2 + 1);
 212         if (!r)
 213                 return NULL;
 214
 215         t = (uint8_t*) r;
 216
 217         for (f = s; f < (const uint8_t*) s + length; f += 2) {
 218                 uint16_t c;
 219
 220                 c = (f[1] << 8) | f[0];
 221
 222                 if (c == 0) {
 223                         *t = 0;
 224                         return r;
 225                 } else if (c < 0x80) {
 226                         *(t++) = (uint8_t) c;
 227                 } else if (c < 0x800) {
 228                         *(t++) = (uint8_t) (0xc0 | (c >> 6));
 229                         *(t++) = (uint8_t) (0x80 | (c & 0x3f));
 230                 } else {
 231                         *(t++) = (uint8_t) (0xe0 | (c >> 12));
 232                         *(t++) = (uint8_t) (0x80 | ((c >> 6) & 0x3f));
 233                         *(t++) = (uint8_t) (0x80 | (c & 0x3f));
 234                 }
 235         }
 236
 237         *t = 0;
 238
 239         return r;
 240 }
 241
 242 /* expected size used to encode one unicode char */
 243 static int utf8_unichar_to_encoded_len(int unichar) {
 244         if (unichar < 0x80)
 245                 return 1;
 246         if (unichar < 0x800)
 247                 return 2;
 248         if (unichar < 0x10000)
 249                 return 3;
 250         if (unichar < 0x200000)
 251                 return 4;
 252         if (unichar < 0x4000000)
 253                 return 5;
 254         return 6;
 255 }
 256
 257 /* validate one encoded unicode char and return its length */
 258 int utf8_encoded_valid_unichar(const char *str) {
 259         int len;
 260         int unichar;
 261         int i;
 262
 263         len = utf8_encoded_expected_len(str);
 264         if (len == 0)
 265                 return -1;
 266
 267         /* ascii is valid */
 268         if (len == 1)
 269                 return 1;
 270
 271         /* check if expected encoded chars are available */
 272         for (i = 0; i < len; i++)
 273                 if ((str[i] & 0x80) != 0x80)
 274                         return -1;
 275
 276         unichar = utf8_encoded_to_unichar(str);
 277
 278         /* check if encoded length matches encoded value */
 279         if (utf8_unichar_to_encoded_len(unichar) != len)
 280                 return -1;
 281
 282         /* check if value has valid range */
 283         if (!is_unicode_valid(unichar))
 284                 return -1;
 285
 286         return len;
 287 }
 288
 289 int is_utf8_encoding_whitelisted(char c, const char *white) {
 290         if ((c >= '0' && c <= '9') ||
 291             (c >= 'A' && c <= 'Z') ||
 292             (c >= 'a' && c <= 'z') ||
 293             strchr("#+-.:=@_", c) != NULL ||
 294             (white != NULL && strchr(white, c) != NULL))
 295                 return 1;
 296         return 0;
 297 }
 298
 299 int udev_encode_string(const char *str, char *str_enc, size_t len) {
 300         size_t i, j;
 301
 302         if (str == NULL || str_enc == NULL)
 303                 return -1;
 304
 305         for (i = 0, j = 0; str[i] != '\0'; i++) {
 306                 int seqlen;
 307
 308                 seqlen = utf8_encoded_valid_unichar(&str[i]);
 309                 if (seqlen > 1) {
 310                         if (len-j < (size_t)seqlen)
 311                                 goto err;
 312                         memcpy(&str_enc[j], &str[i], seqlen);
 313                         j += seqlen;
 314                         i += (seqlen-1);
 315                 } else if (str[i] == '\\' || !is_utf8_encoding_whitelisted(str[i], NULL)) {
 316                         if (len-j < 4)
 317                                 goto err;
 318                         sprintf(&str_enc[j], "\\x%02x", (unsigned char) str[i]);
 319                         j += 4;
 320                 } else {
 321                         if (len-j < 1)
 322                                 goto err;
 323                         str_enc[j] = str[i];
 324                         j++;
 325                 }
 326         }
 327         if (len-j < 1)
 328                 goto err;
 329         str_enc[j] = '\0';
 330         return 0;
 331 err:
 332         return -1;
 333 }