src/shared/utf8.c

   1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
   2
   3 /***
   4   This file is part of systemd.
   5
   6   Copyright 2012 Lennart Poettering
   7
   8   systemd is free software; you can redistribute it and/or modify it
   9   under the terms of the GNU Lesser General Public License as published by
  10   the Free Software Foundation; either version 2.1 of the License, or
  11   (at your option) any later version.
  12
  13   systemd is distributed in the hope that it will be useful, but
  14   WITHOUT ANY WARRANTY; without even the implied warranty of
  15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  16   Lesser General Public License for more details.
  17
  18   You should have received a copy of the GNU Lesser General Public License
  19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
  20 ***/
  21
  22 /* This file is based on the GLIB utf8 validation functions. The
  23  * original license text follows. */
  24
  25 /* gutf8.c - Operations on UTF-8 strings.
  26  *
  27  * Copyright (C) 1999 Tom Tromey
  28  * Copyright (C) 2000 Red Hat, Inc.
  29  *
  30  * This library is free software; you can redistribute it and/or
  31  * modify it under the terms of the GNU Library General Public
  32  * License as published by the Free Software Foundation; either
  33  * version 2 of the License, or (at your option) any later version.
  34  *
  35  * This library is distributed in the hope that it will be useful,
  36  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  37  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  38  * Library General Public License for more details.
  39  *
  40  * You should have received a copy of the GNU Library General Public
  41  * License along with this library; if not, write to the Free Software
  42  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
  43  */
  44
  45 #include <errno.h>
  46 #include <stdlib.h>
  47 #include <inttypes.h>
  48 #include <string.h>
  49 #include <stdbool.h>
  50
  51 #include "utf8.h"
  52 #include "util.h"
  53
/* Byte written to the output buffer by utf8_validate() in place of a byte
 * that does not start a valid UTF-8 sequence. */
#define FILTER_CHAR '_'
  55
  56 static inline bool is_unicode_valid(uint32_t ch) {
  57
  58         if (ch >= 0x110000) /* End of unicode space */
  59                 return false;
  60         if ((ch & 0xFFFFF800) == 0xD800) /* Reserved area for UTF-16 */
  61                 return false;
  62         if ((ch >= 0xFDD0) && (ch <= 0xFDEF)) /* Reserved */
  63                 return false;
  64         if ((ch & 0xFFFE) == 0xFFFE) /* BOM (Byte Order Mark) */
  65                 return false;
  66
  67         return true;
  68 }
  69
  70 static inline bool is_continuation_char(uint8_t ch) {
  71         if ((ch & 0xc0) != 0x80) /* 10xxxxxx */
  72                 return false;
  73         return true;
  74 }
  75
  76 static inline void merge_continuation_char(uint32_t *u_ch, uint8_t ch) {
  77         *u_ch <<= 6;
  78         *u_ch |= ch & 0x3f;
  79 }
  80
  81 static bool is_unicode_control(uint32_t ch) {
  82
  83         /*
  84           0 to ' '-1 is the C0 range.
  85           DEL=0x7F, and DEL+1 to 0x9F is C1 range.
  86           '\t' is in C0 range, but more or less harmless and commonly used.
  87         */
  88
  89         return (ch < ' ' && ch != '\t' && ch != '\n') ||
  90                 (0x7F <= ch && ch <= 0x9F);
  91 }
  92
  93 bool utf8_is_printable(const char* str, size_t length) {
  94         uint32_t val = 0;
  95         uint32_t min = 0;
  96         const uint8_t *p;
  97
  98         assert(str);
  99
 100         for (p = (const uint8_t*) str; length; p++, length--) {
 101                 if (*p < 128) {
 102                         val = *p;
 103                 } else {
 104                         if ((*p & 0xe0) == 0xc0) { /* 110xxxxx two-char seq. */
 105                                 min = 128;
 106                                 val = (uint32_t) (*p & 0x1e);
 107                                 goto ONE_REMAINING;
 108                         } else if ((*p & 0xf0) == 0xe0) { /* 1110xxxx three-char seq.*/
 109                                 min = (1 << 11);
 110                                 val = (uint32_t) (*p & 0x0f);
 111                                 goto TWO_REMAINING;
 112                         } else if ((*p & 0xf8) == 0xf0) { /* 11110xxx four-char seq */
 113                                 min = (1 << 16);
 114                                 val = (uint32_t) (*p & 0x07);
 115                         } else
 116                                 return false;
 117
 118                         p++;
 119                         length--;
 120                         if (!length || !is_continuation_char(*p))
 121                                 return false;
 122                         merge_continuation_char(&val, *p);
 123
 124                 TWO_REMAINING:
 125                         p++;
 126                         length--;
 127                         if (!is_continuation_char(*p))
 128                                 return false;
 129                         merge_continuation_char(&val, *p);
 130
 131                 ONE_REMAINING:
 132                         p++;
 133                         length--;
 134                         if (!is_continuation_char(*p))
 135                                 return false;
 136                         merge_continuation_char(&val, *p);
 137
 138                         if (val < min)
 139                                 return false;
 140                 }
 141
 142                 if (is_unicode_control(val))
 143                         return false;
 144         }
 145
 146         return true;
 147 }
 148
 149 static char* utf8_validate(const char *str, char *output) {
 150         uint32_t val = 0;
 151         uint32_t min = 0;
 152         const uint8_t *p, *last;
 153         int size;
 154         uint8_t *o;
 155
 156         assert(str);
 157
 158         o = (uint8_t*) output;
 159         for (p = (const uint8_t*) str; *p; p++) {
 160                 if (*p < 128) {
 161                         if (o)
 162                                 *o = *p;
 163                 } else {
 164                         last = p;
 165
 166                         if ((*p & 0xe0) == 0xc0) { /* 110xxxxx two-char seq. */
 167                                 size = 2;
 168                                 min = 128;
 169                                 val = (uint32_t) (*p & 0x1e);
 170                                 goto ONE_REMAINING;
 171                         } else if ((*p & 0xf0) == 0xe0) { /* 1110xxxx three-char seq.*/
 172                                 size = 3;
 173                                 min = (1 << 11);
 174                                 val = (uint32_t) (*p & 0x0f);
 175                                 goto TWO_REMAINING;
 176                         } else if ((*p & 0xf8) == 0xf0) { /* 11110xxx four-char seq */
 177                                 size = 4;
 178                                 min = (1 << 16);
 179                                 val = (uint32_t) (*p & 0x07);
 180                         } else
 181                                 goto error;
 182
 183                         p++;
 184                         if (!is_continuation_char(*p))
 185                                 goto error;
 186                         merge_continuation_char(&val, *p);
 187
 188                 TWO_REMAINING:
 189                         p++;
 190                         if (!is_continuation_char(*p))
 191                                 goto error;
 192                         merge_continuation_char(&val, *p);
 193
 194                 ONE_REMAINING:
 195                         p++;
 196                         if (!is_continuation_char(*p))
 197                                 goto error;
 198                         merge_continuation_char(&val, *p);
 199
 200                         if (val < min)
 201                                 goto error;
 202
 203                         if (!is_unicode_valid(val))
 204                                 goto error;
 205
 206                         if (o) {
 207                                 memcpy(o, last, (size_t) size);
 208                                 o += size;
 209                         }
 210
 211                         continue;
 212
 213                 error:
 214                         if (o) {
 215                                 *o = FILTER_CHAR;
 216                                 p = last; /* We retry at the next character */
 217                         } else
 218                                 goto failure;
 219                 }
 220
 221                 if (o)
 222                         o++;
 223         }
 224
 225         if (o) {
 226                 *o = '\0';
 227                 return output;
 228         }
 229
 230         return (char*) str;
 231
 232 failure:
 233         return NULL;
 234 }
 235
 236 char* utf8_is_valid (const char *str) {
 237         return utf8_validate(str, NULL);
 238 }
 239
 240 char* utf8_filter (const char *str) {
 241         char *new_str;
 242
 243         assert(str);
 244
 245         new_str = malloc(strlen(str) + 1);
 246         if (!new_str)
 247                 return NULL;
 248
 249         return utf8_validate(str, new_str);
 250 }
 251
 252 char *ascii_is_valid(const char *str) {
 253         const char *p;
 254
 255         assert(str);
 256
 257         for (p = str; *p; p++)
 258                 if ((unsigned char) *p >= 128)
 259                         return NULL;
 260
 261         return (char*) str;
 262 }
 263
 264 char *ascii_filter(const char *str) {
 265         const char *s;
 266         char *r, *d;
 267         size_t l;
 268
 269         assert(str);
 270
 271         l = strlen(str);
 272         r = malloc(l + 1);
 273         if (!r)
 274                 return NULL;
 275
 276         for (s = str, d = r; *s; s++)
 277                 if ((unsigned char) *s < 128)
 278                         *(d++) = *s;
 279
 280         *d = 0;
 281
 282         return r;
 283 }
 284
 285 char *utf16_to_utf8(const void *s, size_t length) {
 286         char *r;
 287         const uint8_t *f;
 288         uint8_t *t;
 289
 290         r = new(char, (length*3+1)/2 + 1);
 291         if (!r)
 292                 return NULL;
 293
 294         t = (uint8_t*) r;
 295
 296         for (f = s; f < (const uint8_t*) s + length; f += 2) {
 297                 uint16_t c;
 298
 299                 c = (f[1] << 8) | f[0];
 300
 301                 if (c == 0) {
 302                         *t = 0;
 303                         return r;
 304                 } else if (c < 0x80) {
 305                         *(t++) = (uint8_t) c;
 306                 } else if (c < 0x800) {
 307                         *(t++) = (uint8_t) (0xc0 | (c >> 6));
 308                         *(t++) = (uint8_t) (0x80 | (c & 0x3f));
 309                 } else {
 310                         *(t++) = (uint8_t) (0xe0 | (c >> 12));
 311                         *(t++) = (uint8_t) (0x80 | ((c >> 6) & 0x3f));
 312                         *(t++) = (uint8_t) (0x80 | (c & 0x3f));
 313                 }
 314         }
 315
 316         *t = 0;
 317
 318         return r;
 319 }
 320
 321 /* count of characters used to encode one unicode char */
 322 static int utf8_encoded_expected_len(const char *str) {
 323         unsigned char c = (unsigned char)str[0];
 324
 325         if (c < 0x80)
 326                 return 1;
 327         if ((c & 0xe0) == 0xc0)
 328                 return 2;
 329         if ((c & 0xf0) == 0xe0)
 330                 return 3;
 331         if ((c & 0xf8) == 0xf0)
 332                 return 4;
 333         if ((c & 0xfc) == 0xf8)
 334                 return 5;
 335         if ((c & 0xfe) == 0xfc)
 336                 return 6;
 337         return 0;
 338 }
 339
 340 /* decode one unicode char */
 341 static int utf8_encoded_to_unichar(const char *str) {
 342         int unichar;
 343         int len;
 344         int i;
 345
 346         len = utf8_encoded_expected_len(str);
 347         switch (len) {
 348         case 1:
 349                 return (int)str[0];
 350         case 2:
 351                 unichar = str[0] & 0x1f;
 352                 break;
 353         case 3:
 354                 unichar = (int)str[0] & 0x0f;
 355                 break;
 356         case 4:
 357                 unichar = (int)str[0] & 0x07;
 358                 break;
 359         case 5:
 360                 unichar = (int)str[0] & 0x03;
 361                 break;
 362         case 6:
 363                 unichar = (int)str[0] & 0x01;
 364                 break;
 365         default:
 366                 return -1;
 367         }
 368
 369         for (i = 1; i < len; i++) {
 370                 if (((int)str[i] & 0xc0) != 0x80)
 371                         return -1;
 372                 unichar <<= 6;
 373                 unichar |= (int)str[i] & 0x3f;
 374         }
 375
 376         return unichar;
 377 }
 378
 379 /* expected size used to encode one unicode char */
 380 static int utf8_unichar_to_encoded_len(int unichar) {
 381         if (unichar < 0x80)
 382                 return 1;
 383         if (unichar < 0x800)
 384                 return 2;
 385         if (unichar < 0x10000)
 386                 return 3;
 387         if (unichar < 0x200000)
 388                 return 4;
 389         if (unichar < 0x4000000)
 390                 return 5;
 391         return 6;
 392 }
 393
 394 /* validate one encoded unicode char and return its length */
 395 int utf8_encoded_valid_unichar(const char *str) {
 396         int len;
 397         int unichar;
 398         int i;
 399
 400         len = utf8_encoded_expected_len(str);
 401         if (len == 0)
 402                 return -1;
 403
 404         /* ascii is valid */
 405         if (len == 1)
 406                 return 1;
 407
 408         /* check if expected encoded chars are available */
 409         for (i = 0; i < len; i++)
 410                 if ((str[i] & 0x80) != 0x80)
 411                         return -1;
 412
 413         unichar = utf8_encoded_to_unichar(str);
 414
 415         /* check if encoded length matches encoded value */
 416         if (utf8_unichar_to_encoded_len(unichar) != len)
 417                 return -1;
 418
 419         /* check if value has valid range */
 420         if (!is_unicode_valid(unichar))
 421                 return -1;
 422
 423         return len;
 424 }
 425
 426 int is_utf8_encoding_whitelisted(char c, const char *white) {
 427         if ((c >= '0' && c <= '9') ||
 428             (c >= 'A' && c <= 'Z') ||
 429             (c >= 'a' && c <= 'z') ||
 430             strchr("#+-.:=@_", c) != NULL ||
 431             (white != NULL && strchr(white, c) != NULL))
 432                 return 1;
 433         return 0;
 434 }
 435
 436 int udev_encode_string(const char *str, char *str_enc, size_t len) {
 437         size_t i, j;
 438
 439         if (str == NULL || str_enc == NULL)
 440                 return -1;
 441
 442         for (i = 0, j = 0; str[i] != '\0'; i++) {
 443                 int seqlen;
 444
 445                 seqlen = utf8_encoded_valid_unichar(&str[i]);
 446                 if (seqlen > 1) {
 447                         if (len-j < (size_t)seqlen)
 448                                 goto err;
 449                         memcpy(&str_enc[j], &str[i], seqlen);
 450                         j += seqlen;
 451                         i += (seqlen-1);
 452                 } else if (str[i] == '\\' || !is_utf8_encoding_whitelisted(str[i], NULL)) {
 453                         if (len-j < 4)
 454                                 goto err;
 455                         sprintf(&str_enc[j], "\\x%02x", (unsigned char) str[i]);
 456                         j += 4;
 457                 } else {
 458                         if (len-j < 1)
 459                                 goto err;
 460                         str_enc[j] = str[i];
 461                         j++;
 462                 }
 463         }
 464         if (len-j < 1)
 465                 goto err;
 466         str_enc[j] = '\0';
 467         return 0;
 468 err:
 469         return -1;
 470 }