src/shared/utf8.c

   1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
   2
   3 /***
   4   This file is part of systemd.
   5
   6   Copyright 2012 Lennart Poettering
   7
   8   systemd is free software; you can redistribute it and/or modify it
   9   under the terms of the GNU Lesser General Public License as published by
  10   the Free Software Foundation; either version 2.1 of the License, or
  11   (at your option) any later version.
  12
  13   systemd is distributed in the hope that it will be useful, but
  14   WITHOUT ANY WARRANTY; without even the implied warranty of
  15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  16   Lesser General Public License for more details.
  17
  18   You should have received a copy of the GNU Lesser General Public License
  19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
  20 ***/
  21
  22 /* This file is based on the GLIB utf8 validation functions. The
  23  * original license text follows. */
  24
  25 /* gutf8.c - Operations on UTF-8 strings.
  26  *
  27  * Copyright (C) 1999 Tom Tromey
  28  * Copyright (C) 2000 Red Hat, Inc.
  29  *
  30  * This library is free software; you can redistribute it and/or
  31  * modify it under the terms of the GNU Library General Public
  32  * License as published by the Free Software Foundation; either
  33  * version 2 of the License, or (at your option) any later version.
  34  *
  35  * This library is distributed in the hope that it will be useful,
  36  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  37  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  38  * Library General Public License for more details.
  39  *
  40  * You should have received a copy of the GNU Library General Public
  41  * License along with this library; if not, write to the Free Software
  42  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
  43  */
  44
  45 #include <errno.h>
  46 #include <stdlib.h>
  47 #include <inttypes.h>
  48 #include <string.h>
  49 #include <stdbool.h>
  50
  51 #include "utf8.h"
  52 #include "util.h"
  53
  54 #define FILTER_CHAR '_'
  55
  56 static inline bool is_unicode_valid(uint32_t ch) {
  57
  58         if (ch >= 0x110000) /* End of unicode space */
  59                 return false;
  60         if ((ch & 0xFFFFF800) == 0xD800) /* Reserved area for UTF-16 */
  61                 return false;
  62         if ((ch >= 0xFDD0) && (ch <= 0xFDEF)) /* Reserved */
  63                 return false;
  64         if ((ch & 0xFFFE) == 0xFFFE) /* BOM (Byte Order Mark) */
  65                 return false;
  66
  67         return true;
  68 }
  69
  70 static inline bool is_continuation_char(uint8_t ch) {
  71         if ((ch & 0xc0) != 0x80) /* 10xxxxxx */
  72                 return false;
  73         return true;
  74 }
  75
  76 static inline void merge_continuation_char(uint32_t *u_ch, uint8_t ch) {
  77         *u_ch <<= 6;
  78         *u_ch |= ch & 0x3f;
  79 }
  80
  81 static bool is_unicode_control(uint32_t ch) {
  82
  83         /*
  84           0 to ' '-1 is the C0 range.
  85           DEL=0x7F, and DEL+1 to 0x9F is C1 range.
  86           '\t' is in C0 range, but more or less harmless and commonly used.
  87         */
  88
  89         return (ch < ' ' && ch != '\t' && ch != '\n') ||
  90                 (0x7F <= ch && ch <= 0x9F);
  91 }
  92
  93 bool utf8_is_printable(const char* str, size_t length) {
  94         uint32_t val = 0;
  95         uint32_t min = 0;
  96         const uint8_t *p;
  97
  98         assert(str);
  99
 100         for (p = (const uint8_t*) str; length; p++, length--) {
 101                 if (*p < 128) {
 102                         val = *p;
 103                 } else {
 104                         if ((*p & 0xe0) == 0xc0) { /* 110xxxxx two-char seq. */
 105                                 min = 128;
 106                                 val = (uint32_t) (*p & 0x1e);
 107                                 goto ONE_REMAINING;
 108                         } else if ((*p & 0xf0) == 0xe0) { /* 1110xxxx three-char seq.*/
 109                                 min = (1 << 11);
 110                                 val = (uint32_t) (*p & 0x0f);
 111                                 goto TWO_REMAINING;
 112                         } else if ((*p & 0xf8) == 0xf0) { /* 11110xxx four-char seq */
 113                                 min = (1 << 16);
 114                                 val = (uint32_t) (*p & 0x07);
 115                         } else
 116                                 return false;
 117
 118                         p++;
 119                         length--;
 120                         if (!length || !is_continuation_char(*p))
 121                                 return false;
 122                         merge_continuation_char(&val, *p);
 123
 124                 TWO_REMAINING:
 125                         p++;
 126                         length--;
 127                         if (!is_continuation_char(*p))
 128                                 return false;
 129                         merge_continuation_char(&val, *p);
 130
 131                 ONE_REMAINING:
 132                         p++;
 133                         length--;
 134                         if (!is_continuation_char(*p))
 135                                 return false;
 136                         merge_continuation_char(&val, *p);
 137
 138                         if (val < min)
 139                                 return false;
 140                 }
 141
 142                 if (is_unicode_control(val))
 143                         return false;
 144         }
 145
 146         return true;
 147 }
 148
 149 static char* utf8_validate(const char *str, char *output) {
 150         uint32_t val = 0;
 151         uint32_t min = 0;
 152         const uint8_t *p, *last;
 153         int size;
 154         uint8_t *o;
 155
 156         assert(str);
 157
 158         o = (uint8_t*) output;
 159         for (p = (const uint8_t*) str; *p; p++) {
 160                 if (*p < 128) {
 161                         if (o)
 162                                 *o = *p;
 163                 } else {
 164                         last = p;
 165
 166                         if ((*p & 0xe0) == 0xc0) { /* 110xxxxx two-char seq. */
 167                                 size = 2;
 168                                 min = 128;
 169                                 val = (uint32_t) (*p & 0x1e);
 170                                 goto ONE_REMAINING;
 171                         } else if ((*p & 0xf0) == 0xe0) { /* 1110xxxx three-char seq.*/
 172                                 size = 3;
 173                                 min = (1 << 11);
 174                                 val = (uint32_t) (*p & 0x0f);
 175                                 goto TWO_REMAINING;
 176                         } else if ((*p & 0xf8) == 0xf0) { /* 11110xxx four-char seq */
 177                                 size = 4;
 178                                 min = (1 << 16);
 179                                 val = (uint32_t) (*p & 0x07);
 180                         } else
 181                                 goto error;
 182
 183                         p++;
 184                         if (!is_continuation_char(*p))
 185                                 goto error;
 186                         merge_continuation_char(&val, *p);
 187
 188                 TWO_REMAINING:
 189                         p++;
 190                         if (!is_continuation_char(*p))
 191                                 goto error;
 192                         merge_continuation_char(&val, *p);
 193
 194                 ONE_REMAINING:
 195                         p++;
 196                         if (!is_continuation_char(*p))
 197                                 goto error;
 198                         merge_continuation_char(&val, *p);
 199
 200                         if (val < min)
 201                                 goto error;
 202
 203                         if (!is_unicode_valid(val))
 204                                 goto error;
 205
 206                         if (o) {
 207                                 memcpy(o, last, (size_t) size);
 208                                 o += size;
 209                         }
 210
 211                         continue;
 212
 213                 error:
 214                         if (o) {
 215                                 *o = FILTER_CHAR;
 216                                 p = last; /* We retry at the next character */
 217                         } else
 218                                 goto failure;
 219                 }
 220
 221                 if (o)
 222                         o++;
 223         }
 224
 225         if (o) {
 226                 *o = '\0';
 227                 return output;
 228         }
 229
 230         return (char*) str;
 231
 232 failure:
 233         return NULL;
 234 }
 235
 236 char* utf8_is_valid (const char *str) {
 237         return utf8_validate(str, NULL);
 238 }
 239
 240 char* utf8_filter (const char *str) {
 241         char *new_str;
 242
 243         assert(str);
 244
 245         new_str = malloc(strlen(str) + 1);
 246         if (!new_str)
 247                 return NULL;
 248
 249         return utf8_validate(str, new_str);
 250 }
 251
 252 char *ascii_is_valid(const char *str) {
 253         const char *p;
 254
 255         assert(str);
 256
 257         for (p = str; *p; p++)
 258                 if ((unsigned char) *p >= 128)
 259                         return NULL;
 260
 261         return (char*) str;
 262 }
 263
 264 char *ascii_filter(const char *str) {
 265         const char *s;
 266         char *r, *d;
 267         size_t l;
 268
 269         assert(str);
 270
 271         l = strlen(str);
 272         r = malloc(l + 1);
 273         if (!r)
 274                 return NULL;
 275
 276         for (s = str, d = r; *s; s++)
 277                 if ((unsigned char) *s < 128)
 278                         *(d++) = *s;
 279
 280         *d = 0;
 281
 282         return r;
 283 }
 284
 285 char *utf16_to_utf8(const void *s, size_t length) {
 286         char *r;
 287         const uint8_t *f;
 288         uint8_t *t;
 289
 290         r = new(char, (length*3+1)/2 + 1);
 291         if (!r)
 292                 return NULL;
 293
 294         t = (uint8_t*) r;
 295
 296         for (f = s; f < (const uint8_t*) s + length; f += 2) {
 297                 uint16_t c;
 298
 299                 c = (f[1] << 8) | f[0];
 300
 301                 if (c == 0) {
 302                         *t = 0;
 303                         return r;
 304                 } else if (c < 0x80) {
 305                         *(t++) = (uint8_t) c;
 306                 } else if (c < 0x800) {
 307                         *(t++) = (uint8_t) (0xc0 | (c >> 6));
 308                         *(t++) = (uint8_t) (0x80 | (c & 0x3f));
 309                 } else {
 310                         *(t++) = (uint8_t) (0xe0 | (c >> 12));
 311                         *(t++) = (uint8_t) (0x80 | ((c >> 6) & 0x3f));
 312                         *(t++) = (uint8_t) (0x80 | (c & 0x3f));
 313                 }
 314         }
 315
 316         *t = 0;
 317
 318         return r;
 319 }