src/shared/utf8.c

   1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
   2
   3 /***
   4   This file is part of systemd.
   5
   6   Copyright 2012 Lennart Poettering
   7
   8   systemd is free software; you can redistribute it and/or modify it
   9   under the terms of the GNU Lesser General Public License as published by
  10   the Free Software Foundation; either version 2.1 of the License, or
  11   (at your option) any later version.
  12
  13   systemd is distributed in the hope that it will be useful, but
  14   WITHOUT ANY WARRANTY; without even the implied warranty of
  15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  16   Lesser General Public License for more details.
  17
  18   You should have received a copy of the GNU Lesser General Public License
  19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
  20 ***/
  21
  22 /* This file is based on the GLIB utf8 validation functions. The
  23  * original license text follows. */
  24
  25 /* gutf8.c - Operations on UTF-8 strings.
  26  *
  27  * Copyright (C) 1999 Tom Tromey
  28  * Copyright (C) 2000 Red Hat, Inc.
  29  *
  30  * This library is free software; you can redistribute it and/or
  31  * modify it under the terms of the GNU Lesser General Public
  32  * License as published by the Free Software Foundation; either
  33  * version 2 of the License, or (at your option) any later version.
  34  *
  35  * This library is distributed in the hope that it will be useful,
  36  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  37  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.         See the GNU
  38  * Lesser General Public License for more details.
  39  *
  40  * You should have received a copy of the GNU Lesser General Public
  41  * License along with this library; if not, write to the
  42  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  43  * Boston, MA 02111-1307, USA.
  44  */
  45
  46 #include <errno.h>
  47 #include <stdlib.h>
  48 #include <inttypes.h>
  49 #include <string.h>
  50 #include <stdbool.h>
  51
  52 #include "utf8.h"
  53
/* Substitute byte that utf8_validate() stores in its output buffer in
 * place of the byte at which an invalid sequence starts. */
#define FILTER_CHAR '_'
  55
  56 static inline bool is_unicode_valid(uint32_t ch) {
  57
  58         if (ch >= 0x110000) /* End of unicode space */
  59                 return false;
  60         if ((ch & 0xFFFFF800) == 0xD800) /* Reserved area for UTF-16 */
  61                 return false;
  62         if ((ch >= 0xFDD0) && (ch <= 0xFDEF)) /* Reserved */
  63                 return false;
  64         if ((ch & 0xFFFE) == 0xFFFE) /* BOM (Byte Order Mark) */
  65                 return false;
  66
  67         return true;
  68 }
  69
  70 static inline bool is_continuation_char(uint8_t ch) {
  71         if ((ch & 0xc0) != 0x80) /* 10xxxxxx */
  72                 return false;
  73         return true;
  74 }
  75
  76 static inline void merge_continuation_char(uint32_t *u_ch, uint8_t ch) {
  77         *u_ch <<= 6;
  78         *u_ch |= ch & 0x3f;
  79 }
  80
  81 static bool is_unicode_control(uint32_t ch) {
  82
  83         /*
  84           0 to ' '-1 is the C0 range.
  85           DEL=0x7F, and DEL+1 to 0x9F is C1 range.
  86           '\t' is in C0 range, but more or less harmless and commonly used.
  87         */
  88
  89         return (ch < ' ' && ch != '\t') ||
  90                 (0x7F <= ch && ch <= 0x9F);
  91 }
  92
  93 char* utf8_is_printable_n(const char* str, size_t length) {
  94         uint32_t val = 0;
  95         uint32_t min = 0;
  96         const uint8_t *p;
  97
  98         assert(str);
  99
 100         for (p = (const uint8_t*) str; length; p++, length--) {
 101                 if (*p < 128) {
 102                         val = *p;
 103                 } else {
 104                         if ((*p & 0xe0) == 0xc0) { /* 110xxxxx two-char seq. */
 105                                 min = 128;
 106                                 val = (uint32_t) (*p & 0x1e);
 107                                 goto ONE_REMAINING;
 108                         } else if ((*p & 0xf0) == 0xe0) { /* 1110xxxx three-char seq.*/
 109                                 min = (1 << 11);
 110                                 val = (uint32_t) (*p & 0x0f);
 111                                 goto TWO_REMAINING;
 112                         } else if ((*p & 0xf8) == 0xf0) { /* 11110xxx four-char seq */
 113                                 min = (1 << 16);
 114                                 val = (uint32_t) (*p & 0x07);
 115                         } else
 116                                 goto error;
 117
 118                         p++;
 119                         length--;
 120                         if (!length || !is_continuation_char(*p))
 121                                 goto error;
 122                         merge_continuation_char(&val, *p);
 123
 124                 TWO_REMAINING:
 125                         p++;
 126                         length--;
 127                         if (!is_continuation_char(*p))
 128                                 goto error;
 129                         merge_continuation_char(&val, *p);
 130
 131                 ONE_REMAINING:
 132                         p++;
 133                         length--;
 134                         if (!is_continuation_char(*p))
 135                                 goto error;
 136                         merge_continuation_char(&val, *p);
 137
 138                         if (val < min)
 139                                 goto error;
 140                 }
 141
 142                 if (is_unicode_control(val))
 143                         goto error;
 144         }
 145
 146         return (char*) str;
 147
 148 error:
 149         return NULL;
 150 }
 151
 152 static char* utf8_validate(const char *str, char *output) {
 153         uint32_t val = 0;
 154         uint32_t min = 0;
 155         const uint8_t *p, *last;
 156         int size;
 157         uint8_t *o;
 158
 159         assert(str);
 160
 161         o = (uint8_t*) output;
 162         for (p = (const uint8_t*) str; *p; p++) {
 163                 if (*p < 128) {
 164                         if (o)
 165                                 *o = *p;
 166                 } else {
 167                         last = p;
 168
 169                         if ((*p & 0xe0) == 0xc0) { /* 110xxxxx two-char seq. */
 170                                 size = 2;
 171                                 min = 128;
 172                                 val = (uint32_t) (*p & 0x1e);
 173                                 goto ONE_REMAINING;
 174                         } else if ((*p & 0xf0) == 0xe0) { /* 1110xxxx three-char seq.*/
 175                                 size = 3;
 176                                 min = (1 << 11);
 177                                 val = (uint32_t) (*p & 0x0f);
 178                                 goto TWO_REMAINING;
 179                         } else if ((*p & 0xf8) == 0xf0) { /* 11110xxx four-char seq */
 180                                 size = 4;
 181                                 min = (1 << 16);
 182                                 val = (uint32_t) (*p & 0x07);
 183                         } else
 184                                 goto error;
 185
 186                         p++;
 187                         if (!is_continuation_char(*p))
 188                                 goto error;
 189                         merge_continuation_char(&val, *p);
 190
 191                 TWO_REMAINING:
 192                         p++;
 193                         if (!is_continuation_char(*p))
 194                                 goto error;
 195                         merge_continuation_char(&val, *p);
 196
 197                 ONE_REMAINING:
 198                         p++;
 199                         if (!is_continuation_char(*p))
 200                                 goto error;
 201                         merge_continuation_char(&val, *p);
 202
 203                         if (val < min)
 204                                 goto error;
 205
 206                         if (!is_unicode_valid(val))
 207                                 goto error;
 208
 209                         if (o) {
 210                                 memcpy(o, last, (size_t) size);
 211                                 o += size;
 212                         }
 213
 214                         continue;
 215
 216                 error:
 217                         if (o) {
 218                                 *o = FILTER_CHAR;
 219                                 p = last; /* We retry at the next character */
 220                         } else
 221                                 goto failure;
 222                 }
 223
 224                 if (o)
 225                         o++;
 226         }
 227
 228         if (o) {
 229                 *o = '\0';
 230                 return output;
 231         }
 232
 233         return (char*) str;
 234
 235 failure:
 236         return NULL;
 237 }
 238
 239 char* utf8_is_valid (const char *str) {
 240         return utf8_validate(str, NULL);
 241 }
 242
 243 char* utf8_filter (const char *str) {
 244         char *new_str;
 245
 246         assert(str);
 247
 248         new_str = malloc(strlen(str) + 1);
 249         if (!new_str)
 250                 return NULL;
 251
 252         return utf8_validate(str, new_str);
 253 }
 254
 255 char *ascii_is_valid(const char *str) {
 256         const char *p;
 257
 258         assert(str);
 259
 260         for (p = str; *p; p++)
 261                 if ((unsigned char) *p >= 128)
 262                         return NULL;
 263
 264         return (char*) str;
 265 }
 266
 267 char *ascii_filter(const char *str) {
 268         const char *s;
 269         char *r, *d;
 270         size_t l;
 271
 272         assert(str);
 273
 274         l = strlen(str);
 275         r = malloc(l + 1);
 276         if (!r)
 277                 return NULL;
 278
 279         for (s = str, d = r; *s; s++)
 280                 if ((unsigned char) *s < 128)
 281                         *(d++) = *s;
 282
 283         *d = 0;
 284
 285         return r;
 286 }