lib/uwildmat.c

   1 /*  $Id: uwildmat.c 6779 2004-05-17 07:25:28Z rra $
   2 **
   3 **  wildmat pattern matching with Unicode UTF-8 extensions.
   4 **
   5 **  Do shell-style pattern matching for ?, \, [], and * characters.  Might not
   6 **  be robust in face of malformed patterns; e.g., "foo[a-" could cause a
   7 **  segmentation violation.  It is 8-bit clean.  (Robustness hopefully fixed
   8 **  July 2000; all malformed patterns should now just fail to match anything.)
   9 **
  10 **  Original by Rich $alz, mirror!rs, Wed Nov 26 19:03:17 EST 1986.
  11 **  Rich $alz is now <rsalz@osf.org>.
  12 **
  13 **  April, 1991:  Replaced mutually-recursive calls with in-line code for the
  14 **  star character.
  15 **
  16 **  Special thanks to Lars Mathiesen <thorinn@diku.dk> for the ABORT code.
  17 **  This can greatly speed up failing wildcard patterns.  For example:
  18 **
  19 **      pattern: -*-*-*-*-*-*-12-*-*-*-m-*-*-*
  20 **      text 1:  -adobe-courier-bold-o-normal--12-120-75-75-m-70-iso8859-1
  21 **      text 2:  -adobe-courier-bold-o-normal--12-120-75-75-X-70-iso8859-1
  22 **
  23 **  Text 1 matches with 51 calls, while text 2 fails with 54 calls.  Without
  24 **  the ABORT code, it takes 22310 calls to fail.  Ugh.  The following
  25 **  explanation is from Lars:
  26 **
  27 **  The precondition that must be fulfilled is that DoMatch will consume at
  28 **  least one character in text.  This is true if *p is neither '*' nor '\0'.)
  29 **  The last return has ABORT instead of false to avoid quadratic behaviour in
  30 **  cases like pattern "*a*b*c*d" with text "abcxxxxx".  With false, each
  31 **  star-loop has to run to the end of the text; with ABORT only the last one
  32 **  does.
  33 **
  34 **  Once the control of one instance of DoMatch enters the star-loop, that
  35 **  instance will return either true or ABORT, and any calling instance will
  36 **  therefore return immediately after (without calling recursively again).
  37 **  In effect, only one star-loop is ever active.  It would be possible to
  38 **  modify the code to maintain this context explicitly, eliminating all
  39 **  recursive calls at the cost of some complication and loss of clarity (and
  40 **  the ABORT stuff seems to be unclear enough by itself).  I think it would
  41 **  be unwise to try to get this into a released version unless you have a
  42 **  good test data base to try it out on.
  43 **
  44 **  June, 1991:  Robert Elz <kre@munnari.oz.au> added minus and close bracket
  45 **  handling for character sets.
  46 **
  47 **  July, 2000:  Largely rewritten by Russ Allbery <rra@stanford.edu> to add
  48 **  support for ',', '!', and optionally '@' to the core wildmat routine.
  49 **  Broke the character class matching into a separate function for clarity
  50 **  since it's infrequently used in practice, and added some simple lookahead
  51 **  to significantly decrease the recursive calls in the '*' matching code.
  52 **  Added support for UTF-8 as the default character set for any high-bit
  53 **  characters.
  54 **
  55 **  For more information on UTF-8, see RFC 2279.
  56 **
  57 **  Please note that this file is intentionally written so that conditionally
  58 **  executed expressions are on separate lines from the condition to
  59 **  facilitate analysis of the coverage of the test suite using purecov.
  60 **  Please preserve this.  As of March 11, 2001, purecov reports that the
  61 **  accompanying test suite achieves 100% coverage of this file.
  62 */
  63
  64 #include "config.h"
  65 #include "clibrary.h"
  66 #include "libinn.h"
  67
  68 #define ABORT -1
  69
  70 /* Whether or not an octet looks like the start of a UTF-8 character. */
  71 #define ISUTF8(c)       (((c) & 0xc0) == 0xc0)
  72
  73
  74 /*
  75 **  Determine the length of a non-ASCII character in octets (for advancing
  76 **  pointers when skipping over characters).  Takes a pointer to the start of
  77 **  the character and to the last octet of the string.  If end is NULL, expect
  78 **  the string pointed to by start to be nul-terminated.  If the character is
  79 **  malformed UTF-8, return 1 to treat it like an eight-bit local character.
  80 */
  81 static int
  82 utf8_length(const unsigned char *start, const unsigned char *end)
  83 {
  84     unsigned char mask = 0x80;
  85     const unsigned char *p;
  86     int length = 0;
  87     int left;
  88
  89     for (; mask > 0 && (*start & mask) == mask; mask >>= 1)
  90         length++;
  91     if (length < 2 || length > 6)
  92         return 1;
  93     if (end != NULL && (end - start + 1) < length)
  94         return 1;
  95     left = length - 1;
  96     p = start + 1;
  97     for (p = start + 1; left > 0 && (*p & 0xc0) == 0x80; p++)
  98         left--;
  99     return (left == 0) ? length : 1;
 100 }
 101
 102
 103 /*
 104 **  Convert a UTF-8 character to UCS-4.  Takes a pointer to the start of the
 105 **  character and to the last octet of the string, and to a uint32_t into
 106 **  which to put the decoded UCS-4 value.  If end is NULL, expect the string
 107 **  pointed to by start to be nul-terminated.  Returns the number of octets in
 108 **  the UTF-8 encoding.  If the UTF-8 character is malformed, set result to
 109 **  the decimal value of the first octet; this is wrong, but it will generally
 110 **  cause the rest of the wildmat matching to do the right thing for non-UTF-8
 111 **  input.
 112 */
 113 static int
 114 utf8_decode(const unsigned char *start, const unsigned char *end,
 115             uint32_t *result)
 116 {
 117     uint32_t value = 0;
 118     int length, i;
 119     const unsigned char *p = start;
 120     unsigned char mask;
 121
 122     length = utf8_length(start, end);
 123     if (length < 2) {
 124         *result = *start;
 125         return 1;
 126     }
 127     mask = (1 << (7 - length)) - 1;
 128     value = *p & mask;
 129     p++;
 130     for (i = length - 1; i > 0; i--) {
 131         value = (value << 6) | (*p & 0x3f);
 132         p++;
 133     }
 134     *result = value;
 135     return length;
 136 }
 137
 138
 139 /*
 140 **  Match a character class against text, a UCS-4 character.  start is a
 141 **  pointer to the first character of the character class, end a pointer to
 142 **  the last.  Returns whether the class matches that character.
 143 */
 144 static bool
 145 match_class(uint32_t text, const unsigned char *start,
 146             const unsigned char *end)
 147 {
 148     bool reversed, allowrange;
 149     const unsigned char *p = start;
 150     uint32_t first, last;
 151
 152     /* Check for an inverted character class (starting with ^).  If the
 153        character matches the character class, we return !reversed; that way,
 154        we return true if it's a regular character class and false if it's a
 155        reversed one.  If the character doesn't match, we return reversed. */
 156     reversed = (*p == '^');
 157     if (reversed)
 158         p++;
 159
 160     /* Walk through the character class until we reach the end or find a
 161        match, handling character ranges as we go.  Only permit a range to
 162        start when allowrange is true; this allows - to be treated like a
 163        normal character as the first character of the class and catches
 164        malformed ranges like a-e-n.  We treat the character at the beginning
 165        of a range as both a regular member of the class and the beginning of
 166        the range; this is harmless (although it means that malformed ranges
 167        like m-a will match m and nothing else). */
 168     allowrange = false;
 169     while (p <= end) {
 170         if (allowrange && *p == '-' && p < end) {
 171             p++;
 172             p += utf8_decode(p, end, &last);
 173             if (text >= first && text <= last)
 174                 return !reversed;
 175             allowrange = false;
 176         } else {
 177             p += utf8_decode(p, end, &first);
 178             if (text == first)
 179                 return !reversed;
 180             allowrange = true;
 181         }
 182     }
 183     return reversed;
 184 }
 185
 186
 187 /*
 188 **  Match the text against the pattern between start and end.  This is a
 189 **  single pattern; a leading ! or @ must already be taken care of, and
 190 **  commas must be dealt with outside of this routine.
 191 */
 192 static int
 193 match_pattern(const unsigned char *text, const unsigned char *start,
 194               const unsigned char *end)
 195 {
 196     const unsigned char *q, *endclass;
 197     const unsigned char *p = start;
 198     bool ismeta;
 199     int matched, width;
 200     uint32_t c;
 201
 202     for (; p <= end; p++) {
 203         if (!*text && *p != '*')
 204             return ABORT;
 205
 206         switch (*p) {
 207         case '\\':
 208             if (!*++p)
 209                 return ABORT;
 210             /* Fall through. */
 211
 212         default:
 213             if (*text++ != *p)
 214                 return false;
 215             break;
 216
 217         case '?':
 218             text += ISUTF8(*text) ? utf8_length(text, NULL) : 1;
 219             break;
 220
 221         case '*':
 222             /* Consecutive stars are equivalent to one.  Advance pattern to
 223                the character after the star. */
 224             for (++p; *p == '*'; p++)
 225                 ;
 226
 227             /* A trailing star will match anything. */
 228             if (p > end)
 229                 return true;
 230
 231             /* Basic algorithm: Recurse at each point where the * could
 232                possibly match.  If the match succeeds or aborts, return
 233                immediately; otherwise, try the next position.
 234
 235                Optimization: If the character after the * in the pattern
 236                isn't a metacharacter (the common case), then the * has to
 237                consume characters at least up to the next occurance of that
 238                character in the text.  Scan forward for those points rather
 239                than recursing at every possible point to save the extra
 240                function call overhead. */
 241             ismeta = (*p == '[' || *p == '?' || *p == '\\');
 242             while (*text) {
 243                 width = ISUTF8(*text) ? utf8_length(text, NULL) : 1;
 244                 if (ismeta) {
 245                     matched = match_pattern(text, p, end);
 246                     text += width;
 247                 } else {
 248                     while (*text && *text != *p) {
 249                         text += width;
 250                         width = ISUTF8(*text) ? utf8_length(text, NULL) : 1;
 251                     }
 252                     if (!*text)
 253                         return ABORT;
 254                     matched = match_pattern(++text, p + 1, end);
 255                 }
 256                 if (matched != false)
 257                     return matched;
 258             }
 259             return ABORT;
 260
 261         case '[':
 262             /* Find the end of the character class, making sure not to pick
 263                up a close bracket at the beginning of the class. */
 264             p++;
 265             q = p + (*p == '^') + 1;
 266             if (q > end)
 267                 return ABORT;
 268             endclass = memchr(q, ']', (size_t) (end - q + 1));
 269             if (!endclass)
 270                 return ABORT;
 271
 272             /* Do the heavy lifting in another function for clarity, since
 273                character classes are an uncommon case. */
 274             text += utf8_decode(text, NULL, &c);
 275             if (!match_class(c, p, endclass - 1))
 276                 return false;
 277             p = endclass;
 278             break;
 279         }
 280     }
 281
 282     return (*text == '\0');
 283 }
 284
 285
 286 /*
 287 **  Takes text and a wildmat expression; a wildmat expression is a
 288 **  comma-separated list of wildmat patterns, optionally preceeded by ! to
 289 **  invert the sense of the expression.  Returns WILDMAT_MATCH if that
 290 **  expression matches the text, WILDMAT_FAIL otherwise.  If allowpoison is
 291 **  set, allow @ to introduce a poison expression (the same as !, but if it
 292 **  triggers the failed match the routine returns WILDMAT_POISON instead).
 293 */
 294 static enum uwildmat
 295 match_expression(const unsigned char *text, const unsigned char *start,
 296                  bool allowpoison)
 297 {
 298     const unsigned char *end, *split;
 299     const unsigned char *p = start;
 300     bool reverse, escaped;
 301     bool match = false;
 302     bool poison = false;
 303     bool poisoned = false;
 304
 305     /* Handle the empty expression separately, since otherwise end will be
 306        set to an invalid pointer. */
 307     if (!*p)
 308         return !*text ? UWILDMAT_MATCH : UWILDMAT_FAIL;
 309     end = start + strlen((const char *) start) - 1;
 310
 311     /* Main match loop.  Find each comma that separates patterns, and attempt
 312        to match the text with each pattern in order.  The last matching
 313        pattern determines whether the whole expression matches. */
 314     for (; p <= end + 1; p = split + 1) {
 315         if (allowpoison)
 316             poison = (*p == '@');
 317         reverse = (*p == '!') || poison;
 318         if (reverse)
 319             p++;
 320
 321         /* Find the first unescaped comma, if any.  If there is none, split
 322            will be one greater than end and point at the nul at the end of
 323            the string. */
 324         for (escaped = false, split = p; split <= end; split++) {
 325             if (*split == '[') {
 326                 split++;
 327                 if (*split == ']')
 328                     split++;
 329                 while (split <= end && *split != ']')
 330                     split++;
 331             }
 332             if (*split == ',' && !escaped)
 333                 break;
 334             escaped = (*split == '\\') ? !escaped : false;
 335         }
 336
 337         /* Optimization: If match == !reverse and poison == poisoned, this
 338            pattern can't change the result, so don't do any work. */
 339         if (match == !reverse && poison == poisoned)
 340             continue;
 341         if (match_pattern(text, p, split - 1) == true) {
 342             poisoned = poison;
 343             match = !reverse;
 344         }
 345     }
 346     if (poisoned)
 347         return UWILDMAT_POISON;
 348     return match ? UWILDMAT_MATCH : UWILDMAT_FAIL;
 349 }
 350
 351
 352 /*
 353 **  User-level routine used for wildmats where @ should be treated as a
 354 **  regular character.
 355 */
 356 bool
 357 uwildmat(const char *text, const char *pat)
 358 {
 359     const unsigned char *utext = (const unsigned char *) text;
 360     const unsigned char *upat = (const unsigned char *) pat;
 361
 362     if (upat[0] == '*' && upat[1] == '\0')
 363         return true;
 364     else
 365         return (match_expression(utext, upat, false) == UWILDMAT_MATCH);
 366 }
 367
 368
 369 /*
 370 **  User-level routine used for wildmats that support poison matches.
 371 */
 372 enum uwildmat
 373 uwildmat_poison(const char *text, const char *pat)
 374 {
 375     const unsigned char *utext = (const unsigned char *) text;
 376     const unsigned char *upat = (const unsigned char *) pat;
 377
 378     if (upat[0] == '*' && upat[1] == '\0')
 379         return UWILDMAT_MATCH;
 380     else
 381         return match_expression(utext, upat, true);
 382 }
 383
 384
 385 /*
 386 **  User-level routine for simple expressions (neither , nor ! are special).
 387 */
 388 bool
 389 uwildmat_simple(const char *text, const char *pat)
 390 {
 391     const unsigned char *utext = (const unsigned char *) text;
 392     const unsigned char *upat = (const unsigned char *) pat;
 393     size_t length;
 394
 395     if (upat[0] == '*' && upat[1] == '\0')
 396         return true;
 397     else {
 398         length = strlen(pat);
 399         return (match_pattern(utext, upat, upat + length - 1) == true);
 400     }
 401 }