1 /* $Id: uwildmat.c 6779 2004-05-17 07:25:28Z rra $
3 ** wildmat pattern matching with Unicode UTF-8 extensions.
5 ** Do shell-style pattern matching for ?, \, [], and * characters. Might not
6 ** be robust in face of malformed patterns; e.g., "foo[a-" could cause a
7 ** segmentation violation. It is 8-bit clean. (Robustness hopefully fixed
8 ** July 2000; all malformed patterns should now just fail to match anything.)
10 ** Original by Rich $alz, mirror!rs, Wed Nov 26 19:03:17 EST 1986.
11 ** Rich $alz is now <rsalz@osf.org>.
13 ** April, 1991: Replaced mutually-recursive calls with in-line code for the
16 ** Special thanks to Lars Mathiesen <thorinn@diku.dk> for the ABORT code.
17 ** This can greatly speed up failing wildcard patterns. For example:
19 ** pattern: -*-*-*-*-*-*-12-*-*-*-m-*-*-*
20 ** text 1: -adobe-courier-bold-o-normal--12-120-75-75-m-70-iso8859-1
21 ** text 2: -adobe-courier-bold-o-normal--12-120-75-75-X-70-iso8859-1
23 ** Text 1 matches with 51 calls, while text 2 fails with 54 calls. Without
24 ** the ABORT code, it takes 22310 calls to fail. Ugh. The following
25 ** explanation is from Lars:
27 ** The precondition that must be fulfilled is that DoMatch will consume at
28 ** least one character in text. This is true if *p is neither '*' nor '\0'.)
29 ** The last return has ABORT instead of false to avoid quadratic behaviour in
30 ** cases like pattern "*a*b*c*d" with text "abcxxxxx". With false, each
31 ** star-loop has to run to the end of the text; with ABORT only the last one
34 ** Once the control of one instance of DoMatch enters the star-loop, that
35 ** instance will return either true or ABORT, and any calling instance will
36 ** therefore return immediately after (without calling recursively again).
37 ** In effect, only one star-loop is ever active. It would be possible to
38 ** modify the code to maintain this context explicitly, eliminating all
39 ** recursive calls at the cost of some complication and loss of clarity (and
40 ** the ABORT stuff seems to be unclear enough by itself). I think it would
41 ** be unwise to try to get this into a released version unless you have a
42 ** good test data base to try it out on.
44 ** June, 1991: Robert Elz <kre@munnari.oz.au> added minus and close bracket
45 ** handling for character sets.
47 ** July, 2000: Largely rewritten by Russ Allbery <rra@stanford.edu> to add
48 ** support for ',', '!', and optionally '@' to the core wildmat routine.
49 ** Broke the character class matching into a separate function for clarity
50 ** since it's infrequently used in practice, and added some simple lookahead
51 ** to significantly decrease the recursive calls in the '*' matching code.
52 ** Added support for UTF-8 as the default character set for any high-bit
55 ** For more information on UTF-8, see RFC 2279.
57 ** Please note that this file is intentionally written so that conditionally
58 ** executed expressions are on separate lines from the condition to
59 ** facilitate analysis of the coverage of the test suite using purecov.
60 ** Please preserve this. As of March 11, 2001, purecov reports that the
61 ** accompanying test suite achieves 100% coverage of this file.
70 /* Whether or not an octet looks like the start of a UTF-8 character. */
71 #define ISUTF8(c) (((c) & 0xc0) == 0xc0)
75 ** Determine the length of a non-ASCII character in octets (for advancing
76 ** pointers when skipping over characters). Takes a pointer to the start of
77 ** the character and to the last octet of the string. If end is NULL, expect
78 ** the string pointed to by start to be nul-terminated. If the character is
79 ** malformed UTF-8, return 1 to treat it like an eight-bit local character.
82 utf8_length(const unsigned char *start, const unsigned char *end)
84 unsigned char mask = 0x80;
85 const unsigned char *p;
89 for (; mask > 0 && (*start & mask) == mask; mask >>= 1)
91 if (length < 2 || length > 6)
93 if (end != NULL && (end - start + 1) < length)
97 for (p = start + 1; left > 0 && (*p & 0xc0) == 0x80; p++)
99 return (left == 0) ? length : 1;
104 ** Convert a UTF-8 character to UCS-4. Takes a pointer to the start of the
105 ** character and to the last octet of the string, and to a uint32_t into
106 ** which to put the decoded UCS-4 value. If end is NULL, expect the string
107 ** pointed to by start to be nul-terminated. Returns the number of octets in
108 ** the UTF-8 encoding. If the UTF-8 character is malformed, set result to
109 ** the decimal value of the first octet; this is wrong, but it will generally
110 ** cause the rest of the wildmat matching to do the right thing for non-UTF-8
114 utf8_decode(const unsigned char *start, const unsigned char *end,
119 const unsigned char *p = start;
122 length = utf8_length(start, end);
127 mask = (1 << (7 - length)) - 1;
130 for (i = length - 1; i > 0; i--) {
131 value = (value << 6) | (*p & 0x3f);
140 ** Match a character class against text, a UCS-4 character. start is a
141 ** pointer to the first character of the character class, end a pointer to
142 ** the last. Returns whether the class matches that character.
145 match_class(uint32_t text, const unsigned char *start,
146 const unsigned char *end)
148 bool reversed, allowrange;
149 const unsigned char *p = start;
150 uint32_t first, last;
152 /* Check for an inverted character class (starting with ^). If the
153 character matches the character class, we return !reversed; that way,
154 we return true if it's a regular character class and false if it's a
155 reversed one. If the character doesn't match, we return reversed. */
156 reversed = (*p == '^');
160 /* Walk through the character class until we reach the end or find a
161 match, handling character ranges as we go. Only permit a range to
162 start when allowrange is true; this allows - to be treated like a
163 normal character as the first character of the class and catches
164 malformed ranges like a-e-n. We treat the character at the beginning
165 of a range as both a regular member of the class and the beginning of
166 the range; this is harmless (although it means that malformed ranges
167 like m-a will match m and nothing else). */
170 if (allowrange && *p == '-' && p < end) {
172 p += utf8_decode(p, end, &last);
173 if (text >= first && text <= last)
177 p += utf8_decode(p, end, &first);
188 ** Match the text against the pattern between start and end. This is a
189 ** single pattern; a leading ! or @ must already be taken care of, and
190 ** commas must be dealt with outside of this routine.
193 match_pattern(const unsigned char *text, const unsigned char *start,
194 const unsigned char *end)
196 const unsigned char *q, *endclass;
197 const unsigned char *p = start;
202 for (; p <= end; p++) {
203 if (!*text && *p != '*')
218 text += ISUTF8(*text) ? utf8_length(text, NULL) : 1;
222 /* Consecutive stars are equivalent to one. Advance pattern to
223 the character after the star. */
224 for (++p; *p == '*'; p++)
227 /* A trailing star will match anything. */
231 /* Basic algorithm: Recurse at each point where the * could
232 possibly match. If the match succeeds or aborts, return
233 immediately; otherwise, try the next position.
235 Optimization: If the character after the * in the pattern
236 isn't a metacharacter (the common case), then the * has to
237 consume characters at least up to the next occurance of that
238 character in the text. Scan forward for those points rather
239 than recursing at every possible point to save the extra
240 function call overhead. */
241 ismeta = (*p == '[' || *p == '?' || *p == '\\');
243 width = ISUTF8(*text) ? utf8_length(text, NULL) : 1;
245 matched = match_pattern(text, p, end);
248 while (*text && *text != *p) {
250 width = ISUTF8(*text) ? utf8_length(text, NULL) : 1;
254 matched = match_pattern(++text, p + 1, end);
256 if (matched != false)
262 /* Find the end of the character class, making sure not to pick
263 up a close bracket at the beginning of the class. */
265 q = p + (*p == '^') + 1;
268 endclass = memchr(q, ']', (size_t) (end - q + 1));
272 /* Do the heavy lifting in another function for clarity, since
273 character classes are an uncommon case. */
274 text += utf8_decode(text, NULL, &c);
275 if (!match_class(c, p, endclass - 1))
282 return (*text == '\0');
287 ** Takes text and a wildmat expression; a wildmat expression is a
288 ** comma-separated list of wildmat patterns, optionally preceeded by ! to
289 ** invert the sense of the expression. Returns WILDMAT_MATCH if that
290 ** expression matches the text, WILDMAT_FAIL otherwise. If allowpoison is
291 ** set, allow @ to introduce a poison expression (the same as !, but if it
292 ** triggers the failed match the routine returns WILDMAT_POISON instead).
295 match_expression(const unsigned char *text, const unsigned char *start,
298 const unsigned char *end, *split;
299 const unsigned char *p = start;
300 bool reverse, escaped;
303 bool poisoned = false;
305 /* Handle the empty expression separately, since otherwise end will be
306 set to an invalid pointer. */
308 return !*text ? UWILDMAT_MATCH : UWILDMAT_FAIL;
309 end = start + strlen((const char *) start) - 1;
311 /* Main match loop. Find each comma that separates patterns, and attempt
312 to match the text with each pattern in order. The last matching
313 pattern determines whether the whole expression matches. */
314 for (; p <= end + 1; p = split + 1) {
316 poison = (*p == '@');
317 reverse = (*p == '!') || poison;
321 /* Find the first unescaped comma, if any. If there is none, split
322 will be one greater than end and point at the nul at the end of
324 for (escaped = false, split = p; split <= end; split++) {
329 while (split <= end && *split != ']')
332 if (*split == ',' && !escaped)
334 escaped = (*split == '\\') ? !escaped : false;
337 /* Optimization: If match == !reverse and poison == poisoned, this
338 pattern can't change the result, so don't do any work. */
339 if (match == !reverse && poison == poisoned)
341 if (match_pattern(text, p, split - 1) == true) {
347 return UWILDMAT_POISON;
348 return match ? UWILDMAT_MATCH : UWILDMAT_FAIL;
353 ** User-level routine used for wildmats where @ should be treated as a
354 ** regular character.
357 uwildmat(const char *text, const char *pat)
359 const unsigned char *utext = (const unsigned char *) text;
360 const unsigned char *upat = (const unsigned char *) pat;
362 if (upat[0] == '*' && upat[1] == '\0')
365 return (match_expression(utext, upat, false) == UWILDMAT_MATCH);
370 ** User-level routine used for wildmats that support poison matches.
373 uwildmat_poison(const char *text, const char *pat)
375 const unsigned char *utext = (const unsigned char *) text;
376 const unsigned char *upat = (const unsigned char *) pat;
378 if (upat[0] == '*' && upat[1] == '\0')
379 return UWILDMAT_MATCH;
381 return match_expression(utext, upat, true);
386 ** User-level routine for simple expressions (neither , nor ! are special).
389 uwildmat_simple(const char *text, const char *pat)
391 const unsigned char *utext = (const unsigned char *) text;
392 const unsigned char *upat = (const unsigned char *) pat;
395 if (upat[0] == '*' && upat[1] == '\0')
398 length = strlen(pat);
399 return (match_pattern(utext, upat, upat + length - 1) == true);