18 /*************************************************
19 * PCRE DEMONSTRATION PROGRAM *
20 *************************************************/
22 /* This is a demonstration program to illustrate the most straightforward ways
23 of calling the PCRE regular expression library from a C program. See the
24 pcresample documentation for a short discussion ("man pcresample" if you have
25 the PCRE man pages installed).
27 In Unix-like environments, if PCRE is installed in your standard system
28 libraries, you should be able to compile this program using this command:
30 gcc -Wall pcredemo.c -lpcre -o pcredemo
32 If PCRE is not installed in a standard place, it is likely to be installed with
33 support for the pkg-config mechanism. If you have pkg-config, you can compile
34 this program using this command:
36 gcc -Wall pcredemo.c `pkg-config --cflags --libs libpcre` -o pcredemo
38 If you do not have pkg-config, you may have to use this:
40 gcc -Wall pcredemo.c -I/usr/local/include -L/usr/local/lib \e
41 -R/usr/local/lib -lpcre -o pcredemo
43 Replace "/usr/local/include" and "/usr/local/lib" with wherever the include and
44 library files for PCRE are installed on your system. Only some operating
45 systems (e.g. Solaris) use the -R option.
47 Building under Windows:
49 If you want to statically link this program against a non-dll .a file, you must
50 define PCRE_STATIC before including pcre.h, otherwise the pcre_malloc() and
51 pcre_free() exported functions will be declared __declspec(dllimport), with
52 unwanted results. So in this environment, uncomment the following line. */
54 /* #define PCRE_STATIC */
60 #define OVECCOUNT 30 /* should be a multiple of 3 */
63 int main(int argc, char **argv)
69 unsigned char *name_table;
70 unsigned int option_bits;
76 int ovector[OVECCOUNT];
82 /**************************************************************************
83 * First, sort out the command line. There is only one possible option at *
84 * the moment, "-g" to request repeated matching to find all occurrences, *
85 * like Perl's /g option. We set the variable find_all to a non-zero value *
86 * if the -g option is present. Apart from that, there must be exactly two arguments. *
88 **************************************************************************/
91 for (i = 1; i < argc; i++)
93 if (strcmp(argv[i], "-g") == 0) find_all = 1;
97 /* After the options, we require exactly two arguments, which are the pattern,
98 and the subject string. */
102 printf("Two arguments required: a regex and a subject string\en");
108 subject_length = (int)strlen(subject);
111 /*************************************************************************
112 * Now we are going to compile the regular expression pattern, and handle *
113 * any errors that are detected. *
114 *************************************************************************/
117 pattern, /* the pattern */
118 0, /* default options */
119 &error, /* for error message */
120 &erroffset, /* for error offset */
121 NULL); /* use default character tables */
123 /* Compilation failed: print the error message and exit */
127 printf("PCRE compilation failed at offset %d: %s\en", erroffset, error);
132 /*************************************************************************
133 * If the compilation succeeded, we call PCRE again, in order to do a *
134 * pattern match against the subject string. This does just ONE match. If *
135 * further matching is needed, it will be done below. *
136 *************************************************************************/
139 re, /* the compiled pattern */
140 NULL, /* no extra data - we didn't study the pattern */
141 subject, /* the subject string */
142 subject_length, /* the length of the subject */
143 0, /* start at offset 0 in the subject */
144 0, /* default options */
145 ovector, /* output vector for substring information */
146 OVECCOUNT); /* number of elements in the output vector */
148 /* Matching failed: handle error cases */
154 case PCRE_ERROR_NOMATCH: printf("No match\en"); break;
156 /* Handle other special cases if you like */
158 default: printf("Matching error %d\en", rc); break;
160 pcre_free(re); /* Release memory used for the compiled pattern */
166 printf("\enMatch succeeded at offset %d\en", ovector[0]);
169 /*************************************************************************
170 * We have found the first match within the subject string. If the output *
171 * vector wasn't big enough, say so. Then output any substrings that were captured. *
173 *************************************************************************/
175 /* The output vector wasn't big enough */
180 printf("ovector only has room for %d captured substrings\en", rc - 1);
183 /* Show substrings stored in the output vector by number. Obviously, in a real
184 application you might want to do things other than print them. */
186 for (i = 0; i < rc; i++)
188 char *substring_start = subject + ovector[2*i];
189 int substring_length = ovector[2*i+1] - ovector[2*i];
190 printf("%2d: %.*s\en", i, substring_length, substring_start);
194 /**************************************************************************
195 * That concludes the basic part of this demonstration program. We have *
196 * compiled a pattern, and performed a single match. The code that follows *
197 * shows first how to access named substrings, and then how to code for *
198 * repeated matches on the same subject. *
199 **************************************************************************/
201 /* See if there are any named substrings, and if so, show them by name. First
202 we have to extract the count of named parentheses from the pattern. */
205 re, /* the compiled pattern */
206 NULL, /* no extra data - we didn't study the pattern */
207 PCRE_INFO_NAMECOUNT, /* number of named substrings */
208 &namecount); /* where to put the answer */
210 if (namecount <= 0) printf("No named substrings\en"); else
212 unsigned char *tabptr;
213 printf("Named substrings\en");
215 /* Before we can access the substrings, we must extract the table for
216 translating names to numbers, and the size of each entry in the table. */
219 re, /* the compiled pattern */
220 NULL, /* no extra data - we didn't study the pattern */
221 PCRE_INFO_NAMETABLE, /* address of the table */
222 &name_table); /* where to put the answer */
225 re, /* the compiled pattern */
226 NULL, /* no extra data - we didn't study the pattern */
227 PCRE_INFO_NAMEENTRYSIZE, /* size of each entry in the table */
228 &name_entry_size); /* where to put the answer */
230 /* Now we can scan the table and, for each entry, print the number, the name,
231 and the substring itself. */
234 for (i = 0; i < namecount; i++)
236 int n = (tabptr[0] << 8) | tabptr[1];
237 printf("(%d) %*s: %.*s\en", n, name_entry_size - 3, tabptr + 2,
238 ovector[2*n+1] - ovector[2*n], subject + ovector[2*n]);
239 tabptr += name_entry_size;
244 /*************************************************************************
245 * If the "-g" option was given on the command line, we want to continue *
246 * to search for additional matches in the subject string, in a similar *
247 * way to the /g option in Perl. This turns out to be trickier than you *
248 * might think because of the possibility of matching an empty string. *
249 * What happens is as follows: *
251 * If the previous match was NOT for an empty string, we can just start *
252 * the next match at the end of the previous one. *
254 * If the previous match WAS for an empty string, we can't do that, as it *
255 * would lead to an infinite loop. Instead, a special call of pcre_exec() *
256 * is made with the PCRE_NOTEMPTY_ATSTART and PCRE_ANCHORED flags set. *
257 * The first of these tells PCRE that an empty string at the start of the *
258 * subject is not a valid match; other possibilities must be tried. The *
259 * second flag restricts PCRE to one match attempt at the initial string *
260 * position. If this match succeeds, an alternative to the empty string *
261 * match has been found, and we can print it and proceed round the loop, *
262 * advancing by the length of whatever was found. If this match does not *
263 * succeed, we still stay in the loop, advancing by just one character. *
264 * In UTF-8 mode, which can be set by (*UTF8) in the pattern, this may be *
265 * more than one byte. *
267 * However, there is a complication concerned with newlines. When the *
268 * newline convention is such that CRLF is a valid newline, we must *
269 * advance by two characters rather than one. The newline convention can *
270 * be set in the regex by (*CR), etc.; if not, we must find the default. *
271 *************************************************************************/
273 if (!find_all) /* Check for -g */
275 pcre_free(re); /* Release the memory used for the compiled pattern */
276 return 0; /* Finish unless -g was given */
279 /* Before running the loop, check for UTF-8 and whether CRLF is a valid newline
280 sequence. First, find the options with which the regex was compiled; extract
281 the UTF-8 state, and mask off all but the newline options. */
283 (void)pcre_fullinfo(re, NULL, PCRE_INFO_OPTIONS, &option_bits);
284 utf8 = option_bits & PCRE_UTF8;
285 option_bits &= PCRE_NEWLINE_CR|PCRE_NEWLINE_LF|PCRE_NEWLINE_CRLF|
286 PCRE_NEWLINE_ANY|PCRE_NEWLINE_ANYCRLF;
288 /* If no newline options were set, find the default newline convention from the
289 build configuration. */
291 if (option_bits == 0)
294 (void)pcre_config(PCRE_CONFIG_NEWLINE, &d);
295 /* Note that these values are always the ASCII ones, even in
296 EBCDIC environments. CR = 13, NL = 10. */
297 option_bits = (d == 13)? PCRE_NEWLINE_CR :
298 (d == 10)? PCRE_NEWLINE_LF :
299 (d == (13<<8 | 10))? PCRE_NEWLINE_CRLF :
300 (d == -2)? PCRE_NEWLINE_ANYCRLF :
301 (d == -1)? PCRE_NEWLINE_ANY : 0;
304 /* See if CRLF is a valid newline sequence. */
307 option_bits == PCRE_NEWLINE_ANY ||
308 option_bits == PCRE_NEWLINE_CRLF ||
309 option_bits == PCRE_NEWLINE_ANYCRLF;
311 /* Loop for second and subsequent matches */
315 int options = 0; /* Normally no options */
316 int start_offset = ovector[1]; /* Start at end of previous match */
318 /* If the previous match was for an empty string, we are finished if we are
319 at the end of the subject. Otherwise, arrange to run another match at the
320 same point to see if a non-empty match can be found. */
322 if (ovector[0] == ovector[1])
324 if (ovector[0] == subject_length) break;
325 options = PCRE_NOTEMPTY_ATSTART | PCRE_ANCHORED;
328 /* Run the next matching operation */
331 re, /* the compiled pattern */
332 NULL, /* no extra data - we didn't study the pattern */
333 subject, /* the subject string */
334 subject_length, /* the length of the subject */
335 start_offset, /* starting offset in the subject */
336 options, /* options */
337 ovector, /* output vector for substring information */
338 OVECCOUNT); /* number of elements in the output vector */
340 /* This time, a result of NOMATCH isn't an error. If the value in "options"
341 is zero, it just means we have found all possible matches, so the loop ends.
342 Otherwise, it means we have failed to find a non-empty-string match at a
343 point where there was a previous empty-string match. In this case, we do what
344 Perl does: advance the matching position by one character, and continue. We
345 do this by setting the "end of previous match" offset, because that is picked
346 up at the top of the loop as the point at which to start again.
348 There are two complications: (a) When CRLF is a valid newline sequence, and
349 the current position is just before it, advance by an extra byte. (b)
350 Otherwise we must ensure that we skip an entire UTF-8 character if we are in UTF-8 mode. */
353 if (rc == PCRE_ERROR_NOMATCH)
355 if (options == 0) break; /* All matches found */
356 ovector[1] = start_offset + 1; /* Advance one byte */
357 if (crlf_is_newline && /* If CRLF is newline & */
358 start_offset < subject_length - 1 && /* we are at CRLF, */
359 subject[start_offset] == '\er' &&
360 subject[start_offset + 1] == '\en')
361 ovector[1] += 1; /* Advance by one more. */
362 else if (utf8) /* Otherwise, ensure we */
363 { /* advance a whole UTF-8 */
364 while (ovector[1] < subject_length) /* character. */
366 if ((subject[ovector[1]] & 0xc0) != 0x80) break;
370 continue; /* Go round the loop again */
373 /* Other matching errors are not recoverable. */
377 printf("Matching error %d\en", rc);
378 pcre_free(re); /* Release memory used for the compiled pattern */
384 printf("\enMatch succeeded again at offset %d\en", ovector[0]);
386 /* The match succeeded, but the output vector wasn't big enough. */
391 printf("ovector only has room for %d captured substrings\en", rc - 1);
394 /* As before, show substrings stored in the output vector by number, and then
395 also any named substrings. */
397 for (i = 0; i < rc; i++)
399 char *substring_start = subject + ovector[2*i];
400 int substring_length = ovector[2*i+1] - ovector[2*i];
401 printf("%2d: %.*s\en", i, substring_length, substring_start);
404 if (namecount <= 0) printf("No named substrings\en"); else
406 unsigned char *tabptr = name_table;
407 printf("Named substrings\en");
408 for (i = 0; i < namecount; i++)
410 int n = (tabptr[0] << 8) | tabptr[1];
411 printf("(%d) %*s: %.*s\en", n, name_entry_size - 3, tabptr + 2,
412 ovector[2*n+1] - ovector[2*n], subject + ovector[2*n]);
413 tabptr += name_entry_size;
416 } /* End of loop to find second and subsequent matches */
419 pcre_free(re); /* Release memory used for the compiled pattern */
423 /* End of pcredemo.c */