chiark / gitweb /
Merge branch 'master' of git.distorted.org.uk:~mdw/publish/public-git/disorder
[disorder] / lib / strptime.c
CommitLineData
477f956c
RK
1/* strptime.c - partial strptime() reimplementation
2 *
cca89d7c 3 * Copyright (c) 2008, 2011, 2013 Richard Kettlewell.
477f956c
RK
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
14 * 3. The name of the author may not be used to endorse or promote products
15 * derived from this software without specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 */
1a164e63
RK
29/** @file lib/strptime.c
30 * @brief strptime() reimplementation
31 *
32 * strptime() is here reimplemented because the FreeBSD (and older MacOS) one
477f956c 33 * is broken and does not report errors properly. See TODO remarks below for
1a164e63
RK
34 * some missing bits.
35 */
477f956c 36
cca89d7c
RK
37#if HAVE_CONFIG_H
38# include <config.h>
39#endif
40
477f956c
RK
41#include <ctype.h>
42#include <limits.h>
43#include <string.h>
cca89d7c
RK
44#if HAVE_LANGINFO_H
45# include <langinfo.h>
46#endif
477f956c
RK
47#include "strptime.h"
48
cca89d7c
RK
49#if !HAVE_LANGINFO_H
50/* Fake plastic langinfo. Primarily for Windows.
51 * TODO WIN32 can we get these values out of the win32 api instead? */
/** @brief Stand-in for the POSIX nl_item type
 *
 * Only the items this file looks up are provided.  The values are consecutive
 * from zero; within each group the members are numbered from 1.
 */
typedef enum {
  /* Full weekday names, starting from Sunday */
  DAY_1 = 0, DAY_2, DAY_3, DAY_4, DAY_5, DAY_6, DAY_7,
  /* Abbreviated weekday names */
  ABDAY_1, ABDAY_2, ABDAY_3, ABDAY_4, ABDAY_5, ABDAY_6, ABDAY_7,
  /* Full month names */
  MON_1, MON_2, MON_3, MON_4, MON_5, MON_6,
  MON_7, MON_8, MON_9, MON_10, MON_11, MON_12,
  /* Abbreviated month names */
  ABMON_1, ABMON_2, ABMON_3, ABMON_4, ABMON_5, ABMON_6,
  ABMON_7, ABMON_8, ABMON_9, ABMON_10, ABMON_11, ABMON_12,
  /* Date and time formats */
  D_FMT, T_FMT, D_T_FMT,
  /* Era-based date and time formats */
  ERA_D_FMT, ERA_T_FMT, ERA_D_T_FMT,
} nl_item;
98
99const char *nl_langinfo(nl_item item) {
100 switch(item) {
101 case DAY_1: return "Sunday";
102 case DAY_2: return "Monday";
103 case DAY_3: return "Tuesday";
104 case DAY_4: return "Wednesday";
105 case DAY_5: return "Thursday";
106 case DAY_6: return "Friday";
107 case DAY_7: return "Saturday";
108 case ABDAY_1: return "Sun";
109 case ABDAY_2: return "Mon";
110 case ABDAY_3: return "Tue";
111 case ABDAY_4: return "Wed";
112 case ABDAY_5: return "Thu";
113 case ABDAY_6: return "Fri";
114 case ABDAY_7: return "Sat";
115 case MON_1: return "January";
116 case MON_2: return "February";
117 case MON_3: return "March";
118 case MON_4: return "April";
119 case MON_5: return "May";
120 case MON_6: return "June";
121 case MON_7: return "July";
122 case MON_8: return "August";
123 case MON_9: return "September";
124 case MON_10: return "October";
125 case MON_11: return "November";
126 case MON_12: return "December";
127 case ABMON_1: return "Jan";
128 case ABMON_2: return "Feb";
129 case ABMON_3: return "Mar";
130 case ABMON_4: return "Apr";
131 case ABMON_5: return "May";
132 case ABMON_6: return "Jun";
133 case ABMON_7: return "Jul";
134 case ABMON_8: return "Aug";
135 case ABMON_9: return "Sep";
136 case ABMON_10: return "Oct";
137 case ABMON_11: return "Nov";
138 case ABMON_12: return "Dec";
139 case D_FMT: return "%d/%m/%y";
140 case T_FMT: return "%H:%M:%S";
141 case D_T_FMT: return "%a %d %b %Y %H:%M:%S %Z";
142 case ERA_D_FMT: return "";
143 case ERA_T_FMT: return "";
144 case ERA_D_T_FMT: return "";
145 default: return 0;
146 }
147}
148#endif
149
/** @brief One row of a table mapping locale strings to numeric values
 *
 * Tables of these are terminated by a row whose @ref value is -1.
 */
struct locale_item_match {
  /** @brief nl_langinfo() item whose string is compared with the subject */
  nl_item key;

  /** @brief Result to yield when the string for @ref key equals the subject */
  int value;
};
158
159static const struct locale_item_match days[] = {
160 { DAY_1, 0 },
161 { DAY_2, 1 },
162 { DAY_3, 2 },
163 { DAY_4, 3 },
164 { DAY_5, 4 },
165 { DAY_6, 5 },
166 { DAY_7, 6 },
167 { ABDAY_1, 0 },
168 { ABDAY_2, 1 },
169 { ABDAY_3, 2 },
170 { ABDAY_4, 3 },
171 { ABDAY_5, 4 },
172 { ABDAY_6, 5 },
173 { ABDAY_7, 6 },
174 { -1, -1 }
175};
176
177static const struct locale_item_match months[] = {
178 { MON_1, 1 },
179 { MON_2, 2 },
180 { MON_3, 3 },
181 { MON_4, 4 },
182 { MON_5, 5 },
183 { MON_6, 6 },
184 { MON_7, 7 },
185 { MON_8, 8 },
186 { MON_9, 9 },
187 { MON_10, 10 },
188 { MON_11, 11 },
189 { MON_12, 12 },
190 { ABMON_1, 1 },
191 { ABMON_2, 2 },
192 { ABMON_3, 3 },
193 { ABMON_4, 4 },
194 { ABMON_5, 5 },
195 { ABMON_6, 6 },
196 { ABMON_7, 7 },
197 { ABMON_8, 8 },
198 { ABMON_9, 9 },
199 { ABMON_10, 10 },
200 { ABMON_11, 11 },
201 { ABMON_12, 12 },
202 { -1, -1 },
203};
204
/** @brief Compare a bounded subject with a 0-terminated string
 * @param buf First character of subject
 * @param limit One past the last character of subject
 * @param match 0-terminated string to compare against
 * @return Nonzero if [buf,limit) and @p match are equal, otherwise 0
 *
 * Characters are compared after tolower(), so the comparison ignores case at
 * least in ASCII.
 */
static int try_match(const char *buf,
                     const char *limit,
                     const char *match) {
  const char *subject, *candidate;

  /* TODO byte-at-a-time comparison won't work well outside single-byte
   * encodings.  A good bet is probably to convert to Unicode and then use
   * utf32_casefold_compat() (or utf8_casefold_compat()); compatibility
   * matching would ensure missing accents and so on aren't a problem.
   *
   * en_GB and en_US will probably be in any reasonable encoding for them.
   */
  for(subject = buf, candidate = match;
      subject < limit && *candidate;
      ++subject, ++candidate) {
    if(tolower((unsigned char)*subject) != tolower((unsigned char)*candidate))
      return 0;
  }
  /* Equal only if both strings ran out at the same time */
  return subject == limit && !*candidate;
}
233
234/** @brief Match from table of locale-specific strings
235 * @param buf Start of subject
236 * @param limit End of subject
237 * @param lim Table of locale lookups
238 * @return Looked up value or -1
239 *
240 * The match is case-independent.
241 */
242static int try_locale_match(const char *buf,
243 const char *limit,
244 const struct locale_item_match *lim) {
245 /* This is not very efficient! A (correct) built-in implementation will
246 * presumably have more direct access to locale information. */
247 while(lim->value != -1) {
248 if(try_match(buf, limit, nl_langinfo(lim->key)))
249 return lim->value;
250 ++lim;
251 }
252 return -1;
253}
254
/** @brief Parse a bounded string of decimal digits
 * @param buf First character of subject
 * @param limit One past the last character of subject
 * @param low Smallest acceptable value
 * @param high Largest acceptable value
 * @return Parsed value, or -1 on error
 *
 * The whole of [buf,limit) must consist of decimal digits and there must be
 * at least one of them.  It is an error if the value does not fit in an int
 * or lies outside [low,high].
 */
static int try_numeric_match(const char *buf,
                             const char *limit,
                             unsigned low,
                             unsigned high) {
  unsigned n = 0;

  /* An empty subject is not a number (and must not be read as 0) */
  if(buf >= limit)
    return -1;
  while(buf < limit) {
    int ch = (unsigned char)*buf++;
    unsigned digit;

    if(ch < '0' || ch > '9')
      return -1;
    digit = (unsigned)(ch - '0');
    /* Reject only if 10 * n + digit would exceed INT_MAX; INT_MAX itself is
     * representable and so accepted */
    if(n > ((unsigned)INT_MAX - digit) / 10)
      return -1;                        /* overflow */
    n = 10 * n + digit;
  }
  if(n < low || n > high)
    return -1;
  return (int)n;
}
275
276static const char *my_strptime_guts(const char *buf,
277 const char *format,
278 struct tm *tm) {
279 int fc, mod, spec, next, value;
280 const char *limit;
281 /* nl_langinfo() is allowed to trash its last return value so we copy.
282 * (We're relying on it being usable at all in multithreaded environments
283 * though.) */
284#define USE_SUBFORMAT(ITEM, EITEM, DEF) do { \
285 const char *s; \
286 char subformat[128]; \
287 \
288 if(mod == 'E') { \
289 s = nl_langinfo(EITEM); \
290 if(!s || !*s) \
291 s = nl_langinfo(ITEM); \
292 } else \
293 s = nl_langinfo(ITEM); \
294 if(!s || !*s) \
295 s = DEF; \
296 if(strlen(s) >= sizeof subformat) \
297 s = DEF; \
298 strcpy(subformat, s); \
299 if(!(buf = my_strptime_guts(buf, subformat, tm))) \
300 return NULL; \
301} while(0)
302
303 while(*format) {
304 fc = (unsigned char)*format++;
305 if(fc == '%') {
306 /* Get the character defining the converstion specification */
307 spec = (unsigned char)*format++;
308 if(spec == 'E' || spec == 'O') {
309 /* Oops, there's a modifier first */
310 mod = spec;
311 spec = (unsigned char)*format++;
312 } else
313 mod = 0;
314 if(!spec)
315 return NULL; /* format string broken! */
316 /* See what the next directive is. The specification is written in terms
317 * of stopping the match at a character that matches the next directive.
318 * This implementation mirrors this aspect of the specification
319 * directly. */
320 next = (unsigned char)*format;
321 if(next) {
322 limit = buf;
323 if(isspace(next)) {
324 /* Next directive is whitespace, so bound the input string (at least)
325 * by that */
326 while(*limit && !isspace((unsigned char)*limit))
327 ++limit;
328 } else if(next == '%') {
329 /* Prohibited: "The application shall ensure that there is
330 * white-space or other non-alphanumeric characters between any two
331 * conversion specifications". In fact we let alphanumerics
332 * through.
333 *
334 * Forbidding even %% seems a bit harsh but is consistent with the
335 * specification as written.
336 */
337 return NULL;
338 } else {
339 /* Next directive is a specific character, so bound the input string
340 * (at least) by that. This will work badly in the face of multibyte
341 * characters, but then the spec is vague about what kind of string
342 * we're dealing with anyway so you probably couldn't safely use them
343 * in the format string at least in any case. */
344 while(*limit && *limit != next)
345 ++limit;
346 }
347 } else
348 limit = buf + strlen(buf);
349 switch(spec) {
350 case 'A': case 'a': /* day name (abbrev or full) */
351 if((value = try_locale_match(buf, limit, days)) == -1)
352 return NULL;
353 tm->tm_wday = value;
354 break;
355 case 'B': case 'b': case 'h': /* month name (abbrev or full) */
356 if((value = try_locale_match(buf, limit, months)) == -1)
357 return NULL;
358 tm->tm_mon = value - 1;
359 break;
360 case 'c': /* locale date+time */
361 USE_SUBFORMAT(D_T_FMT, ERA_D_T_FMT, "%a %b %e %H:%M:%S %Y");
362 break;
363 case 'C': /* century number 0-99 */
364 /* TODO */
365 return NULL;
366 case 'd': case 'e': /* day of month 1-31 */
367 if((value = try_numeric_match(buf, limit, 1, 31)) == -1)
368 return NULL;
369 tm->tm_mday = value;
370 break;
371 case 'D': /* == "%m / %d / %y" */
372 if(!(buf = my_strptime_guts(buf, "%m / %d / %y", tm)))
373 return NULL;
374 break;
375 case 'H': /* hour 0-23 */
376 if((value = try_numeric_match(buf, limit, 0, 23)) == -1)
377 return NULL;
378 tm->tm_hour = value;
379 break;
380 case 'I': /* hour 1-12 */
381 /* TODO */
382 return NULL;
383 case 'j': /* day 1-366 */
384 if((value = try_numeric_match(buf, limit, 1, 366)) == -1)
385 return NULL;
386 tm->tm_yday = value - 1;
387 return NULL;
388 case 'm': /* month 1-12 */
389 if((value = try_numeric_match(buf, limit, 1, 12)) == -1)
390 return NULL;
391 tm->tm_mon = value - 1;
392 break;
393 case 'M': /* minute 0-59 */
394 if((value = try_numeric_match(buf, limit, 0, 59)) == -1)
395 return NULL;
396 tm->tm_min = value;
397 break;
398 case 'n': case 't': /* any whitespace */
399 goto matchwhitespace;
400 case 'p': /* locale am/pm */
401 /* TODO */
402 return NULL;
403 case 'r': /* == "%I : %M : %S %p" */
404 /* TODO actually this is locale-dependent; and we don't implement %I
405 * anyway, so it's not going to work even as it stands. */
406 if(!(buf = my_strptime_guts(buf, "%I : %M : %S %p", tm)))
407 return NULL;
408 break;
409 case 'R': /* == "%H : %M" */
410 if(!(buf = my_strptime_guts(buf, "%H : %M", tm)))
411 return NULL;
412 break;
413 case 'S': /* seconds 0-60 */
414 if((value = try_numeric_match(buf, limit, 0, 60)) == -1)
415 return NULL;
416 tm->tm_sec = value;
417 break;
418 case 'U': /* week number from Sunday 0-53 */
419 /* TODO */
420 return NULL;
421 case 'w': /* day number 0-6 from Sunday */
422 if((value = try_numeric_match(buf, limit, 0, 6)) == -1)
423 return NULL;
424 tm->tm_wday = value;
425 break;
426 case 'W': /* week number from Monday 0-53 */
427 /* TODO */
428 return NULL;
429 case 'x': /* locale date format */
430 USE_SUBFORMAT(D_FMT, ERA_D_FMT, "%m/%d/%y");
431 break;
432 case 'X': /* locale time format */
433 USE_SUBFORMAT(T_FMT, ERA_T_FMT, "%H:%M:%S");
434 break;
435 case 'y': /* year mod 100 */
436 if((value = try_numeric_match(buf, limit, 0, INT_MAX)) == -1)
437 return NULL;
438 if(value >= 0 && value <= 68)
439 value = 2000 + value;
440 else if(value >= 69 && value <= 99)
441 value = 1900 + value;
442 tm->tm_year = value - 1900;
443 break;
444 case 'Y': /* year */
445 if((value = try_numeric_match(buf, limit, 1, INT_MAX)) == -1)
446 return NULL;
447 tm->tm_year = value - 1900;
448 break;
449 case '%':
450 goto matchself;
451 default:
452 /* The spec is a bit vague about what to do with invalid format
453 * strings. We return NULL immediately and hope someone will
454 * notice. */
455 return NULL;
456 }
457 buf = limit;
458 } else if(isspace(fc)) {
459 matchwhitespace:
460 /* Any format whitespace matches any number of input whitespace
461 * characters. The directive can formally contain more than one
462 * whitespace character; for the second and subsequent ones we'll match 0
463 * characters from the input. */
464 while(isspace((unsigned char)*buf))
465 ++buf;
466 } else {
467 matchself:
468 /* Non-% non-whitespace characters must match themselves exactly */
469 if(fc != (unsigned char)*buf++)
470 return NULL;
471 }
472 }
473 /* When we run out of format string we return a pointer to the rest of the
474 * input. */
475 return buf;
476}
477
/** @brief Reimplementation of strptime()
 * @param buf Input buffer
 * @param format Format string
 * @param tm Where to put result
 * @return Pointer to first unparsed input character, or NULL on error
 *
 * Based on <a
 * href="http://www.opengroup.org/onlinepubs/009695399/functions/strptime.html">http://www.opengroup.org/onlinepubs/009695399/functions/strptime.html</a>.
 */
char *my_strptime(const char *buf,
                  const char *format,
                  struct tm *tm) {
  /* Whether to overwrite or update is unspecified (rather bizarrely).  This
   * implementation does not overwrite: @p tm is handed to the parser as it
   * stands, as xgetdate() depends on this behavior. */
  const char *rest = my_strptime_guts(buf, format, tm);

  /* TODO various things we could/should do after a successful parse:
   * - infer day/month from %j+year
   * - infer day/month from %U/%W+%w/%a+year
   * - infer hour from %p+%I
   * - fill wday/yday from other fields
   */
  return rest ? (char *)rest : NULL;
}
504
505/*
506Local Variables:
507c-basic-offset:2
508comment-column:40
509fill-column:79
510indent-tabs-mode:nil
511End:
512*/