Commit | Line | Data |
---|---|---|
477f956c RK |
1 | /* strptime.c - partial strptime() reimplementation |
2 | * | |
3 | * (c) 2008 Richard Kettlewell. | |
4 | * All rights reserved. | |
5 | * | |
6 | * Redistribution and use in source and binary forms, with or without | |
7 | * modification, are permitted provided that the following conditions | |
8 | * are met: | |
9 | * 1. Redistributions of source code must retain the above copyright | |
10 | * notice, this list of conditions and the following disclaimer. | |
11 | * 2. Redistributions in binary form must reproduce the above copyright | |
12 | * notice, this list of conditions and the following disclaimer in the | |
13 | * documentation and/or other materials provided with the distribution. | |
14 | * 3. The name of the author may not be used to endorse or promote products | |
15 | * derived from this software without specific prior written permission. | |
16 | * | |
17 | * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND | |
18 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
19 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |
20 | * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE | |
21 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |
22 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS | |
23 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | |
24 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | |
25 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY | |
26 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | |
27 | * SUCH DAMAGE. | |
28 | */ | |
29 | ||
30 | /* strptime() is here reimplemented because the FreeBSD (and older MacOS) one | |
31 | * is broken and does not report errors properly. See TODO remarks below for | |
32 | * some missing bits. */ | |
33 | ||
34 | #include <ctype.h> | |
35 | #include <limits.h> | |
36 | #include <string.h> | |
37 | #include <langinfo.h> | |
38 | #include "strptime.h" | |
39 | ||
/** @brief One row of a locale name lookup table
 *
 * Tables built from these rows end with a row whose @c value is -1.
 */
struct locale_item_match {
  /** @brief nl_langinfo() item whose string is compared with the input */
  nl_item key;

  /** @brief Result reported when that string matches */
  int value;
};
44 | ||
45 | static const struct locale_item_match days[] = { | |
46 | { DAY_1, 0 }, | |
47 | { DAY_2, 1 }, | |
48 | { DAY_3, 2 }, | |
49 | { DAY_4, 3 }, | |
50 | { DAY_5, 4 }, | |
51 | { DAY_6, 5 }, | |
52 | { DAY_7, 6 }, | |
53 | { ABDAY_1, 0 }, | |
54 | { ABDAY_2, 1 }, | |
55 | { ABDAY_3, 2 }, | |
56 | { ABDAY_4, 3 }, | |
57 | { ABDAY_5, 4 }, | |
58 | { ABDAY_6, 5 }, | |
59 | { ABDAY_7, 6 }, | |
60 | { -1, -1 } | |
61 | }; | |
62 | ||
63 | static const struct locale_item_match months[] = { | |
64 | { MON_1, 1 }, | |
65 | { MON_2, 2 }, | |
66 | { MON_3, 3 }, | |
67 | { MON_4, 4 }, | |
68 | { MON_5, 5 }, | |
69 | { MON_6, 6 }, | |
70 | { MON_7, 7 }, | |
71 | { MON_8, 8 }, | |
72 | { MON_9, 9 }, | |
73 | { MON_10, 10 }, | |
74 | { MON_11, 11 }, | |
75 | { MON_12, 12 }, | |
76 | { ABMON_1, 1 }, | |
77 | { ABMON_2, 2 }, | |
78 | { ABMON_3, 3 }, | |
79 | { ABMON_4, 4 }, | |
80 | { ABMON_5, 5 }, | |
81 | { ABMON_6, 6 }, | |
82 | { ABMON_7, 7 }, | |
83 | { ABMON_8, 8 }, | |
84 | { ABMON_9, 9 }, | |
85 | { ABMON_10, 10 }, | |
86 | { ABMON_11, 11 }, | |
87 | { ABMON_12, 12 }, | |
88 | { -1, -1 }, | |
89 | }; | |
90 | ||
/** @brief Compare a bounded subject with a string, ignoring case
 * @param buf Start of subject
 * @param limit End of subject (one past its last character)
 * @param match 0-terminated string to compare the subject with
 * @return 1 if [buf,limit) equals @p match in full, otherwise 0
 *
 * Characters are compared after folding with tolower(), so the comparison is
 * case-independent at least for ASCII.
 */
static int try_match(const char *buf,
                     const char *limit,
                     const char *match) {
  /* TODO byte-wise folding won't work well outside single-byte encodings.  A
   * good bet is probably to convert to Unicode and then use
   * utf32_casefold_compat() (or utf8_casefold_compat()); compatibility
   * matching would also make missing accents and so on a non-issue.
   *
   * en_GB and en_US will probably be fine in any reasonable encoding.
   */
  const char *subject = buf;
  const char *pattern = match;

  for(; subject < limit && *pattern; ++subject, ++pattern) {
    const int have = tolower((unsigned char)*subject);
    const int want = tolower((unsigned char)*pattern);

    if(have != want)
      return 0;
  }
  /* Both strings must have been used up completely */
  return subject == limit && *pattern == '\0';
}
119 | ||
120 | /** @brief Match from table of locale-specific strings | |
121 | * @param buf Start of subject | |
122 | * @param limit End of subject | |
123 | * @param lim Table of locale lookups | |
124 | * @return Looked up value or -1 | |
125 | * | |
126 | * The match is case-independent. | |
127 | */ | |
128 | static int try_locale_match(const char *buf, | |
129 | const char *limit, | |
130 | const struct locale_item_match *lim) { | |
131 | /* This is not very efficient! A (correct) built-in implementation will | |
132 | * presumably have more direct access to locale information. */ | |
133 | while(lim->value != -1) { | |
134 | if(try_match(buf, limit, nl_langinfo(lim->key))) | |
135 | return lim->value; | |
136 | ++lim; | |
137 | } | |
138 | return -1; | |
139 | } | |
140 | ||
/** @brief Match an unsigned decimal number within a range
 * @param buf Start of subject
 * @param limit End of subject
 * @param low Smallest acceptable value
 * @param high Largest acceptable value
 * @return The parsed value, or -1 if the subject is not acceptable
 *
 * The whole of [buf,limit) must consist of ASCII decimal digits and there
 * must be at least one of them.  Values greater than INT_MAX, or outside
 * [low,high], are rejected.
 */
static int try_numeric_match(const char *buf,
                             const char *limit,
                             unsigned low,
                             unsigned high) {
  const unsigned max = (unsigned)INT_MAX;
  unsigned n = 0;

  /* An empty subject is not a number; without this check it would be
   * reported as 0 whenever low is 0. */
  if(buf >= limit)
    return -1;
  while(buf < limit) {
    const int ch = (unsigned char)*buf++;
    unsigned digit;

    if(ch < '0' || ch > '9')
      return -1;
    digit = (unsigned)(ch - '0');
    /* 10 * n + digit must not exceed INT_MAX (INT_MAX itself is fine) */
    if(n > (max - digit) / 10)
      return -1;                        /* overflow */
    n = 10 * n + digit;
  }
  if(n < low || n > high)
    return -1;
  return (int)n;
}
161 | ||
162 | static const char *my_strptime_guts(const char *buf, | |
163 | const char *format, | |
164 | struct tm *tm) { | |
165 | int fc, mod, spec, next, value; | |
166 | const char *limit; | |
167 | /* nl_langinfo() is allowed to trash its last return value so we copy. | |
168 | * (We're relying on it being usable at all in multithreaded environments | |
169 | * though.) */ | |
170 | #define USE_SUBFORMAT(ITEM, EITEM, DEF) do { \ | |
171 | const char *s; \ | |
172 | char subformat[128]; \ | |
173 | \ | |
174 | if(mod == 'E') { \ | |
175 | s = nl_langinfo(EITEM); \ | |
176 | if(!s || !*s) \ | |
177 | s = nl_langinfo(ITEM); \ | |
178 | } else \ | |
179 | s = nl_langinfo(ITEM); \ | |
180 | if(!s || !*s) \ | |
181 | s = DEF; \ | |
182 | if(strlen(s) >= sizeof subformat) \ | |
183 | s = DEF; \ | |
184 | strcpy(subformat, s); \ | |
185 | if(!(buf = my_strptime_guts(buf, subformat, tm))) \ | |
186 | return NULL; \ | |
187 | } while(0) | |
188 | ||
189 | while(*format) { | |
190 | fc = (unsigned char)*format++; | |
191 | if(fc == '%') { | |
192 | /* Get the character defining the converstion specification */ | |
193 | spec = (unsigned char)*format++; | |
194 | if(spec == 'E' || spec == 'O') { | |
195 | /* Oops, there's a modifier first */ | |
196 | mod = spec; | |
197 | spec = (unsigned char)*format++; | |
198 | } else | |
199 | mod = 0; | |
200 | if(!spec) | |
201 | return NULL; /* format string broken! */ | |
202 | /* See what the next directive is. The specification is written in terms | |
203 | * of stopping the match at a character that matches the next directive. | |
204 | * This implementation mirrors this aspect of the specification | |
205 | * directly. */ | |
206 | next = (unsigned char)*format; | |
207 | if(next) { | |
208 | limit = buf; | |
209 | if(isspace(next)) { | |
210 | /* Next directive is whitespace, so bound the input string (at least) | |
211 | * by that */ | |
212 | while(*limit && !isspace((unsigned char)*limit)) | |
213 | ++limit; | |
214 | } else if(next == '%') { | |
215 | /* Prohibited: "The application shall ensure that there is | |
216 | * white-space or other non-alphanumeric characters between any two | |
217 | * conversion specifications". In fact we let alphanumerics | |
218 | * through. | |
219 | * | |
220 | * Forbidding even %% seems a bit harsh but is consistent with the | |
221 | * specification as written. | |
222 | */ | |
223 | return NULL; | |
224 | } else { | |
225 | /* Next directive is a specific character, so bound the input string | |
226 | * (at least) by that. This will work badly in the face of multibyte | |
227 | * characters, but then the spec is vague about what kind of string | |
228 | * we're dealing with anyway so you probably couldn't safely use them | |
229 | * in the format string at least in any case. */ | |
230 | while(*limit && *limit != next) | |
231 | ++limit; | |
232 | } | |
233 | } else | |
234 | limit = buf + strlen(buf); | |
235 | switch(spec) { | |
236 | case 'A': case 'a': /* day name (abbrev or full) */ | |
237 | if((value = try_locale_match(buf, limit, days)) == -1) | |
238 | return NULL; | |
239 | tm->tm_wday = value; | |
240 | break; | |
241 | case 'B': case 'b': case 'h': /* month name (abbrev or full) */ | |
242 | if((value = try_locale_match(buf, limit, months)) == -1) | |
243 | return NULL; | |
244 | tm->tm_mon = value - 1; | |
245 | break; | |
246 | case 'c': /* locale date+time */ | |
247 | USE_SUBFORMAT(D_T_FMT, ERA_D_T_FMT, "%a %b %e %H:%M:%S %Y"); | |
248 | break; | |
249 | case 'C': /* century number 0-99 */ | |
250 | /* TODO */ | |
251 | return NULL; | |
252 | case 'd': case 'e': /* day of month 1-31 */ | |
253 | if((value = try_numeric_match(buf, limit, 1, 31)) == -1) | |
254 | return NULL; | |
255 | tm->tm_mday = value; | |
256 | break; | |
257 | case 'D': /* == "%m / %d / %y" */ | |
258 | if(!(buf = my_strptime_guts(buf, "%m / %d / %y", tm))) | |
259 | return NULL; | |
260 | break; | |
261 | case 'H': /* hour 0-23 */ | |
262 | if((value = try_numeric_match(buf, limit, 0, 23)) == -1) | |
263 | return NULL; | |
264 | tm->tm_hour = value; | |
265 | break; | |
266 | case 'I': /* hour 1-12 */ | |
267 | /* TODO */ | |
268 | return NULL; | |
269 | case 'j': /* day 1-366 */ | |
270 | if((value = try_numeric_match(buf, limit, 1, 366)) == -1) | |
271 | return NULL; | |
272 | tm->tm_yday = value - 1; | |
273 | return NULL; | |
274 | case 'm': /* month 1-12 */ | |
275 | if((value = try_numeric_match(buf, limit, 1, 12)) == -1) | |
276 | return NULL; | |
277 | tm->tm_mon = value - 1; | |
278 | break; | |
279 | case 'M': /* minute 0-59 */ | |
280 | if((value = try_numeric_match(buf, limit, 0, 59)) == -1) | |
281 | return NULL; | |
282 | tm->tm_min = value; | |
283 | break; | |
284 | case 'n': case 't': /* any whitespace */ | |
285 | goto matchwhitespace; | |
286 | case 'p': /* locale am/pm */ | |
287 | /* TODO */ | |
288 | return NULL; | |
289 | case 'r': /* == "%I : %M : %S %p" */ | |
290 | /* TODO actually this is locale-dependent; and we don't implement %I | |
291 | * anyway, so it's not going to work even as it stands. */ | |
292 | if(!(buf = my_strptime_guts(buf, "%I : %M : %S %p", tm))) | |
293 | return NULL; | |
294 | break; | |
295 | case 'R': /* == "%H : %M" */ | |
296 | if(!(buf = my_strptime_guts(buf, "%H : %M", tm))) | |
297 | return NULL; | |
298 | break; | |
299 | case 'S': /* seconds 0-60 */ | |
300 | if((value = try_numeric_match(buf, limit, 0, 60)) == -1) | |
301 | return NULL; | |
302 | tm->tm_sec = value; | |
303 | break; | |
304 | case 'U': /* week number from Sunday 0-53 */ | |
305 | /* TODO */ | |
306 | return NULL; | |
307 | case 'w': /* day number 0-6 from Sunday */ | |
308 | if((value = try_numeric_match(buf, limit, 0, 6)) == -1) | |
309 | return NULL; | |
310 | tm->tm_wday = value; | |
311 | break; | |
312 | case 'W': /* week number from Monday 0-53 */ | |
313 | /* TODO */ | |
314 | return NULL; | |
315 | case 'x': /* locale date format */ | |
316 | USE_SUBFORMAT(D_FMT, ERA_D_FMT, "%m/%d/%y"); | |
317 | break; | |
318 | case 'X': /* locale time format */ | |
319 | USE_SUBFORMAT(T_FMT, ERA_T_FMT, "%H:%M:%S"); | |
320 | break; | |
321 | case 'y': /* year mod 100 */ | |
322 | if((value = try_numeric_match(buf, limit, 0, INT_MAX)) == -1) | |
323 | return NULL; | |
324 | if(value >= 0 && value <= 68) | |
325 | value = 2000 + value; | |
326 | else if(value >= 69 && value <= 99) | |
327 | value = 1900 + value; | |
328 | tm->tm_year = value - 1900; | |
329 | break; | |
330 | case 'Y': /* year */ | |
331 | if((value = try_numeric_match(buf, limit, 1, INT_MAX)) == -1) | |
332 | return NULL; | |
333 | tm->tm_year = value - 1900; | |
334 | break; | |
335 | case '%': | |
336 | goto matchself; | |
337 | default: | |
338 | /* The spec is a bit vague about what to do with invalid format | |
339 | * strings. We return NULL immediately and hope someone will | |
340 | * notice. */ | |
341 | return NULL; | |
342 | } | |
343 | buf = limit; | |
344 | } else if(isspace(fc)) { | |
345 | matchwhitespace: | |
346 | /* Any format whitespace matches any number of input whitespace | |
347 | * characters. The directive can formally contain more than one | |
348 | * whitespace character; for the second and subsequent ones we'll match 0 | |
349 | * characters from the input. */ | |
350 | while(isspace((unsigned char)*buf)) | |
351 | ++buf; | |
352 | } else { | |
353 | matchself: | |
354 | /* Non-% non-whitespace characters must match themselves exactly */ | |
355 | if(fc != (unsigned char)*buf++) | |
356 | return NULL; | |
357 | } | |
358 | } | |
359 | /* When we run out of format string we return a pointer to the rest of the | |
360 | * input. */ | |
361 | return buf; | |
362 | } | |
363 | ||
/** @brief Reimplementation of strptime()
 * @param buf Input buffer
 * @param format Format string
 * @param tm Where to put result
 * @return Pointer to first unparsed input character, or NULL on error
 *
 * Based on <a
 * href="http://www.opengroup.org/onlinepubs/009695399/functions/strptime.html">http://www.opengroup.org/onlinepubs/009695399/functions/strptime.html</a>.
 */
char *my_strptime(const char *buf,
                  const char *format,
                  struct tm *tm) {
  /* Whether fields not mentioned by the format are overwritten or merely
   * left alone is unspecified (rather bizarrely).  This implementation does
   * not overwrite them, as xgetdate() depends on this behavior. */
  const char *rest = my_strptime_guts(buf, format, tm);

  /* TODO various things we could/should do on success:
   * - infer day/month from %j+year
   * - infer day/month from %U/%W+%w/%a+year
   * - infer hour from %p+%I
   * - fill wday/yday from other fields
   */
  return rest ? (char *)rest : NULL;
}
390 | ||
391 | /* | |
392 | Local Variables: | |
393 | c-basic-offset:2 | |
394 | comment-column:40 | |
395 | fill-column:79 | |
396 | indent-tabs-mode:nil | |
397 | End: | |
398 | */ |