chiark - git - mdw - disorder/blame

Commit	Line	Data
477f956c RK	1	/* strptime.c - partial strptime() reimplementation
477f956c RK	2	*
cca89d7c	3	* Copyright (c) 2008, 2011, 2013 Richard Kettlewell.
477f956c RK	4	* All rights reserved.
	5	*
	6	* Redistribution and use in source and binary forms, with or without
	7	* modification, are permitted provided that the following conditions
	8	* are met:
	9	* 1. Redistributions of source code must retain the above copyright
	10	* notice, this list of conditions and the following disclaimer.
	11	* 2. Redistributions in binary form must reproduce the above copyright
	12	* notice, this list of conditions and the following disclaimer in the
	13	* documentation and/or other materials provided with the distribution.
	14	* 3. The name of the author may not be used to endorse or promote products
	15	* derived from this software without specific prior written permission.
	16	*
	17	* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
	18	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	19	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
	20	* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
	21	* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
	22	* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
	23	* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
	24	* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
	25	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
	26	* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
	27	* SUCH DAMAGE.
	28	*/
1a164e63 RK	29	/** @file lib/strptime.c
	30	* @brief strptime() reimplementation
	31	*
	32	* strptime() is here reimplemented because the FreeBSD (and older MacOS) one
477f956c	33	* is broken and does not report errors properly. See TODO remarks below for
1a164e63 RK	34	* some missing bits.
1a164e63 RK	35	*/
477f956c	36
cca89d7c RK	37	#if HAVE_CONFIG_H
	38	# include <config.h>
	39	#endif
	40
477f956c RK	41	#include <ctype.h>
	42	#include <limits.h>
	43	#include <string.h>
cca89d7c RK	44	#if HAVE_LANGINFO_H
	45	# include <langinfo.h>
	46	#endif
477f956c RK	47	#include "strptime.h"
477f956c RK	48
cca89d7c RK	49	#if !HAVE_LANGINFO_H
	50	/* Fake plastic langinfo. Primarily for Windows.
	51	* TODO WIN32 can we get these values out of the win32 api instead? */
	52	typedef enum {
	53	DAY_1,
	54	DAY_2,
	55	DAY_3,
	56	DAY_4,
	57	DAY_5,
	58	DAY_6,
	59	DAY_7,
	60	ABDAY_1,
	61	ABDAY_2,
	62	ABDAY_3,
	63	ABDAY_4,
	64	ABDAY_5,
	65	ABDAY_6,
	66	ABDAY_7,
	67	MON_1,
	68	MON_2,
	69	MON_3,
	70	MON_4,
	71	MON_5,
	72	MON_6,
	73	MON_7,
	74	MON_8,
	75	MON_9,
	76	MON_10,
	77	MON_11,
	78	MON_12,
	79	ABMON_1,
	80	ABMON_2,
	81	ABMON_3,
	82	ABMON_4,
	83	ABMON_5,
	84	ABMON_6,
	85	ABMON_7,
	86	ABMON_8,
	87	ABMON_9,
	88	ABMON_10,
	89	ABMON_11,
	90	ABMON_12,
	91	D_FMT,
	92	T_FMT,
	93	D_T_FMT,
	94	ERA_D_FMT,
	95	ERA_T_FMT,
	96	ERA_D_T_FMT,
	97	} nl_item;
	98
	99	const char *nl_langinfo(nl_item item) {
	100	switch(item) {
	101	case DAY_1: return "Sunday";
	102	case DAY_2: return "Monday";
	103	case DAY_3: return "Tuesday";
	104	case DAY_4: return "Wednesday";
	105	case DAY_5: return "Thursday";
	106	case DAY_6: return "Friday";
	107	case DAY_7: return "Saturday";
	108	case ABDAY_1: return "Sun";
	109	case ABDAY_2: return "Mon";
	110	case ABDAY_3: return "Tue";
	111	case ABDAY_4: return "Wed";
	112	case ABDAY_5: return "Thu";
113	case ABDAY_6: return "Fri";
114	case ABDAY_7: return "Sat";
115	case MON_1: return "January";
116	case MON_2: return "February";
117	case MON_3: return "March";
118	case MON_4: return "April";
119	case MON_5: return "May";
120	case MON_6: return "June";
121	case MON_7: return "July";
122	case MON_8: return "August";
123	case MON_9: return "September";
124	case MON_10: return "October";
125	case MON_11: return "November";
126	case MON_12: return "December";
127	case ABMON_1: return "Jan";
128	case ABMON_2: return "Feb";
129	case ABMON_3: return "Mar";
130	case ABMON_4: return "Apr";
131	case ABMON_5: return "May";
132	case ABMON_6: return "Jun";
133	case ABMON_7: return "Jul";
134	case ABMON_8: return "Aug";
135	case ABMON_9: return "Sep";
136	case ABMON_10: return "Oct";
137	case ABMON_11: return "Nov";
138	case ABMON_12: return "Dec";
139	case D_FMT: return "%d/%m/%y";
140	case T_FMT: return "%H:%M:%S";
141	case D_T_FMT: return "%a %d %b %Y %H:%M:%S %Z";
142	case ERA_D_FMT: return "";
143	case ERA_T_FMT: return "";
144	case ERA_D_T_FMT: return "";
145	default: return 0;
146	}
147	}
148	#endif
149
/** @brief One row of a locale string lookup table
 *
 * Tables of these are scanned by try_locale_match(); a row whose @ref value
 * is -1 ends the table.
 */
struct locale_item_match {
  /** @brief nl_langinfo() item whose string is compared with the subject */
  nl_item key;

  /** @brief Result reported when the string for @ref key equals the subject */
  int value;
};
	158
	159	static const struct locale_item_match days[] = {
	160	{ DAY_1, 0 },
	161	{ DAY_2, 1 },
	162	{ DAY_3, 2 },
	163	{ DAY_4, 3 },
	164	{ DAY_5, 4 },
	165	{ DAY_6, 5 },
	166	{ DAY_7, 6 },
	167	{ ABDAY_1, 0 },
	168	{ ABDAY_2, 1 },
	169	{ ABDAY_3, 2 },
	170	{ ABDAY_4, 3 },
	171	{ ABDAY_5, 4 },
	172	{ ABDAY_6, 5 },
	173	{ ABDAY_7, 6 },
	174	{ -1, -1 }
	175	};
	176
	177	static const struct locale_item_match months[] = {
	178	{ MON_1, 1 },
	179	{ MON_2, 2 },
	180	{ MON_3, 3 },
	181	{ MON_4, 4 },
	182	{ MON_5, 5 },
	183	{ MON_6, 6 },
	184	{ MON_7, 7 },
	185	{ MON_8, 8 },
	186	{ MON_9, 9 },
	187	{ MON_10, 10 },
	188	{ MON_11, 11 },
	189	{ MON_12, 12 },
	190	{ ABMON_1, 1 },
	191	{ ABMON_2, 2 },
	192	{ ABMON_3, 3 },
	193	{ ABMON_4, 4 },
	194	{ ABMON_5, 5 },
	195	{ ABMON_6, 6 },
	196	{ ABMON_7, 7 },
	197	{ ABMON_8, 8 },
	198	{ ABMON_9, 9 },
	199	{ ABMON_10, 10 },
	200	{ ABMON_11, 11 },
	201	{ ABMON_12, 12 },
	202	{ -1, -1 },
	203	};
	204
	205	/** @brief Match a string
	206	* @param buf Start of subject
	207	* @param limit End of subject
	208	* @param match String to match subject against
	209	* @return True if match == [buf,limit) otherwise false
	210	*
	211	* The match is case-independent at least in ASCII.
	212	*/
	213	static int try_match(const char *buf,
	214	const char *limit,
	215	const char *match) {
	216	/* TODO this won't work well outside single-byte encodings. A good bet is
	217	* probably to convert to Unicode and then use utf32_casefold_compat() (or
	218	* utf8_casefold_compat(); using compatibility matching will ensure missing
	219	* accents and so on aren't a problem.
220	*
221	* en_GB and en_US will probably be in any reasonable encoding for them.
222	*/
223	while(buf < limit && *match) {
224	if(tolower((unsigned char)buf) != tolower((unsigned char)match))
225	return 0;
226	++buf;
227	++match;
228	}
229	if(buf != limit \|\| *match)
230	return 0;
231	return 1;
232	}
233
234	/** @brief Match from table of locale-specific strings
235	* @param buf Start of subject
236	* @param limit End of subject
237	* @param lim Table of locale lookups
238	* @return Looked up value or -1
239	*
240	* The match is case-independent.
241	*/
242	static int try_locale_match(const char *buf,
243	const char *limit,
244	const struct locale_item_match *lim) {
245	/* This is not very efficient! A (correct) built-in implementation will
246	* presumably have more direct access to locale information. */
247	while(lim->value != -1) {
248	if(try_match(buf, limit, nl_langinfo(lim->key)))
249	return lim->value;
250	++lim;
251	}
252	return -1;
253	}
254
/** @brief Parse an unsigned decimal field
 * @param buf Start of subject
 * @param limit End of subject
 * @param low Smallest acceptable value
 * @param high Largest acceptable value
 * @return Parsed value, or -1 on error
 *
 * The whole of [buf,limit) must consist of ASCII digits, there must be at
 * least one of them, and the value must lie in [low,high] and fit in an int.
 * Leading zeroes are accepted.
 */
static int try_numeric_match(const char *buf,
                             const char *limit,
                             unsigned low,
                             unsigned high) {
  const unsigned max = INT_MAX;         /* largest value we can return */
  unsigned n = 0;

  /* An empty field is not a number (and must not be read as 0) */
  if(buf >= limit)
    return -1;
  while(buf < limit) {
    int ch = (unsigned char)*buf++;
    if(ch < '0' || ch > '9')
      return -1;
    /* Refuse the digit if appending it would take us past INT_MAX */
    if(n > max / 10
       || (n == max / 10 && (unsigned)(ch - '0') > max % 10))
      return -1;                        /* overflow */
    n = 10 * n + (unsigned)(ch - '0');
  }
  if(n < low || n > high)
    return -1;
  return (int)n;
}
275
276	static const char my_strptime_guts(const char buf,
277	const char *format,
278	struct tm *tm) {
279	int fc, mod, spec, next, value;
280	const char *limit;
281	/* nl_langinfo() is allowed to trash its last return value so we copy.
282	* (We're relying on it being usable at all in multithreaded environments
283	* though.) */
284	#define USE_SUBFORMAT(ITEM, EITEM, DEF) do { \
285	const char *s; \
286	char subformat[128]; \
287	\
288	if(mod == 'E') { \
289	s = nl_langinfo(EITEM); \
290	if(!s \|\| !*s) \
291	s = nl_langinfo(ITEM); \
292	} else \
293	s = nl_langinfo(ITEM); \
294	if(!s \|\| !*s) \
295	s = DEF; \
296	if(strlen(s) >= sizeof subformat) \
297	s = DEF; \
298	strcpy(subformat, s); \
299	if(!(buf = my_strptime_guts(buf, subformat, tm))) \
300	return NULL; \
301	} while(0)
302
303	while(*format) {
304	fc = (unsigned char)*format++;
305	if(fc == '%') {
306	/* Get the character defining the converstion specification */
307	spec = (unsigned char)*format++;
308	if(spec == 'E' \|\| spec == 'O') {
309	/* Oops, there's a modifier first */
310	mod = spec;
311	spec = (unsigned char)*format++;
312	} else
313	mod = 0;
314	if(!spec)
315	return NULL; /* format string broken! */
316	/* See what the next directive is. The specification is written in terms
317	* of stopping the match at a character that matches the next directive.
318	* This implementation mirrors this aspect of the specification
319	* directly. */
320	next = (unsigned char)*format;
321	if(next) {
322	limit = buf;
323	if(isspace(next)) {
324	/* Next directive is whitespace, so bound the input string (at least)
325	* by that */
326	while(limit && !isspace((unsigned char)limit))
327	++limit;
328	} else if(next == '%') {
329	/* Prohibited: "The application shall ensure that there is
330	* white-space or other non-alphanumeric characters between any two
331	* conversion specifications". In fact we let alphanumerics
332	* through.
333	*
334	* Forbidding even %% seems a bit harsh but is consistent with the
335	* specification as written.
336	*/
337	return NULL;
338	} else {
339	/* Next directive is a specific character, so bound the input string
340	* (at least) by that. This will work badly in the face of multibyte
341	* characters, but then the spec is vague about what kind of string
342	* we're dealing with anyway so you probably couldn't safely use them
343	* in the format string at least in any case. */
344	while(limit && limit != next)
345	++limit;
346	}
347	} else
348	limit = buf + strlen(buf);
349	switch(spec) {
350	case 'A': case 'a': /* day name (abbrev or full) */
351	if((value = try_locale_match(buf, limit, days)) == -1)
352	return NULL;
353	tm->tm_wday = value;
354	break;
355	case 'B': case 'b': case 'h': /* month name (abbrev or full) */
356	if((value = try_locale_match(buf, limit, months)) == -1)
357	return NULL;
358	tm->tm_mon = value - 1;
359	break;
360	case 'c': /* locale date+time */
361	USE_SUBFORMAT(D_T_FMT, ERA_D_T_FMT, "%a %b %e %H:%M:%S %Y");
362	break;
363	case 'C': /* century number 0-99 */
364	/* TODO */
365	return NULL;
366	case 'd': case 'e': /* day of month 1-31 */
367	if((value = try_numeric_match(buf, limit, 1, 31)) == -1)
368	return NULL;
369	tm->tm_mday = value;
370	break;
371	case 'D': /* == "%m / %d / %y" */
372	if(!(buf = my_strptime_guts(buf, "%m / %d / %y", tm)))
373	return NULL;
374	break;
375	case 'H': /* hour 0-23 */
376	if((value = try_numeric_match(buf, limit, 0, 23)) == -1)
377	return NULL;
378	tm->tm_hour = value;
379	break;
380	case 'I': /* hour 1-12 */
381	/* TODO */
382	return NULL;
383	case 'j': /* day 1-366 */
384	if((value = try_numeric_match(buf, limit, 1, 366)) == -1)
385	return NULL;
386	tm->tm_yday = value - 1;
387	return NULL;
388	case 'm': /* month 1-12 */
389	if((value = try_numeric_match(buf, limit, 1, 12)) == -1)
390	return NULL;
391	tm->tm_mon = value - 1;
392	break;
393	case 'M': /* minute 0-59 */
394	if((value = try_numeric_match(buf, limit, 0, 59)) == -1)
395	return NULL;
396	tm->tm_min = value;
397	break;
398	case 'n': case 't': /* any whitespace */
399	goto matchwhitespace;
400	case 'p': /* locale am/pm */
401	/* TODO */
402	return NULL;
403	case 'r': /* == "%I : %M : %S %p" */
404	/* TODO actually this is locale-dependent; and we don't implement %I
405	* anyway, so it's not going to work even as it stands. */
406	if(!(buf = my_strptime_guts(buf, "%I : %M : %S %p", tm)))
407	return NULL;
408	break;
409	case 'R': /* == "%H : %M" */
410	if(!(buf = my_strptime_guts(buf, "%H : %M", tm)))
411	return NULL;
412	break;
413	case 'S': /* seconds 0-60 */
414	if((value = try_numeric_match(buf, limit, 0, 60)) == -1)
415	return NULL;
416	tm->tm_sec = value;
417	break;
418	case 'U': /* week number from Sunday 0-53 */
419	/* TODO */
420	return NULL;
421	case 'w': /* day number 0-6 from Sunday */
422	if((value = try_numeric_match(buf, limit, 0, 6)) == -1)
423	return NULL;
424	tm->tm_wday = value;
425	break;
426	case 'W': /* week number from Monday 0-53 */
427	/* TODO */
428	return NULL;
429	case 'x': /* locale date format */
430	USE_SUBFORMAT(D_FMT, ERA_D_FMT, "%m/%d/%y");
431	break;
432	case 'X': /* locale time format */
433	USE_SUBFORMAT(T_FMT, ERA_T_FMT, "%H:%M:%S");
434	break;
435	case 'y': /* year mod 100 */
436	if((value = try_numeric_match(buf, limit, 0, INT_MAX)) == -1)
437	return NULL;
438	if(value >= 0 && value <= 68)
439	value = 2000 + value;
440	else if(value >= 69 && value <= 99)
441	value = 1900 + value;
442	tm->tm_year = value - 1900;
443	break;
444	case 'Y': /* year */
445	if((value = try_numeric_match(buf, limit, 1, INT_MAX)) == -1)
446	return NULL;
447	tm->tm_year = value - 1900;
448	break;
449	case '%':
450	goto matchself;
451	default:
452	/* The spec is a bit vague about what to do with invalid format
453	* strings. We return NULL immediately and hope someone will
454	* notice. */
455	return NULL;
456	}
457	buf = limit;
458	} else if(isspace(fc)) {
459	matchwhitespace:
460	/* Any format whitespace matches any number of input whitespace
461	* characters. The directive can formally contain more than one
462	* whitespace character; for the second and subsequent ones we'll match 0
463	* characters from the input. */
464	while(isspace((unsigned char)*buf))
465	++buf;
466	} else {
467	matchself:
468	/* Non-% non-whitespace characters must match themselves exactly */
469	if(fc != (unsigned char)*buf++)
470	return NULL;
471	}
472	}
473	/* When we run out of format string we return a pointer to the rest of the
474	* input. */
475	return buf;
476	}
477
/** @brief Reimplementation of strptime()
 * @param buf Input buffer
 * @param format Format string
 * @param tm Where to put result
 * @return Pointer to first unparsed input character, or NULL on error
 *
 * Based on <a
 * href="http://www.opengroup.org/onlinepubs/009695399/functions/strptime.html">http://www.opengroup.org/onlinepubs/009695399/functions/strptime.html</a>.
 */
char *my_strptime(const char *buf,
                  const char *format,
                  struct tm *tm) {
  /* Whether to overwrite or update is unspecified (rather bizarrely).  This
   * implementation does not overwrite: @p tm is not cleared first, and
   * xgetdate() depends on this behavior. */

  if(!(buf = my_strptime_guts(buf, format, tm)))
    return NULL;
  /* TODO various things we could/should do:
   * - infer day/month from %j+year
   * - infer day/month from %U/%W+%w/%a+year
   * - infer hour from %p+%I
   * - fill wday/yday from other fields
   */
  /* The const is cast away only to match the standard strptime() return
   * type */
  return (char *)buf;
}
504
505	/*
506	Local Variables:
507	c-basic-offset:2
508	comment-column:40
509	fill-column:79
510	indent-tabs-mode:nil
511	End:
512	*/