chiark - git - mdw - disorder/blob - lib/strptime.c

   1 /* strptime.c - partial strptime() reimplementation
   2  *
   3  * (c) 2008 Richard Kettlewell.
   4  * All rights reserved.
   5  *
   6  * Redistribution and use in source and binary forms, with or without
   7  * modification, are permitted provided that the following conditions
   8  * are met:
   9  * 1. Redistributions of source code must retain the above copyright
  10  *    notice, this list of conditions and the following disclaimer.
  11  * 2. Redistributions in binary form must reproduce the above copyright
  12  *    notice, this list of conditions and the following disclaimer in the
  13  *    documentation and/or other materials provided with the distribution.
  14  * 3. The name of the author may not be used to endorse or promote products
  15  *    derived from this software without specific prior written permission.
  16  *
  17  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  27  * SUCH DAMAGE.
  28  */
  29
  30 /* strptime() is here reimplemented because the FreeBSD (and older MacOS) one
  31  * is broken and does not report errors properly.  See TODO remarks below for
  32  * some missing bits. */
  33
  34 #include <ctype.h>
  35 #include <limits.h>
  36 #include <string.h>
  37 #include <langinfo.h>
  38 #include "strptime.h"
  39
  40 struct locale_item_match {
  41   nl_item key;
  42   int value;
  43 };
  44
  45 static const struct locale_item_match days[] = {
  46   { DAY_1, 0 },
  47   { DAY_2, 1 },
  48   { DAY_3, 2 },
  49   { DAY_4, 3 },
  50   { DAY_5, 4 },
  51   { DAY_6, 5 },
  52   { DAY_7, 6 },
  53   { ABDAY_1, 0 },
  54   { ABDAY_2, 1 },
  55   { ABDAY_3, 2 },
  56   { ABDAY_4, 3 },
  57   { ABDAY_5, 4 },
  58   { ABDAY_6, 5 },
  59   { ABDAY_7, 6 },
  60   { -1, -1 }
  61 };
  62
  63 static const struct locale_item_match months[] = {
  64   { MON_1, 1 },
  65   { MON_2, 2 },
  66   { MON_3, 3 },
  67   { MON_4, 4 },
  68   { MON_5, 5 },
  69   { MON_6, 6 },
  70   { MON_7, 7 },
  71   { MON_8, 8 },
  72   { MON_9, 9 },
  73   { MON_10, 10 },
  74   { MON_11, 11 },
  75   { MON_12, 12 },
  76   { ABMON_1, 1 },
  77   { ABMON_2, 2 },
  78   { ABMON_3, 3 },
  79   { ABMON_4, 4 },
  80   { ABMON_5, 5 },
  81   { ABMON_6, 6 },
  82   { ABMON_7, 7 },
  83   { ABMON_8, 8 },
  84   { ABMON_9, 9 },
  85   { ABMON_10, 10 },
  86   { ABMON_11, 11 },
  87   { ABMON_12, 12 },
  88   { -1, -1 },
  89 };
  90
  91 /** @brief Match a string
  92  * @param buf Start of subject
  93  * @param limit End of subject
  94  * @param match String to match subject against
  95  * @return True if match == [buf,limit) otherwise false
  96  *
  97  * The match is case-independent at least in ASCII.
  98  */
  99 static int try_match(const char *buf,
 100                      const char *limit,
 101                      const char *match) {
 102   /* TODO this won't work well outside single-byte encodings.  A good bet is
 103    * probably to convert to Unicode and then use utf32_casefold_compat() (or
 104    * utf8_casefold_compat(); using compatibility matching will ensure missing
 105    * accents and so on aren't a problem.
 106    *
 107    * en_GB and en_US will probably be in any reasonable encoding for them.
 108    */
 109   while(buf < limit && *match) {
 110     if(tolower((unsigned char)*buf) != tolower((unsigned char)*match))
 111       return 0;
 112     ++buf;
 113     ++match;
 114   }
 115   if(buf != limit || *match)
 116     return 0;
 117   return 1;
 118 }
 119
 120 /** @brief Match from table of locale-specific strings
 121  * @param buf Start of subject
 122  * @param limit End of subject
 123  * @param lim Table of locale lookups
 124  * @return Looked up value or -1
 125  *
 126  * The match is case-independent.
 127  */
 128 static int try_locale_match(const char *buf,
 129                             const char *limit,
 130                             const struct locale_item_match *lim) {
 131   /* This is not very efficient!  A (correct) built-in implementation will
 132    * presumably have more direct access to locale information. */
 133   while(lim->value != -1) {
 134     if(try_match(buf, limit, nl_langinfo(lim->key)))
 135       return lim->value;
 136     ++lim;
 137   }
 138   return -1;
 139 }
 140
 141 static int try_numeric_match(const char *buf,
 142                              const char *limit,
 143                              unsigned low,
 144                              unsigned high) {
 145   unsigned n = 0;
 146
 147   while(buf < limit) {
 148     int ch = (unsigned char)*buf++;
 149     if(ch >= '0' && ch <= '9') {
 150       if(n > INT_MAX / 10
 151          || (n == INT_MAX / 10 && ch >= INT_MAX % 10 + '0'))
 152         return -1;                      /* overflow */
 153       n = 10 * n + ch - '0';
 154     } else
 155       return -1;
 156   }
 157   if(n < low || n > high)
 158     return -1;
 159   return (int)n;
 160 }
 161
 162 static const char *my_strptime_guts(const char *buf,
 163                                     const char *format,
 164                                     struct tm *tm) {
 165   int fc, mod, spec, next, value;
 166   const char *limit;
 167   /* nl_langinfo() is allowed to trash its last return value so we copy.
 168    * (We're relying on it being usable at all in multithreaded environments
 169    * though.) */
 170 #define USE_SUBFORMAT(ITEM, EITEM, DEF) do {            \
 171   const char *s;                                        \
 172   char subformat[128];                                  \
 173                                                         \
 174   if(mod == 'E') {                                      \
 175     s = nl_langinfo(EITEM);                             \
 176     if(!s || !*s)                                       \
 177       s = nl_langinfo(ITEM);                            \
 178   } else                                                \
 179     s = nl_langinfo(ITEM);                              \
 180   if(!s || !*s)                                         \
 181     s = DEF;                                            \
 182   if(strlen(s) >= sizeof subformat)                     \
 183     s = DEF;                                            \
 184   strcpy(subformat, s);                                 \
 185   if(!(buf = my_strptime_guts(buf, subformat, tm)))     \
 186     return NULL;                                        \
 187 } while(0)
 188
 189   while(*format) {
 190     fc = (unsigned char)*format++;
 191     if(fc == '%') {
 192       /* Get the character defining the converstion specification */
 193       spec = (unsigned char)*format++;
 194       if(spec == 'E' || spec == 'O') {
 195         /* Oops, there's a modifier first */
 196         mod = spec;
 197         spec = (unsigned char)*format++;
 198       } else
 199         mod = 0;
 200       if(!spec)
 201         return NULL;                    /* format string broken! */
 202       /* See what the next directive is.  The specification is written in terms
 203        * of stopping the match at a character that matches the next directive.
 204        * This implementation mirrors this aspect of the specification
 205        * directly. */
 206       next = (unsigned char)*format;
 207       if(next) {
 208         limit = buf;
 209         if(isspace(next)) {
 210           /* Next directive is whitespace, so bound the input string (at least)
 211            * by that */
 212           while(*limit && !isspace((unsigned char)*limit))
 213             ++limit;
 214         } else if(next == '%') {
 215           /* Prohibited: "The application shall ensure that there is
 216            * white-space or other non-alphanumeric characters between any two
 217            * conversion specifications".  In fact we let alphanumerics
 218            * through.
 219            *
 220            * Forbidding even %% seems a bit harsh but is consistent with the
 221            * specification as written.
 222            */
 223           return NULL;
 224         } else {
 225           /* Next directive is a specific character, so bound the input string
 226            * (at least) by that.  This will work badly in the face of multibyte
 227            * characters, but then the spec is vague about what kind of string
 228            * we're dealing with anyway so you probably couldn't safely use them
 229            * in the format string at least in any case. */
 230           while(*limit && *limit != next)
 231             ++limit;
 232         }
 233       } else
 234         limit = buf + strlen(buf);
 235       switch(spec) {
 236       case 'A': case 'a':               /* day name (abbrev or full) */
 237         if((value = try_locale_match(buf, limit, days)) == -1)
 238           return NULL;
 239         tm->tm_wday = value;
 240         break;
 241       case 'B': case 'b': case 'h':     /* month name (abbrev or full) */
 242         if((value = try_locale_match(buf, limit, months)) == -1)
 243           return NULL;
 244         tm->tm_mon = value - 1;
 245         break;
 246       case 'c':                         /* locale date+time */
 247         USE_SUBFORMAT(D_T_FMT, ERA_D_T_FMT, "%a %b %e %H:%M:%S %Y");
 248         break;
 249       case 'C':                         /* century number 0-99 */
 250         /* TODO  */
 251         return NULL;
 252       case 'd': case 'e':               /* day of month 1-31 */
 253         if((value = try_numeric_match(buf, limit, 1, 31)) == -1)
 254           return NULL;
 255         tm->tm_mday = value;
 256         break;
 257       case 'D':                         /* == "%m / %d / %y" */
 258         if(!(buf = my_strptime_guts(buf, "%m / %d / %y", tm)))
 259           return NULL;
 260         break;
 261       case 'H':                         /* hour 0-23 */
 262         if((value = try_numeric_match(buf, limit, 0, 23)) == -1)
 263           return NULL;
 264         tm->tm_hour = value;
 265         break;
 266       case 'I':                         /* hour 1-12 */
 267         /* TODO */
 268         return NULL;
 269       case 'j':                         /* day 1-366 */
 270         if((value = try_numeric_match(buf, limit, 1, 366)) == -1)
 271           return NULL;
 272         tm->tm_yday = value - 1;
 273         return NULL;
 274       case 'm':                         /* month 1-12 */
 275         if((value = try_numeric_match(buf, limit, 1, 12)) == -1)
 276           return NULL;
 277         tm->tm_mon = value - 1;
 278         break;
 279       case 'M':                         /* minute 0-59 */
 280         if((value = try_numeric_match(buf, limit, 0, 59)) == -1)
 281           return NULL;
 282         tm->tm_min = value;
 283         break;
 284       case 'n': case 't':               /* any whitespace */
 285         goto matchwhitespace;
 286       case 'p':                         /* locale am/pm */
 287         /* TODO */
 288         return NULL;
 289       case 'r':                         /* == "%I : %M : %S %p" */
 290         /* TODO actually this is locale-dependent; and we don't implement %I
 291          * anyway, so it's not going to work even as it stands. */
 292         if(!(buf = my_strptime_guts(buf, "%I : %M : %S %p", tm)))
 293           return NULL;
 294         break;
 295       case 'R':                         /* == "%H : %M" */
 296         if(!(buf = my_strptime_guts(buf, "%H : %M", tm)))
 297           return NULL;
 298         break;
 299       case 'S':                         /* seconds 0-60 */
 300         if((value = try_numeric_match(buf, limit, 0, 60)) == -1)
 301           return NULL;
 302         tm->tm_sec = value;
 303         break;
 304       case 'U':                         /* week number from Sunday 0-53 */
 305         /* TODO */
 306         return NULL;
 307       case 'w':                         /* day number 0-6 from Sunday */
 308         if((value = try_numeric_match(buf, limit, 0, 6)) == -1)
 309           return NULL;
 310         tm->tm_wday = value;
 311         break;
 312       case 'W':                         /* week number from Monday 0-53 */
 313         /* TODO */
 314         return NULL;
 315       case 'x':                         /* locale date format */
 316         USE_SUBFORMAT(D_FMT, ERA_D_FMT, "%m/%d/%y");
 317         break;
 318       case 'X':                         /* locale time format */
 319         USE_SUBFORMAT(T_FMT, ERA_T_FMT, "%H:%M:%S");
 320         break;
 321       case 'y':                         /* year mod 100 */
 322         if((value = try_numeric_match(buf, limit, 0, INT_MAX)) == -1)
 323           return NULL;
 324         if(value >= 0 && value <= 68)
 325           value = 2000 + value;
 326         else if(value >= 69 && value <= 99)
 327           value = 1900 + value;
 328         tm->tm_year = value - 1900;
 329         break;
 330       case 'Y':                         /* year */
 331         if((value = try_numeric_match(buf, limit, 1, INT_MAX)) == -1)
 332           return NULL;
 333         tm->tm_year = value - 1900;
 334         break;
 335       case '%':
 336         goto matchself;
 337       default:
 338         /* The spec is a bit vague about what to do with invalid format
 339          * strings.  We return NULL immediately and hope someone will
 340          * notice. */
 341         return NULL;
 342       }
 343       buf = limit;
 344     } else if(isspace(fc)) {
 345     matchwhitespace:
 346       /* Any format whitespace matches any number of input whitespace
 347        * characters.  The directive can formally contain more than one
 348        * whitespace character; for the second and subsequent ones we'll match 0
 349        * characters from the input. */
 350       while(isspace((unsigned char)*buf))
 351         ++buf;
 352     } else {
 353     matchself:
 354       /* Non-% non-whitespace characters must match themselves exactly */
 355       if(fc != (unsigned char)*buf++)
 356         return NULL;
 357     }
 358   }
 359   /* When we run out of format string we return a pointer to the rest of the
 360    * input. */
 361   return buf;
 362 }
 363
 364 /** @brief Reimplementation of strptime()
 365  * @param buf Input buffer
 366  * @param format Format string
 367  * @param tm Where to put result
 368  * @return Pointer to first unparsed input character, or NULL on error
 369  *
 370  * Based on <a
 371  * href="http://www.opengroup.org/onlinepubs/009695399/functions/strptime.html">http://www.opengroup.org/onlinepubs/009695399/functions/strptime.html</a>.
 372  */
 373 char *my_strptime(const char *buf,
 374                   const char *format,
 375                   struct tm *tm) {
 376   /* Whether to overwrite or update is unspecified (rather bizarrely).  This
 377    * implementation does not overwrites, as xgetdate() depends on this
 378    * behavior. */
 379
 380   if(!(buf = my_strptime_guts(buf, format, tm)))
 381     return NULL;
 382   /* TODO various things we could/should do:
 383    * - infer day/month from %j+year
 384    * - infer day/month from %U/%W+%w/%a+year
 385    * - infer hour from %p+%I
 386    * - fill wday/yday from other fields
 387    */
 388   return (char *)buf;
 389 }
 390
 391 /*
 392 Local Variables:
 393 c-basic-offset:2
 394 comment-column:40
 395 fill-column:79
 396 indent-tabs-mode:nil
 397 End:
 398 */