chiark - git - mdw - disorder/blob - lib/strptime.c

   1 /* strptime.c - partial strptime() reimplementation
   2  *
   3  * Copyright (c) 2008, 2011, 2013 Richard Kettlewell.
   4  * All rights reserved.
   5  *
   6  * Redistribution and use in source and binary forms, with or without
   7  * modification, are permitted provided that the following conditions
   8  * are met:
   9  * 1. Redistributions of source code must retain the above copyright
  10  *    notice, this list of conditions and the following disclaimer.
  11  * 2. Redistributions in binary form must reproduce the above copyright
  12  *    notice, this list of conditions and the following disclaimer in the
  13  *    documentation and/or other materials provided with the distribution.
  14  * 3. The name of the author may not be used to endorse or promote products
  15  *    derived from this software without specific prior written permission.
  16  *
  17  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  27  * SUCH DAMAGE.
  28  */
  29 /** @file lib/strptime.c
  30  * @brief strptime() reimplementation
  31  *
  32  * strptime() is here reimplemented because the FreeBSD (and older MacOS) one
  33  * is broken and does not report errors properly.  See TODO remarks below for
  34  * some missing bits.
  35  */
  36
  37 #if HAVE_CONFIG_H
  38 # include <config.h>
  39 #endif
  40
  41 #include <ctype.h>
  42 #include <limits.h>
  43 #include <string.h>
  44 #if HAVE_LANGINFO_H
  45 # include <langinfo.h>
  46 #endif
  47 #include "strptime.h"
  48
  49 #if !HAVE_LANGINFO_H
  50 /* Fake plastic langinfo.  Primarily for Windows.
  51  * TODO WIN32 can we get these values out of the win32 api instead? */
  52 typedef enum {
  53   DAY_1,
  54   DAY_2,
  55   DAY_3,
  56   DAY_4,
  57   DAY_5,
  58   DAY_6,
  59   DAY_7,
  60   ABDAY_1,
  61   ABDAY_2,
  62   ABDAY_3,
  63   ABDAY_4,
  64   ABDAY_5,
  65   ABDAY_6,
  66   ABDAY_7,
  67   MON_1,
  68   MON_2,
  69   MON_3,
  70   MON_4,
  71   MON_5,
  72   MON_6,
  73   MON_7,
  74   MON_8,
  75   MON_9,
  76   MON_10,
  77   MON_11,
  78   MON_12,
  79   ABMON_1,
  80   ABMON_2,
  81   ABMON_3,
  82   ABMON_4,
  83   ABMON_5,
  84   ABMON_6,
  85   ABMON_7,
  86   ABMON_8,
  87   ABMON_9,
  88   ABMON_10,
  89   ABMON_11,
  90   ABMON_12,
  91   D_FMT,
  92   T_FMT,
  93   D_T_FMT,
  94   ERA_D_FMT,
  95   ERA_T_FMT,
  96   ERA_D_T_FMT,
  97 } nl_item;
  98
  99 const char *nl_langinfo(nl_item item) {
 100   switch(item) {
 101   case DAY_1: return "Sunday";
 102   case DAY_2: return "Monday";
 103   case DAY_3: return "Tuesday";
 104   case DAY_4: return "Wednesday";
 105   case DAY_5: return "Thursday";
 106   case DAY_6: return "Friday";
 107   case DAY_7: return "Saturday";
 108   case ABDAY_1: return "Sun";
 109   case ABDAY_2: return "Mon";
 110   case ABDAY_3: return "Tue";
 111   case ABDAY_4: return "Wed";
 112   case ABDAY_5: return "Thu";
 113   case ABDAY_6: return "Fri";
 114   case ABDAY_7: return "Sat";
 115   case MON_1: return "January";
 116   case MON_2: return "February";
 117   case MON_3: return "March";
 118   case MON_4: return "April";
 119   case MON_5: return "May";
 120   case MON_6: return "June";
 121   case MON_7: return "July";
 122   case MON_8: return "August";
 123   case MON_9: return "September";
 124   case MON_10: return "October";
 125   case MON_11: return "November";
 126   case MON_12: return "December";
 127   case ABMON_1: return "Jan";
 128   case ABMON_2: return "Feb";
 129   case ABMON_3: return "Mar";
 130   case ABMON_4: return "Apr";
 131   case ABMON_5: return "May";
 132   case ABMON_6: return "Jun";
 133   case ABMON_7: return "Jul";
 134   case ABMON_8: return "Aug";
 135   case ABMON_9: return "Sep";
 136   case ABMON_10: return "Oct";
 137   case ABMON_11: return "Nov";
 138   case ABMON_12: return "Dec";
 139   case D_FMT: return "%d/%m/%y";
 140   case T_FMT: return "%H:%M:%S";
 141   case D_T_FMT: return "%a %d %b %Y %H:%M:%S %Z";
 142   case ERA_D_FMT: return "";
 143   case ERA_T_FMT: return "";
 144   case ERA_D_T_FMT: return "";
 145   default: return 0;
 146   }
 147 }
 148 #endif
 149
/** @brief Lookup table entry for locale-specific strings
 *
 * Tables of these are scanned in order by try_locale_match(); an entry whose
 * @ref value is -1 marks the end of the table.
 */
struct locale_item_match {
  /** @brief Locale key to try (passed to nl_langinfo() to get the string) */
  nl_item key;

  /** @brief Value to return if value of @ref key matches subject string
   *
   * -1 is reserved as the end-of-table marker.
   */
  int value;
};
 158
 159 static const struct locale_item_match days[] = {
 160   { DAY_1, 0 },
 161   { DAY_2, 1 },
 162   { DAY_3, 2 },
 163   { DAY_4, 3 },
 164   { DAY_5, 4 },
 165   { DAY_6, 5 },
 166   { DAY_7, 6 },
 167   { ABDAY_1, 0 },
 168   { ABDAY_2, 1 },
 169   { ABDAY_3, 2 },
 170   { ABDAY_4, 3 },
 171   { ABDAY_5, 4 },
 172   { ABDAY_6, 5 },
 173   { ABDAY_7, 6 },
 174   { -1, -1 }
 175 };
 176
 177 static const struct locale_item_match months[] = {
 178   { MON_1, 1 },
 179   { MON_2, 2 },
 180   { MON_3, 3 },
 181   { MON_4, 4 },
 182   { MON_5, 5 },
 183   { MON_6, 6 },
 184   { MON_7, 7 },
 185   { MON_8, 8 },
 186   { MON_9, 9 },
 187   { MON_10, 10 },
 188   { MON_11, 11 },
 189   { MON_12, 12 },
 190   { ABMON_1, 1 },
 191   { ABMON_2, 2 },
 192   { ABMON_3, 3 },
 193   { ABMON_4, 4 },
 194   { ABMON_5, 5 },
 195   { ABMON_6, 6 },
 196   { ABMON_7, 7 },
 197   { ABMON_8, 8 },
 198   { ABMON_9, 9 },
 199   { ABMON_10, 10 },
 200   { ABMON_11, 11 },
 201   { ABMON_12, 12 },
 202   { -1, -1 },
 203 };
 204
 205 /** @brief Match a string
 206  * @param buf Start of subject
 207  * @param limit End of subject
 208  * @param match String to match subject against
 209  * @return True if match == [buf,limit) otherwise false
 210  *
 211  * The match is case-independent at least in ASCII.
 212  */
 213 static int try_match(const char *buf,
 214                      const char *limit,
 215                      const char *match) {
 216   /* TODO this won't work well outside single-byte encodings.  A good bet is
 217    * probably to convert to Unicode and then use utf32_casefold_compat() (or
 218    * utf8_casefold_compat(); using compatibility matching will ensure missing
 219    * accents and so on aren't a problem.
 220    *
 221    * en_GB and en_US will probably be in any reasonable encoding for them.
 222    */
 223   while(buf < limit && *match) {
 224     if(tolower((unsigned char)*buf) != tolower((unsigned char)*match))
 225       return 0;
 226     ++buf;
 227     ++match;
 228   }
 229   if(buf != limit || *match)
 230     return 0;
 231   return 1;
 232 }
 233
 234 /** @brief Match from table of locale-specific strings
 235  * @param buf Start of subject
 236  * @param limit End of subject
 237  * @param lim Table of locale lookups
 238  * @return Looked up value or -1
 239  *
 240  * The match is case-independent.
 241  */
 242 static int try_locale_match(const char *buf,
 243                             const char *limit,
 244                             const struct locale_item_match *lim) {
 245   /* This is not very efficient!  A (correct) built-in implementation will
 246    * presumably have more direct access to locale information. */
 247   while(lim->value != -1) {
 248     if(try_match(buf, limit, nl_langinfo(lim->key)))
 249       return lim->value;
 250     ++lim;
 251   }
 252   return -1;
 253 }
 254
 255 static int try_numeric_match(const char *buf,
 256                              const char *limit,
 257                              unsigned low,
 258                              unsigned high) {
 259   unsigned n = 0;
 260
 261   while(buf < limit) {
 262     int ch = (unsigned char)*buf++;
 263     if(ch >= '0' && ch <= '9') {
 264       if(n > INT_MAX / 10
 265          || (n == INT_MAX / 10 && ch >= INT_MAX % 10 + '0'))
 266         return -1;                      /* overflow */
 267       n = 10 * n + ch - '0';
 268     } else
 269       return -1;
 270   }
 271   if(n < low || n > high)
 272     return -1;
 273   return (int)n;
 274 }
 275
 276 static const char *my_strptime_guts(const char *buf,
 277                                     const char *format,
 278                                     struct tm *tm) {
 279   int fc, mod, spec, next, value;
 280   const char *limit;
 281   /* nl_langinfo() is allowed to trash its last return value so we copy.
 282    * (We're relying on it being usable at all in multithreaded environments
 283    * though.) */
 284 #define USE_SUBFORMAT(ITEM, EITEM, DEF) do {            \
 285   const char *s;                                        \
 286   char subformat[128];                                  \
 287                                                         \
 288   if(mod == 'E') {                                      \
 289     s = nl_langinfo(EITEM);                             \
 290     if(!s || !*s)                                       \
 291       s = nl_langinfo(ITEM);                            \
 292   } else                                                \
 293     s = nl_langinfo(ITEM);                              \
 294   if(!s || !*s)                                         \
 295     s = DEF;                                            \
 296   if(strlen(s) >= sizeof subformat)                     \
 297     s = DEF;                                            \
 298   strcpy(subformat, s);                                 \
 299   if(!(buf = my_strptime_guts(buf, subformat, tm)))     \
 300     return NULL;                                        \
 301 } while(0)
 302
 303   while(*format) {
 304     fc = (unsigned char)*format++;
 305     if(fc == '%') {
 306       /* Get the character defining the converstion specification */
 307       spec = (unsigned char)*format++;
 308       if(spec == 'E' || spec == 'O') {
 309         /* Oops, there's a modifier first */
 310         mod = spec;
 311         spec = (unsigned char)*format++;
 312       } else
 313         mod = 0;
 314       if(!spec)
 315         return NULL;                    /* format string broken! */
 316       /* See what the next directive is.  The specification is written in terms
 317        * of stopping the match at a character that matches the next directive.
 318        * This implementation mirrors this aspect of the specification
 319        * directly. */
 320       next = (unsigned char)*format;
 321       if(next) {
 322         limit = buf;
 323         if(isspace(next)) {
 324           /* Next directive is whitespace, so bound the input string (at least)
 325            * by that */
 326           while(*limit && !isspace((unsigned char)*limit))
 327             ++limit;
 328         } else if(next == '%') {
 329           /* Prohibited: "The application shall ensure that there is
 330            * white-space or other non-alphanumeric characters between any two
 331            * conversion specifications".  In fact we let alphanumerics
 332            * through.
 333            *
 334            * Forbidding even %% seems a bit harsh but is consistent with the
 335            * specification as written.
 336            */
 337           return NULL;
 338         } else {
 339           /* Next directive is a specific character, so bound the input string
 340            * (at least) by that.  This will work badly in the face of multibyte
 341            * characters, but then the spec is vague about what kind of string
 342            * we're dealing with anyway so you probably couldn't safely use them
 343            * in the format string at least in any case. */
 344           while(*limit && *limit != next)
 345             ++limit;
 346         }
 347       } else
 348         limit = buf + strlen(buf);
 349       switch(spec) {
 350       case 'A': case 'a':               /* day name (abbrev or full) */
 351         if((value = try_locale_match(buf, limit, days)) == -1)
 352           return NULL;
 353         tm->tm_wday = value;
 354         break;
 355       case 'B': case 'b': case 'h':     /* month name (abbrev or full) */
 356         if((value = try_locale_match(buf, limit, months)) == -1)
 357           return NULL;
 358         tm->tm_mon = value - 1;
 359         break;
 360       case 'c':                         /* locale date+time */
 361         USE_SUBFORMAT(D_T_FMT, ERA_D_T_FMT, "%a %b %e %H:%M:%S %Y");
 362         break;
 363       case 'C':                         /* century number 0-99 */
 364         /* TODO  */
 365         return NULL;
 366       case 'd': case 'e':               /* day of month 1-31 */
 367         if((value = try_numeric_match(buf, limit, 1, 31)) == -1)
 368           return NULL;
 369         tm->tm_mday = value;
 370         break;
 371       case 'D':                         /* == "%m / %d / %y" */
 372         if(!(buf = my_strptime_guts(buf, "%m / %d / %y", tm)))
 373           return NULL;
 374         break;
 375       case 'H':                         /* hour 0-23 */
 376         if((value = try_numeric_match(buf, limit, 0, 23)) == -1)
 377           return NULL;
 378         tm->tm_hour = value;
 379         break;
 380       case 'I':                         /* hour 1-12 */
 381         /* TODO */
 382         return NULL;
 383       case 'j':                         /* day 1-366 */
 384         if((value = try_numeric_match(buf, limit, 1, 366)) == -1)
 385           return NULL;
 386         tm->tm_yday = value - 1;
 387         return NULL;
 388       case 'm':                         /* month 1-12 */
 389         if((value = try_numeric_match(buf, limit, 1, 12)) == -1)
 390           return NULL;
 391         tm->tm_mon = value - 1;
 392         break;
 393       case 'M':                         /* minute 0-59 */
 394         if((value = try_numeric_match(buf, limit, 0, 59)) == -1)
 395           return NULL;
 396         tm->tm_min = value;
 397         break;
 398       case 'n': case 't':               /* any whitespace */
 399         goto matchwhitespace;
 400       case 'p':                         /* locale am/pm */
 401         /* TODO */
 402         return NULL;
 403       case 'r':                         /* == "%I : %M : %S %p" */
 404         /* TODO actually this is locale-dependent; and we don't implement %I
 405          * anyway, so it's not going to work even as it stands. */
 406         if(!(buf = my_strptime_guts(buf, "%I : %M : %S %p", tm)))
 407           return NULL;
 408         break;
 409       case 'R':                         /* == "%H : %M" */
 410         if(!(buf = my_strptime_guts(buf, "%H : %M", tm)))
 411           return NULL;
 412         break;
 413       case 'S':                         /* seconds 0-60 */
 414         if((value = try_numeric_match(buf, limit, 0, 60)) == -1)
 415           return NULL;
 416         tm->tm_sec = value;
 417         break;
 418       case 'U':                         /* week number from Sunday 0-53 */
 419         /* TODO */
 420         return NULL;
 421       case 'w':                         /* day number 0-6 from Sunday */
 422         if((value = try_numeric_match(buf, limit, 0, 6)) == -1)
 423           return NULL;
 424         tm->tm_wday = value;
 425         break;
 426       case 'W':                         /* week number from Monday 0-53 */
 427         /* TODO */
 428         return NULL;
 429       case 'x':                         /* locale date format */
 430         USE_SUBFORMAT(D_FMT, ERA_D_FMT, "%m/%d/%y");
 431         break;
 432       case 'X':                         /* locale time format */
 433         USE_SUBFORMAT(T_FMT, ERA_T_FMT, "%H:%M:%S");
 434         break;
 435       case 'y':                         /* year mod 100 */
 436         if((value = try_numeric_match(buf, limit, 0, INT_MAX)) == -1)
 437           return NULL;
 438         if(value >= 0 && value <= 68)
 439           value = 2000 + value;
 440         else if(value >= 69 && value <= 99)
 441           value = 1900 + value;
 442         tm->tm_year = value - 1900;
 443         break;
 444       case 'Y':                         /* year */
 445         if((value = try_numeric_match(buf, limit, 1, INT_MAX)) == -1)
 446           return NULL;
 447         tm->tm_year = value - 1900;
 448         break;
 449       case '%':
 450         goto matchself;
 451       default:
 452         /* The spec is a bit vague about what to do with invalid format
 453          * strings.  We return NULL immediately and hope someone will
 454          * notice. */
 455         return NULL;
 456       }
 457       buf = limit;
 458     } else if(isspace(fc)) {
 459     matchwhitespace:
 460       /* Any format whitespace matches any number of input whitespace
 461        * characters.  The directive can formally contain more than one
 462        * whitespace character; for the second and subsequent ones we'll match 0
 463        * characters from the input. */
 464       while(isspace((unsigned char)*buf))
 465         ++buf;
 466     } else {
 467     matchself:
 468       /* Non-% non-whitespace characters must match themselves exactly */
 469       if(fc != (unsigned char)*buf++)
 470         return NULL;
 471     }
 472   }
 473   /* When we run out of format string we return a pointer to the rest of the
 474    * input. */
 475   return buf;
 476 }
 477
 478 /** @brief Reimplementation of strptime()
 479  * @param buf Input buffer
 480  * @param format Format string
 481  * @param tm Where to put result
 482  * @return Pointer to first unparsed input character, or NULL on error
 483  *
 484  * Based on <a
 485  * href="http://www.opengroup.org/onlinepubs/009695399/functions/strptime.html">http://www.opengroup.org/onlinepubs/009695399/functions/strptime.html</a>.
 486  */
 487 char *my_strptime(const char *buf,
 488                   const char *format,
 489                   struct tm *tm) {
 490   /* Whether to overwrite or update is unspecified (rather bizarrely).  This
 491    * implementation does not overwrites, as xgetdate() depends on this
 492    * behavior. */
 493
 494   if(!(buf = my_strptime_guts(buf, format, tm)))
 495     return NULL;
 496   /* TODO various things we could/should do:
 497    * - infer day/month from %j+year
 498    * - infer day/month from %U/%W+%w/%a+year
 499    * - infer hour from %p+%I
 500    * - fill wday/yday from other fields
 501    */
 502   return (char *)buf;
 503 }
 504
 505 /*
 506 Local Variables:
 507 c-basic-offset:2
 508 comment-column:40
 509 fill-column:79
 510 indent-tabs-mode:nil
 511 End:
 512 */