chiark - git - mdw - ezmlm/blob - unfoldHDR.c

   1 /*$Id: unfoldHDR.c,v 1.14 1999/11/06 05:25:14 lindberg Exp $*/
   2 /*$Name: ezmlm-idx-040 $*/
   3
   4 #include "stralloc.h"
   5 #include "strerr.h"
   6 #include "case.h"
   7 #include "byte.h"
   8 #include "errtxt.h"
   9 #include "mime.h"
  10
/* Scratch buffer private to this file. unfoldHDR() builds the normalized */
/* header in it before trimming and copying the result to the caller's    */
/* output; being static, its storage is reused from one call to the next. */
static stralloc tmpdata = {0};
  12
  13 static int trimre(cpp,cpend,prefix,fatal)
  14 char **cpp;
  15 char *cpend;
  16 stralloc *prefix;
  17 char *fatal;
  18
  19 {
  20   int r = 0;
  21   register char *cp;
  22   char *cpnew;
  23   int junk;
  24   unsigned int i,j;
  25   unsigned int serial;
  26
  27   cp = *cpp;
  28   serial = prefix->len;         /* pointer to serial number */
  29   if (serial)
  30     serial = byte_rchr(prefix->s,prefix->len,'#');
  31
  32   junk = 1;
  33   while (junk) {
  34     junk = 0;
  35     while (cp <= cpend && (*cp == ' ' || *cp == '\t')) cp++;
  36     cpnew = cp;
  37     while (++cpnew <= cpend) {  /* /(..+:\s)/ is a reply indicator */
  38       if (*cpnew == ' ') {
  39         if (cpnew < cp + 3) break;      /* at least 3 char before ' ' */
  40         if (*(cpnew - 1) != ':') break; /* require ':' before ' ' */
  41         if (cpnew > cp + 5) {           /* if > 4 char before ':' require */
  42           register char ch;
  43           ch = *(cpnew - 2);            /* XX^3, XX[3], XX(3) */
  44           if (ch != ')' && ch != ']' && (ch < '0' || ch > '9'))
  45             break;
  46         }
  47         junk = 1;
  48         r |= 1;
  49         cp = cpnew + 1;
  50         break;
  51       }
  52     }
  53         /* prefix removal is complicated by the inconsistent handling of ' ' */
  54         /* when there are rfc2047-encoded words in the subject. We first     */
  55         /* compare prefix before "serial" ignoring space, then skip the      */
  56         /* number, then compare after "serial". If both matched we've found  */
  57         /* the prefix. */
  58     if (serial) {
  59       cpnew = cp;
  60       i = 0;
  61       while (i < serial && cpnew <= cpend) {
  62         if (*cpnew != ' ') {
  63           if (prefix->s[i] == ' ') {
  64             ++i;
  65             continue;
  66           }
  67           if (*cpnew != prefix->s[i]) break;
  68           ++i;
  69         }
  70         ++cpnew;
  71       }
  72       if (i == serial) {                /* match before serial */
  73         j = prefix->len;
  74         if (serial != j) {              /* got a '#' */
  75           while (cpnew <= cpend &&      /* skip number/space */
  76                 *cpnew == ' ' || (*cpnew <= '9' && *cpnew >= '0')) ++cpnew;
  77           i = serial + 1;
  78           while (i < j && cpnew <= cpend) {
  79             if (*cpnew != ' ') {
  80               if (prefix->s[i] == ' ') {
  81                 ++i;
  82                 continue;
  83               }
  84               if (*cpnew != prefix->s[i]) break;
  85               ++i;
  86             }
  87             ++cpnew;
  88           }
  89         }
  90         if (i == j) {
  91           cp = cpnew;
  92           junk = 1;
  93           r |= 2;
  94         }
  95       }
  96     }
  97   }
  98   *cpp = cp;
  99   return r;
 100 }
 101
 102 static int trimend(indata,np,fatal)
 103 char *indata;
 104 unsigned int *np;
 105 char *fatal;
 106         /* looks at indata of length n from the end removing LWSP & '\n' */
 107         /* and any trailing '-Reply'. Sets n to new length and returns:  */
 108         /* 0 - not reply, 1 - reply. */
 109 {
 110   char *cplast;
 111   int junk;
 112   int r = 0;
 113
 114   if (*np == 0) return 0;
 115   cplast = indata + *np - 1;    /* points to last char on line */
 116   junk = 1;
 117   while (junk) {
 118     junk = 0;
 119     while (cplast >= indata &&
 120              (*cplast == ' ' || *cplast == '\t' ||
 121               *cplast == '\r' || *cplast == '\n'))
 122             --cplast;
 123     if (cplast - indata  >= 5 && case_startb(cplast - 5,6,"-Reply")) {
 124       cplast -= 6;
 125       r = 1;
 126       junk = 1;
 127     }
 128   }
 129   *np = (unsigned int) (cplast - indata + 1);   /* new length */
 130   return r;
 131 }
 132
 133 int unfoldHDR(indata,n,outdata,charset,prefix,flagtrimsub,fatal)
 134 char *indata;
 135 unsigned int n;
 136 stralloc *outdata;
 137 char *charset;
 138 stralloc *prefix;
 139 int flagtrimsub;
 140 char *fatal;
 141         /* takes a header as indata. Removal of reply-indicators is done */
 142         /* but removal of line breaks and Q and B decoding should have   */
 143         /* been done. Returns a */
 144         /* single line header without trailing \n or \0. Mainly, we      */
 145         /* remove redundant shift codes   */
 146         /* returns 0 = no reply no prefix */
 147         /*         1 = reply no prefix    */
 148         /*         2 = no reply, prefix   */
 149         /*         3 = reply & pefix      */
 150 {
 151   int r = 0;
 152   char *cp,*cpesc,*cpnext,*cpend,*cpout;
 153   char state,cset,newcset;
 154   int reg,newreg;
 155
 156   cp = indata;          /* JIS X 0201 -> ISO646 us-ascii */
 157   cpend = cp + n - 1;
 158   cpnext = cp;
 159   if (!stralloc_copys(&tmpdata,"")) die_nomem(fatal);
 160   if (!stralloc_ready(&tmpdata,n)) die_nomem(fatal);
 161
 162   if(!case_diffb(charset,11,"iso-2022-jp")) {
 163         /* iso-2022-jp-2 (rfc1554) and its subset iso-2022-jp. The reg #s */
 164         /* are from the rfc. Don't ask why they have multiple length G0   */
 165         /* charset designations ... JIS X 0201-roman is identical to      */
 166         /* iso646 us-ascii except for currency and tilde. Making them the */
 167         /* same increases hits without significant loss. JIS X 0208-1978  */
 168         /* is superceded by JIS X 0208-1983 and converted here as well.   */
 169
 170     while (cp < cpend) {
 171       if (*cp++ != ESC) continue;
 172       if (*cp == '(') {
 173         if (++cp > cpend) break;
 174         if (*cp == 'J') *cp = 'B';
 175         ++cp;
 176       } else if (*cp == '$') {
 177         if (++cp > cpend) break;
 178         if (*cp == '@') *cp = 'B';
 179         ++cp;
 180       }
 181     }
 182                 /* eliminate redundant ESC seqs */
 183     cp = indata;
 184     cpnext = cp;
 185     reg = 6;
 186     while (cp < cpend) {
 187       if (*cp++ != ESC) continue;
 188       cpesc = cp - 1;
 189       if (*cp == '$') {
 190         if (++cp > cpend) break;
 191         if (*cp == 'B') newreg = 87;
 192         else if (*cp == 'A') newreg = 58;
 193         else if (*cp == '(') {
 194           if (++cp > cpend) break;
 195           if (*cp == 'C') newreg = 149;
 196           else if (*cp == 'D') newreg = 159;
 197           else continue;
 198         } else continue;
 199       } else if (*cp == '(') {
 200         if (++cp > cpend) break;
 201         if (*cp == 'B') newreg = 6;
 202         else continue;
 203       } else continue;
 204       if (++cp > cpend) break;
 205       while (*cp == ' ' || *cp == '\t')
 206         if (++cp >= cpend) break;       /* skip space */
 207       if (*cp == ESC)                   /* maybe another G0 designation */
 208         if (*(cp+1) == '(' || *(cp+1) == '$') {  /* yep! */
 209           if (!stralloc_catb(&tmpdata,cpnext,cpesc-cpnext)) die_nomem(fatal);
 210           cpnext = cp;
 211           continue;
 212       }
 213       if (reg == newreg) {
 214         if (!stralloc_catb(&tmpdata,cpnext,cpesc-cpnext)) die_nomem(fatal);
 215         cpnext = cp;
 216       } else {
 217         reg = newreg;
 218       }         /* copy remainder of line */
 219     }
 220     if (!stralloc_catb(&tmpdata,cpnext,cpend - cpnext + 1)) die_nomem(fatal);
 221     if (reg != 6) {     /* need to return to us-ascii at the end of the line */
 222       if (!stralloc_cats(&tmpdata,TOASCII)) die_nomem(fatal);
 223     } else {            /* maybe "-Reply at the end?" */
 224       r = trimend(tmpdata.s,&(tmpdata.len),fatal);
 225     }
 226
 227   } else if (!case_diffb(charset,11,"iso-2022-cn") ||
 228              !case_diffb(charset,11,"iso-2022-kr")) {
 229         /* these use SI/SO and ESC $ ) x as the SO designation. In -cn and */
 230         /* -cn-ext, 'x' can be a number of different letters. In -kr it's  */
 231         /* always 'C'. This routine may work also for other iso-2022 sets  */
 232         /* also handles iso-2022-cn-ext */
 233     cpesc = (char *) 0; /* points to latest ESC */
 234     state = SI;         /* us-ascii */
 235     --cp;               /* set up for loop */
 236
 237     while (++cp <= cpend) {
 238       if (*cp == SI || *cp == SO) {
 239         if (state == *cp) {              /* already in state. Skip shift seq */
 240           if (!stralloc_catb(&tmpdata,cpnext,cp-cpnext-1)) die_nomem(fatal);
 241           cpnext = cp;
 242         } else                          /* set new state */
 243           state = *cp;
 244         if (++cp > cpend) break;
 245         continue;
 246       }
 247       if (*cp != ESC) continue;
 248       if (cp + 3 > cpend) break;        /* not space for full SO-designation */
 249       cpesc = cp;
 250       if (*cp != '$') continue;
 251       if (++cp > cpend) break;
 252       if (*cp != ')') continue;
 253       if (++cp > cpend) break;
 254       newcset = *cp;
 255       if (++cp > cpend) break;
 256       while (cp <= cpend && (*cp == ' ' || *cp == '\t')) ++cp;
 257       if (cp + 3 > cpend) break;        /* no space for full SO-designation */
 258       if ((*cp == ESC && *(cp+1) == '$' && *(cp+2) == ')')
 259                 || (newcset == cset)) {
 260                         /* skip if a second SO-designation right after or */
 261                         /* this SO-designation is already active, skip */
 262         if (!stralloc_catb(&tmpdata,cpnext,cpesc-cpnext)) die_nomem(fatal);
 263         --cp;           /* "unpeek" so that next iteration will see char */
 264         cpnext = cpesc + 4;
 265         continue;
 266       } else {
 267         cset = newcset;
 268         continue;
 269       }
 270     }
 271                         /* get remainder of line */
 272     if (!stralloc_catb(&tmpdata,cpnext,cpend - cpnext + 1)) die_nomem(fatal);
 273     if (state != SI)    /* need to end in ascii */
 274       if (!stralloc_cats(&tmpdata,TOSI)) die_nomem(fatal);
 275     else                /* ascii end; maybe "-Reply" at the end? */
 276       r = trimend(tmpdata.s,&(tmpdata.len),fatal);
 277
 278   } else {              /* other character sets = no special treatment */
 279     r = trimend(cp,&n,fatal);           /* -reply */
 280     if (!stralloc_copyb(&tmpdata,cp,n)) die_nomem(fatal);
 281   }
 282
 283   cp = tmpdata.s;
 284   n = tmpdata.len;
 285   cpend = cp + n - 1;
 286   if (flagtrimsub) {     /* remove leading reply indicators & prefix*/
 287     r |= trimre(&cp,cpend,prefix,fatal);
 288     n = (unsigned int) (cpend-cp+1);
 289   }
 290                         /* there shouldn't be '\0' or '\n', but make sure as */
 291                         /* it would break the message index */
 292   if (!stralloc_copys(outdata,"")) die_nomem(fatal);
 293   if (!stralloc_ready(outdata,n)) die_nomem(fatal);
 294   outdata->len = n;
 295   cpout = outdata->s;
 296   while (n--) {         /* '\n' and '\0' would break the subject index */
 297     if (!*cp || *cp == '\n') *cpout = ' ';
 298     else *cpout = *cp;
 299     ++cp; ++cpout;
 300   }
 301   return r;
 302 }
 303