1 /*$Id: unfoldHDR.c,v 1.14 1999/11/06 05:25:14 lindberg Exp $*/
2 /*$Name: ezmlm-idx-040 $*/
11 static stralloc tmpdata = {0};
13 static int trimre(cpp,cpend,prefix,fatal)
28 serial = prefix->len; /* pointer to serial number */
30 serial = byte_rchr(prefix->s,prefix->len,'#');
35 while (cp <= cpend && (*cp == ' ' || *cp == '\t')) cp++;
37 while (++cpnew <= cpend) { /* /(..+:\s)/ is a reply indicator */
39 if (cpnew < cp + 3) break; /* at least 3 char before ' ' */
40 if (*(cpnew - 1) != ':') break; /* require ':' before ' ' */
41 if (cpnew > cp + 5) { /* if > 4 char before ':' require */
43 ch = *(cpnew - 2); /* XX^3, XX[3], XX(3) */
44 if (ch != ')' && ch != ']' && (ch < '0' || ch > '9'))
53 /* prefix removal is complicated by the inconsistent handling of ' ' */
54 /* when there are rfc2047-encoded words in the subject. We first */
55 /* compare prefix before "serial" ignoring space, then skip the */
56 /* number, then compare after "serial". If both matched we've found */
61 while (i < serial && cpnew <= cpend) {
63 if (prefix->s[i] == ' ') {
67 if (*cpnew != prefix->s[i]) break;
72 if (i == serial) { /* match before serial */
74 if (serial != j) { /* got a '#' */
75 while (cpnew <= cpend && /* skip number/space; bounds test must guard both char tests */
76 (*cpnew == ' ' || (*cpnew <= '9' && *cpnew >= '0'))) ++cpnew;
78 while (i < j && cpnew <= cpend) {
80 if (prefix->s[i] == ' ') {
84 if (*cpnew != prefix->s[i]) break;
102 static int trimend(indata,np,fatal)
106 /* looks at indata of length *np from the end, removing LWSP, '\r' & '\n' */
107 /* and any trailing '-Reply'. Sets *np to the new length and returns: */
108 /* 0 - not reply, 1 - reply. */
114 if (*np == 0) return 0;
115 cplast = indata + *np - 1; /* points to last char on line */
119 while (cplast >= indata &&
120 (*cplast == ' ' || *cplast == '\t' ||
121 *cplast == '\r' || *cplast == '\n'))
123 if (cplast - indata >= 5 && case_startb(cplast - 5,6,"-Reply")) {
129 *np = (unsigned int) (cplast - indata + 1); /* new length */
133 int unfoldHDR(indata,n,outdata,charset,prefix,flagtrimsub,fatal)
141 /* takes a header as indata. Removal of reply-indicators is done */
142 /* here, but removal of line breaks and Q and B decoding should */
143 /* already have been done. Returns a */
144 /* single-line header without trailing \n or \0. Mainly, we */
145 /* remove redundant shift codes. */
146 /* returns 0 = no reply no prefix */
147 /* 1 = reply no prefix */
148 /* 2 = no reply, prefix */
149 /* 3 = reply & prefix */
152 char *cp,*cpesc,*cpnext,*cpend,*cpout;
153 char state,cset,newcset;
156 cp = indata; /* JIS X 0201 -> ISO646 us-ascii */
159 if (!stralloc_copys(&tmpdata,"")) die_nomem(fatal);
160 if (!stralloc_ready(&tmpdata,n)) die_nomem(fatal);
162 if(!case_diffb(charset,11,"iso-2022-jp")) {
163 /* iso-2022-jp-2 (rfc1554) and its subset iso-2022-jp. The reg #s */
164 /* are from the rfc. Don't ask why they have multiple length G0 */
165 /* charset designations ... JIS X 0201-roman is identical to */
166 /* iso646 us-ascii except for currency and tilde. Making them the */
167 /* same increases hits without significant loss. JIS X 0208-1978 */
168 /* is superseded by JIS X 0208-1983 and converted here as well. */
171 if (*cp++ != ESC) continue;
173 if (++cp > cpend) break;
174 if (*cp == 'J') *cp = 'B';
176 } else if (*cp == '$') {
177 if (++cp > cpend) break;
178 if (*cp == '@') *cp = 'B';
182 /* eliminate redundant ESC seqs */
187 if (*cp++ != ESC) continue;
190 if (++cp > cpend) break;
191 if (*cp == 'B') newreg = 87;
192 else if (*cp == 'A') newreg = 58;
193 else if (*cp == '(') {
194 if (++cp > cpend) break;
195 if (*cp == 'C') newreg = 149;
196 else if (*cp == 'D') newreg = 159;
199 } else if (*cp == '(') {
200 if (++cp > cpend) break;
201 if (*cp == 'B') newreg = 6;
204 if (++cp > cpend) break;
205 while (*cp == ' ' || *cp == '\t')
206 if (++cp >= cpend) break; /* skip space */
207 if (*cp == ESC) /* maybe another G0 designation */
208 if (*(cp+1) == '(' || *(cp+1) == '$') { /* yep! */
209 if (!stralloc_catb(&tmpdata,cpnext,cpesc-cpnext)) die_nomem(fatal);
214 if (!stralloc_catb(&tmpdata,cpnext,cpesc-cpnext)) die_nomem(fatal);
218 } /* copy remainder of line */
220 if (!stralloc_catb(&tmpdata,cpnext,cpend - cpnext + 1)) die_nomem(fatal);
221 if (reg != 6) { /* need to return to us-ascii at the end of the line */
222 if (!stralloc_cats(&tmpdata,TOASCII)) die_nomem(fatal);
223 } else { /* maybe "-Reply at the end?" */
224 r = trimend(tmpdata.s,&(tmpdata.len),fatal);
227 } else if (!case_diffb(charset,11,"iso-2022-cn") ||
228 !case_diffb(charset,11,"iso-2022-kr")) {
229 /* these use SI/SO and ESC $ ) x as the SO designation. In -cn and */
230 /* -cn-ext, 'x' can be a number of different letters. In -kr it's */
231 /* always 'C'. This routine may also work for other iso-2022 sets. */
232 /* It also handles iso-2022-cn-ext. */
233 cpesc = (char *) 0; /* points to latest ESC */
234 state = SI; /* us-ascii */
235 --cp; /* set up for loop */
237 while (++cp <= cpend) {
238 if (*cp == SI || *cp == SO) {
239 if (state == *cp) { /* already in state. Skip shift seq */
240 if (!stralloc_catb(&tmpdata,cpnext,cp-cpnext-1)) die_nomem(fatal);
242 } else /* set new state */
244 if (++cp > cpend) break;
247 if (*cp != ESC) continue;
248 if (cp + 3 > cpend) break; /* not space for full SO-designation */
250 if (*cp != '$') continue;
251 if (++cp > cpend) break;
252 if (*cp != ')') continue;
253 if (++cp > cpend) break;
255 if (++cp > cpend) break;
256 while (cp <= cpend && (*cp == ' ' || *cp == '\t')) ++cp;
257 if (cp + 3 > cpend) break; /* no space for full SO-designation */
258 if ((*cp == ESC && *(cp+1) == '$' && *(cp+2) == ')')
259 || (newcset == cset)) {
260 /* skip this designation if a second SO-designation follows */
261 /* right after, or if this SO-designation is already active */
262 if (!stralloc_catb(&tmpdata,cpnext,cpesc-cpnext)) die_nomem(fatal);
263 --cp; /* "unpeek" so that next iteration will see char */
271 /* get remainder of line */
272 if (!stralloc_catb(&tmpdata,cpnext,cpend - cpnext + 1)) die_nomem(fatal);
273 if (state != SI) /* need to end in ascii */
274 if (!stralloc_cats(&tmpdata,TOSI)) die_nomem(fatal);
275 else /* ascii end; maybe "-Reply" at the end? */
276 r = trimend(tmpdata.s,&(tmpdata.len),fatal);
278 } else { /* other character sets = no special treatment */
279 r = trimend(cp,&n,fatal); /* -reply */
280 if (!stralloc_copyb(&tmpdata,cp,n)) die_nomem(fatal);
286 if (flagtrimsub) { /* remove leading reply indicators & prefix*/
287 r |= trimre(&cp,cpend,prefix,fatal);
288 n = (unsigned int) (cpend-cp+1);
290 /* there shouldn't be '\0' or '\n', but make sure as */
291 /* it would break the message index */
292 if (!stralloc_copys(outdata,"")) die_nomem(fatal);
293 if (!stralloc_ready(outdata,n)) die_nomem(fatal);
296 while (n--) { /* '\n' and '\0' would break the subject index */
297 if (!*cp || *cp == '\n') *cpout = ' ';