1 /* $Id: keywords.c 6269 2003-03-28 01:52:36Z rra $
3 ** Optional keyword generation code.
5 ** Additional code for sake of manufacturing Keywords: headers out of air in
6 ** order to provide better (scorable) XOVER data, containing bits of article
7 ** body content which have a reasonable expectation of utility.
9 ** Basic idea: Simple word-counting. We find words in the article body,
10 ** separated by whitespace. Remove punctuation. Sort words, count unique
11 ** words, sort those counts. Write the resulting Keywords: header containing
12 ** the poster's original Keywords: (if any) followed by a magic cookie
13 ** separator and then the sorted list of words.
21 #include "inn/innconf.h"
24 /* If keyword support wasn't requested, stub out the main function provided by
28 KEYgenerate(HDRCONTENT *header UNUSED, const char *body UNUSED,
29 const char *orig UNUSED, size_t length UNUSED)
35 /* For regex-based common word elimination. */
38 #define MIN_WORD_LENGTH 3 /* 1- and 2-char words don't count. */
39 #define MAX_WORD_LENGTH 28 /* fits "antidisestablishmentarianism". */
42 ** A trivial structure for keeping track of words via both
43 ** index to the overall word list and their counts.
52 ** Wrapper for qsort(3) comparison of word_entry (frequency).
56 wvec_freq_cmp(const void *p1, const void *p2)
58 return ((const struct word_entry *)p2)->count - /* decreasing sort */
59 ((const struct word_entry *)p1)->count;
63 ** Wrapper for qsort(3) comparison of word_entry (word length).
67 wvec_length_cmp(const void *p1, const void *p2)
69 return ((const struct word_entry *)p2)->length - /* decreasing sort */
70 ((const struct word_entry *)p1)->length;
74 ** Wrapper for qsort(3), for pointer-to-pointer strings.
78 ptr_strcmp(const void *p1, const void *p2)
82 cdiff = (**(const char **)p1) - (**(const char **)p2);
85 return strcmp((*(const char **)p1)+1, (*(const char **)p2)+1);
89 ** Build new Keywords.
94 HDRCONTENT *hc, /* header data */
95 const char *body, /* article body */
96 const char *v, /* old kw value */
97 size_t l) /* old kw length */
100 int word_count, word_length, bodylen, word_index, distinct_words;
102 char *text, *orig_text, *text_end, *this_word, *chase, *punc;
103 static struct word_entry *word_vec;
105 static const char *whitespace = " \t\r\n";
107 /* ---------------------------------------------------------------- */
108 /* Prototype setup: Regex match preparation. */
109 static int regex_lib_init = 0;
111 static const char *elim_regexp = "^\\([-+/0-9][-+/0-9]*\\|.*1st\\|.*2nd\\|.*3rd\\|.*[04-9]th\\|about\\|after\\|ago\\|all\\|already\\|also\\|among\\|and\\|any\\|anybody\\|anyhow\\|anyone\\|anywhere\\|are\\|bad\\|because\\|been\\|before\\|being\\|between\\|but\\|can\\|could\\|did\\|does\\|doing\\|done\\|dont\\|during\\|eight\\|eighth\\|eleven\\|else\\|elsewhere\\|every\\|everywhere\\|few\\|five\\|fifth\\|first\\|for\\|four\\|fourth\\|from\\|get\\|going\\|gone\\|good\\|got\\|had\\|has\\|have\\|having\\|he\\|her\\|here\\|hers\\|herself\\|him\\|himself\\|his\\|how\\|ill\\|into\\|its\\|ive\\|just\\|kn[eo]w\\|least\\|less\\|let\\|like\\|look\\|many\\|may\\|more\\|m[ou]st\\|myself\\|next\\|nine\\|ninth\\|not\\|now\\|off\\|one\\|only\\|onto\\|our\\|out\\|over\\|really\\|said\\|saw\\|says\\|second\\|see\\|set\\|seven\\|seventh\\|several\\|shall\\|she\\|should\\|since\\|six\\|sixth\\|some\\|somehow\\|someone\\|something\\|somewhere\\|such\\|take\\|ten\\|tenth\\|than\\|that\\|the\\|their\\!|them\\|then\\|there\\|therell\\|theres\\|these\\|they\\|thing\\|things\\|third\\|this\\|those\\|three\\|thus\\|together\\|told\\|too\\|twelve\\|two\\|under\\|upon\\|very\\|via\\|want\\|wants\\|was\\|wasnt\\|way\\|were\\|weve\\|what\\|whatever\\|when\\|where\\|wherell\\|wheres\\|whether\\|which\\|while\\|who\\|why\\|will\\|will\\|with\\|would\\|write\\|writes\\|wrote\\|yes\\|yet\\|you\\|your\\|youre\\|yourself\\)$";
114 word_vec = xmalloc(innconf->keymaxwords * sizeof(struct word_entry));
117 word = xmalloc(innconf->keymaxwords * sizeof(char *));
124 if (regex_lib_init == 0) {
127 if (regcomp(&preg, elim_regexp, REG_ICASE|REG_NOSUB) != 0) {
128 syslog(L_FATAL, "%s regcomp failure", LogName);
132 /* ---------------------------------------------------------------- */
134 /* first re-init kw from original value. */
135 if (l > innconf->keylimit - (MAX_WORD_LENGTH+5)) /* mostly arbitrary cutoff: */
136 l = innconf->keylimit - (MAX_WORD_LENGTH+5); /* room for minimal word vec */
137 hc->Value = xmalloc(innconf->keylimit+1);
138 if ((v != NULL) && (*v != '\0')) {
139 memcpy(hc->Value, v, l);
143 l = hc->Length = strlen(hc->Value);
146 * now figure acceptable extents, and copy body to working string.
147 * (Memory-intensive for hefty articles: limit to non-ABSURD articles.)
149 bodylen = strlen(body);
150 if ((bodylen < 100) || (bodylen > innconf->keyartlimit)) /* too small/big to bother */
153 orig_text = text = xstrdup(body); /* orig_text is for free() later on */
155 text_end = text + bodylen;
157 /* abusive punctuation stripping: turn it all into SPCs. */
158 for (punc = text; *punc; punc++)
159 if (!CTYPE(isalpha, *punc))
162 /* move to first word. */
163 text += strspn(text, whitespace);
166 /* hunt down words */
167 while ((text < text_end) && /* while there might be words... */
169 (word_count < innconf->keymaxwords)) {
172 word_length = strcspn(text, whitespace);
173 if (word_length == 0)
174 break; /* no words left */
176 /* bookkeep to save word location, then move through text. */
177 word[word_count++] = this_word = text;
180 text += strspn(text, whitespace); /* move to next word. */
182 /* 1- and 2-char words don't count, nor do excessively long ones. */
183 if ((word_length < MIN_WORD_LENGTH) ||
184 (word_length > MAX_WORD_LENGTH)) {
189 /* squash to lowercase. */
190 for (chase = this_word; *chase; chase++)
191 if (CTYPE(isupper, *chase))
192 *chase = tolower(*chase);
195 /* If there were no words, we're done. */
199 /* Sort the words. */
200 qsort(word, word_count, sizeof(word[0]), ptr_strcmp);
202 /* Count unique words. */
203 distinct_words = 0; /* the 1st word is "pre-figured". */
204 word_vec[0].index = 0;
205 word_vec[0].length = strlen(word[0]);
206 word_vec[0].count = 1;
208 for (word_index = 1; /* we compare (N-1)th and Nth words. */
209 word_index < word_count;
211 if (strcmp(word[word_index-1], word[word_index]) == 0)
212 word_vec[distinct_words].count++;
215 word_vec[distinct_words].index = word_index;
216 word_vec[distinct_words].length = strlen(word[word_index]);
217 word_vec[distinct_words].count = 1;
221 /* Sort the counts. */
222 distinct_words++; /* we were off-by-1 until this. */
223 qsort(word_vec, distinct_words, sizeof(struct word_entry), wvec_freq_cmp);
225 /* Sub-sort same-frequency words on word length. */
226 for (last = 0, word_index = 1; /* again, (N-1)th and Nth entries. */
227 word_index < distinct_words;
229 if (word_vec[last].count != word_vec[word_index].count) {
230 if ((word_index - last) != 1) /* 2+ entries to sub-sort. */
231 qsort(&word_vec[last], word_index - last,
232 sizeof(struct word_entry), wvec_length_cmp);
236 /* do it one last time for the only-one-appearance words. */
237 if ((word_index - last) != 1)
238 qsort(&word_vec[last], word_index - last,
239 sizeof(struct word_entry), wvec_length_cmp);
241 /* Scribble onto end of Keywords:. */
242 strcpy(hc->Value + l, ",\377"); /* magic separator, 'ΓΏ' */
243 for (chase = hc->Value + l + 2, word_index = 0;
244 word_index < distinct_words;
246 /* ---------------------------------------------------------------- */
247 /* "noise" words don't count */
248 if (regexec(&preg, word[word_vec[word_index].index], 0, NULL, 0) == 0)
250 /* ---------------------------------------------------------------- */
254 strcpy(chase, word[word_vec[word_index].index]);
255 chase += word_vec[word_index].length;
257 if (chase - hc->Value > (innconf->keylimit - (MAX_WORD_LENGTH + 4)))
260 /* note #words we didn't get to add. */
261 /* This code can potentially lead to a buffer overflow if the number of
262 ignored words is greater than 100, under some circumstances. It's
263 temporarily disabled until fixed. */
264 hc->Length = strlen(hc->Value);
267 /* We must dispose of the original strdup'd text area. */
271 #endif /* DO_KEYWORDS */