innd/keywords.c

   1 /*  $Id: keywords.c 6269 2003-03-28 01:52:36Z rra $
   2 **
   3 **  Optional keyword generation code.
   4 **
   5 **  Additional code for sake of manufacturing Keywords: headers out of air in
   6 **  order to provide better (scorable) XOVER data, containing bits of article
   7 **  body content which have a reasonable expectation of utility.
   8 **
   9 **  Basic idea: Simple word-counting.  We find words in the article body,
  10 **  separated by whitespace.  Remove punctuation.  Sort words, count unique
  11 **  words, sort those counts.  Write the resulting Keywords: header containing
  12 **  the poster's original Keywords: (if any) followed by a magic cookie
  13 **  separator and then the sorted list of words.
  14 */
  15
  16 #include "config.h"
  17 #include "clibrary.h"
  18
  19 #include "libinn.h"
  20
  21 #include "inn/innconf.h"
  22 #include "innd.h"
  23
  24 /* If keyword support wasn't requested, stub out the main function provided by
  25    this file. */
  26 #if !DO_KEYWORDS
  27 void
  28 KEYgenerate(HDRCONTENT *header UNUSED, const char *body UNUSED,
  29             const char *orig UNUSED, size_t length UNUSED)
  30 {
  31 }
  32
  33 #else
  34
  35 /* For regex-based common word elimination. */
  36 #include <regex.h>
  37
  38 #define MIN_WORD_LENGTH 3       /* 1- and 2-char words don't count. */
  39 #define MAX_WORD_LENGTH 28      /* fits "antidisestablishmentarianism". */
  40
  41 /*
  42 ** A trivial structure for keeping track of words via both
  43 ** index to the overall word list and their counts.
  44 */
  45 struct word_entry {
  46     int index;
  47     int length;
  48     int count;
  49 };
  50
  51 /*
  52 ** Wrapper for qsort(3) comparison of word_entry (frequency).
  53 */
  54
  55 static int
  56 wvec_freq_cmp(const void *p1, const void *p2)
  57 {
  58     return ((const struct word_entry *)p2)->count -     /* decreasing sort */
  59            ((const struct word_entry *)p1)->count;
  60 }
  61
  62 /*
  63 ** Wrapper for qsort(3) comparison of word_entry (word length).
  64 */
  65
  66 static int
  67 wvec_length_cmp(const void *p1, const void *p2)
  68 {
  69     return ((const struct word_entry *)p2)->length -    /* decreasing sort */
  70            ((const struct word_entry *)p1)->length;
  71 }
  72
  73 /*
  74 ** Wrapper for qsort(3), for pointer-to-pointer strings.
  75 */
  76
  77 static int
  78 ptr_strcmp(const void *p1, const void *p2)
  79 {
  80     int cdiff;
  81
  82     cdiff = (**(const char **)p1) - (**(const char **)p2);
  83     if (cdiff)
  84         return cdiff;
  85     return strcmp((*(const char **)p1)+1, (*(const char **)p2)+1);
  86 }
  87
  88 /*
  89 **  Build new Keywords.
  90 */
  91
  92 void
  93 KEYgenerate(
  94     HDRCONTENT  *hc,    /* header data */
  95     const char  *body,  /* article body */
  96     const char  *v,     /* old kw value */
  97     size_t      l)      /* old kw length */
  98 {
  99
 100     int         word_count, word_length, bodylen, word_index, distinct_words;
 101     int         last;
 102     char        *text, *orig_text, *text_end, *this_word, *chase, *punc;
 103     static struct word_entry    *word_vec;
 104     static char         **word;
 105     static const char   *whitespace  = " \t\r\n";
 106
 107     /* ---------------------------------------------------------------- */
 108     /* Prototype setup: Regex match preparation. */
 109     static      int     regex_lib_init = 0;
 110     static      regex_t preg;
 111     static const char   *elim_regexp = "^\\([-+/0-9][-+/0-9]*\\|.*1st\\|.*2nd\\|.*3rd\\|.*[04-9]th\\|about\\|after\\|ago\\|all\\|already\\|also\\|among\\|and\\|any\\|anybody\\|anyhow\\|anyone\\|anywhere\\|are\\|bad\\|because\\|been\\|before\\|being\\|between\\|but\\|can\\|could\\|did\\|does\\|doing\\|done\\|dont\\|during\\|eight\\|eighth\\|eleven\\|else\\|elsewhere\\|every\\|everywhere\\|few\\|five\\|fifth\\|first\\|for\\|four\\|fourth\\|from\\|get\\|going\\|gone\\|good\\|got\\|had\\|has\\|have\\|having\\|he\\|her\\|here\\|hers\\|herself\\|him\\|himself\\|his\\|how\\|ill\\|into\\|its\\|ive\\|just\\|kn[eo]w\\|least\\|less\\|let\\|like\\|look\\|many\\|may\\|more\\|m[ou]st\\|myself\\|next\\|nine\\|ninth\\|not\\|now\\|off\\|one\\|only\\|onto\\|our\\|out\\|over\\|really\\|said\\|saw\\|says\\|second\\|see\\|set\\|seven\\|seventh\\|several\\|shall\\|she\\|should\\|since\\|six\\|sixth\\|some\\|somehow\\|someone\\|something\\|somewhere\\|such\\|take\\|ten\\|tenth\\|than\\|that\\|the\\|their\\!|them\\|then\\|there\\|therell\\|theres\\|these\\|they\\|thing\\|things\\|third\\|this\\|those\\|three\\|thus\\|together\\|told\\|too\\|twelve\\|two\\|under\\|upon\\|very\\|via\\|want\\|wants\\|was\\|wasnt\\|way\\|were\\|weve\\|what\\|whatever\\|when\\|where\\|wherell\\|wheres\\|whether\\|which\\|while\\|who\\|why\\|will\\|will\\|with\\|would\\|write\\|writes\\|wrote\\|yes\\|yet\\|you\\|your\\|youre\\|yourself\\)$";
 112
 113     if (word_vec == 0) {
 114         word_vec = xmalloc(innconf->keymaxwords * sizeof(struct word_entry));
 115         if (word_vec == 0)
 116             return;
 117         word = xmalloc(innconf->keymaxwords * sizeof(char *));
 118         if (word == NULL) {
 119             free(word_vec);
 120             return;
 121         }
 122     }
 123
 124     if (regex_lib_init == 0) {
 125         regex_lib_init++;
 126
 127         if (regcomp(&preg, elim_regexp, REG_ICASE|REG_NOSUB) != 0) {
 128             syslog(L_FATAL, "%s regcomp failure", LogName);
 129             abort();
 130         }
 131     }
 132     /* ---------------------------------------------------------------- */
 133
 134     /* first re-init kw from original value. */
 135     if (l > innconf->keylimit - (MAX_WORD_LENGTH+5))    /* mostly arbitrary cutoff: */
 136         l = innconf->keylimit - (MAX_WORD_LENGTH+5);    /* room for minimal word vec */
 137     hc->Value = xmalloc(innconf->keylimit+1);
 138     if ((v != NULL) && (*v != '\0')) {
 139         memcpy(hc->Value, v, l);
 140         hc->Value[l] = '\0';
 141     } else
 142         *hc->Value = '\0';
 143     l = hc->Length = strlen(hc->Value);
 144
 145     /*
 146      * now figure acceptable extents, and copy body to working string.
 147      * (Memory-intensive for hefty articles: limit to non-ABSURD articles.)
 148      */
 149     bodylen = strlen(body);
 150     if ((bodylen < 100) || (bodylen > innconf->keyartlimit)) /* too small/big to bother */
 151         return;
 152
 153     orig_text = text = xstrdup(body);   /* orig_text is for free() later on */
 154
 155     text_end = text + bodylen;
 156
 157     /* abusive punctuation stripping: turn it all into SPCs. */
 158     for (punc = text; *punc; punc++)
 159         if (!CTYPE(isalpha, *punc))
 160             *punc = ' ';
 161
 162     /* move to first word. */
 163     text += strspn(text, whitespace);
 164     word_count = 0;
 165
 166     /* hunt down words */
 167     while ((text < text_end) &&         /* while there might be words... */
 168            (*text != '\0') &&
 169            (word_count < innconf->keymaxwords)) {
 170
 171         /* find a word. */
 172         word_length = strcspn(text, whitespace);
 173         if (word_length == 0)
 174             break;                      /* no words left */
 175
 176         /* bookkeep to save word location, then move through text. */
 177         word[word_count++] = this_word = text;
 178         text += word_length;
 179         *(text++) = '\0';
 180         text += strspn(text, whitespace);       /* move to next word. */
 181
 182         /* 1- and 2-char words don't count, nor do excessively long ones. */
 183         if ((word_length < MIN_WORD_LENGTH) ||
 184             (word_length > MAX_WORD_LENGTH)) {
 185             word_count--;
 186             continue;
 187         }
 188
 189         /* squash to lowercase. */
 190         for (chase = this_word; *chase; chase++)
 191             if (CTYPE(isupper, *chase))
 192                 *chase = tolower(*chase);
 193     }
 194
 195     /* If there were no words, we're done. */
 196     if (word_count < 1)
 197         goto out;
 198
 199     /* Sort the words. */
 200     qsort(word, word_count, sizeof(word[0]), ptr_strcmp);
 201
 202     /* Count unique words. */
 203     distinct_words = 0;                 /* the 1st word is "pre-figured". */
 204     word_vec[0].index = 0;
 205     word_vec[0].length = strlen(word[0]);
 206     word_vec[0].count = 1;
 207
 208     for (word_index = 1;                /* we compare (N-1)th and Nth words. */
 209          word_index < word_count;
 210          word_index++) {
 211         if (strcmp(word[word_index-1], word[word_index]) == 0)
 212             word_vec[distinct_words].count++;
 213         else {
 214             distinct_words++;
 215             word_vec[distinct_words].index = word_index;
 216             word_vec[distinct_words].length = strlen(word[word_index]);
 217             word_vec[distinct_words].count = 1;
 218         }
 219     }
 220
 221     /* Sort the counts. */
 222     distinct_words++;                   /* we were off-by-1 until this. */
 223     qsort(word_vec, distinct_words, sizeof(struct word_entry), wvec_freq_cmp);
 224
 225     /* Sub-sort same-frequency words on word length. */
 226     for (last = 0, word_index = 1;      /* again, (N-1)th and Nth entries. */
 227          word_index < distinct_words;
 228          word_index++) {
 229         if (word_vec[last].count != word_vec[word_index].count) {
 230             if ((word_index - last) != 1)       /* 2+ entries to sub-sort. */
 231                 qsort(&word_vec[last], word_index - last,
 232                       sizeof(struct word_entry), wvec_length_cmp);
 233             last = word_index;
 234         }
 235     }
 236     /* do it one last time for the only-one-appearance words. */
 237     if ((word_index - last) != 1)
 238         qsort(&word_vec[last], word_index - last,
 239               sizeof(struct word_entry), wvec_length_cmp);
 240
 241     /* Scribble onto end of Keywords:. */
 242     strcpy(hc->Value + l, ",\377");             /* magic separator, 'ÿ' */
 243     for (chase = hc->Value + l + 2, word_index = 0;
 244          word_index < distinct_words;
 245          word_index++) {
 246         /* ---------------------------------------------------------------- */
 247         /* "noise" words don't count */
 248         if (regexec(&preg, word[word_vec[word_index].index], 0, NULL, 0) == 0)
 249             continue;
 250         /* ---------------------------------------------------------------- */
 251
 252         /* add to list. */
 253         *chase++ = ',';
 254         strcpy(chase, word[word_vec[word_index].index]);
 255         chase += word_vec[word_index].length;
 256
 257         if (chase - hc->Value > (innconf->keylimit - (MAX_WORD_LENGTH + 4)))
 258             break;
 259     }
 260     /* note #words we didn't get to add. */
 261     /* This code can potentially lead to a buffer overflow if the number of
 262        ignored words is greater than 100, under some circumstances.  It's
 263        temporarily disabled until fixed. */
 264     hc->Length = strlen(hc->Value);
 265
 266 out:
 267     /* We must dispose of the original strdup'd text area. */
 268     free(orig_text);
 269 }
 270
 271 #endif /* DO_KEYWORDS */