makeconcfile.c

   1 /* -*-C-*-
   2 *******************************************************************************
   3 *
   4 * File:         makeconcfile.c
   5 * RCS:          $Header: /home/matthew/cvs/bible-kjv-4.10/makeconcfile.c,v 2.7 2005/01/23 11:27:00 matthew Exp $
   6 * Description:  Generate Concordance Data File
   7 * Author:       Chip Chapin
   8 * Created:      Thu Dec 17 11:28:20 1992
   9 * Modified:     Mon Apr 26 11:14:38 1993 (Chip Chapin) chip@hpclbis
  10 * Language:     C
  11 * Package:      Text Storage Library (Bible Retrieval System)
  12 * Status:       Experimental (Do Not Distribute)
  13 *
  14 *******************************************************************************
  15 *
  16 * Revisions:
  17 *
  18 * Thu Jan  7 12:07:05 1993 (Chip Chapin) chip@hpclbis
  19 *  Revised fileheader.
  20 *  Implemented cumulative index entries to save space (24kbytes for KJV).
  21 * Wed Jan  6 10:08:12 1993 (Chip Chapin) chip@hpclbis
  22 *  Began implementation of list compression.
  23 * Wed Dec 23 13:43:11 1992 (Chip Chapin) chip@hpclbis
  24 *  Unlink tmp files when done.
  25 *******************************************************************************
  26 * $Log: makeconcfile.c,v $
  27 * Revision 2.7  2005/01/23 11:27:00  matthew
  28 * include more standard header files
  29 *
  30 * Revision 2.6  2005/01/22 18:07:28  matthew
  31 * prototype main properly
  32 *
  33 * Revision 2.5  2005/01/22 17:17:54  matthew
  34 * cast return value of sizeof
  35 *
  36 * Revision 2.4  2005/01/22 16:28:28  matthew
  37 * initialise cmpnum.
  38 *
  39 * Revision 2.3  2005/01/22 16:27:34  matthew
  40 * sort out function prototyping, and include util.h
  41 *
  42 * Revision 2.2  2003/07/26 10:02:53  matthew
  43 * fprintf("\0") is not very useful
  44 *
  45 * Revision 2.1  2003/02/01 02:39:53  matthew
  46 * make main int main ..., and accordingly return 0 at the end of it, if
  47 * all is well
  48 *
  49 * Revision 2.0  2003/01/08 15:29:52  matthew
  50 * versions collected from the net
  51 *
  52  * Revision 1.5  93/04/26  11:18:17  11:18:17  chip (Chip Chapin)
  53  * Release 4.00
  54  * Public release of portable datafile version.
  55  *
  56  * Revision 1.4  93/04/23  13:08:07  13:08:07  chip (Chip Chapin)
  57  * PORTABILITY RELEASE
  58  * This version supports portable data files, usable on machines with
  59  * differing native byte-orders.
  60  * Also, this version compiles and runs on non-HPUX systems.  It has been
  61  * tested on SunOS 4.? and ULTRIX 4.?, using SPARC and DEC 3100 hardware
  62  * respectively.  Note that the data file format has rolled again.
  63  *
  64  * Revision 1.3  93/01/07  12:18:06  12:18:06  chip (Chip Chapin)
  65  * Release 3.01: Greatly improved compression of concordance data file.
  66  *
  67 *
  68 *******************************************************************************
  69 */
  70
  71 #include <stdio.h>
  72 #include <errno.h>
  73 #include <unistd.h>
  74 #include <string.h>
  75 #include <stdlib.h>
  76 #include "tsl.h"
  77 #include "util.h"
  78
  79 #define FALSE   (0)
  80 #define TRUE    (1)
  81
  82 #define INDEXTMPF       "/tmp/mci"
  83 #define DATATMPF        "/tmp/mcd"
  84
  85 static void outerr(int n);
  86 static void print_header(void);
  87
  88 char *myname;                   /* Program Name */
  89
  90 /*
  91   Structure of Input Data ...
  92
  93     Input data is a sorted ASCII file.
  94     Each line is a record.
  95     Each record is a variable-length list of fields, separated by blanks,
  96     as follows:
  97
  98        <word> <ref> {<ref>...}
  99
 100     where <word> is a lower-case alpha string, and <ref> is an integer
 101     numeric string.  Each <word> is guaranteed to be unique.  <ref> is
 102     expected to describe a location in the subject document where
 103     <word> occurs, though this program doesn't actually care about
 104     that.
 105
 106   Structure of Output Data ...
 107
 108     Please refer to description in tsl.h.
 109  */
 110
 111 struct tsl_conc_fileheader fh;
 112
 113
 114 int main(int argc,char **argv)
 115 /*----------------------------------------------------------------------
 116 |   NAME:
 117 |       main
 118 |
 119 |   ALGORITHM:
 120 |       Main Program.
 121 |
 122 |   HISTORY:
 123 |       921221 cc Initial creation.
 124 |
 125 \*----------------------------------------------------------------------*/
 126 {
 127     FILE        *outfp, *indexfp, *datafp;
 128     int         ref;            /* NOT a ref_t, because it must be signed */
 129     ref_t       rbuf[SELECTSZ];
 130     unsigned char obuf[SELECTSZ*sizeof(ref_t)];
 131     int         n, oinx, cnt;
 132     unsigned char bhi, blo;
 133     int         refs_left;
 134     int         no_comp;
 135     int         cmpnum=0;
 136     int         base, curbase;
 137     int         m_n, m_ref, m_cmpnum, m_cmpsize;
 138     file_ptr_t  word_index, data_index;
 139     int         entry_size;
 140     short int   this_index;
 141     Short_Univ_Int indextmp;
 142 #define WRDSZ    100
 143     char word[WRDSZ];
 144
 145     myname = *argv++; argc--;   /* Program name */
 146
 147     if (argc < 1) {
 148         fprintf( stderr, "%s: Missing output file name\n", myname );
 149         fprintf( stderr, "Usage: %s datafile\n", myname );
 150         exit( 1 );
 151     }
 152
 153     outfp = fopen( *argv, "w" );
 154     if (outfp == NULL) {
 155         fprintf( stderr, "%s: Cannot open output file %s\n", myname, *argv );
 156         exit( 1 );
 157     }
 158     indexfp = fopen( INDEXTMPF, "w+" );
 159     datafp = fopen( DATATMPF, "w+" );
 160     if ((indexfp == NULL) || (datafp == NULL)) {
 161         fprintf( stderr, "%s: Cannot open temp files.\n", myname );
 162         exit( 1 );
 163     }
 164
 165     /* Initialize the header and write it out... */
 166     fh.magic[0] = TSL_CONCMAGIC1;
 167     fh.magic[1] = TSL_CONCMAGIC2;
 168     fh.version[0] = TSL_CONCFVERSION1;
 169     fh.version[1] = TSL_CONCFVERSION2;
 170     strncpy( fh.name, "KJV Concordance", TSL_DESCRSZ );
 171     univ_assign(fh.word_ptr, sizeof( fh ));
 172     univ_assign(fh.word_cnt, 0);
 173     univ_assign(fh.index_ptr, 0);
 174     univ_assign(fh.data_ptr, 0);
 175     fwrite( &fh, sizeof(fh), 1, outfp );
 176
 177
 178     /* Process input data.
 179        Each line consists of a word, followed by a list of numeric
 180        references where the word occurs.  Separated by blanks.
 181
 182        We will write the strings for the word list into the output file
 183        as we go, meanwhile counting the words and building both the reference
 184        index and the reference data pool in temp files.
 185
 186        To save space, the reference index entries are not actually pointers
 187        into the reference data pool, but are the SIZE of the entry's ref data
 188        pool.
 189
 190        So the offset in the ref data pool for entry N is the
 191        sum over i=0..N-1 of index(i).
 192
 193        To complicate things more, there is a special case when an
 194        entry N has only a single ref.  In that case, the ref is NOT
 195        entered into the ref data pool but the ref itself is stored as
 196        index(N).  It is negated to distinguish it from a regular index
 197        entry.
 198
 199      */
 200     word_index = 0;     /* The index of each word, 0..N */
 201     data_index = 0;     /* The offset in the ref data pool of current entry */
 202     while (scanf( "%s", word) > 0) {
 203         /* Append string to word list */
 204         if ((n=fputs( word, outfp )) <= 0)
 205             outerr(n);
 206         putc( 0, outfp );
 207
 208         /* Start reading references */
 209         cnt = 0;
 210         while (scanf( "%d", &n) > 0) {
 211             /* Read all refs into buffer */
 212             rbuf[cnt++] = (ref_t) n;
 213         }
 214         if (cnt <= 0) {
 215             /* this should never happen */
 216             fprintf( stderr, "%s: Bad input format.  No refs for %s\n",
 217                     myname, word );
 218         }
 219
 220         entry_size = 0;         /* size (bytes) of current ref data entry */
 221         if (cnt == 1) {
 222             /* Special handling for this case: word has a SINGLE REFERENCE.
 223                Instead of putting the ref into the data pool, just keep it
 224                in the index.  Encode it by negating it.
 225              */
 226             this_index = -rbuf[0];
 227         } else {
 228             /* Write reference list to reference data pool */
 229             for (n=oinx=0, refs_left=cnt; refs_left--; n++) {
 230                 ref = rbuf[n];
 231                 no_comp = TRUE;
 232                 if (refs_left && (rbuf[n+1]-ref <= 16)) {
 233                     /* We can compress at least one (though possibly to
 234                        no advantage).  How many more might we compress?
 235                      */
 236                     cmpnum=1;
 237                     while ((cmpnum < refs_left) &&
 238                            (rbuf[n+cmpnum+1]-rbuf[n+cmpnum] <= 16)) {
 239                         cmpnum++;
 240                     }
 241
 242                     /*
 243                        How much will we really save?
 244                        Model the actual algorithm and compare...
 245                      */
 246                     m_cmpsize = 0;
 247                     curbase = ref;
 248                     m_n = n;
 249                     m_ref = rbuf[++m_n]-curbase-1;
 250                     m_cmpnum = cmpnum-1;
 251                     while (m_ref >= 0) {
 252                         /* For each byte in bitmap... */
 253                         while ((m_ref < 8) && (m_ref >= 0)) {
 254                             /* For each ref that fits in this byte... */
 255                             if (m_cmpnum > 0) {
 256                                 m_cmpnum--;
 257                                 m_ref = rbuf[++m_n]-curbase-1;
 258                             } else {
 259                                 m_ref = -1;     /* term. flag, both loops */
 260                             }
 261                         }
 262                         m_cmpsize++;
 263                         curbase += 8;
 264                         m_ref -= 8;
 265                     }
 266                     m_cmpsize += 2;             /* Add terminator size */
 267
 268                     /* Well, did it pay off? */
 269                     no_comp = (m_cmpsize > (cmpnum*(int)sizeof(ref_t)));
 270                 }
 271                 if (no_comp) {
 272                     /* No compression */
 273                     bhi = (ref >> 8);
 274                     blo = ref & 0x00ff;
 275                     obuf[oinx++] = bhi;
 276                     obuf[oinx++] = blo;
 277                 } else {
 278                     /* Do the compression.
 279                        Format:
 280                          <base address><bitmap>
 281                        Where <base address> is the NEGATED reference to the
 282                        line BEFORE the start of the bitmap refs.
 283                      */
 284                     /* Write base address */
 285                     base = ref;
 286                     ref = -ref;
 287                     bhi = (ref >> 8);
 288                     blo = ref & 0x00ff;
 289                     obuf[oinx++] = bhi;
 290                     obuf[oinx++] = blo;
 291
 292                     /* Write bitmap.
 293                          ref is always with respect to curbase.
 294                          It is the number of verses, MINUS ONE, that
 295                          separate them.  The MINUS ONE is so the
 296                          left-shift works correctly.
 297                      */
 298                     curbase = base;
 299                     ref = rbuf[++n]-curbase-1;
 300                     refs_left -= cmpnum;
 301                     cmpnum--;
 302                     while (ref >= 0) {
 303                         /* For each byte in bitmap... */
 304                         blo = 0;
 305                         while ((ref < 8) && (ref >= 0)) {
 306                             /* For each ref that fits in this byte... */
 307                             blo |= 0x01 << ref;
 308                             if (cmpnum > 0) {
 309                                 cmpnum--;
 310                                 ref = rbuf[++n]-curbase-1;
 311                             } else {
 312                                 ref = -1;       /* term. flag, both loops */
 313                             }
 314                         }
 315                         obuf[oinx++] = blo;
 316                         curbase += 8;
 317                         ref -= 8;
 318                     }
 319
 320                     /* Write TWO terminator bytes */
 321                     /* Might be better to use ONE, and change the requirement
 322                        to being within 8 lines, not 16 */
 323                     obuf[oinx++] = 0;
 324                     obuf[oinx++] = 0;
 325                 } /* compression */
 326             } /* for */
 327             if ((n=fwrite( obuf, 1, oinx, datafp )) <= 0)
 328                 outerr(n);
 329
 330             entry_size = oinx;
 331             if (entry_size > 32767) {
 332                 fprintf( stderr, "%s: Too many entries for \"%s\"!\n",
 333                         myname, word );
 334                 exit(1);
 335             }
 336             this_index = entry_size;
 337         } /* write ref list */
 338
 339         /* Write the index entry for this word */
 340         shortuniv_assign( indextmp, this_index );
 341         if ((n=fwrite( indextmp, sizeof(Short_Univ_Int), 1, indexfp )) <= 0)
 342             outerr(n);
 343
 344         data_index += entry_size;
 345         word_index++;
 346     }
 347
 348     /* It makes searching easier if we guarantee an ending string */
 349     if ((n=fprintf( outfp, "~" )) <= 0)
 350         outerr(n);
 351     shortuniv_assign( indextmp, 0 );
 352     if ((n=fwrite( indextmp, sizeof(Short_Univ_Int), 1, indexfp )) <= 0)
 353         outerr(n);
 354
 355     /* OK, now finish putting the output file together. */
 356     /* First, another header field */
 357     univ_assign(fh.word_cnt, word_index);
 358
 359     /* Now copy the reference index from temp file to output file */
 360     univ_assign(fh.index_ptr, ftell( outfp ));
 361     rewind( indexfp );
 362     while (fread( obuf, sizeof(Short_Univ_Int), 1, indexfp ) > 0) {
 363         fwrite( obuf, sizeof(Short_Univ_Int), 1, outfp );
 364     }
 365
 366     /* Then copy the reference data from temp file to output file */
 367     univ_assign(fh.data_ptr, ftell( outfp ));
 368     rewind( datafp );
 369     while (fread( obuf, 1, 1, datafp ) > 0) {
 370         fwrite( obuf, 1, 1, outfp );
 371     }
 372
 373     /* Finally, write out the updated file header */
 374     rewind( outfp );
 375     fwrite( &fh, sizeof(fh), 1, outfp );
 376
 377     /* Prove that we've been working hard... */
 378     print_header();
 379
 380     fclose(outfp);
 381     fclose(indexfp);
 382     fclose(datafp);
 383     unlink( INDEXTMPF );
 384     unlink( DATATMPF );
 385     return(0);
 386 } /* main */
 387
 388
 389 static void outerr(int n)
 390 {
 391     fprintf( stderr, "%s: File I/O error #%d, %d\n", __FILE__, n, errno );
 392 }
 393
 394
 395 static void print_header(void)
 396 {
 397     printf( "Concordance data file:\n" );
 398     printf( "  Version:  %c%c\n", fh.version[0], fh.version[1] );
 399     printf( "  Name:     %s\n", fh.name );
 400     printf( "  Contents: %d words\n", univ2int(fh.word_cnt) );
 401     printf( "  Word list at file offset %d\n", univ2int(fh.word_ptr) );
 402     printf( "  Index at file offset %d\n", univ2int(fh.index_ptr) );
 403     printf( "  Data at file offset %d\n", univ2int(fh.data_ptr) );
 404 }