2 *******************************************************************************
5 * RCS: $Header: /home/matthew/cvs/bible-kjv-4.10/makeconcfile.c,v 2.7 2005/01/23 11:27:00 matthew Exp $
6 * Description: Generate Concordance Data File
8 * Created: Thu Dec 17 11:28:20 1992
9 * Modified: Mon Apr 26 11:14:38 1993 (Chip Chapin) chip@hpclbis
11 * Package: Text Storage Library (Bible Retrieval System)
12 * Status: Experimental (Do Not Distribute)
14 *******************************************************************************
18 * Thu Jan 7 12:07:05 1993 (Chip Chapin) chip@hpclbis
20 * Implemented cumulative index entries to save space (24kbytes for KJV).
21 * Wed Jan 6 10:08:12 1993 (Chip Chapin) chip@hpclbis
22 * Began implementation of list compression.
23 * Wed Dec 23 13:43:11 1992 (Chip Chapin) chip@hpclbis
24 * Unlink tmp files when done.
25 *******************************************************************************
26 * $Log: makeconcfile.c,v $
27 * Revision 2.7 2005/01/23 11:27:00 matthew
28 * include more standard header files
30 * Revision 2.6 2005/01/22 18:07:28 matthew
31 * prototype main properly
33 * Revision 2.5 2005/01/22 17:17:54 matthew
34 * cast return value of sizeof
36 * Revision 2.4 2005/01/22 16:28:28 matthew
39 * Revision 2.3 2005/01/22 16:27:34 matthew
40 * sort out function prototyping, and include util.h
42 * Revision 2.2 2003/07/26 10:02:53 matthew
43 * fprintf("\0") is not very useful
45 * Revision 2.1 2003/02/01 02:39:53 matthew
46 * make main int main ..., and accordingly return 0 at the end of it, if
49 * Revision 2.0 2003/01/08 15:29:52 matthew
50 * versions collected from the net
52 * Revision 1.5 93/04/26 11:18:17 11:18:17 chip (Chip Chapin)
54 * Public release of portable datafile version.
56 * Revision 1.4 93/04/23 13:08:07 13:08:07 chip (Chip Chapin)
58 * This version supports portable data files, usable on machines with
59 * differing native byte-orders.
60 * Also, this version compiles and runs on non-HPUX systems. It has been
61 * tested on SunOS 4.? and ULTRIX 4.?, using SPARC and DEC 3100 hardware
62 * respectively. Note that the data file format has rolled again.
64 * Revision 1.3 93/01/07 12:18:06 12:18:06 chip (Chip Chapin)
65 * Release 3.01: Greatly improved compression of concordance data file.
68 *******************************************************************************
82 #define INDEXTMPF "/tmp/mci"
83 #define DATATMPF "/tmp/mcd"
85 static void outerr(int n);
86 static void print_header(void);
88 char *myname; /* Program Name */
91 Structure of Input Data ...
93 Input data is a sorted ASCII file.
94 Each line is a record.
95 Each record is a variable-length list of fields, separated by blanks,
98 <word> <ref> {<ref>...}
100 where <word> is a lower-case alpha string, and <ref> is an integer
101 numeric string. Each <word> is guaranteed to be unique. <ref> is
102 expected to describe a location in the subject document where
103 <word> occurs, though this program doesn't actually care about
106 Structure of Output Data ...
108 Please refer to description in tsl.h.
111 struct tsl_conc_fileheader fh;
114 int main(int argc,char **argv)
115 /*----------------------------------------------------------------------
123 | 921221 cc Initial creation.
125 \*----------------------------------------------------------------------*/
127 FILE *outfp, *indexfp, *datafp;
128 int ref; /* NOT a ref_t, because it must be signed */
129 ref_t rbuf[SELECTSZ];
130 unsigned char obuf[SELECTSZ*sizeof(ref_t)];
132 unsigned char bhi, blo;
137 int m_n, m_ref, m_cmpnum, m_cmpsize;
138 file_ptr_t word_index, data_index;
140 short int this_index;
141 Short_Univ_Int indextmp;
145 myname = *argv++; argc--; /* Program name */
148 fprintf( stderr, "%s: Missing output file name\n", myname );
149 fprintf( stderr, "Usage: %s datafile\n", myname );
153 outfp = fopen( *argv, "w" );
155 fprintf( stderr, "%s: Cannot open output file %s\n", myname, *argv );
158 indexfp = fopen( INDEXTMPF, "w+" );
159 datafp = fopen( DATATMPF, "w+" );
160 if ((indexfp == NULL) || (datafp == NULL)) {
161 fprintf( stderr, "%s: Cannot open temp files.\n", myname );
165 /* Initialize the header and write it out... */
166 fh.magic[0] = TSL_CONCMAGIC1;
167 fh.magic[1] = TSL_CONCMAGIC2;
168 fh.version[0] = TSL_CONCFVERSION1;
169 fh.version[1] = TSL_CONCFVERSION2;
170 strncpy( fh.name, "KJV Concordance", TSL_DESCRSZ );
171 univ_assign(fh.word_ptr, sizeof( fh ));
172 univ_assign(fh.word_cnt, 0);
173 univ_assign(fh.index_ptr, 0);
174 univ_assign(fh.data_ptr, 0);
175 fwrite( &fh, sizeof(fh), 1, outfp );
178 /* Process input data.
179 Each line consists of a word, followed by a list of numeric
180 references where the word occurs. Separated by blanks.
182 We will write the strings for the word list into the output file
183 as we go, meanwhile counting the words and building both the reference
184 index and the reference data pool in temp files.
186 To save space, the reference index entries are not actually pointers
187 into the reference data pool, but are the SIZE of the entry's ref data
190 So the offset in the ref data pool for entry N is the
191 sum over i=0..N-1 of index(i).
193 To complicate things more, there is a special case when an
194 entry N has only a single ref. In that case, the ref is NOT
195 entered into the ref data pool but the ref itself is stored as
196 index(N). It is negated to distinguish it from a regular index
200 word_index = 0; /* The index of each word, 0..N */
201 data_index = 0; /* The offset in the ref data pool of current entry */
202 while (scanf( "%s", word) > 0) {
203 /* Append string to word list */
204 if ((n=fputs( word, outfp )) <= 0)
208 /* Start reading references */
210 while (scanf( "%d", &n) > 0) {
211 /* Read all refs into buffer */
212 rbuf[cnt++] = (ref_t) n;
215 /* this should never happen */
216 fprintf( stderr, "%s: Bad input format. No refs for %s\n",
220 entry_size = 0; /* size (bytes) of current ref data entry */
222 /* Special handling for this case: word has a SINGLE REFERENCE.
223 Instead of putting the ref into the data pool, just keep it
224 in the index. Encode it by negating it.
226 this_index = -rbuf[0];
228 /* Write reference list to reference data pool */
229 for (n=oinx=0, refs_left=cnt; refs_left--; n++) {
232 if (refs_left && (rbuf[n+1]-ref <= 16)) {
233 /* We can compress at least one (though possibly to
234 no advantage). How many more might we compress?
237 while ((cmpnum < refs_left) &&
238 (rbuf[n+cmpnum+1]-rbuf[n+cmpnum] <= 16)) {
243 How much will we really save?
244 Model the actual algorithm and compare...
249 m_ref = rbuf[++m_n]-curbase-1;
252 /* For each byte in bitmap... */
253 while ((m_ref < 8) && (m_ref >= 0)) {
254 /* For each ref that fits in this byte... */
257 m_ref = rbuf[++m_n]-curbase-1;
259 m_ref = -1; /* term. flag, both loops */
266 m_cmpsize += 2; /* Add terminator size */
268 /* Well, did it pay off? */
269 no_comp = (m_cmpsize > (cmpnum*(int)sizeof(ref_t)));
278 /* Do the compression.
280 <base address><bitmap>
281 Where <base address> is the NEGATED reference to the
282 line BEFORE the start of the bitmap refs.
284 /* Write base address */
293 ref is always with respect to curbase.
294 It is the number of verses, MINUS ONE, that
295 separate them. The MINUS ONE is so the
296 left-shift works correctly.
299 ref = rbuf[++n]-curbase-1;
303 /* For each byte in bitmap... */
305 while ((ref < 8) && (ref >= 0)) {
306 /* For each ref that fits in this byte... */
310 ref = rbuf[++n]-curbase-1;
312 ref = -1; /* term. flag, both loops */
320 /* Write TWO terminator bytes */
321 /* Might be better to use ONE, and change the requirement
322 to being within 8 lines, not 16 */
327 if ((n=fwrite( obuf, 1, oinx, datafp )) <= 0)
331 if (entry_size > 32767) {
332 fprintf( stderr, "%s: Too many entries for \"%s\"!\n",
336 this_index = entry_size;
337 } /* write ref list */
339 /* Write the index entry for this word */
340 shortuniv_assign( indextmp, this_index );
341 if ((n=fwrite( indextmp, sizeof(Short_Univ_Int), 1, indexfp )) <= 0)
344 data_index += entry_size;
348 /* It makes searching easier if we guarantee an ending string */
349 if ((n=fprintf( outfp, "~" )) <= 0)
351 shortuniv_assign( indextmp, 0 );
352 if ((n=fwrite( indextmp, sizeof(Short_Univ_Int), 1, indexfp )) <= 0)
355 /* OK, now finish putting the output file together. */
356 /* First, another header field */
357 univ_assign(fh.word_cnt, word_index);
359 /* Now copy the reference index from temp file to output file */
360 univ_assign(fh.index_ptr, ftell( outfp ));
362 while (fread( obuf, sizeof(Short_Univ_Int), 1, indexfp ) > 0) {
363 fwrite( obuf, sizeof(Short_Univ_Int), 1, outfp );
366 /* Then copy the reference data from temp file to output file */
367 univ_assign(fh.data_ptr, ftell( outfp ));
369 while (fread( obuf, 1, 1, datafp ) > 0) {
370 fwrite( obuf, 1, 1, outfp );
373 /* Finally, write out the updated file header */
375 fwrite( &fh, sizeof(fh), 1, outfp );
377 /* Prove that we've been working hard... */
389 static void outerr(int n)
391 fprintf( stderr, "%s: File I/O error #%d, %d\n", __FILE__, n, errno );
395 static void print_header(void)
397 printf( "Concordance data file:\n" );
398 printf( " Version: %c%c\n", fh.version[0], fh.version[1] );
399 printf( " Name: %s\n", fh.name );
400 printf( " Contents: %d words\n", univ2int(fh.word_cnt) );
401 printf( " Word list at file offset %d\n", univ2int(fh.word_ptr) );
402 printf( " Index at file offset %d\n", univ2int(fh.index_ptr) );
403 printf( " Data at file offset %d\n", univ2int(fh.data_ptr) );