2 *******************************************************************************
5 * RCS: $Header: /home/matthew/cvs/bible-kjv-4.10/tsl.c,v 2.10 2005/01/22 17:49:03 matthew Exp $
6 * Description: Text Storage Library
7 * Author: Chip Chapin, Hewlett Packard Company
8 * Created: Thu Aug 24 15:37:16 1989
9 * Modified: Mon Apr 26 11:16:05 1993 (Chip Chapin) chip@hpclbis
11 * Package: Bible Retrieval System
12 * Status: Experimental (Do Not Distribute)
14 *******************************************************************************
18 * Fri Apr 23 10:14:37 1993 (Chip Chapin) chip@hpclbis
19 * Revised tsl_init for new portable data file headers.
20 * Rewrite shift-and-or expressions to account for Ultrix C oddities.
21 * Thu Apr 22 11:38:47 1993 (Chip Chapin) chip@hpclbis
22 * Fix incorrect cast to cf_index.
23 * Thu Mar 4 12:11:24 1993 (Chip Chapin) chip@hpclbis
24 * Added SEEK_SET definition for non-SysV systems.
25 * Wed Jan 6 10:05:43 1993 (Chip Chapin) chip@hpclbis
26 * Added compression support for concordance lists.
27 * Mon Jan 4 11:21:50 1993 (Chip Chapin) chip@hpclbis
28 * Added range parameters to tsl_scan_concordance.
29 * Thu Dec 24 11:32:22 1992 (Chip Chapin) chip@hpclbis
30 * Tweak a couple of things to eliminate compile warnings.
31 * Mon Dec 21 13:40:54 1992 (Chip Chapin) chip@hpclbis
32 * Made findfile a separate function in util.c
33 *******************************************************************************
35 * Revision 2.10 2005/01/22 17:49:03 matthew
36 * remove spurious definition of malloc and strtok
38 * Revision 2.9 2005/01/22 16:54:13 matthew
39 * another cast to make things explicit
41 * Revision 2.8 2005/01/22 00:25:08 matthew
44 * Revision 2.7 2005/01/21 19:47:07 matthew
45 * include cmp.h which prototypes the compression routines we call
47 * Revision 2.6 2003/07/26 11:57:02 matthew
48 * tidy up of code (__attribute__ ((noreturn)), remove unused variables, etc
50 * Revision 2.5 2003/07/26 11:44:55 matthew
51 * add function prototypes to the tsl header file (and add some const modifiers)
53 * Revision 2.4 2003/07/26 09:25:05 matthew
54 * Move tsl_error declaration to tsl.h
56 * Revision 2.3 2003/07/26 09:18:06 matthew
57 * use starg.h-style variable arguments. Also use gccs format checking
59 * Revision 2.2 2003/01/08 19:10:24 matthew
60 * correct the .win assignments to be 0 not null, since they are ints not pointers
62 * Revision 2.1 2003/01/08 15:50:53 matthew
63 * applied debian patch
65 * Revision 2.0 2003/01/08 15:29:52 matthew
66 * versions collected from the net
68 * Revision 1.9 93/04/26 11:18:14 11:18:14 chip (Chip Chapin)
70 * Public release of portable datafile version.
72 * Revision 1.8 93/04/23 13:08:02 13:08:02 chip (Chip Chapin)
74 * This version supports portable data files, usable on machines with
75 * differing native byte-orders.
76 * Also, this version compiles and runs on non-HPUX systems. It has been
77 * tested on SunOS 4.? and ULTRIX 4.?, using SPARC and DEC 3100 hardware
78 * respectively. Note that the data file format has rolled again.
80 * Revision 1.7 93/01/07 12:17:29 12:17:29 chip (Chip Chapin)
81 * Release 3.01: Greatly improved compression of concordance data file.
83 * Revision 1.6 93/01/04 16:20:59 16:20:59 chip (Chip Chapin)
84 * Release 2.1, implements ?in and ?or commands.
86 * Revision 1.5 92/12/24 11:37:10 11:37:10 chip (Chip Chapin)
87 * Minor release 2.04a, fixes certain compile warnings.
89 * Revision 1.4 92/12/22 11:28:57 11:28:57 chip (Chip Chapin)
90 * Minor release 2.01 -- fix a couple of bugs.
92 * Revision 1.3 92/12/21 20:00:49 20:00:49 chip (Chip Chapin)
93 * Release 2.0. This release adds the concordance, and some small fixes.
95 * Revision 1.2 89/09/14 20:33:52 20:33:52 chip (Chip Chapin)
96 * Release 1-2. Supports -f and -l options for formatting the output.
97 * Updates primarily brl.c, bible.c, and bible.1.
99 * Revision 1.1 89/09/05 17:49:19 17:49:19 chip (Chip Chapin)
105 /*----------------------------------------------------------------------
110 | This file implements the library of routines that are
111 | dependent on the storage structure of the text database.
115 | Return text for a particular range of lines.
117 | Write text to stdout instead of returning a buffer.
118 | tsl_scan_concordance
119 | Return list of references for a particular target.
126 | 890824 cc Extracted storage-dependent functions from brl.c
128 \*----------------------------------------------------------------------*/
134 /* #include <search.h> */
141 # endif /* SEEK_SET */
147 static char rcs_ident[] __attribute__ ((unused)) ="@(#)$Header: /home/matthew/cvs/bible-kjv-4.10/tsl.c,v 2.10 2005/01/22 17:49:03 matthew Exp $";
149 static char rcs_ident[]="@(#)$Header: /home/matthew/cvs/bible-kjv-4.10/tsl.c,v 2.10 2005/01/22 17:49:03 matthew Exp $";
152 FILE *tfp; /* Text data file pointer */
153 FILE *cfp; /* Concordance data file pointer */
154 struct tsl_conc_fileheader cfh; /* Concordance file header */
155 char *cf_words; /* Concordance string (word) buffer */
156 short int *cf_index; /* Concordance index */
157 int tsl_wsize; /* Window size (bytes) */
158 int tsl_wnum; /* Number of windows */
159 file_ptr_t *tsl_wtable=NULL; /* Table of window offsets in data file */
161 /* buffer structures.
162 We maintain a doubly linked list of uncompressed text buffers, sorted in
166 struct buffer_rec *prev, *next; /* doubly linked list */
167 int win; /* number of associated window */
168 char *bufferp; /* the buffer */
171 struct buffer_rec tsl_firstbuffer; /* ptr to first buffer. */
172 struct buffer_rec tsl_lastbuffer; /* take a guess... */
173 struct buffer_rec **tsl_wbuffers=NULL; /* table for associating a window with
174 a particular buffer. Indexed by
175 window number, the table yields a
176 pointer to a buffer_rec. */
178 char *tsl_cmpbuffer=NULL; /* Global buffer for compressed text */
179 int tsl_numbuffs; /* Count how many buffers active */
180 int tsl_maxbuffs; /* Maximum number of buffers we're allowed */
181 int tsl_maxbuffusage=0x100000; /* Max buffer mem usage (bytes) */
184 void tsl_error(const int fatal, const char *format, ...)
185 /*----------------------------------------------------------------------
190 | Report an error specific to the TSL library.
192 | fatal TRUE if the error should cause an exit.
193 | va_alist Variable argument list for printing the error
199 \*----------------------------------------------------------------------*/
203 va_start(ap, format);
205 vfprintf(stderr, format, ap);
214 int tsl_scan_concordance(const char *target, ref_t *sbuf, ref_t range_start,
216 /*----------------------------------------------------------------------
218 | tsl_scan_concordance
221 | Read concordance data file, searching for a specific
222 | target word. Return list of references, optionally
223 | limited to those within a specific range.
225 | target Target string to search for
226 | sbuf Output buffer for references
227 | range_start Starting reference. If zero, then return
229 | range_end Ending reference.
231 | RETURN VALUE: number of matches found
234 | 921217 cc Initial creation.
235 | 921221 cc Revised to use new concordance file format.
236 | 930104 cc Added range_start, range_end params.
238 \*----------------------------------------------------------------------*/
244 unsigned char mapbyte;
245 file_ptr_t inx_start;
246 unsigned char tbuf[SELECTSZ*sizeof(ref_t)];
249 tsl_error( FALSE, "(No concordance data file)" );
253 /* Search through concordance string buffer */
254 /* Note that the last string is guaranteed to be "~" */
257 inx_start = 0; /* Keep track of index into ref data pool */
258 /* See makeconcfile.c */
259 while ((n=strcmp(word, target)) < 0) {
260 /* This isn't it, but keep looking... */
261 while (*word++) /* advance to next word */
263 rsize = cf_index[indx];
264 inx_start += rsize < 0 ? 0 : rsize;
265 indx++; /* indx is count of words */
268 /* This isn't it, and we've passed where it would be */
273 rsize = cf_index[indx];
275 /* Special case 1: singletons
276 If a word has only a single ref, then the ref is
277 negated and put in the word's index entry.
280 if (!range_start || (range_start <= *sbuf && *sbuf <= range_end))
286 /* Read all the refs for this word */
287 /* Where do they end? */
288 fseek( cfp, (int)(inx_start + univ2int(cfh.data_ptr)), SEEK_SET );
289 fread( tbuf, 1, rsize, cfp );
291 /* Process the ref list.
292 Expand compressed references.
293 If a range has been specified, exclude refs that are out of it.
295 for (n=i=0; i < rsize; i++) {
296 if (tbuf[i] > 0x7f) {
297 /* This begins a compressed entry */
298 /* First get the base ref, which has been negated */
299 ref = (tbuf[i] << 8); /* Do this in two parts. */
300 ref |= tbuf[++i]; /* Ultrix C goofs otherwise. */
301 curbase = ref = -ref;
303 if (ref > range_end) break;
304 if (range_start > ref) {
314 /* Two zero bytes means end of map */
315 while (mapbyte | tbuf[i+1]) {
316 /* Extract refs from mapbyte */
317 for (j=0; j<8; j++) {
318 if (mapbyte & (0x01 <<j)) {
321 if (ref > range_end) return( n );
322 if (range_start > ref) continue;
330 i++; /* Advance to second zero byte */
333 /* This is a simple reference.
334 References are two-byte integers, in hi,lo byte order.
336 ref = (tbuf[i] << 8); /* Do this in two parts. */
337 ref |= tbuf[++i]; /* Ultrix C goofs otherwise. */
339 if (ref > range_end) break;
340 if (range_start > ref) continue;
346 /* Return count of entries */
348 } /* tsl_scan_concordance */
352 int tsl_gettext(const int vn, const int vc, char *vb, const int vbsize )
353 /*----------------------------------------------------------------------
358 | Stuff buffer "vb" with text of line number "vn" and the
359 | "vc-1" following lines, but no more than "vbsize" (buffer
362 | Returns the size (characters) of the text, *not* counting
363 | the terminating null.
366 | 890114 cc Initial implementation using simple plain text file.
367 | 890829 cc Updated to think about buffer size limits.
369 \*----------------------------------------------------------------------*/
373 vstart = line_locator[vn];
374 vsize = line_locator[ vn+vc ] - vstart;
376 vsize = vbsize-1; /* Leave room for trailing null */
377 return tsl_textread( vstart, vsize, vb );
382 int tsl_printtext(const int vn, const int vc)
383 /*----------------------------------------------------------------------
388 | Write text of line number "vn" and the "vc-1" following
391 | Returns the number of characters written.
394 | 890902 cc Creation from tsl_gettext.
396 \*----------------------------------------------------------------------*/
400 vstart = line_locator[vn];
401 vsize = line_locator[ vn+vc ] - vstart;
403 return tsl_textread( vstart, vsize, NULL );
404 } /* tsl_printtext */
408 int tsl_textread(int start, const int vsize, char *vb)
409 /*----------------------------------------------------------------------
414 | Get text starting at absolute byte location "start", and
415 | continuing for "vsize" bytes. If "vb" is NULL, then write
416 | the text to stdout, otherwise put it into the buffer
417 | pointed to by "vb" and append a null (\0).
419 | Returns the size (characters) of the text, *not* counting
420 | the terminating null.
423 | 890824 cc Rewritten to handle windowed compressed files.
424 | 890829 cc Added all the buffer handling -- used to throw
425 | them away each time.
426 | 890902 cc Added stdout option.
427 | 890904 cc Iterate on multiple windows, instead of recursing.
429 \*----------------------------------------------------------------------*/
432 /* here's the version that works with a plain text file */
433 if (fseek( tfp, (int)start, 0 ) == EOF) {
434 tsl_error( TRUE, "Cannot seek" );
437 if (fread( vb, 1, vsize, tfp ) != vsize) {
438 tsl_error( TRUE, "Short read" );
443 /* here's the version that works with a windowed compressed file */
444 /* We use the starting byte in the original file ("start"), and
445 the window size ("tsl_wsize") to determine which window we need.
446 Using "tsl_wtable", we know where the compressed window starts in the
447 file, and how big it is. Read the compressed window and uncompress it.
448 Now we can locate the start of the text and return it.
450 int window; /* current window number */
451 int bstart; /* starting byte relative to beginning
452 of uncompressed window */
453 file_ptr_t wstart; /* starting position of compressed
454 window within the data file */
455 int cmpwsize; /* size of compressed window within the
457 int bytes_remaining; /* Number of bytes yet to be done */
458 int size; /* bytes needed from current window */
459 char *uncbuf; /* buffer for uncompressed data */
460 struct buffer_rec *brp; /* current buffer rec */
461 char *cp, *ep; /* Handy pointers */
463 bytes_remaining = vsize;
464 while (bytes_remaining > 0) {
465 window = start / tsl_wsize; /* which window? */
466 if (window >= tsl_wnum) /* Yikes! Off the end! */
467 tsl_error( TRUE, "Window %d out of range 0-%d", window, tsl_wnum );
468 bstart = start % tsl_wsize; /* where in [uncompressed] window? */
469 if (bstart+bytes_remaining > tsl_wsize)
470 /* Request crosses boundary into next window */
471 size = tsl_wsize-bstart;
473 /* Request can be completed in current window */
474 size = bytes_remaining;
477 /* Notes on buffer handling ...
478 Three main cases are recognized:
479 1) The buffer for this window is already present.
480 2) It's not present and we can allocate a new buffer.
481 3) It's not present and we reuse an existing buffer.
484 if (tsl_wbuffers[window] != NULL) {
485 /* Buffer is already present */
486 brp = tsl_wbuffers[window];
487 uncbuf = brp->bufferp;
489 /* Unlink the buffer from the list.
490 We completely unlink it so the code for putting it back in
491 can be the same regardless of whether or not this is a new buffer.
493 brp->prev->next = brp->next;
494 brp->next->prev = brp->prev;
497 wstart = tsl_wtable[window]; /* window start in file */
498 cmpwsize = tsl_wtable[window+1]
499 - wstart; /*Size of compressed window*/
500 if (fseek( tfp, (int)wstart, 0 ) == EOF) {
501 tsl_error( TRUE, "Bad seek" );
503 if (cmpwsize > tsl_wsize) {
504 /* This should never happen */
505 tsl_error( TRUE, "Compressed window bigger than window size!");
507 if ((int)fread( tsl_cmpbuffer, 1, cmpwsize, tfp ) != cmpwsize) {
508 tsl_error( TRUE, "Short read" );
511 /* Need a new buffer */
512 if ( tsl_numbuffs >= tsl_maxbuffs ) {
513 /* We're at the limit -- need to recycle one
514 Grab the buffer at the end of the LRU list.
516 brp = tsl_lastbuffer.prev; /* there it is */
517 brp->prev->next = brp->next; /* unlink it */
518 brp->next->prev = brp->prev;
519 uncbuf = brp->bufferp;
520 tsl_wbuffers[brp->win] = NULL; /* former owner loses */
522 /* allocate a new buffer */
523 tsl_wbuffers[window] = brp =
524 (struct buffer_rec *)malloc( sizeof(struct buffer_rec) );
526 tsl_error( TRUE, "Bad malloc" );
527 brp->bufferp = uncbuf = malloc( tsl_wsize );
529 tsl_error( TRUE, "Bad malloc" );
532 tsl_wbuffers[window] = brp;
535 if (cmp_decompress( (unsigned char*)tsl_cmpbuffer, (unsigned char*)uncbuf, cmpwsize ) != tsl_wsize) {
536 /* Last window is probably small. Just ignore its size */
537 if (window != (tsl_wnum-1)) {
539 tsl_error( TRUE, "Bad decompression, result is wrong size" );
542 } /* else we read and decompressed the window */
544 /* Insert this buffer at head of list */
545 brp->next = tsl_firstbuffer.next;
546 brp->next->prev = brp;
547 tsl_firstbuffer.next = brp;
548 brp->prev = &tsl_firstbuffer;
550 /* If we've gotten this far, we have a nice decompressed buffer to use */
551 cp = &uncbuf[bstart]; /* starting address */
554 while (cp != ep) putchar( *cp++ );
556 memcpy( vb, cp, size );
559 bytes_remaining -= size;
562 if (vb != NULL) *vb = '\0';
565 return vsize - bytes_remaining;
570 void tsl_init(char *dfname,char *path, const int memlimit)
571 /*----------------------------------------------------------------------
576 | Initialize the TSL library.
578 | dfname Name of data file.
579 | path Search path to use for file.
580 | memlimit Limit (in Kbytes) on buffer space to use.
583 | 890825 cc Rewrite for compressed windowed files.
584 | 890830 cc Added memlimit.
585 | 890904 cc Implemented search path.
586 | 921221 cc Moved path search into separate function findfile().
587 | Added initialization for concordance.
588 | 930423 cc Revised data file headers using Univ_Int.
590 \*----------------------------------------------------------------------*/
592 struct tsl_fileheader fh;
601 tsl_maxbuffusage = memlimit<<10; /* times 1024 */
603 /* Open main data file */
604 if ((tfp = findfile(dfname, path)) == NULL) {
605 tsl_error( TRUE, "Cannot open data file %s", dfname );
608 /* What do we have here? Let's check out the file header... */
609 if (!fread( &fh, sizeof(fh), 1, tfp )) {
610 tsl_error( TRUE, "Cannot read data file %s", dfname );
612 if ((fh.magic[0] != TSL_MAGIC1) || (fh.magic[1] != TSL_MAGIC2))
613 tsl_error( TRUE, "Cannot use data file %s: Bad magic number",
615 if ((fh.version[0] != TSL_FVERSION1) || (fh.version[1] != TSL_FVERSION2))
616 tsl_error( TRUE, "Cannot use data file %s: Wrong version",
619 tsl_wsize = univ2int( fh.wsize );
620 tsl_wnum = univ2int( fh.wnum );
622 /* Grab the window table */
623 tablesize = sizeof(int)*(tsl_wnum+1); /* +1 for ending entry */
624 if ((tsl_wtable = (file_ptr_t *)malloc( tablesize )) == NULL)
625 tsl_error( TRUE, "Bad malloc" );
626 if (!fread( tsl_wtable, tablesize, 1, tfp )) {
627 tsl_error( TRUE, "Error reading data file %s", dfname );
629 /* Convert Univ_Ints in window table to regular ints */
630 up = (Univ_Int *) tsl_wtable;
631 for (i=0; i<=tsl_wnum; i++) {
632 tsl_wtable[i] = (file_ptr_t) univ2int( *up++ );
635 /* Create buffer table (parallel array to window table) */
637 (struct buffer_rec **)malloc(sizeof(*tsl_wbuffers)*tsl_wnum )) == NULL)
638 tsl_error( TRUE, "Bad malloc" );
639 for (i=0; i< tsl_wnum; i++) tsl_wbuffers[i] = NULL;
641 tsl_numbuffs = 0; /* active buffers of uncompressed text */
642 tsl_maxbuffs = tsl_maxbuffusage / tsl_wsize;
643 if (tsl_maxbuffs < 1) tsl_maxbuffs = 1;
644 tsl_firstbuffer.next = &tsl_lastbuffer;
645 tsl_firstbuffer.prev = NULL;
646 tsl_firstbuffer.win = 0;
647 tsl_firstbuffer.bufferp = NULL;
648 tsl_lastbuffer.prev = &tsl_firstbuffer;
649 tsl_lastbuffer.next = NULL;
650 tsl_lastbuffer.win = 0;
651 tsl_lastbuffer.bufferp = NULL;
653 /* Global buffer for compressed text. Much bigger than needed. :-) */
654 if ((tsl_cmpbuffer = malloc( tsl_wsize )) == NULL)
655 tsl_error( TRUE, "Bad malloc" );
657 cmp_init(); /* Initialize decompression */
659 /* OK, now let's see if there's a matching concordance file */
660 strncpy( cfname, dfname, STRSZ );
661 strncat( cfname, ".conc", STRSZ );
662 if ((cfp = findfile(cfname, path)) == NULL) {
663 tsl_error( FALSE, "(No concordance file '%s' found)", cfname );
665 /* Got a file. Now read the header. */
666 if (!fread( &cfh, sizeof(cfh), 1, cfp )) {
668 "Warning: Error reading concordance '%s'", cfname );
671 if ((cfh.magic[0] != TSL_CONCMAGIC1) ||
672 (cfh.magic[1] != TSL_CONCMAGIC2))
674 "Cannot use concordanc file '%s': Bad magic number",
676 if ((cfh.version[0] != TSL_CONCFVERSION1) ||
677 (cfh.version[1] != TSL_CONCFVERSION2))
679 "Cannot use concordance file %s: Wrong version",
682 /* Allocate & initialize buffer for strings (all words) */
683 i=univ2int(cfh.index_ptr) - univ2int(cfh.word_ptr);
684 cf_words = malloc( i );
685 fread( cf_words, 1, i, cfp );
687 /* Allocate & initialize buffer for index */
688 i=univ2int(cfh.data_ptr) - univ2int(cfh.index_ptr);
689 cf_index = (short int *) malloc( i );
690 fread( cf_index, 1, i, cfp );
691 /* Convert from Short_Univ_Int to short int */
692 sup = (Short_Univ_Int *) cf_index;
693 for (i=0; i<=univ2int(cfh.word_cnt); i++) {
694 cf_index[i] = (short int) shortuniv2int( *sup++ );
702 /*----------------------------------------------------------------------
707 | Tidy up before leaving the TSL library.
711 \*----------------------------------------------------------------------*/
713 struct buffer_rec *bufp, *nbufp;
718 /* Free all kinds of buffers and tables */
719 bufp=tsl_firstbuffer.next;
720 while (bufp != &tsl_lastbuffer) {
722 if (bufp->bufferp != NULL)
723 free(bufp->bufferp); /* free the buffer */
724 free(bufp); /* free the buffer rec */
725 bufp = nbufp; /* on to next buffer rec */
727 if (tsl_wtable != NULL) free(tsl_wtable);
728 if (tsl_cmpbuffer != NULL) free(tsl_cmpbuffer);