2 ###############################################################################
4 # File: makeconcordance
5 # RCS: $Header: /home/matthew/cvs/bible-kjv-4.10/makeconcordance,v 2.0 2003/01/08 15:29:52 matthew Exp $
6 # Description: Create concordance database
7 # Author: Chip Chapin, Hewlett-Packard Company
8 # Created: Tue Sep 19 18:18:00 1989
9 # Modified: Thu Apr 22 14:31:46 1993 (Chip Chapin) chip@hpclbis
10 # Language: Bourne shell
11 # Package: Text Storage Library (Bible Retrieval System)
12 # Status: Experimental (Do Not Distribute)
14 ###############################################################################
18 # Thu Apr 22 14:31:18 1993 (Chip Chapin) chip@hpclbis
19 # Revised tr usage to work with either Ultrix or HP-UX.
20 # Revert to sh, instead of ksh, for greater portability.
21 # Wed Dec 23 13:41:19 1992 (Chip Chapin) chip@hpclbis
22 # Added time/space warning notice.
23 # Tue Dec 22 09:28:24 1992 (Chip Chapin) chip@hpclbis
24 # Revised to use lots of pipes to save lots of disk space.
25 # Mon Dec 21 19:25:32 1992 (Chip Chapin) chip@hpclbis
26 # Tidy up for release with BRS 2.0.
27 ###############################################################################
29 # Setting PATH ensures that the bible program will be found.
# NOTE(review): the PATH assignment mentioned above, and the definitions of
# PROG and BLANKS used further down, are not among the lines visible here --
# confirm they are set before the pipeline runs.
# Warn the user up front about the expected run time and disk usage.
36 echo "*********************************************************************"
37 echo "Notice: This script may take a long time, and require 8-10mb of disk."
38 echo "*********************************************************************"
42 # commands for building word index
44 # 1. Generate plain text file, one "record" (e.g. bible verse) per line.
45 # 2. Cut off the record reference that starts each line.
46 # 3. Translate ALL non-alpha characters (EXCEPT new-line) into blanks.
47 # 4. Translate all upper-case chars into lower-case.
48 # 5. Create list containing each occurrence of each word.
49 # 6. Sort the list and eliminate dups.
50 # 7. Create "rawconcordance", listing the lines where each word occurs.
52 ### NOTICE: the long string of blanks in the first tr(1) seems to be the
53 # best way to deal with incompatibilities between BSD and POSIX tr.
# Step 1: run $PROG over the reference range gen1:1-rev99:99 and feed its
# output into the pipeline.
# NOTE(review): the commands for steps 2 and 4 are not visible here.
54 $PROG -f gen1:1-rev99:99 |
# Step 3: -c complements the first set and -s squeezes repeated output
# characters, so each run of characters other than A-Z, a-z and newline
# (\012) collapses to a single replacement character (presumably a blank;
# this assumes BLANKS holds only blanks -- confirm).
56 tr -cs "[A-Z][a-z]\012" "$BLANKS $BLANKS" |
59 # Create a list of each occurrence of each word in the
60 # text. Format: "word line-no."
61 # Note that there are 12545 different words in KJV bible.
# For every field of the current input line, print the field followed by the
# input line number, zero-padded to at least five digits (presumably so that
# a plain textual sort also orders the numbers correctly -- confirm).
62 {for (i=1;i<=NF;i++) printf "%s %05d\n", $i, NR}
64 # Gather all references to a word together,
65 # and eliminate multiple refs for the same word occurring several
66 # times in the same record.
69 # Create raw concordance: each word that occurs in the text on a
70 # single line, followed by a blank-separated list of line numbers
71 # on which that word occurred.
72 # As a by-product, we [can] also create a wordcounts file, which
73 # gives the number of lines in which each word occurs. NOTE that,
74 # because we used -u in the sort above, we are not counting cases
75 # where the same word is used several times in the same record.
# First input line: remember its word in "word" and start the first output
# line with the word and its line number. No newline is printed, presumably
# so that further line numbers can be appended to the same output line.
76 NR == 1 {word = $1; printf "%s %s", $1, $2; next}
# Terminate the previous output line and start a new one for this word.
# NOTE(review): the condition guarding this printf is not visible here;
# presumably it applies when $1 differs from the saved word -- confirm.
86 printf "\n%s %s", $1, $2;
# End of the quoted program text. Output is redirected to a file named after
# $PROG with the suffix .rawconcordance; the "-" operand presumably selects
# standard input (the command name is not visible here -- confirm).
93 ' - > "$PROG".rawconcordance
95 # Next ... create a binary form of the raw concordance.
96 # This is handled by "makeconcfile", a program invoked from the
99 # so we're all done now.
101 # Interesting statistic: 89198 chars in all the words in the Bible,
102 # 617371 word-verse occurrences
104 # awk '{chars += length($1); counts += $2}
105 # END {print "chars=" chars " counts=" counts}' bible.wordcounts
109 ###############################################################################
110 # Gnu Emacs variables...
114 # eval: (auto-fill-mode 0)
115 # default-header-comment-character: ?#
116 # header-prefix: "#!/bin/ksh"
118 # header-comment-character: ?#