#!/bin/sh
###############################################################################
#
# File:         makeconcordance
# RCS:          $Header: /home/matthew/cvs/bible-kjv-4.10/makeconcordance,v 2.0 2003/01/08 15:29:52 matthew Exp $
# Description:  Create concordance database
# Author:       Chip Chapin, Hewlett-Packard Company
# Created:      Tue Sep 19 18:18:00 1989
# Modified:     Thu Apr 22 14:31:46 1993 (Chip Chapin) chip@hpclbis
# Language:     Bourne shell
# Package:      Text Storage Library (Bible Retrieval System)
# Status:       Experimental (Do Not Distribute)
#
###############################################################################
#
# Revisions:
#
# Thu Apr 22 14:31:18 1993 (Chip Chapin) chip@hpclbis
#  Revised tr usage to work with either Ultrix or HP-UX.
#  Revert to sh, instead of ksh, for greater portability.
# Wed Dec 23 13:41:19 1992 (Chip Chapin) chip@hpclbis
#  Added time/space warning notice.
# Tue Dec 22 09:28:24 1992 (Chip Chapin) chip@hpclbis
#  Revised to use lots of pipes to save lots of disk space.
# Mon Dec 21 19:25:32 1992 (Chip Chapin) chip@hpclbis
#  Tidy up for release with BRS 2.0.
###############################################################################

# Setting PATH ensures that the bible program will be found.
# The current directory is searched first, ahead of the inherited PATH.
PATH=".:$PATH"

# PROG names the retrieval program and is also the prefix of the output
# file name.  BLANKS is blank filler for tr(1) replacement sets.
PROG=bible
BLANKS=" "

# Warn the user up front: the run is slow and needs scratch disk space.
echo
echo "*********************************************************************"
echo "Notice: This script may take a long time, and require 8-10mb of disk."
echo "*********************************************************************"
echo

#
# commands for building word index
#
# 1. Generate plain text file, one "record" (e.g. bible verse) per line.
# 2. Cut off the record reference that starts each line.
# 3. Translate ALL non-alpha characters (EXCEPT new-line) into blanks.
# 4. Translate all upper-case chars into lower-case.
# 5. Create list containing each occurrence of each word.
# 6. Sort the list and eliminate dups.
# 7. Create "rawconcordance", listing the lines where each word occurs.
#
### NOTICE: the word-splitting tr(1) uses a POSIX character class together
# with the "[c*]" repeat form for its replacement set.  That avoids padding
# the replacement with a long string of blanks, which was the old way of
# coping with incompatibilities between BSD and System V tr.

# list_word_occurrences: filter, steps 3-5.
#   stdin:  record text, one record per line (reference already removed)
#   stdout: one "word line-no" pair per word occurrence, the line number
#           zero-padded to five digits so that it sorts as text
# LC_ALL=C restricts the character classes to the ASCII letters, so every
# other byte (brackets and punctuation included) becomes a blank.
list_word_occurrences() {
  LC_ALL=C tr -cs '[:alpha:]\n' '[ *]' |
    LC_ALL=C tr '[:upper:]' '[:lower:]' |
    awk '
      # Create a list of each occurrence of each word in the
      # text.  Format: "word line-no."
      # Note that there are 12545 different words in KJV bible.
      { for (i = 1; i <= NF; i++) printf "%s %05d\n", $i, NR }
    '
}

# collate_occurrences: filter, steps 6-7.
#   stdin:  "word line-no" pairs in any order
#   stdout: raw concordance -- each distinct word on a single line, followed
#           by a blank-separated list of the line numbers on which it occurs
# sort -u gathers all references to a word together and drops repeated
# references caused by a word occurring several times in the same record.
# The sort runs in the C locale so the ordering is byte-wise and does not
# change with the caller's locale settings.
# Empty input still yields one (empty) output line, as it always has.
collate_occurrences() {
  LC_ALL=C sort -u |
    awk '
      # A new word starts a new output line; every input line then
      # contributes its line number to the current output line.
      # Because of the -u in the sort, counting the numbers on a line
      # gives the number of records containing the word, not the number
      # of times the word is used.
      NR == 1 || $1 != word {
        if (NR > 1) printf "\n"
        word = $1
        printf "%s", word
      }
      { printf " %s", $2 }
      END { printf "\n" }
    '
}

# Steps 1-2 feed the two filters: dump the whole text, one record per line,
# and cut off the reference that forms the first blank-delimited field.
"$PROG" -f gen1:1-rev99:99 |
  cut -f2- -d" " |
  list_word_occurrences |
  collate_occurrences > "$PROG".rawconcordance

# Next ... create a binary form of the raw concordance.
#   This is handled by "makeconcfile", a program invoked from the
#   BRS makefile.
# so we're all done now.

# Interesting statistic: 89198 chars in all the words in the Bible,
#                        617371 word-verse occurrences
# from...
# awk '{chars += length($1); counts += $2}
#      END {print "chars=" chars " counts=" counts}' bible.wordcounts

# end
###############################################################################
# Gnu Emacs variables...
#
#   Local Variables:
#   mode:                               sh
#   eval:                               (auto-fill-mode 0)
#   default-header-comment-character:   ?#
#   header-prefix:                      "#!/bin/sh"
#   header-suffix:                      "#"
#   header-comment-character:           ?#
#   end: