makeconcordance

   1 #!/bin/sh
   2 ###############################################################################
   3 #
   4 # File:         makeconcordance
   5 # RCS:          $Header: /home/matthew/cvs/bible-kjv-4.10/makeconcordance,v 2.0 2003/01/08 15:29:52 matthew Exp $
   6 # Description:  Create concordance database
   7 # Author:       Chip Chapin, Hewlett-Packard Company
   8 # Created:      Tue Sep 19 18:18:00 1989
   9 # Modified:     Thu Apr 22 14:31:46 1993 (Chip Chapin) chip@hpclbis
  10 # Language:     Bourne shell
  11 # Package:      Text Storage Library (Bible Retrieval System)
  12 # Status:       Experimental (Do Not Distribute)
  13 #
  14 ###############################################################################
  15 #
  16 # Revisions:
  17 #
  18 # Thu Apr 22 14:31:18 1993 (Chip Chapin) chip@hpclbis
  19 #  Revised tr usage to work with either Ultrix or HP-UX.
  20 #  Revert to sh, instead of ksh, for greater portability.
  21 # Wed Dec 23 13:41:19 1992 (Chip Chapin) chip@hpclbis
  22 #  Added time/space warning notice.
  23 # Tue Dec 22 09:28:24 1992 (Chip Chapin) chip@hpclbis
  24 #  Revised to use lots of pipes to save lots of disk space.
  25 # Mon Dec 21 19:25:32 1992 (Chip Chapin) chip@hpclbis
  26 #  Tidy up for release with BRS 2.0.
  27 ###############################################################################
  28
  29 # Setting PATH ensures that the bible program will be found.
  30 PATH=".:$PATH"
  31
  32 PROG=bible
  33 BLANKS="                                                                       "
  34
  35 echo
  36 echo "*********************************************************************"
  37 echo "Notice: This script may take a long time, and require 8-10mb of disk."
  38 echo "*********************************************************************"
  39 echo
  40
  41 #
  42 # commands for building word index
  43 #
  44 # 1. Generate plain text file, one "record" (e.g. bible verse) per line.
  45 # 2. Cut off the record reference that starts each line.
  46 # 3. Translate ALL non-alpha characters (EXCEPT new-line) into blanks.
  47 # 4. Translate all upper-case chars into lower-case.
  48 # 5. Create list containing each occurrence of each word.
  49 # 6. Sort the list and eliminate dups.
  50 # 7. Create "rawconcordance", listing the lines where each word occurs.
  51 #
  52 ### NOTICE: the long string of blanks in the first tr(1) seems to be the
  53 # best way to deal with incompatibilities between BSD and POSIX tr.
  54 $PROG -f gen1:1-rev99:99 |
  55     cut -f2- -d" " |
  56     tr -cs "[A-Z][a-z]\012" "$BLANKS $BLANKS" |
  57     tr "[A-Z]" "[a-z]"  |
  58     awk '
  59       # Create a list of each occurrence of each word in the
  60       # text.  Format:  "word  line-no."
  61       # Note that there are 12545 different words in KJV bible.
  62       {for (i=1;i<=NF;i++) printf "%s %05d\n", $i, NR}
  63       ' - |
  64     # Gather all references to a word together,
  65     # and eliminate multiple refs for the same word occurring several
  66     # times in the same record.
  67     sort -uy - |
  68     awk '
  69       # Create raw concordance: each word that occurs in the text on a
  70       # single line, followed by a blank-separated list of line numbers
  71       # on which that word occurred.
  72       # As a by-product, we [can] also create a wordcounts file, which
  73       # gives the number of lines in which each word occurs.  NOTE that,
  74       # because we used -u in the sort above, we are not counting cases
  75       # where the same word is used several times in the same record.
  76       NR == 1 {word = $1; printf "%s %s", $1, $2; next}
  77
  78       {
  79           if ($1 == word) {
  80               printf " %s", $2;
  81               wordcount++;
  82           }
  83           else {
  84               word = $1
  85               wordcount = 1;
  86               printf "\n%s %s", $1, $2;
  87           }
  88       }
  89
  90       END {
  91           printf "\n";
  92       }
  93       ' - > "$PROG".rawconcordance
  94
  95 # Next ... create a binary form of the raw concordance.
  96 # This is handled by "makeconcfile", a program invoked from the
  97 # BRS makefile.
  98
  99 # so we're all done now.
 100
 101 # Interesting statistic: 89198 chars in all the words in the Bible,
  102 #                        617371 word-verse occurrences
 103 # from...
 104 #       awk '{chars += length($1); counts += $2}
 105 #               END {print "chars=" chars " counts=" counts}' bible.wordcounts
 106
 107 # end
 108
 109 ###############################################################################
 110 # Gnu Emacs variables...
 111 #
 112 #   Local Variables:
 113 #   mode:                               sh
 114 #   eval:                               (auto-fill-mode 0)
 115 #   default-header-comment-character:   ?#
 116 #   header-prefix:                      "#!/bin/ksh"
 117 #   header-suffix:                      "#"
 118 #   header-comment-character:           ?#
 119 #   end: