makeconc.pl

   1 #! /usr/bin/perl
   2 ################################################################################
   3 #
   4 # File:         makeconc.pl
   5 # RCS:          $Header: /home/matthew/cvs/bible-kjv-4.10/makeconc.pl,v 2.0 2003/01/08 15:29:52 matthew Exp $
   6 # Description:  make Bible concordance: translation of Chip Chapin's ksh script
   7 # Author:       Chris Eich, SRSD
   8 # Created:      Wed Dec 23 11:00:18 1992
   9 # Modified:     Wed Dec 23 15:49:23 1992 (Chip Chapin) chip@hpclbis
  10 # Language:     perl
  11 # Status:       Experimental (Do Not Distribute)
  12 #
  13 ################################################################################
  14 #
  15 # Revisions:
  16 #
  17 # Wed Dec 23 15:19:45 1992 (Chip Chapin) chip@hpclbis
  18 #  Received from Chris Eich, replaces "makeconcordance" script.
  19 #  Made use of stopwords conditional.
  20 ###############################################################################
  21
  22 # Putting . on PATH ensures that the bible program will be found.
  23 $ENV{'PATH'} =~ s/^:*/.:/;
  24
  25 $PROG = 'bible';
  26
  27 # Read a list of stop words, if any, one per line.
  28
  29 if (open(STOP, "$ARGV[0]")) {
  30     print "Excluding stopwords ($ARGV[0]) from concordance.\n";
  31     while (<STOP>) {
  32         # Ignore comments, mark stop word if one is found.
  33         $stopword{$&}++ if !/^#/ && /[a-z]+/;
  34     }
  35     close(STOP);
  36 } else {
  37     print "All words will be included in concordance (no stopwords).\n";
  38 }
  39
  40 # Generate plain text file, one "record" (e.g. bible verse) per line.
  41 # Fill %lines and $count tables, which are keyed by words.
  42
  43 open(BIBLE, "$PROG -f gen1:1-rev99:99 |");
  44 while (<BIBLE>) {
  45     s/^\S+\s+//;        # Cut off the record reference that starts each line.
  46     tr/A-Z/a-z/;        # Downcase.
  47     tr/a-z/ /c;         # Turn non-alpha into space.
  48     %seenonthisline = ();
  49     for $word (split(' ')) {
  50         next if $stopword{$word};
  51         $count{$word}++;        # Move below next line to count per-line.
  52         next if $seenonthisline{$word}++;
  53         $lines{$word} .= " " . $.;
  54     }
  55 }
  56 close(BIBLE);
  57
  58 # Create raw concordance, listing the lines where each word occurs.
  59
  60 open(RAWCONC, "> $PROG.rawconcordance") || die "$PROG.rawconcordance: $!\n";
  61 for $word (sort keys %lines) {
  62     print RAWCONC $word, $lines{$word}, "\n";
  63 }
  64 close(RAWCONC);
  65
  66 # Also create a wordcounts file, which gives the number of lines in
  67 # which each word occurs.  Note that we ARE counting cases where the
  68 # same word is used several times in the same record.  See the comment
  69 # above for "$count{$word}++" to change this to per-record.
  70
  71 open(COUNTS, "| sort -nrk 2 > $PROG.wordcounts");
  72 while (($word, $count) = each %count) {
  73     print COUNTS $word, "\t", $count, "\n";
  74 }
  75 close(COUNTS);
  76
  77 __END__
  78
  79 # Next ... create a binary form of the raw concordance.
  80 # This is handled by "makeconcfile", a program invoked from the
  81 # BRS makefile.
  82
  83 # so we're all done now.
  84
  85 # Interesting statistic: 89198 chars in all the words in the Bible,
  86 #                        617371 word-verse occurrences
  87 # from...
  88 #       awk '{chars += length($1); counts += $2}
  89 #               END {print "chars=" chars " counts=" counts}' bible.wordcounts
  90
  91 # end
  92
  93 ###############################################################################
  94 # Gnu Emacs variables...
  95 #
  96 #   Local Variables:
  97 #   mode:                               perl
  98 #   eval:                               (auto-fill-mode 0)
  99 #   default-header-comment-character:   ?#
 100 #   header-prefix:                      "#! /usr/bin/perl"
 101 #   header-suffix:                      "#"
 102 #   header-comment-character:           ?#
 103 #   end: