2 ################################################################################
5 # RCS: $Header: /home/matthew/cvs/bible-kjv-4.10/makeconc.pl,v 2.0 2003/01/08 15:29:52 matthew Exp $
6 # Description: make Bible concordance: translation of Chip Chapin's ksh script
7 # Author: Chris Eich, SRSD
8 # Created: Wed Dec 23 11:00:18 1992
9 # Modified: Wed Dec 23 15:49:23 1992 (Chip Chapin) chip@hpclbis
11 # Status: Experimental (Do Not Distribute)
13 ################################################################################
17 # Wed Dec 23 15:19:45 1992 (Chip Chapin) chip@hpclbis
18 # Received from Chris Eich, replaces "makeconcordance" script.
19 # Made use of stopwords conditional.
20 ###############################################################################
23 # Putting . on PATH ensures that the bible program will be found.
# The substitution replaces any run of leading colons (possibly none) with
# ".:", so the current directory becomes the first PATH entry.
24 $ENV{'PATH'} =~ s/^:*/.:/;
28 # Read a list of stop words, if any, one per line.
# The stop-word file is named by the first command-line argument; stop words
# are recorded as keys of %stopword.
30 if (open(STOP, "$ARGV[0]")) {
31 print "Excluding stopwords ($ARGV[0]) from concordance.\n";
# NOTE(review): the statement below matches against $_, so a loop reading
# STOP line by line presumably encloses it; its header is not visible here.
33 # Ignore comment lines (those starting with '#'); otherwise mark the first
# run of lowercase letters on the line as a stop word.
34 $stopword{$&}++ if !/^#/ && /[a-z]+/;
# NOTE(review): presumably the branch taken when the open above fails; the
# enclosing "else" is not visible here -- confirm.
38 print "All words will be included in concordance (no stopwords).\n";
41 # Generate plain text file, one "record" (e.g. bible verse) per line.
42 # Fill the %lines and %count tables, which are keyed by words:
# %count holds the number of occurrences of each word, and %lines holds a
# space-separated list of the record numbers in which it appears.
# NOTE(review): the result of this open is not checked.
44 open(BIBLE, "bible.rawtext");
45 <BIBLE>; # Discard the header line.
# NOTE(review): the statements below operate on $_, so a loop reading BIBLE
# record by record presumably encloses them; its header is not visible here.
47 s/^\S+\s+//; # Cut off the record reference that starts each line.
48 tr/A-Z/a-z/; # Downcase.
49 tr/a-z/ /c; # Turn non-alpha into space.
51 for $word (split(' ')) {
52 next if $stopword{$word};
53 $count{$word}++; # Move below next line to count per-line.
# Record each record number only once per word.
# NOTE(review): assumes %seenonthisline is cleared for every record; that
# reset is not visible here -- confirm.
54 next if $seenonthisline{$word}++;
55 # Discarding the header line still leaves $. one higher than we want.
56 $lines{$word} .= " " . ($. - 1);
# NOTE(review): error() is called as a method on the bareword handle BIBLE;
# presumably this relies on the IO::Handle methods being available -- confirm.
59 die $! if BIBLE->error();
61 # Create raw concordance, listing the lines where each word occurs.
# Each output line is a word followed by the space-separated record numbers
# collected in %lines; words are written in sorted order.
# NOTE(review): $PROG is not assigned anywhere in the visible lines --
# confirm where it is set.
63 open(RAWCONC, "> $PROG.rawconcordance") || die "$PROG.rawconcordance: $!\n";
64 for $word (sort keys %lines) {
65 print RAWCONC $word, $lines{$word}, "\n";
69 # Also create a wordcounts file, which gives the number of times each
70 # word occurs. Note that we ARE counting cases where the
71 # same word is used several times in the same record. See the comment
72 # above for "$count{$word}++" to change this to per-record.
# Each line is "word<TAB>count"; the output is piped through
# "sort -nrk 2", which orders it by count, largest first.
# NOTE(review): the result of this pipe open is not checked.
74 open(COUNTS, "| sort -nrk 2 > $PROG.wordcounts");
75 while (($word, $count) = each %count) {
76 print COUNTS $word, "\t", $count, "\n";
82 # Next ... create a binary form of the raw concordance.
83 # This is handled by "makeconcfile", a program invoked from the
86 # so we're all done now.
88 # Interesting statistic: 89198 chars in all the words in the Bible,
89 # 617371 word-verse occurrences
91 # awk '{chars += length($1); counts += $2}
92 # END {print "chars=" chars " counts=" counts}' bible.wordcounts
96 ###############################################################################
97 # Gnu Emacs variables...
101 # eval: (auto-fill-mode 0)
102 # default-header-comment-character: ?#
103 # header-prefix: "#! /usr/bin/perl"
105 # header-comment-character: ?#