chiark / gitweb /
wip word list processing
author Ian Jackson <ijackson@chiark.greenend.org.uk>
Wed, 4 Sep 2013 20:41:25 +0000 (21:41 +0100)
committer Ian Jackson <ijackson@chiark.greenend.org.uk>
Wed, 4 Sep 2013 20:41:25 +0000 (21:41 +0100)
Makefile [new file with mode: 0644]
forbidden-words [new file with mode: 0644]
massage-lemmas [new file with mode: 0755]

diff --git a/Makefile b/Makefile
new file mode 100644 (file)
index 0000000..534dbc2
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,28 @@
+# Build a word list from a lemma frequency list.
+#
+#   lemma.al      -> words.interim  (massage-lemmas: filter word classes,
+#                                    sort by frequency)
+#   words.interim -> words.final    (drop forbidden words, keep the first
+#                                    WORDLIST_LEN)
+#
+# Note: recipe lines must begin with a tab character.
+
+# Number of words kept in the final list.
+WORDLIST_LEN=3000
+
+all:   words.final
+
+# Append to a command to write its stdout to a temporary file and rename
+# it into place only on success, so a failed command never leaves a
+# truncated target behind.
+o= >$@.tmp && mv -f $@.tmp $@
+
+# grep -v drops (rather than keeps) the forbidden words; -x matches whole
+# lines only, so that forbidding "at" does not also remove "cat".
+words.final:   words.interim forbidden-words Makefile
+		grep -vxFf forbidden-words $< >$@.1.tmp
+		head -n $(WORDLIST_LEN) $@.1.tmp $o
+		rm -f $@.1.tmp
+
+words.interim: massage-lemmas lemma.al
+		./$^ $o
diff --git a/forbidden-words b/forbidden-words
new file mode 100644 (file)
index 0000000..797ecd0
--- /dev/null
+++ b/forbidden-words
@@ -0,0 +1,2 @@
+dot
+at
diff --git a/massage-lemmas b/massage-lemmas
new file mode 100755 (executable)
index 0000000..ae46021
--- /dev/null
+++ b/massage-lemmas
@@ -0,0 +1,42 @@
+#!/usr/bin/perl -w
+#
+# massage-lemmas: turn a lemma frequency list into a plain word list.
+#
+# Reads lines of the form "RANK FREQ WORD CLASS" from the files named on
+# the command line (or stdin), sums the frequencies of each word over
+# all of its permitted word classes, and prints the words one per line,
+# most frequent first.
+
+use strict;
+
+# Word classes: 1 = keep, 0 = drop.  A class not listed here is warned
+# about once and then kept.
+my %allow_class =
+    ((map { $_=>1 } qw(v n a interjection conj pron prep
+                      modal infinitive-marker)),
+     (map { $_=>0 } qw(det adv)));
+
+# word => total frequency across all permitted classes
+my %words;
+
+while (<>) {
+    # frequency sort rank, frequency, word, word-class
+    m/^\d+ (\d+) (\S+) (\S+)$/ or die "$_ ?";
+    my ($freq,$word,$class) = ($1,$2,$3);
+    my $allow = $allow_class{$class};
+    if (!defined $allow) {
+       warn "$class ?";
+       # Remember the decision so each unknown class is reported only once.
+       $allow = $allow_class{$class} = 1;
+    }
+    next unless $allow;
+    $words{$word} += $freq;
+}
+
+# Descending frequency; ties are broken alphabetically so that the output
+# does not depend on Perl's randomised hash ordering.
+my @words = sort { $words{$b} <=> $words{$a} || $a cmp $b } keys %words;
+print $_,"\n" or die $! foreach @words;
+
+# Output is buffered, so a write error may only show up at close.
+close STDOUT or die $!;