From: Ian Jackson
Date: Wed, 4 Sep 2013 20:41:25 +0000 (+0100)
Subject: wip word list processing
X-Git-Tag: test~35
X-Git-Url: http://www.chiark.greenend.org.uk/ucgi/~ijackson/git?p=d.git;a=commitdiff_plain;h=6b563af238fd343dec40af8c55f3e4c51a0a0239

wip word list processing
---

diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..534dbc2
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,20 @@
+#
+# Build the word list: lemma.al -> words.interim -> words.final
+
+WORDLIST_LEN=3000
+
+all: words.final
+
+# Write-then-rename suffix for recipes, so that a failed or interrupted
+# command never leaves a truncated target behind.
+o= >$@.tmp && mv -f $@.tmp $@
+
+# Drop the forbidden words (whole-line matches only), then keep the
+# first WORDLIST_LEN lines of what remains.
+words.final: words.interim forbidden-words Makefile
+	grep -vxFf forbidden-words $< >$@.1.tmp
+	head -n $(WORDLIST_LEN) $@.1.tmp $o
+
+# $^ expands to "massage-lemmas lemma.al", i.e. the script then its input.
+words.interim: massage-lemmas lemma.al
+	./$^ $o
diff --git a/forbidden-words b/forbidden-words
new file mode 100644
index 0000000..797ecd0
--- /dev/null
+++ b/forbidden-words
@@ -0,0 +1,2 @@
+dot
+at
diff --git a/massage-lemmas b/massage-lemmas
new file mode 100755
index 0000000..ae46021
--- /dev/null
+++ b/massage-lemmas
@@ -0,0 +1,41 @@
+#!/usr/bin/perl -w
+#
+# Usage: massage-lemmas [LEMMA-FILE...] >WORD-LIST
+#
+# Reads lines of the form "RANK FREQUENCY WORD CLASS" from the named
+# files (or stdin), sums the frequency of each word over its allowed
+# word classes, and prints the words one per line, most frequent first.
+
+use strict;
+
+# Word classes: 1 = count the entry, 0 = skip it.  A class not listed
+# here is warned about once and then counted.
+our %allow_class =
+    ((map { $_=>1 } qw(v n a interjection conj pron prep
+                       modal infinitive-marker)),
+     (map { $_=>0 } qw(det adv)));
+
+# word => total frequency over all of its allowed classes
+our %words;
+
+while (<>) {
+    chomp;    # keep the newline out of the diagnostic below
+    # frequency sort rank, frequency, word, word-class
+    m/^\d+ (\d+) (\S+) (\S+)$/ or die "$_ ?";
+    my ($freq,$word,$class) = ($1,$2,$3);
+    my $allow = $allow_class{$class};
+    if (!defined $allow) {
+        warn "$class ?";
+        $allow = $allow_class{$class} = 1;
+    }
+    next unless $allow;
+    $words{$word} += $freq;
+}
+
+# Most frequent first; ties are broken by the word itself so that the
+# output (and hence any head(1) cut-off applied to it) is reproducible
+# from run to run regardless of hash ordering.
+our @words = sort { $words{$b} <=> $words{$a} or $a cmp $b } keys %words;
+print $_,"\n" or die $! foreach @words;
+# The Makefile redirects stdout to a file; buffered write errors only
+# surface at close, so check it.
+close STDOUT or die $!;