From 6b563af238fd343dec40af8c55f3e4c51a0a0239 Mon Sep 17 00:00:00 2001
From: Ian Jackson
Date: Wed, 4 Sep 2013 21:41:25 +0100
Subject: [PATCH] wip word list processing

---
Review notes (ignored by git am):
 - Makefile: "all" now builds words.final (there is no words.list rule);
   the $o macro names its mv destination; forbidden words are removed
   with grep -vx instead of selected; head uses WORDLIST_LEN.
 - massage-lemmas: frequency ties are broken alphabetically so the
   output is reproducible.
 - The blob IDs on the "index" lines predate these revisions.

 Makefile        | 23 +++++++++++++++++++++++
 forbidden-words |  2 ++
 massage-lemmas  | 43 +++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 68 insertions(+)
 create mode 100644 Makefile
 create mode 100644 forbidden-words
 create mode 100755 massage-lemmas

diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..534dbc2
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,23 @@
+# Build the final word list from the lemma frequency list.
+#
+# Pipeline:  lemma.al -> words.interim -> words.final
+
+# Number of words kept in the final list.
+WORDLIST_LEN=3000
+
+all: words.final
+
+# Redirect a recipe's stdout to a temporary file and rename it into place
+# only if the command succeeded, so a failed run never leaves a truncated
+# target behind.
+o= >$@.tmp && mv -f $@.tmp $@
+
+# Drop the forbidden words (whole-line, fixed-string matches only) and
+# keep the WORDLIST_LEN most frequent of what remains.
+words.final: words.interim forbidden-words Makefile
+	grep -vxFf forbidden-words $< >$@.1.tmp
+	head -n $(WORDLIST_LEN) $@.1.tmp $o
+
+# Frequency-sorted words of the permitted word classes.
+words.interim: massage-lemmas lemma.al
+	./$^ $o
diff --git a/forbidden-words b/forbidden-words
new file mode 100644
index 0000000..797ecd0
--- /dev/null
+++ b/forbidden-words
@@ -0,0 +1,2 @@
+dot
+at
diff --git a/massage-lemmas b/massage-lemmas
new file mode 100755
index 0000000..ae46021
--- /dev/null
+++ b/massage-lemmas
@@ -0,0 +1,43 @@
+#!/usr/bin/perl -w
+#
+# massage-lemmas: turn a lemma frequency list into a plain word list.
+#
+# Usage: massage-lemmas LEMMA-FILE...   (or read standard input)
+#
+# Each input line must have the form
+#     RANK FREQUENCY WORD WORD-CLASS
+# separated by single spaces.  Words whose class is permitted are
+# printed one per line, most frequent first; a word listed under
+# several permitted classes has its frequencies added together.
+
+use strict;
+
+# Word classes: true means words of that class are kept, false means
+# they are dropped.  Classes not listed here are kept, with a warning.
+our %allow_class =
+    ((map { $_=>1 } qw(v n a interjection conj pron prep
+                       modal infinitive-marker)),
+     (map { $_=>0 } qw(det adv)));
+
+# Total frequency of each kept word, summed over its permitted classes.
+our %words;
+
+while (<>) {
+    # frequency sort rank, frequency, word, word-class
+    m/^\d+ (\d+) (\S+) (\S+)$/ or die "$_ ?";
+    my ($freq,$word,$class) = ($1,$2,$3);
+    my $allow = $allow_class{$class};
+    if (!defined $allow) {
+        # Unknown class: complain once, then remember to allow it.
+        warn "$class ?";
+        $allow = $allow_class{$class} = 1;
+    }
+    next unless $allow;
+    $words{$word} += $freq;
+}
+
+# Most frequent first.  Ties are broken alphabetically so the output is
+# the same on every run: hash key order is not, and the Makefile cuts
+# the list off with head.
+our @words = sort { $words{$b} <=> $words{$a} || $a cmp $b } keys %words;
+print $_,"\n" or die $! foreach @words;
-- 
2.30.2