summary refs log tree commit diff
diff options
context:
space:
mode:
author redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-07-25 21:03:19 +0000
committer redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-07-25 21:03:19 +0000
commit 7e54f38591ab2aaa0c9a8738425e7f388fec7a1d (patch)
tree 9e97444d03f39d6c53bb78295d30d733678a853f
parent 65a47d21082deb41aceb6516212568408bddaeac (diff)
configure number of translations to keep
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@410 ec762483-ff6d-05da-a07a-a48fb63a330f
-rw-r--r-- gi/pipeline/clsp.config 2
-rwxr-xr-x gi/pipeline/evaluation-pipeline.pl 9
-rwxr-xr-x gi/pipeline/scripts/filter-by-f.pl 7
-rwxr-xr-x gi/pipeline/scripts/refilter.pl 5
4 files changed, 16 insertions, 7 deletions
diff --git a/gi/pipeline/clsp.config b/gi/pipeline/clsp.config
index f8b899f9..c23d409f 100644
--- a/gi/pipeline/clsp.config
+++ b/gi/pipeline/clsp.config
@@ -3,7 +3,7 @@
/export/ws10smt/data
btec /export/ws10smt/data/btec/ split.zh-en.al lm/en.3gram.lm.gz devtest/devset1_2.zh devtest/devset1_2.lc.en* devtest/devset3.zh eval-devset3.sh
fbis /export/ws10smt/data/chinese-english.fbis corpus.zh-en.al
-zhen /export/ws10smt/data/chinese-english corpus.zh-en.al
+zhen /export/ws10smt/data/chinese-english corpus.zh-en.al lm/c2e.3gram.lm.gz dev_and_test/mt02.src.txt dev_and_test/mt02.ref.* dev_and_test/mt03.src.txt eval-mt03.sh
aren /export/ws10smt/data/arabic-english corpus.ar-en-al lm/a2e.3gram.lm.gz dev_and_test/dev.src.txt dev_and_test/dev.ref.txt.* dev_and_test/mt05.src.txt eval-mt05.sh
uren /export/ws10smt/data/urdu-english corpus.ur-en.al lm/u2e.en.lm.gz dev/dev.ur dev/dev.en* devtest/devtest.ur eval-devtest.sh
nlfr /export/ws10smt/data/dutch-french corpus.nl-fr.al
diff --git a/gi/pipeline/evaluation-pipeline.pl b/gi/pipeline/evaluation-pipeline.pl
index e940a5b9..675fd8c2 100755
--- a/gi/pipeline/evaluation-pipeline.pl
+++ b/gi/pipeline/evaluation-pipeline.pl
@@ -8,6 +8,7 @@ my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR
my $JOBS = 15;
my $PMEM = "9G";
+my $NUM_TRANSLATIONS = 30;
# featurize_grammar may add multiple features from a single feature extractor
# the key in this map is the extractor name, the value is a list of the extracted features
@@ -132,6 +133,7 @@ if (GetOptions(
"oov-grammar=s" => \$oovgram,
"data=s" => \$dataDir,
"pmem=s" => \$PMEM,
+ "n=i" => \$NUM_TRANSLATIONS,
"features=s@" => \@features,
"use-fork" => \$usefork,
"jobs=i" => \$JOBS,
@@ -279,7 +281,7 @@ sub filter {
safesystem($out1, $cmd) or die "Filtering failed.";
$cmd = "gunzip -c $out1 | $FEATURIZE $FEATURIZER_OPTS -g $out1 -c $CORPUS | gzip > $out2";
safesystem($out2, $cmd) or die "Featurizing failed";
- $cmd = "$FILTERBYF $out2 $outgrammar";
+ $cmd = "$FILTERBYF $NUM_TRANSLATIONS $out2 $outgrammar";
safesystem($outgrammar, $cmd) or die "Secondary filtering failed";
}
return $outgrammar;
@@ -317,11 +319,12 @@ EOT
sub print_help {
print STDERR<<EOT;
-Usage: $0 [-c data-config-file] language-pair grammar.bidir.gz [OPTIONS]
+Usage: $0 [-c data-config-file] [-n N] language-pair grammar.bidir.gz [OPTIONS]
Given an induced grammar for an entire corpus (i.e., generated by
local-gi-pipeline.pl), filter and featurize it for a dev and test set,
-run MERT, report scores.
+run MERT, report scores. Use -n to specify the number of translations
+to keep for a given source (30 is default).
EOT
}
diff --git a/gi/pipeline/scripts/filter-by-f.pl b/gi/pipeline/scripts/filter-by-f.pl
index 3dd03bdd..0cef0606 100755
--- a/gi/pipeline/scripts/filter-by-f.pl
+++ b/gi/pipeline/scripts/filter-by-f.pl
@@ -8,13 +8,16 @@ my $REFILTER="$SCRIPT_DIR/refilter.pl";
my $SORT="$SCRIPT_DIR/sort-by-key.sh";
assert_exec($REKEY, $REFILTER, $SORT);
-die "Usage: $0 ingrammar.gz outgrammar.gz\n" unless scalar @ARGV == 2;
+
+die "Usage: $0 NUM-TRANSLATIONS ingrammar.gz outgrammar.gz\n" unless scalar @ARGV == 3;
+my $translations = shift @ARGV;
+die "Need number: $translations" unless $translations > 0;
die unless $ARGV[0] =~ /\.gz$/;
die unless $ARGV[1] =~ /\.gz$/;
die if $ARGV[0] eq $ARGV[1];
die "Can't find $ARGV[0]" unless -f $ARGV[0];
-my $cmd = "gunzip -c $ARGV[0] | $REKEY | $SORT | $REFILTER | gzip > $ARGV[1]";
+my $cmd = "gunzip -c $ARGV[0] | $REKEY | $SORT | $REFILTER $translations | gzip > $ARGV[1]";
safesystem($ARGV[1], $cmd) or die "Filtering failed";
exit 0;
diff --git a/gi/pipeline/scripts/refilter.pl b/gi/pipeline/scripts/refilter.pl
index 11a36ebe..a783eb4e 100755
--- a/gi/pipeline/scripts/refilter.pl
+++ b/gi/pipeline/scripts/refilter.pl
@@ -1,7 +1,10 @@
#!/usr/bin/perl -w
use strict;
-my $NUM_TRANSLATIONS = 30;
+my $NUM_TRANSLATIONS = shift @ARGV;
+unless ($NUM_TRANSLATIONS) { $NUM_TRANSLATIONS=30; }
+print STDERR "KEEPING $NUM_TRANSLATIONS TRANSLATIONS FOR SOURCE\n";
+
my $pk = '';
my %dict;
while(<>) {