Make the number of translations kept per source configurable (-n N, default 30)

git-svn-id: https://ws10smt.googlecode.com/svn/trunk@410 ec762483-ff6d-05da-a07a-a48fb63a330f
author: redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-07-25 21:03:19 +0000
committer: redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-07-25 21:03:19 +0000
commit: 7e54f38591ab2aaa0c9a8738425e7f388fec7a1d (patch)
tree: 9e97444d03f39d6c53bb78295d30d733678a853f
parent: 65a47d21082deb41aceb6516212568408bddaeac (diff)
4 files changed, 16 insertions, 7 deletions
diff --git a/gi/pipeline/clsp.config b/gi/pipeline/clsp.config
index f8b899f9..c23d409f 100644
--- a/gi/pipeline/clsp.config
+++ b/gi/pipeline/clsp.config
@@ -3,7 +3,7 @@
 /export/ws10smt/data
 btec /export/ws10smt/data/btec/ split.zh-en.al lm/en.3gram.lm.gz devtest/devset1_2.zh devtest/devset1_2.lc.en* devtest/devset3.zh eval-devset3.sh
 fbis /export/ws10smt/data/chinese-english.fbis corpus.zh-en.al
-zhen /export/ws10smt/data/chinese-english corpus.zh-en.al
+zhen /export/ws10smt/data/chinese-english corpus.zh-en.al lm/c2e.3gram.lm.gz dev_and_test/mt02.src.txt dev_and_test/mt02.ref.* dev_and_test/mt03.src.txt eval-mt03.sh
 aren /export/ws10smt/data/arabic-english corpus.ar-en-al lm/a2e.3gram.lm.gz dev_and_test/dev.src.txt dev_and_test/dev.ref.txt.* dev_and_test/mt05.src.txt eval-mt05.sh
 uren /export/ws10smt/data/urdu-english corpus.ur-en.al lm/u2e.en.lm.gz dev/dev.ur dev/dev.en* devtest/devtest.ur eval-devtest.sh
 nlfr /export/ws10smt/data/dutch-french corpus.nl-fr.al
diff --git a/gi/pipeline/evaluation-pipeline.pl b/gi/pipeline/evaluation-pipeline.pl
index e940a5b9..675fd8c2 100755
--- a/gi/pipeline/evaluation-pipeline.pl
+++ b/gi/pipeline/evaluation-pipeline.pl
@@ -8,6 +8,7 @@ my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR
 
 my $JOBS = 15;
 my $PMEM = "9G";
+my $NUM_TRANSLATIONS = 30;
 
 # featurize_grammar may add multiple features from a single feature extractor
 # the key in this map is the extractor name, the value is a list of the extracted features
@@ -132,6 +133,7 @@ if (GetOptions(
         "oov-grammar=s" => \$oovgram,
         "data=s" => \$dataDir,
         "pmem=s" => \$PMEM,
+        "n=i" => \$NUM_TRANSLATIONS,
         "features=s@" => \@features,
         "use-fork" => \$usefork,
         "jobs=i" => \$JOBS,
@@ -279,7 +281,7 @@ sub filter {
     safesystem($out1, $cmd) or die "Filtering failed.";
     $cmd = "gunzip -c $out1 | $FEATURIZE $FEATURIZER_OPTS -g $out1 -c $CORPUS | gzip > $out2";
     safesystem($out2, $cmd) or die "Featurizing failed";
-    $cmd = "$FILTERBYF $out2 $outgrammar";
+    $cmd = "$FILTERBYF $NUM_TRANSLATIONS $out2 $outgrammar";
     safesystem($outgrammar, $cmd) or die "Secondary filtering failed";
   }
   return $outgrammar;
@@ -317,11 +319,12 @@ EOT
 sub print_help {
   print STDERR<<EOT;
 
-Usage: $0 [-c data-config-file] language-pair grammar.bidir.gz [OPTIONS]
+Usage: $0 [-c data-config-file] [-n N] language-pair grammar.bidir.gz [OPTIONS]
 
 Given an induced grammar for an entire corpus (i.e., generated by
 local-gi-pipeline.pl), filter and featurize it for a dev and test set,
-run MERT, report scores.
+run MERT, report scores. Use -n to specify the number of translations
+to keep for a given source (30 is default).
 
 EOT
 }
diff --git a/gi/pipeline/scripts/filter-by-f.pl b/gi/pipeline/scripts/filter-by-f.pl
index 3dd03bdd..0cef0606 100755
--- a/gi/pipeline/scripts/filter-by-f.pl
+++ b/gi/pipeline/scripts/filter-by-f.pl
@@ -8,13 +8,16 @@ my $REFILTER="$SCRIPT_DIR/refilter.pl";
 my $SORT="$SCRIPT_DIR/sort-by-key.sh";
 assert_exec($REKEY, $REFILTER, $SORT);
 
-die "Usage: $0 ingrammar.gz outgrammar.gz\n" unless scalar @ARGV == 2;
+
+die "Usage: $0 NUM-TRANSLATIONS ingrammar.gz outgrammar.gz\n" unless scalar @ARGV == 3;
+my $translations = shift @ARGV;
+die "Need number: $translations" unless $translations > 0;
 die unless $ARGV[0] =~ /\.gz$/;
 die unless $ARGV[1] =~ /\.gz$/;
 die if $ARGV[0] eq $ARGV[1];
 die "Can't find $ARGV[0]" unless -f $ARGV[0];
 
-my $cmd = "gunzip -c $ARGV[0] | $REKEY | $SORT | $REFILTER | gzip > $ARGV[1]";
+my $cmd = "gunzip -c $ARGV[0] | $REKEY | $SORT | $REFILTER $translations | gzip > $ARGV[1]";
 safesystem($ARGV[1], $cmd) or die "Filtering failed";
 exit 0;
 
diff --git a/gi/pipeline/scripts/refilter.pl b/gi/pipeline/scripts/refilter.pl
index 11a36ebe..a783eb4e 100755
--- a/gi/pipeline/scripts/refilter.pl
+++ b/gi/pipeline/scripts/refilter.pl
@@ -1,7 +1,10 @@
 #!/usr/bin/perl -w
 use strict;
 
-my $NUM_TRANSLATIONS = 30;
+my $NUM_TRANSLATIONS = shift @ARGV;
+unless ($NUM_TRANSLATIONS) { $NUM_TRANSLATIONS=30; }
+print STDERR "KEEPING $NUM_TRANSLATIONS TRANSLATIONS FOR SOURCE\n";
+
 my $pk = '';
 my %dict;
 while(<>) {
author	redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f>	2010-07-25 21:03:19 +0000
committer	redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f>	2010-07-25 21:03:19 +0000
commit	7e54f38591ab2aaa0c9a8738425e7f388fec7a1d (patch)
tree	9e97444d03f39d6c53bb78295d30d733678a853f
parent	65a47d21082deb41aceb6516212568408bddaeac (diff)