diff options
-rw-r--r-- | gi/pipeline/clsp.config | 2 | ||||
-rwxr-xr-x | gi/pipeline/evaluation-pipeline.pl | 9 | ||||
-rwxr-xr-x | gi/pipeline/scripts/filter-by-f.pl | 7 | ||||
-rwxr-xr-x | gi/pipeline/scripts/refilter.pl | 5 |
4 files changed, 16 insertions, 7 deletions
diff --git a/gi/pipeline/clsp.config b/gi/pipeline/clsp.config index f8b899f9..c23d409f 100644 --- a/gi/pipeline/clsp.config +++ b/gi/pipeline/clsp.config @@ -3,7 +3,7 @@ /export/ws10smt/data btec /export/ws10smt/data/btec/ split.zh-en.al lm/en.3gram.lm.gz devtest/devset1_2.zh devtest/devset1_2.lc.en* devtest/devset3.zh eval-devset3.sh fbis /export/ws10smt/data/chinese-english.fbis corpus.zh-en.al -zhen /export/ws10smt/data/chinese-english corpus.zh-en.al +zhen /export/ws10smt/data/chinese-english corpus.zh-en.al lm/c2e.3gram.lm.gz dev_and_test/mt02.src.txt dev_and_test/mt02.ref.* dev_and_test/mt03.src.txt eval-mt03.sh aren /export/ws10smt/data/arabic-english corpus.ar-en-al lm/a2e.3gram.lm.gz dev_and_test/dev.src.txt dev_and_test/dev.ref.txt.* dev_and_test/mt05.src.txt eval-mt05.sh uren /export/ws10smt/data/urdu-english corpus.ur-en.al lm/u2e.en.lm.gz dev/dev.ur dev/dev.en* devtest/devtest.ur eval-devtest.sh nlfr /export/ws10smt/data/dutch-french corpus.nl-fr.al diff --git a/gi/pipeline/evaluation-pipeline.pl b/gi/pipeline/evaluation-pipeline.pl index e940a5b9..675fd8c2 100755 --- a/gi/pipeline/evaluation-pipeline.pl +++ b/gi/pipeline/evaluation-pipeline.pl @@ -8,6 +8,7 @@ my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR my $JOBS = 15; my $PMEM = "9G"; +my $NUM_TRANSLATIONS = 30; # featurize_grammar may add multiple features from a single feature extractor # the key in this map is the extractor name, the value is a list of the extracted features @@ -132,6 +133,7 @@ if (GetOptions( "oov-grammar=s" => \$oovgram, "data=s" => \$dataDir, "pmem=s" => \$PMEM, + "n=i" => \$NUM_TRANSLATIONS, "features=s@" => \@features, "use-fork" => \$usefork, "jobs=i" => \$JOBS, @@ -279,7 +281,7 @@ sub filter { safesystem($out1, $cmd) or die "Filtering failed."; $cmd = "gunzip -c $out1 | $FEATURIZE $FEATURIZER_OPTS -g $out1 -c $CORPUS | gzip > $out2"; safesystem($out2, $cmd) or die "Featurizing failed"; - $cmd = "$FILTERBYF $out2 $outgrammar"; + $cmd = "$FILTERBYF $NUM_TRANSLATIONS $out2 $outgrammar"; safesystem($outgrammar, $cmd) or die "Secondary filtering failed"; } return $outgrammar; @@ -317,11 +319,12 @@ EOT sub print_help { print STDERR<<EOT; -Usage: $0 [-c data-config-file] language-pair grammar.bidir.gz [OPTIONS] +Usage: $0 [-c data-config-file] [-n N] language-pair grammar.bidir.gz [OPTIONS] Given an induced grammar for an entire corpus (i.e., generated by local-gi-pipeline.pl), filter and featurize it for a dev and test set, -run MERT, report scores. +run MERT, report scores. Use -n to specify the number of translations +to keep for a given source (30 is default). EOT } diff --git a/gi/pipeline/scripts/filter-by-f.pl b/gi/pipeline/scripts/filter-by-f.pl index 3dd03bdd..0cef0606 100755 --- a/gi/pipeline/scripts/filter-by-f.pl +++ b/gi/pipeline/scripts/filter-by-f.pl @@ -8,13 +8,16 @@ my $REFILTER="$SCRIPT_DIR/refilter.pl"; my $SORT="$SCRIPT_DIR/sort-by-key.sh"; assert_exec($REKEY, $REFILTER, $SORT); -die "Usage: $0 ingrammar.gz outgrammar.gz\n" unless scalar @ARGV == 2; + +die "Usage: $0 NUM-TRANSLATIONS ingrammar.gz outgrammar.gz\n" unless scalar @ARGV == 3; +my $translations = shift @ARGV; +die "Need number: $translations" unless $translations > 0; die unless $ARGV[0] =~ /\.gz$/; die unless $ARGV[1] =~ /\.gz$/; die if $ARGV[0] eq $ARGV[1]; die "Can't find $ARGV[0]" unless -f $ARGV[0]; -my $cmd = "gunzip -c $ARGV[0] | $REKEY | $SORT | $REFILTER | gzip > $ARGV[1]"; +my $cmd = "gunzip -c $ARGV[0] | $REKEY | $SORT | $REFILTER $translations | gzip > $ARGV[1]"; safesystem($ARGV[1], $cmd) or die "Filtering failed"; exit 0; diff --git a/gi/pipeline/scripts/refilter.pl b/gi/pipeline/scripts/refilter.pl index 11a36ebe..a783eb4e 100755 --- a/gi/pipeline/scripts/refilter.pl +++ b/gi/pipeline/scripts/refilter.pl @@ -1,7 +1,10 @@ #!/usr/bin/perl -w use strict; -my $NUM_TRANSLATIONS = 30; +my $NUM_TRANSLATIONS = shift @ARGV; +unless ($NUM_TRANSLATIONS) { $NUM_TRANSLATIONS=30; } +print STDERR "KEEPING $NUM_TRANSLATIONS TRANSLATIONS FOR SOURCE\n"; + my $pk = ''; my %dict; while(<>) { |