From 7e54f38591ab2aaa0c9a8738425e7f388fec7a1d Mon Sep 17 00:00:00 2001 From: redpony Date: Sun, 25 Jul 2010 21:03:19 +0000 Subject: configure number of translations to keep git-svn-id: https://ws10smt.googlecode.com/svn/trunk@410 ec762483-ff6d-05da-a07a-a48fb63a330f --- gi/pipeline/clsp.config | 2 +- gi/pipeline/evaluation-pipeline.pl | 9 ++++++--- gi/pipeline/scripts/filter-by-f.pl | 7 +++++-- gi/pipeline/scripts/refilter.pl | 5 ++++- 4 files changed, 16 insertions(+), 7 deletions(-) diff --git a/gi/pipeline/clsp.config b/gi/pipeline/clsp.config index f8b899f9..c23d409f 100644 --- a/gi/pipeline/clsp.config +++ b/gi/pipeline/clsp.config @@ -3,7 +3,7 @@ /export/ws10smt/data btec /export/ws10smt/data/btec/ split.zh-en.al lm/en.3gram.lm.gz devtest/devset1_2.zh devtest/devset1_2.lc.en* devtest/devset3.zh eval-devset3.sh fbis /export/ws10smt/data/chinese-english.fbis corpus.zh-en.al -zhen /export/ws10smt/data/chinese-english corpus.zh-en.al +zhen /export/ws10smt/data/chinese-english corpus.zh-en.al lm/c2e.3gram.lm.gz dev_and_test/mt02.src.txt dev_and_test/mt02.ref.* dev_and_test/mt03.src.txt eval-mt03.sh aren /export/ws10smt/data/arabic-english corpus.ar-en-al lm/a2e.3gram.lm.gz dev_and_test/dev.src.txt dev_and_test/dev.ref.txt.* dev_and_test/mt05.src.txt eval-mt05.sh uren /export/ws10smt/data/urdu-english corpus.ur-en.al lm/u2e.en.lm.gz dev/dev.ur dev/dev.en* devtest/devtest.ur eval-devtest.sh nlfr /export/ws10smt/data/dutch-french corpus.nl-fr.al diff --git a/gi/pipeline/evaluation-pipeline.pl b/gi/pipeline/evaluation-pipeline.pl index e940a5b9..675fd8c2 100755 --- a/gi/pipeline/evaluation-pipeline.pl +++ b/gi/pipeline/evaluation-pipeline.pl @@ -8,6 +8,7 @@ my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR my $JOBS = 15; my $PMEM = "9G"; +my $NUM_TRANSLATIONS = 30; # featurize_grammar may add multiple features from a single feature extractor # the key in this map is the extractor name, the value is a list of the extracted features @@ -132,6 +133,7 @@ if (GetOptions( "oov-grammar=s" => \$oovgram, "data=s" => \$dataDir, "pmem=s" => \$PMEM, + "n=i" => \$NUM_TRANSLATIONS, "features=s@" => \@features, "use-fork" => \$usefork, "jobs=i" => \$JOBS, @@ -279,7 +281,7 @@ sub filter { safesystem($out1, $cmd) or die "Filtering failed."; $cmd = "gunzip -c $out1 | $FEATURIZE $FEATURIZER_OPTS -g $out1 -c $CORPUS | gzip > $out2"; safesystem($out2, $cmd) or die "Featurizing failed"; - $cmd = "$FILTERBYF $out2 $outgrammar"; + $cmd = "$FILTERBYF $NUM_TRANSLATIONS $out2 $outgrammar"; safesystem($outgrammar, $cmd) or die "Secondary filtering failed"; } return $outgrammar; @@ -317,11 +319,12 @@ EOT sub print_help { print STDERR< 0; die unless $ARGV[0] =~ /\.gz$/; die unless $ARGV[1] =~ /\.gz$/; die if $ARGV[0] eq $ARGV[1]; die "Can't find $ARGV[0]" unless -f $ARGV[0]; -my $cmd = "gunzip -c $ARGV[0] | $REKEY | $SORT | $REFILTER | gzip > $ARGV[1]"; +my $cmd = "gunzip -c $ARGV[0] | $REKEY | $SORT | $REFILTER $translations | gzip > $ARGV[1]"; safesystem($ARGV[1], $cmd) or die "Filtering failed"; exit 0; diff --git a/gi/pipeline/scripts/refilter.pl b/gi/pipeline/scripts/refilter.pl index 11a36ebe..a783eb4e 100755 --- a/gi/pipeline/scripts/refilter.pl +++ b/gi/pipeline/scripts/refilter.pl @@ -1,7 +1,10 @@ #!/usr/bin/perl -w use strict; -my $NUM_TRANSLATIONS = 30; +my $NUM_TRANSLATIONS = shift @ARGV; +unless ($NUM_TRANSLATIONS) { $NUM_TRANSLATIONS=30; } +print STDERR "KEEPING $NUM_TRANSLATIONS TRANSLATIONS FOR SOURCE\n"; + my $pk = ''; my %dict; while(<>) { -- cgit v1.2.3