add additional filtering step

git-svn-id: https://ws10smt.googlecode.com/svn/trunk@368 ec762483-ff6d-05da-a07a-a48fb63a330f
author: redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-07-22 20:52:22 +0000
committer: redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-07-22 20:52:22 +0000
commit: fd920188bd5f912a45ef862b2a2ed7b1062b564e (patch)
tree: b0af1fd850d38c645fb82a389443c72008d90545
parent: 113ebafe09fb0474f30d12190b38da74f8b08b4d (diff)
6 files changed, 76 insertions, 8 deletions
diff --git a/gi/pipeline/clsp.config b/gi/pipeline/clsp.config
index f7f131a0..f8b899f9 100644
--- a/gi/pipeline/clsp.config
+++ b/gi/pipeline/clsp.config
@@ -4,7 +4,7 @@
 btec /export/ws10smt/data/btec/ split.zh-en.al lm/en.3gram.lm.gz devtest/devset1_2.zh devtest/devset1_2.lc.en* devtest/devset3.zh eval-devset3.sh
 fbis /export/ws10smt/data/chinese-english.fbis corpus.zh-en.al
 zhen /export/ws10smt/data/chinese-english corpus.zh-en.al
-aren /export/ws10smt/data/arabic-english corpus.ar-en.al
+aren /export/ws10smt/data/arabic-english corpus.ar-en-al lm/a2e.3gram.lm.gz dev_and_test/dev.src.txt dev_and_test/dev.ref.txt.* dev_and_test/mt05.src.txt eval-mt05.sh
 uren /export/ws10smt/data/urdu-english corpus.ur-en.al lm/u2e.en.lm.gz dev/dev.ur dev/dev.en* devtest/devtest.ur eval-devtest.sh
 nlfr /export/ws10smt/data/dutch-french corpus.nl-fr.al
 
diff --git a/gi/pipeline/evaluation-pipeline.pl b/gi/pipeline/evaluation-pipeline.pl
index b2656985..13fe07cf 100755
--- a/gi/pipeline/evaluation-pipeline.pl
+++ b/gi/pipeline/evaluation-pipeline.pl
@@ -7,6 +7,7 @@ my $CWD = getcwd;
 my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR; }
 
 my $JOBS = 15;
+my $PMEM = "9G";
 
 # featurize_grammar may add multiple features from a single feature extractor
 # the key in this map is the extractor name, the value is a list of the extracted features
@@ -58,6 +59,7 @@ my @DEFAULT_FEATS = qw( PassThrough Glue GlueTop LanguageModel WordPenalty );
 
 
 
+my $FILTERBYF = "$SCRIPT_DIR/scripts/filter-by-f.pl";
 my $CDEC = "$SCRIPT_DIR/../../decoder/cdec";
 my $PARALLELIZE = "$SCRIPT_DIR/../../vest/parallelize.pl";
 my $EXTOOLS = "$SCRIPT_DIR/../../extools";
@@ -67,7 +69,7 @@ die "Can't find vest: $VEST" unless -e $VEST && -d $VEST;
 my $DISTVEST = "$VEST/dist-vest.pl";
 my $FILTER = "$EXTOOLS/filter_grammar";
 my $FEATURIZE = "$EXTOOLS/featurize_grammar";
-assert_exec($CDEC, $PARALLELIZE, $FILTER, $FEATURIZE, $DISTVEST);
+assert_exec($CDEC, $PARALLELIZE, $FILTER, $FEATURIZE, $DISTVEST, $FILTERBYF);
 
 my $numtopics = 25;
 
@@ -126,6 +128,7 @@ if (GetOptions(
         "backoff-grammar=s" => \$bkoffgram,
         "glue-grammar=s" => \$gluegram,
         "data=s" => \$dataDir,
+        "pmem=s" => \$PMEM,
         "features=s@" => \@features,
         "use-fork" => \$usefork,
         "jobs=i" => \$JOBS,
@@ -218,7 +221,7 @@ my $tuned_weights = mydircat($outdir, 'weights.tuned');
 if (-f $tuned_weights) {
   print STDERR "TUNED WEIGHTS $tuned_weights EXISTS: REUSING\n";
 } else {
-  my $cmd = "$DISTVEST $usefork --decode-nodes $JOBS --ref-files=$drefs --source-file=$dev --weights $weights $devini";
+  my $cmd = "$DISTVEST $usefork --decode-nodes $JOBS --pmem=$PMEM --ref-files=$drefs --source-file=$dev --weights $weights $devini";
   print STDERR "MERT COMMAND: $cmd\n";
   `rm -rf $outdir/vest 2> /dev/null`;
   chdir $outdir or die "Can't chdir to $outdir: $!";
@@ -248,6 +251,10 @@ exit 0;
 
 sub write_random_weights_file {
   my ($file, @extras) = @_;
+  if (-f $file) {
+    print STDERR "$file exists - REUSING!\n";
+    return;
+  }
   open F, ">$file" or die "Can't write $file: $!";
   my @feats = (@DEFAULT_FEATS, @extras);
   for my $feat (@feats) {
@@ -262,12 +269,15 @@ sub write_random_weights_file {
 sub filter {
   my ($grammar, $set, $name, $outdir) = @_;
   my $out1 = mydircat($outdir, "$name.filt.gz");
+  my $out2 = mydircat($outdir, "$name.f_feat.gz");
   my $outgrammar = mydircat($outdir, "$name.scfg.gz");
   if (-f $outgrammar) { print STDERR "$outgrammar exists - REUSING!\n"; } else {
     my $cmd = "gunzip -c $grammar | $FILTER -t $set | gzip > $out1";
     safesystem($out1, $cmd) or die "Filtering failed.";
-    $cmd = "gunzip -c $out1 | $FEATURIZE $FEATURIZER_OPTS -g $out1 -c $CORPUS | gzip > $outgrammar";
-    safesystem($outgrammar, $cmd) or die "Featurizing failed";
+    $cmd = "gunzip -c $out1 | $FEATURIZE $FEATURIZER_OPTS -g $out1 -c $CORPUS | gzip > $out2";
+    safesystem($out2, $cmd) or die "Featurizing failed";
+    $cmd = "$FILTERBYF $out2 $outgrammar";
+    safesystem($outgrammar, $cmd) or die "Secondary filtering failed";
   }
   return $outgrammar;
 }  
diff --git a/gi/pipeline/scripts/refilter.pl b/gi/pipeline/scripts/refilter.pl
new file mode 100755
index 00000000..11a36ebe
--- /dev/null
+++ b/gi/pipeline/scripts/refilter.pl
@@ -0,0 +1,37 @@
+#!/usr/bin/perl -w
+# Keep only the $NUM_TRANSLATIONS lowest-scoring rules for each key.  Input lines
+# look like "KEY<TAB>LHS ||| F ||| E ||| FEATS" and must arrive grouped by KEY.
+use strict;
+
+my $NUM_TRANSLATIONS = 30;
+my $pk;    # key of the group being collected; undef until the first line
+my %dict;  # rule text (with its newline) => score, for the current group
+while(<>) {
+  # The key ends at the FIRST tab; a line without one is malformed.
+  s/^([^\t]+)\t// or die "missing KEY<TAB> prefix at input line $.\n";
+  my $key = $1;
+  # Test definedness, not truth, so that a group keyed "0" is still emitted.
+  if (!defined $pk || $key ne $pk) {
+    emit_dict() if defined $pk;
+    %dict = ();
+    $pk = $key;
+  }
+  my ($lhs, $f, $e, $s) = split / \|\|\| /;
+  defined $s or die "expected 4 ' ||| '-separated fields at input line $.\n";
+  # score = XEF + GenerativeProb / 10; both features must be present.
+  $s =~ /XEF=([^ ]+)/ or die "no XEF feature at input line $.\n";
+  my $score = $1 + 0;
+  $s =~ /GenerativeProb=([^ ]+)/ or die "no GenerativeProb feature at input line $.\n";
+  $score += $1 / 10;
+  $dict{"$lhs ||| $f ||| $e ||| $s"} = $score;
+}
+emit_dict();
+
+# Print the (at most) $NUM_TRANSLATIONS lowest-scoring rules held in %dict;
+# ties are broken by rule text so that the output is reproducible.
+sub emit_dict {
+  my @ranked = sort { $dict{$a} <=> $dict{$b} or $a cmp $b } keys %dict;
+  splice @ranked, $NUM_TRANSLATIONS if @ranked > $NUM_TRANSLATIONS;
+  print @ranked;
+}
+
diff --git a/gi/pipeline/scripts/rekey.pl b/gi/pipeline/scripts/rekey.pl
new file mode 100755
index 00000000..31eb86b8
--- /dev/null
+++ b/gi/pipeline/scripts/rekey.pl
@@ -0,0 +1,8 @@
+#!/usr/bin/perl
+# Emit each rule as "KEY<TAB>rule"; KEY is the rule's 2nd ' ||| ' field with [Xn] -> [X].
+while (my $rule = <>) {
+  my (undef, $src) = split / \|\|\| /, $rule;
+  $src =~ s/\[X[0-9]+\]/\[X\]/g;
+  print $src, "\t", $rule;
+}
+
diff --git a/gi/pipeline/scripts/sort-by-key.sh b/gi/pipeline/scripts/sort-by-key.sh
index 948dd4df..7ae33e03 100755
--- a/gi/pipeline/scripts/sort-by-key.sh
+++ b/gi/pipeline/scripts/sort-by-key.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
 
 export LANG=C
-sort -t $'\t' -k 1
+sort -t $'\t' -k 1 -T /tmp -S 6000000000
 
diff --git a/vest/dist-vest.pl b/vest/dist-vest.pl
index 97b72f45..8251de30 100755
--- a/vest/dist-vest.pl
+++ b/vest/dist-vest.pl
@@ -56,6 +56,7 @@ my $maxsim=0;
 my $oraclen=0;
 my $oracleb=20;
 my $dirargs='';
+my $density_prune;
 my $usefork;
 
 # Process command-line options
@@ -63,6 +64,7 @@ Getopt::Long::Configure("no_auto_abbrev");
 if (GetOptions(
 	"decoder=s" => \$decoderOpt,
 	"decode-nodes=i" => \$decode_nodes,
+  "density-prune=f" => \$density_prune,
 	"dont-clean" => \$disable_clean,
         "use-fork" => \$usefork,
 	"dry-run" => \$dryrun,
@@ -93,6 +95,10 @@ if (GetOptions(
 	exit;
 }
 
+if (defined $density_prune) {  # validate only when --density-prune was supplied
+  die "--density-prune n: n must be greater than 1.0\n" unless $density_prune > 1.0;
+}
+
 if ($usefork) { $usefork = "--use-fork"; } else { $usefork = ''; }
 
 if ($metric =~ /^(combi|ter)$/i) {
@@ -236,9 +242,12 @@ while (1){
 	my $im1 = $iteration - 1;
 	my $weightsFile="$dir/weights.$im1";
 	my $decoder_cmd = "$decoder -c $iniFile -w $weightsFile -O $dir/hgs";
+  if ($density_prune) {
+    $decoder_cmd .= " --density_prune $density_prune";
+  }
 	my $pcmd = "cat $srcFile | $parallelize $usefork -p $pmem -e $logdir -j $decode_nodes -- ";
-        if ($run_local) { $pcmd = "cat $srcFile |"; }
-        my $cmd = $pcmd . "$decoder_cmd 2> $decoderLog 1> $runFile";
+  if ($run_local) { $pcmd = "cat $srcFile |"; }
+  my $cmd = $pcmd . "$decoder_cmd 2> $decoderLog 1> $runFile";
 	print STDERR "COMMAND:\n$cmd\n";
 	my $result = 0;
 	$result = system($cmd);
@@ -547,6 +556,10 @@ Options:
 	--decoder <decoder path>
 		Decoder binary to use.
 
+  --density-prune <N>
+    Limit the density of the hypergraph on each iteration to N times
+    the number of edges on the Viterbi path.
+
 	--help
 		Print this message and exit.
author	redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f>	2010-07-22 20:52:22 +0000
committer	redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f>	2010-07-22 20:52:22 +0000
commit	fd920188bd5f912a45ef862b2a2ed7b1062b564e (patch)
tree	b0af1fd850d38c645fb82a389443c72008d90545
parent	113ebafe09fb0474f30d12190b38da74f8b08b4d (diff)