4 files changed, 69 insertions, 35 deletions
diff --git a/gi/pipeline/clsp.config b/gi/pipeline/clsp.config
new file mode 100644
index 00000000..cd0f9d65
--- /dev/null
+++ b/gi/pipeline/clsp.config
@@ -0,0 +1,9 @@
+# THIS FILE GIVES THE LOCATIONS OF THE CORPORA USED
+# name path aligned-corpus LM xfeats.grammar dev dev-refs test1 testt-eval.sh ...
+btec /export/ws10smt/data/btec/ split.zh-en.al lm/en.3gram.lm.gz xgrammar/grammar.gz devtest/devset1_2.zh devtest/devset1_2.lc.en* devtest/devset3.zh eval-devset3.sh
+fbis /export/ws10smt/data/chinese-english.fbis corpus.zh-en.al
+zhen /export/ws10smt/data/chinese-english corpus.zh-en.al
+aren /export/ws10smt/data/arabic-english corpus.ar-en.al
+uren /export/ws10smt/data/urdu-english corpus.ur-en.al
+nlfr /export/ws10smt/data/dutch-french corpus.nl-fr.al
+
diff --git a/gi/pipeline/config.eval b/gi/pipeline/config.eval
deleted file mode 100644
index 9f8da238..00000000
--- a/gi/pipeline/config.eval
+++ /dev/null
@@ -1,8 +0,0 @@
-# name path aligned-corpus LM dev dev-refs test1 testt-eval.sh ...
-btec btec/ split.zh-en.al lm/en.3gram.lm.gz devtest/devset1_2.zh devtest/devset1_2.lc.en* devtest/devset3.zh eval-devset3.sh
-fbis chinese-english.fbis corpus.zh-en.al
-zhen chinese-english corpus.zh-en.al
-aren arabic-english corpus.ar-en.al
-uren urdu-english corpus.ur-en.al
-nlfr dutch-french corpus.nl-fr.al
-
diff --git a/gi/pipeline/evaluation-pipeline.pl b/gi/pipeline/evaluation-pipeline.pl
index c27cd37e..bbab2d64 100755
--- a/gi/pipeline/evaluation-pipeline.pl
+++ b/gi/pipeline/evaluation-pipeline.pl
@@ -8,20 +8,28 @@ my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR
 
 my @DEFAULT_FEATS = qw(
   LogRuleCount SingletonRule LexE2F LexF2E WordPenalty
-  LanguageModel Glue GlueTop PassThrough);
+  LogFCount LanguageModel Glue GlueTop PassThrough SingletonF
+);
 
 my %init_weights = qw(
   LogRuleCount 0.2
-  SingletonRule -0.6
   LexE2F -0.3
   LexF2E -0.3
+  LogFCount 0.1
   WordPenalty -1.5
   LanguageModel 1.2
   Glue -1.0
   GlueTop 0.00001
   PassThrough -10.0
+  SingletonRule -0.1
   X_EGivenF -0.3
   X_FGivenE -0.3
+  X_LogECount -1
+  X_LogFCount -0.1
+  X_LogRuleCount 0.3
+  X_SingletonE -0.1
+  X_SingletonF -0.1
+  X_SingletonRule -0.5
 );
 
 my $CDEC = "$SCRIPT_DIR/../../decoder/cdec";
@@ -32,9 +40,11 @@ my $VEST = "$SCRIPT_DIR/../../vest";
 die "Can't find vest: $VEST" unless -e $VEST && -d $VEST;
 my $DISTVEST = "$VEST/dist-vest.pl";
 my $FILTSCORE = "$EXTOOLS/filter_score_grammar";
-assert_exec($CDEC, $PARALLELIZE, $FILTSCORE, $DISTVEST);
+my $ADDXFEATS = "$SCRIPT_DIR/scripts/xfeats.pl";
+assert_exec($CDEC, $PARALLELIZE, $FILTSCORE, $DISTVEST, $ADDXFEATS);
 
-my $config = "$SCRIPT_DIR/config.eval";
+my $config = "$SCRIPT_DIR/clsp.config";
+print STDERR "CORPORA CONFIGURATION: $config\n";
 open CONF, "<$config" or die "Can't read $config: $!";
 my %paths;
 my %corpora;
@@ -43,17 +53,19 @@ my %devs;
 my %devrefs;
 my %tests;
 my %testevals;
-print STDERR "LANGUAGE PAIRS:";
+my %xgrammars;
+print STDERR "       LANGUAGE PAIRS:";
 while(<CONF>) {
   chomp;
   next if /^#/;
   next if /^\s*$/;
   s/^\s+//;
   s/\s+$//;
-  my ($name, $path, $corpus, $lm, $dev, $devref, @xtests) = split /\s+/;
+  my ($name, $path, $corpus, $lm, $xgrammar, $dev, $devref, @xtests) = split /\s+/;
   $paths{$name} = $path;
   $corpora{$name} = $corpus;
   $lms{$name} = $lm;
+  $xgrammars{$name} = $xgrammar;
   $devs{$name} = $dev;
   $devrefs{$name} = $devref;
   $tests{$name} = $xtests[0];
@@ -67,16 +79,15 @@ my %langpairs = map { $_ => 1 } qw( btec zhen fbis aren uren nlfr );
 my $outdir = "$CWD/exp";
 my $help;
 my $XFEATS;
+my $EXTRA_FILTER = '';
 my $dataDir = '/export/ws10smt/data';
 if (GetOptions(
-        "xfeats" => \$XFEATS,
         "data=s" => \$dataDir,
+        "xfeats" => \$XFEATS,
 ) == 0 || @ARGV!=2 || $help) {
         print_help();
         exit;
 }
-if ($XFEATS) { die "TODO: implement adding of X-features\n"; }
-
 my $lp = $ARGV[0];
 my $grammar = $ARGV[1];
 print STDERR "   CORPUS REPO: $dataDir\n";
@@ -102,6 +113,13 @@ my $teval = mydircat($corpdir, $testevals{$lp});
 die "Can't find test: $test\n" unless -f $test;
 assert_exec($teval);
 
+if ($XFEATS) {
+  my $xgram = mydircat($corpdir, $xgrammars{$lp});
+  die "Can't find x-grammar: $xgram" unless -f $xgram;
+  $EXTRA_FILTER = "$ADDXFEATS $xgram |";
+  print STDERR "ADDING X-FEATS FROM $xgram\n";
+}
+
 # MAKE DEV
 print STDERR "\nFILTERING FOR dev...\n";
 print STDERR "DEV: $dev (REFS=$drefs)\n";
@@ -128,31 +146,34 @@ write_random_weights_file($weights);
 
 # VEST
 print STDERR "\nMINIMUM ERROR TRAINING\n";
-my $cmd = "$DISTVEST --ref-files=$drefs --source-file=$dev --weights $weights $devini";
-print STDERR "MERT COMMAND: $cmd\n";
-`rm -rf $outdir/vest 2> /dev/null`;
-chdir $outdir or die "Can't chdir to $outdir: $!";
-$weights = `$cmd`;
-die "MERT reported non-zero exit code" unless $? == 0;
 my $tuned_weights = mydircat($outdir, 'weights.tuned');
-`cp $weights $tuned_weights`;
-print STDERR "TUNED WEIGHTS: $tuned_weights\n";
-die "$tuned_weights is missing!" unless -f $tuned_weights;
-
+if (-f $tuned_weights) {
+  print STDERR "TUNED WEIGHTS $tuned_weights EXISTS: REUSING\n";
+} else {
+  my $cmd = "$DISTVEST --ref-files=$drefs --source-file=$dev --weights $weights $devini";
+  print STDERR "MERT COMMAND: $cmd\n";
+  `rm -rf $outdir/vest 2> /dev/null`;
+  chdir $outdir or die "Can't chdir to $outdir: $!";
+  $weights = `$cmd`;
+  die "MERT reported non-zero exit code" unless $? == 0;
+  `cp $weights $tuned_weights`;
+  print STDERR "TUNED WEIGHTS: $tuned_weights\n";
+  die "$tuned_weights is missing!" unless -f $tuned_weights;
+}
 
 # DECODE
 print STDERR "\nDECODE TEST SET\n";
 my $decolog = mydircat($outdir, "test-decode.log");
 my $testtrans = mydircat($outdir, "test.trans");
-$cmd = "cat $test | $PARALLELIZE -j 20 -e $decolog -- $CDEC -c $testini -w $tuned_weights > $testtrans";
-safesystem($cmd) or die "Failed to decode test set!";
+my $cmd = "cat $test | $PARALLELIZE -j 20 -e $decolog -- $CDEC -c $testini -w $tuned_weights > $testtrans";
+safesystem($testtrans, $cmd) or die "Failed to decode test set!";
 
 
 # EVALUATE
 print STDERR "\nEVALUATE TEST SET\n";
 print STDERR "TEST: $testtrans\n";
 $cmd = "$teval $testtrans";
-safesystem($cmd) or die "Failed to evaluate!";
+safesystem(undef, $cmd) or die "Failed to evaluate!";
 exit 0;
 
 
@@ -160,10 +181,16 @@ sub write_random_weights_file {
   my ($file, @extras) = @_;
   open F, ">$file" or die "Can't write $file: $!";
   my @feats = (@DEFAULT_FEATS, @extras);
-  if ($XFEATS) { push @feats, "X_FGivenE"; push @feats, "X_EGivenF"; }
+  if ($XFEATS) {
+    my @xfeats = qw(
+      X_LogRuleCount X_LogECount X_LogFCount X_EGivenF X_FGivenE X_SingletonRule X_SingletonE X_SingletonF
+    );
+    @feats = (@feats, @xfeats);
+  }
   for my $feat (@feats) {
     my $r = rand(1.6);
     my $w = $init_weights{$feat} * $r;
+    if ($w == 0) { $w = 0.0001; print STDERR "WARNING: $feat had no initial weight!\n"; }
     print F "$feat $w\n";
   }
   close F;
@@ -173,8 +200,8 @@ sub filter {
   my ($grammar, $set, $name, $outdir) = @_;
   my $outgrammar = mydircat($outdir, "$name.scfg.gz");
   if (-f $outgrammar) { print STDERR "$outgrammar exists - REUSING!\n"; } else {
-    my $cmd = "gunzip -c $grammar | $FILTSCORE -c $CORPUS -t $dev | gzip > $outgrammar";
-    safesystem($cmd) or die "Can't filter and score grammar!";
+    my $cmd = "gunzip -c $grammar | $FILTSCORE -c $CORPUS -t $set | $EXTRA_FILTER gzip > $outgrammar";
+    safesystem($outgrammar, $cmd) or die "Can't filter and score grammar!";
   }
   return $outgrammar;
 }
@@ -215,20 +242,26 @@ EOT
 }
 
 sub safesystem {
+  my $output = shift @_;
   print STDERR "Executing: @_\n";
   system(@_);
   if ($? == -1) {
       print STDERR "ERROR: Failed to execute: @_\n  $!\n";
+      if (defined $output && -e $output) { printf STDERR "Removing $output\n"; `rm -rf $output`; }
       exit(1);
   }
   elsif ($? & 127) {
       printf STDERR "ERROR: Execution of: @_\n  died with signal %d, %s coredump\n",
           ($? & 127),  ($? & 128) ? 'with' : 'without';
+      if (defined $output && -e $output) { printf STDERR "Removing $output\n"; `rm -rf $output`; }
       exit(1);
   }
   else {
     my $exitcode = $? >> 8;
-    print STDERR "Exit code: $exitcode\n" if $exitcode;
+    if ($exitcode) {
+      print STDERR "Exit code: $exitcode\n";
+      if (defined $output && -e $output) { printf STDERR "Removing $output\n"; `rm -rf $output`; }
+    }
     return ! $exitcode;
   }
 }
diff --git a/gi/pipeline/local-gi-pipeline.pl b/gi/pipeline/local-gi-pipeline.pl
index 6e396165..9ae7fd95 100755
--- a/gi/pipeline/local-gi-pipeline.pl
+++ b/gi/pipeline/local-gi-pipeline.pl
@@ -14,7 +14,7 @@ my $ITEMS_IN_MEMORY = 10000000;  # cache size in extractors
 my $NUM_TOPICS = 50;
 my $NUM_SAMPLES = 1000;
 my $CONTEXT_SIZE = 1;
-my $BIDIR = 1;
+my $BIDIR = 0;
 my $TOPICS_CONFIG = "pyp-topics.conf";
 
 my $EXTOOLS = "$SCRIPT_DIR/../../extools";