From 963b7b96576de000a743ef377c439ea5c6787e2e Mon Sep 17 00:00:00 2001
From: redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f>
Date: Tue, 10 Aug 2010 20:03:53 +0000
Subject: support for running in multiple environments which are automatically
 detected

git-svn-id: https://ws10smt.googlecode.com/svn/trunk@501 ec762483-ff6d-05da-a07a-a48fb63a330f
---
 gi/pipeline/evaluation-pipeline.pl | 27 ++++++++++++---------------
 gi/pipeline/lticluster.config      |  9 +++++++++
 2 files changed, 21 insertions(+), 15 deletions(-)
 create mode 100644 gi/pipeline/lticluster.config

(limited to 'gi')

diff --git a/gi/pipeline/evaluation-pipeline.pl b/gi/pipeline/evaluation-pipeline.pl
index 2660155f..4b4529d9 100755
--- a/gi/pipeline/evaluation-pipeline.pl
+++ b/gi/pipeline/evaluation-pipeline.pl
@@ -4,11 +4,12 @@ use Getopt::Long;
 use Cwd;
 my $CWD = getcwd;
 
-my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR; }
+my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../../environment"; }
+use LocalConfig;
 
 my $JOBS = 15;
 my $PMEM = "9G";
-my $NUM_TRANSLATIONS = 30;
+my $NUM_TRANSLATIONS = 50;
 my $GOAL = "S";
 
 # featurize_grammar may add multiple features from a single feature extractor
@@ -75,17 +76,7 @@ assert_exec($CDEC, $PARALLELIZE, $FILTER, $FEATURIZE, $DISTVEST, $FILTERBYF);
 
 my $numtopics = 25;
 
-my $config = "$SCRIPT_DIR/clsp.config";
-if ((scalar @ARGV) >= 2 && ($ARGV[0] eq '-c')) {
-  $config = $ARGV[1];
-  shift @ARGV; shift @ARGV;
-  unless (-f $config) {
-    $config = "$SCRIPT_DIR/$config";
-    unless (-f $config) {
-      $config .= ".config";
-    }
-  }
-}
+my $config = "$SCRIPT_DIR/" . (lc environment_name()) . '.config';
 print STDERR "CORPORA CONFIGURATION: $config\n";
 open CONF, "<$config" or die "Can't read $config: $!";
 my %paths;
@@ -128,8 +119,10 @@ my $gluegram;
 my $oovgram;
 my $usefork;
 my $lmorder = 3;
+my $density;
 if (GetOptions(
         "backoff-grammar=s" => \$bkoffgram,
+        "density-prune=f" => \$density,
         "glue-grammar=s" => \$gluegram,
         "oov-grammar=s" => \$oovgram,
         "data=s" => \$dataDir,
@@ -145,6 +138,10 @@ if (GetOptions(
         print_help();
         exit;
 }
+my $DENSITY_PRUNE = '';
+if ($density) {
+  $DENSITY_PRUNE = "--density-prune $density";
+}
 if ($usefork) { $usefork="--use-fork"; } else { $usefork = ''; }
 my @fkeys = keys %$feat_map;
 die "You must specify one or more features with -f. Known features: @fkeys\n" unless scalar @features > 0;
@@ -228,7 +225,7 @@ my $tuned_weights = mydircat($outdir, 'weights.tuned');
 if (-f $tuned_weights) {
   print STDERR "TUNED WEIGHTS $tuned_weights EXISTS: REUSING\n";
 } else {
-  my $cmd = "$DISTVEST $usefork --decode-nodes $JOBS --pmem=$PMEM --ref-files=$drefs --source-file=$dev --weights $weights $devini";
+  my $cmd = "$DISTVEST $usefork $DENSITY_PRUNE --decode-nodes $JOBS --pmem=$PMEM --ref-files=$drefs --source-file=$dev --weights $weights $devini";
   print STDERR "MERT COMMAND: $cmd\n";
   `rm -rf $outdir/vest 2> /dev/null`;
   chdir $outdir or die "Can't chdir to $outdir: $!";
@@ -265,7 +262,7 @@ sub write_random_weights_file {
   open F, ">$file" or die "Can't write $file: $!";
   my @feats = (@DEFAULT_FEATS, @extras);
   for my $feat (@feats) {
-    my $r = rand(1.6);
+    my $r = rand(0.4) + 0.8;
     my $w = $init_weights{$feat} * $r;
     if ($w == 0) { $w = 0.0001; print STDERR "WARNING: $feat had no initial weight!\n"; }
     print F "$feat $w\n";
diff --git a/gi/pipeline/lticluster.config b/gi/pipeline/lticluster.config
new file mode 100644
index 00000000..3e23c8cb
--- /dev/null
+++ b/gi/pipeline/lticluster.config
@@ -0,0 +1,9 @@
+# THIS FILE GIVES THE LOCATIONS OF THE CORPORA USED
+# name path aligned-corpus LM dev dev-refs test1 test1-eval.sh ...
+/home/cdyer/ws10smt-data
+btec /home/cdyer/ws10smt-data/btec/ split.zh-en.al lm/en.3gram.lm.gz devtest/devset1_2.zh devtest/devset1_2.lc.en* devtest/devset3.zh eval-devset3.sh
+zhen /home/cdyer/ws10smt-data/chinese-english corpus.zh-en.al lm/c2e.3gram.lm.gz dev_and_test/mt02.src.txt dev_and_test/mt02.ref.* dev_and_test/mt03.src.txt eval-mt03.sh
+aren /home/cdyer/ws10smt-data/arabic-english corpus.ar-en-al lm/a2e.3gram.lm.gz dev_and_test/dev.src.txt dev_and_test/dev.ref.txt.* dev_and_test/mt05.src.txt eval-mt05.sh
+uren /home/cdyer/ws10smt-data/urdu-english corpus.ur-en.al lm/u2e.en.lm.gz dev/dev.ur dev/dev.en* devtest/devtest.ur eval-devtest.sh
+nlfr /home/cdyer/ws10smt-data/dutch-french corpus.nl-fr.al
+
-- 
cgit v1.2.3