summary | refs | log | tree | commit | diff
path: root/gi/pipeline
diff options
context:
space:
mode:
Diffstat (limited to 'gi/pipeline')
-rwxr-xr-x gi/pipeline/evaluation-pipeline.pl 27
-rw-r--r-- gi/pipeline/lticluster.config 9
2 files changed, 21 insertions, 15 deletions
diff --git a/gi/pipeline/evaluation-pipeline.pl b/gi/pipeline/evaluation-pipeline.pl
index 2660155f..4b4529d9 100755
--- a/gi/pipeline/evaluation-pipeline.pl
+++ b/gi/pipeline/evaluation-pipeline.pl
@@ -4,11 +4,12 @@ use Getopt::Long;
use Cwd;
my $CWD = getcwd;
-my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR; }
+my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../../environment"; }
+use LocalConfig;
my $JOBS = 15;
my $PMEM = "9G";
-my $NUM_TRANSLATIONS = 30;
+my $NUM_TRANSLATIONS = 50;
my $GOAL = "S";
# featurize_grammar may add multiple features from a single feature extractor
@@ -75,17 +76,7 @@ assert_exec($CDEC, $PARALLELIZE, $FILTER, $FEATURIZE, $DISTVEST, $FILTERBYF);
my $numtopics = 25;
-my $config = "$SCRIPT_DIR/clsp.config";
-if ((scalar @ARGV) >= 2 && ($ARGV[0] eq '-c')) {
- $config = $ARGV[1];
- shift @ARGV; shift @ARGV;
- unless (-f $config) {
- $config = "$SCRIPT_DIR/$config";
- unless (-f $config) {
- $config .= ".config";
- }
- }
-}
+my $config = "$SCRIPT_DIR/" . (lc environment_name()) . '.config';
print STDERR "CORPORA CONFIGURATION: $config\n";
open CONF, "<$config" or die "Can't read $config: $!";
my %paths;
@@ -128,8 +119,10 @@ my $gluegram;
my $oovgram;
my $usefork;
my $lmorder = 3;
+my $density;
if (GetOptions(
"backoff-grammar=s" => \$bkoffgram,
+ "density-prune=f" => \$density,
"glue-grammar=s" => \$gluegram,
"oov-grammar=s" => \$oovgram,
"data=s" => \$dataDir,
@@ -145,6 +138,10 @@ if (GetOptions(
print_help();
exit;
}
+my $DENSITY_PRUNE = '';
+if ($density) {
+ $DENSITY_PRUNE = "--density-prune $density";
+}
if ($usefork) { $usefork="--use-fork"; } else { $usefork = ''; }
my @fkeys = keys %$feat_map;
die "You must specify one or more features with -f. Known features: @fkeys\n" unless scalar @features > 0;
@@ -228,7 +225,7 @@ my $tuned_weights = mydircat($outdir, 'weights.tuned');
if (-f $tuned_weights) {
print STDERR "TUNED WEIGHTS $tuned_weights EXISTS: REUSING\n";
} else {
- my $cmd = "$DISTVEST $usefork --decode-nodes $JOBS --pmem=$PMEM --ref-files=$drefs --source-file=$dev --weights $weights $devini";
+ my $cmd = "$DISTVEST $usefork $DENSITY_PRUNE --decode-nodes $JOBS --pmem=$PMEM --ref-files=$drefs --source-file=$dev --weights $weights $devini";
print STDERR "MERT COMMAND: $cmd\n";
`rm -rf $outdir/vest 2> /dev/null`;
chdir $outdir or die "Can't chdir to $outdir: $!";
@@ -265,7 +262,7 @@ sub write_random_weights_file {
open F, ">$file" or die "Can't write $file: $!";
my @feats = (@DEFAULT_FEATS, @extras);
for my $feat (@feats) {
- my $r = rand(1.6);
+ my $r = rand(0.4) + 0.8;
my $w = $init_weights{$feat} * $r;
if ($w == 0) { $w = 0.0001; print STDERR "WARNING: $feat had no initial weight!\n"; }
print F "$feat $w\n";
diff --git a/gi/pipeline/lticluster.config b/gi/pipeline/lticluster.config
new file mode 100644
index 00000000..3e23c8cb
--- /dev/null
+++ b/gi/pipeline/lticluster.config
@@ -0,0 +1,9 @@
+# THIS FILE GIVES THE LOCATIONS OF THE CORPORA USED
+# name path aligned-corpus LM dev dev-refs test1 test1-eval.sh ...
+/home/cdyer/ws10smt-data
+btec /home/cdyer/ws10smt-data/btec/ split.zh-en.al lm/en.3gram.lm.gz devtest/devset1_2.zh devtest/devset1_2.lc.en* devtest/devset3.zh eval-devset3.sh
+zhen /home/cdyer/ws10smt-data/chinese-english corpus.zh-en.al lm/c2e.3gram.lm.gz dev_and_test/mt02.src.txt dev_and_test/mt02.ref.* dev_and_test/mt03.src.txt eval-mt03.sh
+aren /home/cdyer/ws10smt-data/arabic-english corpus.ar-en-al lm/a2e.3gram.lm.gz dev_and_test/dev.src.txt dev_and_test/dev.ref.txt.* dev_and_test/mt05.src.txt eval-mt05.sh
+uren /home/cdyer/ws10smt-data/urdu-english corpus.ur-en.al lm/u2e.en.lm.gz dev/dev.ur dev/dev.en* devtest/devtest.ur eval-devtest.sh
+nlfr /home/cdyer/ws10smt-data/dutch-french corpus.nl-fr.al
+