From 963b7b96576de000a743ef377c439ea5c6787e2e Mon Sep 17 00:00:00 2001 From: redpony Date: Tue, 10 Aug 2010 20:03:53 +0000 Subject: support for running in multiple environments which are automatically detected git-svn-id: https://ws10smt.googlecode.com/svn/trunk@501 ec762483-ff6d-05da-a07a-a48fb63a330f --- gi/pipeline/evaluation-pipeline.pl | 27 ++++++++++++--------------- gi/pipeline/lticluster.config | 9 +++++++++ 2 files changed, 21 insertions(+), 15 deletions(-) create mode 100644 gi/pipeline/lticluster.config (limited to 'gi') diff --git a/gi/pipeline/evaluation-pipeline.pl b/gi/pipeline/evaluation-pipeline.pl index 2660155f..4b4529d9 100755 --- a/gi/pipeline/evaluation-pipeline.pl +++ b/gi/pipeline/evaluation-pipeline.pl @@ -4,11 +4,12 @@ use Getopt::Long; use Cwd; my $CWD = getcwd; -my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR; } +my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../../environment"; } +use LocalConfig; my $JOBS = 15; my $PMEM = "9G"; -my $NUM_TRANSLATIONS = 30; +my $NUM_TRANSLATIONS = 50; my $GOAL = "S"; # featurize_grammar may add multiple features from a single feature extractor @@ -75,17 +76,7 @@ assert_exec($CDEC, $PARALLELIZE, $FILTER, $FEATURIZE, $DISTVEST, $FILTERBYF); my $numtopics = 25; -my $config = "$SCRIPT_DIR/clsp.config"; -if ((scalar @ARGV) >= 2 && ($ARGV[0] eq '-c')) { - $config = $ARGV[1]; - shift @ARGV; shift @ARGV; - unless (-f $config) { - $config = "$SCRIPT_DIR/$config"; - unless (-f $config) { - $config .= ".config"; - } - } -} +my $config = "$SCRIPT_DIR/" . (lc environment_name()) . '.config'; print STDERR "CORPORA CONFIGURATION: $config\n"; open CONF, "<$config" or die "Can't read $config: $!"; my %paths; @@ -128,8 +119,10 @@ my $gluegram; my $oovgram; my $usefork; my $lmorder = 3; +my $density; if (GetOptions( "backoff-grammar=s" => \$bkoffgram, + "density-prune=f" => \$density, "glue-grammar=s" => \$gluegram, "oov-grammar=s" => \$oovgram, "data=s" => \$dataDir, @@ -145,6 +138,10 @@ if (GetOptions( print_help(); exit; } +my $DENSITY_PRUNE = ''; +if ($density) { + $DENSITY_PRUNE = "--density-prune $density"; +} if ($usefork) { $usefork="--use-fork"; } else { $usefork = ''; } my @fkeys = keys %$feat_map; die "You must specify one or more features with -f. Known features: @fkeys\n" unless scalar @features > 0; @@ -228,7 +225,7 @@ my $tuned_weights = mydircat($outdir, 'weights.tuned'); if (-f $tuned_weights) { print STDERR "TUNED WEIGHTS $tuned_weights EXISTS: REUSING\n"; } else { - my $cmd = "$DISTVEST $usefork --decode-nodes $JOBS --pmem=$PMEM --ref-files=$drefs --source-file=$dev --weights $weights $devini"; + my $cmd = "$DISTVEST $usefork $DENSITY_PRUNE --decode-nodes $JOBS --pmem=$PMEM --ref-files=$drefs --source-file=$dev --weights $weights $devini"; print STDERR "MERT COMMAND: $cmd\n"; `rm -rf $outdir/vest 2> /dev/null`; chdir $outdir or die "Can't chdir to $outdir: $!"; @@ -265,7 +262,7 @@ sub write_random_weights_file { open F, ">$file" or die "Can't write $file: $!"; my @feats = (@DEFAULT_FEATS, @extras); for my $feat (@feats) { - my $r = rand(1.6); + my $r = rand(0.4) + 0.8; my $w = $init_weights{$feat} * $r; if ($w == 0) { $w = 0.0001; print STDERR "WARNING: $feat had no initial weight!\n"; } print F "$feat $w\n"; diff --git a/gi/pipeline/lticluster.config b/gi/pipeline/lticluster.config new file mode 100644 index 00000000..3e23c8cb --- /dev/null +++ b/gi/pipeline/lticluster.config @@ -0,0 +1,9 @@ +# THIS FILE GIVES THE LOCATIONS OF THE CORPORA USED +# name path aligned-corpus LM dev dev-refs test1 testt-eval.sh ... +/home/cdyer/ws10smt-data +btec /home/cdyer/ws10smt-data/btec/ split.zh-en.al lm/en.3gram.lm.gz devtest/devset1_2.zh devtest/devset1_2.lc.en* devtest/devset3.zh eval-devset3.sh +zhen /home/cdyer/ws10smt-data/chinese-english corpus.zh-en.al lm/c2e.3gram.lm.gz dev_and_test/mt02.src.txt dev_and_test/mt02.ref.* dev_and_test/mt03.src.txt eval-mt03.sh +aren /home/cdyer/ws10smt-data/arabic-english corpus.ar-en-al lm/a2e.3gram.lm.gz dev_and_test/dev.src.txt dev_and_test/dev.ref.txt.* dev_and_test/mt05.src.txt eval-mt05.sh +uren /home/cdyer/ws10smt-data/urdu-english corpus.ur-en.al lm/u2e.en.lm.gz dev/dev.ur dev/dev.en* devtest/devtest.ur eval-devtest.sh +nlfr /home/cdyer/ws10smt-data/dutch-french corpus.nl-fr.al + -- cgit v1.2.3