summary | refs | log | tree | commit | diff
path: root/gi/pipeline
diff options
context:
space:
mode:
author redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-07-17 18:17:34 +0000
committer redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-07-17 18:17:34 +0000
commit 5c1cbf1d5e6dcdf14984697e85c803f1dd549515 (patch)
tree 8e9997a71774ffad6f79a3f03761c3a9e3898ff3 /gi/pipeline
parent 6435c790b37b910b8acd7dc621e66a4e0e03f63c (diff)
more support for other clusters
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@307 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'gi/pipeline')
-rw-r--r-- gi/pipeline/clsp.config 1
-rwxr-xr-x gi/pipeline/evaluation-pipeline.pl 16
-rw-r--r-- gi/pipeline/valhalla.config 1
3 files changed, 14 insertions, 4 deletions
diff --git a/gi/pipeline/clsp.config b/gi/pipeline/clsp.config
index 27161fab..f7f131a0 100644
--- a/gi/pipeline/clsp.config
+++ b/gi/pipeline/clsp.config
@@ -1,5 +1,6 @@
# THIS FILE GIVES THE LOCATIONS OF THE CORPORA USED
# name path aligned-corpus LM dev dev-refs test1 testt-eval.sh ...
+/export/ws10smt/data
btec /export/ws10smt/data/btec/ split.zh-en.al lm/en.3gram.lm.gz devtest/devset1_2.zh devtest/devset1_2.lc.en* devtest/devset3.zh eval-devset3.sh
fbis /export/ws10smt/data/chinese-english.fbis corpus.zh-en.al
zhen /export/ws10smt/data/chinese-english corpus.zh-en.al
diff --git a/gi/pipeline/evaluation-pipeline.pl b/gi/pipeline/evaluation-pipeline.pl
index c6dcca05..178159b9 100755
--- a/gi/pipeline/evaluation-pipeline.pl
+++ b/gi/pipeline/evaluation-pipeline.pl
@@ -6,6 +6,8 @@ my $CWD = getcwd;
my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR; }
+my $JOBS = 15;
+
# featurize_grammar may add multiple features from a single feature extractor
# the key in this map is the extractor name, the value is a list of the extracted features
my $feat_map = {
@@ -89,6 +91,7 @@ my %devs;
my %devrefs;
my %tests;
my %testevals;
+my $datadir;
print STDERR " LANGUAGE PAIRS:";
while(<CONF>) {
chomp;
@@ -96,6 +99,7 @@ while(<CONF>) {
next if /^\s*$/;
s/^\s+//;
s/\s+$//;
+ if (! defined $datadir) { $datadir = $_; next; }
my ($name, $path, $corpus, $lm, $dev, $devref, @xtests) = split /\s+/;
$paths{$name} = $path;
$corpora{$name} = $corpus;
@@ -116,15 +120,19 @@ my $FEATURIZER_OPTS = '';
my $dataDir = '/export/ws10smt/data';
my @features;
my $bkoffgram;
+my $usefork;
if (GetOptions(
"backoff_grammar" => \$bkoffgram,
"data=s" => \$dataDir,
"features=s@" => \@features,
+ "use-fork" => \$usefork,
+ "jobs=i" => \$JOBS,
"out-dir=s" => \$outdir,
) == 0 || @ARGV!=2 || $help) {
print_help();
exit;
}
+if ($usefork) { $usefork="--use-fork"; } else { $usefork = ''; }
my @fkeys = keys %$feat_map;
push(@features, "BackoffRule") if $bkoffgram;
die "You must specify one or more features with -f. Known features: @fkeys\n" unless scalar @features > 0;
@@ -200,7 +208,7 @@ my $tuned_weights = mydircat($outdir, 'weights.tuned');
if (-f $tuned_weights) {
print STDERR "TUNED WEIGHTS $tuned_weights EXISTS: REUSING\n";
} else {
- my $cmd = "$DISTVEST --ref-files=$drefs --source-file=$dev --weights $weights $devini";
+ my $cmd = "$DISTVEST $usefork --decode-nodes $JOBS --ref-files=$drefs --source-file=$dev --weights $weights $devini";
print STDERR "MERT COMMAND: $cmd\n";
`rm -rf $outdir/vest 2> /dev/null`;
chdir $outdir or die "Can't chdir to $outdir: $!";
@@ -216,7 +224,7 @@ if (-f $tuned_weights) {
print STDERR "\nDECODE TEST SET\n";
my $decolog = mydircat($outdir, "test-decode.log");
my $testtrans = mydircat($outdir, "test.trans");
-my $cmd = "cat $test | $PARALLELIZE -j 20 -e $decolog -- $CDEC -c $testini -w $tuned_weights > $testtrans";
+my $cmd = "cat $test | $PARALLELIZE $usefork -j $JOBS -e $decolog -- $CDEC -c $testini -w $tuned_weights > $testtrans";
safesystem($testtrans, $cmd) or die "Failed to decode test set!";
@@ -292,8 +300,8 @@ sub write_cdec_ini {
formalism=scfg
cubepruning_pop_limit=100
add_pass_through_rules=true
-scfg_extra_glue_grammar=/export/ws10smt/data/glue/glue.scfg.gz
-grammar=/export/ws10smt/data/oov.scfg.gz
+scfg_extra_glue_grammar=$datadir/glue/glue.scfg.gz
+grammar=$datadir/oov.scfg.gz
grammar=$grammar_path
scfg_default_nt=OOV
scfg_no_hiero_glue_grammar=true
diff --git a/gi/pipeline/valhalla.config b/gi/pipeline/valhalla.config
index 503cbd4a..e00a8485 100644
--- a/gi/pipeline/valhalla.config
+++ b/gi/pipeline/valhalla.config
@@ -1,5 +1,6 @@
# THIS FILE GIVES THE LOCATIONS OF THE CORPORA USED
# name path aligned-corpus LM dev dev-refs test1 testt-eval.sh ...
+/home/chris/ws10smt/data
btec /home/chris/ws10smt/data/btec/ split.zh-en.al lm/en.3gram.lm.gz devtest/devset1_2.zh devtest/devset1_2.lc.en* devtest/devset3.zh eval-devset3.sh
fbis /home/chris/ws10smt/data/chinese-english.fbis corpus.zh-en.al
zhen /home/chris/ws10smt/data/chinese-english corpus.zh-en.al