diff options
author | redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-07-17 18:17:34 +0000 |
---|---|---|
committer | redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-07-17 18:17:34 +0000 |
commit | d24aa658436cbae3404146c106b0b7569eac60ed (patch) | |
tree | 047c40e0b8958726570dcccb64fbf562f94b04a4 /gi | |
parent | a2d3ef53c27bfaf4a0e9b638d8bfe24d1114fe9c (diff) |
more support for other clusters
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@307 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'gi')
-rw-r--r-- | gi/pipeline/clsp.config | 1 |
-rwxr-xr-x | gi/pipeline/evaluation-pipeline.pl | 16 |
-rw-r--r-- | gi/pipeline/valhalla.config | 1 |
3 files changed, 14 insertions, 4 deletions
diff --git a/gi/pipeline/clsp.config b/gi/pipeline/clsp.config index 27161fab..f7f131a0 100644 --- a/gi/pipeline/clsp.config +++ b/gi/pipeline/clsp.config @@ -1,5 +1,6 @@ # THIS FILE GIVES THE LOCATIONS OF THE CORPORA USED # name path aligned-corpus LM dev dev-refs test1 testt-eval.sh ... +/export/ws10smt/data btec /export/ws10smt/data/btec/ split.zh-en.al lm/en.3gram.lm.gz devtest/devset1_2.zh devtest/devset1_2.lc.en* devtest/devset3.zh eval-devset3.sh fbis /export/ws10smt/data/chinese-english.fbis corpus.zh-en.al zhen /export/ws10smt/data/chinese-english corpus.zh-en.al diff --git a/gi/pipeline/evaluation-pipeline.pl b/gi/pipeline/evaluation-pipeline.pl index c6dcca05..178159b9 100755 --- a/gi/pipeline/evaluation-pipeline.pl +++ b/gi/pipeline/evaluation-pipeline.pl @@ -6,6 +6,8 @@ my $CWD = getcwd; my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR; } +my $JOBS = 15; + # featurize_grammar may add multiple features from a single feature extractor # the key in this map is the extractor name, the value is a list of the extracted features my $feat_map = { @@ -89,6 +91,7 @@ my %devs; my %devrefs; my %tests; my %testevals; +my $datadir; print STDERR " LANGUAGE PAIRS:"; while(<CONF>) { chomp; @@ -96,6 +99,7 @@ while(<CONF>) { next if /^\s*$/; s/^\s+//; s/\s+$//; + if (! 
defined $datadir) { $datadir = $_; next; } my ($name, $path, $corpus, $lm, $dev, $devref, @xtests) = split /\s+/; $paths{$name} = $path; $corpora{$name} = $corpus; @@ -116,15 +120,19 @@ my $FEATURIZER_OPTS = ''; my $dataDir = '/export/ws10smt/data'; my @features; my $bkoffgram; +my $usefork; if (GetOptions( "backoff_grammar" => \$bkoffgram, "data=s" => \$dataDir, "features=s@" => \@features, + "use-fork" => \$usefork, + "jobs=i" => \$JOBS, "out-dir=s" => \$outdir, ) == 0 || @ARGV!=2 || $help) { print_help(); exit; } +if ($usefork) { $usefork="--use-fork"; } else { $usefork = ''; } my @fkeys = keys %$feat_map; push(@features, "BackoffRule") if $bkoffgram; die "You must specify one or more features with -f. Known features: @fkeys\n" unless scalar @features > 0; @@ -200,7 +208,7 @@ my $tuned_weights = mydircat($outdir, 'weights.tuned'); if (-f $tuned_weights) { print STDERR "TUNED WEIGHTS $tuned_weights EXISTS: REUSING\n"; } else { - my $cmd = "$DISTVEST --ref-files=$drefs --source-file=$dev --weights $weights $devini"; + my $cmd = "$DISTVEST $usefork --decode-nodes $JOBS --ref-files=$drefs --source-file=$dev --weights $weights $devini"; print STDERR "MERT COMMAND: $cmd\n"; `rm -rf $outdir/vest 2> /dev/null`; chdir $outdir or die "Can't chdir to $outdir: $!"; @@ -216,7 +224,7 @@ if (-f $tuned_weights) { print STDERR "\nDECODE TEST SET\n"; my $decolog = mydircat($outdir, "test-decode.log"); my $testtrans = mydircat($outdir, "test.trans"); -my $cmd = "cat $test | $PARALLELIZE -j 20 -e $decolog -- $CDEC -c $testini -w $tuned_weights > $testtrans"; +my $cmd = "cat $test | $PARALLELIZE $usefork -j $JOBS -e $decolog -- $CDEC -c $testini -w $tuned_weights > $testtrans"; safesystem($testtrans, $cmd) or die "Failed to decode test set!"; @@ -292,8 +300,8 @@ sub write_cdec_ini { formalism=scfg cubepruning_pop_limit=100 add_pass_through_rules=true -scfg_extra_glue_grammar=/export/ws10smt/data/glue/glue.scfg.gz -grammar=/export/ws10smt/data/oov.scfg.gz 
+scfg_extra_glue_grammar=$datadir/glue/glue.scfg.gz +grammar=$datadir/oov.scfg.gz grammar=$grammar_path scfg_default_nt=OOV scfg_no_hiero_glue_grammar=true diff --git a/gi/pipeline/valhalla.config b/gi/pipeline/valhalla.config index 503cbd4a..e00a8485 100644 --- a/gi/pipeline/valhalla.config +++ b/gi/pipeline/valhalla.config @@ -1,5 +1,6 @@ # THIS FILE GIVES THE LOCATIONS OF THE CORPORA USED # name path aligned-corpus LM dev dev-refs test1 testt-eval.sh ... +/home/chris/ws10smt/data btec /home/chris/ws10smt/data/btec/ split.zh-en.al lm/en.3gram.lm.gz devtest/devset1_2.zh devtest/devset1_2.lc.en* devtest/devset3.zh eval-devset3.sh fbis /home/chris/ws10smt/data/chinese-english.fbis corpus.zh-en.al zhen /home/chris/ws10smt/data/chinese-english corpus.zh-en.al |