From 5c1cbf1d5e6dcdf14984697e85c803f1dd549515 Mon Sep 17 00:00:00 2001 From: redpony Date: Sat, 17 Jul 2010 18:17:34 +0000 Subject: more support for other clusters git-svn-id: https://ws10smt.googlecode.com/svn/trunk@307 ec762483-ff6d-05da-a07a-a48fb63a330f --- gi/pipeline/clsp.config | 1 + gi/pipeline/evaluation-pipeline.pl | 16 ++++++++++++---- gi/pipeline/valhalla.config | 1 + 3 files changed, 14 insertions(+), 4 deletions(-) (limited to 'gi/pipeline') diff --git a/gi/pipeline/clsp.config b/gi/pipeline/clsp.config index 27161fab..f7f131a0 100644 --- a/gi/pipeline/clsp.config +++ b/gi/pipeline/clsp.config @@ -1,5 +1,6 @@ # THIS FILE GIVES THE LOCATIONS OF THE CORPORA USED # name path aligned-corpus LM dev dev-refs test1 testt-eval.sh ... +/export/ws10smt/data btec /export/ws10smt/data/btec/ split.zh-en.al lm/en.3gram.lm.gz devtest/devset1_2.zh devtest/devset1_2.lc.en* devtest/devset3.zh eval-devset3.sh fbis /export/ws10smt/data/chinese-english.fbis corpus.zh-en.al zhen /export/ws10smt/data/chinese-english corpus.zh-en.al diff --git a/gi/pipeline/evaluation-pipeline.pl b/gi/pipeline/evaluation-pipeline.pl index c6dcca05..178159b9 100755 --- a/gi/pipeline/evaluation-pipeline.pl +++ b/gi/pipeline/evaluation-pipeline.pl @@ -6,6 +6,8 @@ my $CWD = getcwd; my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR; } +my $JOBS = 15; + # featurize_grammar may add multiple features from a single feature extractor # the key in this map is the extractor name, the value is a list of the extracted features my $feat_map = { @@ -89,6 +91,7 @@ my %devs; my %devrefs; my %tests; my %testevals; +my $datadir; print STDERR " LANGUAGE PAIRS:"; while() { chomp; @@ -96,6 +99,7 @@ while() { next if /^\s*$/; s/^\s+//; s/\s+$//; + if (! defined $datadir) { $datadir = $_; next; } my ($name, $path, $corpus, $lm, $dev, $devref, @xtests) = split /\s+/; $paths{$name} = $path; $corpora{$name} = $corpus; @@ -116,15 +120,19 @@ my $FEATURIZER_OPTS = ''; my $dataDir = '/export/ws10smt/data'; my @features; my $bkoffgram; +my $usefork; if (GetOptions( "backoff_grammar" => \$bkoffgram, "data=s" => \$dataDir, "features=s@" => \@features, + "use-fork" => \$usefork, + "jobs=i" => \$JOBS, "out-dir=s" => \$outdir, ) == 0 || @ARGV!=2 || $help) { print_help(); exit; } +if ($usefork) { $usefork="--use-fork"; } else { $usefork = ''; } my @fkeys = keys %$feat_map; push(@features, "BackoffRule") if $bkoffgram; die "You must specify one or more features with -f. Known features: @fkeys\n" unless scalar @features > 0; @@ -200,7 +208,7 @@ my $tuned_weights = mydircat($outdir, 'weights.tuned'); if (-f $tuned_weights) { print STDERR "TUNED WEIGHTS $tuned_weights EXISTS: REUSING\n"; } else { - my $cmd = "$DISTVEST --ref-files=$drefs --source-file=$dev --weights $weights $devini"; + my $cmd = "$DISTVEST $usefork --decode-nodes $JOBS --ref-files=$drefs --source-file=$dev --weights $weights $devini"; print STDERR "MERT COMMAND: $cmd\n"; `rm -rf $outdir/vest 2> /dev/null`; chdir $outdir or die "Can't chdir to $outdir: $!"; @@ -216,7 +224,7 @@ if (-f $tuned_weights) { print STDERR "\nDECODE TEST SET\n"; my $decolog = mydircat($outdir, "test-decode.log"); my $testtrans = mydircat($outdir, "test.trans"); -my $cmd = "cat $test | $PARALLELIZE -j 20 -e $decolog -- $CDEC -c $testini -w $tuned_weights > $testtrans"; +my $cmd = "cat $test | $PARALLELIZE $usefork -j $JOBS -e $decolog -- $CDEC -c $testini -w $tuned_weights > $testtrans"; safesystem($testtrans, $cmd) or die "Failed to decode test set!"; @@ -292,8 +300,8 @@ sub write_cdec_ini { formalism=scfg cubepruning_pop_limit=100 add_pass_through_rules=true -scfg_extra_glue_grammar=/export/ws10smt/data/glue/glue.scfg.gz -grammar=/export/ws10smt/data/oov.scfg.gz +scfg_extra_glue_grammar=$datadir/glue/glue.scfg.gz +grammar=$datadir/oov.scfg.gz grammar=$grammar_path scfg_default_nt=OOV scfg_no_hiero_glue_grammar=true diff --git a/gi/pipeline/valhalla.config b/gi/pipeline/valhalla.config index 503cbd4a..e00a8485 100644 --- a/gi/pipeline/valhalla.config +++ b/gi/pipeline/valhalla.config @@ -1,5 +1,6 @@ # THIS FILE GIVES THE LOCATIONS OF THE CORPORA USED # name path aligned-corpus LM dev dev-refs test1 testt-eval.sh ... +/home/chris/ws10smt/data btec /home/chris/ws10smt/data/btec/ split.zh-en.al lm/en.3gram.lm.gz devtest/devset1_2.zh devtest/devset1_2.lc.en* devtest/devset3.zh eval-devset3.sh fbis /home/chris/ws10smt/data/chinese-english.fbis corpus.zh-en.al zhen /home/chris/ws10smt/data/chinese-english corpus.zh-en.al -- cgit v1.2.3