summaryrefslogtreecommitdiff
path: root/gi/pipeline/evaluation-pipeline.pl
diff options
context:
space:
mode:
authorredpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f>2010-07-06 17:54:45 +0000
committerredpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f>2010-07-06 17:54:45 +0000
commitcd533050dc1e78e1476ae3b54f475cfa63f2623b (patch)
tree5703d0f4c8555f5501ff29590e7cb0de44239210 /gi/pipeline/evaluation-pipeline.pl
parent4c158811b747ff367e76d721d5f722daa4f26f56 (diff)
small fixes
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@156 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'gi/pipeline/evaluation-pipeline.pl')
-rwxr-xr-xgi/pipeline/evaluation-pipeline.pl85
1 files changed, 59 insertions, 26 deletions
diff --git a/gi/pipeline/evaluation-pipeline.pl b/gi/pipeline/evaluation-pipeline.pl
index c27cd37e..bbab2d64 100755
--- a/gi/pipeline/evaluation-pipeline.pl
+++ b/gi/pipeline/evaluation-pipeline.pl
@@ -8,20 +8,28 @@ my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR
my @DEFAULT_FEATS = qw(
LogRuleCount SingletonRule LexE2F LexF2E WordPenalty
- LanguageModel Glue GlueTop PassThrough);
+ LogFCount LanguageModel Glue GlueTop PassThrough SingletonF
+);
my %init_weights = qw(
LogRuleCount 0.2
- SingletonRule -0.6
LexE2F -0.3
LexF2E -0.3
+ LogFCount 0.1
WordPenalty -1.5
LanguageModel 1.2
Glue -1.0
GlueTop 0.00001
PassThrough -10.0
+ SingletonRule -0.1
X_EGivenF -0.3
X_FGivenE -0.3
+ X_LogECount -1
+ X_LogFCount -0.1
+ X_LogRuleCount 0.3
+ X_SingletonE -0.1
+ X_SingletonF -0.1
+ X_SingletonRule -0.5
);
my $CDEC = "$SCRIPT_DIR/../../decoder/cdec";
@@ -32,9 +40,11 @@ my $VEST = "$SCRIPT_DIR/../../vest";
die "Can't find vest: $VEST" unless -e $VEST && -d $VEST;
my $DISTVEST = "$VEST/dist-vest.pl";
my $FILTSCORE = "$EXTOOLS/filter_score_grammar";
-assert_exec($CDEC, $PARALLELIZE, $FILTSCORE, $DISTVEST);
+my $ADDXFEATS = "$SCRIPT_DIR/scripts/xfeats.pl";
+assert_exec($CDEC, $PARALLELIZE, $FILTSCORE, $DISTVEST, $ADDXFEATS);
-my $config = "$SCRIPT_DIR/config.eval";
+my $config = "$SCRIPT_DIR/clsp.config";
+print STDERR "CORPORA CONFIGURATION: $config\n";
open CONF, "<$config" or die "Can't read $config: $!";
my %paths;
my %corpora;
@@ -43,17 +53,19 @@ my %devs;
my %devrefs;
my %tests;
my %testevals;
-print STDERR "LANGUAGE PAIRS:";
+my %xgrammars;
+print STDERR " LANGUAGE PAIRS:";
while(<CONF>) {
chomp;
next if /^#/;
next if /^\s*$/;
s/^\s+//;
s/\s+$//;
- my ($name, $path, $corpus, $lm, $dev, $devref, @xtests) = split /\s+/;
+ my ($name, $path, $corpus, $lm, $xgrammar, $dev, $devref, @xtests) = split /\s+/;
$paths{$name} = $path;
$corpora{$name} = $corpus;
$lms{$name} = $lm;
+ $xgrammars{$name} = $xgrammar;
$devs{$name} = $dev;
$devrefs{$name} = $devref;
$tests{$name} = $xtests[0];
@@ -67,16 +79,15 @@ my %langpairs = map { $_ => 1 } qw( btec zhen fbis aren uren nlfr );
my $outdir = "$CWD/exp";
my $help;
my $XFEATS;
+my $EXTRA_FILTER = '';
my $dataDir = '/export/ws10smt/data';
if (GetOptions(
- "xfeats" => \$XFEATS,
"data=s" => \$dataDir,
+ "xfeats" => \$XFEATS,
) == 0 || @ARGV!=2 || $help) {
print_help();
exit;
}
-if ($XFEATS) { die "TODO: implement adding of X-features\n"; }
-
my $lp = $ARGV[0];
my $grammar = $ARGV[1];
print STDERR " CORPUS REPO: $dataDir\n";
@@ -102,6 +113,13 @@ my $teval = mydircat($corpdir, $testevals{$lp});
die "Can't find test: $test\n" unless -f $test;
assert_exec($teval);
+if ($XFEATS) {
+ my $xgram = mydircat($corpdir, $xgrammars{$lp});
+ die "Can't find x-grammar: $xgram" unless -f $xgram;
+ $EXTRA_FILTER = "$ADDXFEATS $xgram |";
+ print STDERR "ADDING X-FEATS FROM $xgram\n";
+}
+
# MAKE DEV
print STDERR "\nFILTERING FOR dev...\n";
print STDERR "DEV: $dev (REFS=$drefs)\n";
@@ -128,31 +146,34 @@ write_random_weights_file($weights);
# VEST
print STDERR "\nMINIMUM ERROR TRAINING\n";
-my $cmd = "$DISTVEST --ref-files=$drefs --source-file=$dev --weights $weights $devini";
-print STDERR "MERT COMMAND: $cmd\n";
-`rm -rf $outdir/vest 2> /dev/null`;
-chdir $outdir or die "Can't chdir to $outdir: $!";
-$weights = `$cmd`;
-die "MERT reported non-zero exit code" unless $? == 0;
my $tuned_weights = mydircat($outdir, 'weights.tuned');
-`cp $weights $tuned_weights`;
-print STDERR "TUNED WEIGHTS: $tuned_weights\n";
-die "$tuned_weights is missing!" unless -f $tuned_weights;
-
+if (-f $tuned_weights) {
+ print STDERR "TUNED WEIGHTS $tuned_weights EXISTS: REUSING\n";
+} else {
+ my $cmd = "$DISTVEST --ref-files=$drefs --source-file=$dev --weights $weights $devini";
+ print STDERR "MERT COMMAND: $cmd\n";
+ `rm -rf $outdir/vest 2> /dev/null`;
+ chdir $outdir or die "Can't chdir to $outdir: $!";
+ $weights = `$cmd`;
+ die "MERT reported non-zero exit code" unless $? == 0;
+ `cp $weights $tuned_weights`;
+ print STDERR "TUNED WEIGHTS: $tuned_weights\n";
+ die "$tuned_weights is missing!" unless -f $tuned_weights;
+}
# DECODE
print STDERR "\nDECODE TEST SET\n";
my $decolog = mydircat($outdir, "test-decode.log");
my $testtrans = mydircat($outdir, "test.trans");
-$cmd = "cat $test | $PARALLELIZE -j 20 -e $decolog -- $CDEC -c $testini -w $tuned_weights > $testtrans";
-safesystem($cmd) or die "Failed to decode test set!";
+my $cmd = "cat $test | $PARALLELIZE -j 20 -e $decolog -- $CDEC -c $testini -w $tuned_weights > $testtrans";
+safesystem($testtrans, $cmd) or die "Failed to decode test set!";
# EVALUATE
print STDERR "\nEVALUATE TEST SET\n";
print STDERR "TEST: $testtrans\n";
$cmd = "$teval $testtrans";
-safesystem($cmd) or die "Failed to evaluate!";
+safesystem(undef, $cmd) or die "Failed to evaluate!";
exit 0;
@@ -160,10 +181,16 @@ sub write_random_weights_file {
my ($file, @extras) = @_;
open F, ">$file" or die "Can't write $file: $!";
my @feats = (@DEFAULT_FEATS, @extras);
- if ($XFEATS) { push @feats, "X_FGivenE"; push @feats, "X_EGivenF"; }
+ if ($XFEATS) {
+ my @xfeats = qw(
+ X_LogRuleCount X_LogECount X_LogFCount X_EGivenF X_FGivenE X_SingletonRule X_SingletonE X_SingletonF
+ );
+ @feats = (@feats, @xfeats);
+ }
for my $feat (@feats) {
my $r = rand(1.6);
my $w = $init_weights{$feat} * $r;
+ if ($w == 0) { $w = 0.0001; print STDERR "WARNING: $feat had no initial weight!\n"; }
print F "$feat $w\n";
}
close F;
@@ -173,8 +200,8 @@ sub filter {
my ($grammar, $set, $name, $outdir) = @_;
my $outgrammar = mydircat($outdir, "$name.scfg.gz");
if (-f $outgrammar) { print STDERR "$outgrammar exists - REUSING!\n"; } else {
- my $cmd = "gunzip -c $grammar | $FILTSCORE -c $CORPUS -t $dev | gzip > $outgrammar";
- safesystem($cmd) or die "Can't filter and score grammar!";
+ my $cmd = "gunzip -c $grammar | $FILTSCORE -c $CORPUS -t $set | $EXTRA_FILTER gzip > $outgrammar";
+ safesystem($outgrammar, $cmd) or die "Can't filter and score grammar!";
}
return $outgrammar;
}
@@ -215,20 +242,26 @@ EOT
}
sub safesystem {
+ my $output = shift @_;
print STDERR "Executing: @_\n";
system(@_);
if ($? == -1) {
print STDERR "ERROR: Failed to execute: @_\n $!\n";
+ if (defined $output && -e $output) { printf STDERR "Removing $output\n"; `rm -rf $output`; }
exit(1);
}
elsif ($? & 127) {
printf STDERR "ERROR: Execution of: @_\n died with signal %d, %s coredump\n",
($? & 127), ($? & 128) ? 'with' : 'without';
+ if (defined $output && -e $output) { printf STDERR "Removing $output\n"; `rm -rf $output`; }
exit(1);
}
else {
my $exitcode = $? >> 8;
- print STDERR "Exit code: $exitcode\n" if $exitcode;
+ if ($exitcode) {
+ print STDERR "Exit code: $exitcode\n";
+ if (defined $output && -e $output) { printf STDERR "Removing $output\n"; `rm -rf $output`; }
+ }
return ! $exitcode;
}
}