summary | refs | log | tree | commit | diff
path: root/gi/pipeline
diff options
context:
space:
mode:
author: olivia.buzek <olivia.buzek@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-07-09 22:03:30 +0000
committer: olivia.buzek <olivia.buzek@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-07-09 22:03:30 +0000
commit: d5105daa487d67752cd599267f74b7c8d502ef1e (patch)
tree: 8aecdff8b68fb901845a424862eb2ef9211e4e7b /gi/pipeline
parent: 3eac13e1c9bb50fdec5fdfa2fb7004ab05646e73 (diff)
Fixing backoff grammar.
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@214 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'gi/pipeline')
-rwxr-xr-x gi/pipeline/evaluation-pipeline.pl 33
-rwxr-xr-x gi/pipeline/local-gi-pipeline.pl 3
2 files changed, 35 insertions, 1 deletions
diff --git a/gi/pipeline/evaluation-pipeline.pl b/gi/pipeline/evaluation-pipeline.pl
index 3e6d0cfd..a1631b0f 100755
--- a/gi/pipeline/evaluation-pipeline.pl
+++ b/gi/pipeline/evaluation-pipeline.pl
@@ -65,6 +65,8 @@ my $FILTER = "$EXTOOLS/filter_grammar";
my $FEATURIZE = "$EXTOOLS/featurize_grammar";
assert_exec($CDEC, $PARALLELIZE, $FILTER, $FEATURIZE, $DISTVEST);
+my $numtopics = 25;
+
my $config = "$SCRIPT_DIR/clsp.config";
print STDERR "CORPORA CONFIGURATION: $config\n";
open CONF, "<$config" or die "Can't read $config: $!";
@@ -101,7 +103,9 @@ my $help;
my $FEATURIZER_OPTS = '';
my $dataDir = '/export/ws10smt/data';
my @features;
+my $bkoffgram;
if (GetOptions(
+ "backoff_grammar" => \$bkoffgram,
"data=s" => \$dataDir,
"features=s@" => \@features,
) == 0 || @ARGV!=2 || $help) {
@@ -156,6 +160,9 @@ write_random_weights_file($weights, @xfeats);
print STDERR "\nFILTERING FOR dev...\n";
print STDERR "DEV: $dev (REFS=$drefs)\n";
my $devgrammar = filter($grammar, $dev, 'dev', $outdir);
+if($bkoffgram) {
+ $devgrammar = add_backoff($devgrammar, $numtopics, 'dev', $outdir);
+}
my $devini = mydircat($outdir, "cdec-dev.ini");
write_cdec_ini($devini, $devgrammar);
@@ -165,6 +172,9 @@ print STDERR "\nFILTERING FOR test...\n";
print STDERR "TEST: $test (EVAL=$teval)\n";
`mkdir -p $outdir`;
my $testgrammar = filter($grammar, $test, 'test', $outdir);
+if($bkoffgram) {
+ $testgrammar = add_backoff($testgrammar, $numtopics, 'test', $outdir);
+}
my $testini = mydircat($outdir, "cdec-test.ini");
write_cdec_ini($testini, $testgrammar);
@@ -230,6 +240,29 @@ sub filter {
return $outgrammar;
}
+# add_backoff($grammar, $topics, $name, $outdir)
+#
+# Builds a copy of a gzipped grammar with topic-to-topic backoff rules appended.
+#
+#   $grammar - path to the gzipped input grammar (read with zcat)
+#   $topics  - number of topics; one rule is written for every ordered pair
+#              (lhs, rhs) with both in 0 .. $topics-1, i.e. $topics**2 rules
+#   $name    - label used in the file names ("backoff.$name.scfg" for the
+#              uncompressed scratch copy, "$name.scfg.gz" for the result)
+#   $outdir  - directory both files are placed in (via mydircat)
+#
+# Returns the path of the gzipped grammar that was written. Dies if any step
+# fails.
+#
+# NOTE(review): the result is named "$name.scfg.gz" inside $outdir. If
+# $grammar is already that same path it gets overwritten in place, and a
+# second run over it would append the rules again - confirm against filter().
+sub add_backoff {
+  my ($grammar, $topics, $name, $outdir) = @_;
+  my $out = mydircat($outdir, "backoff.$name.scfg");
+  my $outgrammar = mydircat($outdir, "$name.scfg.gz");
+
+  # Start from an uncompressed copy of the input grammar.
+  my $cmd = "zcat $grammar > $out";
+  safesystem($out, $cmd) or die "Adding backoff rules failed.";
+
+  # Append the rules through a single filehandle rather than forking one
+  # shell "echo" per rule.
+  open(my $rules_fh, '>>', $out)
+    or die "Adding backoff rules failed: cannot append to $out: $!";
+  for my $lhs (0 .. $topics - 1) {
+    for my $rhs (0 .. $topics - 1) {
+      # Rewriting a topic to itself gets feature value 0, any other topic 1.
+      my $bkoff = ($rhs == $lhs) ? "0" : "1";
+      print {$rules_fh} "[X${lhs}_] ||| [X${rhs},1] ||| [1] ||| BackoffRule=$bkoff\n"
+        or die "Adding backoff rules failed: cannot write to $out: $!";
+    }
+  }
+  # Buffered write errors only surface at close, so check it.
+  close($rules_fh)
+    or die "Adding backoff rules failed: cannot close $out: $!";
+
+  $cmd = "cat $out | gzip > $outgrammar";
+  safesystem($outgrammar, $cmd) or die "Adding backoff rules failed.";
+  return $outgrammar;
+}
+
+
sub mydircat {
my ($base, $suffix) = @_;
if ($suffix =~ /^\//) { return $suffix; }
diff --git a/gi/pipeline/local-gi-pipeline.pl b/gi/pipeline/local-gi-pipeline.pl
index 817d5c90..259dcd9c 100755
--- a/gi/pipeline/local-gi-pipeline.pl
+++ b/gi/pipeline/local-gi-pipeline.pl
@@ -245,7 +245,8 @@ sub grammar_extract_bidir {
if (-e $OUTGRAMMAR) {
print STDERR "$OUTGRAMMAR exists, reusing...\n";
} else {
- safesystem("$EXTRACTOR -i $LABELED -c $ITEMS_IN_MEMORY -L $BASE_PHRASE_MAX_SIZE -b -t $NUM_TOPICS -g | $SORT_KEYS | $REDUCER -p -b | $SORT_KEYS | $REDUCER | $GZIP > $OUTGRAMMAR") or die "Couldn't extract grammar";
+ my $BACKOFF_ARG = ($BACKOFF_GRAMMAR ? "-g" : "");
+ safesystem("$EXTRACTOR -i $LABELED -c $ITEMS_IN_MEMORY -L $BASE_PHRASE_MAX_SIZE -b -t $NUM_TOPICS $BACKOFF_ARG | $SORT_KEYS | $REDUCER -p -b | $SORT_KEYS | $REDUCER | $GZIP > $OUTGRAMMAR") or die "Couldn't extract grammar";
}
return $OUTGRAMMAR;
}