summary refs log tree commit diff
diff options
context:
space:
mode:
author olivia.buzek <olivia.buzek@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-07-09 22:03:30 +0000
committer olivia.buzek <olivia.buzek@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-07-09 22:03:30 +0000
commit 6211d023c559f3969ac0a827f4635c5b0959f230 (patch)
tree 0033966baa5c3b915439c89a359d7dd17d90c504
parent 261027816caf7676dbb082ce58da086b5f5cb707 (diff)
Fixing backoff grammar.
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@214 ec762483-ff6d-05da-a07a-a48fb63a330f
-rw-r--r-- extools/extract.cc 19
-rwxr-xr-x gi/pipeline/evaluation-pipeline.pl 33
-rwxr-xr-x gi/pipeline/local-gi-pipeline.pl 3
3 files changed, 41 insertions, 14 deletions
diff --git a/extools/extract.cc b/extools/extract.cc
index c2c413e2..14497089 100644
--- a/extools/extract.cc
+++ b/extools/extract.cc
@@ -283,7 +283,7 @@ void Extract::ExtractConsistentRules(const AnnotatedParallelSentence& sentence,
for (short j = 0; j < cur_es.size(); ++j)
if (cur_es[j] >= 0 && sentence.aligned(cur_fs[i],cur_es[j]))
cur_terminal_align.push_back(make_pair(i,j));
- observer->CountRule(lhs, cur_rhs_f, cur_rhs_e, cur_terminal_align);
+ //observer->CountRule(lhs, cur_rhs_f, cur_rhs_e, cur_terminal_align);
if(!all_cats->empty()) {
//produce the backoff grammar if the category wordIDs are available
@@ -294,7 +294,7 @@ void Extract::ExtractConsistentRules(const AnnotatedParallelSentence& sentence,
nonterm+=bkoff_mrkr;
bkoff = -TD::Convert(nonterm);
cur_rhs_f[i]=bkoff;
- vector<WordID> rhs_f_bkoff;
+ /*vector<WordID> rhs_f_bkoff;
vector<WordID> rhs_e_bkoff;
vector<pair<short,short> > bkoff_align;
bkoff_align.clear();
@@ -307,19 +307,12 @@ void Extract::ExtractConsistentRules(const AnnotatedParallelSentence& sentence,
rhs_e_bkoff.push_back(0);
observer->CountRule(bkoff,rhs_f_bkoff,rhs_e_bkoff,bkoff_align);
- }
- }//else
- //cerr << cur_rhs_f[i] << ": (words,f) |" << TD::Convert(cur_rhs_f[i]) << endl;
+ }*/
+ }
}
- /*for (int i=0; i < cur_rhs_e.size(); ++i)
- if(cur_rhs_e[i] <= 0)
- cerr << cur_rhs_e[i] << ": (cats,e) |" << TD::Convert(1-cur_rhs_e[i]) << endl;
- else
- cerr << cur_rhs_e[i] << ": (words,e) |" << TD::Convert(cur_rhs_e[i]) << endl;
- */
-
+
+ }
observer->CountRule(lhs, cur_rhs_f, cur_rhs_e, cur_terminal_align);
- }
}
}
}
diff --git a/gi/pipeline/evaluation-pipeline.pl b/gi/pipeline/evaluation-pipeline.pl
index 3e6d0cfd..a1631b0f 100755
--- a/gi/pipeline/evaluation-pipeline.pl
+++ b/gi/pipeline/evaluation-pipeline.pl
@@ -65,6 +65,8 @@ my $FILTER = "$EXTOOLS/filter_grammar";
my $FEATURIZE = "$EXTOOLS/featurize_grammar";
assert_exec($CDEC, $PARALLELIZE, $FILTER, $FEATURIZE, $DISTVEST);
+my $numtopics = 25;
+
my $config = "$SCRIPT_DIR/clsp.config";
print STDERR "CORPORA CONFIGURATION: $config\n";
open CONF, "<$config" or die "Can't read $config: $!";
@@ -101,7 +103,9 @@ my $help;
my $FEATURIZER_OPTS = '';
my $dataDir = '/export/ws10smt/data';
my @features;
+my $bkoffgram;
if (GetOptions(
+ "backoff_grammar" => \$bkoffgram,
"data=s" => \$dataDir,
"features=s@" => \@features,
) == 0 || @ARGV!=2 || $help) {
@@ -156,6 +160,9 @@ write_random_weights_file($weights, @xfeats);
print STDERR "\nFILTERING FOR dev...\n";
print STDERR "DEV: $dev (REFS=$drefs)\n";
my $devgrammar = filter($grammar, $dev, 'dev', $outdir);
+if($bkoffgram) {
+ $devgrammar = add_backoff($devgrammar, $numtopics, 'dev', $outdir);
+}
my $devini = mydircat($outdir, "cdec-dev.ini");
write_cdec_ini($devini, $devgrammar);
@@ -165,6 +172,9 @@ print STDERR "\nFILTERING FOR test...\n";
print STDERR "TEST: $test (EVAL=$teval)\n";
`mkdir -p $outdir`;
my $testgrammar = filter($grammar, $test, 'test', $outdir);
+if($bkoffgram) {
+ $testgrammar = add_backoff($testgrammar, $numtopics, 'test', $outdir);
+}
my $testini = mydircat($outdir, "cdec-test.ini");
write_cdec_ini($testini, $testgrammar);
@@ -230,6 +240,29 @@ sub filter {
return $outgrammar;
}
+# Append topic-to-topic backoff rules to a gzipped grammar and recompress it.
+#
+# Parameters:
+#   $grammar - path to the gzipped grammar to extend (read with zcat)
+#   $topics  - number of topic categories; $topics * $topics rules are added
+#              (assumed to be a non-negative integer)
+#   $name    - data set label used in the file names (callers pass 'dev' or 'test')
+#   $outdir  - directory the working file and the result are placed in
+#
+# Returns the path of the gzipped grammar that contains the extra rules.
+# Dies if decompressing, appending the rules, or recompressing fails.
+sub add_backoff {
+  my ($grammar, $topics, $name, $outdir) = @_;
+  my $out = mydircat($outdir, "backoff.$name.scfg");
+  my $outgrammar = mydircat($outdir, "$name.scfg.gz");
+
+  # Unpack the existing grammar into a plain-text working file.
+  my $cmd = "zcat $grammar > $out";
+  safesystem($out, $cmd) or die "Adding backoff rules failed.";
+
+  # Append every rule through one filehandle rather than launching a shell
+  # "echo ... >> file" per rule, which costs $topics**2 process spawns.
+  open(my $rules_fh, '>>', $out)
+    or die "Adding backoff rules failed: can't append to $out: $!";
+  for my $tpcnum (0 .. $topics - 1) {
+    for my $tpc2 (0 .. $topics - 1) {
+      # BackoffRule is 0 when a topic rewrites to itself and 1 otherwise.
+      my $bkoff = ($tpc2 == $tpcnum) ? "0" : "1";
+      print {$rules_fh} "[X$tpcnum\_] ||| [X$tpc2,1] ||| [1] ||| BackoffRule=$bkoff\n"
+        or die "Adding backoff rules failed: can't write to $out: $!";
+    }
+  }
+  # Buffered write errors only surface at close, so check it.
+  close($rules_fh)
+    or die "Adding backoff rules failed: can't close $out: $!";
+
+  # NOTE(review): $outgrammar may be the same path as $grammar if filter()
+  # uses the same "$name.scfg.gz" naming; that is safe only because the
+  # zcat above has already finished -- confirm against filter().
+  $cmd = "cat $out | gzip > $outgrammar";
+  safesystem($outgrammar, $cmd) or die "Adding backoff rules failed.";
+  return $outgrammar;
+}
+
+
sub mydircat {
my ($base, $suffix) = @_;
if ($suffix =~ /^\//) { return $suffix; }
diff --git a/gi/pipeline/local-gi-pipeline.pl b/gi/pipeline/local-gi-pipeline.pl
index 817d5c90..259dcd9c 100755
--- a/gi/pipeline/local-gi-pipeline.pl
+++ b/gi/pipeline/local-gi-pipeline.pl
@@ -245,7 +245,8 @@ sub grammar_extract_bidir {
if (-e $OUTGRAMMAR) {
print STDERR "$OUTGRAMMAR exists, reusing...\n";
} else {
- safesystem("$EXTRACTOR -i $LABELED -c $ITEMS_IN_MEMORY -L $BASE_PHRASE_MAX_SIZE -b -t $NUM_TOPICS -g | $SORT_KEYS | $REDUCER -p -b | $SORT_KEYS | $REDUCER | $GZIP > $OUTGRAMMAR") or die "Couldn't extract grammar";
+ my $BACKOFF_ARG = ($BACKOFF_GRAMMAR ? "-g" : "");
+ safesystem("$EXTRACTOR -i $LABELED -c $ITEMS_IN_MEMORY -L $BASE_PHRASE_MAX_SIZE -b -t $NUM_TOPICS $BACKOFF_ARG | $SORT_KEYS | $REDUCER -p -b | $SORT_KEYS | $REDUCER | $GZIP > $OUTGRAMMAR") or die "Couldn't extract grammar";
}
return $OUTGRAMMAR;
}