diff options
-rw-r--r-- | extools/extract.cc | 19 | ||||
-rwxr-xr-x | gi/pipeline/evaluation-pipeline.pl | 33 | ||||
-rwxr-xr-x | gi/pipeline/local-gi-pipeline.pl | 3 |
3 files changed, 41 insertions, 14 deletions
diff --git a/extools/extract.cc b/extools/extract.cc index c2c413e2..14497089 100644 --- a/extools/extract.cc +++ b/extools/extract.cc @@ -283,7 +283,7 @@ void Extract::ExtractConsistentRules(const AnnotatedParallelSentence& sentence, for (short j = 0; j < cur_es.size(); ++j) if (cur_es[j] >= 0 && sentence.aligned(cur_fs[i],cur_es[j])) cur_terminal_align.push_back(make_pair(i,j)); - observer->CountRule(lhs, cur_rhs_f, cur_rhs_e, cur_terminal_align); + //observer->CountRule(lhs, cur_rhs_f, cur_rhs_e, cur_terminal_align); if(!all_cats->empty()) { //produce the backoff grammar if the category wordIDs are available @@ -294,7 +294,7 @@ void Extract::ExtractConsistentRules(const AnnotatedParallelSentence& sentence, nonterm+=bkoff_mrkr; bkoff = -TD::Convert(nonterm); cur_rhs_f[i]=bkoff; - vector<WordID> rhs_f_bkoff; + /*vector<WordID> rhs_f_bkoff; vector<WordID> rhs_e_bkoff; vector<pair<short,short> > bkoff_align; bkoff_align.clear(); @@ -307,19 +307,12 @@ void Extract::ExtractConsistentRules(const AnnotatedParallelSentence& sentence, rhs_e_bkoff.push_back(0); observer->CountRule(bkoff,rhs_f_bkoff,rhs_e_bkoff,bkoff_align); - } - }//else - //cerr << cur_rhs_f[i] << ": (words,f) |" << TD::Convert(cur_rhs_f[i]) << endl; + }*/ + } } - /*for (int i=0; i < cur_rhs_e.size(); ++i) - if(cur_rhs_e[i] <= 0) - cerr << cur_rhs_e[i] << ": (cats,e) |" << TD::Convert(1-cur_rhs_e[i]) << endl; - else - cerr << cur_rhs_e[i] << ": (words,e) |" << TD::Convert(cur_rhs_e[i]) << endl; - */ - + + } observer->CountRule(lhs, cur_rhs_f, cur_rhs_e, cur_terminal_align); - } } } } diff --git a/gi/pipeline/evaluation-pipeline.pl b/gi/pipeline/evaluation-pipeline.pl index 3e6d0cfd..a1631b0f 100755 --- a/gi/pipeline/evaluation-pipeline.pl +++ b/gi/pipeline/evaluation-pipeline.pl @@ -65,6 +65,8 @@ my $FILTER = "$EXTOOLS/filter_grammar"; my $FEATURIZE = "$EXTOOLS/featurize_grammar"; assert_exec($CDEC, $PARALLELIZE, $FILTER, $FEATURIZE, $DISTVEST); +my $numtopics = 25; + my $config = "$SCRIPT_DIR/clsp.config"; print STDERR "CORPORA CONFIGURATION: $config\n"; open CONF, "<$config" or die "Can't read $config: $!"; @@ -101,7 +103,9 @@ my $help; my $FEATURIZER_OPTS = ''; my $dataDir = '/export/ws10smt/data'; my @features; +my $bkoffgram; if (GetOptions( + "backoff_grammar" => \$bkoffgram, "data=s" => \$dataDir, "features=s@" => \@features, ) == 0 || @ARGV!=2 || $help) { @@ -156,6 +160,9 @@ write_random_weights_file($weights, @xfeats); print STDERR "\nFILTERING FOR dev...\n"; print STDERR "DEV: $dev (REFS=$drefs)\n"; my $devgrammar = filter($grammar, $dev, 'dev', $outdir); +if($bkoffgram) { + $devgrammar = add_backoff($devgrammar, $numtopics, 'dev', $outdir); +} my $devini = mydircat($outdir, "cdec-dev.ini"); write_cdec_ini($devini, $devgrammar); @@ -165,6 +172,9 @@ print STDERR "\nFILTERING FOR test...\n"; print STDERR "TEST: $test (EVAL=$teval)\n"; `mkdir -p $outdir`; my $testgrammar = filter($grammar, $test, 'test', $outdir); +if($bkoffgram) { + $testgrammar = add_backoff($testgrammar, $numtopics, 'test', $outdir); +} my $testini = mydircat($outdir, "cdec-test.ini"); write_cdec_ini($testini, $testgrammar); @@ -230,6 +240,29 @@ sub filter { return $outgrammar; } +sub add_backoff { + my ($grammar, $topics, $name, $outdir) = @_; + my $out = mydircat($outdir, "backoff.$name.scfg"); + my $outgrammar = mydircat($outdir, "$name.scfg.gz"); + my $cmd = "zcat $grammar > $out"; + safesystem($out,$cmd) or die "Adding backoff rules failed."; + for(my $tpcnum=0;$tpcnum<$topics;$tpcnum++) { + for(my $tpc2=0;$tpc2<$topics;$tpc2++) { + my $bkoff = "1"; + if($tpc2 == $tpcnum) { + $bkoff = "0"; + } + my $rule = "[X$tpcnum\_] ||| [X$tpc2,1] ||| [1] ||| BackoffRule=$bkoff"; + $cmd = "echo '$rule' >> $out"; + safesystem($out,$cmd) or die "Adding backoff rules failed."; + } + } + $cmd = "cat $out | gzip > $outgrammar"; + safesystem($outgrammar, $cmd) or die "Adding backoff rules failed."; + return $outgrammar; +} + + sub mydircat { my ($base, $suffix) = @_; if ($suffix =~ /^\//) { return $suffix; } diff --git a/gi/pipeline/local-gi-pipeline.pl b/gi/pipeline/local-gi-pipeline.pl index 817d5c90..259dcd9c 100755 --- a/gi/pipeline/local-gi-pipeline.pl +++ b/gi/pipeline/local-gi-pipeline.pl @@ -245,7 +245,8 @@ sub grammar_extract_bidir { if (-e $OUTGRAMMAR) { print STDERR "$OUTGRAMMAR exists, reusing...\n"; } else { - safesystem("$EXTRACTOR -i $LABELED -c $ITEMS_IN_MEMORY -L $BASE_PHRASE_MAX_SIZE -b -t $NUM_TOPICS -g | $SORT_KEYS | $REDUCER -p -b | $SORT_KEYS | $REDUCER | $GZIP > $OUTGRAMMAR") or die "Couldn't extract grammar"; + my $BACKOFF_ARG = ($BACKOFF_GRAMMAR ? "-g" : ""); + safesystem("$EXTRACTOR -i $LABELED -c $ITEMS_IN_MEMORY -L $BASE_PHRASE_MAX_SIZE -b -t $NUM_TOPICS $BACKOFF_ARG | $SORT_KEYS | $REDUCER -p -b | $SORT_KEYS | $REDUCER | $GZIP > $OUTGRAMMAR") or die "Couldn't extract grammar"; } return $OUTGRAMMAR; } |