From 6211d023c559f3969ac0a827f4635c5b0959f230 Mon Sep 17 00:00:00 2001 From: "olivia.buzek" Date: Fri, 9 Jul 2010 22:03:30 +0000 Subject: Fixing backoff grammar. git-svn-id: https://ws10smt.googlecode.com/svn/trunk@214 ec762483-ff6d-05da-a07a-a48fb63a330f --- gi/pipeline/evaluation-pipeline.pl | 33 +++++++++++++++++++++++++++++++++ gi/pipeline/local-gi-pipeline.pl | 3 ++- 2 files changed, 35 insertions(+), 1 deletion(-) (limited to 'gi') diff --git a/gi/pipeline/evaluation-pipeline.pl b/gi/pipeline/evaluation-pipeline.pl index 3e6d0cfd..a1631b0f 100755 --- a/gi/pipeline/evaluation-pipeline.pl +++ b/gi/pipeline/evaluation-pipeline.pl @@ -65,6 +65,8 @@ my $FILTER = "$EXTOOLS/filter_grammar"; my $FEATURIZE = "$EXTOOLS/featurize_grammar"; assert_exec($CDEC, $PARALLELIZE, $FILTER, $FEATURIZE, $DISTVEST); +my $numtopics = 25; + my $config = "$SCRIPT_DIR/clsp.config"; print STDERR "CORPORA CONFIGURATION: $config\n"; open CONF, "<$config" or die "Can't read $config: $!"; @@ -101,7 +103,9 @@ my $help; my $FEATURIZER_OPTS = ''; my $dataDir = '/export/ws10smt/data'; my @features; +my $bkoffgram; if (GetOptions( + "backoff_grammar" => \$bkoffgram, "data=s" => \$dataDir, "features=s@" => \@features, ) == 0 || @ARGV!=2 || $help) { @@ -156,6 +160,9 @@ write_random_weights_file($weights, @xfeats); print STDERR "\nFILTERING FOR dev...\n"; print STDERR "DEV: $dev (REFS=$drefs)\n"; my $devgrammar = filter($grammar, $dev, 'dev', $outdir); +if($bkoffgram) { + $devgrammar = add_backoff($devgrammar, $numtopics, 'dev', $outdir); +} my $devini = mydircat($outdir, "cdec-dev.ini"); write_cdec_ini($devini, $devgrammar); @@ -165,6 +172,9 @@ print STDERR "\nFILTERING FOR test...\n"; print STDERR "TEST: $test (EVAL=$teval)\n"; `mkdir -p $outdir`; my $testgrammar = filter($grammar, $test, 'test', $outdir); +if($bkoffgram) { + $testgrammar = add_backoff($testgrammar, $numtopics, 'test', $outdir); +} my $testini = mydircat($outdir, "cdec-test.ini"); write_cdec_ini($testini, $testgrammar); @@ -230,6 +240,29 @@ sub filter { return $outgrammar; } +sub add_backoff { + my ($grammar, $topics, $name, $outdir) = @_; + my $out = mydircat($outdir, "backoff.$name.scfg"); + my $outgrammar = mydircat($outdir, "$name.scfg.gz"); + my $cmd = "zcat $grammar > $out"; + safesystem($out,$cmd) or die "Adding backoff rules failed."; + for(my $tpcnum=0;$tpcnum<$topics;$tpcnum++) { + for(my $tpc2=0;$tpc2<$topics;$tpc2++) { + my $bkoff = "1"; + if($tpc2 == $tpcnum) { + $bkoff = "0"; + } + my $rule = "[X$tpcnum\_] ||| [X$tpc2,1] ||| [1] ||| BackoffRule=$bkoff"; + $cmd = "echo '$rule' >> $out"; + safesystem($out,$cmd) or die "Adding backoff rules failed."; + } + } + $cmd = "cat $out | gzip > $outgrammar"; + safesystem($outgrammar, $cmd) or die "Adding backoff rules failed."; + return $outgrammar; +} + + sub mydircat { my ($base, $suffix) = @_; if ($suffix =~ /^\//) { return $suffix; } diff --git a/gi/pipeline/local-gi-pipeline.pl b/gi/pipeline/local-gi-pipeline.pl index 817d5c90..259dcd9c 100755 --- a/gi/pipeline/local-gi-pipeline.pl +++ b/gi/pipeline/local-gi-pipeline.pl @@ -245,7 +245,8 @@ sub grammar_extract_bidir { if (-e $OUTGRAMMAR) { print STDERR "$OUTGRAMMAR exists, reusing...\n"; } else { - safesystem("$EXTRACTOR -i $LABELED -c $ITEMS_IN_MEMORY -L $BASE_PHRASE_MAX_SIZE -b -t $NUM_TOPICS -g | $SORT_KEYS | $REDUCER -p -b | $SORT_KEYS | $REDUCER | $GZIP > $OUTGRAMMAR") or die "Couldn't extract grammar"; + my $BACKOFF_ARG = ($BACKOFF_GRAMMAR ? "-g" : ""); + safesystem("$EXTRACTOR -i $LABELED -c $ITEMS_IN_MEMORY -L $BASE_PHRASE_MAX_SIZE -b -t $NUM_TOPICS $BACKOFF_ARG | $SORT_KEYS | $REDUCER -p -b | $SORT_KEYS | $REDUCER | $GZIP > $OUTGRAMMAR") or die "Couldn't extract grammar"; } return $OUTGRAMMAR; } -- cgit v1.2.3