From d598c5072738736716cd4370dd1b6d0eb1322e06 Mon Sep 17 00:00:00 2001 From: "olivia.buzek" Date: Tue, 20 Jul 2010 20:03:25 +0000 Subject: Added backoff pipe. git-svn-id: https://ws10smt.googlecode.com/svn/trunk@340 ec762483-ff6d-05da-a07a-a48fb63a330f --- gi/pipeline/backoff-pipe.pl | 178 +++++++++++++++++++++++++++++++++++++ gi/pipeline/evaluation-pipeline.pl | 27 ++++-- gi/pipeline/local-gi-pipeline.pl | 25 ++++-- 3 files changed, 212 insertions(+), 18 deletions(-) create mode 100644 gi/pipeline/backoff-pipe.pl (limited to 'gi/pipeline') diff --git a/gi/pipeline/backoff-pipe.pl b/gi/pipeline/backoff-pipe.pl new file mode 100644 index 00000000..d03b43be --- /dev/null +++ b/gi/pipeline/backoff-pipe.pl @@ -0,0 +1,178 @@ +#!/usr/bin/perl -w +use strict; + +use Getopt::Long "GetOptions"; + +my @grammars; +my $OUTPUTPREFIX = './giwork/bo.hier.grammar'; +my $backoff_levels = 1; +my $glue_levels = 1; +my %FREQ_HIER = (); + +usage() unless &GetOptions('grmr=s@' => \ @grammars, + 'outprefix=s' => \ $OUTPUTPREFIX, + 'bo-lvls=i' => \ $backoff_levels, + 'glue-lvls=i' => \ $glue_levels, +); + +my $OUTDIR = $OUTPUTPREFIX . '/hier'; + +my %grmr = (); +foreach my $grammar (@grammars) { + $grammar =~ m/\/[^\/]*\.t(\d+)\.[^\/]*$/; + my $grains = $1; + $grmr{$grains} = $grammar; +} + +my @index = sort keys %grmr; +$OUTDIR = $OUTDIR . join('-',@index); +my $BACKOFF_GRMR = $OUTDIR . '/backoff.gz'; +my $GLUE_GRMR = $OUTDIR . '/glue.gz'; +my $joinedgrammars = $OUTDIR . '/grammar.hier.gz'; + +join_grammars(); + +for my $i (0..(scalar @index)-2) { + my $freqs = extract_freqs($index[$i], $index[$i+1]); + if ($i < $backoff_levels) { + create_backoff_rules($index[$i],$index[$i+1],$freqs); + } + if ($i < $glue_levels) { + add_glue_rules($index[$i]); + } +} + +output_grammar_info(); + + +sub usage { + print <> 8; + print STDERR "Exit code: $exitcode\n" if $exitcode; + return ! $exitcode; + } +} + + +sub join_grammars { + safesystem("echo \"\" | gzip > $joinedgrammars"); + foreach my $i (@index) { + my $g = $grmr{$i}; + safesystem("zcat $g | sed -r -e 's/(X[0-9]+)/\\1-$i/g' - | gzip > $g"); + safesystem("zcat $joinedgrammars $g | gzip > $joinedgrammars"); + } +} + + +sub extract_freqs { + my($grmr1,$grmr2) = @_; + print STDERR "\n!!!EXTRACTING FREQUENCIES: $grmr1->$grmr2\n"; + my $IN_COARSE = substr($grammars{$grmr1},0,index($grammars{$grmr1},".grammar/")) . "/labeled_spans.txt"; + my $IN_FINE = substr($grammars{$grmr2},0,index($grammars{$grmr2},".grammar/")) . "/labeled_spans.txt"; + my $OUT_SPANS = "$OUTDIR/labeled_spans.hier$NUM_TOPICS_COARSE-$NUM_TOPICS_FINE.txt"; + my $FREQS = "$OUTDIR/label_freq.hier$NUM_TOPICS_COARSE-$NUM_TOPICS_FINE.txt"; + my %finehier = (); + if (-e $OUT_SPANS) { + print STDERR "$OUT_SPANS exists, reusing...\n"; + } else { + safesystem("paste -d ' ' $IN_COARSE $IN_FINE > $OUT_SPANS"); + } + open SPANS, $OUT_SPANS or die $!; + while () { + my ($tmp, $coarse, $fine) = split /\|\|\|/; + my @coarse_spans = $coarse =~ /\d+-\d+:X(\d+)/g; + my @fine_spans = $fine =~ /\d+-\d+:X(\d+)/g; + + foreach my $i (0..(scalar @coarse_spans)-1) { + my $coarse_cat = $coarse_spans[$i]; + my $fine_cat = $fine_spans[$i]; + + $FREQ_HIER{$coarse_cat}{$fine_cat}++; + } + } + close SPANS; + foreach (values %FREQ_HIER) { + my $coarse_freq = $_; + my $total = 0; + $total+=$_ for (values %{ $coarse_freq }); + $coarse_freq->{$_}=log($coarse_freq->{$_}/$total) for (keys %{ $coarse_freq }); + } + open FREQS, ">", $FREQS or die $!; + foreach my $coarse_cat (keys %FREQ_HIER) { + print FREQS "$coarse_cat |||"; + foreach my $fine_cat (keys %{$FREQ_HIER{$coarse_cat}}) { + my $freq = $FREQ_HIER{$coarse_cat}{$fine_cat}; + print FREQS " $fine_cat:$freq"; + if(! exists $finehier{$fine_cat} || $finehier{$fine_cat} < $freq) { + $finehier{$fine_cat} = $coarse_cat; + } + } + print FREQS "\n"; + } +# foreach my $fine_cat (keys %finehier) { +# print FREQS "$fine_cat -> $finehier{$fine_cat}\n"; +# } + close FREQS; + return $FREQS; +} + + +sub create_backoff_rules { + my ($grmr1, $grmr2, $freq) = @_; + open FREQS, $freqs or die $!; + open TMP, ">", "tmp" or die $!; + while () { + my $coarse = m/^(\d+) \|\|\|/; + if ($coarse == $grmr1) { + my @finefreq = m/(\d+):(-?\d+\.?\d*)/g; + for(my $i = 0; $i < scalar @finefreq; $i+=2) { + my $finecat = @finefreq[$i]; + my $finefreq = @finefreq[$i+1]; + print TMP "[X$coarse-$grmr1] ||| [X$finecat-$grmr2,1]\t[1] ||| BackoffRule=$finefreq\n"; + } + } + } + close TMP; + close FREQS; + safesystem('zcat $BACKOFF_GRMR | cat - tmp | gzip > $BACKOFF_GRMR'); +} + +sub add_glue_rules { + my ($grmr) = @_; + open TMP, ">", "tmp" or die $!; + for my $i (0..($grmr-1)) { + print TMP "[S] ||| [S,1] [X$i-$grmr,2] ||| [1] [2] ||| Glue=1\n"; + print TMP "[S] ||| [X$i-$grmr,1] ||| [1] ||| GlueTop=1\n"; + } + close TMP; + safesystem('zcat $GLUE_GRMR | cat - tmp | gzip > $GLUE_GRMR'); +} + +sub output_grammar_info { + print STDOUT "GRAMMAR: \t$joinedgrammars\n"; + print STDOUT "GLUE: \t$GLUE_GRMR\n"; + print STDOUT "BACKOFF: \t$BACKOFF_GRAMMAR\n"; +} diff --git a/gi/pipeline/evaluation-pipeline.pl b/gi/pipeline/evaluation-pipeline.pl index 178159b9..64803fd0 100755 --- a/gi/pipeline/evaluation-pipeline.pl +++ b/gi/pipeline/evaluation-pipeline.pl @@ -1,4 +1,4 @@ -#!/usr/bin/perl -w +sub m#!/usr/bin/perl -w use strict; use Getopt::Long; use Cwd; @@ -120,9 +120,11 @@ my $FEATURIZER_OPTS = ''; my $dataDir = '/export/ws10smt/data'; my @features; my $bkoffgram; +my $gluegram; my $usefork; if (GetOptions( - "backoff_grammar" => \$bkoffgram, + "backoff_grammar=s" => \$bkoffgram, + "glue_grammar=s" => \$gluegram, "data=s" => \$dataDir, "features=s@" => \@features, "use-fork" => \$usefork, @@ -178,13 +180,21 @@ print STDERR "\nCREATING INITIAL WEIGHTS FILE: weights.init\n"; my $weights = mydircat($outdir, "weights.init"); write_random_weights_file($weights, @xfeats); +my $bkoff_grmr; +my $glue_grmr; +if($bkoffgram) { + $bkoff_grmr = mydircat($outdir, "backoff.scfg.gz"); + safesystem("cp $bkoffgram $bkoff_grmr"); +} +if($gluegram) { + $glue_grmr = mydircat($outdir, "glue.bo.scfg.gz"); + safesystem("cp $gluegram $glue_grmr"); +} + # MAKE DEV print STDERR "\nFILTERING FOR dev...\n"; print STDERR "DEV: $dev (REFS=$drefs)\n"; my $devgrammar = filter($grammar, $dev, 'dev', $outdir); -if($bkoffgram) { - $devgrammar = add_backoff($devgrammar, $numtopics, 'dev', $outdir); -} my $devini = mydircat($outdir, "cdec-dev.ini"); write_cdec_ini($devini, $devgrammar); @@ -194,9 +204,6 @@ print STDERR "\nFILTERING FOR test...\n"; print STDERR "TEST: $test (EVAL=$teval)\n"; `mkdir -p $outdir`; my $testgrammar = filter($grammar, $test, 'test', $outdir); -if($bkoffgram) { - $testgrammar = add_backoff($testgrammar, $numtopics, 'test', $outdir); -} my $testini = mydircat($outdir, "cdec-test.ini"); write_cdec_ini($testini, $testgrammar); @@ -296,11 +303,12 @@ sub mydircat { sub write_cdec_ini { my ($filename, $grammar_path) = (@_); open CDECINI, ">$filename" or die "Can't write $filename: $!"; + my $glue = ($gluegram ? "$glue_grmr" : "$datadir/glue/glue.scfg.gz"); print CDECINI < $finehier{$fine_cat}\n"; - } +# foreach my $fine_cat (keys %finehier) { +# print FREQS "$fine_cat -> $finehier{$fine_cat}\n"; +# } close FREQS; $CLUSTER_DIR = $CLUSTER_DIR_F; } @@ -364,7 +371,7 @@ sub grammar_extract { sub grammar_extract_bidir { #gzcat ex.output.gz | ./mr_stripe_rule_reduce -p -b | sort -t $'\t' -k 1 | ./mr_stripe_rule_reduce | gzip > phrase-table.gz - my $LABELED = ($HIER_CAT ? "$CLUSTER_DIR_F/corpus.src_trg_al_label.hier" : "$LABELED_DIR/corpus.src_trg_al_label"); + my $LABELED = "$LABELED_DIR/corpus.src_trg_al_label"; print STDERR "\n!!!EXTRACTING GRAMMAR\n"; my $OUTGRAMMAR = "$GRAMMAR_DIR/grammar.bidir.gz"; if (-e $OUTGRAMMAR) { -- cgit v1.2.3