From 59fdf560bfcb324a08971bbd3c0ebbbb39c6409c Mon Sep 17 00:00:00 2001
From: "olivia.buzek" <olivia.buzek@ec762483-ff6d-05da-a07a-a48fb63a330f>
Date: Wed, 21 Jul 2010 18:22:57 +0000
Subject: Debugging backoff.

git-svn-id: https://ws10smt.googlecode.com/svn/trunk@359 ec762483-ff6d-05da-a07a-a48fb63a330f
---
 gi/pipeline/backoff-pipe.pl        | 101 +++++++++++++++++++++++++------------
 gi/pipeline/evaluation-pipeline.pl |  40 ++++-----------
 2 files changed, 79 insertions(+), 62 deletions(-)

(limited to 'gi/pipeline')

diff --git a/gi/pipeline/backoff-pipe.pl b/gi/pipeline/backoff-pipe.pl
index d03b43be..5e36b3d7 100644
--- a/gi/pipeline/backoff-pipe.pl
+++ b/gi/pipeline/backoff-pipe.pl
@@ -5,9 +5,9 @@ use Getopt::Long "GetOptions";
 
 my @grammars;
 my $OUTPUTPREFIX = './giwork/bo.hier.grammar';
+safemkdir($OUTPUTPREFIX);
 my $backoff_levels = 1;
 my $glue_levels = 1;
-my %FREQ_HIER = ();
 
 usage() unless &GetOptions('grmr=s@' => \ @grammars,
                            'outprefix=s' => \ $OUTPUTPREFIX,
@@ -16,18 +16,22 @@ usage() unless &GetOptions('grmr=s@' => \ @grammars,
 );
                            
 my $OUTDIR = $OUTPUTPREFIX . '/hier';
+print STDERR "@grammars\n";
+
 
 my %grmr = ();
 foreach my $grammar (@grammars) {
-    $grammar =~ m/\/[^\/]*\.t(\d+)\.[^\/]*$/;
-    my $grains = $1;
-    $grmr{$grains} = $grammar;
+    $grammar =~ m/\/[^\/]*\.t(\d+)\.[^\/]*/;
+    $grmr{$1} = $grammar;
 }
 
 my @index = sort keys %grmr;
 $OUTDIR = $OUTDIR . join('-',@index);
-my $BACKOFF_GRMR = $OUTDIR . '/backoff.gz';
-my $GLUE_GRMR = $OUTDIR . '/glue.gz';
+safemkdir($OUTDIR);
+my $BACKOFF_GRMR = $OUTDIR . '/backoff.hier.gz';
+safesystem("echo \"\" | gzip > $BACKOFF_GRMR");
+my $GLUE_GRMR = $OUTDIR . '/glue.hier.gz';
+safesystem("echo \"\" | gzip > $GLUE_GRMR");
 my $joinedgrammars = $OUTDIR . '/grammar.hier.gz';
 
 join_grammars();
@@ -56,6 +60,12 @@ EOT
   exit 1;
 };
 
+sub safemkdir {
+  my $dir = shift;
+  if (-d $dir) { return 1; }
+  return mkdir($dir);
+}
+
 
 sub safesystem {
   print STDERR "Executing: @_\n";
@@ -78,11 +88,17 @@ sub safesystem {
 
 
 sub join_grammars {
+    print STDERR "\n!!! JOINING GRAMMARS\n";
+    if(-e $joinedgrammars) {
+        print STDERR "$joinedgrammars exists, reusing...\n";
+        return;
+    }
     safesystem("echo \"\" | gzip > $joinedgrammars");
     foreach my $i (@index) {
         my $g = $grmr{$i};
-        safesystem("zcat $g | sed -r -e 's/(X[0-9]+)/\\1-$i/g' - | gzip > $g");
-        safesystem("zcat $joinedgrammars $g | gzip > $joinedgrammars");
+        safesystem("zcat $g | sed -r -e 's/X([0-9]+)/X$i\\1/g' - | gzip > $g.2.gz");
+        safesystem("zcat $joinedgrammars $g.2.gz | gzip > $joinedgrammars.2.gz");
+        safesystem("mv $joinedgrammars.2.gz $joinedgrammars");
     }
 }
 
@@ -90,16 +106,21 @@ sub join_grammars {
 sub extract_freqs {
     my($grmr1,$grmr2) = @_;
     print STDERR "\n!!!EXTRACTING FREQUENCIES: $grmr1->$grmr2\n";
-    my $IN_COARSE = substr($grammars{$grmr1},0,index($grammars{$grmr1},".grammar/")) . "/labeled_spans.txt";
-    my $IN_FINE = substr($grammars{$grmr2},0,index($grammars{$grmr2},".grammar/")) . "/labeled_spans.txt";
-    my $OUT_SPANS = "$OUTDIR/labeled_spans.hier$NUM_TOPICS_COARSE-$NUM_TOPICS_FINE.txt";
-    my $FREQS = "$OUTDIR/label_freq.hier$NUM_TOPICS_COARSE-$NUM_TOPICS_FINE.txt";
-    my %finehier = ();
-    if (-e $OUT_SPANS) {
+    my $IN_COARSE = substr($grmr{$grmr1},0,index($grmr{$grmr1},".grammar/")) . "/labeled_spans.txt";
+    my $IN_FINE = substr($grmr{$grmr2},0,index($grmr{$grmr2},".grammar/")) . "/labeled_spans.txt";
+    my $OUT_SPANS = "$OUTDIR/labeled_spans.hier$grmr1-$grmr2.txt";
+    my $FREQS = "$OUTDIR/label_freq.hier$grmr1-$grmr2.txt";
+    if(-e $OUT_SPANS && -e $FREQS) {
         print STDERR "$OUT_SPANS exists, reusing...\n";
-    } else {
-        safesystem("paste -d ' ' $IN_COARSE $IN_FINE > $OUT_SPANS");
+        print STDERR "$FREQS exists, reusing...\n";
+        return $FREQS;
     }
+    
+    safesystem("paste -d ' ' $IN_COARSE $IN_FINE > $OUT_SPANS");
+    
+    my %FREQ_HIER = ();
+    my %finehier = ();
+    
     open SPANS, $OUT_SPANS or die $!;
     while (<SPANS>) {
         my ($tmp, $coarse, $fine) = split /\|\|\|/;
@@ -141,38 +162,54 @@ sub extract_freqs {
 
 
 sub create_backoff_rules {
+    print STDERR "\n!!! CREATING BACKOFF RULES\n";
     my ($grmr1, $grmr2, $freq) = @_;
-    open FREQS, $freqs or die $!;
-    open TMP, ">", "tmp" or die $!;
+    my $OUTFILE = "$OUTDIR/backoff.hier$grmr1-$grmr2.txt";
+    if(-e $OUTFILE) {
+        print STDERR "$OUTFILE exists, reusing...\n";
+        return;
+    }
+    open FREQS, $freq or die $!;
+    open TMP, ">", $OUTFILE or die $!;
     while (<FREQS>) {
-        my $coarse = m/^(\d+) \|\|\|/;
-        if ($coarse == $grmr1) {
-            my @finefreq = m/(\d+):(-?\d+\.?\d*)/g;
-            for(my $i = 0; $i < scalar @finefreq; $i+=2) {
-                my $finecat = @finefreq[$i];
-                my $finefreq = @finefreq[$i+1];
-                print TMP "[X$coarse-$grmr1] ||| [X$finecat-$grmr2,1]\t[1] ||| BackoffRule=$finefreq\n";
-            }
+        my $line = $_;
+        $line = m/^(\d+) \|\|\| (.+)$/;
+        my $coarse = $1;
+        $line = $2;
+        my @finefreq = $line =~ m/(\d+):(\S+)/g;
+        for(my $i = 0; $i < scalar @finefreq; $i+=2) {
+            my $finecat = $finefreq[$i];
+            my $finefreq = $finefreq[$i+1];
+            print TMP "[X$grmr1$coarse] ||| [X$grmr2$finecat,1]\t[1] ||| BackoffRule=$finefreq\n";
         }
     }
     close TMP;
     close FREQS;
-    safesystem('zcat $BACKOFF_GRMR | cat - tmp | gzip > $BACKOFF_GRMR');
+    safesystem("zcat $BACKOFF_GRMR | cat - $OUTFILE | gzip > $BACKOFF_GRMR.2.gz");
+    safesystem("mv $BACKOFF_GRMR.2.gz $BACKOFF_GRMR");
 }
 
 sub add_glue_rules {
+    print STDERR "\n!!! CREATING GLUE RULES\n";
     my ($grmr) = @_;
-    open TMP, ">", "tmp" or die $!;
+    my $OUTFILE = "$OUTDIR/glue.$grmr.gz";
+    if (-e $OUTFILE) {
+        print STDERR "$OUTFILE exists, reusing...\n";
+        return;
+    }
+    open TMP, ">", $OUTFILE or die $!;
     for my $i (0..($grmr-1)) {
-        print TMP "[S] ||| [S,1] [X$i-$grmr,2] ||| [1] [2] ||| Glue=1\n";
-        print TMP "[S] ||| [X$i-$grmr,1] ||| [1] ||| GlueTop=1\n";
+        print TMP "[S] ||| [S,1] [X$grmr$i,2] ||| [1] [2] ||| Glue=1\n";
+        print TMP "[S] ||| [X$grmr$i,1] ||| [1] ||| GlueTop=1\n";
     }
     close TMP;
-    safesystem('zcat $GLUE_GRMR | cat - tmp | gzip > $GLUE_GRMR');
+    safesystem("zcat $GLUE_GRMR | cat - $OUTFILE | gzip > $GLUE_GRMR.2.gz");
+    safesystem("mv $GLUE_GRMR.2.gz $GLUE_GRMR");
 }
 
 sub output_grammar_info {
+    print STDERR "\n!!! GRAMMAR INFORMATION\n";
     print STDOUT "GRAMMAR: \t$joinedgrammars\n";
     print STDOUT "GLUE: \t$GLUE_GRMR\n";
-    print STDOUT "BACKOFF: \t$BACKOFF_GRAMMAR\n";
+    print STDOUT "BACKOFF: \t$BACKOFF_GRMR\n";
 }
diff --git a/gi/pipeline/evaluation-pipeline.pl b/gi/pipeline/evaluation-pipeline.pl
index 64803fd0..fe0fb1a5 100755
--- a/gi/pipeline/evaluation-pipeline.pl
+++ b/gi/pipeline/evaluation-pipeline.pl
@@ -1,4 +1,4 @@
-sub m#!/usr/bin/perl -w
+#!/usr/bin/perl -w
 use strict;
 use Getopt::Long;
 use Cwd;
@@ -123,8 +123,8 @@ my $bkoffgram;
 my $gluegram;
 my $usefork;
 if (GetOptions(
-        "backoff_grammar=s" => \$bkoffgram,
-        "glue_grammar=s" => \$gluegram,
+        "backoff-grammar=s" => \$bkoffgram,
+        "glue-grammar=s" => \$gluegram,
         "data=s" => \$dataDir,
         "features=s@" => \@features,
         "use-fork" => \$usefork,
@@ -136,7 +136,6 @@ if (GetOptions(
 }
 if ($usefork) { $usefork="--use-fork"; } else { $usefork = ''; }
 my @fkeys = keys %$feat_map;
-push(@features, "BackoffRule") if $bkoffgram;
 die "You must specify one or more features with -f. Known features: @fkeys\n" unless scalar @features > 0;
 my @xfeats;
 for my $feat (@features) {
@@ -183,12 +182,16 @@ write_random_weights_file($weights, @xfeats);
 my $bkoff_grmr;
 my $glue_grmr;
 if($bkoffgram) {
+    print STDERR "Placing backoff grammar…\n";
     $bkoff_grmr = mydircat($outdir, "backoff.scfg.gz");
-    safesystem("cp $bkoffgram $bkoff_grmr");
+    print STDERR "cp $bkoffgram $bkoff_grmr\n";
+    safesystem(undef,"cp $bkoffgram $bkoff_grmr");
 }
 if($gluegram) {
+    print STDERR "Placing glue grammar…\n";
     $glue_grmr = mydircat($outdir, "glue.bo.scfg.gz");
-    safesystem("cp $gluegram $glue_grmr");
+    print STDERR "cp $gluegram $glue_grmr\n";
+    safesystem(undef,"cp $gluegram $glue_grmr");
 }
 
 # MAKE DEV
@@ -267,30 +270,7 @@ sub filter {
     safesystem($outgrammar, $cmd) or die "Featurizing failed";
   }
   return $outgrammar;
-}
-
-sub add_backoff {
-  my ($grammar, $topics, $name, $outdir) = @_;
-  my $out = mydircat($outdir, "backoff.$name.scfg");
-  my $outgrammar = mydircat($outdir, "$name.scfg.gz");
-  my $cmd = "zcat $grammar > $out";
-  safesystem($out,$cmd) or die "Adding backoff rules failed.";
-  for(my $tpcnum=0;$tpcnum<$topics;$tpcnum++) {
-    for(my $tpc2=0;$tpc2<$topics;$tpc2++) {
-      my $bkoff = "1";
-      if($tpc2 == $tpcnum) {
-        $bkoff = "0";
-      }
-      my $rule = "[X$tpcnum\_] ||| [X$tpc2,1] ||| [1] ||| BackoffRule=$bkoff";
-      $cmd = "echo '$rule' >> $out";
-      safesystem($out,$cmd) or die "Adding backoff rules failed.";
-    }
-  }
-  $cmd = "cat $out | gzip > $outgrammar";
-  safesystem($outgrammar, $cmd) or die "Adding backoff rules failed.";
-  return $outgrammar;
-}
-  
+}  
 
 sub mydircat {
  my ($base, $suffix) = @_;
-- 
cgit v1.2.3