From 19bd995999b6935c16ecbd35d8277939b3d51bc2 Mon Sep 17 00:00:00 2001
From: "olivia.buzek" <olivia.buzek@ec762483-ff6d-05da-a07a-a48fb63a330f>
Date: Thu, 15 Jul 2010 21:54:58 +0000
Subject: Unified backoff_grammar and hier_cat.

git-svn-id: https://ws10smt.googlecode.com/svn/trunk@270 ec762483-ff6d-05da-a07a-a48fb63a330f
---
 gi/pipeline/evaluation-pipeline.pl |  1 +
 gi/pipeline/local-gi-pipeline.pl   | 63 +++++++++++++++++++++++++++++---------
 2 files changed, 50 insertions(+), 14 deletions(-)

(limited to 'gi')

diff --git a/gi/pipeline/evaluation-pipeline.pl b/gi/pipeline/evaluation-pipeline.pl
index 6e786a8a..8ee41122 100755
--- a/gi/pipeline/evaluation-pipeline.pl
+++ b/gi/pipeline/evaluation-pipeline.pl
@@ -116,6 +116,7 @@ if (GetOptions(
         exit;
 }
 my @fkeys = keys %$feat_map;
+push(@features, "BackoffRule") if $bkoffgram;  # added before the emptiness check below, so it counts as a specified feature
 die "You must specify one or more features with -f. Known features: @fkeys\n" unless scalar @features > 0;
 my @xfeats;
 for my $feat (@features) {
diff --git a/gi/pipeline/local-gi-pipeline.pl b/gi/pipeline/local-gi-pipeline.pl
index a895a03f..131c22aa 100755
--- a/gi/pipeline/local-gi-pipeline.pl
+++ b/gi/pipeline/local-gi-pipeline.pl
@@ -13,7 +13,7 @@ my $BASE_PHRASE_MAX_SIZE = 10;
 my $COMPLETE_CACHE = 1;
 my $ITEMS_IN_MEMORY = 10000000;  # cache size in extractors
 my $NUM_TOPICS = 50;
-my $NUM_TOPICS_COARSE = 10;
+my $NUM_TOPICS_COARSE;  # no default: stays undef (false) unless assigned later; a true value switches $HIER_CAT on
 my $NUM_TOPICS_FINE = $NUM_TOPICS;
 my $NUM_SAMPLES = 1000;
 my $CONTEXT_SIZE = 1;
@@ -54,6 +54,7 @@ assert_exec($PATCH_CORPUS, $SORT_KEYS, $REDUCER, $EXTRACTOR, $PYP_TOPICS_TRAIN,
 my $BACKOFF_GRAMMAR;
 my $DEFAULT_CAT;
 my $HIER_CAT;
+my %FREQ_HIER = ();  # coarse category => { fine category => count }, converted to log relative frequencies in extract_freqs
 my $TAGGED_CORPUS;
 
 my $NAME_SHORTCUT;
@@ -61,7 +62,6 @@ my $NAME_SHORTCUT;
 my $OUTPUT = './giwork';
 usage() unless &GetOptions('base_phrase_max_size=i' => \$BASE_PHRASE_MAX_SIZE,
                            'backoff_grammar' => \$BACKOFF_GRAMMAR,
-                           'hier_cat' => \$HIER_CAT,
                            'output=s' => \$OUTPUT,
                            'model=s' => \$MODEL,
                            'topics=i' => \$NUM_TOPICS_FINE,
@@ -91,6 +91,8 @@ open F, "<$CORPUS" or die "Can't read $CORPUS: $!"; close F;
 
 $NUM_TOPICS = $NUM_TOPICS_FINE;
 
+$HIER_CAT = ( $NUM_TOPICS_COARSE ? 1 : 0 );  # derived flag: 1 exactly when $NUM_TOPICS_COARSE holds a true (non-zero) value
+
 print STDERR "   Output: $OUTPUT\n";
 my $DATA_DIR = $OUTPUT . '/corpora';
 my $LEX_NAME = 'corpus.f_e_a.lex';
@@ -147,7 +149,7 @@ if($HIER_CAT) {
     $NUM_TOPICS = $NUM_TOPICS_FINE;
     $CLUSTER_DIR = $CLUSTER_DIR_F;
     label_spans_with_topics();
-    combine_labelled_spans();
+    extract_freqs();
 } else {
     label_spans_with_topics();
 }
@@ -295,28 +297,61 @@ sub label_spans_with_topics {
   }
 }
 
-sub combine_labelled_spans {
-    print STDERR "\n!!!COMBINING SPAN LABELS\n";
+# Tabulate how often each fine label co-occurs with each coarse label, then
+# write per-coarse log relative frequencies and a fine->coarse map to $FREQS.
+sub extract_freqs {
+    print STDERR "\n!!!EXTRACTING FREQUENCIES\n";
     my $IN_COARSE = "$CLUSTER_DIR_C/labeled_spans.txt";
-    my $OUT_COARSE = "$CLUSTER_DIR_C/labeled_spans_c.txt";
     my $IN_FINE = "$CLUSTER_DIR_F/labeled_spans.txt";
-    my $OUT_FINE = "$CLUSTER_DIR_F/labeled_spans_f.txt";
     my $OUT_SPANS = "$CLUSTER_DIR_F/labeled_spans.hier$NUM_TOPICS_COARSE-$NUM_TOPICS_FINE.txt";
-    my $COARSE_EXPR = "\'s/\\(X[0-9][0-9]*\\)/\\1c/g\'";
-    my $FINE_EXPR = "\'s/\\(X[0-9][0-9]*\\)/\\1f/g\'";
+    my $FREQS = "$CLUSTER_DIR_F/label_freq.hier$NUM_TOPICS_COARSE-$NUM_TOPICS_FINE.txt";
+    my %finehier = ();  # fine category => coarse category under which it has the highest log frequency
     if (-e $OUT_SPANS) {
         print STDERR "$OUT_SPANS exists, reusing...\n";
     } else {
-        safesystem("$SED $COARSE_EXPR < $IN_COARSE > $OUT_COARSE") or die "Couldn't create coarse labels.";
-        safesystem("$SED $FINE_EXPR < $IN_FINE > $OUT_FINE") or die "Couldn't create fine labels.";
-        safesystem("sed -e 's/||| \\(.*\\)\$/\\1/' < $OUT_COARSE | paste -d ' ' $OUT_FINE - > $OUT_SPANS") or die "Couldn't paste coarse and fine labels.";
-        safesystem("paste -d ' ' $CORPUS_LEX $OUT_SPANS > $CLUSTER_DIR_F/corpus.src_trg_al_label.hier") or die "Couldn't paste corpus";
+        safesystem("paste -d ' ' $IN_COARSE $IN_FINE > $OUT_SPANS") or die "Couldn't paste coarse and fine labels.";
+    }
+    open SPANS, "<", $OUT_SPANS or die "Can't read $OUT_SPANS: $!";
+    while (<SPANS>) {
+        my ($tmp, $coarse, $fine) = split /\|\|\|/;
+        my @coarse_spans = $coarse =~ /\d+-\d+:X(\d+)/g;
+        my @fine_spans = $fine =~ /\d+-\d+:X(\d+)/g;
+        # Labels are paired by position; assumes both files list the same spans in the same order -- TODO confirm.
+        foreach my $i (0 .. $#coarse_spans) {
+            my $coarse_cat = $coarse_spans[$i];
+            my $fine_cat = $fine_spans[$i];
+            next unless defined $fine_cat;  # coarse span without a fine counterpart: skip rather than count an undef key
+            $FREQ_HIER{$coarse_cat}{$fine_cat}++;
+        }
+    }
+    close SPANS;
+    foreach my $coarse_freq (values %FREQ_HIER) {
+        # Replace each raw count with log(count / total count for this coarse category).
+        my $total = 0;
+        $total += $_ for values %{ $coarse_freq };
+        $coarse_freq->{$_} = log($coarse_freq->{$_} / $total) for keys %{ $coarse_freq };
     }
+    open FREQS, ">", $FREQS or die "Can't write $FREQS: $!";
+    foreach my $coarse_cat (sort { $a <=> $b } keys %FREQ_HIER) {  # numeric order keeps the output file deterministic
+        print FREQS "$coarse_cat |||";
+        foreach my $fine_cat (sort { $a <=> $b } keys %{ $FREQ_HIER{$coarse_cat} }) {
+            my $res = $FREQ_HIER{$coarse_cat}{$fine_cat};
+            print FREQS " $fine_cat:$res";
+            if (!exists $finehier{$fine_cat} || $FREQ_HIER{ $finehier{$fine_cat} }{$fine_cat} < $res) {
+                $finehier{$fine_cat} = $coarse_cat;  # compare scores (not category IDs) so the best-scoring parent wins
+            }
+        }
+        print FREQS "\n";
+    }
+    foreach my $fine_cat (sort { $a <=> $b } keys %finehier) {
+        print FREQS "$fine_cat -> $finehier{$fine_cat}\n";
+    }
+    close FREQS or die "Can't write $FREQS: $!";
     $CLUSTER_DIR = $CLUSTER_DIR_F;
 }
 
 sub grammar_extract {
-  my $LABELED = ($HIER_CAT ? "$CLUSTER_DIR_F/corpus.src_trg_al_label.hier" : "$LABELED_DIR/corpus.src_trg_al_label");
+  my $LABELED = "$LABELED_DIR/corpus.src_trg_al_label";
   print STDERR "\n!!!EXTRACTING GRAMMAR\n";
   my $OUTGRAMMAR = "$GRAMMAR_DIR/grammar.gz";
   if (-e $OUTGRAMMAR) {
-- 
cgit v1.2.3