From 40ee5446b84b5cdbc4e4a613e4c1aa19231c42d3 Mon Sep 17 00:00:00 2001
From: "trevor.cohn" <trevor.cohn@ec762483-ff6d-05da-a07a-a48fb63a330f>
Date: Thu, 15 Jul 2010 00:34:58 +0000
Subject: Massacred the pipeline to support source language phrases and
 contexts.

git-svn-id: https://ws10smt.googlecode.com/svn/trunk@255 ec762483-ff6d-05da-a07a-a48fb63a330f
---
 extools/extract.cc                                 | 26 +++++--
 extools/extract.h                                  |  4 +-
 extools/sentence_pair.cc                           | 87 +++++++++++++++-------
 extools/sentence_pair.h                            |  8 +-
 extools/striped_grammar.cc                         |  2 +-
 gi/pipeline/local-gi-pipeline.pl                   | 18 +++--
 .../prjava/src/phrase/Corpus.java                  |  4 +-
 gi/pyp-topics/scripts/spans2labels.py              | 41 ++++++++--
 8 files changed, 139 insertions(+), 51 deletions(-)
diff --git a/extools/extract.cc b/extools/extract.cc
index 567348f4..44cd51af 100644
--- a/extools/extract.cc
+++ b/extools/extract.cc
@@ -5,6 +5,7 @@
 #include <utility>
 #include <tr1/unordered_map>
 #include <set>
+#include <boost/tuple/tuple_comparison.hpp>
 
 #include <boost/functional/hash.hpp>
 
@@ -15,6 +16,7 @@
 
 using namespace std;
 using namespace tr1;
+using namespace boost;
 
 namespace {
   inline bool IsWhitespace(char c) { return c == ' ' || c == '\t'; }
@@ -114,27 +116,37 @@ void Extract::LoosenPhraseBounds(const AnnotatedParallelSentence& sentence,
   }
 }
 
+template <typename K, typename V>
+void
+lookup_and_append(const map<K, V> &dict, const K &key, V &output)  // appends the elements stored under key in dict to the end of output
+{
+    typename map<K, V>::const_iterator found = dict.find(key);
+    if (found != dict.end())  // a missing key leaves output unchanged
+        copy(found->second.begin(), found->second.end(), back_inserter(output));
+}
+
 // this uses the TARGET span (i,j) to annotate phrases, will copy
 // phrases if there is more than one annotation.
 // TODO: support source annotation
 void Extract::AnnotatePhrasesWithCategoryTypes(const WordID default_cat,
-                                      const Array2D<vector<WordID> >& types,
+                                      const map< tuple<short,short,short,short>, vector<WordID> > &types,
                                       vector<ParallelSpan>* phrases) {
   const int num_unannotated_phrases = phrases->size();
   // have to use num_unannotated_phrases since we may grow the vector
   for (int i = 0; i < num_unannotated_phrases; ++i) {
     ParallelSpan& phrase = (*phrases)[i];
-    const vector<WordID>* pcats = &types(phrase.j1, phrase.j2);
-    if (pcats->empty() && default_cat != 0) {
-      static vector<WordID> s_default(1, default_cat);
-      pcats = &s_default;
+    vector<WordID> cats;
+    lookup_and_append(types, make_tuple(phrase.i1, phrase.i2, phrase.j1, phrase.j2), cats);
+    lookup_and_append(types, make_tuple((short)-1, (short)-1, phrase.j1, phrase.j2), cats);
+    lookup_and_append(types, make_tuple(phrase.i1, phrase.i2, (short)-1, (short)-1), cats);
+    if (cats.empty() && default_cat != 0) {
+      cats = vector<WordID>(1, default_cat);
     }
-    if (pcats->empty()) {
+    if (cats.empty()) {
       cerr << "ERROR span " << phrase.i1 << "," << phrase.i2 << "-"
            << phrase.j1 << "," << phrase.j2 << " has no type. "
               "Did you forget --default_category?\n";
     }
-    const vector<WordID>& cats = *pcats;
     phrase.cat = cats[0];
     for (int ci = 1; ci < cats.size(); ++ci) {
       ParallelSpan new_phrase = phrase;
diff --git a/extools/extract.h b/extools/extract.h
index 76292bed..e9ea5e65 100644
--- a/extools/extract.h
+++ b/extools/extract.h
@@ -4,6 +4,7 @@
 #include <iostream>
 #include <utility>
 #include <vector>
+#include <boost/tuple/tuple.hpp>
 #include "array2d.h"
 #include "wordid.h"
 #include "sparse_vector.h"
@@ -74,9 +75,8 @@ struct Extract {
 
   // this uses the TARGET span (i,j) to annotate phrases, will copy
   // phrases if there is more than one annotation.
-  // TODO: support source annotation
   static void AnnotatePhrasesWithCategoryTypes(const WordID default_cat,
-                                      const Array2D<std::vector<WordID> >& types,
+                                      const std::map< boost::tuple<short,short,short,short>, std::vector<WordID> > &types,
                                       std::vector<ParallelSpan>* phrases);
 
   // use the Chiang (2007) extraction logic to extract consistent subphrases
diff --git a/extools/sentence_pair.cc b/extools/sentence_pair.cc
index 02df3349..b2881737 100644
--- a/extools/sentence_pair.cc
+++ b/extools/sentence_pair.cc
@@ -6,12 +6,14 @@
 #include <vector>
 #include <utility>
 #include <set>
+#include <boost/tuple/tuple_comparison.hpp>
 
 #include "tdict.h"
 #include "wordid.h"
 #include "array2d.h"
 
 using namespace std;
+using namespace boost;
 
 namespace {
   inline bool IsWhitespace(char c) { return c == ' ' || c == '\t'; }
@@ -38,7 +40,6 @@ void AnnotatedParallelSentence::AllocateForAlignment() {
   f_aligned.resize(f_len, 0);
   e_aligned.resize(e_len, 0);
   aligns_by_fword.resize(f_len);
-  span_types.resize(e_len, e_len+1);
 }
 
 // read an alignment point of the form X-Y where X and Y are strings
@@ -48,44 +49,76 @@ int AnnotatedParallelSentence::ReadAlignmentPoint(const char* buf,
                                                   const int start,
                                                   const int end,
                                                   const bool permit_col,
-                                                  short* a,
-                                                  short* b) {
+                                                  short* a, short* b, short* c, short* d) {
   if (end - start < 3) {
-    cerr << "Alignment point badly formed: " << string(buf, start, end-start) << endl << buf << endl;
+    cerr << "Alignment point badly formed 1: " << string(buf, start, end-start) << endl << buf << endl;
     exit(1);
   }
-  int c = start;
+  int ch = start;
   *a = 0;
-  while(c < end && buf[c] != '-') {
-    if (buf[c] < '0' || buf[c] > '9') {
-      cerr << "Alignment point badly formed: " << string(buf, start, end-start) << endl << buf << endl;
+  while(ch < end && buf[ch] != '-') {
+    if (buf[ch] < '0' || buf[ch] > '9') {
+      cerr << "Alignment point badly formed 2: " << string(buf, start, end-start) << endl << buf << endl;
       exit(1);
     }
     (*a) *= 10;
-    (*a) += buf[c] - '0';
-    ++c;
+    (*a) += buf[ch] - '0';
+    ++ch;
   }
-  ++c;
-  if (c >= end) {
-    cerr << "Alignment point badly formed: " << string(buf, start, end-start) << endl << buf << endl;
+  ++ch;
+  if (ch >= end) {
+    cerr << "Alignment point badly formed 3: " << string(buf, start, end-start) << endl << buf << endl;
     exit(1);
   }
   (*b) = 0;
-  while(c < end && (!permit_col || (permit_col && buf[c] != ':'))) {
-    if (buf[c] < '0' || buf[c] > '9') {
-      cerr << "Alignment point badly formed: " << string(buf, start, end-start) << endl << buf << endl;
+  while(ch < end && (c == 0 && (!permit_col || (permit_col && buf[ch] != ':')) || c != 0 && buf[ch] != '-')) {
+    if (buf[ch] < '0' || buf[ch] > '9') {
+      cerr << "Alignment point badly formed 4: " << string(buf, start, end-start) << endl << buf << endl;
       exit(1);
     }
     (*b) *= 10;
-    (*b) += buf[c] - '0';
-    ++c;
+    (*b) += buf[ch] - '0';
+    ++ch;
   }
-  return c;
+  if (c != 0)
+  {
+      ++ch;
+      if (ch >= end) {
+        cerr << "Alignment point badly formed 5: " << string(buf, start, end-start) << endl << buf << endl;
+        exit(1);
+      }
+      (*c) = 0;
+      while(ch < end && buf[ch] != '-') {
+        if (buf[ch] < '0' || buf[ch] > '9') {
+          cerr << "Alignment point badly formed 6: " << string(buf, start, end-start) << endl << buf << endl;
+          exit(1);
+        }
+        (*c) *= 10;
+        (*c) += buf[ch] - '0';
+        ++ch;
+      }
+      ++ch;
+      if (ch >= end) {
+        cerr << "Alignment point badly formed 7: " << string(buf, start, end-start) << endl << buf << endl;
+        exit(1);
+      }
+      (*d) = 0;
+      while(ch < end && (!permit_col || (permit_col && buf[ch] != ':'))) {
+        if (buf[ch] < '0' || buf[ch] > '9') {
+          cerr << "Alignment point badly formed 8: " << string(buf, start, end-start) << endl << buf << endl;
+          exit(1);
+        }
+        (*d) *= 10;
+        (*d) += buf[ch] - '0';
+        ++ch;
+      }
+  }
+  return ch;
 }
 
 void AnnotatedParallelSentence::ParseAlignmentPoint(const char* buf, int start, int end) {
   short a, b;
-  ReadAlignmentPoint(buf, start, end, false, &a, &b);
+  ReadAlignmentPoint(buf, start, end, false, &a, &b, 0, 0);
   if (a >= f_len || b >= e_len) {
     cerr << "(" << a << ',' << b << ") is out of bounds. INPUT=\n" << buf << endl;
     exit(1);
@@ -98,18 +131,22 @@ void AnnotatedParallelSentence::ParseAlignmentPoint(const char* buf, int start,
 }
 
 void AnnotatedParallelSentence::ParseSpanLabel(const char* buf, int start, int end) {
-  short a,b;
-  int c = ReadAlignmentPoint(buf, start, end, true, &a, &b) + 1;
-  if (buf[c-1] != ':' || c >= end) {
+  short a,b,c,d;  // the four numbers of an "a-b-c-d:LABEL" token
+  int ch = ReadAlignmentPoint(buf, start, end, true, &a, &b, &c, &d) + 1;  // ch = one past the position where number parsing stopped
+  if (buf[ch-1] != ':' || ch >= end) {  // the numbers must be followed by ':' and a non-empty label
     cerr << "Span badly formed: " << string(buf, start, end-start) << endl << buf << endl;
     exit(1);
   }
-  if (a >= e_len || b > e_len) {
+  if (a >= f_len || b > f_len) {  // bounds-check (a,b) against f_len; b == f_len is accepted
     cerr << "(" << a << ',' << b << ") is out of bounds in labeled span. INPUT=\n" << buf << endl;
     exit(1);
   }
+  if (c >= e_len || d > e_len) {  // bounds-check (c,d) against e_len; d == e_len is accepted
+    cerr << "(" << c << ',' << d << ") is out of bounds in labeled span. INPUT=\n" << buf << endl;
+    exit(1);
+  }
   // cerr << a << " " << b << " " << string(buf,c,end-c) << endl;
-  span_types(a,b).push_back(-TD::Convert(string(buf, c, end-c)));
+  span_types[make_tuple(a,b,c,d)].push_back(-TD::Convert(string(buf, ch, end-ch)));  // keyed by the full (a,b,c,d) tuple; label text after ':' is stored as a negated TD::Convert id
 }
 
 // INPUT FORMAT
diff --git a/extools/sentence_pair.h b/extools/sentence_pair.h
index d78be359..b5a7ca93 100644
--- a/extools/sentence_pair.h
+++ b/extools/sentence_pair.h
@@ -1,8 +1,10 @@
 #ifndef _SENTENCE_PAIR_H_
 #define _SENTENCE_PAIR_H_
 
+#include <map>
 #include <utility>
 #include <vector>
+#include <boost/tuple/tuple.hpp>
 #include "wordid.h"
 #include "array2d.h"
 
@@ -22,12 +24,12 @@ struct AnnotatedParallelSentence {
   std::vector<std::vector<std::pair<short, short> > > aligns_by_fword;
 
   // span type information
-  Array2D<std::vector<WordID> > span_types;  // span_types(i,j) is the list of category
-                               // types for a span (i,j) in the TARGET language.
+  std::map< boost::tuple<short,short,short,short>, std::vector<WordID> > span_types;
+  // span_types[make_tuple(i,j,k,l)] is the list of category types for the span (i,j) in the source and (k,l) in the target language.
 
   int f_len, e_len;
 
-  static int ReadAlignmentPoint(const char* buf, int start, int end, bool permit_col, short* a, short* b);
+  static int ReadAlignmentPoint(const char* buf, int start, int end, bool permit_col, short* a, short* b, short* c, short* d);
 
  private:
   void Reset();
diff --git a/extools/striped_grammar.cc b/extools/striped_grammar.cc
index accf44eb..785f4bbe 100644
--- a/extools/striped_grammar.cc
+++ b/extools/striped_grammar.cc
@@ -33,7 +33,7 @@ void RuleStatistics::ParseRuleStatistics(const char* buf, int start, int end) {
         while(ptr < end && buf[ptr] != ',' && !IsWhitespace(buf[ptr])) { ++ptr; }
         if (ptr > vstart) {
           short a, b;
-          AnnotatedParallelSentence::ReadAlignmentPoint(buf, vstart, ptr, false, &a, &b);
+          AnnotatedParallelSentence::ReadAlignmentPoint(buf, vstart, ptr, false, &a, &b, 0, 0);
           aligns.push_back(make_pair(a,b));
         }
       }
diff --git a/gi/pipeline/local-gi-pipeline.pl b/gi/pipeline/local-gi-pipeline.pl
index 0d6c553c..cb411c1b 100755
--- a/gi/pipeline/local-gi-pipeline.pl
+++ b/gi/pipeline/local-gi-pipeline.pl
@@ -19,7 +19,8 @@ my $NUM_SAMPLES = 1000;
 my $CONTEXT_SIZE = 1;
 my $BIDIR = 0;
 my $TOPICS_CONFIG = "pyp-topics.conf";
-my $LABEL_THRESHOLD = 0;
+my $LANGUAGE = "target";  # which side supplies phrases and contexts: "target", "source" or "both"
+my $LABEL_THRESHOLD = 0;  # still interpolated into the spans2labels command line, so it must remain declared
 my $MODEL = "pyp";
 my $NUM_EM_ITERS = 100;
 my $NUM_PR_ITERS = 0;
@@ -71,6 +72,7 @@ usage() unless &GetOptions('base_phrase_max_size=i' => \$BASE_PHRASE_MAX_SIZE,
                            'pr-scale-context=f' => \$PR_SCALE_C,
                            'pr-threads=i' => \$PR_THREADS,
                            'tagged_corpus=s' => \$TAGGED_CORPUS,
+                           'language=s' => \$LANGUAGE,
                           );
 
 usage() unless scalar @ARGV == 1;
@@ -166,7 +168,7 @@ sub setup_data {
 }
 
 sub context_dir {
-  return "ct${CONTEXT_SIZE}s0.L$BASE_PHRASE_MAX_SIZE";
+  return "ct${CONTEXT_SIZE}s0.L$BASE_PHRASE_MAX_SIZE.l$LANGUAGE";  # name encodes context size, max base phrase size and the language setting
 }
 
 sub cluster_dir {
@@ -231,10 +233,10 @@ sub extract_context {
  if (-e $OUT_CONTEXTS) {
    print STDERR "$OUT_CONTEXTS exists, reusing...\n";
  } else {
-   my $cmd = "$EXTRACTOR -i $CORPUS_CLUSTER -c $ITEMS_IN_MEMORY -L $BASE_PHRASE_MAX_SIZE -C -S $CONTEXT_SIZE | $SORT_KEYS | $REDUCER | $GZIP > $OUT_CONTEXTS";
+   my $cmd = "$EXTRACTOR -i $CORPUS_CLUSTER -c $ITEMS_IN_MEMORY -L $BASE_PHRASE_MAX_SIZE -C -S $CONTEXT_SIZE --phrase_language $LANGUAGE --context_language $LANGUAGE | $SORT_KEYS | $REDUCER | $GZIP > $OUT_CONTEXTS";
    if ($COMPLETE_CACHE) {
      print STDERR "COMPLETE_CACHE is set: removing memory limits on cache.\n";
-     $cmd = "$EXTRACTOR -i $CORPUS_CLUSTER -c 0 -L $BASE_PHRASE_MAX_SIZE -C -S $CONTEXT_SIZE | $SORT_KEYS | $GZIP > $OUT_CONTEXTS";
+     $cmd = "$EXTRACTOR -i $CORPUS_CLUSTER -c 0 -L $BASE_PHRASE_MAX_SIZE -C -S $CONTEXT_SIZE  --phrase_language $LANGUAGE --context_language $LANGUAGE  | $SORT_KEYS | $GZIP > $OUT_CONTEXTS";
    }
    safesystem($cmd) or die "Failed to extract contexts.";
   }
@@ -270,8 +272,14 @@ sub label_spans_with_topics {
   if (-e $OUT_SPANS) {
     print STDERR "$OUT_SPANS exists, reusing...\n";
   } else {
+    my $l = "tt";
+    if ($LANGUAGE eq "source") {
+        $l = "ss";
+    } elsif ($LANGUAGE eq "both") {
+        $l = "bb";
+    } else { die "Invalid language specifier $LANGUAGE\n" unless $LANGUAGE eq "target" };
     safesystem("$ZCAT $IN_CLUSTERS > $CLUSTER_DIR/clusters.txt") or die "Failed to unzip";
-    safesystem("$EXTRACTOR --base_phrase_spans -i $CORPUS_CLUSTER -c $ITEMS_IN_MEMORY -L $BASE_PHRASE_MAX_SIZE -S $CONTEXT_SIZE | $S2L $CLUSTER_DIR/clusters.txt $CONTEXT_SIZE $LABEL_THRESHOLD > $OUT_SPANS") or die "Failed to label spans";
+    safesystem("$EXTRACTOR --base_phrase_spans -i $CORPUS_CLUSTER -c $ITEMS_IN_MEMORY -L $BASE_PHRASE_MAX_SIZE -S $CONTEXT_SIZE | $S2L $CLUSTER_DIR/clusters.txt $CONTEXT_SIZE $LABEL_THRESHOLD $l > $OUT_SPANS") or die "Failed to label spans";
     unlink("$CLUSTER_DIR/clusters.txt") or warn "Failed to remove $CLUSTER_DIR/clusters.txt";
     safesystem("paste -d ' ' $CORPUS_LEX $OUT_SPANS > $LABELED_DIR/corpus.src_trg_al_label") or die "Couldn't paste";
   }
diff --git a/gi/posterior-regularisation/prjava/src/phrase/Corpus.java b/gi/posterior-regularisation/prjava/src/phrase/Corpus.java
index 81264ab9..d57f3c04 100644
--- a/gi/posterior-regularisation/prjava/src/phrase/Corpus.java
+++ b/gi/posterior-regularisation/prjava/src/phrase/Corpus.java
@@ -151,7 +151,7 @@ public class Corpus
 		for (int i = 0; i < c.size(); ++i)
 		{
 			if (i > 0) b.append(" ");
-			if (i == c.size() / 2) b.append("<PHRASE> ");
+			//if (i == c.size() / 2) b.append("<PHRASE> ");
 			b.append(wordLexicon.lookup(c.get(i)));
 		}
 		return b.toString();
@@ -198,7 +198,7 @@ public class Corpus
 				while (ctxStrtok.hasMoreTokens())
 				{
 					String token = ctxStrtok.nextToken();
-					if (!token.equals("<PHRASE>"))
+					//if (!token.equals("<PHRASE>"))
 						ctx.add(c.wordLexicon.insert(token));
 				}
 				int contextId = c.contextLexicon.insert(ctx);
diff --git a/gi/pyp-topics/scripts/spans2labels.py b/gi/pyp-topics/scripts/spans2labels.py
index f990582e..3dc60835 100755
--- a/gi/pyp-topics/scripts/spans2labels.py
+++ b/gi/pyp-topics/scripts/spans2labels.py
@@ -4,7 +4,7 @@ import sys
 from operator import itemgetter
 
 if len(sys.argv) <= 2:
-  print "Usage: spans2labels.py phrase_context_index [order] [threshold]"
+  print "Usage: spans2labels.py phrase_context_index [order] [threshold] [languages={s,t,b}{s,t,b}]"
   exit(1)
 
 order=1
@@ -14,6 +14,11 @@ if len(sys.argv) > 2:
   order = int(sys.argv[2])
 if len(sys.argv) > 3:
   threshold = float(sys.argv[3])
+phr=ctx='t'
+if len(sys.argv) > 4:
+  phr, ctx = sys.argv[4]
+  assert phr in 'stb'
+  assert ctx in 'stb'
 
 phrase_context_index = {}
 for line in file(sys.argv[1], 'r'):
@@ -52,11 +57,35 @@ for line in sys.stdin:
     t1 += order
     t2 += order
 
-    phrase = reduce(lambda x, y: x+y+" ", target[t1:t2], "").strip()
-    left_context = reduce(lambda x, y: x+y+" ", target[t1-order:t1], "")
-    right_context = reduce(lambda x, y: x+y+" ", target[t2:t2+order], "").strip()
-    context = "%s<PHRASE> %s" % (left_context, right_context)
+    phraset = phrases = contextt = contexts = ''
+    if phr in 'tb':
+        phraset = reduce(lambda x, y: x+y+" ", target[t1:t2], "").strip()
+    if phr in 'sb':
+        phrases = reduce(lambda x, y: x+y+" ", source[s1:s2], "").strip()
+
+    if ctx in 'tb':
+        left_context = reduce(lambda x, y: x+y+" ", target[t1-order:t1], "")
+        right_context = reduce(lambda x, y: x+y+" ", target[t2:t2+order], "").strip()
+        contextt = "%s<PHRASE> %s" % (left_context, right_context)
+    if ctx in 'sb':
+        left_context = reduce(lambda x, y: x+y+" ", source[s1-order:s1], "")
+        right_context = reduce(lambda x, y: x+y+" ", source[s2:s2+order], "").strip()
+        contexts = "%s<PHRASE> %s" % (left_context, right_context)
+
+    if phr == 'b':
+        phrase = phraset + ' <SPLIT> ' + phrases
+    elif phr == 's':
+        phrase = phrases
+    else:
+        phrase = phraset
+
+    if ctx == 'b':
+        context = contextt + ' <SPLIT> ' + contexts
+    elif ctx == 's':
+        context = contexts
+    else:
+        context = contextt
 
     label = phrase_context_index.get((phrase,context), "<UNK>")
-    print "%s-%s:X%s" % (t1-order,t2-order,label),
+    print "%d-%d-%d-%d:X%s" % (s1-order,s2-order,t1-order,t2-order,label),
   print
-- 
cgit v1.2.3