4 files changed, 37 insertions, 28 deletions
diff --git a/configure.ac b/configure.ac
index a7f4cfe7..e6119f29 100644
--- a/configure.ac
+++ b/configure.ac
@@ -8,9 +8,10 @@ AC_PROG_CXX
 AC_LANG_CPLUSPLUS
 BOOST_REQUIRE
 BOOST_PROGRAM_OPTIONS
+BOOST_REGEX
 CPPFLAGS="$CPPFLAGS $BOOST_CPPFLAGS"
-LDFLAGS="$LDFLAGS $BOOST_PROGRAM_OPTIONS_LDFLAGS"
-LIBS="$LIBS $BOOST_PROGRAM_OPTIONS_LIBS"
+LDFLAGS="$LDFLAGS $BOOST_PROGRAM_OPTIONS_LDFLAGS $BOOST_REGEX_LDFLAGS"
+LIBS="$LIBS $BOOST_PROGRAM_OPTIONS_LIBS $BOOST_REGEX_LIBS"
 
 AC_CHECK_HEADER(boost/math/special_functions/digamma.hpp,
                [AC_DEFINE([HAVE_BOOST_DIGAMMA], [], [flag for boost::math::digamma])])
diff --git a/decoder/dict.h b/decoder/dict.h
index bc3a904a..39baf6ed 100644
--- a/decoder/dict.h
+++ b/decoder/dict.h
@@ -8,6 +8,8 @@
 #include <vector>
 
 #include <boost/functional/hash.hpp>
+#include <boost/regex.hpp>
+#include <boost/algorithm/string/regex.hpp>
 
 #include "wordid.h"
 
@@ -36,7 +38,7 @@ class Dict {
     std::string word= "";
     for (std::vector<std::string>::const_iterator it=words.begin();
          it != words.end(); ++it) {
-      if (it != words.begin()) word += "__";
+      if (it != words.begin()) word += "|||";
       word += *it;
     }
 
@@ -49,6 +51,12 @@ class Dict {
     return words_[id-1];
   }
 
+  inline std::vector<std::string> AsVector(const WordID& id) const {  // split Convert(id) on the literal separator "|||"
+    std::vector<std::string> result;
+    boost::algorithm::split_regex(result, Convert(id), boost::regex("\\|\\|\\|"));  // pipes are escaped because '|' means alternation in a regex
+    return result;
+  }
+
   void clear() { words_.clear(); d_.clear(); }
 
  private:
diff --git a/gi/pipeline/local-gi-pipeline.pl b/gi/pipeline/local-gi-pipeline.pl
index af83beb8..be91f9ad 100755
--- a/gi/pipeline/local-gi-pipeline.pl
+++ b/gi/pipeline/local-gi-pipeline.pl
@@ -138,7 +138,8 @@ sub label_spans_with_topics {
     print STDERR "$OUT_SPANS exists, reusing...\n";
   } else {
     safesystem("$ZCAT $IN_CLUSTERS > $OUTPUT/clusters.txt") or die "Failed to unzip";
-    safesystem("$EXTRACTOR --base_phrase_spans -i $CORPUS -c $ITEMS_IN_MEMORY -L $BASE_PHRASE_MAX_SIZE -S $CONTEXT_SIZE | $S2L $OUTPUT/phrases.index $OUTPUT/contexts.index $OUTPUT/clusters.txt > $OUT_SPANS") or die "Failed to label spans";
+#   safesystem("$EXTRACTOR --base_phrase_spans -i $CORPUS -c $ITEMS_IN_MEMORY -L $BASE_PHRASE_MAX_SIZE -S $CONTEXT_SIZE | $S2L $OUTPUT/phrases.index $OUTPUT/contexts.index $OUTPUT/clusters.txt > $OUT_SPANS") or die "Failed to label spans";
+    safesystem("$EXTRACTOR --base_phrase_spans -i $CORPUS -c $ITEMS_IN_MEMORY -L $BASE_PHRASE_MAX_SIZE -S $CONTEXT_SIZE | $S2L $OUTPUT/clusters.txt $CONTEXT_SIZE > $OUT_SPANS") or die "Failed to label spans";
     unlink("$OUTPUT/clusters.txt") or warn "Failed to remove $OUTPUT/clusters.txt";
     safesystem("paste -d ' ' $CORPUS $OUT_SPANS > $OUTPUT/corpus.src_trg_al") or die "Couldn't paste";
   }
diff --git a/gi/pyp-topics/scripts/spans2labels.py b/gi/pyp-topics/scripts/spans2labels.py
index 409fda92..3221dbf0 100755
--- a/gi/pyp-topics/scripts/spans2labels.py
+++ b/gi/pyp-topics/scripts/spans2labels.py
@@ -3,44 +3,43 @@
 import sys
 from operator import itemgetter
 
-if len(sys.argv) != 4:
-  print "Usage: spans2labels.py phrase_index context_index phrase_context_index"
+if len(sys.argv) < 2:  # only phrase_context_index is mandatory; [order] may be omitted
+  print "Usage: spans2labels.py phrase_context_index [order]"
   exit(1)
 
-phrase_index = dict(map(lambda x: (x[1].strip(),x[0]), enumerate(file(sys.argv[1], 'r').readlines())))
-context_index = dict(map(lambda x: (x[1].strip(),x[0]), enumerate(file(sys.argv[2], 'r').readlines())))
+order=1  # number of context tokens taken on each side of a phrase; default when [order] is not given
+if len(sys.argv) > 2:
+  order = int(sys.argv[2])
 
 phrase_context_index = {}
-for i,line in enumerate(file(sys.argv[3], 'r').readlines()):
-  for c,l in map(lambda x: x.split(':'), line.split()[1:]):
-    phrase_context_index[(int(i),int(c))] = l
+for line in file(sys.argv[1], 'r'):  # each line is parsed as: phrase, one tab, then " ||| "-separated fields
+  phrase,tail= line.split('\t')
+  contexts = tail.split(" ||| ")
+  assert len(contexts) % 2 == 0  # fields must come in (context, label field) pairs
+  for i in range(0, len(contexts), 2):
+    category = contexts[i+1].split("=")[1].strip()  # segment following the first '=' of the paired field
+    phrase_context_index[(phrase,contexts[i])] = category  # keyed by the phrase and context strings themselves
+#   print (phrase,contexts[i]), category
 
 for line in sys.stdin:
   line_segments = line.split('|||')
-  source = ['<s>'] + line_segments[0].split() + ['</s>']
-  target = ['<s>'] + line_segments[1].split() + ['</s>']
+  source = ['<s>' for x in range(order)] + line_segments[0].split() + ['</s>' for x in range(order)]  # pad each side with `order` boundary markers
+  target = ['<s>' for x in range(order)] + line_segments[1].split() + ['</s>' for x in range(order)]  # so every phrase has `order` tokens of context
   phrases = [ [int(i) for i in x.split('-')] for x in line_segments[2].split()]
 
-# for x in source[1:-1]: 
-#   print x,
-# print "|||",
-# for x in target[1:-1]: 
-#   print x,
   print "|||",
 
   for s1,s2,t1,t2 in phrases:
-    s1 += 1
-    s2 += 1
-    t1 += 1
-    t2 += 1
+    s1 += order  # shift span indices past the leading <s> padding
+    s2 += order
+    t1 += order
+    t2 += order
 
     phrase = reduce(lambda x, y: x+y+" ", target[t1:t2], "").strip()
-    context = "%s <PHRASE> %s" % (target[t1-1], target[t2])
+    left_context = reduce(lambda x, y: x+y+" ", target[t1-order:t1], "")  # trailing space is kept: it separates the words from <PHRASE>
+    right_context = reduce(lambda x, y: x+y+" ", target[t2:t2+order], "").strip()
+    context = "%s<PHRASE> %s" % (left_context, right_context)
 
-    pi = phrase_index[phrase]
-    ci = context_index[context]
-    label = phrase_context_index[(pi,ci)]
+    label = phrase_context_index[(phrase,context)]
-    print "%s-%s:X%s" % (t1-1,t2-1,label),
+    print "%s-%s:X%s" % (t1-order,t2-order,label),  # undo the whole padding shift so spans use the input's own indices
-#   print phrase, pi, context, ci
-#   print phrase_context_index[(pi,ci)]
   print