diff options
| -rw-r--r-- | configure.ac | 5 | ||||
| -rw-r--r-- | decoder/dict.h | 10 | ||||
| -rwxr-xr-x | gi/pipeline/local-gi-pipeline.pl | 3 | ||||
| -rwxr-xr-x | gi/pyp-topics/scripts/spans2labels.py | 47 | 
4 files changed, 37 insertions, 28 deletions
diff --git a/configure.ac b/configure.ac index a7f4cfe7..e6119f29 100644 --- a/configure.ac +++ b/configure.ac @@ -8,9 +8,10 @@ AC_PROG_CXX  AC_LANG_CPLUSPLUS  BOOST_REQUIRE  BOOST_PROGRAM_OPTIONS +BOOST_REGEX  CPPFLAGS="$CPPFLAGS $BOOST_CPPFLAGS" -LDFLAGS="$LDFLAGS $BOOST_PROGRAM_OPTIONS_LDFLAGS" -LIBS="$LIBS $BOOST_PROGRAM_OPTIONS_LIBS" +LDFLAGS="$LDFLAGS $BOOST_PROGRAM_OPTIONS_LDFLAGS $BOOST_REGEX_LDFLAGS" +LIBS="$LIBS $BOOST_PROGRAM_OPTIONS_LIBS $BOOST_REGEX_LIBS"  AC_CHECK_HEADER(boost/math/special_functions/digamma.hpp,                 [AC_DEFINE([HAVE_BOOST_DIGAMMA], [], [flag for boost::math::digamma])]) diff --git a/decoder/dict.h b/decoder/dict.h index bc3a904a..39baf6ed 100644 --- a/decoder/dict.h +++ b/decoder/dict.h @@ -8,6 +8,8 @@  #include <vector>  #include <boost/functional/hash.hpp> +#include <boost/regex.hpp> +#include <boost/algorithm/string/regex.hpp>  #include "wordid.h" @@ -36,7 +38,7 @@ class Dict {      std::string word= "";      for (std::vector<std::string>::const_iterator it=words.begin();           it != words.end(); ++it) { -      if (it != words.begin()) word += "__"; +      if (it != words.begin()) word += "|||";        word += *it;      } @@ -49,6 +51,12 @@ class Dict {      return words_[id-1];    } +  inline std::vector<std::string> AsVector(const WordID& id) const { +    std::vector<std::string> result; +    boost::algorithm::split_regex(result, Convert(id), boost::regex("\\|\\|\\|")); +    return result; +  } +    void clear() { words_.clear(); d_.clear(); }   private: diff --git a/gi/pipeline/local-gi-pipeline.pl b/gi/pipeline/local-gi-pipeline.pl index af83beb8..be91f9ad 100755 --- a/gi/pipeline/local-gi-pipeline.pl +++ b/gi/pipeline/local-gi-pipeline.pl @@ -138,7 +138,8 @@ sub label_spans_with_topics {      print STDERR "$OUT_SPANS exists, reusing...\n";    } else {      safesystem("$ZCAT $IN_CLUSTERS > $OUTPUT/clusters.txt") or die "Failed to unzip"; -    safesystem("$EXTRACTOR --base_phrase_spans -i $CORPUS -c $ITEMS_IN_MEMORY -L $BASE_PHRASE_MAX_SIZE -S $CONTEXT_SIZE | $S2L $OUTPUT/phrases.index $OUTPUT/contexts.index $OUTPUT/clusters.txt > $OUT_SPANS") or die "Failed to label spans"; +#   safesystem("$EXTRACTOR --base_phrase_spans -i $CORPUS -c $ITEMS_IN_MEMORY -L $BASE_PHRASE_MAX_SIZE -S $CONTEXT_SIZE | $S2L $OUTPUT/phrases.index $OUTPUT/contexts.index $OUTPUT/clusters.txt > $OUT_SPANS") or die "Failed to label spans"; +    safesystem("$EXTRACTOR --base_phrase_spans -i $CORPUS -c $ITEMS_IN_MEMORY -L $BASE_PHRASE_MAX_SIZE -S $CONTEXT_SIZE | $S2L $OUTPUT/clusters.txt $CONTEXT_SIZE > $OUT_SPANS") or die "Failed to label spans";      unlink("$OUTPUT/clusters.txt") or warn "Failed to remove $OUTPUT/clusters.txt";      safesystem("paste -d ' ' $CORPUS $OUT_SPANS > $OUTPUT/corpus.src_trg_al") or die "Couldn't paste";    } diff --git a/gi/pyp-topics/scripts/spans2labels.py b/gi/pyp-topics/scripts/spans2labels.py index 409fda92..3221dbf0 100755 --- a/gi/pyp-topics/scripts/spans2labels.py +++ b/gi/pyp-topics/scripts/spans2labels.py @@ -3,44 +3,43 @@  import sys  from operator import itemgetter -if len(sys.argv) != 4: -  print "Usage: spans2labels.py phrase_index context_index phrase_context_index" +if len(sys.argv) <= 2: +  print "Usage: spans2labels.py phrase_context_index [order]"    exit(1) -phrase_index = dict(map(lambda x: (x[1].strip(),x[0]), enumerate(file(sys.argv[1], 'r').readlines()))) -context_index = dict(map(lambda x: (x[1].strip(),x[0]), enumerate(file(sys.argv[2], 'r').readlines()))) +order=1 +if len(sys.argv) > 2: +  order = int(sys.argv[2])  phrase_context_index = {} -for i,line in enumerate(file(sys.argv[3], 'r').readlines()): -  for c,l in map(lambda x: x.split(':'), line.split()[1:]): -    phrase_context_index[(int(i),int(c))] = l +for line in file(sys.argv[1], 'r'): +  phrase,tail= line.split('\t') +  contexts = tail.split(" ||| ") +  assert len(contexts) % 2 == 0 +  for i in range(0, len(contexts), 2): +    category = contexts[i+1].split("=")[1].strip() +    phrase_context_index[(phrase,contexts[i])] = category +#   print (phrase,contexts[i]), category  for line in sys.stdin:    line_segments = line.split('|||') -  source = ['<s>'] + line_segments[0].split() + ['</s>'] -  target = ['<s>'] + line_segments[1].split() + ['</s>'] +  source = ['<s>' for x in range(order)] + line_segments[0].split() + ['</s>' for x in range(order)] +  target = ['<s>' for x in range(order)] + line_segments[1].split() + ['</s>' for x in range(order)]    phrases = [ [int(i) for i in x.split('-')] for x in line_segments[2].split()] -# for x in source[1:-1]:  -#   print x, -# print "|||", -# for x in target[1:-1]:  -#   print x,    print "|||",    for s1,s2,t1,t2 in phrases: -    s1 += 1 -    s2 += 1 -    t1 += 1 -    t2 += 1 +    s1 += order +    s2 += order +    t1 += order +    t2 += order      phrase = reduce(lambda x, y: x+y+" ", target[t1:t2], "").strip() -    context = "%s <PHRASE> %s" % (target[t1-1], target[t2]) +    left_context = reduce(lambda x, y: x+y+" ", target[t1-order:t1], "") +    right_context = reduce(lambda x, y: x+y+" ", target[t2:t2+order], "").strip() +    context = "%s<PHRASE> %s" % (left_context, right_context) -    pi = phrase_index[phrase] -    ci = context_index[context] -    label = phrase_context_index[(pi,ci)] +    label = phrase_context_index[(phrase,context)]      print "%s-%s:X%s" % (t1-1,t2-1,label), -#   print phrase, pi, context, ci -#   print phrase_context_index[(pi,ci)]    print  | 
