diff options
| -rw-r--r-- | extools/extract.cc | 26 | ||||
| -rw-r--r-- | extools/extract.h | 4 | ||||
| -rw-r--r-- | extools/sentence_pair.cc | 87 | ||||
| -rw-r--r-- | extools/sentence_pair.h | 8 | ||||
| -rw-r--r-- | extools/striped_grammar.cc | 2 | ||||
| -rwxr-xr-x | gi/pipeline/local-gi-pipeline.pl | 18 | ||||
| -rw-r--r-- | gi/posterior-regularisation/prjava/src/phrase/Corpus.java | 4 | ||||
| -rwxr-xr-x | gi/pyp-topics/scripts/spans2labels.py | 41 | 
8 files changed, 139 insertions, 51 deletions
| diff --git a/extools/extract.cc b/extools/extract.cc index 567348f4..44cd51af 100644 --- a/extools/extract.cc +++ b/extools/extract.cc @@ -5,6 +5,7 @@  #include <utility>  #include <tr1/unordered_map>  #include <set> +#include <boost/tuple/tuple_comparison.hpp>  #include <boost/functional/hash.hpp> @@ -15,6 +16,7 @@  using namespace std;  using namespace tr1; +using namespace boost;  namespace {    inline bool IsWhitespace(char c) { return c == ' ' || c == '\t'; } @@ -114,27 +116,37 @@ void Extract::LoosenPhraseBounds(const AnnotatedParallelSentence& sentence,    }  } +template <typename K, typename V> +void +lookup_and_append(const map<K, V> &dict, const K &key, V &output) +{ +    typename map<K, V>::const_iterator found = dict.find(key); +    if (found != dict.end()) +        copy(found->second.begin(), found->second.end(), back_inserter(output)); +} +  // this uses the TARGET span (i,j) to annotate phrases, will copy  // phrases if there is more than one annotation.  // TODO: support source annotation  void Extract::AnnotatePhrasesWithCategoryTypes(const WordID default_cat, -                                      const Array2D<vector<WordID> >& types, +                                      const map< tuple<short,short,short,short>, vector<WordID> > &types,                                        vector<ParallelSpan>* phrases) {    const int num_unannotated_phrases = phrases->size();    // have to use num_unannotated_phrases since we may grow the vector    for (int i = 0; i < num_unannotated_phrases; ++i) {      ParallelSpan& phrase = (*phrases)[i]; -    const vector<WordID>* pcats = &types(phrase.j1, phrase.j2); -    if (pcats->empty() && default_cat != 0) { -      static vector<WordID> s_default(1, default_cat); -      pcats = &s_default; +    vector<WordID> cats; +    lookup_and_append(types, make_tuple(phrase.i1, phrase.i2, phrase.j1, phrase.j2), cats); +    lookup_and_append(types, make_tuple((short)-1, (short)-1, phrase.j1, phrase.j2), cats); +    
lookup_and_append(types, make_tuple(phrase.i1, phrase.i2, (short)-1, (short)-1), cats); +    if (cats.empty() && default_cat != 0) { +      cats = vector<WordID>(1, default_cat);      } -    if (pcats->empty()) { +    if (cats.empty()) {        cerr << "ERROR span " << phrase.i1 << "," << phrase.i2 << "-"             << phrase.j1 << "," << phrase.j2 << " has no type. "                "Did you forget --default_category?\n";      } -    const vector<WordID>& cats = *pcats;      phrase.cat = cats[0];      for (int ci = 1; ci < cats.size(); ++ci) {        ParallelSpan new_phrase = phrase; diff --git a/extools/extract.h b/extools/extract.h index 76292bed..e9ea5e65 100644 --- a/extools/extract.h +++ b/extools/extract.h @@ -4,6 +4,7 @@  #include <iostream>  #include <utility>  #include <vector> +#include <boost/tuple/tuple.hpp>  #include "array2d.h"  #include "wordid.h"  #include "sparse_vector.h" @@ -74,9 +75,8 @@ struct Extract {    // this uses the TARGET span (i,j) to annotate phrases, will copy    // phrases if there is more than one annotation. 
-  // TODO: support source annotation    static void AnnotatePhrasesWithCategoryTypes(const WordID default_cat, -                                      const Array2D<std::vector<WordID> >& types, +                                      const std::map< boost::tuple<short,short,short,short>, std::vector<WordID> > &types,                                        std::vector<ParallelSpan>* phrases);    // use the Chiang (2007) extraction logic to extract consistent subphrases diff --git a/extools/sentence_pair.cc b/extools/sentence_pair.cc index 02df3349..b2881737 100644 --- a/extools/sentence_pair.cc +++ b/extools/sentence_pair.cc @@ -6,12 +6,14 @@  #include <vector>  #include <utility>  #include <set> +#include <boost/tuple/tuple_comparison.hpp>  #include "tdict.h"  #include "wordid.h"  #include "array2d.h"  using namespace std; +using namespace boost;  namespace {    inline bool IsWhitespace(char c) { return c == ' ' || c == '\t'; } @@ -38,7 +40,6 @@ void AnnotatedParallelSentence::AllocateForAlignment() {    f_aligned.resize(f_len, 0);    e_aligned.resize(e_len, 0);    aligns_by_fword.resize(f_len); -  span_types.resize(e_len, e_len+1);  }  // read an alignment point of the form X-Y where X and Y are strings @@ -48,44 +49,76 @@ int AnnotatedParallelSentence::ReadAlignmentPoint(const char* buf,                                                    const int start,                                                    const int end,                                                    const bool permit_col, -                                                  short* a, -                                                  short* b) { +                                                  short* a, short* b, short* c, short* d) {    if (end - start < 3) { -    cerr << "Alignment point badly formed: " << string(buf, start, end-start) << endl << buf << endl; +    cerr << "Alignment point badly formed 1: " << string(buf, start, end-start) << endl << buf << endl;      exit(1);    } -  int c = 
start; +  int ch = start;    *a = 0; -  while(c < end && buf[c] != '-') { -    if (buf[c] < '0' || buf[c] > '9') { -      cerr << "Alignment point badly formed: " << string(buf, start, end-start) << endl << buf << endl; +  while(ch < end && buf[ch] != '-') { +    if (buf[ch] < '0' || buf[ch] > '9') { +      cerr << "Alignment point badly formed 2: " << string(buf, start, end-start) << endl << buf << endl;        exit(1);      }      (*a) *= 10; -    (*a) += buf[c] - '0'; -    ++c; +    (*a) += buf[ch] - '0'; +    ++ch;    } -  ++c; -  if (c >= end) { -    cerr << "Alignment point badly formed: " << string(buf, start, end-start) << endl << buf << endl; +  ++ch; +  if (ch >= end) { +    cerr << "Alignment point badly formed 3: " << string(buf, start, end-start) << endl << buf << endl;      exit(1);    }    (*b) = 0; -  while(c < end && (!permit_col || (permit_col && buf[c] != ':'))) { -    if (buf[c] < '0' || buf[c] > '9') { -      cerr << "Alignment point badly formed: " << string(buf, start, end-start) << endl << buf << endl; +  while(ch < end && (c == 0 && (!permit_col || (permit_col && buf[ch] != ':')) || c != 0 && buf[ch] != '-')) { +    if (buf[ch] < '0' || buf[ch] > '9') { +      cerr << "Alignment point badly formed 4: " << string(buf, start, end-start) << endl << buf << endl;        exit(1);      }      (*b) *= 10; -    (*b) += buf[c] - '0'; -    ++c; +    (*b) += buf[ch] - '0'; +    ++ch;    } -  return c; +  if (c != 0) +  { +      ++ch; +      if (ch >= end) { +        cerr << "Alignment point badly formed 5: " << string(buf, start, end-start) << endl << buf << endl; +        exit(1); +      } +      (*c) = 0; +      while(ch < end && buf[ch] != '-') { +        if (buf[ch] < '0' || buf[ch] > '9') { +          cerr << "Alignment point badly formed 6: " << string(buf, start, end-start) << endl << buf << endl; +          exit(1); +        } +        (*c) *= 10; +        (*c) += buf[ch] - '0'; +        ++ch; +      } +      ++ch; +      if (ch >= end) { +     
   cerr << "Alignment point badly formed 7: " << string(buf, start, end-start) << endl << buf << endl; +        exit(1); +      } +      (*d) = 0; +      while(ch < end && (!permit_col || (permit_col && buf[ch] != ':'))) { +        if (buf[ch] < '0' || buf[ch] > '9') { +          cerr << "Alignment point badly formed 8: " << string(buf, start, end-start) << endl << buf << endl; +          exit(1); +        } +        (*d) *= 10; +        (*d) += buf[ch] - '0'; +        ++ch; +      } +  } +  return ch;  }  void AnnotatedParallelSentence::ParseAlignmentPoint(const char* buf, int start, int end) {    short a, b; -  ReadAlignmentPoint(buf, start, end, false, &a, &b); +  ReadAlignmentPoint(buf, start, end, false, &a, &b, 0, 0);    if (a >= f_len || b >= e_len) {      cerr << "(" << a << ',' << b << ") is out of bounds. INPUT=\n" << buf << endl;      exit(1); @@ -98,18 +131,22 @@ void AnnotatedParallelSentence::ParseAlignmentPoint(const char* buf, int start,  }  void AnnotatedParallelSentence::ParseSpanLabel(const char* buf, int start, int end) { -  short a,b; -  int c = ReadAlignmentPoint(buf, start, end, true, &a, &b) + 1; -  if (buf[c-1] != ':' || c >= end) { +  short a,b,c,d; +  int ch = ReadAlignmentPoint(buf, start, end, true, &a, &b, &c, &d) + 1; +  if (buf[ch-1] != ':' || ch >= end) {      cerr << "Span badly formed: " << string(buf, start, end-start) << endl << buf << endl;      exit(1);    } -  if (a >= e_len || b > e_len) { +  if (a >= f_len || b > f_len) {      cerr << "(" << a << ',' << b << ") is out of bounds in labeled span. INPUT=\n" << buf << endl;      exit(1);    } +  if (c >= e_len || d > e_len) { +    cerr << "(" << c << ',' << d << ") is out of bounds in labeled span. 
INPUT=\n" << buf << endl; +    exit(1); +  }    // cerr << a << " " << b << " " << string(buf,c,end-c) << endl; -  span_types(a,b).push_back(-TD::Convert(string(buf, c, end-c))); +  span_types[make_tuple(a,b,c,d)].push_back(-TD::Convert(string(buf, ch, end-ch)));  }  // INPUT FORMAT diff --git a/extools/sentence_pair.h b/extools/sentence_pair.h index d78be359..b5a7ca93 100644 --- a/extools/sentence_pair.h +++ b/extools/sentence_pair.h @@ -1,8 +1,10 @@  #ifndef _SENTENCE_PAIR_H_  #define _SENTENCE_PAIR_H_ +#include <map>  #include <utility>  #include <vector> +#include <boost/tuple/tuple.hpp>  #include "wordid.h"  #include "array2d.h" @@ -22,12 +24,12 @@ struct AnnotatedParallelSentence {    std::vector<std::vector<std::pair<short, short> > > aligns_by_fword;    // span type information -  Array2D<std::vector<WordID> > span_types;  // span_types(i,j) is the list of category -                               // types for a span (i,j) in the TARGET language. +  std::map< boost::tuple<short,short,short,short>, std::vector<WordID> > span_types; +  // span_types[(i,j,k,l)] is the list of category types for the span (i,j) in the source and (k,l) in the target language.    
int f_len, e_len; -  static int ReadAlignmentPoint(const char* buf, int start, int end, bool permit_col, short* a, short* b); +  static int ReadAlignmentPoint(const char* buf, int start, int end, bool permit_col, short* a, short* b, short* c, short* d);   private:    void Reset(); diff --git a/extools/striped_grammar.cc b/extools/striped_grammar.cc index accf44eb..785f4bbe 100644 --- a/extools/striped_grammar.cc +++ b/extools/striped_grammar.cc @@ -33,7 +33,7 @@ void RuleStatistics::ParseRuleStatistics(const char* buf, int start, int end) {          while(ptr < end && buf[ptr] != ',' && !IsWhitespace(buf[ptr])) { ++ptr; }          if (ptr > vstart) {            short a, b; -          AnnotatedParallelSentence::ReadAlignmentPoint(buf, vstart, ptr, false, &a, &b); +          AnnotatedParallelSentence::ReadAlignmentPoint(buf, vstart, ptr, false, &a, &b, 0, 0);            aligns.push_back(make_pair(a,b));          }        } diff --git a/gi/pipeline/local-gi-pipeline.pl b/gi/pipeline/local-gi-pipeline.pl index 0d6c553c..cb411c1b 100755 --- a/gi/pipeline/local-gi-pipeline.pl +++ b/gi/pipeline/local-gi-pipeline.pl @@ -19,7 +19,8 @@ my $NUM_SAMPLES = 1000;  my $CONTEXT_SIZE = 1;  my $BIDIR = 0;  my $TOPICS_CONFIG = "pyp-topics.conf"; -my $LABEL_THRESHOLD = 0; +my $LANGUAGE = "target"; +  my $MODEL = "pyp";  my $NUM_EM_ITERS = 100;  my $NUM_PR_ITERS = 0; @@ -71,6 +72,7 @@ usage() unless &GetOptions('base_phrase_max_size=i' => \$BASE_PHRASE_MAX_SIZE,                             'pr-scale-context=f' => \$PR_SCALE_C,                             'pr-threads=i' => \$PR_THREADS,                             'tagged_corpus=s' => \$TAGGED_CORPUS, +                           'language=s' => \$LANGUAGE,                            );  usage() unless scalar @ARGV == 1; @@ -166,7 +168,7 @@ sub setup_data {  }  sub context_dir { -  return "ct${CONTEXT_SIZE}s0.L$BASE_PHRASE_MAX_SIZE"; +  return "ct${CONTEXT_SIZE}s0.L$BASE_PHRASE_MAX_SIZE.l$LANGUAGE";  }  sub cluster_dir { @@ -231,10 
+233,10 @@ sub extract_context {   if (-e $OUT_CONTEXTS) {     print STDERR "$OUT_CONTEXTS exists, reusing...\n";   } else { -   my $cmd = "$EXTRACTOR -i $CORPUS_CLUSTER -c $ITEMS_IN_MEMORY -L $BASE_PHRASE_MAX_SIZE -C -S $CONTEXT_SIZE | $SORT_KEYS | $REDUCER | $GZIP > $OUT_CONTEXTS"; +   my $cmd = "$EXTRACTOR -i $CORPUS_CLUSTER -c $ITEMS_IN_MEMORY -L $BASE_PHRASE_MAX_SIZE -C -S $CONTEXT_SIZE --phrase_language $LANGUAGE --context_language $LANGUAGE | $SORT_KEYS | $REDUCER | $GZIP > $OUT_CONTEXTS";     if ($COMPLETE_CACHE) {       print STDERR "COMPLETE_CACHE is set: removing memory limits on cache.\n"; -     $cmd = "$EXTRACTOR -i $CORPUS_CLUSTER -c 0 -L $BASE_PHRASE_MAX_SIZE -C -S $CONTEXT_SIZE | $SORT_KEYS | $GZIP > $OUT_CONTEXTS"; +     $cmd = "$EXTRACTOR -i $CORPUS_CLUSTER -c 0 -L $BASE_PHRASE_MAX_SIZE -C -S $CONTEXT_SIZE  --phrase_language $LANGUAGE --context_language $LANGUAGE  | $SORT_KEYS | $GZIP > $OUT_CONTEXTS";     }     safesystem($cmd) or die "Failed to extract contexts.";    } @@ -270,8 +272,14 @@ sub label_spans_with_topics {    if (-e $OUT_SPANS) {      print STDERR "$OUT_SPANS exists, reusing...\n";    } else { +    my $l = "tt"; +    if ($LANGUAGE eq "source") { +        $l = "ss"; +    } elsif ($LANGUAGE eq "both") { +        $l = "bb"; +    } else { die "Invalid language specifier $LANGUAGE\n" unless $LANGUAGE eq "target" };      safesystem("$ZCAT $IN_CLUSTERS > $CLUSTER_DIR/clusters.txt") or die "Failed to unzip"; -    safesystem("$EXTRACTOR --base_phrase_spans -i $CORPUS_CLUSTER -c $ITEMS_IN_MEMORY -L $BASE_PHRASE_MAX_SIZE -S $CONTEXT_SIZE | $S2L $CLUSTER_DIR/clusters.txt $CONTEXT_SIZE $LABEL_THRESHOLD > $OUT_SPANS") or die "Failed to label spans"; +    safesystem("$EXTRACTOR --base_phrase_spans -i $CORPUS_CLUSTER -c $ITEMS_IN_MEMORY -L $BASE_PHRASE_MAX_SIZE -S $CONTEXT_SIZE | $S2L $CLUSTER_DIR/clusters.txt $CONTEXT_SIZE $LABEL_THRESHOLD $l > $OUT_SPANS") or die "Failed to label spans";      unlink("$CLUSTER_DIR/clusters.txt") or warn "Failed to 
remove $CLUSTER_DIR/clusters.txt";      safesystem("paste -d ' ' $CORPUS_LEX $OUT_SPANS > $LABELED_DIR/corpus.src_trg_al_label") or die "Couldn't paste";    } diff --git a/gi/posterior-regularisation/prjava/src/phrase/Corpus.java b/gi/posterior-regularisation/prjava/src/phrase/Corpus.java index 81264ab9..d57f3c04 100644 --- a/gi/posterior-regularisation/prjava/src/phrase/Corpus.java +++ b/gi/posterior-regularisation/prjava/src/phrase/Corpus.java @@ -151,7 +151,7 @@ public class Corpus  		for (int i = 0; i < c.size(); ++i)  		{  			if (i > 0) b.append(" "); -			if (i == c.size() / 2) b.append("<PHRASE> "); +			//if (i == c.size() / 2) b.append("<PHRASE> ");  			b.append(wordLexicon.lookup(c.get(i)));  		}  		return b.toString(); @@ -198,7 +198,7 @@ public class Corpus  				while (ctxStrtok.hasMoreTokens())  				{  					String token = ctxStrtok.nextToken(); -					if (!token.equals("<PHRASE>")) +					//if (!token.equals("<PHRASE>"))  						ctx.add(c.wordLexicon.insert(token));  				}  				int contextId = c.contextLexicon.insert(ctx); diff --git a/gi/pyp-topics/scripts/spans2labels.py b/gi/pyp-topics/scripts/spans2labels.py index f990582e..3dc60835 100755 --- a/gi/pyp-topics/scripts/spans2labels.py +++ b/gi/pyp-topics/scripts/spans2labels.py @@ -4,7 +4,7 @@ import sys  from operator import itemgetter  if len(sys.argv) <= 2: -  print "Usage: spans2labels.py phrase_context_index [order] [threshold]" +  print "Usage: spans2labels.py phrase_context_index [order] [threshold] [languages={s,t,b}{s,t,b}]"    exit(1)  order=1 @@ -14,6 +14,11 @@ if len(sys.argv) > 2:    order = int(sys.argv[2])  if len(sys.argv) > 3:    threshold = float(sys.argv[3]) +phr=ctx='t' +if len(sys.argv) > 4: +  phr, ctx = sys.argv[4] +  assert phr in 'stb' +  assert ctx in 'stb'  phrase_context_index = {}  for line in file(sys.argv[1], 'r'): @@ -52,11 +57,35 @@ for line in sys.stdin:      t1 += order      t2 += order -    phrase = reduce(lambda x, y: x+y+" ", target[t1:t2], "").strip() -    
left_context = reduce(lambda x, y: x+y+" ", target[t1-order:t1], "") -    right_context = reduce(lambda x, y: x+y+" ", target[t2:t2+order], "").strip() -    context = "%s<PHRASE> %s" % (left_context, right_context) +    phraset = phrases = contextt = contexts = '' +    if phr in 'tb': +        phraset = reduce(lambda x, y: x+y+" ", target[t1:t2], "").strip() +    if phr in 'sb': +        phrases = reduce(lambda x, y: x+y+" ", source[s1:s2], "").strip() + +    if ctx in 'tb': +        left_context = reduce(lambda x, y: x+y+" ", target[t1-order:t1], "") +        right_context = reduce(lambda x, y: x+y+" ", target[t2:t2+order], "").strip() +        contextt = "%s<PHRASE> %s" % (left_context, right_context) +    if ctx in 'sb': +        left_context = reduce(lambda x, y: x+y+" ", source[s1-order:s1], "") +        right_context = reduce(lambda x, y: x+y+" ", source[s2:s2+order], "").strip() +        contexts = "%s<PHRASE> %s" % (left_context, right_context) + +    if phr == 'b': +        phrase = phraset + ' <SPLIT> ' + phrases +    elif phr == 's': +        phrase = phrases +    else: +        phrase = phraset + +    if ctx == 'b': +        context = contextt + ' <SPLIT> ' + contexts +    elif ctx == 's': +        context = contexts +    else: +        context = contextt      label = phrase_context_index.get((phrase,context), "<UNK>") -    print "%s-%s:X%s" % (t1-order,t2-order,label), +    print "%d-%d-%d-%d:X%s" % (s1-order,s2-order,t1-order,t2-order,label),    print | 
