From 40ee5446b84b5cdbc4e4a613e4c1aa19231c42d3 Mon Sep 17 00:00:00 2001 From: "trevor.cohn" Date: Thu, 15 Jul 2010 00:34:58 +0000 Subject: Massacred the pipeline to support source language phrases and contexts. git-svn-id: https://ws10smt.googlecode.com/svn/trunk@255 ec762483-ff6d-05da-a07a-a48fb63a330f --- extools/extract.cc | 26 ++++++++++---- extools/extract.h | 4 +-- extools/sentence_pair.cc | 87 +++++++++++++++++++++++++++++++++------------- extools/sentence_pair.h | 8 +++-- extools/striped_grammar.cc | 2 +- 5 files changed, 89 insertions(+), 38 deletions(-) (limited to 'extools') diff --git a/extools/extract.cc b/extools/extract.cc index 567348f4..44cd51af 100644 --- a/extools/extract.cc +++ b/extools/extract.cc @@ -5,6 +5,7 @@ #include #include #include +#include #include @@ -15,6 +16,7 @@ using namespace std; using namespace tr1; +using namespace boost; namespace { inline bool IsWhitespace(char c) { return c == ' ' || c == '\t'; } @@ -114,27 +116,37 @@ void Extract::LoosenPhraseBounds(const AnnotatedParallelSentence& sentence, } } +template +void +lookup_and_append(const map &dict, const K &key, V &output) +{ + typename map::const_iterator found = dict.find(key); + if (found != dict.end()) + copy(found->second.begin(), found->second.end(), back_inserter(output)); +} + // this uses the TARGET span (i,j) to annotate phrases, will copy // phrases if there is more than one annotation. // TODO: support source annotation void Extract::AnnotatePhrasesWithCategoryTypes(const WordID default_cat, - const Array2D >& types, + const map< tuple, vector > &types, vector* phrases) { const int num_unannotated_phrases = phrases->size(); // have to use num_unannotated_phrases since we may grow the vector for (int i = 0; i < num_unannotated_phrases; ++i) { ParallelSpan& phrase = (*phrases)[i]; - const vector* pcats = &types(phrase.j1, phrase.j2); - if (pcats->empty() && default_cat != 0) { - static vector s_default(1, default_cat); - pcats = &s_default; + vector cats; + lookup_and_append(types, make_tuple(phrase.i1, phrase.i2, phrase.j1, phrase.j2), cats); + lookup_and_append(types, make_tuple((short)-1, (short)-1, phrase.j1, phrase.j2), cats); + lookup_and_append(types, make_tuple(phrase.i1, phrase.i2, (short)-1, (short)-1), cats); + if (cats.empty() && default_cat != 0) { + cats = vector(1, default_cat); } - if (pcats->empty()) { + if (cats.empty()) { cerr << "ERROR span " << phrase.i1 << "," << phrase.i2 << "-" << phrase.j1 << "," << phrase.j2 << " has no type. " "Did you forget --default_category?\n"; } - const vector& cats = *pcats; phrase.cat = cats[0]; for (int ci = 1; ci < cats.size(); ++ci) { ParallelSpan new_phrase = phrase; diff --git a/extools/extract.h b/extools/extract.h index 76292bed..e9ea5e65 100644 --- a/extools/extract.h +++ b/extools/extract.h @@ -4,6 +4,7 @@ #include #include #include +#include #include "array2d.h" #include "wordid.h" #include "sparse_vector.h" @@ -74,9 +75,8 @@ struct Extract { // this uses the TARGET span (i,j) to annotate phrases, will copy // phrases if there is more than one annotation. - // TODO: support source annotation static void AnnotatePhrasesWithCategoryTypes(const WordID default_cat, - const Array2D >& types, + const std::map< boost::tuple, std::vector > &types, std::vector* phrases); // use the Chiang (2007) extraction logic to extract consistent subphrases diff --git a/extools/sentence_pair.cc b/extools/sentence_pair.cc index 02df3349..b2881737 100644 --- a/extools/sentence_pair.cc +++ b/extools/sentence_pair.cc @@ -6,12 +6,14 @@ #include #include #include +#include #include "tdict.h" #include "wordid.h" #include "array2d.h" using namespace std; +using namespace boost; namespace { inline bool IsWhitespace(char c) { return c == ' ' || c == '\t'; } @@ -38,7 +40,6 @@ void AnnotatedParallelSentence::AllocateForAlignment() { f_aligned.resize(f_len, 0); e_aligned.resize(e_len, 0); aligns_by_fword.resize(f_len); - span_types.resize(e_len, e_len+1); } // read an alignment point of the form X-Y where X and Y are strings @@ -48,44 +49,76 @@ int AnnotatedParallelSentence::ReadAlignmentPoint(const char* buf, const int start, const int end, const bool permit_col, - short* a, - short* b) { + short* a, short* b, short* c, short* d) { if (end - start < 3) { - cerr << "Alignment point badly formed: " << string(buf, start, end-start) << endl << buf << endl; + cerr << "Alignment point badly formed 1: " << string(buf, start, end-start) << endl << buf << endl; exit(1); } - int c = start; + int ch = start; *a = 0; - while(c < end && buf[c] != '-') { - if (buf[c] < '0' || buf[c] > '9') { - cerr << "Alignment point badly formed: " << string(buf, start, end-start) << endl << buf << endl; + while(ch < end && buf[ch] != '-') { + if (buf[ch] < '0' || buf[ch] > '9') { + cerr << "Alignment point badly formed 2: " << string(buf, start, end-start) << endl << buf << endl; exit(1); } (*a) *= 10; - (*a) += buf[c] - '0'; - ++c; + (*a) += buf[ch] - '0'; + ++ch; } - ++c; - if (c >= end) { - cerr << "Alignment point badly formed: " << string(buf, start, end-start) << endl << buf << endl; + ++ch; + if (ch >= end) { + cerr << "Alignment point badly formed 3: " << string(buf, start, end-start) << endl << buf << endl; exit(1); } (*b) = 0; - while(c < end && (!permit_col || (permit_col && buf[c] != ':'))) { - if (buf[c] < '0' || buf[c] > '9') { - cerr << "Alignment point badly formed: " << string(buf, start, end-start) << endl << buf << endl; + while(ch < end && (c == 0 && (!permit_col || (permit_col && buf[ch] != ':')) || c != 0 && buf[ch] != '-')) { + if (buf[ch] < '0' || buf[ch] > '9') { + cerr << "Alignment point badly formed 4: " << string(buf, start, end-start) << endl << buf << endl; exit(1); } (*b) *= 10; - (*b) += buf[c] - '0'; - ++c; + (*b) += buf[ch] - '0'; + ++ch; } - return c; + if (c != 0) + { + ++ch; + if (ch >= end) { + cerr << "Alignment point badly formed 5: " << string(buf, start, end-start) << endl << buf << endl; + exit(1); + } + (*c) = 0; + while(ch < end && buf[ch] != '-') { + if (buf[ch] < '0' || buf[ch] > '9') { + cerr << "Alignment point badly formed 6: " << string(buf, start, end-start) << endl << buf << endl; + exit(1); + } + (*c) *= 10; + (*c) += buf[ch] - '0'; + ++ch; + } + ++ch; + if (ch >= end) { + cerr << "Alignment point badly formed 7: " << string(buf, start, end-start) << endl << buf << endl; + exit(1); + } + (*d) = 0; + while(ch < end && (!permit_col || (permit_col && buf[ch] != ':'))) { + if (buf[ch] < '0' || buf[ch] > '9') { + cerr << "Alignment point badly formed 8: " << string(buf, start, end-start) << endl << buf << endl; + exit(1); + } + (*d) *= 10; + (*d) += buf[ch] - '0'; + ++ch; + } + } + return ch; } void AnnotatedParallelSentence::ParseAlignmentPoint(const char* buf, int start, int end) { short a, b; - ReadAlignmentPoint(buf, start, end, false, &a, &b); + ReadAlignmentPoint(buf, start, end, false, &a, &b, 0, 0); if (a >= f_len || b >= e_len) { cerr << "(" << a << ',' << b << ") is out of bounds. INPUT=\n" << buf << endl; exit(1); @@ -98,18 +131,22 @@ void AnnotatedParallelSentence::ParseAlignmentPoint(const char* buf, int start, } void AnnotatedParallelSentence::ParseSpanLabel(const char* buf, int start, int end) { - short a,b; - int c = ReadAlignmentPoint(buf, start, end, true, &a, &b) + 1; - if (buf[c-1] != ':' || c >= end) { + short a,b,c,d; + int ch = ReadAlignmentPoint(buf, start, end, true, &a, &b, &c, &d) + 1; + if (buf[ch-1] != ':' || ch >= end) { cerr << "Span badly formed: " << string(buf, start, end-start) << endl << buf << endl; exit(1); } - if (a >= e_len || b > e_len) { + if (a >= f_len || b > f_len) { cerr << "(" << a << ',' << b << ") is out of bounds in labeled span. INPUT=\n" << buf << endl; exit(1); } + if (c >= e_len || d > e_len) { + cerr << "(" << c << ',' << d << ") is out of bounds in labeled span. INPUT=\n" << buf << endl; + exit(1); + } // cerr << a << " " << b << " " << string(buf,c,end-c) << endl; - span_types(a,b).push_back(-TD::Convert(string(buf, c, end-c))); + span_types[make_tuple(a,b,c,d)].push_back(-TD::Convert(string(buf, ch, end-ch))); } // INPUT FORMAT diff --git a/extools/sentence_pair.h b/extools/sentence_pair.h index d78be359..b5a7ca93 100644 --- a/extools/sentence_pair.h +++ b/extools/sentence_pair.h @@ -1,8 +1,10 @@ #ifndef _SENTENCE_PAIR_H_ #define _SENTENCE_PAIR_H_ +#include #include #include +#include #include "wordid.h" #include "array2d.h" @@ -22,12 +24,12 @@ struct AnnotatedParallelSentence { std::vector > > aligns_by_fword; // span type information - Array2D > span_types; // span_types(i,j) is the list of category - // types for a span (i,j) in the TARGET language. + std::map< boost::tuple, std::vector > span_types; + // span_types(i,j,k,l) is the list of category span (i,j) in source and (k,l) in the target language. int f_len, e_len; - static int ReadAlignmentPoint(const char* buf, int start, int end, bool permit_col, short* a, short* b); + static int ReadAlignmentPoint(const char* buf, int start, int end, bool permit_col, short* a, short* b, short* c, short* d); private: void Reset(); diff --git a/extools/striped_grammar.cc b/extools/striped_grammar.cc index accf44eb..785f4bbe 100644 --- a/extools/striped_grammar.cc +++ b/extools/striped_grammar.cc @@ -33,7 +33,7 @@ void RuleStatistics::ParseRuleStatistics(const char* buf, int start, int end) { while(ptr < end && buf[ptr] != ',' && !IsWhitespace(buf[ptr])) { ++ptr; } if (ptr > vstart) { short a, b; - AnnotatedParallelSentence::ReadAlignmentPoint(buf, vstart, ptr, false, &a, &b); + AnnotatedParallelSentence::ReadAlignmentPoint(buf, vstart, ptr, false, &a, &b, 0, 0); aligns.push_back(make_pair(a,b)); } } -- cgit v1.2.3