diff options
Diffstat (limited to 'extools/sentence_pair.cc')
-rw-r--r-- | extools/sentence_pair.cc | 198 |
1 files changed, 0 insertions, 198 deletions
diff --git a/extools/sentence_pair.cc b/extools/sentence_pair.cc deleted file mode 100644 index 7d60715a..00000000 --- a/extools/sentence_pair.cc +++ /dev/null @@ -1,198 +0,0 @@ -#include "sentence_pair.h" - -#include <queue> -#include <iostream> -#include <string> -#include <vector> -#include <utility> -#include <set> -#include <boost/tuple/tuple_comparison.hpp> - -#include "tdict.h" -#include "wordid.h" -#include "array2d.h" - -using namespace std; -using namespace boost; - -namespace { - inline bool IsWhitespace(char c) { return c == ' ' || c == '\t'; } - - inline void SkipWhitespace(const char* buf, int* ptr) { - while (buf[*ptr] && IsWhitespace(buf[*ptr])) { ++(*ptr); } - } -} - -void AnnotatedParallelSentence::Reset() { - f.clear(); - e.clear(); - e_aligned.clear(); - f_aligned.clear(); - aligns_by_fword.clear(); - aligned.clear(); - span_types.clear(); -} - -void AnnotatedParallelSentence::AllocateForAlignment() { - f_len = f.size(); - e_len = e.size(); - aligned.resize(f_len, e_len, false); - f_aligned.resize(f_len, 0); - e_aligned.resize(e_len, 0); - aligns_by_fword.resize(f_len); -} - -// read an alignment point of the form X-Y where X and Y are strings -// of digits. if permit_col is true, the right edge will be determined -// by the presence of a colon -int AnnotatedParallelSentence::ReadAlignmentPoint(const char* buf, - const int start, - const int end, - const bool permit_col, - short* a, short* b, short* c, short* d) { - if (end - start < 3) { - cerr << "Alignment point badly formed 1: " << string(buf, start, end-start) << endl << buf << endl; - exit(1); - } - int ch = start; - *a = 0; - while(ch < end && buf[ch] != '-') { - if (buf[ch] < '0' || buf[ch] > '9') { - cerr << "Alignment point badly formed 2: " << string(buf, start, end-start) << endl << buf << endl; - exit(1); - } - (*a) *= 10; - (*a) += buf[ch] - '0'; - ++ch; - } - ++ch; - if (ch >= end) { - cerr << "Alignment point badly formed 3: " << string(buf, start, end-start) << endl << buf << endl; - exit(1); - } - (*b) = 0; - while((ch < end) && (c == 0 && (!permit_col || (permit_col && buf[ch] != ':')) || c != 0 && buf[ch] != '-')) { - if ((buf[ch] < '0') || (buf[ch] > '9')) { - cerr << "Alignment point badly formed 4: " << string(buf, start, end-start) << endl << buf << endl << buf[ch] << endl; - exit(1); - } - (*b) *= 10; - (*b) += buf[ch] - '0'; - ++ch; - } - if (c != 0) - { - ++ch; - if (ch >= end) { - cerr << "Alignment point badly formed 5: " << string(buf, start, end-start) << endl << buf << endl; - exit(1); - } - (*c) = 0; - while(ch < end && buf[ch] != '-') { - if (buf[ch] < '0' || buf[ch] > '9') { - cerr << "Alignment point badly formed 6: " << string(buf, start, end-start) << endl << buf << endl; - exit(1); - } - (*c) *= 10; - (*c) += buf[ch] - '0'; - ++ch; - } - ++ch; - if (ch >= end) { - cerr << "Alignment point badly formed 7: " << string(buf, start, end-start) << endl << buf << endl; - exit(1); - } - (*d) = 0; - while(ch < end && (!permit_col || (permit_col && buf[ch] != ':'))) { - if (buf[ch] < '0' || buf[ch] > '9') { - cerr << "Alignment point badly formed 8: " << string(buf, start, end-start) << endl << buf << endl; - exit(1); - } - (*d) *= 10; - (*d) += buf[ch] - '0'; - ++ch; - } - } - return ch; -} - -void AnnotatedParallelSentence::Align(const short a, const short b) { - aligned(a,b) = true; - ++f_aligned[a]; - ++e_aligned[b]; - aligns_by_fword[a].push_back(make_pair(a,b)); - // cerr << a << " " << b << endl; -} - -void AnnotatedParallelSentence::ParseAlignmentPoint(const char* buf, int start, int end) { - short a, b; - ReadAlignmentPoint(buf, start, end, false, &a, &b, 0, 0); - if (a >= f_len || b >= e_len) { - cerr << "(" << a << ',' << b << ") is out of bounds. INPUT=\n" << buf << endl; - exit(1); - } - Align(a,b); -} - -void AnnotatedParallelSentence::ParseSpanLabel(const char* buf, int start, int end) { - short a,b,c,d; - int ch = ReadAlignmentPoint(buf, start, end, true, &a, &b, &c, &d) + 1; - if (buf[ch-1] != ':' || ch >= end) { - cerr << "Span badly formed: " << string(buf, start, end-start) << endl << buf << endl; - exit(1); - } - if (a >= f_len || b > f_len) { - cerr << "(" << a << ',' << b << ") is out of bounds in labeled span. INPUT=\n" << buf << endl; - exit(1); - } - if (c >= e_len || d > e_len) { - cerr << "(" << c << ',' << d << ") is out of bounds in labeled span. INPUT=\n" << buf << endl; - exit(1); - } - // cerr << a << " " << b << " " << string(buf,c,end-c) << endl; - span_types[boost::make_tuple(a,b,c,d)].push_back(-TD::Convert(string(buf, ch, end-ch))); -} - -// INPUT FORMAT -// ein haus ||| a house ||| 0-0 1-1 ||| 0-0:DT 1-1:NN 0-1:NP -void AnnotatedParallelSentence::ParseInputLine(const char* buf) { - Reset(); - int ptr = 0; - SkipWhitespace(buf, &ptr); - int start = ptr; - int state = 0; // 0 = French, 1 = English, 2 = Alignment, 3 = Spans - while(char c = buf[ptr]) { - if (!IsWhitespace(c)) { ++ptr; continue; } else { - if (ptr - start == 3 && buf[start] == '|' && buf[start+1] == '|' && buf[start+2] == '|') { - ++state; - if (state == 4) { cerr << "Too many fields (ignoring):\n " << buf << endl; return; } - if (state == 2) { - // cerr << "FLEN=" << f->size() << " ELEN=" << e->size() << endl; - AllocateForAlignment(); - } - SkipWhitespace(buf, &ptr); - start = ptr; - continue; - } - switch (state) { - case 0: f.push_back(TD::Convert(string(buf, start, ptr-start))); break; - case 1: e.push_back(TD::Convert(string(buf, start, ptr-start))); break; - case 2: ParseAlignmentPoint(buf, start, ptr); break; - case 3: ParseSpanLabel(buf, start, ptr); break; - default: cerr << "Can't happen\n"; abort(); - } - SkipWhitespace(buf, &ptr); - start = ptr; - } - } - if (ptr > start) { - switch (state) { - case 0: f.push_back(TD::Convert(string(buf, start, ptr-start))); break; - case 1: e.push_back(TD::Convert(string(buf, start, ptr-start))); break; - case 2: ParseAlignmentPoint(buf, start, ptr); break; - case 3: ParseSpanLabel(buf, start, ptr); break; - default: cerr << "Can't happen\n"; abort(); - } - } -} - |