summary | refs | log | tree | commit | diff
path: root/extools/sentence_pair.cc
diff options
context:
space:
mode:
author: trevor.cohn <trevor.cohn@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-07-15 00:34:58 +0000
committer: trevor.cohn <trevor.cohn@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-07-15 00:34:58 +0000
commit: 3ca6d4b4bbbd0401ab25e7731ce84afb118635d8 (patch)
tree: 6ce51c1d8fef8ebbd0649b946dd983950295cc25 /extools/sentence_pair.cc
parent: 1350b8e8e465acc9d4d8d43d807cc6093e8f37b9 (diff)
Massacred the pipeline to support source language phrases and contexts.
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@255 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'extools/sentence_pair.cc')
-rw-r--r-- extools/sentence_pair.cc 87
1 files changed, 62 insertions, 25 deletions
diff --git a/extools/sentence_pair.cc b/extools/sentence_pair.cc
index 02df3349..b2881737 100644
--- a/extools/sentence_pair.cc
+++ b/extools/sentence_pair.cc
@@ -6,12 +6,14 @@
#include <vector>
#include <utility>
#include <set>
+#include <boost/tuple/tuple_comparison.hpp>
#include "tdict.h"
#include "wordid.h"
#include "array2d.h"
using namespace std;
+using namespace boost;
namespace {
inline bool IsWhitespace(char c) { return c == ' ' || c == '\t'; }
@@ -38,7 +40,6 @@ void AnnotatedParallelSentence::AllocateForAlignment() {
f_aligned.resize(f_len, 0);
e_aligned.resize(e_len, 0);
aligns_by_fword.resize(f_len);
- span_types.resize(e_len, e_len+1);
}
// read an alignment point of the form X-Y where X and Y are strings
@@ -48,44 +49,76 @@ int AnnotatedParallelSentence::ReadAlignmentPoint(const char* buf,
const int start,
const int end,
const bool permit_col,
- short* a,
- short* b) {
+ short* a, short* b, short* c, short* d) {
if (end - start < 3) {
- cerr << "Alignment point badly formed: " << string(buf, start, end-start) << endl << buf << endl;
+ cerr << "Alignment point badly formed 1: " << string(buf, start, end-start) << endl << buf << endl;
exit(1);
}
- int c = start;
+ int ch = start;
*a = 0;
- while(c < end && buf[c] != '-') {
- if (buf[c] < '0' || buf[c] > '9') {
- cerr << "Alignment point badly formed: " << string(buf, start, end-start) << endl << buf << endl;
+ while(ch < end && buf[ch] != '-') {
+ if (buf[ch] < '0' || buf[ch] > '9') {
+ cerr << "Alignment point badly formed 2: " << string(buf, start, end-start) << endl << buf << endl;
exit(1);
}
(*a) *= 10;
- (*a) += buf[c] - '0';
- ++c;
+ (*a) += buf[ch] - '0';
+ ++ch;
}
- ++c;
- if (c >= end) {
- cerr << "Alignment point badly formed: " << string(buf, start, end-start) << endl << buf << endl;
+ ++ch;
+ if (ch >= end) {
+ cerr << "Alignment point badly formed 3: " << string(buf, start, end-start) << endl << buf << endl;
exit(1);
}
(*b) = 0;
- while(c < end && (!permit_col || (permit_col && buf[c] != ':'))) {
- if (buf[c] < '0' || buf[c] > '9') {
- cerr << "Alignment point badly formed: " << string(buf, start, end-start) << endl << buf << endl;
+ while(ch < end && (c == 0 && (!permit_col || (permit_col && buf[ch] != ':')) || c != 0 && buf[ch] != '-')) {
+ if (buf[ch] < '0' || buf[ch] > '9') {
+ cerr << "Alignment point badly formed 4: " << string(buf, start, end-start) << endl << buf << endl;
exit(1);
}
(*b) *= 10;
- (*b) += buf[c] - '0';
- ++c;
+ (*b) += buf[ch] - '0';
+ ++ch;
}
- return c;
+ if (c != 0)
+ {
+ ++ch;
+ if (ch >= end) {
+ cerr << "Alignment point badly formed 5: " << string(buf, start, end-start) << endl << buf << endl;
+ exit(1);
+ }
+ (*c) = 0;
+ while(ch < end && buf[ch] != '-') {
+ if (buf[ch] < '0' || buf[ch] > '9') {
+ cerr << "Alignment point badly formed 6: " << string(buf, start, end-start) << endl << buf << endl;
+ exit(1);
+ }
+ (*c) *= 10;
+ (*c) += buf[ch] - '0';
+ ++ch;
+ }
+ ++ch;
+ if (ch >= end) {
+ cerr << "Alignment point badly formed 7: " << string(buf, start, end-start) << endl << buf << endl;
+ exit(1);
+ }
+ (*d) = 0;
+ while(ch < end && (!permit_col || (permit_col && buf[ch] != ':'))) {
+ if (buf[ch] < '0' || buf[ch] > '9') {
+ cerr << "Alignment point badly formed 8: " << string(buf, start, end-start) << endl << buf << endl;
+ exit(1);
+ }
+ (*d) *= 10;
+ (*d) += buf[ch] - '0';
+ ++ch;
+ }
+ }
+ return ch;
}
void AnnotatedParallelSentence::ParseAlignmentPoint(const char* buf, int start, int end) {
short a, b;
- ReadAlignmentPoint(buf, start, end, false, &a, &b);
+ ReadAlignmentPoint(buf, start, end, false, &a, &b, 0, 0);
if (a >= f_len || b >= e_len) {
cerr << "(" << a << ',' << b << ") is out of bounds. INPUT=\n" << buf << endl;
exit(1);
@@ -98,18 +131,22 @@ void AnnotatedParallelSentence::ParseAlignmentPoint(const char* buf, int start,
}
void AnnotatedParallelSentence::ParseSpanLabel(const char* buf, int start, int end) {
- short a,b;
- int c = ReadAlignmentPoint(buf, start, end, true, &a, &b) + 1;
- if (buf[c-1] != ':' || c >= end) {
+ short a,b,c,d;
+ int ch = ReadAlignmentPoint(buf, start, end, true, &a, &b, &c, &d) + 1;
+ if (buf[ch-1] != ':' || ch >= end) {
cerr << "Span badly formed: " << string(buf, start, end-start) << endl << buf << endl;
exit(1);
}
- if (a >= e_len || b > e_len) {
+ if (a >= f_len || b > f_len) {
cerr << "(" << a << ',' << b << ") is out of bounds in labeled span. INPUT=\n" << buf << endl;
exit(1);
}
+ if (c >= e_len || d > e_len) {
+ cerr << "(" << c << ',' << d << ") is out of bounds in labeled span. INPUT=\n" << buf << endl;
+ exit(1);
+ }
// cerr << a << " " << b << " " << string(buf,c,end-c) << endl;
- span_types(a,b).push_back(-TD::Convert(string(buf, c, end-c)));
+ span_types[make_tuple(a,b,c,d)].push_back(-TD::Convert(string(buf, ch, end-ch)));
}
// INPUT FORMAT