summaryrefslogtreecommitdiff
path: root/extools/sentence_pair.cc
diff options
context:
space:
mode:
author redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-06-22 05:12:27 +0000
committer redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-06-22 05:12:27 +0000
commit 0172721855098ca02b207231a654dffa5e4eb1c9 (patch)
tree 8069c3a62e2d72bd64a2cdeee9724b2679c8a56b /extools/sentence_pair.cc
parent 37728b8be4d0b3df9da81fdda2198ff55b4b2d91 (diff)
initial checkin
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@2 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'extools/sentence_pair.cc')
-rw-r--r-- extools/sentence_pair.cc 155
1 files changed, 155 insertions, 0 deletions
diff --git a/extools/sentence_pair.cc b/extools/sentence_pair.cc
new file mode 100644
index 00000000..91286059
--- /dev/null
+++ b/extools/sentence_pair.cc
@@ -0,0 +1,155 @@
+#include "sentence_pair.h"
+
+#include <cstdlib>
+#include <iostream>
+#include <limits>
+#include <queue>
+#include <set>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "tdict.h"
+#include "wordid.h"
+#include "array2d.h"
+
+using namespace std;
+
+namespace {
+  // Token separators on an input line are exactly the space and the
+  // horizontal tab; every other character (including NUL) is not whitespace.
+  inline bool IsWhitespace(char c) {
+    switch (c) {
+      case ' ':
+      case '\t':
+        return true;
+      default:
+        return false;
+    }
+  }
+
+  // Advances *ptr over a run of spaces/tabs in buf. It stops on the first
+  // other character; since NUL is not whitespace, it never walks past the
+  // end of a NUL-terminated buffer.
+  inline void SkipWhitespace(const char* buf, int* ptr) {
+    for (int& pos = *ptr; IsWhitespace(buf[pos]); ++pos) {}
+  }
+}
+
+// Empties every per-sentence container so the object can be reused for the
+// next input line. Only the containers are cleared here; f_len and e_len are
+// not touched.
+void AnnotatedParallelSentence::Reset() {
+  // the two token sequences
+  e.clear();
+  f.clear();
+
+  // alignment bookkeeping
+  aligned.clear();
+  aligns_by_fword.clear();
+  f_aligned.clear();
+  e_aligned.clear();
+
+  // span labels
+  span_types.clear();
+}
+
+// Sizes the alignment and span structures from the current lengths of f and
+// e. Must run after both sentences have been read and before any alignment
+// point or span label is stored.
+void AnnotatedParallelSentence::AllocateForAlignment() {
+  e_len = e.size();
+  f_len = f.size();
+
+  // per-word alignment counters, initialised to zero
+  e_aligned.resize(e_len, 0);
+  f_aligned.resize(f_len, 0);
+
+  // one (initially empty) list of alignment points per f word
+  aligns_by_fword.resize(f_len);
+
+  // f_len x e_len alignment matrix, nothing aligned yet
+  aligned.resize(f_len, e_len, false);
+
+  // span labels are indexed (a, b) with a < e_len and b < e_len + 1
+  span_types.resize(e_len, e_len + 1);
+}
+
+namespace {
+  // Prints the offending token buf[start, end) to stderr and terminates the
+  // process. The token is built from (pointer, length) so that only the
+  // token itself is copied, not the whole line.
+  void AbortBadAlignmentPoint(const char* buf, int start, int end) {
+    const int len = end > start ? end - start : 0;
+    cerr << "Alignment point badly formed: " << string(buf + start, len) << endl;
+    abort();
+  }
+
+  // Reads one index: a non-empty run of decimal digits beginning at
+  // buf[*pos]. Reading stops at `end`, or earlier at the first `delim` when
+  // stop_at_delim is true. *pos is left on the character that stopped the
+  // scan. Aborts (reporting the whole token buf[start, end)) if a non-digit
+  // is met, if no digit was read, or if the value does not fit in a short.
+  short ReadAlignmentIndex(const char* buf, int start, int end,
+                           bool stop_at_delim, char delim, int* pos) {
+    const int kMaxIndex = numeric_limits<short>::max();
+    const int first = *pos;
+    int value = 0;  // accumulate in int so the range check itself cannot overflow
+    while (*pos < end && !(stop_at_delim && buf[*pos] == delim)) {
+      const char ch = buf[*pos];
+      if (ch < '0' || ch > '9') AbortBadAlignmentPoint(buf, start, end);
+      value = value * 10 + (ch - '0');
+      if (value > kMaxIndex) AbortBadAlignmentPoint(buf, start, end);
+      ++(*pos);
+    }
+    if (*pos == first) AbortBadAlignmentPoint(buf, start, end);  // empty index
+    return static_cast<short>(value);
+  }
+}
+
+// Reads an alignment point of the form X-Y from buf[start, end), where X and
+// Y are non-empty strings of decimal digits.
+//
+//   buf, start, end  token to parse; `end` is one past its last character
+//   permit_col       if true, Y ends at the first ':' (or at `end`);
+//                    if false, Y must run all the way to `end`
+//   a, b             receive X and Y
+//
+// Returns the index at which scanning of Y stopped (the ':' when one was
+// found, otherwise `end`).
+//
+// Any malformed token -- shorter than 3 characters, missing '-', a non-digit
+// inside X or Y, an empty X or Y, or a value too large for a short -- is
+// reported on stderr and the process is aborted.
+int AnnotatedParallelSentence::ReadAlignmentPoint(const char* buf,
+                                                  const int start,
+                                                  const int end,
+                                                  const bool permit_col,
+                                                  short* a,
+                                                  short* b) {
+  if (end - start < 3) AbortBadAlignmentPoint(buf, start, end);
+  int c = start;
+  *a = ReadAlignmentIndex(buf, start, end, true, '-', &c);
+  ++c;  // step over the '-' (or past `end` if there was none)
+  if (c >= end) AbortBadAlignmentPoint(buf, start, end);
+  *b = ReadAlignmentIndex(buf, start, end, permit_col, ':', &c);
+  return c;
+}
+
+// Parses one alignment token "A-B" from buf[start, end) and records that f
+// word A is aligned to e word B: sets aligned(A,B), bumps the per-word
+// alignment counters and appends the point to aligns_by_fword[A].
+//
+// AllocateForAlignment() must have been called first. An index outside the
+// sentence lengths is reported on stderr and aborts the process; this is a
+// runtime check rather than an assert so that release (NDEBUG) builds cannot
+// write out of bounds on bad input.
+void AnnotatedParallelSentence::ParseAlignmentPoint(const char* buf, int start, int end) {
+  short a = 0;
+  short b = 0;
+  ReadAlignmentPoint(buf, start, end, false, &a, &b);
+  // The negative checks guard against an index that wrapped around while
+  // being accumulated into a short.
+  if (a < 0 || a >= f_len || b < 0 || b >= e_len) {
+    cerr << "Alignment point out of range: " << string(buf + start, end - start) << endl;
+    abort();
+  }
+  aligned(a,b) = true;
+  ++f_aligned[a];
+  ++e_aligned[b];
+  aligns_by_fword[a].push_back(make_pair(a,b));
+  // cerr << a << " " << b << endl;
+}
+
+// Parses one span token "A-B:LABEL" from buf[start, end) and appends the
+// label to span_types(A,B). The label text is passed through TD::Convert and
+// the resulting id is stored negated.
+//
+// AllocateForAlignment() must have been called first. A token without a ':'
+// or with an empty label is reported as badly formed; indices outside the
+// dimensions span_types was allocated with (A < e_len, B <= e_len) are
+// reported as out of range. Both cases abort the process.
+void AnnotatedParallelSentence::ParseSpanLabel(const char* buf, int start, int end) {
+  short a = 0;
+  short b = 0;
+  // ReadAlignmentPoint stops on the ':' that separates "A-B" from the label.
+  const int colon = ReadAlignmentPoint(buf, start, end, true, &a, &b);
+  const int label = colon + 1;
+  if (colon >= end || buf[colon] != ':' || label >= end) {
+    cerr << "Span badly formed: " << string(buf + start, end - start) << endl; abort();
+  }
+  // span_types is sized e_len x (e_len + 1); reject anything outside it
+  // (negatives can only arise from an index that wrapped in a short).
+  if (a < 0 || a >= e_len || b < 0 || b > e_len) {
+    cerr << "Span out of range: " << string(buf + start, end - start) << endl; abort();
+  }
+  // cerr << a << " " << b << " " << string(buf + label, end - label) << endl;
+  span_types(a,b).push_back(-TD::Convert(string(buf + label, end - label)));
+}
+
+// INPUT FORMAT
+// ein haus ||| a house ||| 0-0 1-1 ||| 0-0:DT 1-1:NN 0-1:NP
+//
+// Parses one NUL-terminated input line into this object, replacing whatever
+// it held before. Fields are separated by a stand-alone "|||" token and
+// tokens within a field by spaces/tabs:
+//   field 0  f (French) words
+//   field 1  e (English) words
+//   field 2  alignment points  "A-B"
+//   field 3  span labels       "A-B:LABEL"   (optional)
+//
+// A fourth "|||" prints a "Too many fields" message and stops parsing at
+// that point. Fewer than two "|||" separators aborts the process. An empty
+// f or e sentence only prints a warning.
+//
+// Every token -- including the last one on the line -- goes through the same
+// separator test, so a line ending in "|||" is handled identically with or
+// without trailing whitespace.
+void AnnotatedParallelSentence::ParseInputLine(const char* buf) {
+  Reset();
+  int ptr = 0;
+  SkipWhitespace(buf, &ptr);
+  int state = 0;  // 0 = French, 1 = English, 2 = Alignment, 3 = Spans
+  while (buf[ptr]) {
+    // Delimit the next token as buf[start, ptr); it ends at whitespace or at
+    // the terminating NUL.
+    const int start = ptr;
+    while (buf[ptr] && !IsWhitespace(buf[ptr])) { ++ptr; }
+
+    const bool is_separator = (ptr - start == 3 &&
+                               buf[start] == '|' &&
+                               buf[start+1] == '|' &&
+                               buf[start+2] == '|');
+    if (is_separator) {
+      ++state;
+      if (state == 4) { cerr << "Too many fields (ignoring):\n " << buf << endl; return; }
+      if (state == 2) {
+        // Both sentences are complete: size the alignment structures before
+        // the first alignment point is read.
+        AllocateForAlignment();
+      }
+    } else {
+      // Build tokens from (pointer, length) so only the token is copied.
+      switch (state) {
+        case 0: f.push_back(TD::Convert(string(buf + start, ptr - start))); break;
+        case 1: e.push_back(TD::Convert(string(buf + start, ptr - start))); break;
+        case 2: ParseAlignmentPoint(buf, start, ptr); break;
+        case 3: ParseSpanLabel(buf, start, ptr); break;
+        default: cerr << "Can't happen\n"; abort();
+      }
+    }
+    SkipWhitespace(buf, &ptr);
+  }
+  if (state < 2) {
+    cerr << "Not enough fields: " << buf << endl;
+    abort();
+  }
+  if (e.empty() || f.empty()) {
+    // Deliberately only a warning: the caller still gets the parsed object.
+    cerr << "Sentences must not be empty: " << buf << endl;
+  }
+}
+