diff options
Diffstat (limited to 'extools/sentence_pair.cc')
| -rw-r--r-- | extools/sentence_pair.cc | 87 | 
1 files changed, 62 insertions, 25 deletions
| diff --git a/extools/sentence_pair.cc b/extools/sentence_pair.cc index 02df3349..b2881737 100644 --- a/extools/sentence_pair.cc +++ b/extools/sentence_pair.cc @@ -6,12 +6,14 @@  #include <vector>  #include <utility>  #include <set> +#include <boost/tuple/tuple_comparison.hpp>  #include "tdict.h"  #include "wordid.h"  #include "array2d.h"  using namespace std; +using namespace boost;  namespace {    inline bool IsWhitespace(char c) { return c == ' ' || c == '\t'; } @@ -38,7 +40,6 @@ void AnnotatedParallelSentence::AllocateForAlignment() {    f_aligned.resize(f_len, 0);    e_aligned.resize(e_len, 0);    aligns_by_fword.resize(f_len); -  span_types.resize(e_len, e_len+1);  }  // read an alignment point of the form X-Y where X and Y are strings @@ -48,44 +49,76 @@ int AnnotatedParallelSentence::ReadAlignmentPoint(const char* buf,                                                    const int start,                                                    const int end,                                                    const bool permit_col, -                                                  short* a, -                                                  short* b) { +                                                  short* a, short* b, short* c, short* d) {    if (end - start < 3) { -    cerr << "Alignment point badly formed: " << string(buf, start, end-start) << endl << buf << endl; +    cerr << "Alignment point badly formed 1: " << string(buf, start, end-start) << endl << buf << endl;      exit(1);    } -  int c = start; +  int ch = start;    *a = 0; -  while(c < end && buf[c] != '-') { -    if (buf[c] < '0' || buf[c] > '9') { -      cerr << "Alignment point badly formed: " << string(buf, start, end-start) << endl << buf << endl; +  while(ch < end && buf[ch] != '-') { +    if (buf[ch] < '0' || buf[ch] > '9') { +      cerr << "Alignment point badly formed 2: " << string(buf, start, end-start) << endl << buf << endl;        exit(1);      }      (*a) *= 10; -    (*a) += buf[c] - '0'; -    ++c; +    (*a) += buf[ch] - '0'; +    ++ch;    } -  ++c; -  if (c >= end) { -    cerr << "Alignment point badly formed: " << string(buf, start, end-start) << endl << buf << endl; +  ++ch; +  if (ch >= end) { +    cerr << "Alignment point badly formed 3: " << string(buf, start, end-start) << endl << buf << endl;      exit(1);    }    (*b) = 0; -  while(c < end && (!permit_col || (permit_col && buf[c] != ':'))) { -    if (buf[c] < '0' || buf[c] > '9') { -      cerr << "Alignment point badly formed: " << string(buf, start, end-start) << endl << buf << endl; +  while(ch < end && (c == 0 && (!permit_col || (permit_col && buf[ch] != ':')) || c != 0 && buf[ch] != '-')) { +    if (buf[ch] < '0' || buf[ch] > '9') { +      cerr << "Alignment point badly formed 4: " << string(buf, start, end-start) << endl << buf << endl;        exit(1);      }      (*b) *= 10; -    (*b) += buf[c] - '0'; -    ++c; +    (*b) += buf[ch] - '0'; +    ++ch;    } -  return c; +  if (c != 0) +  { +      ++ch; +      if (ch >= end) { +        cerr << "Alignment point badly formed 5: " << string(buf, start, end-start) << endl << buf << endl; +        exit(1); +      } +      (*c) = 0; +      while(ch < end && buf[ch] != '-') { +        if (buf[ch] < '0' || buf[ch] > '9') { +          cerr << "Alignment point badly formed 6: " << string(buf, start, end-start) << endl << buf << endl; +          exit(1); +        } +        (*c) *= 10; +        (*c) += buf[ch] - '0'; +        ++ch; +      } +      ++ch; +      if (ch >= end) { +        cerr << "Alignment point badly formed 7: " << string(buf, start, end-start) << endl << buf << endl; +        exit(1); +      } +      (*d) = 0; +      while(ch < end && (!permit_col || (permit_col && buf[ch] != ':'))) { +        if (buf[ch] < '0' || buf[ch] > '9') { +          cerr << "Alignment point badly formed 8: " << string(buf, start, end-start) << endl << buf << endl; +          exit(1); +        } +        (*d) *= 10; +        (*d) += buf[ch] - '0'; +        ++ch; +      } +  } +  return ch;  }  void AnnotatedParallelSentence::ParseAlignmentPoint(const char* buf, int start, int end) {    short a, b; -  ReadAlignmentPoint(buf, start, end, false, &a, &b); +  ReadAlignmentPoint(buf, start, end, false, &a, &b, 0, 0);    if (a >= f_len || b >= e_len) {      cerr << "(" << a << ',' << b << ") is out of bounds. INPUT=\n" << buf << endl;      exit(1); @@ -98,18 +131,22 @@ void AnnotatedParallelSentence::ParseAlignmentPoint(const char* buf, int start,  }  void AnnotatedParallelSentence::ParseSpanLabel(const char* buf, int start, int end) { -  short a,b; -  int c = ReadAlignmentPoint(buf, start, end, true, &a, &b) + 1; -  if (buf[c-1] != ':' || c >= end) { +  short a,b,c,d; +  int ch = ReadAlignmentPoint(buf, start, end, true, &a, &b, &c, &d) + 1; +  if (buf[ch-1] != ':' || ch >= end) {      cerr << "Span badly formed: " << string(buf, start, end-start) << endl << buf << endl;      exit(1);    } -  if (a >= e_len || b > e_len) { +  if (a >= f_len || b > f_len) {      cerr << "(" << a << ',' << b << ") is out of bounds in labeled span. INPUT=\n" << buf << endl;      exit(1);    } +  if (c >= e_len || d > e_len) { +    cerr << "(" << c << ',' << d << ") is out of bounds in labeled span. INPUT=\n" << buf << endl; +    exit(1); +  }    // cerr << a << " " << b << " " << string(buf,c,end-c) << endl; -  span_types(a,b).push_back(-TD::Convert(string(buf, c, end-c))); +  span_types[make_tuple(a,b,c,d)].push_back(-TD::Convert(string(buf, ch, end-ch)));  }  // INPUT FORMAT | 
