From b6eede632af4fa58a6f5325ee0d059c02a898b9f Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Sat, 24 Mar 2012 23:04:46 -0400 Subject: rename aligner, add support for distinguishing translation / transliteration --- utils/Makefile.am | 2 +- utils/alignment_io.cc | 97 ++++++++++++++++++++++++++++++++++++++++++++++ utils/alignment_io.h | 42 ++++++++++++++++++++ utils/alignment_pharaoh.cc | 77 ------------------------------------ utils/alignment_pharaoh.h | 14 ------- utils/atools.cc | 8 ++-- 6 files changed, 144 insertions(+), 96 deletions(-) create mode 100644 utils/alignment_io.cc create mode 100644 utils/alignment_io.h delete mode 100644 utils/alignment_pharaoh.cc delete mode 100644 utils/alignment_pharaoh.h (limited to 'utils') diff --git a/utils/Makefile.am b/utils/Makefile.am index 3ea21835..2fc6ae21 100644 --- a/utils/Makefile.am +++ b/utils/Makefile.am @@ -23,7 +23,7 @@ atools_SOURCES = atools.cc noinst_LIBRARIES = libutils.a libutils_a_SOURCES = \ - alignment_pharaoh.cc \ + alignment_io.cc \ b64tools.cc \ corpus_tools.cc \ dict.cc \ diff --git a/utils/alignment_io.cc b/utils/alignment_io.cc new file mode 100644 index 00000000..1d923f7f --- /dev/null +++ b/utils/alignment_io.cc @@ -0,0 +1,97 @@ +#include "utils/alignment_io.h" + +using namespace std; + +static bool is_digit(char x) { return x >= '0' && x <= '9'; } + +boost::shared_ptr > AlignmentIO::ReadPharaohAlignmentGrid(const string& al) { + int max_x = 0; + int max_y = 0; + int i = 0; + size_t pos = al.rfind(" ||| "); + if (pos != string::npos) { i = pos + 5; } + while (i < al.size()) { + if (al[i] == '\n' || al[i] == '\r') break; + int x = 0; + while(i < al.size() && is_digit(al[i])) { + x *= 10; + x += al[i] - '0'; + ++i; + } + if (x > max_x) max_x = x; + assert(i < al.size()); + if(al[i] != '-') { + cerr << "BAD ALIGNMENT: " << al << endl; + abort(); + } + ++i; + int y = 0; + while(i < al.size() && is_digit(al[i])) { + y *= 10; + y += al[i] - '0'; + ++i; + } + if (y > max_y) max_y = y; + while(i < al.size() && al[i] == ' ') { ++i; } + } + + boost::shared_ptr > grid(new Array2D(max_x + 1, max_y + 1)); + i = 0; + if (pos != string::npos) { i = pos + 5; } + while (i < al.size()) { + if (al[i] == '\n' || al[i] == '\r') break; + int x = 0; + while(i < al.size() && is_digit(al[i])) { + x *= 10; + x += al[i] - '0'; + ++i; + } + assert(i < al.size()); + assert(al[i] == '-'); + ++i; + int y = 0; + while(i < al.size() && is_digit(al[i])) { + y *= 10; + y += al[i] - '0'; + ++i; + } + (*grid)(x, y) = true; + while(i < al.size() && al[i] == ' ') { ++i; } + } + // cerr << *grid << endl; + return grid; +} + +void AlignmentIO::SerializePharaohFormat(const Array2D& alignment, ostream* o) { + ostream& out = *o; + bool need_space = false; + for (int i = 0; i < alignment.width(); ++i) + for (int j = 0; j < alignment.height(); ++j) + if (alignment(i,j)) { + if (need_space) out << ' '; else need_space = true; + out << i << '-' << j; + } + out << endl; +} + +void AlignmentIO::SerializeTypedAlignment(const Array2D& alignment, ostream* o) { + ostream& out = *o; + bool need_space = false; + for (int i = 0; i < alignment.width(); ++i) + for (int j = 0; j < alignment.height(); ++j) { + const AlignmentType& aij = alignment(i,j); + if (aij != kNONE) { + if (need_space) out << ' '; else need_space = true; + if (aij == kTRANSLATION) {} + else if (aij == kTRANSLITERATION) { + out << 'T' << ':'; + } else { + cerr << "\nUnexpected alignment point type: " << static_cast(aij) << endl; + abort(); + } + out << i << '-' << j; + } + } + out << endl; +} + diff --git a/utils/alignment_io.h b/utils/alignment_io.h new file mode 100644 index 00000000..36bcecd7 --- /dev/null +++ b/utils/alignment_io.h @@ -0,0 +1,42 @@ +#ifndef _ALIGNMENT_IO_H_ +#define _ALIGNMENT_IO_H_ + +#include +#include +#include +#include "array2d.h" + +struct AlignmentIO { + enum AlignmentType { kNONE = 0, kTRANSLATION = 1, kTRANSLITERATION = 2 }; + + static boost::shared_ptr > ReadPharaohAlignmentGrid(const std::string& al); + static void SerializePharaohFormat(const Array2D& alignment, std::ostream* out); + static void SerializeTypedAlignment(const Array2D& alignment, std::ostream* out); +}; + +inline std::ostream& operator<<(std::ostream& os, const Array2D& m) { + os << ' '; + for (int j=0; j - -using namespace std; - -static bool is_digit(char x) { return x >= '0' && x <= '9'; } - -boost::shared_ptr > AlignmentPharaoh::ReadPharaohAlignmentGrid(const string& al) { - int max_x = 0; - int max_y = 0; - int i = 0; - size_t pos = al.rfind(" ||| "); - if (pos != string::npos) { i = pos + 5; } - while (i < al.size()) { - if (al[i] == '\n' || al[i] == '\r') break; - int x = 0; - while(i < al.size() && is_digit(al[i])) { - x *= 10; - x += al[i] - '0'; - ++i; - } - if (x > max_x) max_x = x; - assert(i < al.size()); - if(al[i] != '-') { - cerr << "BAD ALIGNMENT: " << al << endl; - abort(); - } - ++i; - int y = 0; - while(i < al.size() && is_digit(al[i])) { - y *= 10; - y += al[i] - '0'; - ++i; - } - if (y > max_y) max_y = y; - while(i < al.size() && al[i] == ' ') { ++i; } - } - - boost::shared_ptr > grid(new Array2D(max_x + 1, max_y + 1)); - i = 0; - if (pos != string::npos) { i = pos + 5; } - while (i < al.size()) { - if (al[i] == '\n' || al[i] == '\r') break; - int x = 0; - while(i < al.size() && is_digit(al[i])) { - x *= 10; - x += al[i] - '0'; - ++i; - } - assert(i < al.size()); - assert(al[i] == '-'); - ++i; - int y = 0; - while(i < al.size() && is_digit(al[i])) { - y *= 10; - y += al[i] - '0'; - ++i; - } - (*grid)(x, y) = true; - while(i < al.size() && al[i] == ' ') { ++i; } - } - // cerr << *grid << endl; - return grid; -} - -void AlignmentPharaoh::SerializePharaohFormat(const Array2D& alignment, ostream* out) { - bool need_space = false; - for (int i = 0; i < alignment.width(); ++i) - for (int j = 0; j < alignment.height(); ++j) - if (alignment(i,j)) { - if (need_space) (*out) << ' '; else need_space = true; - (*out) << i << '-' << j; - } - (*out) << endl; -} - diff --git a/utils/alignment_pharaoh.h b/utils/alignment_pharaoh.h deleted file mode 100644 index d111c8bf..00000000 --- a/utils/alignment_pharaoh.h +++ /dev/null @@ -1,14 +0,0 @@ -#ifndef _PHARAOH_ALIGNMENT_H_ -#define _PHARAOH_ALIGNMENT_H_ - -#include -#include -#include -#include "array2d.h" - -struct AlignmentPharaoh { - static boost::shared_ptr > ReadPharaohAlignmentGrid(const std::string& al); - static void SerializePharaohFormat(const Array2D& alignment, std::ostream* out); -}; - -#endif diff --git a/utils/atools.cc b/utils/atools.cc index ba56dd6c..bce7822e 100644 --- a/utils/atools.cc +++ b/utils/atools.cc @@ -8,7 +8,7 @@ #include #include "filelib.h" -#include "alignment_pharaoh.h" +#include "alignment_io.h" namespace po = boost::program_options; using namespace std; @@ -348,9 +348,9 @@ int main(int argc, char **argv) { } if (line1.empty() && !*in1) break; boost::shared_ptr > out(new Array2D); - boost::shared_ptr > a1 = AlignmentPharaoh::ReadPharaohAlignmentGrid(line1); + boost::shared_ptr > a1 = AlignmentIO::ReadPharaohAlignmentGrid(line1); if (in2) { - boost::shared_ptr > a2 = AlignmentPharaoh::ReadPharaohAlignmentGrid(line2); + boost::shared_ptr > a2 = AlignmentIO::ReadPharaohAlignmentGrid(line2); cmd.Apply(*a1, *a2, out.get()); } else { Array2D dummy; @@ -358,7 +358,7 @@ int main(int argc, char **argv) { } if (cmd.Result() == 1) { - AlignmentPharaoh::SerializePharaohFormat(*out, &cout); + AlignmentIO::SerializePharaohFormat(*out, &cout); } } if (cmd.Result() == 2) -- cgit v1.2.3