summaryrefslogtreecommitdiff
path: root/decoder/stringlib.cc
diff options
context:
space:
mode:
authorChris Dyer <redpony@gmail.com>2009-12-14 20:35:11 -0500
committerChris Dyer <redpony@gmail.com>2009-12-14 20:35:11 -0500
commit851e389dffdd6996ea32d70defb8906de80b9edc (patch)
tree8c68ee77205badc056b8ab5b332e67e3e98017df /decoder/stringlib.cc
parentdc6930c00b4b276883280cff1ed6dcd9ddef03c7 (diff)
few small fixes of alignment tools, add new orthographic similarity feature for word aligner, final naming of directories, libraries in cdec
Diffstat (limited to 'decoder/stringlib.cc')
-rw-r--r--decoder/stringlib.cc97
1 files changed, 97 insertions, 0 deletions
diff --git a/decoder/stringlib.cc b/decoder/stringlib.cc
new file mode 100644
index 00000000..3ed74bef
--- /dev/null
+++ b/decoder/stringlib.cc
@@ -0,0 +1,97 @@
+#include "stringlib.h"
+
+#include <cstdlib>
+#include <cassert>
+#include <iostream>
+#include <map>
+
+#include "lattice.h"
+
+using namespace std;
+
+void ParseTranslatorInput(const string& line, string* input, string* ref) {
+ size_t hint = 0;
+ if (line.find("{\"rules\":") == 0) {
+ hint = line.find("}}");
+ if (hint == string::npos) {
+ cerr << "Syntax error: " << line << endl;
+ abort();
+ }
+ hint += 2;
+ }
+ size_t pos = line.find("|||", hint);
+ if (pos == string::npos) { *input = line; return; }
+ ref->clear();
+ *input = line.substr(0, pos - 1);
+ string rline = line.substr(pos + 4);
+ if (rline.size() > 0) {
+ assert(ref);
+ *ref = rline;
+ }
+}
+
+void ParseTranslatorInputLattice(const string& line, string* input, Lattice* ref) {
+ string sref;
+ ParseTranslatorInput(line, input, &sref);
+ if (sref.size() > 0) {
+ assert(ref);
+ LatticeTools::ConvertTextOrPLF(sref, ref);
+ }
+}
+
+void ProcessAndStripSGML(string* pline, map<string, string>* out) {
+ map<string, string>& meta = *out;
+ string& line = *pline;
+ string lline = LowercaseString(line);
+ if (lline.find("<seg")!=0) return;
+ size_t close = lline.find(">");
+ if (close == string::npos) return; // error
+ size_t end = lline.find("</seg>");
+ string seg = Trim(lline.substr(4, close-4));
+ string text = line.substr(close+1, end - close - 1);
+ for (size_t i = 1; i < seg.size(); i++) {
+ if (seg[i] == '=' && seg[i-1] == ' ') {
+ string less = seg.substr(0, i-1) + seg.substr(i);
+ seg = less; i = 0; continue;
+ }
+ if (seg[i] == '=' && seg[i+1] == ' ') {
+ string less = seg.substr(0, i+1);
+ if (i+2 < seg.size()) less += seg.substr(i+2);
+ seg = less; i = 0; continue;
+ }
+ }
+ line = Trim(text);
+ if (seg == "") return;
+ for (size_t i = 1; i < seg.size(); i++) {
+ if (seg[i] == '=') {
+ string label = seg.substr(0, i);
+ string val = seg.substr(i+1);
+ if (val[0] == '"') {
+ val = val.substr(1);
+ size_t close = val.find('"');
+ if (close == string::npos) {
+ cerr << "SGML parse error: missing \"\n";
+ seg = "";
+ i = 0;
+ } else {
+ seg = val.substr(close+1);
+ val = val.substr(0, close);
+ i = 0;
+ }
+ } else {
+ size_t close = val.find(' ');
+ if (close == string::npos) {
+ seg = "";
+ i = 0;
+ } else {
+ seg = val.substr(close+1);
+ val = val.substr(0, close);
+ }
+ }
+ label = Trim(label);
+ seg = Trim(seg);
+ meta[label] = val;
+ }
+ }
+}
+