diff options
author | Chris Dyer <redpony@gmail.com> | 2009-12-14 20:35:11 -0500 |
---|---|---|
committer | Chris Dyer <redpony@gmail.com> | 2009-12-14 20:35:11 -0500 |
commit | 851e389dffdd6996ea32d70defb8906de80b9edc (patch) | |
tree | 8c68ee77205badc056b8ab5b332e67e3e98017df /decoder/stringlib.cc | |
parent | dc6930c00b4b276883280cff1ed6dcd9ddef03c7 (diff) |
few small fixes of alignment tools, add new orthographic similarity feature for word aligner, final naming of directories, libraries in cdec
Diffstat (limited to 'decoder/stringlib.cc')
-rw-r--r-- | decoder/stringlib.cc | 97 |
1 files changed, 97 insertions, 0 deletions
diff --git a/decoder/stringlib.cc b/decoder/stringlib.cc new file mode 100644 index 00000000..3ed74bef --- /dev/null +++ b/decoder/stringlib.cc @@ -0,0 +1,97 @@ +#include "stringlib.h" + +#include <cstdlib> +#include <cassert> +#include <iostream> +#include <map> + +#include "lattice.h" + +using namespace std; + +void ParseTranslatorInput(const string& line, string* input, string* ref) { + size_t hint = 0; + if (line.find("{\"rules\":") == 0) { + hint = line.find("}}"); + if (hint == string::npos) { + cerr << "Syntax error: " << line << endl; + abort(); + } + hint += 2; + } + size_t pos = line.find("|||", hint); + if (pos == string::npos) { *input = line; return; } + ref->clear(); + *input = line.substr(0, pos - 1); + string rline = line.substr(pos + 4); + if (rline.size() > 0) { + assert(ref); + *ref = rline; + } +} + +void ParseTranslatorInputLattice(const string& line, string* input, Lattice* ref) { + string sref; + ParseTranslatorInput(line, input, &sref); + if (sref.size() > 0) { + assert(ref); + LatticeTools::ConvertTextOrPLF(sref, ref); + } +} + +void ProcessAndStripSGML(string* pline, map<string, string>* out) { + map<string, string>& meta = *out; + string& line = *pline; + string lline = LowercaseString(line); + if (lline.find("<seg")!=0) return; + size_t close = lline.find(">"); + if (close == string::npos) return; // error + size_t end = lline.find("</seg>"); + string seg = Trim(lline.substr(4, close-4)); + string text = line.substr(close+1, end - close - 1); + for (size_t i = 1; i < seg.size(); i++) { + if (seg[i] == '=' && seg[i-1] == ' ') { + string less = seg.substr(0, i-1) + seg.substr(i); + seg = less; i = 0; continue; + } + if (seg[i] == '=' && seg[i+1] == ' ') { + string less = seg.substr(0, i+1); + if (i+2 < seg.size()) less += seg.substr(i+2); + seg = less; i = 0; continue; + } + } + line = Trim(text); + if (seg == "") return; + for (size_t i = 1; i < seg.size(); i++) { + if (seg[i] == '=') { + string label = seg.substr(0, i); + string val = seg.substr(i+1); + if (val[0] == '"') { + val = val.substr(1); + size_t close = val.find('"'); + if (close == string::npos) { + cerr << "SGML parse error: missing \"\n"; + seg = ""; + i = 0; + } else { + seg = val.substr(close+1); + val = val.substr(0, close); + i = 0; + } + } else { + size_t close = val.find(' '); + if (close == string::npos) { + seg = ""; + i = 0; + } else { + seg = val.substr(close+1); + val = val.substr(0, close); + } + } + label = Trim(label); + seg = Trim(seg); + meta[label] = val; + } + } +} + |