// path: root/decoder/stringlib.cc
// blob: 3ed74bef21b899c23f18c1407073f6865664ca16 (plain)
#include "stringlib.h"

#include <cstdlib>
#include <cassert>
#include <iostream>
#include <map>

#include "lattice.h"

using namespace std;

// Splits one translator input line of the form
//
//     <input> ||| <reference>
//
// into its two halves.
//
//   line  - the raw line. If it starts with `{"rules":`, the search for the
//           "|||" separator begins just after the first "}}", so that a
//           "|||" inside that leading block is not taken as the separator.
//           The process is aborted if such a line has no "}}".
//   input - receives the text before the separator (or the whole line when
//           there is no separator). Must not be null.
//   ref   - receives the text after the separator. May be null as long as
//           the line carries no non-empty reference. When there is no
//           separator, *ref is left untouched.
//
// Exactly one space on each side of "|||" is treated as part of the
// separator; any further whitespace is kept.
void ParseTranslatorInput(const string& line, string* input, string* ref) {
  size_t hint = 0;
  // rfind(..., 0) can only match at index 0, so this is a pure prefix test
  // that does not scan the rest of the line.
  if (line.rfind("{\"rules\":", 0) == 0) {
    hint = line.find("}}");
    if (hint == string::npos) {
      cerr << "Syntax error: " << line << endl;
      abort();
    }
    hint += 2;
  }
  const size_t pos = line.find("|||", hint);
  if (pos == string::npos) { *input = line; return; }
  // Only touch ref when the caller supplied one; a null ref is acceptable
  // for lines whose reference side is empty.
  if (ref) ref->clear();

  // Drop the single space that conventionally precedes "|||", but never a
  // real character, and never underflow when the separator is at index 0.
  size_t input_end = pos;
  if (input_end > 0 && line[input_end - 1] == ' ') --input_end;
  *input = line.substr(0, input_end);

  // Skip the single space that conventionally follows "|||". The bounds
  // checks keep a line ending in "|||" from indexing past the end.
  size_t ref_begin = pos + 3;
  if (ref_begin < line.size() && line[ref_begin] == ' ') ++ref_begin;
  if (ref_begin < line.size()) {
    assert(ref);
    ref->assign(line, ref_begin, string::npos);
  }
}

// Same as ParseTranslatorInput, except that a non-empty reference is handed
// to LatticeTools::ConvertTextOrPLF to populate *ref instead of being
// returned as a string. *ref is not touched (and may be null) when the line
// has no reference text.
void ParseTranslatorInputLattice(const string& line, string* input, Lattice* ref) {
  string reference_text;
  ParseTranslatorInput(line, input, &reference_text);
  if (reference_text.empty()) return;  // nothing to convert
  assert(ref);
  LatticeTools::ConvertTextOrPLF(reference_text, ref);
}

// Strips a leading "<seg ...>" tag (and a trailing "</seg>", if any) from
// *pline and records the tag's attributes in *out.
//
// If the LowercaseString() copy of the line does not start with "<seg", or
// contains no '>', neither *pline nor *out is modified. Otherwise:
//   - *pline becomes the Trim()med text between the first '>' and "</seg>"
//     (or the end of the line when there is no closing tag);
//   - each attr=value pair in the opening tag is stored as
//     (*out)[attr] = value. Values are either double-quoted or bare (ended
//     by the next space or the end of the tag). Spaces directly around '='
//     are ignored.
//
// NOTE(review): attribute names and values are cut from the
// LowercaseString() copy of the line, not from the original, so they are
// stored in whatever case that helper produces; only the segment text keeps
// the original spelling. Confirm this is intended for case-sensitive values.
// NOTE(review): offsets found in the lowercased copy are applied to the
// original line; this assumes LowercaseString() preserves length.
void ProcessAndStripSGML(string* pline, map<string, string>* out) {
  map<string, string>& meta = *out;
  string& line = *pline;
  string lline = LowercaseString(line);
  if (lline.find("<seg")!=0) return;
  size_t close = lline.find(">");
  if (close == string::npos) return; // error
  size_t end = lline.find("</seg>");
  // Attribute text: everything between "<seg" and the first '>'.
  string seg = Trim(lline.substr(4, close-4));
  // When "</seg>" is missing, end is npos and the wrapped count is clamped
  // by substr, so the text runs to the end of the line.
  string text = line.substr(close+1, end - close - 1);

  // Normalize "a = b" to "a=b" by deleting one space next to a '=' at a
  // time and rescanning from the start after each deletion.
  for (size_t i = 1; i < seg.size(); i++) {
    if (seg[i] != '=') continue;
    if (seg[i-1] == ' ') {
      seg.erase(i-1, 1);
      i = 0; continue;
    }
    if (i+1 < seg.size() && seg[i+1] == ' ') {
      seg.erase(i+1, 1);
      i = 0; continue;
    }
  }
  line = Trim(text);
  if (seg == "") return;

  // Repeatedly peel "label=value" off the front of seg. After each pair,
  // seg holds only the unparsed remainder, so the scan index is reset and
  // the loop increment restarts the search at index 1.
  for (size_t i = 1; i < seg.size(); i++) {
    if (seg[i] == '=') {
      string label = seg.substr(0, i);
      string val = seg.substr(i+1);
      if (!val.empty() && val[0] == '"') {
        val = val.substr(1);
        size_t close = val.find('"');
        if (close == string::npos) {
          // Unterminated quote: report it, keep the rest as the value and
          // stop parsing further attributes.
          cerr << "SGML parse error: missing \"\n";
          seg = "";
          i = 0;
        } else {
          seg = val.substr(close+1);
          val = val.substr(0, close);
          i = 0;
        }
      } else {
        size_t close = val.find(' ');
        if (close == string::npos) {
          seg = "";
          i = 0;
        } else {
          seg = val.substr(close+1);
          val = val.substr(0, close);
          // seg was replaced by the shorter remainder, so rescan from the
          // front; otherwise a following attribute whose '=' lies before
          // the old index would be skipped.
          i = 0;
        }
      }
      label = Trim(label);
      seg = Trim(seg);
      meta[label] = val;
    }
  }
}