summaryrefslogtreecommitdiff
path: root/gi/pf/transliterations.cc
blob: 8ea4ebd2748d7b5c5047f681a5ae9ebaf17c3107 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
#include "transliterations.h"

#include <iostream>
#include <vector>

#include "boost/shared_ptr.hpp"

#include "filelib.h"
#include "ccrp.h"
#include "m.h"
#include "reachability.h"

using namespace std;
using namespace std::tr1;

struct GraphStructure {
  GraphStructure() : r() {}
  // leak memory - these are basically static
  const Reachability* r;
  bool IsReachable() const { return r->nodes > 0; }
};

struct BackwardEstimates {
  BackwardEstimates() : gs(), backward() {}
  explicit BackwardEstimates(const GraphStructure& g) :
      gs(&g), backward() {
    if (g.r->nodes > 0)
      backward = new float[g.r->nodes];
  }
  // leak memory, these are static

  // returns an estimate of the marginal probability
  double MarginalEstimate() const {
    if (!backward) return 0;
    return backward[0];
  }

  // returns an backward estimate
  double operator()(int src_covered, int trg_covered) const {
    if (!backward) return 0;
    int ind = gs->r->node_addresses[src_covered][trg_covered];
    if (ind < 0) return 0;
    return backward[ind];
  }
 private:
  const GraphStructure* gs;
  float* backward;
};

struct TransliterationsImpl {
  TransliterationsImpl(int max_src, int max_trg, double fr) :
      kMAX_SRC_CHUNK(max_src),
      kMAX_TRG_CHUNK(max_trg),
      kFILTER_RATIO(fr),
      tot_pairs(), tot_mem() {
  }

  void Initialize(WordID src, const vector<WordID>& src_lets, WordID trg, const vector<WordID>& trg_lets) {
    const size_t src_len = src_lets.size();
    const size_t trg_len = trg_lets.size();

    // init graph structure
    if (src_len >= graphs.size()) graphs.resize(src_len + 1);
    if (trg_len >= graphs[src_len].size()) graphs[src_len].resize(trg_len + 1);
    GraphStructure& gs = graphs[src_len][trg_len];
    if (!gs.r)
      gs.r = new Reachability(src_len, trg_len, kMAX_SRC_CHUNK, kMAX_TRG_CHUNK, kFILTER_RATIO);
    const Reachability& r = *gs.r;

    // init backward estimates
    if (src >= bes.size()) bes.resize(src + 1);
    unordered_map<WordID, BackwardEstimates>::iterator it = bes[src].find(trg);
    if (it != bes[src].end()) return; // already initialized

    it = bes[src].insert(make_pair(trg, BackwardEstimates(gs))).first;
    BackwardEstimates& b = it->second;
    if (!gs.r->nodes) return;  // not derivable subject to length constraints

    // TODO
    tot_pairs++;
    tot_mem += sizeof(float) * gs.r->nodes;
  }

  void Forbid(WordID src, const vector<WordID>& src_lets, WordID trg, const vector<WordID>& trg_lets) {
    const size_t src_len = src_lets.size();
    const size_t trg_len = trg_lets.size();
    // TODO
  }

  prob_t EstimateProbability(WordID s, const vector<WordID>& src, WordID t, const vector<WordID>& trg) const {
    assert(src.size() < graphs.size());
    const vector<GraphStructure>& tv = graphs[src.size()];
    assert(trg.size() < tv.size());
    const GraphStructure& gs = tv[trg.size()];
    // TODO: do prob
    return prob_t::Zero();
  }

  void GraphSummary() const {
    double to = 0;
    double tn = 0;
    double tt = 0;
    for (int i = 0; i < graphs.size(); ++i) {
      const vector<GraphStructure>& vt = graphs[i];
      for (int j = 0; j < vt.size(); ++j) {
        const GraphStructure& gs = vt[j];
        if (!gs.r) continue;
        tt++;
        for (int k = 0; k < i; ++k) {
          for (int l = 0; l < j; ++l) {
            size_t c = gs.r->valid_deltas[k][l].size();
            if (c) {
              tn += 1;
              to += c;
            }
          }
        }
      }
    }
    cerr << "     Average nodes = " << (tn / tt) << endl;
    cerr << "Average out-degree = " << (to / tn) << endl;
    cerr << " Unique structures = " << tt << endl;
    cerr << "      Unique pairs = " << tot_pairs << endl;
    cerr << "          BEs size = " << (tot_mem / (1024.0*1024.0)) << " MB" << endl;
  }

  const int kMAX_SRC_CHUNK;
  const int kMAX_TRG_CHUNK;
  const double kFILTER_RATIO;
  unsigned tot_pairs;
  size_t tot_mem;
  vector<vector<GraphStructure> > graphs; // graphs[src_len][trg_len]
  vector<unordered_map<WordID, BackwardEstimates> > bes; // bes[src][trg]
};

Transliterations::Transliterations(int max_src, int max_trg, double fr) :
    pimpl_(new TransliterationsImpl(max_src, max_trg, fr)) {}
Transliterations::~Transliterations() { delete pimpl_; }

void Transliterations::Initialize(WordID src, const vector<WordID>& src_lets, WordID trg, const vector<WordID>& trg_lets) {
  pimpl_->Initialize(src, src_lets, trg, trg_lets);
}

prob_t Transliterations::EstimateProbability(WordID s, const vector<WordID>& src, WordID t, const vector<WordID>& trg) const {
  return pimpl_->EstimateProbability(s, src,t, trg);
}

void Transliterations::Forbid(WordID src, const vector<WordID>& src_lets, WordID trg, const vector<WordID>& trg_lets) {
  pimpl_->Forbid(src, src_lets, trg, trg_lets);
}

void Transliterations::GraphSummary() const {
  pimpl_->GraphSummary();
}