author     Chris Dyer <cdyer@cs.cmu.edu>    2012-03-09 22:23:50 -0500
committer  Chris Dyer <cdyer@cs.cmu.edu>    2012-03-09 22:23:50 -0500
commit     113317266853abff2e1c0c3e889017d0eee55c93 (patch)
tree       3fb77e29acaf45e1a9a006f8f11fb2b021b5987b /gi/pf/transliterations.cc
parent     78bf1457f606dd3880c2bc912201c4945d3f85b4 (diff)
moar
Diffstat (limited to 'gi/pf/transliterations.cc')
-rw-r--r--  gi/pf/transliterations.cc  223
1 file changed, 201 insertions, 22 deletions
diff --git a/gi/pf/transliterations.cc b/gi/pf/transliterations.cc
index 8ea4ebd2..2200715e 100644
--- a/gi/pf/transliterations.cc
+++ b/gi/pf/transliterations.cc
@@ -5,14 +5,173 @@
#include "boost/shared_ptr.hpp"
+#include "backward.h"
#include "filelib.h"
-#include "ccrp.h"
+#include "tdict.h"
+#include "trule.h"
+#include "ccrp_nt.h"
#include "m.h"
#include "reachability.h"
using namespace std;
using namespace std::tr1;
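+
+// Truncated length model: for each source chunk length i, p(j | i) is
+// proportional to 0.01 + Poisson(j; i * expected_src_to_trg_ratio), then
+// renormalized over j = 1..max_trg_size.  The additive 0.01 keeps every
+// length in range representable, even far from the Poisson mode.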
+struct TruncatedConditionalLengthModel {
+ TruncatedConditionalLengthModel(unsigned max_src_size, unsigned max_trg_size, double expected_src_to_trg_ratio) :
+ plens(max_src_size+1, vector<prob_t>(max_trg_size+1, 0.0)) {
+ for (unsigned i = 1; i <= max_src_size; ++i) {
+ prob_t z = prob_t::Zero();
+ for (unsigned j = 1; j <= max_trg_size; ++j)
+ z += (plens[i][j] = prob_t(0.01 + exp(Md::log_poisson(j, i * expected_src_to_trg_ratio))));
+ for (unsigned j = 1; j <= max_trg_size; ++j)
+ plens[i][j] /= z;
+ //for (unsigned j = 1; j <= max_trg_size; ++j)
+ // cerr << "P(trg_len=" << j << " | src_len=" << i << ") = " << plens[i][j] << endl;
+ }
+ }
+
+  // returns p(tlen | slen) for *chunks*, not full words
+ inline const prob_t& operator()(int slen, int tlen) const {
+ return plens[slen][tlen];
+ }
+
+ vector<vector<prob_t> > plens;
+};
+
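+// Base distribution over (source chunk, target chunk) pairs, conditioned on the
+// source chunk.  Only the length term p(tlen | slen) is computed so far; the
+// per-character emission term is not yet implemented (see the assert below).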
+struct CondBaseDist {
+ CondBaseDist(unsigned max_src_size, unsigned max_trg_size, double expected_src_to_trg_ratio) :
+ tclm(max_src_size, max_trg_size, expected_src_to_trg_ratio) {}
+
+ prob_t operator()(const vector<WordID>& src, unsigned sf, unsigned st,
+ const vector<WordID>& trg, unsigned tf, unsigned tt) const {
+ prob_t p = tclm(st - sf, tt - tf); // target len | source length ~ TCLM(source len)
+ assert(!"not impl");
+ return p;
+ }
+ inline prob_t operator()(const vector<WordID>& src, const vector<WordID>& trg) const {
+ return (*this)(src, 0, src.size(), trg, 0, trg.size());
+ }
+ TruncatedConditionalLengthModel tclm;
+};
+
+// represents transliteration phrase probabilities, e.g.
+// p( a l - | A l ) , p( o | A w ) , ...
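+// Each conditioning context (the source letter sequence rule.f_) indexes its own
+// CCRP_NoTable over complete rewrite rules; the CondBaseDist rp0 serves as the
+// base measure shared by all contexts.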
+struct TransliterationChunkConditionalModel {
+ explicit TransliterationChunkConditionalModel(const CondBaseDist& pp0) :
+ d(0.0),
+ strength(1.0),
+ rp0(pp0) {
+ }
+
+ void Summary() const {
+ std::cerr << "Number of conditioning contexts: " << r.size() << std::endl;
+ for (RuleModelHash::const_iterator it = r.begin(); it != r.end(); ++it) {
+ std::cerr << TD::GetString(it->first) << " \t(\\alpha = " << it->second.alpha() << ") --------------------------" << std::endl;
+ for (CCRP_NoTable<TRule>::const_iterator i2 = it->second.begin(); i2 != it->second.end(); ++i2)
+ std::cerr << " " << i2->second << '\t' << i2->first << std::endl;
+ }
+ }
+
+ int DecrementRule(const TRule& rule) {
+ RuleModelHash::iterator it = r.find(rule.f_);
+ assert(it != r.end());
+ int count = it->second.decrement(rule);
+ if (count) {
+ if (it->second.num_customers() == 0) r.erase(it);
+ }
+ return count;
+ }
+
+ int IncrementRule(const TRule& rule) {
+ RuleModelHash::iterator it = r.find(rule.f_);
+ if (it == r.end()) {
+ it = r.insert(make_pair(rule.f_, CCRP_NoTable<TRule>(strength))).first;
+ }
+ int count = it->second.increment(rule);
+ return count;
+ }
+
+ void IncrementRules(const std::vector<TRulePtr>& rules) {
+    for (unsigned i = 0; i < rules.size(); ++i)
+ IncrementRule(*rules[i]);
+ }
+
+ void DecrementRules(const std::vector<TRulePtr>& rules) {
+    for (unsigned i = 0; i < rules.size(); ++i)
+ DecrementRule(*rules[i]);
+ }
+
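+  // Predictive probability of a rule under the per-context CRP: an unseen
+  // conditioning context falls back to the base distribution rp0; otherwise
+  // CCRP_NoTable::prob mixes the context's counts with rp0, roughly
+  // (count(rule) + strength * p0) / (num_customers + strength).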
+ prob_t RuleProbability(const TRule& rule) const {
+ prob_t p;
+ RuleModelHash::const_iterator it = r.find(rule.f_);
+ if (it == r.end()) {
+ p = rp0(rule.f_, rule.e_);
+ } else {
+ p = it->second.prob(rule, rp0(rule.f_, rule.e_));
+ }
+ return p;
+ }
+
+ double LogLikelihood(const double& dd, const double& aa) const {
+ if (aa <= -dd) return -std::numeric_limits<double>::infinity();
+ //double llh = Md::log_beta_density(dd, 10, 3) + Md::log_gamma_density(aa, 1, 1);
+ double llh = //Md::log_beta_density(dd, 1, 1) +
+ Md::log_gamma_density(dd + aa, 1, 1);
+    RuleModelHash::const_iterator it;
+ for (it = r.begin(); it != r.end(); ++it)
+ llh += it->second.log_crp_prob(aa);
+ return llh;
+ }
+
+ struct AlphaResampler {
+ AlphaResampler(const TransliterationChunkConditionalModel& m) : m_(m) {}
+ const TransliterationChunkConditionalModel& m_;
+ double operator()(const double& proposed_strength) const {
+ return m_.LogLikelihood(m_.d, proposed_strength);
+ }
+ };
+
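+  // Resample the CRP concentration (strength) by slice sampling its posterior;
+  // LogLikelihood supplies a Gamma(1,1) prior on d + strength plus the CRP terms.
+  // The discount d stays fixed at 0 because CCRP_NoTable has no discount
+  // (the discount resampling code below is disabled with #if 0).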
+ void ResampleHyperparameters(MT19937* rng) {
+    RuleModelHash::iterator it;
+ //const unsigned nloop = 5;
+ const unsigned niterations = 10;
+ //DiscountResampler dr(*this);
+ AlphaResampler ar(*this);
+#if 0
+ for (int iter = 0; iter < nloop; ++iter) {
+ strength = slice_sampler1d(ar, strength, *rng, -d + std::numeric_limits<double>::min(),
+ std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations);
+ double min_discount = std::numeric_limits<double>::min();
+ if (strength < 0.0) min_discount -= strength;
+ d = slice_sampler1d(dr, d, *rng, min_discount,
+ 1.0, 0.0, niterations, 100*niterations);
+ }
+#endif
+ strength = slice_sampler1d(ar, strength, *rng, -d,
+ std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations);
+ std::cerr << "CTMModel(alpha=" << strength << ") = " << LogLikelihood(d, strength) << std::endl;
+ for (it = r.begin(); it != r.end(); ++it) {
+#if 0
+ it->second.set_discount(d);
+#endif
+ it->second.set_alpha(strength);
+ }
+ }
+
+ prob_t Likelihood() const {
+ prob_t p; p.logeq(LogLikelihood(d, strength));
+ return p;
+ }
+
+ const CondBaseDist& rp0;
+ typedef std::tr1::unordered_map<std::vector<WordID>,
+ CCRP_NoTable<TRule>,
+ boost::hash<std::vector<WordID> > > RuleModelHash;
+ RuleModelHash r;
+ double d, strength;
+};
+
struct GraphStructure {
GraphStructure() : r() {}
// leak memory - these are basically static
@@ -20,9 +179,9 @@ struct GraphStructure {
bool IsReachable() const { return r->nodes > 0; }
};
-struct BackwardEstimates {
- BackwardEstimates() : gs(), backward() {}
- explicit BackwardEstimates(const GraphStructure& g) :
+struct ProbabilityEstimates {
+ ProbabilityEstimates() : gs(), backward() {}
+ explicit ProbabilityEstimates(const GraphStructure& g) :
gs(&g), backward() {
if (g.r->nodes > 0)
backward = new float[g.r->nodes];
@@ -36,24 +195,32 @@ struct BackwardEstimates {
}
  // returns a backward estimate
- double operator()(int src_covered, int trg_covered) const {
+ double Backward(int src_covered, int trg_covered) const {
if (!backward) return 0;
int ind = gs->r->node_addresses[src_covered][trg_covered];
if (ind < 0) return 0;
return backward[ind];
}
+
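+  // estp is presumably meant to cache the transliteration probability for this
+  // (src, trg) pair (it is returned by EstimateProbability below); backward holds
+  // the per-node backward scores over the reachability grid.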
+ prob_t estp;
+ float* backward;
private:
const GraphStructure* gs;
- float* backward;
};
struct TransliterationsImpl {
- TransliterationsImpl(int max_src, int max_trg, double fr) :
+ TransliterationsImpl(int max_src, int max_trg, double sr, const BackwardEstimator& b) :
+ cp0(max_src, max_trg, sr),
+ tccm(cp0),
+ be(b),
kMAX_SRC_CHUNK(max_src),
kMAX_TRG_CHUNK(max_trg),
- kFILTER_RATIO(fr),
+ kS2T_RATIO(sr),
tot_pairs(), tot_mem() {
}
+ const CondBaseDist cp0;
+ TransliterationChunkConditionalModel tccm;
+ const BackwardEstimator& be;
void Initialize(WordID src, const vector<WordID>& src_lets, WordID trg, const vector<WordID>& trg_lets) {
const size_t src_len = src_lets.size();
@@ -63,20 +230,29 @@ struct TransliterationsImpl {
if (src_len >= graphs.size()) graphs.resize(src_len + 1);
if (trg_len >= graphs[src_len].size()) graphs[src_len].resize(trg_len + 1);
GraphStructure& gs = graphs[src_len][trg_len];
- if (!gs.r)
- gs.r = new Reachability(src_len, trg_len, kMAX_SRC_CHUNK, kMAX_TRG_CHUNK, kFILTER_RATIO);
+ if (!gs.r) {
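+      // Heuristic length filter: rat = exp(|log(trg_len / (src_len * kS2T_RATIO))|)
+      // is the multiplicative deviation of the observed target/source length ratio
+      // from the expected one (always >= 1).  Pairs that deviate too much are
+      // forbidden by building an empty reachability graph (max chunk sizes 0).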
+ double rat = exp(fabs(log(trg_len / (src_len * kS2T_RATIO))));
+      if ((rat > 1.5 && src_len >= 6) || rat > 2.4) {
+ cerr << " ** Forbidding transliterations of size " << src_len << "," << trg_len << ": " << rat << endl;
+ gs.r = new Reachability(src_len, trg_len, 0, 0);
+ } else {
+ gs.r = new Reachability(src_len, trg_len, kMAX_SRC_CHUNK, kMAX_TRG_CHUNK);
+ }
+ }
+
const Reachability& r = *gs.r;
// init backward estimates
- if (src >= bes.size()) bes.resize(src + 1);
- unordered_map<WordID, BackwardEstimates>::iterator it = bes[src].find(trg);
- if (it != bes[src].end()) return; // already initialized
+ if (src >= ests.size()) ests.resize(src + 1);
+ unordered_map<WordID, ProbabilityEstimates>::iterator it = ests[src].find(trg);
+ if (it != ests[src].end()) return; // already initialized
- it = bes[src].insert(make_pair(trg, BackwardEstimates(gs))).first;
- BackwardEstimates& b = it->second;
+ it = ests[src].insert(make_pair(trg, ProbabilityEstimates(gs))).first;
+ ProbabilityEstimates& est = it->second;
if (!gs.r->nodes) return; // not derivable subject to length constraints
- // TODO
+ be.InitializeGrid(src_lets, trg_lets, r, kS2T_RATIO, est.backward);
+ cerr << TD::GetString(src_lets) << " ||| " << TD::GetString(trg_lets) << " ||| " << (est.backward[0] / trg_lets.size()) << endl;
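+    // est.backward[0] is the backward score at node 0 (presumably the start node,
+    // i.e. an estimate for the whole pair), printed normalized per target letter.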
tot_pairs++;
tot_mem += sizeof(float) * gs.r->nodes;
}
@@ -92,8 +268,11 @@ struct TransliterationsImpl {
const vector<GraphStructure>& tv = graphs[src.size()];
assert(trg.size() < tv.size());
const GraphStructure& gs = tv[trg.size()];
- // TODO: do prob
- return prob_t::Zero();
+ if (gs.r->nodes == 0)
+ return prob_t::Zero();
+ const unordered_map<WordID, ProbabilityEstimates>::const_iterator it = ests[s].find(t);
+ assert(it != ests[s].end());
+ return it->second.estp;
}
void GraphSummary() const {
@@ -126,15 +305,15 @@ struct TransliterationsImpl {
const int kMAX_SRC_CHUNK;
const int kMAX_TRG_CHUNK;
- const double kFILTER_RATIO;
+ const double kS2T_RATIO;
unsigned tot_pairs;
size_t tot_mem;
vector<vector<GraphStructure> > graphs; // graphs[src_len][trg_len]
- vector<unordered_map<WordID, BackwardEstimates> > bes; // bes[src][trg]
+ vector<unordered_map<WordID, ProbabilityEstimates> > ests; // ests[src][trg]
};
-Transliterations::Transliterations(int max_src, int max_trg, double fr) :
- pimpl_(new TransliterationsImpl(max_src, max_trg, fr)) {}
+Transliterations::Transliterations(int max_src, int max_trg, double sr, const BackwardEstimator& be) :
+ pimpl_(new TransliterationsImpl(max_src, max_trg, sr, be)) {}
Transliterations::~Transliterations() { delete pimpl_; }
void Transliterations::Initialize(WordID src, const vector<WordID>& src_lets, WordID trg, const vector<WordID>& trg_lets) {