summaryrefslogtreecommitdiff
path: root/gi/pf/transliterations.cc
diff options
context:
space:
mode:
authorKenneth Heafield <github@kheafield.com>2012-10-22 12:07:20 +0100
committerKenneth Heafield <github@kheafield.com>2012-10-22 12:07:20 +0100
commit5f98fe5c4f2a2090eeb9d30c030305a70a8347d1 (patch)
tree9b6002f850e6dea1e3400c6b19bb31a9cdf3067f /gi/pf/transliterations.cc
parentcf9994131993b40be62e90e213b1e11e6b550143 (diff)
parent21825a09d97c2e0afd20512f306fb25fed55e529 (diff)
Merge remote branch 'upstream/master'
Conflicts: Jamroot bjam decoder/Jamfile decoder/cdec.cc dpmert/Jamfile jam-files/sanity.jam klm/lm/Jamfile klm/util/Jamfile mira/Jamfile
Diffstat (limited to 'gi/pf/transliterations.cc')
-rw-r--r--gi/pf/transliterations.cc334
1 files changed, 0 insertions, 334 deletions
diff --git a/gi/pf/transliterations.cc b/gi/pf/transliterations.cc
deleted file mode 100644
index b2996f65..00000000
--- a/gi/pf/transliterations.cc
+++ /dev/null
@@ -1,334 +0,0 @@
-#include "transliterations.h"
-
-#include <iostream>
-#include <vector>
-
-#include "boost/shared_ptr.hpp"
-
-#include "backward.h"
-#include "filelib.h"
-#include "tdict.h"
-#include "trule.h"
-#include "filelib.h"
-#include "ccrp_nt.h"
-#include "m.h"
-#include "reachability.h"
-
-using namespace std;
-using namespace std::tr1;
-
-struct TruncatedConditionalLengthModel {
- TruncatedConditionalLengthModel(unsigned max_src_size, unsigned max_trg_size, double expected_src_to_trg_ratio) :
- plens(max_src_size+1, vector<prob_t>(max_trg_size+1, 0.0)) {
- for (unsigned i = 1; i <= max_src_size; ++i) {
- prob_t z = prob_t::Zero();
- for (unsigned j = 1; j <= max_trg_size; ++j)
- z += (plens[i][j] = prob_t(0.01 + exp(Md::log_poisson(j, i * expected_src_to_trg_ratio))));
- for (unsigned j = 1; j <= max_trg_size; ++j)
- plens[i][j] /= z;
- //for (unsigned j = 1; j <= max_trg_size; ++j)
- // cerr << "P(trg_len=" << j << " | src_len=" << i << ") = " << plens[i][j] << endl;
- }
- }
-
- // return p(tlen | slen) for *chunks* not full words
- inline const prob_t& operator()(int slen, int tlen) const {
- return plens[slen][tlen];
- }
-
- vector<vector<prob_t> > plens;
-};
-
-struct CondBaseDist {
- CondBaseDist(unsigned max_src_size, unsigned max_trg_size, double expected_src_to_trg_ratio) :
- tclm(max_src_size, max_trg_size, expected_src_to_trg_ratio) {}
-
- prob_t operator()(const vector<WordID>& src, unsigned sf, unsigned st,
- const vector<WordID>& trg, unsigned tf, unsigned tt) const {
- prob_t p = tclm(st - sf, tt - tf); // target len | source length ~ TCLM(source len)
- assert(!"not impl");
- return p;
- }
- inline prob_t operator()(const vector<WordID>& src, const vector<WordID>& trg) const {
- return (*this)(src, 0, src.size(), trg, 0, trg.size());
- }
- TruncatedConditionalLengthModel tclm;
-};
-
-// represents transliteration phrase probabilities, e.g.
-// p( a l - | A l ) , p( o | A w ) , ...
-struct TransliterationChunkConditionalModel {
- explicit TransliterationChunkConditionalModel(const CondBaseDist& pp0) :
- d(0.0),
- strength(1.0),
- rp0(pp0) {
- }
-
- void Summary() const {
- std::cerr << "Number of conditioning contexts: " << r.size() << std::endl;
- for (RuleModelHash::const_iterator it = r.begin(); it != r.end(); ++it) {
- std::cerr << TD::GetString(it->first) << " \t(\\alpha = " << it->second.alpha() << ") --------------------------" << std::endl;
- for (CCRP_NoTable<TRule>::const_iterator i2 = it->second.begin(); i2 != it->second.end(); ++i2)
- std::cerr << " " << i2->second << '\t' << i2->first << std::endl;
- }
- }
-
- int DecrementRule(const TRule& rule) {
- RuleModelHash::iterator it = r.find(rule.f_);
- assert(it != r.end());
- int count = it->second.decrement(rule);
- if (count) {
- if (it->second.num_customers() == 0) r.erase(it);
- }
- return count;
- }
-
- int IncrementRule(const TRule& rule) {
- RuleModelHash::iterator it = r.find(rule.f_);
- if (it == r.end()) {
- it = r.insert(make_pair(rule.f_, CCRP_NoTable<TRule>(strength))).first;
- }
- int count = it->second.increment(rule);
- return count;
- }
-
- void IncrementRules(const std::vector<TRulePtr>& rules) {
- for (int i = 0; i < rules.size(); ++i)
- IncrementRule(*rules[i]);
- }
-
- void DecrementRules(const std::vector<TRulePtr>& rules) {
- for (int i = 0; i < rules.size(); ++i)
- DecrementRule(*rules[i]);
- }
-
- prob_t RuleProbability(const TRule& rule) const {
- prob_t p;
- RuleModelHash::const_iterator it = r.find(rule.f_);
- if (it == r.end()) {
- p = rp0(rule.f_, rule.e_);
- } else {
- p = it->second.prob(rule, rp0(rule.f_, rule.e_));
- }
- return p;
- }
-
- double LogLikelihood(const double& dd, const double& aa) const {
- if (aa <= -dd) return -std::numeric_limits<double>::infinity();
- //double llh = Md::log_beta_density(dd, 10, 3) + Md::log_gamma_density(aa, 1, 1);
- double llh = //Md::log_beta_density(dd, 1, 1) +
- Md::log_gamma_density(dd + aa, 1, 1);
- std::tr1::unordered_map<std::vector<WordID>, CCRP_NoTable<TRule>, boost::hash<std::vector<WordID> > >::const_iterator it;
- for (it = r.begin(); it != r.end(); ++it)
- llh += it->second.log_crp_prob(aa);
- return llh;
- }
-
- struct AlphaResampler {
- AlphaResampler(const TransliterationChunkConditionalModel& m) : m_(m) {}
- const TransliterationChunkConditionalModel& m_;
- double operator()(const double& proposed_strength) const {
- return m_.LogLikelihood(m_.d, proposed_strength);
- }
- };
-
- void ResampleHyperparameters(MT19937* rng) {
- std::tr1::unordered_map<std::vector<WordID>, CCRP_NoTable<TRule>, boost::hash<std::vector<WordID> > >::iterator it;
- //const unsigned nloop = 5;
- const unsigned niterations = 10;
- //DiscountResampler dr(*this);
- AlphaResampler ar(*this);
-#if 0
- for (int iter = 0; iter < nloop; ++iter) {
- strength = slice_sampler1d(ar, strength, *rng, -d + std::numeric_limits<double>::min(),
- std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations);
- double min_discount = std::numeric_limits<double>::min();
- if (strength < 0.0) min_discount -= strength;
- d = slice_sampler1d(dr, d, *rng, min_discount,
- 1.0, 0.0, niterations, 100*niterations);
- }
-#endif
- strength = slice_sampler1d(ar, strength, *rng, -d,
- std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations);
- std::cerr << "CTMModel(alpha=" << strength << ") = " << LogLikelihood(d, strength) << std::endl;
- for (it = r.begin(); it != r.end(); ++it) {
-#if 0
- it->second.set_discount(d);
-#endif
- it->second.set_alpha(strength);
- }
- }
-
- prob_t Likelihood() const {
- prob_t p; p.logeq(LogLikelihood(d, strength));
- return p;
- }
-
- const CondBaseDist& rp0;
- typedef std::tr1::unordered_map<std::vector<WordID>,
- CCRP_NoTable<TRule>,
- boost::hash<std::vector<WordID> > > RuleModelHash;
- RuleModelHash r;
- double d, strength;
-};
-
-struct GraphStructure {
- GraphStructure() : r() {}
- // leak memory - these are basically static
- const Reachability* r;
- bool IsReachable() const { return r->nodes > 0; }
-};
-
-struct ProbabilityEstimates {
- ProbabilityEstimates() : gs(), backward() {}
- explicit ProbabilityEstimates(const GraphStructure& g) :
- gs(&g), backward() {
- if (g.r->nodes > 0)
- backward = new float[g.r->nodes];
- }
- // leak memory, these are static
-
- // returns an estimate of the marginal probability
- double MarginalEstimate() const {
- if (!backward) return 0;
- return backward[0];
- }
-
- // returns an backward estimate
- double Backward(int src_covered, int trg_covered) const {
- if (!backward) return 0;
- int ind = gs->r->node_addresses[src_covered][trg_covered];
- if (ind < 0) return 0;
- return backward[ind];
- }
-
- prob_t estp;
- float* backward;
- private:
- const GraphStructure* gs;
-};
-
-struct TransliterationsImpl {
- TransliterationsImpl(int max_src, int max_trg, double sr, const BackwardEstimator& b) :
- cp0(max_src, max_trg, sr),
- tccm(cp0),
- be(b),
- kMAX_SRC_CHUNK(max_src),
- kMAX_TRG_CHUNK(max_trg),
- kS2T_RATIO(sr),
- tot_pairs(), tot_mem() {
- }
- const CondBaseDist cp0;
- TransliterationChunkConditionalModel tccm;
- const BackwardEstimator& be;
-
- void Initialize(WordID src, const vector<WordID>& src_lets, WordID trg, const vector<WordID>& trg_lets) {
- const size_t src_len = src_lets.size();
- const size_t trg_len = trg_lets.size();
-
- // init graph structure
- if (src_len >= graphs.size()) graphs.resize(src_len + 1);
- if (trg_len >= graphs[src_len].size()) graphs[src_len].resize(trg_len + 1);
- GraphStructure& gs = graphs[src_len][trg_len];
- if (!gs.r) {
- double rat = exp(fabs(log(trg_len / (src_len * kS2T_RATIO))));
- if (rat > 1.5 || (rat > 2.4 && src_len < 6)) {
- cerr << " ** Forbidding transliterations of size " << src_len << "," << trg_len << ": " << rat << endl;
- gs.r = new Reachability(src_len, trg_len, 0, 0);
- } else {
- gs.r = new Reachability(src_len, trg_len, kMAX_SRC_CHUNK, kMAX_TRG_CHUNK);
- }
- }
-
- const Reachability& r = *gs.r;
-
- // init backward estimates
- if (src >= ests.size()) ests.resize(src + 1);
- unordered_map<WordID, ProbabilityEstimates>::iterator it = ests[src].find(trg);
- if (it != ests[src].end()) return; // already initialized
-
- it = ests[src].insert(make_pair(trg, ProbabilityEstimates(gs))).first;
- ProbabilityEstimates& est = it->second;
- if (!gs.r->nodes) return; // not derivable subject to length constraints
-
- be.InitializeGrid(src_lets, trg_lets, r, kS2T_RATIO, est.backward);
- cerr << TD::GetString(src_lets) << " ||| " << TD::GetString(trg_lets) << " ||| " << (est.backward[0] / trg_lets.size()) << endl;
- tot_pairs++;
- tot_mem += sizeof(float) * gs.r->nodes;
- }
-
- void Forbid(WordID src, const vector<WordID>& src_lets, WordID trg, const vector<WordID>& trg_lets) {
- const size_t src_len = src_lets.size();
- const size_t trg_len = trg_lets.size();
- // TODO
- }
-
- prob_t EstimateProbability(WordID s, const vector<WordID>& src, WordID t, const vector<WordID>& trg) const {
- assert(src.size() < graphs.size());
- const vector<GraphStructure>& tv = graphs[src.size()];
- assert(trg.size() < tv.size());
- const GraphStructure& gs = tv[trg.size()];
- if (gs.r->nodes == 0)
- return prob_t::Zero();
- const unordered_map<WordID, ProbabilityEstimates>::const_iterator it = ests[s].find(t);
- assert(it != ests[s].end());
- return it->second.estp;
- }
-
- void GraphSummary() const {
- double to = 0;
- double tn = 0;
- double tt = 0;
- for (int i = 0; i < graphs.size(); ++i) {
- const vector<GraphStructure>& vt = graphs[i];
- for (int j = 0; j < vt.size(); ++j) {
- const GraphStructure& gs = vt[j];
- if (!gs.r) continue;
- tt++;
- for (int k = 0; k < i; ++k) {
- for (int l = 0; l < j; ++l) {
- size_t c = gs.r->valid_deltas[k][l].size();
- if (c) {
- tn += 1;
- to += c;
- }
- }
- }
- }
- }
- cerr << " Average nodes = " << (tn / tt) << endl;
- cerr << "Average out-degree = " << (to / tn) << endl;
- cerr << " Unique structures = " << tt << endl;
- cerr << " Unique pairs = " << tot_pairs << endl;
- cerr << " BEs size = " << (tot_mem / (1024.0*1024.0)) << " MB" << endl;
- }
-
- const int kMAX_SRC_CHUNK;
- const int kMAX_TRG_CHUNK;
- const double kS2T_RATIO;
- unsigned tot_pairs;
- size_t tot_mem;
- vector<vector<GraphStructure> > graphs; // graphs[src_len][trg_len]
- vector<unordered_map<WordID, ProbabilityEstimates> > ests; // ests[src][trg]
-};
-
-Transliterations::Transliterations(int max_src, int max_trg, double sr, const BackwardEstimator& be) :
- pimpl_(new TransliterationsImpl(max_src, max_trg, sr, be)) {}
-Transliterations::~Transliterations() { delete pimpl_; }
-
-void Transliterations::Initialize(WordID src, const vector<WordID>& src_lets, WordID trg, const vector<WordID>& trg_lets) {
- pimpl_->Initialize(src, src_lets, trg, trg_lets);
-}
-
-prob_t Transliterations::EstimateProbability(WordID s, const vector<WordID>& src, WordID t, const vector<WordID>& trg) const {
- return pimpl_->EstimateProbability(s, src,t, trg);
-}
-
-void Transliterations::Forbid(WordID src, const vector<WordID>& src_lets, WordID trg, const vector<WordID>& trg_lets) {
- pimpl_->Forbid(src, src_lets, trg, trg_lets);
-}
-
-void Transliterations::GraphSummary() const {
- pimpl_->GraphSummary();
-}
-