Diffstat (limited to 'gi')
-rw-r--r--  gi/pf/Makefile.am          |   3
-rw-r--r--  gi/pf/align-lexonly-pyp.cc | 207
-rw-r--r--  gi/pf/align-tl.cc          |  18
-rw-r--r--  gi/pf/backward.cc          |  89
-rw-r--r--  gi/pf/backward.h           |  33
-rw-r--r--  gi/pf/base_distributions.h |   8
-rwxr-xr-x  gi/pf/guess-translits.pl   |   2
-rw-r--r--  gi/pf/nuisance_test.cc     |   6
-rw-r--r--  gi/pf/pyp_lm.cc            |   2
-rw-r--r--  gi/pf/pyp_tm.cc            | 113
-rw-r--r--  gi/pf/pyp_tm.h             |  34
-rw-r--r--  gi/pf/pyp_word_model.cc    |  20
-rw-r--r--  gi/pf/pyp_word_model.h     |  58
-rw-r--r--  gi/pf/reachability.cc      |   8
-rw-r--r--  gi/pf/reachability.h       |   8
-rw-r--r--  gi/pf/transliterations.cc  | 223
-rw-r--r--  gi/pf/transliterations.h   |   3
17 files changed, 622 insertions, 213 deletions
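The central refactor in this commit replaces the MFCR-backed HierarchicalWordBase / MConditionalTranslationModel pair with a CCRP-backed PYPWordModel and ConditionalPYPWordModel (see pyp_tm.cc below). The pattern to notice throughout is that increment() and decrement() report whether a table was opened or closed, which tells the caller whether the base distribution must be updated as well. A minimal self-contained sketch of that bookkeeping, using an invented ToyCRP class with plain doubles rather than cdec's CCRP template and prob_t (all names here are illustrative, not part of cdec):

// Illustrative sketch only: a toy CRP with concentration alpha and no
// discount parameter, NOT cdec's CCRP.
#include <cassert>
#include <cstdlib>
#include <iostream>
#include <map>
#include <string>
#include <vector>

struct ToyCRP {
  explicit ToyCRP(double alpha) : alpha_(alpha), num_customers_(0) {}

  // CRP predictive probability of a dish given the current seating;
  // p0 is the base distribution's probability of the dish
  double Prob(const std::string& dish, double p0) const {
    std::map<std::string, std::vector<int> >::const_iterator it = tables_.find(dish);
    const double count = (it == tables_.end()) ? 0.0 : Customers(it->second);
    return (count + alpha_ * p0) / (num_customers_ + alpha_);
  }

  // seats a customer; returns true iff a new table opens, the signal
  // (as in ConditionalPYPWordModel::Increment) that the base
  // distribution must also be incremented
  bool Increment(const std::string& dish, double p0, double u) {  // u ~ Uniform[0,1)
    std::vector<int>& t = tables_[dish];
    ++num_customers_;
    double r = u * (Customers(t) + alpha_ * p0);
    for (size_t i = 0; i < t.size(); ++i) {
      r -= t[i];
      if (r < 0) { ++t[i]; return false; }  // join an existing table
    }
    t.push_back(1);  // open a new table: one more draw from the base
    return true;
  }

  // removes a customer; returns true iff a table closes => decrement base
  bool Decrement(const std::string& dish, double u) {
    std::map<std::string, std::vector<int> >::iterator it = tables_.find(dish);
    assert(it != tables_.end());
    std::vector<int>& t = it->second;
    --num_customers_;
    double r = u * Customers(t);
    for (size_t i = 0; i < t.size(); ++i) {
      r -= t[i];
      if (r < 0) {
        if (--t[i] > 0) return false;
        t.erase(t.begin() + i);            // table is now empty: close it
        if (t.empty()) tables_.erase(it);
        return true;
      }
    }
    assert(!"unreachable");
    return false;
  }

 private:
  static double Customers(const std::vector<int>& t) {
    double n = 0;
    for (size_t i = 0; i < t.size(); ++i) n += t[i];
    return n;
  }
  double alpha_;
  int num_customers_;
  std::map<std::string, std::vector<int> > tables_;  // dish -> per-table counts
};

int main() {
  std::srand(7);
  ToyCRP crp(1.0);
  const double p0 = 1.0 / 26;  // toy uniform base distribution
  for (int i = 0; i < 3; ++i)
    if (crp.Increment("dog", p0, std::rand() / (RAND_MAX + 1.0)))
      std::cout << "new table opened -> base model Increment\n";
  std::cout << "p(dog) = " << crp.Prob("dog", p0) << std::endl;
  return 0;
}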
diff --git a/gi/pf/Makefile.am b/gi/pf/Makefile.am
index 94364c3d..4ce72ba1 100644
--- a/gi/pf/Makefile.am
+++ b/gi/pf/Makefile.am
@@ -2,7 +2,7 @@ bin_PROGRAMS = cbgi brat dpnaive pfbrat pfdist itg pfnaive condnaive align-lexon
 
 noinst_LIBRARIES = libpf.a
-libpf_a_SOURCES = base_distributions.cc reachability.cc cfg_wfst_composer.cc corpus.cc unigrams.cc ngram_base.cc transliterations.cc
+libpf_a_SOURCES = base_distributions.cc reachability.cc cfg_wfst_composer.cc corpus.cc unigrams.cc ngram_base.cc transliterations.cc backward.cc pyp_word_model.cc pyp_tm.cc
 
 nuisance_test_SOURCES = nuisance_test.cc
 nuisance_test_LDADD = libpf.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a -lz
 
@@ -10,6 +10,7 @@ nuisance_test_LDADD = libpf.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mtev
 align_lexonly_SOURCES = align-lexonly.cc
 
 align_lexonly_pyp_SOURCES = align-lexonly-pyp.cc
+align_lexonly_pyp_LDADD = libpf.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a -lz
 
 align_tl_SOURCES = align-tl.cc
 align_tl_LDADD = libpf.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a -lz
diff --git a/gi/pf/align-lexonly-pyp.cc b/gi/pf/align-lexonly-pyp.cc
index 13a3a487..d68a4b8f 100644
--- a/gi/pf/align-lexonly-pyp.cc
+++ b/gi/pf/align-lexonly-pyp.cc
@@ -1,27 +1,18 @@
 #include <iostream>
-#include <tr1/memory>
 #include <queue>
 
-#include <boost/multi_array.hpp>
 #include <boost/program_options.hpp>
 #include <boost/program_options/variables_map.hpp>
 
-#include "array2d.h"
-#include "base_distributions.h"
-#include "monotonic_pseg.h"
-#include "conditional_pseg.h"
-#include "trule.h"
 #include "tdict.h"
 #include "stringlib.h"
 #include "filelib.h"
-#include "dict.h"
+#include "array2d.h"
 #include "sampler.h"
-#include "mfcr.h"
 #include "corpus.h"
-#include "ngram_base.h"
+#include "pyp_tm.h"
 
 using namespace std;
-using namespace tr1;
 namespace po = boost::program_options;
 
 void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
@@ -51,7 +42,7 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
   }
 }
 
-shared_ptr<MT19937> prng;
+MT19937* prng;
 
 struct LexicalAlignment {
   unsigned char src_index;
@@ -66,159 +57,59 @@ struct AlignedSentencePair {
   Array2D<short> posterior;
 };
 
-struct HierarchicalWordBase {
-  explicit HierarchicalWordBase(const unsigned vocab_e_size) :
-      base(prob_t::One()), r(1,1,1,1,0.66,50.0), u0(-log(vocab_e_size)), l(1,prob_t::One()), v(1, prob_t::Zero()) {}
-
-  void ResampleHyperparameters(MT19937* rng) {
-    r.resample_hyperparameters(rng);
-  }
-
-  inline double logp0(const vector<WordID>& s) const {
-    return Md::log_poisson(s.size(), 7.5) + s.size() * u0;
-  }
-
-  // return p0 of rule.e_
-  prob_t operator()(const TRule& rule) const {
-    v[0].logeq(logp0(rule.e_));
-    return r.prob(rule.e_, v.begin(), l.begin());
-  }
-
-  void Increment(const TRule& rule) {
-    v[0].logeq(logp0(rule.e_));
-    if (r.increment(rule.e_, v.begin(), l.begin(), &*prng).count) {
-      base *= v[0] * l[0];
-    }
-  }
-
-  void Decrement(const TRule& rule) {
-    if (r.decrement(rule.e_, &*prng).count) {
-      base /= prob_t(exp(logp0(rule.e_)));
-    }
-  }
-
-  prob_t Likelihood() const {
-    prob_t p; p.logeq(r.log_crp_prob());
-    p *= base;
-    return p;
+struct Aligner {
+  Aligner(const vector<vector<WordID> >& lets, int num_letters, vector<AlignedSentencePair>* c) :
+      corpus(*c),
+      model(lets, num_letters),
+      kNULL(TD::Convert("NULL")) {
+    assert(lets[kNULL].size() == 0);
   }
 
-  void Summary() const {
-    cerr << "NUMBER OF CUSTOMERS: " << r.num_customers() << "  (d=" << r.discount() << ",s=" << r.strength() << ')' << endl;
-    for (MFCR<1,vector<WordID> >::const_iterator it = r.begin(); it != r.end(); ++it)
-      cerr << "   " << it->second.total_dish_count_ << " (on " << it->second.table_counts_.size() << " tables) " << TD::GetString(it->first) << endl;
-  }
-
-  prob_t base;
-  MFCR<1,vector<WordID> > r;
-  const double u0;
-  const vector<prob_t> l;
-  mutable vector<prob_t> v;
-};
-
-struct BasicLexicalAlignment {
-  explicit BasicLexicalAlignment(const vector<vector<WordID> >& lets,
-                                 const unsigned words_e,
-                                 const unsigned letters_e,
-                                 vector<AlignedSentencePair>* corp) :
-      letters(lets),
-      corpus(*corp),
-      //up0(words_e),
-      //up0("en.chars.1gram", letters_e),
-      //up0("en.words.1gram"),
-      up0(letters_e),
-      //up0("en.chars.2gram"),
-      tmodel(up0) {
-  }
+  vector<AlignedSentencePair>& corpus;
+  PYPLexicalTranslation model;
+  const WordID kNULL;
 
-  void InstantiateRule(const WordID src,
-                       const WordID trg,
-                       TRule* rule) const {
-    static const WordID kX = TD::Convert("X") * -1;
-    rule->lhs_ = kX;
-    rule->e_ = letters[trg];
-    rule->f_ = letters[src];
+  void ResampleHyperparameters() {
+    model.ResampleHyperparameters(prng);
   }
 
   void InitializeRandom() {
-    const WordID kNULL = TD::Convert("NULL");
     cerr << "Initializing with random alignments ...\n";
     for (unsigned i = 0; i < corpus.size(); ++i) {
       AlignedSentencePair& asp = corpus[i];
       asp.a.resize(asp.trg.size());
       for (unsigned j = 0; j < asp.trg.size(); ++j) {
-        const unsigned char a_j = prng->next() * (1 + asp.src.size());
+        unsigned char& a_j = asp.a[j].src_index;
+        a_j = prng->next() * (1 + asp.src.size());
         const WordID f_a_j = (a_j ? asp.src[a_j - 1] : kNULL);
-        TRule r;
-        InstantiateRule(f_a_j, asp.trg[j], &r);
-        asp.a[j].is_transliteration = false;
-        asp.a[j].src_index = a_j;
-        if (tmodel.IncrementRule(r, &*prng))
-          up0.Increment(r);
+        model.Increment(f_a_j, asp.trg[j], &*prng);
       }
     }
-    cerr << "  LLH = " << Likelihood() << endl;
-  }
-
-  prob_t Likelihood() const {
-    prob_t p = tmodel.Likelihood();
-    p *= up0.Likelihood();
-    return p;
-  }
-
-  void ResampleHyperparemeters() {
-    tmodel.ResampleHyperparameters(&*prng);
-    up0.ResampleHyperparameters(&*prng);
-    cerr << "  (base d=" << up0.r.discount() << ",s=" << up0.r.strength() << ")\n";
+    cerr << "Corpus intialized randomly. LLH = " << model.Likelihood() << endl;
   }
 
-  void ResampleCorpus();
-
-  const vector<vector<WordID> >& letters; // spelling dictionary
-  vector<AlignedSentencePair>& corpus;
-  //PhraseConditionalUninformativeBase up0;
-  //PhraseConditionalUninformativeUnigramBase up0;
-  //UnigramWordBase up0;
-  //HierarchicalUnigramBase up0;
-  HierarchicalWordBase up0;
-  //CompletelyUniformBase up0;
-  //FixedNgramBase up0;
-  //ConditionalTranslationModel<PhraseConditionalUninformativeBase> tmodel;
-  //ConditionalTranslationModel<PhraseConditionalUninformativeUnigramBase> tmodel;
-  //ConditionalTranslationModel<UnigramWordBase> tmodel;
-  //ConditionalTranslationModel<HierarchicalUnigramBase> tmodel;
-  MConditionalTranslationModel<HierarchicalWordBase> tmodel;
-  //ConditionalTranslationModel<FixedNgramBase> tmodel;
-  //ConditionalTranslationModel<CompletelyUniformBase> tmodel;
-};
-
-void BasicLexicalAlignment::ResampleCorpus() {
-  static const WordID kNULL = TD::Convert("NULL");
-  for (unsigned i = 0; i < corpus.size(); ++i) {
-    AlignedSentencePair& asp = corpus[i];
-    SampleSet<prob_t> ss; ss.resize(asp.src.size() + 1);
-    for (unsigned j = 0; j < asp.trg.size(); ++j) {
-      TRule r;
-      unsigned char& a_j = asp.a[j].src_index;
-      WordID f_a_j = (a_j ? asp.src[a_j - 1] : kNULL);
-      InstantiateRule(f_a_j, asp.trg[j], &r);
-      if (tmodel.DecrementRule(r, &*prng))
-        up0.Decrement(r);
-
-      for (unsigned prop_a_j = 0; prop_a_j <= asp.src.size(); ++prop_a_j) {
-        const WordID prop_f = (prop_a_j ? asp.src[prop_a_j - 1] : kNULL);
-        InstantiateRule(prop_f, asp.trg[j], &r);
-        ss[prop_a_j] = tmodel.RuleProbability(r);
+  void ResampleCorpus() {
+    for (unsigned i = 0; i < corpus.size(); ++i) {
+      AlignedSentencePair& asp = corpus[i];
+      SampleSet<prob_t> ss; ss.resize(asp.src.size() + 1);
+      for (unsigned j = 0; j < asp.trg.size(); ++j) {
+        unsigned char& a_j = asp.a[j].src_index;
+        const WordID e_j = asp.trg[j];
+        WordID f_a_j = (a_j ? asp.src[a_j - 1] : kNULL);
+        model.Decrement(f_a_j, e_j, prng);
+
+        for (unsigned prop_a_j = 0; prop_a_j <= asp.src.size(); ++prop_a_j) {
+          const WordID prop_f = (prop_a_j ? asp.src[prop_a_j - 1] : kNULL);
+          ss[prop_a_j] = model.Prob(prop_f, e_j);
+        }
+        a_j = prng->SelectSample(ss);
+        f_a_j = (a_j ? asp.src[a_j - 1] : kNULL);
+        model.Increment(f_a_j, e_j, prng);
       }
-      a_j = prng->SelectSample(ss);
-      f_a_j = (a_j ? asp.src[a_j - 1] : kNULL);
-      InstantiateRule(f_a_j, asp.trg[j], &r);
-      if (tmodel.IncrementRule(r, &*prng))
-        up0.Increment(r);
     }
+    cerr << "LLH = " << model.Likelihood() << " " << model.UniqueConditioningContexts() << endl;
   }
-  cerr << "  LLH = " << Likelihood() << endl;
-}
+};
 
 void ExtractLetters(const set<WordID>& v, vector<vector<WordID> >* l, set<WordID>* letset = NULL) {
   for (set<WordID>::const_iterator it = v.begin(); it != v.end(); ++it) {
@@ -240,8 +131,10 @@ void ExtractLetters(const set<WordID>& v, vector<vector<WordID> >* l, set<WordID
 void Debug(const AlignedSentencePair& asp) {
   cerr << TD::GetString(asp.src) << endl << TD::GetString(asp.trg) << endl;
   Array2D<bool> a(asp.src.size(), asp.trg.size());
-  for (unsigned j = 0; j < asp.trg.size(); ++j)
+  for (unsigned j = 0; j < asp.trg.size(); ++j) {
+    assert(asp.a[j].src_index <= asp.src.size());
     if (asp.a[j].src_index) a(asp.a[j].src_index - 1, j) = true;
+  }
   cerr << a << endl;
 }
 
@@ -275,10 +168,9 @@ int main(int argc, char** argv) {
   InitCommandLine(argc, argv, &conf);
 
   if (conf.count("random_seed"))
-    prng.reset(new MT19937(conf["random_seed"].as<uint32_t>()));
+    prng = new MT19937(conf["random_seed"].as<uint32_t>());
   else
-    prng.reset(new MT19937);
-//  MT19937& rng = *prng;
+    prng = new MT19937;
 
   vector<vector<int> > corpuse, corpusf;
   set<int> vocabe, vocabf;
@@ -304,23 +196,18 @@
   ExtractLetters(vocabf, &letters, NULL);
   letters[TD::Convert("NULL")].clear();
 
-  BasicLexicalAlignment x(letters, vocabe.size(), letset.size(), &corpus);
-  x.InitializeRandom();
+  Aligner aligner(letters, letset.size(), &corpus);
+  aligner.InitializeRandom();
+
   const unsigned samples = conf["samples"].as<unsigned>();
   for (int i = 0; i < samples; ++i) {
     for (int j = 65; j < 67; ++j) Debug(corpus[j]);
-    cerr << i << "\t" << x.tmodel.r.size() << "\t";
-    if (i % 7 == 6) x.ResampleHyperparemeters();
-    x.ResampleCorpus();
+    if (i % 7 == 6) aligner.ResampleHyperparameters();
+    aligner.ResampleCorpus();
     if (i > (samples / 5) && (i % 10 == 9)) for (int j = 0; j < corpus.size(); ++j) AddSample(&corpus[j]);
   }
   for (unsigned i = 0; i < corpus.size(); ++i)
     WriteAlignments(corpus[i]);
-  //ModelAndData posterior(x, &corpus, vocabe, vocabf);
-  x.tmodel.Summary();
-  x.up0.Summary();
-
-  //posterior.Sample();
 
   return 0;
 }
diff --git a/gi/pf/align-tl.cc b/gi/pf/align-tl.cc
index fc9b7ca5..cbe8c6c8 100644
--- a/gi/pf/align-tl.cc
+++ b/gi/pf/align-tl.cc
@@ -6,6 +6,7 @@
 #include <boost/program_options.hpp>
 #include <boost/program_options/variables_map.hpp>
 
+#include "backward.h"
 #include "array2d.h"
 #include "base_distributions.h"
 #include "monotonic_pseg.h"
@@ -30,10 +31,11 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
   opts.add_options()
         ("samples,s",po::value<unsigned>()->default_value(1000),"Number of samples")
         ("input,i",po::value<string>(),"Read parallel data from")
+        ("s2t", po::value<string>(), "character level source-to-target prior transliteration probabilities")
+        ("t2s", po::value<string>(), "character level target-to-source prior transliteration probabilities")
         ("max_src_chunk", po::value<unsigned>()->default_value(4), "Maximum size of translitered chunk in source")
         ("max_trg_chunk", po::value<unsigned>()->default_value(4), "Maximum size of translitered chunk in target")
-        ("min_transliterated_src_length", po::value<unsigned>()->default_value(3), "Minimum length of source words considered for transliteration")
-        ("filter_ratio", po::value<double>()->default_value(0.66), "Filter ratio: basically, if the lengths differ by less than this ratio, mark the pair as non-transliteratable")
+        ("expected_src_to_trg_ratio", po::value<double>()->default_value(1.0), "If a word is transliterated, what is the expected length ratio from source to target?")
         ("random_seed,S",po::value<uint32_t>(), "Random seed");
   po::options_description clo("Command line options");
   clo.add_options()
@@ -303,7 +305,7 @@ int main(int argc, char** argv) {
   corpusf.clear(); corpuse.clear();
 
   vocabf.insert(TD::Convert("NULL"));
-  vector<vector<WordID> > letters(TD::NumWords());
+  vector<vector<WordID> > letters(TD::NumWords() + 1);
   set<WordID> letset;
   ExtractLetters(vocabe, &letters, &letset);
   ExtractLetters(vocabf, &letters, NULL);
@@ -312,9 +314,9 @@
   // TODO configure this
   const int max_src_chunk = conf["max_src_chunk"].as<unsigned>();
   const int max_trg_chunk = conf["max_trg_chunk"].as<unsigned>();
-  const double filter_rat = conf["filter_ratio"].as<double>();
-  const int min_trans_src = conf["min_transliterated_src_length"].as<unsigned>();
-  Transliterations tl(max_src_chunk, max_trg_chunk, filter_rat);
+  const double s2t_rat = conf["expected_src_to_trg_ratio"].as<double>();
+  const BackwardEstimator be(conf["s2t"].as<string>(), conf["t2s"].as<string>());
+  Transliterations tl(max_src_chunk, max_trg_chunk, s2t_rat, be);
 
   cerr << "Initializing transliteration graph structures ...\n";
   for (int i = 0; i < corpus.size(); ++i) {
@@ -325,8 +327,8 @@
       for (int k = 0; k < trg.size(); ++k) {
         const vector<int>& trg_let = letters[trg[k]];
         tl.Initialize(src[j], src_let, trg[k], trg_let);
-        if (src_let.size() < min_trans_src)
-          tl.Forbid(src[j], src_let, trg[k], trg_let);
+        //if (src_let.size() < min_trans_src)
+        //  tl.Forbid(src[j], src_let, trg[k], trg_let);
       }
     }
   }
diff --git a/gi/pf/backward.cc b/gi/pf/backward.cc
new file mode 100644
index 00000000..b92629fd
--- /dev/null
+++ b/gi/pf/backward.cc
@@ -0,0 +1,89 @@
+#include "backward.h"
+
+#include <queue>
+#include <utility>
+
+#include "array2d.h"
+#include "reachability.h"
+#include "base_distributions.h"
+
+using namespace std;
+
+BackwardEstimator::BackwardEstimator(const string& s2t,
+                    const string& t2s) : m1(new Model1(s2t)), m1inv(new Model1(t2s)) {}
+
+BackwardEstimator::~BackwardEstimator() {
+  delete m1; m1 = NULL;
+  delete m1inv; m1inv = NULL;
+}
+
+float BackwardEstimator::ComputeBackwardProb(const std::vector<WordID>& src,
+                                             const std::vector<WordID>& trg,
+                                             unsigned src_covered,
+                                             unsigned trg_covered,
+                                             double s2t_ratio) const {
+  if (src_covered == src.size() || trg_covered == trg.size()) {
+    assert(src_covered == src.size());
+    assert(trg_covered == trg.size());
+    return 0;
+  }
+  static const WordID kNULL = TD::Convert("<eps>");
+  const prob_t uniform_alignment(1.0 / (src.size() - src_covered + 1));
+  // TODO factor in expected length ratio
+  prob_t e; e.logeq(Md::log_poisson(trg.size() - trg_covered, (src.size() - src_covered) * s2t_ratio)); // p(trg len remaining | src len remaining)
+  for (unsigned j = trg_covered; j < trg.size(); ++j) {
+    prob_t p = (*m1)(kNULL, trg[j]) + prob_t(1e-12);
+    for (unsigned i = src_covered; i < src.size(); ++i)
+      p += (*m1)(src[i], trg[j]);
+    if (p.is_0()) {
+      cerr << "ERROR: p(" << TD::Convert(trg[j]) << " | " << TD::GetString(src) << ") = 0!\n";
+      assert(!"failed");
+    }
+    p *= uniform_alignment;
+    e *= p;
+  }
+  // TODO factor in expected length ratio
+  const prob_t inv_uniform(1.0 / (trg.size() - trg_covered + 1.0));
+  prob_t inv;
+  inv.logeq(Md::log_poisson(src.size() - src_covered, (trg.size() - trg_covered) / s2t_ratio));
+  for (unsigned i = src_covered; i < src.size(); ++i) {
+    prob_t p = (*m1inv)(kNULL, src[i]) + prob_t(1e-12);
+    for (unsigned j = trg_covered; j < trg.size(); ++j)
+      p += (*m1inv)(trg[j], src[i]);
+    if (p.is_0()) {
+      cerr << "ERROR: p_inv(" << TD::Convert(src[i]) << " | " << TD::GetString(trg) << ") = 0!\n";
+      assert(!"failed");
+    }
+    p *= inv_uniform;
+    inv *= p;
+  }
+  return (log(e) + log(inv)) / 2;
+}
+
+void BackwardEstimator::InitializeGrid(const vector<WordID>& src,
+                      const vector<WordID>& trg,
+                      const Reachability& r,
+                      double s2t_ratio,
+                      float* grid) const {
+  queue<pair<int,int> > q;
+  q.push(make_pair(0,0));
+  Array2D<bool> done(src.size()+1, trg.size()+1, false);
+  //cerr << TD::GetString(src) << " ||| " << TD::GetString(trg) << endl;
+  while(!q.empty()) {
+    const pair<int,int> n = q.front();
+    q.pop();
+    if (done(n.first,n.second)) continue;
+    done(n.first,n.second) = true;
+
+    float lp = ComputeBackwardProb(src, trg, n.first, n.second, s2t_ratio);
+    if (n.first == 0 && n.second == 0) grid[0] = lp;
+    //cerr << "  " << n.first << "," << n.second << "\t" << lp << endl;
+
+    if (n.first == src.size() || n.second == trg.size()) continue;
+    const vector<pair<short,short> >& edges = r.valid_deltas[n.first][n.second];
+    for (int i = 0; i < edges.size(); ++i)
+      q.push(make_pair(n.first + edges[i].first, n.second + edges[i].second));
+  }
+  //static int cc = 0; ++cc; if (cc == 80) exit(1);
+}
+
diff --git a/gi/pf/backward.h b/gi/pf/backward.h
new file mode 100644
index 00000000..e67eff0c
--- /dev/null
+++ b/gi/pf/backward.h
@@ -0,0 +1,33 @@
+#ifndef _BACKWARD_H_
+#define _BACKWARD_H_
+
+#include <vector>
+#include <string>
+#include "wordid.h"
+
+struct Reachability;
+struct Model1;
+
+struct BackwardEstimator {
+  BackwardEstimator(const std::string& s2t,
+                    const std::string& t2s);
+  ~BackwardEstimator();
+
+  void InitializeGrid(const std::vector<WordID>& src,
+                      const std::vector<WordID>& trg,
+                      const Reachability& r,
+                      double src2trg_ratio,
+                      float* grid) const;
+
+ private:
+  float ComputeBackwardProb(const std::vector<WordID>& src,
+                            const std::vector<WordID>& trg,
+                            unsigned src_covered,
+                            unsigned trg_covered,
+                            double src2trg_ratio) const;
+
+  Model1* m1;
+  Model1* m1inv;
+};
+
+#endif
diff --git a/gi/pf/base_distributions.h b/gi/pf/base_distributions.h
index 0d597c5c..84dacdf2 100644
--- a/gi/pf/base_distributions.h
+++ b/gi/pf/base_distributions.h
@@ -14,13 +14,7 @@
 #include "tdict.h"
 #include "sampler.h"
 #include "m.h"
-
-inline std::ostream& operator<<(std::ostream& os, const std::vector<WordID>& p) {
-  os << '[';
-  for (int i = 0; i < p.size(); ++i)
-    os << (i==0 ? "" : " ") << TD::Convert(p[i]);
-  return os << ']';
-}
+#include "os_phrase.h"
 
 struct Model1 {
   explicit Model1(const std::string& fname) :
diff --git a/gi/pf/guess-translits.pl b/gi/pf/guess-translits.pl
index aafec13a..d00c2168 100755
--- a/gi/pf/guess-translits.pl
+++ b/gi/pf/guess-translits.pl
@@ -69,4 +69,4 @@ for my $f (keys %fs) {
   }
 }
 print STDERR "Extracted $num pairs.\n";
-print STDERR "Recommend running:\n   ../../training/model1 -t -99999 output.txt\n";
+print STDERR "Recommend running:\n   ../../training/model1 -v -d -t -99999 output.txt\n";
diff --git a/gi/pf/nuisance_test.cc b/gi/pf/nuisance_test.cc
index 0f44fe95..fc0af9cb 100644
--- a/gi/pf/nuisance_test.cc
+++ b/gi/pf/nuisance_test.cc
@@ -124,9 +124,9 @@ int main(int argc, char** argv) {
   WordID y = TD::Convert("remember");
   vector<WordID> src; TD::ConvertSentence("s o u v e n o n s", &src);
   vector<WordID> trg; TD::ConvertSentence("r e m e m b e r", &trg);
-  Transliterations xx;
-  xx.Initialize(x, src, y, trg);
-  return 1;
+//  Transliterations xx;
+//  xx.Initialize(x, src, y, trg);
+//  return 1;
 
  for (int j = 0; j < ITERS; ++j) {
   Base b;
diff --git a/gi/pf/pyp_lm.cc b/gi/pf/pyp_lm.cc
index 104f356b..52e6be2c 100644
--- a/gi/pf/pyp_lm.cc
+++ b/gi/pf/pyp_lm.cc
@@ -18,7 +18,7 @@
 
 // I use templates to handle the recursive formalation of the prior, so
 // the order of the model has to be specified here, at compile time:
-#define kORDER 4
+#define kORDER 3
 
 using namespace std;
 using namespace tr1;
diff --git a/gi/pf/pyp_tm.cc b/gi/pf/pyp_tm.cc
new file mode 100644
index 00000000..94cbe7c3
--- /dev/null
+++ b/gi/pf/pyp_tm.cc
@@ -0,0 +1,113 @@
+#include "pyp_tm.h"
+
+#include <tr1/unordered_map>
+#include <iostream>
+#include <queue>
+
+#include "base_distributions.h"
+#include "monotonic_pseg.h"
+#include "conditional_pseg.h"
+#include "tdict.h"
+#include "ccrp.h"
+#include "pyp_word_model.h"
+
+using namespace std;
+using namespace std::tr1;
+
+template <typename Base>
+struct ConditionalPYPWordModel {
+  ConditionalPYPWordModel(Base* b) : base(*b) {}
+
+  void Summary() const {
+    cerr << "Number of conditioning contexts: " << r.size() << endl;
+    for (RuleModelHash::const_iterator it = r.begin(); it != r.end(); ++it) {
+      cerr << TD::Convert(it->first) << "   \tPYP(d=" << it->second.discount() << ",s=" << it->second.strength() << ") --------------------------" << endl;
+      for (CCRP<vector<WordID> >::const_iterator i2 = it->second.begin(); i2 != it->second.end(); ++i2)
+        cerr << "   " << i2->second.total_dish_count_ << '\t' << TD::GetString(i2->first) << endl;
+    }
+  }
+
+  void ResampleHyperparameters(MT19937* rng) {
+    for (RuleModelHash::iterator it = r.begin(); it != r.end(); ++it)
+      it->second.resample_hyperparameters(rng);
+  }
+
+  prob_t Prob(const WordID src, const vector<WordID>& trglets) const {
+    RuleModelHash::const_iterator it = r.find(src);
+    if (it == r.end()) {
+      return base(trglets);
+    } else {
+      return it->second.prob(trglets, base(trglets));
+    }
+  }
+
+  void Increment(const WordID src, const vector<WordID>& trglets, MT19937* rng) {
+    RuleModelHash::iterator it = r.find(src);
+    if (it == r.end())
+      it = r.insert(make_pair(src, CCRP<vector<WordID> >(1,1,1,1,0.5,1.0))).first;
+    if (it->second.increment(trglets, base(trglets), rng))
+      base.Increment(trglets, rng);
+  }
+
+  void Decrement(const WordID src, const vector<WordID>& trglets, MT19937* rng) {
+    RuleModelHash::iterator it = r.find(src);
+    assert(it != r.end());
+    if (it->second.decrement(trglets, rng)) {
+      base.Decrement(trglets, rng);
+      if (it->second.num_customers() == 0)
+        r.erase(it);
+    }
+  }
+
+  prob_t Likelihood() const {
+    prob_t p = prob_t::One();
+    for (RuleModelHash::const_iterator it = r.begin(); it != r.end(); ++it) {
+      prob_t q; q.logeq(it->second.log_crp_prob());
+      p *= q;
+    }
+    return p;
+  }
+
+  unsigned UniqueConditioningContexts() const {
+    return r.size();
+  }
+
+  Base& base;
+  typedef unordered_map<WordID, CCRP<vector<WordID> > > RuleModelHash;
+  RuleModelHash r;
+};
+
+PYPLexicalTranslation::PYPLexicalTranslation(const vector<vector<WordID> >& lets,
+                                             const unsigned num_letters) :
+    letters(lets),
+    up0(new PYPWordModel(num_letters)),
+    tmodel(new ConditionalPYPWordModel<PYPWordModel>(up0)),
+    kX(-TD::Convert("X")) {}
+
+prob_t PYPLexicalTranslation::Likelihood() const {
+  prob_t p = up0->Likelihood();
+  p *= tmodel->Likelihood();
+  return p;
+}
+
+void PYPLexicalTranslation::ResampleHyperparameters(MT19937* rng) {
+  tmodel->ResampleHyperparameters(rng);
+  up0->ResampleHyperparameters(rng);
+}
+
+unsigned PYPLexicalTranslation::UniqueConditioningContexts() const {
+  return tmodel->UniqueConditioningContexts();
+}
+
+prob_t PYPLexicalTranslation::Prob(WordID src, WordID trg) const {
+  return tmodel->Prob(src, letters[trg]);
+}
+
+void PYPLexicalTranslation::Increment(WordID src, WordID trg, MT19937* rng) {
+  tmodel->Increment(src, letters[trg], rng);
+}
+
+void PYPLexicalTranslation::Decrement(WordID src, WordID trg, MT19937* rng) {
+  tmodel->Decrement(src, letters[trg], rng);
+}
+
diff --git a/gi/pf/pyp_tm.h b/gi/pf/pyp_tm.h
new file mode 100644
index 00000000..fa0fb28f
--- /dev/null
+++ b/gi/pf/pyp_tm.h
@@ -0,0 +1,34 @@
+#ifndef PYP_LEX_TRANS
+#define PYP_LEX_TRANS
+
+#include <vector>
+#include "wordid.h"
+#include "prob.h"
+#include "sampler.h"
+
+struct TRule;
+struct PYPWordModel;
+template <typename T> struct ConditionalPYPWordModel;
+
+struct PYPLexicalTranslation {
+  explicit PYPLexicalTranslation(const std::vector<std::vector<WordID> >& lets,
+                                 const unsigned num_letters);
+
+  prob_t Likelihood() const;
+
+  void ResampleHyperparameters(MT19937* rng);
+  prob_t Prob(WordID src, WordID trg) const;  // return p(trg | src)
+  void Summary() const;
+  void Increment(WordID src, WordID trg, MT19937* rng);
+  void Decrement(WordID src, WordID trg, MT19937* rng);
+  unsigned UniqueConditioningContexts() const;
+
+ private:
+  const std::vector<std::vector<WordID> >& letters;   // spelling dictionary
+  PYPWordModel* up0;  // base distribuction (model English word)
+  ConditionalPYPWordModel<PYPWordModel>* tmodel;  // translation distributions
+                      // (model English word | French word)
+  const WordID kX;
+};
+
+#endif
diff --git a/gi/pf/pyp_word_model.cc b/gi/pf/pyp_word_model.cc
new file mode 100644
index 00000000..12df4abf
--- /dev/null
+++ b/gi/pf/pyp_word_model.cc
@@ -0,0 +1,20 @@
+#include "pyp_word_model.h"
+
+#include <iostream>
+
+using namespace std;
+
+void PYPWordModel::ResampleHyperparameters(MT19937* rng) {
+  r.resample_hyperparameters(rng);
+  cerr << " PYPWordModel(d=" << r.discount() << ",s=" << r.strength() << ")\n";
+}
+
+void PYPWordModel::Summary() const {
+  cerr << "PYPWordModel: generations=" << r.num_customers()
+       << " PYP(d=" << r.discount() << ",s=" << r.strength() << ')' << endl;
+  for (CCRP<vector<WordID> >::const_iterator it = r.begin(); it != r.end(); ++it)
+    cerr << "   " << it->second.total_dish_count_
+              << " (on " << it->second.table_counts_.size() << " tables) "
+              << TD::GetString(it->first) << endl;
+}
+
diff --git a/gi/pf/pyp_word_model.h b/gi/pf/pyp_word_model.h
new file mode 100644
index 00000000..800a4fd7
--- /dev/null
+++ b/gi/pf/pyp_word_model.h
@@ -0,0 +1,58 @@
+#ifndef _PYP_WORD_MODEL_H_
+#define _PYP_WORD_MODEL_H_
+
+#include <iostream>
+#include <cmath>
+#include <vector>
+#include "prob.h"
+#include "ccrp.h"
+#include "m.h"
+#include "tdict.h"
+#include "os_phrase.h"
+
+// PYP(d,s,poisson-uniform) represented as a CRP
+struct PYPWordModel {
+  explicit PYPWordModel(const unsigned vocab_e_size, const double mean_len = 7.5) :
+      base(prob_t::One()), r(1,1,1,1,0.66,50.0), u0(-std::log(vocab_e_size)), mean_length(mean_len) {}
+
+  void ResampleHyperparameters(MT19937* rng);
+
+  inline prob_t operator()(const std::vector<WordID>& s) const {
+    return r.prob(s, p0(s));
+  }
+
+  inline void Increment(const std::vector<WordID>& s, MT19937* rng) {
+    if (r.increment(s, p0(s), rng))
+      base *= p0(s);
+  }
+
+  inline void Decrement(const std::vector<WordID>& s, MT19937 *rng) {
+    if (r.decrement(s, rng))
+      base /= p0(s);
+  }
+
+  inline prob_t Likelihood() const {
+    prob_t p; p.logeq(r.log_crp_prob());
+    p *= base;
+    return p;
+  }
+
+  void Summary() const;
+
+ private:
+  inline double logp0(const std::vector<WordID>& s) const {
+    return Md::log_poisson(s.size(), mean_length) + s.size() * u0;
+  }
+
+  inline prob_t p0(const std::vector<WordID>& s) const {
+    prob_t p; p.logeq(logp0(s));
+    return p;
+  }
+
+  prob_t base;  // keeps track of the draws from the base distribution
+  CCRP<std::vector<WordID> > r;
+  const double u0;  // uniform log prob of generating a letter
+  const double mean_length;  // mean length of a word in the base distribution
+};
+
+#endif
diff --git a/gi/pf/reachability.cc b/gi/pf/reachability.cc
index c10000f2..7d0d04ac 100644
--- a/gi/pf/reachability.cc
+++ b/gi/pf/reachability.cc
@@ -12,7 +12,7 @@ struct SState {
   int prev_trg_covered;
 };
 
-void Reachability::ComputeReachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len, double filter_ratio) {
+void Reachability::ComputeReachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len) {
    typedef boost::multi_array<vector<SState>, 2> array_type;
    array_type a(boost::extents[srclen + 1][trglen + 1]);
    a[0][0].push_back(SState());
@@ -31,9 +31,9 @@ void Reachability::ComputeReachability(int srclen, int trglen, int src_max_phras
    }
    a[0][0].clear();
    //cerr << srclen << "," << trglen << ": Final cell contains " << a[srclen][trglen].size() << " back pointers\n";
-    size_t min_allowed = (src_max_phrase_len + 1) * (trg_max_phrase_len + 1) * (filter_ratio * filter_ratio);
-    if (a[srclen][trglen].size() < min_allowed) {
-      cerr << "Sequence pair with lengths (" << srclen << ',' << trglen << ") violates reachability constraint of min indegree " << min_allowed << " with " << a[srclen][trglen].size() << " in edges\n";
+    if (a[srclen][trglen].empty()) {
+      cerr << "Sequence pair with lengths (" << srclen << ',' << trglen << ") violates reachability constraints\n";
+      nodes = 0;
      return;
    }
diff --git a/gi/pf/reachability.h b/gi/pf/reachability.h
index 03967d44..1e22c76a 100644
--- a/gi/pf/reachability.h
+++ b/gi/pf/reachability.h
@@ -18,19 +18,17 @@ struct Reachability {
   boost::multi_array<short, 2> node_addresses; // na[src_covered][trg_covered] -- the index of the node in a one-dimensional array (of size "nodes")
   boost::multi_array<std::vector<std::pair<short,short> >, 2> valid_deltas; // valid_deltas[src_covered][trg_covered] list of valid transitions leaving a particular node
 
-  // filter_ratio says if the number of outgoing edges from the first cell is less than
-  //    src_max * trg_max * filter_rat^2 then mark as non reachable
-  Reachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len, double filter_ratio = 0.0) :
+  Reachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len) :
       nodes(),
       edges(boost::extents[srclen][trglen][src_max_phrase_len+1][trg_max_phrase_len+1]),
       max_src_delta(boost::extents[srclen][trglen]),
       node_addresses(boost::extents[srclen][trglen]),
       valid_deltas(boost::extents[srclen][trglen]) {
-    ComputeReachability(srclen, trglen, src_max_phrase_len, trg_max_phrase_len, filter_ratio);
+    ComputeReachability(srclen, trglen, src_max_phrase_len, trg_max_phrase_len);
   }
 
  private:
-  void ComputeReachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len, double filter_ratio);
+  void ComputeReachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len);
 };
 
 #endif
diff --git a/gi/pf/transliterations.cc b/gi/pf/transliterations.cc
index 8ea4ebd2..2200715e 100644
--- a/gi/pf/transliterations.cc
+++ b/gi/pf/transliterations.cc
@@ -5,14 +5,173 @@
 
 #include "boost/shared_ptr.hpp"
 
+#include "backward.h"
 #include "filelib.h"
-#include "ccrp.h"
+#include "tdict.h"
+#include "trule.h"
+#include "filelib.h"
+#include "ccrp_nt.h"
 #include "m.h"
 #include "reachability.h"
 
 using namespace std;
 using namespace std::tr1;
 
+struct TruncatedConditionalLengthModel {
+  TruncatedConditionalLengthModel(unsigned max_src_size, unsigned max_trg_size, double expected_src_to_trg_ratio) :
+      plens(max_src_size+1, vector<prob_t>(max_trg_size+1, 0.0)) {
+    for (unsigned i = 1; i <= max_src_size; ++i) {
+      prob_t z = prob_t::Zero();
+      for (unsigned j = 1; j <= max_trg_size; ++j)
+        z += (plens[i][j] = prob_t(0.01 + exp(Md::log_poisson(j, i * expected_src_to_trg_ratio))));
+      for (unsigned j = 1; j <= max_trg_size; ++j)
+        plens[i][j] /= z;
+      //for (unsigned j = 1; j <= max_trg_size; ++j)
+      //  cerr << "P(trg_len=" << j << " | src_len=" << i << ") = " << plens[i][j] << endl;
+    }
+  }
+
+  // return p(tlen | slen) for *chunks* not full words
+  inline const prob_t& operator()(int slen, int tlen) const {
+    return plens[slen][tlen];
+  }
+
+  vector<vector<prob_t> > plens;
+};
+
+struct CondBaseDist {
+  CondBaseDist(unsigned max_src_size, unsigned max_trg_size, double expected_src_to_trg_ratio) :
+    tclm(max_src_size, max_trg_size, expected_src_to_trg_ratio) {}
+
+  prob_t operator()(const vector<WordID>& src, unsigned sf, unsigned st,
+                    const vector<WordID>& trg, unsigned tf, unsigned tt) const {
+    prob_t p = tclm(st - sf, tt - tf);  // target len | source length ~ TCLM(source len)
+    assert(!"not impl");
+    return p;
+  }
+  inline prob_t operator()(const vector<WordID>& src, const vector<WordID>& trg) const {
+    return (*this)(src, 0, src.size(), trg, 0, trg.size());
+  }
+  TruncatedConditionalLengthModel tclm;
+};
+
+// represents transliteration phrase probabilities, e.g.
+//   p( a l - | A l ) , p( o | A w ) , ...
+struct TransliterationChunkConditionalModel {
+  explicit TransliterationChunkConditionalModel(const CondBaseDist& pp0) :
+      d(0.0),
+      strength(1.0),
+      rp0(pp0) {
+  }
+
+  void Summary() const {
+    std::cerr << "Number of conditioning contexts: " << r.size() << std::endl;
+    for (RuleModelHash::const_iterator it = r.begin(); it != r.end(); ++it) {
+      std::cerr << TD::GetString(it->first) << "   \t(\\alpha = " << it->second.alpha() << ") --------------------------" << std::endl;
+      for (CCRP_NoTable<TRule>::const_iterator i2 = it->second.begin(); i2 != it->second.end(); ++i2)
+        std::cerr << "   " << i2->second << '\t' << i2->first << std::endl;
+    }
+  }
+
+  int DecrementRule(const TRule& rule) {
+    RuleModelHash::iterator it = r.find(rule.f_);
+    assert(it != r.end());
+    int count = it->second.decrement(rule);
+    if (count) {
+      if (it->second.num_customers() == 0) r.erase(it);
+    }
+    return count;
+  }
+
+  int IncrementRule(const TRule& rule) {
+    RuleModelHash::iterator it = r.find(rule.f_);
+    if (it == r.end()) {
+      it = r.insert(make_pair(rule.f_, CCRP_NoTable<TRule>(strength))).first;
+    }
+    int count = it->second.increment(rule);
+    return count;
+  }
+
+  void IncrementRules(const std::vector<TRulePtr>& rules) {
+    for (int i = 0; i < rules.size(); ++i)
+      IncrementRule(*rules[i]);
+  }
+
+  void DecrementRules(const std::vector<TRulePtr>& rules) {
+    for (int i = 0; i < rules.size(); ++i)
+      DecrementRule(*rules[i]);
+  }
+
+  prob_t RuleProbability(const TRule& rule) const {
+    prob_t p;
+    RuleModelHash::const_iterator it = r.find(rule.f_);
+    if (it == r.end()) {
+      p = rp0(rule.f_, rule.e_);
+    } else {
+      p = it->second.prob(rule, rp0(rule.f_, rule.e_));
+    }
+    return p;
+  }
+
+  double LogLikelihood(const double& dd, const double& aa) const {
+    if (aa <= -dd) return -std::numeric_limits<double>::infinity();
+    //double llh = Md::log_beta_density(dd, 10, 3) + Md::log_gamma_density(aa, 1, 1);
+    double llh = //Md::log_beta_density(dd, 1, 1) +
+                 Md::log_gamma_density(dd + aa, 1, 1);
+    typename std::tr1::unordered_map<std::vector<WordID>, CCRP_NoTable<TRule>, boost::hash<std::vector<WordID> > >::const_iterator it;
+    for (it = r.begin(); it != r.end(); ++it)
+      llh += it->second.log_crp_prob(aa);
+    return llh;
+  }
+
+  struct AlphaResampler {
+    AlphaResampler(const TransliterationChunkConditionalModel& m) : m_(m) {}
+    const TransliterationChunkConditionalModel& m_;
+    double operator()(const double& proposed_strength) const {
+      return m_.LogLikelihood(m_.d, proposed_strength);
+    }
+  };
+
+  void ResampleHyperparameters(MT19937* rng) {
+    typename std::tr1::unordered_map<std::vector<WordID>, CCRP_NoTable<TRule>, boost::hash<std::vector<WordID> > >::iterator it;
+    //const unsigned nloop = 5;
+    const unsigned niterations = 10;
+    //DiscountResampler dr(*this);
+    AlphaResampler ar(*this);
+#if 0
+    for (int iter = 0; iter < nloop; ++iter) {
+      strength = slice_sampler1d(ar, strength, *rng, -d + std::numeric_limits<double>::min(),
+                              std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations);
+      double min_discount = std::numeric_limits<double>::min();
+      if (strength < 0.0) min_discount -= strength;
+      d = slice_sampler1d(dr, d, *rng, min_discount,
+                          1.0, 0.0, niterations, 100*niterations);
+    }
+#endif
+    strength = slice_sampler1d(ar, strength, *rng, -d,
+                            std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations);
+    std::cerr << "CTMModel(alpha=" << strength << ") = " << LogLikelihood(d, strength) << std::endl;
+    for (it = r.begin(); it != r.end(); ++it) {
+#if 0
+      it->second.set_discount(d);
+#endif
+      it->second.set_alpha(strength);
+    }
+  }
+
+  prob_t Likelihood() const {
+    prob_t p; p.logeq(LogLikelihood(d, strength));
+    return p;
+  }
+
+  const CondBaseDist& rp0;
+  typedef std::tr1::unordered_map<std::vector<WordID>,
+                                  CCRP_NoTable<TRule>,
+                                  boost::hash<std::vector<WordID> > > RuleModelHash;
+  RuleModelHash r;
+  double d, strength;
+};
+
 struct GraphStructure {
   GraphStructure() : r() {}
   // leak memory - these are basically static
@@ -20,9 +179,9 @@ struct GraphStructure {
   bool IsReachable() const { return r->nodes > 0; }
 };
 
-struct BackwardEstimates {
-  BackwardEstimates() : gs(), backward() {}
-  explicit BackwardEstimates(const GraphStructure& g) :
+struct ProbabilityEstimates {
+  ProbabilityEstimates() : gs(), backward() {}
+  explicit ProbabilityEstimates(const GraphStructure& g) :
       gs(&g), backward() {
     if (g.r->nodes > 0)
       backward = new float[g.r->nodes];
@@ -36,24 +195,32 @@
   }
 
   // returns an backward estimate
-  double operator()(int src_covered, int trg_covered) const {
+  double Backward(int src_covered, int trg_covered) const {
    if (!backward) return 0;
    int ind = gs->r->node_addresses[src_covered][trg_covered];
    if (ind < 0) return 0;
    return backward[ind];
  }
+
+  prob_t estp;
+  float* backward;
 private:
  const GraphStructure* gs;
-  float* backward;
};
 
struct TransliterationsImpl {
-  TransliterationsImpl(int max_src, int max_trg, double fr) :
+  TransliterationsImpl(int max_src, int max_trg, double sr, const BackwardEstimator& b) :
+      cp0(max_src, max_trg, sr),
+      tccm(cp0),
+      be(b),
      kMAX_SRC_CHUNK(max_src),
      kMAX_TRG_CHUNK(max_trg),
-      kFILTER_RATIO(fr),
+      kS2T_RATIO(sr),
      tot_pairs(), tot_mem() {
  }
+  const CondBaseDist cp0;
+  TransliterationChunkConditionalModel tccm;
+  const BackwardEstimator& be;
 
  void Initialize(WordID src, const vector<WordID>& src_lets, WordID trg, const vector<WordID>& trg_lets) {
    const size_t src_len = src_lets.size();
@@ -63,20 +230,29 @@ struct TransliterationsImpl {
    if (src_len >= graphs.size()) graphs.resize(src_len + 1);
    if (trg_len >= graphs[src_len].size()) graphs[src_len].resize(trg_len + 1);
    GraphStructure& gs = graphs[src_len][trg_len];
-    if (!gs.r)
-      gs.r = new Reachability(src_len, trg_len, kMAX_SRC_CHUNK, kMAX_TRG_CHUNK, kFILTER_RATIO);
+    if (!gs.r) {
+      double rat = exp(fabs(log(trg_len / (src_len * kS2T_RATIO))));
+      if (rat > 1.5 || (rat > 2.4 && src_len < 6)) {
+        cerr << " ** Forbidding transliterations of size " << src_len << "," << trg_len << ": " << rat << endl;
+        gs.r = new Reachability(src_len, trg_len, 0, 0);
+      } else {
+        gs.r = new Reachability(src_len, trg_len, kMAX_SRC_CHUNK, kMAX_TRG_CHUNK);
+      }
+    }
+
    const Reachability& r = *gs.r;
 
    // init backward estimates
-    if (src >= bes.size()) bes.resize(src + 1);
-    unordered_map<WordID, BackwardEstimates>::iterator it = bes[src].find(trg);
-    if (it != bes[src].end()) return; // already initialized
+    if (src >= ests.size()) ests.resize(src + 1);
+    unordered_map<WordID, ProbabilityEstimates>::iterator it = ests[src].find(trg);
+    if (it != ests[src].end()) return; // already initialized
 
-    it = bes[src].insert(make_pair(trg, BackwardEstimates(gs))).first;
-    BackwardEstimates& b = it->second;
+    it = ests[src].insert(make_pair(trg, ProbabilityEstimates(gs))).first;
+    ProbabilityEstimates& est = it->second;
    if (!gs.r->nodes) return;  // not derivable subject to length constraints
 
-    // TODO
+    be.InitializeGrid(src_lets, trg_lets, r, kS2T_RATIO, est.backward);
+    cerr << TD::GetString(src_lets) << " ||| " << TD::GetString(trg_lets) << " ||| " << (est.backward[0] / trg_lets.size()) << endl;
    tot_pairs++;
    tot_mem += sizeof(float) * gs.r->nodes;
  }
@@ -92,8 +268,11 @@
    const vector<GraphStructure>& tv = graphs[src.size()];
    assert(trg.size() < tv.size());
    const GraphStructure& gs = tv[trg.size()];
-    // TODO: do prob
-    return prob_t::Zero();
+    if (gs.r->nodes == 0)
+      return prob_t::Zero();
+    const unordered_map<WordID, ProbabilityEstimates>::const_iterator it = ests[s].find(t);
+    assert(it != ests[s].end());
+    return it->second.estp;
  }
 
  void GraphSummary() const {
@@ -126,15 +305,15 @@ struct TransliterationsImpl {
  const int kMAX_SRC_CHUNK;
  const int kMAX_TRG_CHUNK;
-  const double kFILTER_RATIO;
+  const double kS2T_RATIO;
  unsigned tot_pairs;
  size_t tot_mem;
  vector<vector<GraphStructure> > graphs; // graphs[src_len][trg_len]
-  vector<unordered_map<WordID, BackwardEstimates> > bes; // bes[src][trg]
+  vector<unordered_map<WordID, ProbabilityEstimates> > ests; // ests[src][trg]
};
 
-Transliterations::Transliterations(int max_src, int max_trg, double fr) :
-    pimpl_(new TransliterationsImpl(max_src, max_trg, fr)) {}
+Transliterations::Transliterations(int max_src, int max_trg, double sr, const BackwardEstimator& be) :
+    pimpl_(new TransliterationsImpl(max_src, max_trg, sr, be)) {}
 Transliterations::~Transliterations() { delete pimpl_; }
 
 void Transliterations::Initialize(WordID src, const vector<WordID>& src_lets, WordID trg, const vector<WordID>& trg_lets) {
diff --git a/gi/pf/transliterations.h b/gi/pf/transliterations.h
index ea9f9d3f..49d14684 100644
--- a/gi/pf/transliterations.h
+++ b/gi/pf/transliterations.h
@@ -5,11 +5,12 @@
 #include "wordid.h"
 #include "prob.h"
 
+struct BackwardEstimator;
 struct TransliterationsImpl;
 
 struct Transliterations {
   // max_src and max_trg indicate how big the transliteration phrases can be
   // see reachability.h for information about filter_ratio
-  explicit Transliterations(int max_src, int max_trg, double filter_ratio);
+  explicit Transliterations(int max_src, int max_trg, double s2t_rat, const BackwardEstimator& be);
   ~Transliterations();
   void Initialize(WordID src, const std::vector<WordID>& src_lets, WordID trg, const std::vector<WordID>& trg_lets);
   void Forbid(WordID src, const std::vector<WordID>& src_lets, WordID trg, const std::vector<WordID>& trg_lets);
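The new TruncatedConditionalLengthModel above builds a row-normalized table of smoothed, truncated Poisson chunk-length probabilities; the 0.01 added before normalization keeps every in-range target length possible. A standalone re-implementation of just that normalization, using plain doubles instead of cdec's prob_t (requires C++11 for std::lgamma):

// tclm_sketch.cc -- sketch of TruncatedConditionalLengthModel's table
#include <cmath>
#include <iostream>
#include <vector>

// log Poisson(x; lambda) = x*log(lambda) - lambda - log(x!)
double LogPoisson(unsigned x, double lambda) {
  return std::log(lambda) * x - lambda - std::lgamma(x + 1.0);
}

int main() {
  const unsigned max_src = 4, max_trg = 4;
  const double ratio = 1.0;  // expected_src_to_trg_ratio
  std::vector<std::vector<double> > plens(max_src + 1, std::vector<double>(max_trg + 1, 0.0));
  for (unsigned i = 1; i <= max_src; ++i) {
    double z = 0;
    for (unsigned j = 1; j <= max_trg; ++j)
      z += (plens[i][j] = 0.01 + std::exp(LogPoisson(j, i * ratio)));
    for (unsigned j = 1; j <= max_trg; ++j)
      plens[i][j] /= z;  // each row normalizes to 1 over the truncated support
  }
  for (unsigned j = 1; j <= max_trg; ++j)
    std::cout << "P(tlen=" << j << " | slen=2) = " << plens[2][j] << '\n';
  return 0;
}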
