From 113317266853abff2e1c0c3e889017d0eee55c93 Mon Sep 17 00:00:00 2001
From: Chris Dyer
Date: Fri, 9 Mar 2012 22:23:50 -0500
Subject: moar

---
 gi/pf/pyp_tm.cc | 113 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 113 insertions(+)
 create mode 100644 gi/pf/pyp_tm.cc

diff --git a/gi/pf/pyp_tm.cc b/gi/pf/pyp_tm.cc
new file mode 100644
index 00000000..94cbe7c3
--- /dev/null
+++ b/gi/pf/pyp_tm.cc
@@ -0,0 +1,113 @@
+#include "pyp_tm.h"
+
+#include <tr1/unordered_map>
+#include <iostream>
+#include <cassert>
+
+#include "base_distributions.h"
+#include "monotonic_pseg.h"
+#include "conditional_pseg.h"
+#include "tdict.h"
+#include "ccrp.h"
+#include "pyp_word_model.h"
+
+using namespace std;
+using namespace std::tr1;
+
+template <typename Base>
+struct ConditionalPYPWordModel {
+  ConditionalPYPWordModel(Base* b) : base(*b) {}
+
+  void Summary() const {
+    cerr << "Number of conditioning contexts: " << r.size() << endl;
+    for (RuleModelHash::const_iterator it = r.begin(); it != r.end(); ++it) {
+      cerr << TD::Convert(it->first) << "   \tPYP(d=" << it->second.discount() << ",s=" << it->second.strength() << ") --------------------------" << endl;
+      for (CCRP<vector<WordID> >::const_iterator i2 = it->second.begin(); i2 != it->second.end(); ++i2)
+        cerr << "   " << i2->second.total_dish_count_ << '\t' << TD::GetString(i2->first) << endl;
+    }
+  }
+
+  void ResampleHyperparameters(MT19937* rng) {
+    for (RuleModelHash::iterator it = r.begin(); it != r.end(); ++it)
+      it->second.resample_hyperparameters(rng);
+  }
+
+  prob_t Prob(const WordID src, const vector<WordID>& trglets) const {
+    RuleModelHash::const_iterator it = r.find(src);
+    if (it == r.end()) {
+      return base(trglets);
+    } else {
+      return it->second.prob(trglets, base(trglets));
+    }
+  }
+
+  void Increment(const WordID src, const vector<WordID>& trglets, MT19937* rng) {
+    RuleModelHash::iterator it = r.find(src);
+    if (it == r.end())
+      it = r.insert(make_pair(src, CCRP<vector<WordID> >(1,1,1,1,0.5,1.0))).first;
+    if (it->second.increment(trglets, base(trglets), rng))
+      base.Increment(trglets, rng);
+  }
+
+  void Decrement(const WordID src, const vector<WordID>& trglets, MT19937* rng) {
+    RuleModelHash::iterator it = r.find(src);
+    assert(it != r.end());
+    if (it->second.decrement(trglets, rng)) {
+      base.Decrement(trglets, rng);
+      if (it->second.num_customers() == 0)
+        r.erase(it);
+    }
+  }
+
+  prob_t Likelihood() const {
+    prob_t p = prob_t::One();
+    for (RuleModelHash::const_iterator it = r.begin(); it != r.end(); ++it) {
+      prob_t q; q.logeq(it->second.log_crp_prob());
+      p *= q;
+    }
+    return p;
+  }
+
+  unsigned UniqueConditioningContexts() const {
+    return r.size();
+  }
+
+  Base& base;
+  typedef unordered_map<WordID, CCRP<vector<WordID> > > RuleModelHash;
+  RuleModelHash r;
+};
+
+PYPLexicalTranslation::PYPLexicalTranslation(const vector<vector<WordID> >& lets,
+                                             const unsigned num_letters) :
+    letters(lets),
+    up0(new PYPWordModel(num_letters)),
+    tmodel(new ConditionalPYPWordModel<PYPWordModel>(up0)),
+    kX(-TD::Convert("X")) {}
+
+prob_t PYPLexicalTranslation::Likelihood() const {
+  prob_t p = up0->Likelihood();
+  p *= tmodel->Likelihood();
+  return p;
+}
+
+void PYPLexicalTranslation::ResampleHyperparameters(MT19937* rng) {
+  tmodel->ResampleHyperparameters(rng);
+  up0->ResampleHyperparameters(rng);
+}
+
+unsigned PYPLexicalTranslation::UniqueConditioningContexts() const {
+  return tmodel->UniqueConditioningContexts();
+}
+
+prob_t PYPLexicalTranslation::Prob(WordID src, WordID trg) const {
+  return tmodel->Prob(src, letters[trg]);
+}
+
+void PYPLexicalTranslation::Increment(WordID src, WordID trg, MT19937* rng) {
+  tmodel->Increment(src, letters[trg], rng);
+}
+
+void PYPLexicalTranslation::Decrement(WordID src, WordID trg, MT19937* rng) {
+  tmodel->Decrement(src, letters[trg], rng);
+}
-- 
cgit v1.2.3
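The Increment/Decrement pair above is the standard hierarchical Pitman-Yor bookkeeping: CCRP::increment() reports whether the new customer opened a new table, and only in that case is the base distribution charged for the dish; decrement() mirrors this when the last customer at a table leaves. A toy, self-contained sketch of that mechanism follows; the class name and the uniform p0 are invented for illustration, and this is not the CCRP from utils/ccrp.h:

    // Illustrative only: a minimal CRP with Pitman-Yor discount d and
    // strength s, showing why the caller updates the base distribution
    // only when increment() reports a table change.
    #include <cstdlib>
    #include <map>
    #include <string>
    #include <vector>
    #include <iostream>

    struct MiniCRP {
      MiniCRP(double d, double s) : d_(d), s_(s), tables_(0) {}

      // Returns true iff a new table was opened (the base distribution must
      // then also be incremented, exactly as in ConditionalPYPWordModel).
      bool increment(const std::string& dish, double p0, double rnd) {
        std::vector<unsigned>& t = tables_for_[dish];
        double p_new = (s_ + d_ * tables_) * p0;     // mass for a new table
        double p_old = 0;                            // mass for existing tables
        for (size_t i = 0; i < t.size(); ++i) p_old += t[i] - d_;
        double r = rnd * (p_new + p_old);
        for (size_t i = 0; i < t.size(); ++i) {
          r -= t[i] - d_;
          if (r <= 0) { ++t[i]; return false; }      // joined an existing table
        }
        t.push_back(1); ++tables_;                   // opened a new table
        return true;
      }

      double d_, s_;
      unsigned tables_;
      std::map<std::string, std::vector<unsigned> > tables_for_;
    };

    int main() {
      MiniCRP crp(0.5, 1.0);
      double p0 = 1.0 / 1000;  // uniform base probability, for illustration
      unsigned new_tables = 0;
      for (int i = 0; i < 100; ++i)
        if (crp.increment("dog", p0, rand() / (RAND_MAX + 1.0))) ++new_tables;
      std::cout << new_tables << " of 100 increments reached the base\n";
      return 0;
    }

The same pattern explains Decrement() above: the base only forgets a spelling when CCRP::decrement() reports that its last table closed.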
From ef614a1d968aebbf463ed57876fee395b4c24635 Mon Sep 17 00:00:00 2001
From: Chris Dyer
Date: Fri, 9 Mar 2012 23:13:09 -0500
Subject: logging after alignment

---
 gi/pf/align-lexonly-pyp.cc | 1 +
 gi/pf/pyp_tm.cc            | 7 +++++--
 gi/pf/pyp_word_model.h     | 2 +-
 utils/ccrp.h               | 1 +
 4 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/gi/pf/align-lexonly-pyp.cc b/gi/pf/align-lexonly-pyp.cc
index d68a4b8f..4a1d1db6 100644
--- a/gi/pf/align-lexonly-pyp.cc
+++ b/gi/pf/align-lexonly-pyp.cc
@@ -208,6 +208,7 @@ int main(int argc, char** argv) {
   }
   for (unsigned i = 0; i < corpus.size(); ++i)
     WriteAlignments(corpus[i]);
+  aligner.model.Summary();
 
   return 0;
 }

diff --git a/gi/pf/pyp_tm.cc b/gi/pf/pyp_tm.cc
index 94cbe7c3..b5262f47 100644
--- a/gi/pf/pyp_tm.cc
+++ b/gi/pf/pyp_tm.cc
@@ -54,8 +54,6 @@ struct ConditionalPYPWordModel {
     assert(it != r.end());
     if (it->second.decrement(trglets, rng)) {
       base.Decrement(trglets, rng);
-      if (it->second.num_customers() == 0)
-        r.erase(it);
     }
   }
 
@@ -84,6 +82,11 @@ PYPLexicalTranslation::PYPLexicalTranslation(const vector<vector<WordID> >& lets
     tmodel(new ConditionalPYPWordModel<PYPWordModel>(up0)),
     kX(-TD::Convert("X")) {}
 
+void PYPLexicalTranslation::Summary() const {
+  tmodel->Summary();
+  up0->Summary();
+}
+
 prob_t PYPLexicalTranslation::Likelihood() const {
   prob_t p = up0->Likelihood();
   p *= tmodel->Likelihood();

diff --git a/gi/pf/pyp_word_model.h b/gi/pf/pyp_word_model.h
index 800a4fd7..ff366865 100644
--- a/gi/pf/pyp_word_model.h
+++ b/gi/pf/pyp_word_model.h
@@ -12,7 +12,7 @@
 
 // PYP(d,s,poisson-uniform) represented as a CRP
 struct PYPWordModel {
-  explicit PYPWordModel(const unsigned vocab_e_size, const double mean_len = 7.5) :
+  explicit PYPWordModel(const unsigned vocab_e_size, const double mean_len = 5) :
      base(prob_t::One()), r(1,1,1,1,0.66,50.0), u0(-std::log(vocab_e_size)), mean_length(mean_len) {}
 
   void ResampleHyperparameters(MT19937* rng);

diff --git a/utils/ccrp.h b/utils/ccrp.h
index 439d7e1e..4a8b80e7 100644
--- a/utils/ccrp.h
+++ b/utils/ccrp.h
@@ -221,6 +221,7 @@ class CCRP {
 
   void resample_hyperparameters(MT19937* rng, const unsigned nloop = 5, const unsigned niterations = 10) {
     assert(has_discount_prior() || has_strength_prior());
+    if (num_customers() == 0) return;
    DiscountResampler dr(*this);
    StrengthResampler sr(*this);
    for (int iter = 0; iter < nloop; ++iter) {
-- 
cgit v1.2.3
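Two details of this commit are worth unpacking. The ccrp.h guard is needed because the previous commit stopped erasing empty CRPs, and slice-sampling the hyperparameters of a restaurant with no customers is undefined. Separately, pyp_word_model.h's comment describes the base as PYP(d,s,poisson-uniform): a word's base probability is a Poisson length term times a uniform charge per letter (u0 = -log of the letter vocabulary size). A hedged sketch of that computation, with invented function names, showing how dropping the mean length from 7.5 to 5 shifts mass toward shorter strings:

    // Sketch of the "poisson-uniform" base distribution PYPWordModel draws
    // from; the real model lives in pyp_word_model.{h,cc}.
    #include <cmath>
    #include <iostream>

    // log Poisson(n; lambda) = n*log(lambda) - lambda - log(n!)
    double log_poisson(unsigned n, double lambda) {
      return n * log(lambda) - lambda - lgamma(n + 1);
    }

    double log_p0(unsigned num_letters, unsigned vocab_e_size, double mean_len) {
      double u0 = -log(vocab_e_size);  // log probability of each letter
      return log_poisson(num_letters, mean_len) + num_letters * u0;
    }

    int main() {
      // Compare the old (7.5) and new (5) mean lengths from the patch above:
      for (unsigned n = 1; n <= 10; ++n)
        std::cout << n << '\t' << log_p0(n, 26, 7.5)
                  << '\t' << log_p0(n, 26, 5.0) << '\n';
      return 0;
    }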
From de136247bdedb960dc0f317cd65b28c02a441532 Mon Sep 17 00:00:00 2001
From: Chris Dyer
Date: Sat, 10 Mar 2012 01:08:23 -0500
Subject: tie params

---
 gi/pf/pyp_lm.cc        | 66 +++++++------------------------------
 gi/pf/pyp_tm.cc        |  2 ++
 gi/pf/tied_resampler.h | 82 ++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 99 insertions(+), 51 deletions(-)
 create mode 100644 gi/pf/tied_resampler.h

diff --git a/gi/pf/pyp_lm.cc b/gi/pf/pyp_lm.cc
index 52e6be2c..85635b8f 100644
--- a/gi/pf/pyp_lm.cc
+++ b/gi/pf/pyp_lm.cc
@@ -11,6 +11,7 @@
 #include "tdict.h"
 #include "sampler.h"
 #include "ccrp.h"
+#include "tied_resampler.h"
 
 // A not very memory-efficient implementation of an N-gram LM based on PYPs
 // as described in Y.-W. Teh. (2006) A Hierarchical Bayesian Language Model
@@ -66,7 +67,7 @@ template<> struct PYPLM<0> {
   void increment(WordID, const vector<WordID>&, MT19937*) { ++draws; }
   void decrement(WordID, const vector<WordID>&, MT19937*) { --draws; assert(draws >= 0); }
   double prob(WordID, const vector<WordID>&) const { return p0; }
-  void resample_hyperparameters(MT19937*, const unsigned, const unsigned) {}
+  void resample_hyperparameters(MT19937*) {}
   double log_likelihood() const { return draws * log(p0); }
   const double p0;
   int draws;
@@ -76,16 +77,17 @@ template <unsigned N> struct PYPLM {
   PYPLM(unsigned vs, double da, double db, double ss, double sr) :
       backoff(vs, da, db, ss, sr),
-      discount_a(da), discount_b(db),
-      strength_s(ss), strength_r(sr),
-      d(0.8), strength(1.0), lookup(N-1) {}
+      tr(da, db, ss, sr, 0.8, 1.0),
+      lookup(N-1) {}
   void increment(WordID w, const vector<WordID>& context, MT19937* rng) {
     const double bo = backoff.prob(w, context);
     for (unsigned i = 0; i < N-1; ++i)
       lookup[i] = context[context.size() - 1 - i];
     typename unordered_map<vector<WordID>, CCRP<WordID>, boost::hash<vector<WordID> > >::iterator it = p.find(lookup);
-    if (it == p.end())
-      it = p.insert(make_pair(lookup, CCRP<WordID>(d,strength))).first;
+    if (it == p.end()) {
+      it = p.insert(make_pair(lookup, CCRP<WordID>(0.5,1))).first;
+      tr.Add(&it->second);  // add to resampler
+    }
     if (it->second.increment(w, bo, rng))
       backoff.increment(w, context, rng);
   }
@@ -107,59 +109,21 @@ template <unsigned N> struct PYPLM {
   }
 
   double log_likelihood() const {
-    return log_likelihood(d, strength) + backoff.log_likelihood();
-  }
-
-  double log_likelihood(const double& dd, const double& aa) const {
-    if (aa <= -dd) return -std::numeric_limits<double>::infinity();
-    //double llh = Md::log_beta_density(dd, 10, 3) + Md::log_gamma_density(aa, 1, 1);
-    double llh = Md::log_beta_density(dd, discount_a, discount_b) +
-                 Md::log_gamma_density(aa + dd, strength_s, strength_r);
+    double llh = backoff.log_likelihood();
     typename unordered_map<vector<WordID>, CCRP<WordID>, boost::hash<vector<WordID> > >::const_iterator it;
     for (it = p.begin(); it != p.end(); ++it)
-      llh += it->second.log_crp_prob(dd, aa);
+      llh += it->second.log_crp_prob();
+    // TODO parametric likelihood from TiedResampler
     return llh;
   }
 
-  struct DiscountResampler {
-    DiscountResampler(const PYPLM& m) : m_(m) {}
-    const PYPLM& m_;
-    double operator()(const double& proposed_discount) const {
-      return m_.log_likelihood(proposed_discount, m_.strength);
-    }
-  };
-
-  struct AlphaResampler {
-    AlphaResampler(const PYPLM& m) : m_(m) {}
-    const PYPLM& m_;
-    double operator()(const double& proposed_strength) const {
-      return m_.log_likelihood(m_.d, proposed_strength);
-    }
-  };
-
-  void resample_hyperparameters(MT19937* rng, const unsigned nloop = 5, const unsigned niterations = 10) {
-    DiscountResampler dr(*this);
-    AlphaResampler ar(*this);
-    for (int iter = 0; iter < nloop; ++iter) {
-      strength = slice_sampler1d(ar, strength, *rng, -d + std::numeric_limits<double>::min(),
-                                 std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations);
-      double min_discount = std::numeric_limits<double>::min();
-      if (strength < 0.0) min_discount -= strength;
-      d = slice_sampler1d(dr, d, *rng, min_discount,
-                          1.0, 0.0, niterations, 100*niterations);
-    }
-    strength = slice_sampler1d(ar, strength, *rng, -d + std::numeric_limits<double>::min(),
-                               std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations);
-    typename unordered_map<vector<WordID>, CCRP<WordID>, boost::hash<vector<WordID> > >::iterator it;
-    cerr << "PYPLM<" << N << ">(d=" << d << ",a=" << strength << ") = " << log_likelihood(d, strength) << endl;
-    for (it = p.begin(); it != p.end(); ++it) {
-      it->second.set_discount(d);
-      it->second.set_strength(strength);
-    }
-    backoff.resample_hyperparameters(rng, nloop, niterations);
+  void resample_hyperparameters(MT19937* rng) {
+    tr.ResampleHyperparameters(rng);
+    backoff.resample_hyperparameters(rng);
   }
 
   PYPLM<N-1> backoff;
+  TiedResampler<CCRP<WordID> > tr;
   double discount_a, discount_b, strength_s, strength_r;
   double d, strength;
   mutable vector<WordID> lookup;  // thread-local

diff --git a/gi/pf/pyp_tm.cc b/gi/pf/pyp_tm.cc
index b5262f47..73104fe9 100644
--- a/gi/pf/pyp_tm.cc
+++ b/gi/pf/pyp_tm.cc
@@ -11,6 +11,8 @@
 #include "ccrp.h"
 #include "pyp_word_model.h"
 
+#include "tied_resampler.h"
+
 using namespace std;
 using namespace std::tr1;

diff --git a/gi/pf/tied_resampler.h b/gi/pf/tied_resampler.h
new file mode 100644
index 00000000..208fb9c7
--- /dev/null
+++ b/gi/pf/tied_resampler.h
@@ -0,0 +1,82 @@
+#ifndef _TIED_RESAMPLER_H_
+#define _TIED_RESAMPLER_H_
+
+#include <set>
+#include "sampler.h"
+#include "slice_sampler.h"
+#include "m.h"
+
+template <class CRP>
+struct TiedResampler {
+  explicit TiedResampler(double da, double db, double ss, double sr, double d=0.5, double s=1.0) :
+      d_alpha(da),
+      d_beta(db),
+      s_shape(ss),
+      s_rate(sr),
+      discount(d),
+      strength(s) {}
+
+  void Add(CRP* crp) {
+    crps.insert(crp);
+    crp->set_discount(discount);
+    crp->set_strength(strength);
+    assert(!crp->has_discount_prior());
+    assert(!crp->has_strength_prior());
+  }
+
+  void Remove(CRP* crp) {
+    crps.erase(crp);
+  }
+
+  double LogLikelihood(double d, double s) const {
+    if (s <= -d) return -std::numeric_limits<double>::infinity();
+    double llh = Md::log_beta_density(d, d_alpha, d_beta) +
+                 Md::log_gamma_density(d + s, s_shape, s_rate);
+    for (typename std::set<CRP*>::iterator it = crps.begin(); it != crps.end(); ++it)
+      llh += (*it)->log_crp_prob(d, s);
+    return llh;
+  }
+
+  struct DiscountResampler {
+    DiscountResampler(const TiedResampler& m) : m_(m) {}
+    const TiedResampler& m_;
+    double operator()(const double& proposed_discount) const {
+      return m_.LogLikelihood(proposed_discount, m_.strength);
+    }
+  };
+
+  struct AlphaResampler {
+    AlphaResampler(const TiedResampler& m) : m_(m) {}
+    const TiedResampler& m_;
+    double operator()(const double& proposed_strength) const {
+      return m_.LogLikelihood(m_.discount, proposed_strength);
+    }
+  };
+
+  void ResampleHyperparameters(MT19937* rng, const unsigned nloop = 5, const unsigned niterations = 10) {
+    const DiscountResampler dr(*this);
+    const AlphaResampler ar(*this);
+    for (int iter = 0; iter < nloop; ++iter) {
+      strength = slice_sampler1d(ar, strength, *rng, -discount + std::numeric_limits<double>::min(),
+                                 std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations);
+      double min_discount = std::numeric_limits<double>::min();
+      if (strength < 0.0) min_discount -= strength;
+      discount = slice_sampler1d(dr, discount, *rng, min_discount,
+                                 1.0, 0.0, niterations, 100*niterations);
+    }
+    strength = slice_sampler1d(ar, strength, *rng, -discount + std::numeric_limits<double>::min(),
+                               std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations);
+    std::cerr << "TiedCRPs(d=" << discount << ",s="
+              << strength << ") = " << LogLikelihood(discount, strength) << std::endl;
+    for (typename std::set<CRP*>::iterator it = crps.begin(); it != crps.end(); ++it) {
+      (*it)->set_discount(discount);
+      (*it)->set_strength(strength);
+    }
+  }
+ private:
+  std::set<CRP*> crps;
+  const double d_alpha, d_beta, s_shape, s_rate;
+  double discount, strength;
+};
+
+#endif
-- 
cgit v1.2.3
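TiedResampler generalizes the per-model DiscountResampler/AlphaResampler logic deleted from pyp_lm.cc: every CRP handed to Add() is snapped to one shared (discount, strength) pair, and ResampleHyperparameters() slice-samples that pair under Beta and Gamma priors times the product of all member CRPs' seating probabilities. A hypothetical usage sketch against the headers in this commit; it assumes the two-argument CCRP constructor seen above and is not a program from the repository:

    // Hypothetical caller: two CRPs forced to share hyperparameters.
    #include "ccrp.h"
    #include "sampler.h"
    #include "tied_resampler.h"

    void example(MT19937* rng) {
      // Beta(1,1) prior on discount, Gamma(1,1) on discount+strength,
      // initialized at d=0.5, s=1.0.
      TiedResampler<CCRP<int> > tr(1, 1, 1, 1, 0.5, 1.0);
      CCRP<int> a(0.5, 1.0), b(0.5, 1.0);
      tr.Add(&a);                       // a and b now share (d, s)
      tr.Add(&b);
      // ... seat customers in a and b during sampling ...
      tr.ResampleHyperparameters(rng);  // one slice-sampling pass moves both
    }

Tying is what makes hyperparameter inference feasible here: most conditioning contexts host only a handful of customers, far too few to estimate a discount and strength per CRP.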
From 38f28be7cd2bada87ebad78994e3c938e10c2cce Mon Sep 17 00:00:00 2001
From: Chris Dyer
Date: Sat, 10 Mar 2012 12:56:15 -0500
Subject: ready to infer alignment parameters

---
 gi/pf/Makefile.am          |   4 +-
 gi/pf/align-lexonly-pyp.cc |  22 ++-
 gi/pf/align-lexonly.cc     | 332 ---------------------------------------------
 gi/pf/pyp_tm.cc            |   6 +-
 gi/pf/quasi_model2.h       | 115 ++++++++++++----
 gi/pf/tied_resampler.h     |  31 +++++
 6 files changed, 143 insertions(+), 367 deletions(-)
 delete mode 100644 gi/pf/align-lexonly.cc

diff --git a/gi/pf/Makefile.am b/gi/pf/Makefile.am
index 4ce72ba1..f9c979d0 100644
--- a/gi/pf/Makefile.am
+++ b/gi/pf/Makefile.am
@@ -1,4 +1,4 @@
-bin_PROGRAMS = cbgi brat dpnaive pfbrat pfdist itg pfnaive condnaive align-lexonly align-lexonly-pyp learn_cfg pyp_lm nuisance_test align-tl
+bin_PROGRAMS = cbgi brat dpnaive pfbrat pfdist itg pfnaive condnaive align-lexonly-pyp learn_cfg pyp_lm nuisance_test align-tl
 
 noinst_LIBRARIES = libpf.a
 
@@ -7,8 +7,6 @@ libpf_a_SOURCES = base_distributions.cc reachability.cc cfg_wfst_composer.cc cor
 nuisance_test_SOURCES = nuisance_test.cc
 nuisance_test_LDADD = libpf.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a -lz
 
-align_lexonly_SOURCES = align-lexonly.cc
-
 align_lexonly_pyp_SOURCES = align-lexonly-pyp.cc
 align_lexonly_pyp_LDADD = libpf.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a -lz
diff --git a/gi/pf/align-lexonly-pyp.cc b/gi/pf/align-lexonly-pyp.cc
index 0c90b6ce..68cb9192 100644
--- a/gi/pf/align-lexonly-pyp.cc
+++ b/gi/pf/align-lexonly-pyp.cc
@@ -61,15 +61,15 @@ struct AlignedSentencePair {
 
 struct Aligner {
   Aligner(const vector<vector<WordID> >& lets, int num_letters, vector<AlignedSentencePair>* c) :
       corpus(*c),
+      paj_model(4, 0.08),
       model(lets, num_letters),
-      paj(4, 0.08),
       kNULL(TD::Convert("NULL")) {
     assert(lets[kNULL].size() == 0);
   }
 
   vector<AlignedSentencePair>& corpus;
+  QuasiModel2 paj_model;
   PYPLexicalTranslation model;
-  const QuasiModel2 paj;
   const WordID kNULL;
 
@@ -86,10 +86,12 @@ struct Aligner {
         a_j = prng->next() * (1 + asp.src.size());
         const WordID f_a_j = (a_j ? asp.src[a_j - 1] : kNULL);
         model.Increment(f_a_j, asp.trg[j], &*prng);
-        // TODO factor in alignment prob
+        paj_model.Increment(a_j, j, asp.src.size(), asp.trg.size());
       }
     }
-    cerr << "Corpus intialized randomly. LLH = " << model.Likelihood() << endl;
+    cerr << "Corpus intialized randomly." << endl;
+    cerr << "LLH = " << Likelihood() << " \t(Amodel=" << paj_model.Likelihood()
+         << " TModel=" << model.Likelihood() << ") contexts=" << model.UniqueConditioningContexts() << endl;
   }
 
   void ResampleCorpus() {
@@ -101,19 +103,25 @@ struct Aligner {
         const WordID e_j = asp.trg[j];
         WordID f_a_j = (a_j ? asp.src[a_j - 1] : kNULL);
         model.Decrement(f_a_j, e_j, prng);
+        paj_model.Decrement(a_j, j, asp.src.size(), asp.trg.size());
 
         for (unsigned prop_a_j = 0; prop_a_j <= asp.src.size(); ++prop_a_j) {
           const WordID prop_f = (prop_a_j ? asp.src[prop_a_j - 1] : kNULL);
           ss[prop_a_j] = model.Prob(prop_f, e_j);
-          // TODO configurable
-          ss[prop_a_j] *= paj.Pa_j(prop_a_j, j, asp.src.size(), asp.trg.size());
+          ss[prop_a_j] *= paj_model.Prob(prop_a_j, j, asp.src.size(), asp.trg.size());
         }
         a_j = prng->SelectSample(ss);
         f_a_j = (a_j ? asp.src[a_j - 1] : kNULL);
         model.Increment(f_a_j, e_j, prng);
+        paj_model.Increment(a_j, j, asp.src.size(), asp.trg.size());
       }
     }
-    cerr << "LLH = " << model.Likelihood() << " " << model.UniqueConditioningContexts() << endl;
+    cerr << "LLH = " << Likelihood() << " \t(Amodel=" << paj_model.Likelihood()
+         << " TModel=" << model.Likelihood() << ") contexts=" << model.UniqueConditioningContexts() << endl;
+  }
+
+  prob_t Likelihood() const {
+    return model.Likelihood() * paj_model.Likelihood();
   }
 };
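In the resampling loop above, each alignment link a_j is Gibbs-sampled from a categorical distribution whose unnormalized weight for candidate position prop_a_j is the translation probability times the alignment prior, with index 0 reserved for NULL; prng->SelectSample(ss) performs the normalized draw. A hedged, self-contained version of that draw primitive (the repository's SampleSet/MT19937 machinery does the same job):

    #include <cstdlib>
    #include <vector>

    // Draws index i with probability weights[i] / sum(weights); index 0 can
    // stand for the NULL alignment, as in the sampler above.
    unsigned select_sample(const std::vector<double>& weights) {
      double total = 0;
      for (size_t i = 0; i < weights.size(); ++i) total += weights[i];
      double r = total * (rand() / (RAND_MAX + 1.0));
      for (size_t i = 0; i < weights.size(); ++i) {
        r -= weights[i];
        if (r <= 0) return i;
      }
      return weights.size() - 1;  // guard against floating-point round-off
    }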
diff --git a/gi/pf/align-lexonly.cc b/gi/pf/align-lexonly.cc
deleted file mode 100644
index dbc9dc07..00000000
--- a/gi/pf/align-lexonly.cc
+++ /dev/null
@@ -1,332 +0,0 @@
-#include <iostream>
-#include <tr1/memory>
-#include <queue>
-
-#include <boost/multi_array.hpp>
-#include <boost/program_options.hpp>
-#include <boost/program_options/variables_map.hpp>
-
-#include "array2d.h"
-#include "base_distributions.h"
-#include "monotonic_pseg.h"
-#include "conditional_pseg.h"
-#include "trule.h"
-#include "tdict.h"
-#include "stringlib.h"
-#include "filelib.h"
-#include "dict.h"
-#include "sampler.h"
-#include "ccrp_nt.h"
-#include "corpus.h"
-#include "ngram_base.h"
-
-using namespace std;
-using namespace tr1;
-namespace po = boost::program_options;
-
-void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
-  po::options_description opts("Configuration options");
-  opts.add_options()
-        ("samples,s",po::value<unsigned>()->default_value(1000),"Number of samples")
-        ("input,i",po::value<string>(),"Read parallel data from")
-        ("random_seed,S",po::value<uint32_t>(), "Random seed");
-  po::options_description clo("Command line options");
-  clo.add_options()
-        ("config", po::value<string>(), "Configuration file")
-        ("help,h", "Print this help message and exit");
-  po::options_description dconfig_options, dcmdline_options;
-  dconfig_options.add(opts);
-  dcmdline_options.add(opts).add(clo);
-
-  po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
-  if (conf->count("config")) {
-    ifstream config((*conf)["config"].as<string>().c_str());
-    po::store(po::parse_config_file(config, dconfig_options), *conf);
-  }
-  po::notify(*conf);
-
-  if (conf->count("help") || (conf->count("input") == 0)) {
-    cerr << dcmdline_options << endl;
-    exit(1);
-  }
-}
-
-shared_ptr<MT19937> prng;
-
-struct LexicalAlignment {
-  unsigned char src_index;
-  bool is_transliteration;
-  vector<pair<short, short> > derivation;
-};
-
-struct AlignedSentencePair {
-  vector<WordID> src;
-  vector<WordID> trg;
-  vector<LexicalAlignment> a;
-  Array2D<short> posterior;
-};
-
-struct HierarchicalWordBase {
-  explicit HierarchicalWordBase(const unsigned vocab_e_size) :
-      base(prob_t::One()), r(25,25,10), u0(-log(vocab_e_size)) {}
-
-  void ResampleHyperparameters(MT19937* rng) {
-    r.resample_hyperparameters(rng);
-  }
-
-  inline double logp0(const vector<WordID>& s) const {
-    return s.size() * u0;
-  }
-
-  // return p0 of rule.e_
-  prob_t operator()(const TRule& rule) const {
-    prob_t p; p.logeq(r.logprob(rule.e_, logp0(rule.e_)));
-    return p;
-  }
-
-  void Increment(const TRule& rule) {
-    if (r.increment(rule.e_)) {
-      prob_t p; p.logeq(logp0(rule.e_));
-      base *= p;
-    }
-  }
-
-  void Decrement(const TRule& rule) {
-    if (r.decrement(rule.e_)) {
-      prob_t p; p.logeq(logp0(rule.e_));
-      base /= p;
-    }
-  }
-
-  prob_t Likelihood() const {
-    prob_t p; p.logeq(r.log_crp_prob());
-    p *= base;
-    return p;
-  }
-
-  void Summary() const {
-    cerr << "NUMBER OF CUSTOMERS: " << r.num_customers() << " (\\alpha=" << r.alpha() << ')' << endl;
-    for (CCRP_NoTable<vector<WordID> >::const_iterator it = r.begin(); it != r.end(); ++it)
-      cerr << "   " << it->second << '\t' << TD::GetString(it->first) << endl;
-  }
-
-  prob_t base;
-  CCRP_NoTable<vector<WordID> > r;
-  const double u0;
-};
-
-struct BasicLexicalAlignment {
-  explicit BasicLexicalAlignment(const vector<vector<WordID> >& lets,
-                                 const unsigned words_e,
-                                 const unsigned letters_e,
-                                 vector<AlignedSentencePair>* corp) :
-      letters(lets),
-      corpus(*corp),
-      up0("fr-en.10k.translit-base.txt.gz"),
-      //up0(words_e),
-      //up0("en.chars.1gram", letters_e),
-      //up0("en.words.1gram"),
-      //up0(letters_e),
-      //up0("en.chars.2gram"),
-      tmodel(up0) {
-  }
-
-  void InstantiateRule(const WordID src,
-                       const WordID trg,
-                       TRule* rule) const {
-    static const WordID kX = TD::Convert("X") * -1;
-    rule->lhs_ = kX;
-    rule->e_ = letters[trg];
-    rule->f_ = letters[src];
-  }
-
-  void InitializeRandom() {
-    const WordID kNULL = TD::Convert("NULL");
-    cerr << "Initializing with random alignments ...\n";
-    for (unsigned i = 0; i < corpus.size(); ++i) {
-      AlignedSentencePair& asp = corpus[i];
-      asp.a.resize(asp.trg.size());
-      for (unsigned j = 0; j < asp.trg.size(); ++j) {
-        const unsigned char a_j = prng->next() * (1 + asp.src.size());
-        const WordID f_a_j = (a_j ? asp.src[a_j - 1] : kNULL);
-        TRule r;
-        InstantiateRule(f_a_j, asp.trg[j], &r);
-        asp.a[j].is_transliteration = false;
-        asp.a[j].src_index = a_j;
-        if (tmodel.IncrementRule(r))
-          up0.Increment(r);
-      }
-    }
-    cerr << "  LLH = " << Likelihood() << endl;
-  }
-
-  prob_t Likelihood() const {
-    prob_t p = tmodel.Likelihood();
-    p *= up0.Likelihood();
-    return p;
-  }
-
-  void ResampleHyperparemeters() {
-    cerr << "  LLH_prev = " << Likelihood() << flush;
-    tmodel.ResampleHyperparameters(&*prng);
-    up0.ResampleHyperparameters(&*prng);
-    cerr << "\tLLH_post = " << Likelihood() << endl;
-  }
-
-  void ResampleCorpus();
-
-  const vector<vector<WordID> >& letters;  // spelling dictionary
-  vector<AlignedSentencePair>& corpus;
-  //PhraseConditionalUninformativeBase up0;
-  //PhraseConditionalUninformativeUnigramBase up0;
-  //UnigramWordBase up0;
-  //HierarchicalUnigramBase up0;
-  TableLookupBase up0;
-  //HierarchicalWordBase up0;
-  //PoissonUniformUninformativeBase up0;
-  //CompletelyUniformBase up0;
-  //FixedNgramBase up0;
-  //ConditionalTranslationModel tmodel;
-  //ConditionalTranslationModel tmodel;
-  //ConditionalTranslationModel tmodel;
-  //ConditionalTranslationModel tmodel;
-  //ConditionalTranslationModel tmodel;
-  //ConditionalTranslationModel tmodel;
-  ConditionalTranslationModel<TableLookupBase> tmodel;
-  //ConditionalTranslationModel tmodel;
-  //ConditionalTranslationModel tmodel;
-};
-
-void BasicLexicalAlignment::ResampleCorpus() {
-  static const WordID kNULL = TD::Convert("NULL");
-  for (unsigned i = 0; i < corpus.size(); ++i) {
-    AlignedSentencePair& asp = corpus[i];
-    SampleSet<prob_t> ss; ss.resize(asp.src.size() + 1);
-    for (unsigned j = 0; j < asp.trg.size(); ++j) {
-      TRule r;
-      unsigned char& a_j = asp.a[j].src_index;
-      WordID f_a_j = (a_j ? asp.src[a_j - 1] : kNULL);
-      InstantiateRule(f_a_j, asp.trg[j], &r);
-      if (tmodel.DecrementRule(r))
-        up0.Decrement(r);
-
-      for (unsigned prop_a_j = 0; prop_a_j <= asp.src.size(); ++prop_a_j) {
-        const WordID prop_f = (prop_a_j ? asp.src[prop_a_j - 1] : kNULL);
-        InstantiateRule(prop_f, asp.trg[j], &r);
-        ss[prop_a_j] = tmodel.RuleProbability(r);
-      }
-      a_j = prng->SelectSample(ss);
-      f_a_j = (a_j ? asp.src[a_j - 1] : kNULL);
-      InstantiateRule(f_a_j, asp.trg[j], &r);
-      if (tmodel.IncrementRule(r))
-        up0.Increment(r);
-    }
-  }
-  cerr << "  LLH = " << tmodel.Likelihood() << endl;
-}
-
-void ExtractLetters(const set<WordID>& v, vector<vector<WordID> >* l, set<WordID>* letset = NULL) {
-  for (set<WordID>::const_iterator it = v.begin(); it != v.end(); ++it) {
-    if (*it >= l->size()) { l->resize(*it + 1); }
-    vector<WordID>& letters = (*l)[*it];
-    if (letters.size()) continue;   // if e and f have the same word
-
-    const string& w = TD::Convert(*it);
-
-    size_t cur = 0;
-    while (cur < w.size()) {
-      const size_t len = UTF8Len(w[cur]);
-      letters.push_back(TD::Convert(w.substr(cur, len)));
-      if (letset) letset->insert(letters.back());
-      cur += len;
-    }
-  }
-}
-
-void Debug(const AlignedSentencePair& asp) {
-  cerr << TD::GetString(asp.src) << endl << TD::GetString(asp.trg) << endl;
-  Array2D<bool> a(asp.src.size(), asp.trg.size());
-  for (unsigned j = 0; j < asp.trg.size(); ++j)
-    if (asp.a[j].src_index) a(asp.a[j].src_index - 1, j) = true;
-  cerr << a << endl;
-}
-
-void AddSample(AlignedSentencePair* asp) {
-  for (unsigned j = 0; j < asp->trg.size(); ++j)
-    asp->posterior(asp->a[j].src_index, j)++;
-}
-
-void WriteAlignments(const AlignedSentencePair& asp) {
-  bool first = true;
-  for (unsigned j = 0; j < asp.trg.size(); ++j) {
-    int src_index = -1;
-    int mc = -1;
-    for (unsigned i = 0; i <= asp.src.size(); ++i) {
-      if (asp.posterior(i, j) > mc) {
-        mc = asp.posterior(i, j);
-        src_index = i;
-      }
-    }
-
-    if (src_index) {
-      if (first) first = false; else cout << ' ';
-      cout << (src_index - 1) << '-' << j;
-    }
-  }
-  cout << endl;
-}
-
-int main(int argc, char** argv) {
-  po::variables_map conf;
-  InitCommandLine(argc, argv, &conf);
-
-  if (conf.count("random_seed"))
-    prng.reset(new MT19937(conf["random_seed"].as<uint32_t>()));
-  else
-    prng.reset(new MT19937);
-//  MT19937& rng = *prng;
-
-  vector<vector<WordID> > corpuse, corpusf;
-  set<WordID> vocabe, vocabf;
-  corpus::ReadParallelCorpus(conf["input"].as<string>(), &corpusf, &corpuse, &vocabf, &vocabe);
-  cerr << "f-Corpus size: " << corpusf.size() << " sentences\n";
-  cerr << "f-Vocabulary size: " << vocabf.size() << " types\n";
-  cerr << "f-Corpus size: " << corpuse.size() << " sentences\n";
-  cerr << "f-Vocabulary size: " << vocabe.size() << " types\n";
-  assert(corpusf.size() == corpuse.size());
-
-  vector<AlignedSentencePair> corpus(corpuse.size());
-  for (unsigned i = 0; i < corpuse.size(); ++i) {
-    corpus[i].src.swap(corpusf[i]);
-    corpus[i].trg.swap(corpuse[i]);
-    corpus[i].posterior.resize(corpus[i].src.size() + 1, corpus[i].trg.size());
-  }
-  corpusf.clear(); corpuse.clear();
-
-  vocabf.insert(TD::Convert("NULL"));
-  vector<vector<WordID> > letters(TD::NumWords());
-  set<WordID> letset;
-  ExtractLetters(vocabe, &letters, &letset);
-  ExtractLetters(vocabf, &letters, NULL);
-  letters[TD::Convert("NULL")].clear();
-
-  BasicLexicalAlignment x(letters, vocabe.size(), letset.size(), &corpus);
-  x.InitializeRandom();
-  const unsigned samples = conf["samples"].as<unsigned>();
-  for (int i = 0; i < samples; ++i) {
-    for (int j = 395; j < 397; ++j) Debug(corpus[j]);
-    cerr << i << "\t" << x.tmodel.r.size() << "\t";
-    if (i % 10 == 0) x.ResampleHyperparemeters();
-    x.ResampleCorpus();
-    if (i > (samples / 5) && (i % 10 == 9)) for (int j = 0; j < corpus.size(); ++j) AddSample(&corpus[j]);
-  }
-  for (unsigned i = 0; i < corpus.size(); ++i)
-    WriteAlignments(corpus[i]);
-  //ModelAndData posterior(x, &corpus, vocabe, vocabf);
-  x.tmodel.Summary();
-  x.up0.Summary();
-
-  //posterior.Sample();
-
-  return 0;
-}
diff --git a/gi/pf/pyp_tm.cc b/gi/pf/pyp_tm.cc
index 73104fe9..bf5a6497 100644
--- a/gi/pf/pyp_tm.cc
+++ b/gi/pf/pyp_tm.cc
@@ -10,7 +10,6 @@
 #include "tdict.h"
 #include "ccrp.h"
 #include "pyp_word_model.h"
-
 #include "tied_resampler.h"
 
 using namespace std;
@@ -18,7 +17,7 @@ using namespace std::tr1;
 
 template <typename Base>
 struct ConditionalPYPWordModel {
-  ConditionalPYPWordModel(Base* b) : base(*b) {}
+  ConditionalPYPWordModel(Base* b) : base(*b), btr(3) {}
 
   void Summary() const {
     cerr << "Number of conditioning contexts: " << r.size() << endl;
@@ -32,6 +31,7 @@ struct ConditionalPYPWordModel {
   void ResampleHyperparameters(MT19937* rng) {
     for (RuleModelHash::iterator it = r.begin(); it != r.end(); ++it)
       it->second.resample_hyperparameters(rng);
+    btr.ResampleHyperparameters(rng);
   }
 
   prob_t Prob(const WordID src, const vector<WordID>& trglets) const {
@@ -72,7 +72,9 @@ struct ConditionalPYPWordModel {
     return r.size();
   }
 
+  // TODO tie PYP hyperparameters based on source word frequency bins
   Base& base;
+  BinTiedResampler<CCRP<vector<WordID> > > btr;
   typedef unordered_map<WordID, CCRP<vector<WordID> > > RuleModelHash;
   RuleModelHash r;
 };

diff --git a/gi/pf/quasi_model2.h b/gi/pf/quasi_model2.h
index 0095289f..8ec0a400 100644
--- a/gi/pf/quasi_model2.h
+++ b/gi/pf/quasi_model2.h
@@ -3,44 +3,113 @@
 
 #include <vector>
 #include <cmath>
+#include <tr1/unordered_map>
+#include "boost/functional.hpp"
 #include "prob.h"
 #include "array2d.h"
 
+struct AlignmentObservation {
+  AlignmentObservation() : src_len(), trg_len(), j(), a_j() {}
+  AlignmentObservation(unsigned sl, unsigned tl, unsigned tw, unsigned sw) :
+      src_len(sl), trg_len(tl), j(tw), a_j(sw) {}
+  unsigned short src_len;
+  unsigned short trg_len;
+  unsigned short j;
+  unsigned short a_j;
+};
+
+inline size_t hash_value(const AlignmentObservation& o) {
+  return reinterpret_cast<const size_t&>(o);
+}
+
+inline bool operator==(const AlignmentObservation& a, const AlignmentObservation& b) {
+  return hash_value(a) == hash_value(b);
+}
+
 struct QuasiModel2 {
   explicit QuasiModel2(double alpha, double pnull = 0.1) :
       alpha_(alpha),
       pnull_(pnull),
-      pnotnull_(1 - pnull),
-      z_(1000,1000) {}
+      pnotnull_(1 - pnull) {}
+
   // a_j = 0 => NULL; src_len does *not* include null
-  prob_t Pa_j(unsigned a_j, unsigned j, unsigned src_len, unsigned trg_len) const {
+  prob_t Prob(unsigned a_j, unsigned j, unsigned src_len, unsigned trg_len) const {
     if (!a_j) return pnull_;
-    std::vector<prob_t>& zv = z_(src_len, trg_len);
-    if (zv.size() == 0)
-      zv.resize(trg_len);
-
-    prob_t& z = zv[j];
-    if (z.is_0()) z = ComputeZ(j, src_len, trg_len);
-
-    prob_t p;
-    p.logeq(-fabs(double(a_j - 1) / src_len - double(j) / trg_len) * alpha_);
-    p *= pnotnull_;
-    p /= z;
+    return pnotnull_ *
+        prob_t(UnnormalizedProb(a_j, j, src_len, trg_len, alpha_) / GetOrComputeZ(j, src_len, trg_len));
+  }
+
+  void Increment(unsigned a_j, unsigned j, unsigned src_len, unsigned trg_len) {
+    assert(a_j <= src_len);
+    assert(j < trg_len);
+    ++obs_[AlignmentObservation(src_len, trg_len, j, a_j)];
+  }
+
+  void Decrement(unsigned a_j, unsigned j, unsigned src_len, unsigned trg_len) {
+    const AlignmentObservation ao(src_len, trg_len, j, a_j);
+    int& cc = obs_[ao];
+    assert(cc > 0);
+    --cc;
+    if (!cc) obs_.erase(ao);
+  }
+
+  prob_t Likelihood() const {
+    return Likelihood(alpha_, pnull_.as_float());
+  }
+
+  prob_t Likelihood(double alpha, double ppnull) const {
+    const prob_t pnull(ppnull);
+    const prob_t pnotnull(1 - ppnull);
+
+    prob_t p = prob_t::One();
+    for (ObsCount::const_iterator it = obs_.begin(); it != obs_.end(); ++it) {
+      const AlignmentObservation& ao = it->first;
+      if (ao.a_j) {
+        double u = UnnormalizedProb(ao.a_j, ao.j, ao.src_len, ao.trg_len, alpha);
+        double z = ComputeZ(ao.j, ao.src_len, ao.trg_len, alpha);
+        prob_t pa(u / z);
+        pa *= pnotnull;
+        pa.poweq(it->second);
+        p *= pa;
+      } else {
+        p *= pnull.pow(it->second);
+      }
+    }
     return p;
   }
+
  private:
-  prob_t ComputeZ(unsigned j, unsigned src_len, unsigned trg_len) const {
-    prob_t p, z = prob_t::Zero();
-    for (int a_j = 1; a_j <= src_len; ++a_j) {
-      p.logeq(-fabs(double(a_j - 1) / src_len - double(j) / trg_len) * alpha_);
-      z += p;
-    }
+  static double UnnormalizedProb(unsigned a_j, unsigned j, unsigned src_len, unsigned trg_len, double alpha) {
+    return exp(-fabs(double(a_j - 1) / src_len - double(j) / trg_len) * alpha);
+  }
+
+  static double ComputeZ(unsigned j, unsigned src_len, unsigned trg_len, double alpha) {
+    double z = 0;
+    for (int a_j = 1; a_j <= src_len; ++a_j)
+      z += UnnormalizedProb(a_j, j, src_len, trg_len, alpha);
     return z;
   }
+
+  const double& GetOrComputeZ(unsigned j, unsigned src_len, unsigned trg_len) const {
+    if (src_len >= zcache_.size())
+      zcache_.resize(src_len + 1);
+    if (trg_len >= zcache_[src_len].size())
+      zcache_[src_len].resize(trg_len + 1);
+    std::vector<double>& zv = zcache_[src_len][trg_len];
+    if (zv.size() == 0)
+      zv.resize(trg_len);
+    double& z = zv[j];
+    if (!z)
+      z = ComputeZ(j, src_len, trg_len, alpha_);
+    return z;
+  }
+
   double alpha_;
-  const prob_t pnull_;
-  const prob_t pnotnull_;
-  mutable Array2D<std::vector<prob_t> > z_;
+  prob_t pnull_;
+  prob_t pnotnull_;
+  mutable std::vector<std::vector<std::vector<double> > > zcache_;
+  typedef std::tr1::unordered_map<AlignmentObservation, int, boost::hash<AlignmentObservation> > ObsCount;
+  ObsCount obs_;
 };
 
 #endif
diff --git a/gi/pf/tied_resampler.h b/gi/pf/tied_resampler.h
index 208fb9c7..5a262f9d 100644
--- a/gi/pf/tied_resampler.h
+++ b/gi/pf/tied_resampler.h
@@ -2,6 +2,7 @@
 #define _TIED_RESAMPLER_H_
 
 #include <set>
+#include <vector>
 #include "sampler.h"
 #include "slice_sampler.h"
 #include "m.h"
@@ -28,6 +29,10 @@ struct TiedResampler {
     crps.erase(crp);
   }
 
+  size_t size() const {
+    return crps.size();
+  }
+
   double LogLikelihood(double d, double s) const {
     if (s <= -d) return -std::numeric_limits<double>::infinity();
     double llh = Md::log_beta_density(d, d_alpha, d_beta) +
@@ -54,6 +59,7 @@ struct TiedResampler {
   };
 
   void ResampleHyperparameters(MT19937* rng, const unsigned nloop = 5, const unsigned niterations = 10) {
+    if (size() == 0) { std::cerr << "EMPTY - not resampling\n"; return; }
     const DiscountResampler dr(*this);
     const AlphaResampler ar(*this);
     for (int iter = 0; iter < nloop; ++iter) {
@@ -79,4 +85,29 @@ struct TiedResampler {
   double discount, strength;
 };
 
+// split according to some criterion
+template <class CRP>
+struct BinTiedResampler {
+  explicit BinTiedResampler(unsigned nbins) :
+      resamplers(nbins, TiedResampler<CRP>(1,1,1,1)) {}
+
+  void Add(unsigned bin, CRP* crp) {
+    resamplers[bin].Add(crp);
+  }
+
+  void Remove(unsigned bin, CRP* crp) {
+    resamplers[bin].Remove(crp);
+  }
+
+  void ResampleHyperparameters(MT19937* rng) {
+    for (unsigned i = 0; i < resamplers.size(); ++i) {
+      std::cerr << "BIN " << i << " (" << resamplers[i].size() << " CRPs): " << std::flush;
+      resamplers[i].ResampleHyperparameters(rng);
+    }
+  }
+
+ private:
+  std::vector<TiedResampler<CRP> > resamplers;
+};
+
 #endif
-- 
cgit v1.2.3
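The refactored QuasiModel2 also records sufficient statistics (counts of AlignmentObservation tuples) rather than only answering queries, which is what makes Likelihood(alpha, p_null) computable and sets up the hyperparameter inference in the next commit. The distribution itself is unchanged; a standalone sketch of what Prob() returns for a toy sentence pair, using the alpha = 4, p_null = 0.08 values wired into the aligner above:

    // p(a_j = 0) = p_null; for a_j in 1..src_len,
    // p(a_j) ∝ (1 - p_null) * exp(-alpha * |(a_j-1)/src_len - j/trg_len|),
    // mirroring UnnormalizedProb/ComputeZ above, without the Z cache.
    #include <cmath>
    #include <cstdio>

    double unnormalized(unsigned a_j, unsigned j, unsigned src_len,
                        unsigned trg_len, double alpha) {
      return exp(-fabs(double(a_j - 1) / src_len - double(j) / trg_len) * alpha);
    }

    int main() {
      const double alpha = 4.0, p_null = 0.08;
      const unsigned src_len = 5, trg_len = 6, j = 2;  // toy sentence pair
      double z = 0;
      for (unsigned a = 1; a <= src_len; ++a)
        z += unnormalized(a, j, src_len, trg_len, alpha);
      printf("p(a_j=0) = %g\n", p_null);
      for (unsigned a = 1; a <= src_len; ++a)
        printf("p(a_j=%u) = %g\n", a,
               (1 - p_null) * unnormalized(a, j, src_len, trg_len, alpha) / z);
      return 0;
    }

Positions near the diagonal (a_j - 1)/src_len ≈ j/trg_len get the most mass, and larger alpha sharpens the peak.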
From 289f96779e665ba24adca3461a624c68aa37bd99 Mon Sep 17 00:00:00 2001
From: Chris Dyer
Date: Sat, 10 Mar 2012 14:10:04 -0500
Subject: do Bayesian inference on quasimodel2 hyperparameters

---
 gi/pf/align-lexonly-pyp.cc |  5 ++--
 gi/pf/pyp_lm.cc            |  2 +-
 gi/pf/pyp_tm.cc            | 11 +++++----
 gi/pf/quasi_model2.h       | 57 +++++++++++++++++++++++++++++++++++++++++++---
 gi/pf/tied_resampler.h     | 11 +++++++++
 5 files changed, 75 insertions(+), 11 deletions(-)

diff --git a/gi/pf/align-lexonly-pyp.cc b/gi/pf/align-lexonly-pyp.cc
index 68cb9192..6c054753 100644
--- a/gi/pf/align-lexonly-pyp.cc
+++ b/gi/pf/align-lexonly-pyp.cc
@@ -74,6 +74,7 @@ struct Aligner {
 
   void ResampleHyperparameters() {
     model.ResampleHyperparameters(prng);
+    paj_model.ResampleHyperparameters(prng);
   }
 
   void InitializeRandom() {
@@ -216,9 +217,9 @@ int main(int argc, char** argv) {
   const unsigned samples = conf["samples"].as<unsigned>();
   for (int i = 0; i < samples; ++i) {
     for (int j = 65; j < 67; ++j) Debug(corpus[j]);
-    if (i % 7 == 6) aligner.ResampleHyperparameters();
+    if (i % 10 == 9) aligner.ResampleHyperparameters();
     aligner.ResampleCorpus();
-    if (i > (samples / 5) && (i % 10 == 9)) for (int j = 0; j < corpus.size(); ++j) AddSample(&corpus[j]);
+    if (i > (samples / 5) && (i % 6 == 5)) for (int j = 0; j < corpus.size(); ++j) AddSample(&corpus[j]);
   }
   for (unsigned i = 0; i < corpus.size(); ++i)
     WriteAlignments(corpus[i]);

diff --git a/gi/pf/pyp_lm.cc b/gi/pf/pyp_lm.cc
index 85635b8f..91029688 100644
--- a/gi/pf/pyp_lm.cc
+++ b/gi/pf/pyp_lm.cc
@@ -113,7 +113,7 @@ template <unsigned N> struct PYPLM {
     typename unordered_map<vector<WordID>, CCRP<WordID>, boost::hash<vector<WordID> > >::const_iterator it;
     for (it = p.begin(); it != p.end(); ++it)
       llh += it->second.log_crp_prob();
-    // TODO parametric likelihood from TiedResampler
+    llh += tr.LogLikelihood();
     return llh;
   }

diff --git a/gi/pf/pyp_tm.cc b/gi/pf/pyp_tm.cc
index bf5a6497..34ef0ba2 100644
--- a/gi/pf/pyp_tm.cc
+++ b/gi/pf/pyp_tm.cc
@@ -17,7 +17,7 @@ using namespace std::tr1;
 
 template <typename Base>
 struct ConditionalPYPWordModel {
-  ConditionalPYPWordModel(Base* b) : base(*b), btr(3) {}
+  ConditionalPYPWordModel(Base* b) : base(*b), btr(2) {}
 
   void Summary() const {
     cerr << "Number of conditioning contexts: " << r.size() << endl;
@@ -29,8 +29,6 @@ struct ConditionalPYPWordModel {
   }
 
   void ResampleHyperparameters(MT19937* rng) {
-    for (RuleModelHash::iterator it = r.begin(); it != r.end(); ++it)
-      it->second.resample_hyperparameters(rng);
     btr.ResampleHyperparameters(rng);
   }
 
@@ -45,8 +43,11 @@ struct ConditionalPYPWordModel {
 
   void Increment(const WordID src, const vector<WordID>& trglets, MT19937* rng) {
     RuleModelHash::iterator it = r.find(src);
-    if (it == r.end())
-      it = r.insert(make_pair(src, CCRP<vector<WordID> >(1,1,1,1,0.5,1.0))).first;
+    if (it == r.end()) {
+      it = r.insert(make_pair(src, CCRP<vector<WordID> >(0.5,1.0))).first;
+      static const WordID kNULL = TD::Convert("NULL");
+      btr.Add(src == kNULL ? 0 : 1, &it->second);
+    }
     if (it->second.increment(trglets, base(trglets), rng))
       base.Increment(trglets, rng);
   }
diff --git a/gi/pf/quasi_model2.h b/gi/pf/quasi_model2.h
index 8ec0a400..588c8f84 100644
--- a/gi/pf/quasi_model2.h
+++ b/gi/pf/quasi_model2.h
@@ -7,6 +7,8 @@
 #include "boost/functional.hpp"
 #include "prob.h"
 #include "array2d.h"
+#include "slice_sampler.h"
+#include "m.h"
 
 struct AlignmentObservation {
   AlignmentObservation() : src_len(), trg_len(), j(), a_j() {}
@@ -53,6 +55,37 @@ struct QuasiModel2 {
     if (!cc) obs_.erase(ao);
   }
 
+  struct PNullResampler {
+    PNullResampler(const QuasiModel2& m) : m_(m) {}
+    const QuasiModel2& m_;
+    double operator()(const double& proposed_pnull) const {
+      return log(m_.Likelihood(m_.alpha_, proposed_pnull));
+    }
+  };
+
+  struct AlphaResampler {
+    AlphaResampler(const QuasiModel2& m) : m_(m) {}
+    const QuasiModel2& m_;
+    double operator()(const double& proposed_alpha) const {
+      return log(m_.Likelihood(proposed_alpha, m_.pnull_.as_float()));
+    }
+  };
+
+  void ResampleHyperparameters(MT19937* rng, const unsigned nloop = 5, const unsigned niterations = 10) {
+    const PNullResampler dr(*this);
+    const AlphaResampler ar(*this);
+    for (unsigned i = 0; i < nloop; ++i) {
+      double pnull = slice_sampler1d(dr, pnull_.as_float(), *rng, 0.00000001,
+                                     1.0, 0.0, niterations, 100*niterations);
+      pnull_ = prob_t(pnull);
+      alpha_ = slice_sampler1d(ar, alpha_, *rng, 0.00000001,
+                               std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations);
+    }
+    std::cerr << "QuasiModel2(alpha=" << alpha_ << ",p_null="
+              << pnull_.as_float() << ") = " << Likelihood() << std::endl;
+    zcache_.clear();
+  }
+
   prob_t Likelihood() const {
     return Likelihood(alpha_, pnull_.as_float());
   }
@@ -61,12 +94,17 @@ struct QuasiModel2 {
     const prob_t pnull(ppnull);
     const prob_t pnotnull(1 - ppnull);
 
-    prob_t p = prob_t::One();
+    prob_t p;
+    p.logeq(Md::log_gamma_density(alpha, 0.1, 25));  // TODO configure
+    assert(!p.is_0());
+    prob_t prob_of_ppnull; prob_of_ppnull.logeq(Md::log_beta_density(ppnull, 2, 10));
+    assert(!prob_of_ppnull.is_0());
+    p *= prob_of_ppnull;
     for (ObsCount::const_iterator it = obs_.begin(); it != obs_.end(); ++it) {
       const AlignmentObservation& ao = it->first;
       if (ao.a_j) {
-        double u = UnnormalizedProb(ao.a_j, ao.j, ao.src_len, ao.trg_len, alpha);
-        double z = ComputeZ(ao.j, ao.src_len, ao.trg_len, alpha);
+        prob_t u = XUnnormalizedProb(ao.a_j, ao.j, ao.src_len, ao.trg_len, alpha);
+        prob_t z = XComputeZ(ao.j, ao.src_len, ao.trg_len, alpha);
         prob_t pa(u / z);
         pa *= pnotnull;
         pa.poweq(it->second);
@@ -79,6 +117,19 @@ struct QuasiModel2 {
   }
 
  private:
+  static prob_t XUnnormalizedProb(unsigned a_j, unsigned j, unsigned src_len, unsigned trg_len, double alpha) {
+    prob_t p;
+    p.logeq(-fabs(double(a_j - 1) / src_len - double(j) / trg_len) * alpha);
+    return p;
+  }
+
+  static prob_t XComputeZ(unsigned j, unsigned src_len, unsigned trg_len, double alpha) {
+    prob_t z = prob_t::Zero();
+    for (int a_j = 1; a_j <= src_len; ++a_j)
+      z += XUnnormalizedProb(a_j, j, src_len, trg_len, alpha);
+    return z;
+  }
+
   static double UnnormalizedProb(unsigned a_j, unsigned j, unsigned src_len, unsigned trg_len, double alpha) {
     return exp(-fabs(double(a_j - 1) / src_len - double(j) / trg_len) * alpha);
   }

diff --git a/gi/pf/tied_resampler.h b/gi/pf/tied_resampler.h
index 5a262f9d..6f45fbce 100644
--- a/gi/pf/tied_resampler.h
+++ b/gi/pf/tied_resampler.h
@@ -42,6 +42,10 @@ struct TiedResampler {
     return llh;
   }
 
+  double LogLikelihood() const {
+    return LogLikelihood(discount, strength);
+  }
+
   struct DiscountResampler {
     DiscountResampler(const TiedResampler& m) : m_(m) {}
     const TiedResampler& m_;
@@ -106,6 +110,13 @@ struct BinTiedResampler {
     }
   }
 
+  double LogLikelihood() const {
+    double llh = 0;
+    for (unsigned i = 0; i < resamplers.size(); ++i)
+      llh += resamplers[i].LogLikelihood();
+    return llh;
+  }
+
  private:
   std::vector<TiedResampler<CRP> > resamplers;
 };
-- 
cgit v1.2.3
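Every hyperparameter move in this commit, alpha and p_null here and the tied (discount, strength) pairs elsewhere, goes through slice_sampler1d. For reference, a compact sketch of one-dimensional slice sampling with stepping-out and shrinkage in the style of Neal (2003); this illustrates the algorithm and is not the repository's slice_sampler.h:

    #include <cmath>
    #include <cstdlib>

    double uniform01() { return (rand() + 0.5) / (RAND_MAX + 1.0); }

    // log_f: log density up to a constant; x0: current value; w: step size;
    // lower/upper: hard support bounds (e.g. discount in (0,1)).
    template <class LogF>
    double slice_sample_1d(const LogF& log_f, double x0, double w,
                           double lower, double upper) {
      const double log_y = log_f(x0) + log(uniform01());  // slice height
      double left = x0 - w * uniform01();                 // random window
      double right = left + w;
      while (left > lower && log_f(left) > log_y) left -= w;    // step out
      while (right < upper && log_f(right) > log_y) right += w;
      if (left < lower) left = lower;
      if (right > upper) right = upper;
      for (;;) {                                          // shrink to accept
        double x1 = left + uniform01() * (right - left);
        if (log_f(x1) > log_y) return x1;
        if (x1 < x0) left = x1; else right = x1;
      }
    }

Because the move only needs the log density up to a constant, the Likelihood(alpha, ppnull) function above (prior densities times the observation terms) is exactly the right callable to hand it.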
From dfbc278c1057555fda9312291c8024049e00b7d8 Mon Sep 17 00:00:00 2001
From: Chris Dyer
Date: Sat, 10 Mar 2012 16:42:12 -0500
Subject: frequency-based binning

---
 decoder/Makefile.am        |  1 -
 decoder/ff_csplit.cc       |  2 +-
 decoder/freqdict.cc        | 29 -----------------------------
 decoder/freqdict.h         | 37 ++++++++++++++++++++++++++++++-----
 gi/pf/align-lexonly-pyp.cc | 24 +++++++++++++++++-------
 gi/pf/make-freq-bins.pl    | 26 ++++++++++++++++++++++++++
 gi/pf/pyp_tm.cc            | 24 +++++++++++++++++-------
 gi/pf/pyp_tm.h             |  7 ++++---
 8 files changed, 97 insertions(+), 53 deletions(-)
 delete mode 100644 decoder/freqdict.cc
 create mode 100755 gi/pf/make-freq-bins.pl

diff --git a/decoder/Makefile.am b/decoder/Makefile.am
index a00b18af..ec51d643 100644
--- a/decoder/Makefile.am
+++ b/decoder/Makefile.am
@@ -76,7 +76,6 @@ libcdec_a_SOURCES = \
   ff_source_syntax.cc \
   ff_bleu.cc \
   ff_factory.cc \
-  freqdict.cc \
   lexalign.cc \
   lextrans.cc \
   tagger.cc \

diff --git a/decoder/ff_csplit.cc b/decoder/ff_csplit.cc
index 3991d38f..c9ed996c 100644
--- a/decoder/ff_csplit.cc
+++ b/decoder/ff_csplit.cc
@@ -72,7 +72,7 @@ struct BasicCSplitFeaturesImpl {
   const int fl1_;
   const int fl2_;
   const int bad_;
-  FreqDict freq_dict_;
+  FreqDict<float> freq_dict_;
   set<WordID> bad_words_;
 };

diff --git a/decoder/freqdict.cc b/decoder/freqdict.cc
deleted file mode 100644
index 9e25d346..00000000
--- a/decoder/freqdict.cc
+++ /dev/null
@@ -1,29 +0,0 @@
-#include <iostream>
-#include <map>
-#include <cassert>
-#include "freqdict.h"
-#include "tdict.h"
-#include "filelib.h"
-
-using namespace std;
-
-void FreqDict::Load(const std::string& fname) {
-  cerr << "Reading word frequencies: " << fname << endl;
-  ReadFile rf(fname);
-  istream& ifs = *rf.stream();
-  int cc=0;
-  while (ifs) {
-    std::string word;
-    ifs >> word;
-    if (word.size() == 0) continue;
-    if (word[0] == '#') continue;
-    double count = 0;
-    ifs >> count;
-    assert(count > 0.0);  // use -log(f)
-    counts_[TD::Convert(word)]=count;
-    ++cc;
-    if (cc % 10000 == 0) { std::cerr << "."; }
-  }
-  std::cerr << "\n";
-  std::cerr << "Loaded " << cc << " words\n";
-}

diff --git a/decoder/freqdict.h b/decoder/freqdict.h
index 9acf0c33..4e03fadd 100644
--- a/decoder/freqdict.h
+++ b/decoder/freqdict.h
@@ -1,20 +1,47 @@
 #ifndef _FREQDICT_H_
 #define _FREQDICT_H_
 
+#include <iostream>
 #include <map>
 #include <string>
 #include "wordid.h"
+#include "filelib.h"
+#include "tdict.h"
 
+template <typename T>
 class FreqDict {
  public:
-  void Load(const std::string& fname);
-  float LookUp(const WordID& word) const {
-    std::map<WordID, float>::const_iterator i = counts_.find(word);
-    if (i == counts_.end()) return 0;
+  FreqDict() : max_() {}
+  T Max() const { return max_; }
+  void Load(const std::string& fname) {
+    std::cerr << "Reading word statistics from: " << fname << std::endl;
+    ReadFile rf(fname);
+    std::istream& ifs = *rf.stream();
+    int cc=0;
+    std::string word;
+    while (ifs) {
+      ifs >> word;
+      if (word.size() == 0) continue;
+      if (word[0] == '#') continue;
+      T count = 0;
+      ifs >> count;
+      if (count > max_) max_ = count;
+      counts_[TD::Convert(word)]=count;
+      ++cc;
+      if (cc % 10000 == 0) { std::cerr << "."; }
+    }
+    std::cerr << "\n";
+    std::cerr << "Loaded " << cc << " words\n";
+  }
+
+  T LookUp(const WordID& word) const {
+    typename std::map<WordID, T>::const_iterator i = counts_.find(word);
+    if (i == counts_.end()) return T();
     return i->second;
   }
  private:
-  std::map<WordID, float> counts_;
+  T max_;
+  std::map<WordID, T> counts_;
 };
 
 #endif
diff --git a/gi/pf/align-lexonly-pyp.cc b/gi/pf/align-lexonly-pyp.cc
index 6c054753..942dcf51 100644
--- a/gi/pf/align-lexonly-pyp.cc
+++ b/gi/pf/align-lexonly-pyp.cc
@@ -20,6 +20,9 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
   po::options_description opts("Configuration options");
   opts.add_options()
         ("samples,s",po::value<unsigned>()->default_value(1000),"Number of samples")
+        ("infer_alignment_hyperparameters,I", "Infer alpha and p_null, otherwise fixed values will be assumed")
+        ("p_null,0", po::value<double>()->default_value(0.08), "probability of aligning to null")
+        ("align_alpha,a", po::value<double>()->default_value(4.0), "how 'tight' is the bias toward be along the diagonal?")
         ("input,i",po::value<string>(),"Read parallel data from")
         ("random_seed,S",po::value<uint32_t>(), "Random seed");
   po::options_description clo("Command line options");
@@ -59,9 +62,13 @@ struct AlignedSentencePair {
 };
 
 struct Aligner {
-  Aligner(const vector<vector<WordID> >& lets, int num_letters, vector<AlignedSentencePair>* c) :
+  Aligner(const vector<vector<WordID> >& lets,
+          int num_letters,
+          const po::variables_map& conf,
+          vector<AlignedSentencePair>* c) :
       corpus(*c),
-      paj_model(4, 0.08),
+      paj_model(conf["align_alpha"].as<double>(), conf["p_null"].as<double>()),
+      infer_paj(conf.count("infer_alignment_hyperparameters") > 0),
       model(lets, num_letters),
       kNULL(TD::Convert("NULL")) {
     assert(lets[kNULL].size() == 0);
@@ -69,12 +76,13 @@ struct Aligner {
 
   vector<AlignedSentencePair>& corpus;
   QuasiModel2 paj_model;
+  const bool infer_paj;
   PYPLexicalTranslation model;
   const WordID kNULL;
 
   void ResampleHyperparameters() {
     model.ResampleHyperparameters(prng);
-    paj_model.ResampleHyperparameters(prng);
+    if (infer_paj) paj_model.ResampleHyperparameters(prng);
   }
 
   void InitializeRandom() {
@@ -117,8 +125,6 @@ struct Aligner {
         paj_model.Increment(a_j, j, asp.src.size(), asp.trg.size());
       }
     }
-    cerr << "LLH = " << Likelihood() << " \t(Amodel=" << paj_model.Likelihood()
-         << " TModel=" << model.Likelihood() << ") contexts=" << model.UniqueConditioningContexts() << endl;
   }
 
   prob_t Likelihood() const {
@@ -211,13 +217,17 @@ int main(int argc, char** argv) {
   ExtractLetters(vocabf, &letters, NULL);
   letters[TD::Convert("NULL")].clear();
 
-  Aligner aligner(letters, letset.size(), &corpus);
+  Aligner aligner(letters, letset.size(), conf, &corpus);
   aligner.InitializeRandom();
 
   const unsigned samples = conf["samples"].as<unsigned>();
   for (int i = 0; i < samples; ++i) {
     for (int j = 65; j < 67; ++j) Debug(corpus[j]);
-    if (i % 10 == 9) aligner.ResampleHyperparameters();
+    if (i % 10 == 9) {
+      aligner.ResampleHyperparameters();
+      cerr << "LLH = " << aligner.Likelihood() << " \t(Amodel=" << aligner.paj_model.Likelihood()
+           << " TModel=" << aligner.model.Likelihood() << ") contexts=" << aligner.model.UniqueConditioningContexts() << endl;
+    }
     aligner.ResampleCorpus();
     if (i > (samples / 5) && (i % 6 == 5)) for (int j = 0; j < corpus.size(); ++j) AddSample(&corpus[j]);
   }

diff --git a/gi/pf/make-freq-bins.pl b/gi/pf/make-freq-bins.pl
new file mode 100755
index 00000000..fdcd3555
--- /dev/null
+++ b/gi/pf/make-freq-bins.pl
@@ -0,0 +1,26 @@
+#!/usr/bin/perl -w
+use strict;
+
+my $BASE = 6;
+my $CUTOFF = 3;
+
+my %d;
+my $num = 0;
+while(<>){
+ chomp;
+ my @words = split /\s+/;
+ for my $w (@words) {$d{$w}++; $num++;}
+}
+
+my @vocab = sort {$d{$b} <=> $d{$a}} keys %d;
+
+for (my $i=0; $i
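make-freq-bins.pl is truncated above after the frequency sort; of the binning logic, only $BASE = 6 and $CUTOFF = 3 survive. Given how the output is consumed on the C++ side (FreqBinner loads a "word bin" table through FreqDict<unsigned>, and BinTiedResampler is sized from Max() + 1), the lost loop evidently prints one logarithmic frequency bin per word. A hedged C++ rendering of such a rule; the exact Perl formula is an assumption:

    #include <algorithm>
    #include <cmath>

    // Frequent words get low bin ids; each successive bin covers a
    // BASE-times-smaller frequency range, capped so the bin count stays small.
    unsigned freq_bin(unsigned count, unsigned max_count,
                      double base = 6.0, double cutoff = 3.0) {
      double b = (log((double)max_count) - log((double)count)) / log(base);
      return (unsigned)std::min(b, cutoff);  // CUTOFF caps the number of bins
    }

The point of binning by frequency rather than tying everything: very frequent source words have enough customers to support their own PYP hyperparameters, while rare words must pool evidence, and a handful of log-scale bins interpolates between the two regimes.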
"conditional_pseg.h" #include "tdict.h" #include "ccrp.h" #include "pyp_word_model.h" @@ -15,9 +12,19 @@ using namespace std; using namespace std::tr1; -template +struct FreqBinner { + FreqBinner(const std::string& fname) { fd_.Load(fname); } + unsigned NumberOfBins() const { return fd_.Max() + 1; } + unsigned Bin(const WordID& w) const { return fd_.LookUp(w); } + FreqDict fd_; +}; + +template struct ConditionalPYPWordModel { - ConditionalPYPWordModel(Base* b) : base(*b), btr(2) {} + ConditionalPYPWordModel(Base* b, const Binner* bnr = NULL) : + base(*b), + binner(bnr), + btr(binner ? binner->NumberOfBins() + 1u : 2u) {} void Summary() const { cerr << "Number of conditioning contexts: " << r.size() << endl; @@ -46,7 +53,9 @@ struct ConditionalPYPWordModel { if (it == r.end()) { it = r.insert(make_pair(src, CCRP >(0.5,1.0))).first; static const WordID kNULL = TD::Convert("NULL"); - btr.Add(src == kNULL ? 0 : 1, &it->second); + unsigned bin = (src == kNULL ? 0 : 1); + if (binner && bin) { bin = binner->Bin(src) + 1; } + btr.Add(bin, &it->second); } if (it->second.increment(trglets, base(trglets), rng)) base.Increment(trglets, rng); @@ -75,6 +84,7 @@ struct ConditionalPYPWordModel { // TODO tie PYP hyperparameters based on source word frequency bins Base& base; + const Binner* binner; BinTiedResampler > > btr; typedef unordered_map > > RuleModelHash; RuleModelHash r; @@ -84,7 +94,7 @@ PYPLexicalTranslation::PYPLexicalTranslation(const vector >& lets const unsigned num_letters) : letters(lets), up0(new PYPWordModel(num_letters)), - tmodel(new ConditionalPYPWordModel(up0)), + tmodel(new ConditionalPYPWordModel(up0, new FreqBinner("10k.freq"))), kX(-TD::Convert("X")) {} void PYPLexicalTranslation::Summary() const { diff --git a/gi/pf/pyp_tm.h b/gi/pf/pyp_tm.h index fa0fb28f..63e7c96d 100644 --- a/gi/pf/pyp_tm.h +++ b/gi/pf/pyp_tm.h @@ -5,10 +5,11 @@ #include "wordid.h" #include "prob.h" #include "sampler.h" +#include "freqdict.h" -struct TRule; +struct FreqBinner; struct PYPWordModel; -template struct ConditionalPYPWordModel; +template struct ConditionalPYPWordModel; struct PYPLexicalTranslation { explicit PYPLexicalTranslation(const std::vector >& lets, @@ -26,7 +27,7 @@ struct PYPLexicalTranslation { private: const std::vector >& letters; // spelling dictionary PYPWordModel* up0; // base distribuction (model English word) - ConditionalPYPWordModel* tmodel; // translation distributions + ConditionalPYPWordModel* tmodel; // translation distributions // (model English word | French word) const WordID kX; }; -- cgit v1.2.3