From aac3ef3e3fdf636406fc61a40096cee6381e5461 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Thu, 29 Dec 2011 21:08:30 -0500 Subject: lexical alignment samplers --- gi/pf/Makefile.am | 13 +- gi/pf/align-lexonly.cc | 356 +++++++++++++++++++++++++++++++++++++++++++++++++ gi/pf/base_measures.cc | 26 ++++ gi/pf/base_measures.h | 50 ++++++- gi/pf/itg.cc | 98 +++++++++++--- gi/pf/unigrams.cc | 80 +++++++++++ gi/pf/unigrams.h | 69 ++++++++++ 7 files changed, 668 insertions(+), 24 deletions(-) create mode 100644 gi/pf/align-lexonly.cc create mode 100644 gi/pf/unigrams.cc create mode 100644 gi/pf/unigrams.h (limited to 'gi/pf') diff --git a/gi/pf/Makefile.am b/gi/pf/Makefile.am index 42758939..7c8e89d0 100644 --- a/gi/pf/Makefile.am +++ b/gi/pf/Makefile.am @@ -1,10 +1,14 @@ -bin_PROGRAMS = cbgi brat dpnaive pfbrat pfdist itg pfnaive +bin_PROGRAMS = cbgi brat dpnaive pfbrat pfdist itg pfnaive condnaive align-lexonly noinst_LIBRARIES = libpf.a -libpf_a_SOURCES = base_measures.cc reachability.cc cfg_wfst_composer.cc corpus.cc +libpf_a_SOURCES = base_measures.cc reachability.cc cfg_wfst_composer.cc corpus.cc unigrams.cc ngram_base.cc + +align_lexonly_SOURCES = align-lexonly.cc itg_SOURCES = itg.cc +condnaive_SOURCES = condnaive.cc + dpnaive_SOURCES = dpnaive.cc pfdist_SOURCES = pfdist.cc @@ -17,5 +21,6 @@ brat_SOURCES = brat.cc pfbrat_SOURCES = pfbrat.cc -AM_CPPFLAGS = -W -Wall -Wno-sign-compare -funroll-loops -I$(top_srcdir)/utils $(GTEST_CPPFLAGS) -I$(top_srcdir)/decoder -AM_LDFLAGS = libpf.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/utils/libutils.a -lz +AM_CPPFLAGS = -W -Wall -Wno-sign-compare -funroll-loops -I$(top_srcdir)/utils $(GTEST_CPPFLAGS) -I$(top_srcdir)/decoder -I$(top_srcdir)/klm + +AM_LDFLAGS = libpf.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a $(top_srcdir)/utils/libutils.a -lz diff --git a/gi/pf/align-lexonly.cc b/gi/pf/align-lexonly.cc new file mode 100644 index 00000000..91a3cfcf --- /dev/null +++ b/gi/pf/align-lexonly.cc @@ -0,0 +1,356 @@ +#include +#include +#include + +#include +#include +#include + +#include "array2d.h" +#include "base_measures.h" +#include "monotonic_pseg.h" +#include "conditional_pseg.h" +#include "trule.h" +#include "tdict.h" +#include "stringlib.h" +#include "filelib.h" +#include "dict.h" +#include "sampler.h" +#include "ccrp_nt.h" +#include "corpus.h" +#include "ngram_base.h" + +using namespace std; +using namespace tr1; +namespace po = boost::program_options; + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { + po::options_description opts("Configuration options"); + opts.add_options() + ("samples,s",po::value()->default_value(1000),"Number of samples") + ("input,i",po::value(),"Read parallel data from") + ("random_seed,S",po::value(), "Random seed"); + po::options_description clo("Command line options"); + clo.add_options() + ("config", po::value(), "Configuration file") + ("help,h", "Print this help message and exit"); + po::options_description dconfig_options, dcmdline_options; + dconfig_options.add(opts); + dcmdline_options.add(opts).add(clo); + + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + if (conf->count("config")) { + ifstream config((*conf)["config"].as().c_str()); + po::store(po::parse_config_file(config, dconfig_options), *conf); + } + po::notify(*conf); + + if (conf->count("help") || (conf->count("input") == 0)) { + cerr << dcmdline_options << endl; + exit(1); + } +} + +shared_ptr prng; + +struct LexicalAlignment { + unsigned char src_index; + bool is_transliteration; + vector > derivation; +}; + +struct AlignedSentencePair { + vector src; + vector trg; + vector a; + Array2D posterior; +}; + +struct HierarchicalUnigramBase { + explicit HierarchicalUnigramBase(const unsigned vocab_e_size) : r(5,5), u0(1.0 / vocab_e_size) {} + + // return p0 of rule.e_ + prob_t operator()(const TRule& rule) const { + prob_t p = prob_t::One(); + prob_t q; + for (unsigned i = 0; i < rule.e_.size(); ++i) { + q.logeq(r.logprob(rule.e_[i], log(u0))); + p *= q; + } + q.logeq(r.logprob(TD::Convert(""), log(u0))); + p *= q; + return p; + } + + void Increment(const TRule& rule) { + for (unsigned i = 0; i < rule.e_.size(); ++i) + r.increment(rule.e_[i]); + r.increment(TD::Convert("")); + } + + void Decrement(const TRule& rule) { + for (unsigned i = 0; i < rule.e_.size(); ++i) + r.decrement(rule.e_[i]); + r.decrement(TD::Convert("")); + } + + CCRP_NoTable r; + prob_t u0; +}; + +struct HierarchicalWordBase { + explicit HierarchicalWordBase(const unsigned vocab_e_size) : + base(prob_t::One()), r(15,15), u0(-log(vocab_e_size)) {} + + void ResampleHyperparameters(MT19937* rng) { + r.resample_hyperparameters(rng); + } + + inline double logp0(const vector& s) const { + return s.size() * u0; + } + + // return p0 of rule.e_ + prob_t operator()(const TRule& rule) const { + prob_t p; p.logeq(r.logprob(rule.e_, logp0(rule.e_))); + return p; + } + + void Increment(const TRule& rule) { + if (r.increment(rule.e_)) { + prob_t p; p.logeq(logp0(rule.e_)); + base *= p; + } + } + + void Decrement(const TRule& rule) { + if (r.decrement(rule.e_)) { + prob_t p; p.logeq(logp0(rule.e_)); + base /= p; + } + } + + prob_t Likelihood() const { + prob_t p; p.logeq(r.log_crp_prob()); + p *= base; + return p; + } + + void Summary() const { + cerr << "NUMBER OF CUSTOMERS: " << r.num_customers() << endl; + for (CCRP_NoTable >::const_iterator it = r.begin(); it != r.end(); ++it) + cerr << " " << it->second << '\t' << TD::GetString(it->first) << endl; + } + + prob_t base; + CCRP_NoTable > r; + const double u0; +}; + +struct BasicLexicalAlignment { + explicit BasicLexicalAlignment(const vector >& lets, + const unsigned letters_e, + vector* corp) : + letters(lets), + corpus(*corp), + //up0("en.chars.1gram", letters_e), + //up0("en.words.1gram"), + up0(letters_e), + //up0("en.chars.2gram"), + tmodel(up0) { + } + + void InstantiateRule(const WordID src, + const WordID trg, + TRule* rule) const { + static const WordID kX = TD::Convert("X") * -1; + rule->lhs_ = kX; + rule->e_ = letters[trg]; + rule->f_ = letters[src]; + } + + void InitializeRandom() { + const WordID kNULL = TD::Convert("NULL"); + cerr << "Initializing with random alignments ...\n"; + for (unsigned i = 0; i < corpus.size(); ++i) { + AlignedSentencePair& asp = corpus[i]; + asp.a.resize(asp.trg.size()); + for (unsigned j = 0; j < asp.trg.size(); ++j) { + const unsigned char a_j = prng->next() * (1 + asp.src.size()); + const WordID f_a_j = (a_j ? asp.src[a_j - 1] : kNULL); + TRule r; + InstantiateRule(f_a_j, asp.trg[j], &r); + asp.a[j].is_transliteration = false; + asp.a[j].src_index = a_j; + if (tmodel.IncrementRule(r)) + up0.Increment(r); + } + } + cerr << " LLH = " << Likelihood() << endl; + } + + prob_t Likelihood() const { + prob_t p = tmodel.Likelihood(); + p *= up0.Likelihood(); + return p; + } + + void ResampleHyperparemeters() { + cerr << " LLH_prev = " << Likelihood() << flush; + tmodel.ResampleHyperparameters(&*prng); + up0.ResampleHyperparameters(&*prng); + cerr << "\tLLH_post = " << Likelihood() << endl; + } + + void ResampleCorpus(); + + const vector >& letters; // spelling dictionary + vector& corpus; + //PhraseConditionalUninformativeBase up0; + //PhraseConditionalUninformativeUnigramBase up0; + //UnigramWordBase up0; + //HierarchicalUnigramBase up0; + HierarchicalWordBase up0; + //CompletelyUniformBase up0; + //FixedNgramBase up0; + //ConditionalTranslationModel tmodel; + //ConditionalTranslationModel tmodel; + //ConditionalTranslationModel tmodel; + //ConditionalTranslationModel tmodel; + ConditionalTranslationModel tmodel; + //ConditionalTranslationModel tmodel; + //ConditionalTranslationModel tmodel; +}; + +void BasicLexicalAlignment::ResampleCorpus() { + static const WordID kNULL = TD::Convert("NULL"); + for (unsigned i = 0; i < corpus.size(); ++i) { + AlignedSentencePair& asp = corpus[i]; + SampleSet ss; ss.resize(asp.src.size() + 1); + for (unsigned j = 0; j < asp.trg.size(); ++j) { + TRule r; + unsigned char& a_j = asp.a[j].src_index; + WordID f_a_j = (a_j ? asp.src[a_j - 1] : kNULL); + InstantiateRule(f_a_j, asp.trg[j], &r); + if (tmodel.DecrementRule(r)) + up0.Decrement(r); + + for (unsigned prop_a_j = 0; prop_a_j <= asp.src.size(); ++prop_a_j) { + const WordID prop_f = (prop_a_j ? asp.src[prop_a_j - 1] : kNULL); + InstantiateRule(prop_f, asp.trg[j], &r); + ss[prop_a_j] = tmodel.RuleProbability(r); + } + a_j = prng->SelectSample(ss); + f_a_j = (a_j ? asp.src[a_j - 1] : kNULL); + InstantiateRule(f_a_j, asp.trg[j], &r); + if (tmodel.IncrementRule(r)) + up0.Increment(r); + } + } + cerr << " LLH = " << tmodel.Likelihood() << endl; +} + +void ExtractLetters(const set& v, vector >* l, set* letset = NULL) { + for (set::const_iterator it = v.begin(); it != v.end(); ++it) { + vector& letters = (*l)[*it]; + if (letters.size()) continue; // if e and f have the same word + + const string& w = TD::Convert(*it); + + size_t cur = 0; + while (cur < w.size()) { + const size_t len = UTF8Len(w[cur]); + letters.push_back(TD::Convert(w.substr(cur, len))); + if (letset) letset->insert(letters.back()); + cur += len; + } + } +} + +void Debug(const AlignedSentencePair& asp) { + cerr << TD::GetString(asp.src) << endl << TD::GetString(asp.trg) << endl; + Array2D a(asp.src.size(), asp.trg.size()); + for (unsigned j = 0; j < asp.trg.size(); ++j) + if (asp.a[j].src_index) a(asp.a[j].src_index - 1, j) = true; + cerr << a << endl; +} + +void AddSample(AlignedSentencePair* asp) { + for (unsigned j = 0; j < asp->trg.size(); ++j) + asp->posterior(asp->a[j].src_index, j)++; +} + +void WriteAlignments(const AlignedSentencePair& asp) { + bool first = true; + for (unsigned j = 0; j < asp.trg.size(); ++j) { + int src_index = -1; + int mc = -1; + for (unsigned i = 0; i <= asp.src.size(); ++i) { + if (asp.posterior(i, j) > mc) { + mc = asp.posterior(i, j); + src_index = i; + } + } + + if (src_index) { + if (first) first = false; else cout << ' '; + cout << (src_index - 1) << '-' << j; + } + } + cout << endl; +} + +int main(int argc, char** argv) { + po::variables_map conf; + InitCommandLine(argc, argv, &conf); + + if (conf.count("random_seed")) + prng.reset(new MT19937(conf["random_seed"].as())); + else + prng.reset(new MT19937); +// MT19937& rng = *prng; + + vector > corpuse, corpusf; + set vocabe, vocabf; + corpus::ReadParallelCorpus(conf["input"].as(), &corpusf, &corpuse, &vocabf, &vocabe); + cerr << "f-Corpus size: " << corpusf.size() << " sentences\n"; + cerr << "f-Vocabulary size: " << vocabf.size() << " types\n"; + cerr << "f-Corpus size: " << corpuse.size() << " sentences\n"; + cerr << "f-Vocabulary size: " << vocabe.size() << " types\n"; + assert(corpusf.size() == corpuse.size()); + + vector corpus(corpuse.size()); + for (unsigned i = 0; i < corpuse.size(); ++i) { + corpus[i].src.swap(corpusf[i]); + corpus[i].trg.swap(corpuse[i]); + corpus[i].posterior.resize(corpus[i].src.size() + 1, corpus[i].trg.size()); + } + corpusf.clear(); corpuse.clear(); + + vocabf.insert(TD::Convert("NULL")); + vector > letters(TD::NumWords()); + set letset; + ExtractLetters(vocabe, &letters, &letset); + ExtractLetters(vocabf, &letters, NULL); + letters[TD::Convert("NULL")].clear(); + + BasicLexicalAlignment x(letters, letset.size(), &corpus); + x.InitializeRandom(); + const unsigned samples = conf["samples"].as(); + for (int i = 0; i < samples; ++i) { + for (int j = 431; j < 433; ++j) Debug(corpus[j]); + cerr << i << "\t" << x.tmodel.r.size() << "\t"; + if (i % 10 == 0) x.ResampleHyperparemeters(); + x.ResampleCorpus(); + if (i > (samples / 5) && (i % 10 == 9)) for (int j = 0; j < corpus.size(); ++j) AddSample(&corpus[j]); + } + for (unsigned i = 0; i < corpus.size(); ++i) + WriteAlignments(corpus[i]); + //ModelAndData posterior(x, &corpus, vocabe, vocabf); + x.tmodel.Summary(); + x.up0.Summary(); + + //posterior.Sample(); + + return 0; +} diff --git a/gi/pf/base_measures.cc b/gi/pf/base_measures.cc index 8adb37d7..97b4e698 100644 --- a/gi/pf/base_measures.cc +++ b/gi/pf/base_measures.cc @@ -6,6 +6,32 @@ using namespace std; +prob_t PhraseConditionalUninformativeUnigramBase::p0(const vector& vsrc, + const vector& vtrg, + int start_src, int start_trg) const { + const int flen = vsrc.size() - start_src; + const int elen = vtrg.size() - start_trg; + prob_t p; + p.logeq(log_poisson(elen, flen + 0.01)); // elen | flen ~Pois(flen + 0.01) + //p.logeq(log_poisson(elen, 1)); // elen | flen ~Pois(flen + 0.01) + for (int i = 0; i < elen; ++i) + p *= u(vtrg[i + start_trg]); // draw e_i ~Uniform + return p; +} + +prob_t PhraseConditionalUninformativeBase::p0(const vector& vsrc, + const vector& vtrg, + int start_src, int start_trg) const { + const int flen = vsrc.size() - start_src; + const int elen = vtrg.size() - start_trg; + prob_t p; + //p.logeq(log_poisson(elen, flen + 0.01)); // elen | flen ~Pois(flen + 0.01) + p.logeq(log_poisson(elen, 1)); // elen | flen ~Pois(flen + 0.01) + for (int i = 0; i < elen; ++i) + p *= kUNIFORM_TARGET; // draw e_i ~Uniform + return p; +} + void Model1::LoadModel1(const string& fname) { cerr << "Loading Model 1 parameters from " << fname << " ..." << endl; ReadFile rf(fname); diff --git a/gi/pf/base_measures.h b/gi/pf/base_measures.h index 7ce7e2e6..fbd1c3ad 100644 --- a/gi/pf/base_measures.h +++ b/gi/pf/base_measures.h @@ -7,6 +7,7 @@ #include #include +#include "unigrams.h" #include "trule.h" #include "prob.h" #include "tdict.h" @@ -49,6 +50,51 @@ struct Model1 { std::vector > ttable; }; +struct CompletelyUniformBase { + explicit CompletelyUniformBase(const unsigned ves) : kUNIFORM(1.0 / ves) {} + prob_t operator()(const TRule&) const { + return kUNIFORM; + } + const prob_t kUNIFORM; +}; + +struct UnigramWordBase { + explicit UnigramWordBase(const std::string& fname) : un(fname) {} + prob_t operator()(const TRule& r) const { + return un(r.e_); + } + const UnigramWordModel un; +}; + +struct PhraseConditionalUninformativeBase { + explicit PhraseConditionalUninformativeBase(const unsigned vocab_e_size) : + kUNIFORM_TARGET(1.0 / vocab_e_size) { + assert(vocab_e_size > 0); + } + + // return p0 of rule.e_ | rule.f_ + prob_t operator()(const TRule& rule) const { + return p0(rule.f_, rule.e_, 0, 0); + } + + prob_t p0(const std::vector& vsrc, const std::vector& vtrg, int start_src, int start_trg) const; + + const prob_t kUNIFORM_TARGET; +}; + +struct PhraseConditionalUninformativeUnigramBase { + explicit PhraseConditionalUninformativeUnigramBase(const std::string& file, const unsigned vocab_e_size) : u(file, vocab_e_size) {} + + // return p0 of rule.e_ | rule.f_ + prob_t operator()(const TRule& rule) const { + return p0(rule.f_, rule.e_, 0, 0); + } + + prob_t p0(const std::vector& vsrc, const std::vector& vtrg, int start_src, int start_trg) const; + + const UnigramModel u; +}; + struct PhraseConditionalBase { explicit PhraseConditionalBase(const Model1& m1, const double m1mixture, const unsigned vocab_e_size) : model1(m1), @@ -83,7 +129,7 @@ struct PhraseJointBase { assert(vocab_e_size > 0); } - // return p0 of rule.e_ | rule.f_ + // return p0 of rule.e_ , rule.f_ prob_t operator()(const TRule& rule) const { return p0(rule.f_, rule.e_, 0, 0); } @@ -113,7 +159,7 @@ struct PhraseJointBase_BiDir { assert(vocab_e_size > 0); } - // return p0 of rule.e_ | rule.f_ + // return p0 of rule.e_ , rule.f_ prob_t operator()(const TRule& rule) const { return p0(rule.f_, rule.e_, 0, 0); } diff --git a/gi/pf/itg.cc b/gi/pf/itg.cc index ac3c16a3..a38fe672 100644 --- a/gi/pf/itg.cc +++ b/gi/pf/itg.cc @@ -27,10 +27,67 @@ ostream& operator<<(ostream& os, const vector& p) { return os << ']'; } -double log_poisson(unsigned x, const double& lambda) { - assert(lambda > 0.0); - return log(lambda) * x - lgamma(x + 1) - lambda; -} +struct UnigramModel { + explicit UnigramModel(const string& fname, unsigned vocab_size, double p0null = 0.05) : + use_uniform_(fname.size() == 0), + p0null_(p0null), + uniform_((1.0 - p0null) / vocab_size), + probs_(TD::NumWords() + 1) { + if (fname.size() > 0) LoadUnigrams(fname); + probs_[0] = p0null_; + } + +// +// \data\ +// ngram 1=9295 +// +// \1-grams: +// -3.191193 " + + void LoadUnigrams(const string& fname) { + cerr << "Loading unigram probabilities from " << fname << " ..." << endl; + ReadFile rf(fname); + string line; + istream& in = *rf.stream(); + assert(in); + getline(in, line); + assert(line.empty()); + getline(in, line); + assert(line == "\\data\\"); + getline(in, line); + size_t pos = line.find("ngram 1="); + assert(pos == 0); + assert(line.size() > 8); + const size_t num_unigrams = atoi(&line[8]); + getline(in, line); + assert(line.empty()); + getline(in, line); + assert(line == "\\1-grams:"); + for (size_t i = 0; i < num_unigrams; ++i) { + getline(in, line); + assert(line.size() > 0); + pos = line.find('\t'); + assert(pos > 0); + assert(pos + 1 < line.size()); + const WordID w = TD::Convert(line.substr(pos + 1)); + line[pos] = 0; + float p = atof(&line[0]); + const prob_t pnon_null(1.0 - p0null_.as_float()); + if (w < probs_.size()) probs_[w].logeq(p * log(10) + log(pnon_null)); else abort(); + } + } + + const prob_t& operator()(const WordID& w) const { + if (!w) return p0null_; + if (use_uniform_) return uniform_; + return probs_[w]; + } + + const bool use_uniform_; + const prob_t p0null_; + const prob_t uniform_; + vector probs_; +}; struct Model1 { explicit Model1(const string& fname) : @@ -89,11 +146,11 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) { ("samples,s",po::value()->default_value(1000),"Number of samples") ("particles,p",po::value()->default_value(25),"Number of particles") ("input,i",po::value(),"Read parallel data from") - ("max_src_phrase",po::value()->default_value(7),"Maximum length of source language phrases") - ("max_trg_phrase",po::value()->default_value(7),"Maximum length of target language phrases") ("model1,m",po::value(),"Model 1 parameters (used in base distribution)") ("inverse_model1,M",po::value(),"Inverse Model 1 parameters (used in backward estimate)") ("model1_interpolation_weight",po::value()->default_value(0.95),"Mixing proportion of model 1 with uniform target distribution") + ("src_unigram,u",po::value()->default_value(""),"Source unigram distribution; empty for uniform") + ("trg_unigram,U",po::value()->default_value(""),"Target unigram distribution; empty for uniform") ("random_seed,S",po::value(), "Random seed"); po::options_description clo("Command line options"); clo.add_options() @@ -165,11 +222,11 @@ void ReadParallelCorpus(const string& filename, int main(int argc, char** argv) { po::variables_map conf; InitCommandLine(argc, argv, &conf); - const size_t kMAX_TRG_PHRASE = conf["max_trg_phrase"].as(); - const size_t kMAX_SRC_PHRASE = conf["max_src_phrase"].as(); const unsigned particles = conf["particles"].as(); const unsigned samples = conf["samples"].as(); - + TD::Convert(""); + TD::Convert(""); + TD::Convert(""); if (!conf.count("model1")) { cerr << argv[0] << "Please use --model1 to specify model 1 parameters\n"; return 1; @@ -188,23 +245,28 @@ int main(int argc, char** argv) { cerr << "F-corpus size: " << corpusf.size() << " sentences\t (" << vocabf.size() << " word types)\n"; cerr << "E-corpus size: " << corpuse.size() << " sentences\t (" << vocabe.size() << " word types)\n"; assert(corpusf.size() == corpuse.size()); + UnigramModel src_unigram(conf["src_unigram"].as(), vocabf.size()); + UnigramModel trg_unigram(conf["trg_unigram"].as(), vocabe.size()); + const prob_t kHALF(0.5); + const string kEMPTY = "NULL"; const int kLHS = -TD::Convert("X"); Model1 m1(conf["model1"].as()); Model1 invm1(conf["inverse_model1"].as()); for (int si = 0; si < conf["samples"].as(); ++si) { cerr << '.' << flush; for (int ci = 0; ci < corpusf.size(); ++ci) { - const vector& src = corpusf[ci]; const vector& trg = corpuse[ci]; - for (int i = 0; i < src.size(); ++i) { - for (int j = 0; j < trg.size(); ++j) { - const int eff_max_src = min(src.size() - i, kMAX_SRC_PHRASE); - for (int k = 0; k < eff_max_src; ++k) { - const int eff_max_trg = (k == 0 ? 1 : min(trg.size() - j, kMAX_TRG_PHRASE)); - for (int l = 0; l < eff_max_trg; ++l) { - } - } + const vector& src = corpusf[ci]; + for (int i = 0; i <= trg.size(); ++i) { + const WordID e_i = i > 0 ? trg[i-1] : 0; + for (int j = 0; j <= src.size(); ++j) { + const WordID f_j = j > 0 ? src[j-1] : 0; + if (e_i == 0 && f_j == 0) continue; + prob_t je = kHALF * src_unigram(f_j) * m1(f_j,e_i) + kHALF * trg_unigram(e_i) * invm1(e_i,f_j); + cerr << "p( " << (e_i ? TD::Convert(e_i) : kEMPTY) << " , " << (f_j ? TD::Convert(f_j) : kEMPTY) << " ) = " << log(je) << endl; + if (e_i && f_j) + cout << "[X] ||| " << TD::Convert(f_j) << " ||| " << TD::Convert(e_i) << " ||| LogProb=" << log(je) << endl; } } } diff --git a/gi/pf/unigrams.cc b/gi/pf/unigrams.cc new file mode 100644 index 00000000..40829775 --- /dev/null +++ b/gi/pf/unigrams.cc @@ -0,0 +1,80 @@ +#include "unigrams.h" + +#include +#include + +#include "stringlib.h" +#include "filelib.h" + +using namespace std; + +void UnigramModel::LoadUnigrams(const string& fname) { + cerr << "Loading unigram probabilities from " << fname << " ..." << endl; + ReadFile rf(fname); + string line; + istream& in = *rf.stream(); + assert(in); + getline(in, line); + assert(line.empty()); + getline(in, line); + assert(line == "\\data\\"); + getline(in, line); + size_t pos = line.find("ngram 1="); + assert(pos == 0); + assert(line.size() > 8); + const size_t num_unigrams = atoi(&line[8]); + getline(in, line); + assert(line.empty()); + getline(in, line); + assert(line == "\\1-grams:"); + for (size_t i = 0; i < num_unigrams; ++i) { + getline(in, line); + assert(line.size() > 0); + pos = line.find('\t'); + assert(pos > 0); + assert(pos + 1 < line.size()); + const WordID w = TD::Convert(line.substr(pos + 1)); + line[pos] = 0; + float p = atof(&line[0]); + if (w < probs_.size()) probs_[w].logeq(p * log(10)); else cerr << "WARNING: don't know about '" << TD::Convert(w) << "'\n"; + } +} + +void UnigramWordModel::LoadUnigrams(const string& fname) { + cerr << "Loading unigram probabilities from " << fname << " ..." << endl; + ReadFile rf(fname); + string line; + istream& in = *rf.stream(); + assert(in); + getline(in, line); + assert(line.empty()); + getline(in, line); + assert(line == "\\data\\"); + getline(in, line); + size_t pos = line.find("ngram 1="); + assert(pos == 0); + assert(line.size() > 8); + const size_t num_unigrams = atoi(&line[8]); + getline(in, line); + assert(line.empty()); + getline(in, line); + assert(line == "\\1-grams:"); + for (size_t i = 0; i < num_unigrams; ++i) { + getline(in, line); + assert(line.size() > 0); + pos = line.find('\t'); + assert(pos > 0); + assert(pos + 1 < line.size()); + size_t cur = pos + 1; + vector w; + while (cur < line.size()) { + const size_t len = UTF8Len(line[cur]); + w.push_back(TD::Convert(line.substr(cur, len))); + cur += len; + } + line[pos] = 0; + float p = atof(&line[0]); + probs_[w].logeq(p * log(10.0)); + } +} + diff --git a/gi/pf/unigrams.h b/gi/pf/unigrams.h new file mode 100644 index 00000000..1660d1ed --- /dev/null +++ b/gi/pf/unigrams.h @@ -0,0 +1,69 @@ +#ifndef _UNIGRAMS_H_ +#define _UNIGRAMS_H_ + +#include +#include +#include +#include + +#include "wordid.h" +#include "prob.h" +#include "tdict.h" + +struct UnigramModel { + explicit UnigramModel(const std::string& fname, unsigned vocab_size) : + use_uniform_(fname.size() == 0), + uniform_(1.0 / vocab_size), + probs_() { + if (fname.size() > 0) { + probs_.resize(TD::NumWords() + 1); + LoadUnigrams(fname); + } + } + + const prob_t& operator()(const WordID& w) const { + assert(w); + if (use_uniform_) return uniform_; + return probs_[w]; + } + + private: + void LoadUnigrams(const std::string& fname); + + const bool use_uniform_; + const prob_t uniform_; + std::vector probs_; +}; + + +// reads an ARPA unigram file and converts words like 'cat' into a string 'c a t' +struct UnigramWordModel { + explicit UnigramWordModel(const std::string& fname) : + use_uniform_(false), + uniform_(1.0), + probs_() { + LoadUnigrams(fname); + } + + explicit UnigramWordModel(const unsigned vocab_size) : + use_uniform_(true), + uniform_(1.0 / vocab_size), + probs_() {} + + const prob_t& operator()(const std::vector& s) const { + if (use_uniform_) return uniform_; + const VectorProbHash::const_iterator it = probs_.find(s); + assert(it != probs_.end()); + return it->second; + } + + private: + void LoadUnigrams(const std::string& fname); + + const bool use_uniform_; + const prob_t uniform_; + typedef std::tr1::unordered_map, prob_t, boost::hash > > VectorProbHash; + VectorProbHash probs_; +}; + +#endif -- cgit v1.2.3 From a4c69d2a8ef5a39c2ebc0e3a1307801c5288be8e Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Thu, 29 Dec 2011 21:09:14 -0500 Subject: ngram base dist --- gi/pf/ngram_base.cc | 69 +++++++++++++++++++++++++++++++++++++++++++++++++++++ gi/pf/ngram_base.h | 25 +++++++++++++++++++ 2 files changed, 94 insertions(+) create mode 100644 gi/pf/ngram_base.cc create mode 100644 gi/pf/ngram_base.h (limited to 'gi/pf') diff --git a/gi/pf/ngram_base.cc b/gi/pf/ngram_base.cc new file mode 100644 index 00000000..1299f06f --- /dev/null +++ b/gi/pf/ngram_base.cc @@ -0,0 +1,69 @@ +#include "ngram_base.h" + +#include "lm/model.hh" +#include "tdict.h" + +using namespace std; + +namespace { +struct GICSVMapper : public lm::EnumerateVocab { + GICSVMapper(vector* out) : out_(out), kLM_UNKNOWN_TOKEN(0) { out_->clear(); } + void Add(lm::WordIndex index, const StringPiece &str) { + const WordID cdec_id = TD::Convert(str.as_string()); + if (cdec_id >= out_->size()) + out_->resize(cdec_id + 1, kLM_UNKNOWN_TOKEN); + (*out_)[cdec_id] = index; + } + vector* out_; + const lm::WordIndex kLM_UNKNOWN_TOKEN; +}; +} + +struct FixedNgramBaseImpl { + FixedNgramBaseImpl(const string& param) { + GICSVMapper vm(&cdec2klm_map_); + lm::ngram::Config conf; + conf.enumerate_vocab = &vm; + cerr << "Reading character LM from " << param << endl; + model = new lm::ngram::ProbingModel(param.c_str(), conf); + order = model->Order(); + kEOS = MapWord(TD::Convert("")); + assert(kEOS > 0); + } + + lm::WordIndex MapWord(const WordID w) const { + if (w < cdec2klm_map_.size()) return cdec2klm_map_[w]; + return 0; + } + + ~FixedNgramBaseImpl() { delete model; } + + prob_t StringProbability(const vector& s) const { + lm::ngram::State state = model->BeginSentenceState(); + double prob = 0; + for (unsigned i = 0; i < s.size(); ++i) { + const lm::ngram::State scopy(state); + prob += model->Score(scopy, MapWord(s[i]), state); + } + const lm::ngram::State scopy(state); + prob += model->Score(scopy, kEOS, state); + prob_t p; p.logeq(prob * log(10)); + return p; + } + + lm::ngram::ProbingModel* model; + unsigned order; + vector cdec2klm_map_; + lm::WordIndex kEOS; +}; + +FixedNgramBase::~FixedNgramBase() { delete impl; } + +FixedNgramBase::FixedNgramBase(const string& lmfname) { + impl = new FixedNgramBaseImpl(lmfname); +} + +prob_t FixedNgramBase::StringProbability(const vector& s) const { + return impl->StringProbability(s); +} + diff --git a/gi/pf/ngram_base.h b/gi/pf/ngram_base.h new file mode 100644 index 00000000..4ea999f3 --- /dev/null +++ b/gi/pf/ngram_base.h @@ -0,0 +1,25 @@ +#ifndef _NGRAM_BASE_H_ +#define _NGRAM_BASE_H_ + +#include +#include +#include "trule.h" +#include "wordid.h" +#include "prob.h" + +struct FixedNgramBaseImpl; +struct FixedNgramBase { + FixedNgramBase(const std::string& lmfname); + ~FixedNgramBase(); + prob_t StringProbability(const std::vector& s) const; + + prob_t operator()(const TRule& rule) const { + return StringProbability(rule.e_); + } + + private: + FixedNgramBaseImpl* impl; + +}; + +#endif -- cgit v1.2.3 From c430c3b3f7b22319001b2ddb1343d8a7506e9f81 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Thu, 29 Dec 2011 21:10:24 -0500 Subject: forgotten --- gi/pf/condnaive.cc | 298 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 298 insertions(+) create mode 100644 gi/pf/condnaive.cc (limited to 'gi/pf') diff --git a/gi/pf/condnaive.cc b/gi/pf/condnaive.cc new file mode 100644 index 00000000..52ddbbfe --- /dev/null +++ b/gi/pf/condnaive.cc @@ -0,0 +1,298 @@ +#include +#include +#include + +#include +#include +#include + +#include "base_measures.h" +#include "monotonic_pseg.h" +#include "conditional_pseg.h" +#include "trule.h" +#include "tdict.h" +#include "filelib.h" +#include "dict.h" +#include "sampler.h" +#include "ccrp_nt.h" +#include "corpus.h" + +using namespace std; +using namespace std::tr1; +namespace po = boost::program_options; + +static unsigned kMAX_SRC_PHRASE; +static unsigned kMAX_TRG_PHRASE; + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { + po::options_description opts("Configuration options"); + opts.add_options() + ("samples,s",po::value()->default_value(1000),"Number of samples") + ("input,i",po::value(),"Read parallel data from") + ("max_src_phrase",po::value()->default_value(4),"Maximum length of source language phrases") + ("max_trg_phrase",po::value()->default_value(4),"Maximum length of target language phrases") + ("model1,m",po::value(),"Model 1 parameters (used in base distribution)") + ("model1_interpolation_weight",po::value()->default_value(0.95),"Mixing proportion of model 1 with uniform target distribution") + ("random_seed,S",po::value(), "Random seed"); + po::options_description clo("Command line options"); + clo.add_options() + ("config", po::value(), "Configuration file") + ("help,h", "Print this help message and exit"); + po::options_description dconfig_options, dcmdline_options; + dconfig_options.add(opts); + dcmdline_options.add(opts).add(clo); + + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + if (conf->count("config")) { + ifstream config((*conf)["config"].as().c_str()); + po::store(po::parse_config_file(config, dconfig_options), *conf); + } + po::notify(*conf); + + if (conf->count("help") || (conf->count("input") == 0)) { + cerr << dcmdline_options << endl; + exit(1); + } +} + +shared_ptr prng; + +struct ModelAndData { + explicit ModelAndData(ConditionalParallelSegementationModel& m, const vector >& ce, const vector >& cf, const set& ve, const set& vf) : + model(m), + rng(&*prng), + corpuse(ce), + corpusf(cf), + vocabe(ve), + vocabf(vf), + mh_samples(), + mh_rejects(), + kX(-TD::Convert("X")), + derivations(corpuse.size()) {} + + void ResampleHyperparameters() { + } + + void InstantiateRule(const pair& from, + const pair& to, + const vector& sentf, + const vector& sente, + TRule* rule) const { + rule->f_.clear(); + rule->e_.clear(); + rule->lhs_ = kX; + for (short i = from.first; i < to.first; ++i) + rule->f_.push_back(sentf[i]); + for (short i = from.second; i < to.second; ++i) + rule->e_.push_back(sente[i]); + } + + void DecrementDerivation(const vector >& d, const vector& sentf, const vector& sente) { + if (d.size() < 2) return; + TRule x; + for (int i = 1; i < d.size(); ++i) { + InstantiateRule(d[i], d[i-1], sentf, sente, &x); + model.DecrementRule(x); + model.DecrementAlign(x.f_.size()); + } + } + + void PrintDerivation(const vector >& d, const vector& sentf, const vector& sente) { + if (d.size() < 2) return; + TRule x; + for (int i = 1; i < d.size(); ++i) { + InstantiateRule(d[i], d[i-1], sentf, sente, &x); + cerr << i << '/' << (d.size() - 1) << ": " << x << endl; + } + } + + void IncrementDerivation(const vector >& d, const vector& sentf, const vector& sente) { + if (d.size() < 2) return; + TRule x; + for (int i = 1; i < d.size(); ++i) { + InstantiateRule(d[i], d[i-1], sentf, sente, &x); + model.IncrementRule(x); + model.IncrementAlign(x.f_.size()); + } + } + + prob_t Likelihood() const { + return model.Likelihood(); + } + + prob_t DerivationProposalProbability(const vector >& d, const vector& sentf, const vector& sente) const { + prob_t p = prob_t::One(); + TRule x; + for (int i = 1; i < d.size(); ++i) { + InstantiateRule(d[i], d[i-1], sentf, sente, &x); + p *= model.RuleProbability(x); + p *= model.AlignProbability(x.f_.size()); + } + return p; + } + + void Sample(); + + ConditionalParallelSegementationModel& model; + MT19937* rng; + const vector >& corpuse, corpusf; + const set& vocabe, vocabf; + unsigned mh_samples, mh_rejects; + const int kX; + vector > > derivations; +}; + +void ModelAndData::Sample() { + unsigned MAXK = kMAX_SRC_PHRASE; + unsigned MAXL = kMAX_TRG_PHRASE; + TRule x; + x.lhs_ = -TD::Convert("X"); + + for (int samples = 0; samples < 1000; ++samples) { + if (samples % 1 == 0 && samples > 0) { + //ResampleHyperparameters(); + cerr << " [" << samples << " LLH=" << log(Likelihood()) << " MH=" << ((double)mh_rejects / mh_samples) << "]\n"; + for (int i = 0; i < 10; ++i) { + cerr << "SENTENCE: " << TD::GetString(corpusf[i]) << " ||| " << TD::GetString(corpuse[i]) << endl; + PrintDerivation(derivations[i], corpusf[i], corpuse[i]); + } + static TRule xx("[X] ||| w n ||| s h ||| X=0"); + const CCRP_NoTable& dcrp = model.tmodel.r.find(xx.f_)->second; + for (CCRP_NoTable::const_iterator it = dcrp.begin(); it != dcrp.end(); ++it) { + cerr << "\t" << it->second << "\t" << it->first << endl; + } + } + cerr << '.' << flush; + for (int s = 0; s < corpuse.size(); ++s) { + const vector& sentf = corpusf[s]; + const vector& sente = corpuse[s]; +// cerr << " CUSTOMERS: " << rules.num_customers() << endl; +// cerr << "SENTENCE: " << TD::GetString(sentf) << " ||| " << TD::GetString(sente) << endl; + + vector >& deriv = derivations[s]; + const prob_t p_cur = Likelihood(); + DecrementDerivation(deriv, sentf, sente); + + boost::multi_array a(boost::extents[sentf.size() + 1][sente.size() + 1]); + boost::multi_array trans(boost::extents[sentf.size() + 1][sente.size() + 1][MAXK][MAXL]); + a[0][0] = prob_t::One(); + for (int i = 0; i < sentf.size(); ++i) { + for (int j = 0; j < sente.size(); ++j) { + const prob_t src_a = a[i][j]; + x.f_.clear(); + for (int k = 1; k <= MAXK; ++k) { + if (i + k > sentf.size()) break; + x.f_.push_back(sentf[i + k - 1]); + x.e_.clear(); + const prob_t p_span = model.AlignProbability(k); // prob of consuming this much source + for (int l = 1; l <= MAXL; ++l) { + if (j + l > sente.size()) break; + x.e_.push_back(sente[j + l - 1]); + trans[i][j][k - 1][l - 1] = model.RuleProbability(x) * p_span; + a[i + k][j + l] += src_a * trans[i][j][k - 1][l - 1]; + } + } + } + } +// cerr << "Inside: " << log(a[sentf.size()][sente.size()]) << endl; + const prob_t q_cur = DerivationProposalProbability(deriv, sentf, sente); + + vector > newderiv; + int cur_i = sentf.size(); + int cur_j = sente.size(); + while(cur_i > 0 && cur_j > 0) { + newderiv.push_back(pair(cur_i, cur_j)); +// cerr << "NODE: (" << cur_i << "," << cur_j << ")\n"; + SampleSet ss; + vector > nexts; + for (int k = 1; k <= MAXK; ++k) { + const int hyp_i = cur_i - k; + if (hyp_i < 0) break; + for (int l = 1; l <= MAXL; ++l) { + const int hyp_j = cur_j - l; + if (hyp_j < 0) break; + const prob_t& inside = a[hyp_i][hyp_j]; + if (inside == prob_t::Zero()) continue; + const prob_t& transp = trans[hyp_i][hyp_j][k - 1][l - 1]; + if (transp == prob_t::Zero()) continue; + const prob_t p = inside * transp; + ss.add(p); + nexts.push_back(pair(hyp_i, hyp_j)); +// cerr << " (" << hyp_i << "," << hyp_j << ") <--- " << log(p) << endl; + } + } +// cerr << " sample set has " << nexts.size() << " elements.\n"; + const int selected = rng->SelectSample(ss); + cur_i = nexts[selected].first; + cur_j = nexts[selected].second; + } + newderiv.push_back(pair(0,0)); + const prob_t q_new = DerivationProposalProbability(newderiv, sentf, sente); + IncrementDerivation(newderiv, sentf, sente); +// cerr << "SANITY: " << q_new << " " <(); + kMAX_SRC_PHRASE = conf["max_src_phrase"].as(); + + if (!conf.count("model1")) { + cerr << argv[0] << "Please use --model1 to specify model 1 parameters\n"; + return 1; + } + if (conf.count("random_seed")) + prng.reset(new MT19937(conf["random_seed"].as())); + else + prng.reset(new MT19937); +// MT19937& rng = *prng; + + vector > corpuse, corpusf; + set vocabe, vocabf; + corpus::ReadParallelCorpus(conf["input"].as(), &corpusf, &corpuse, &vocabf, &vocabe); + cerr << "f-Corpus size: " << corpusf.size() << " sentences\n"; + cerr << "f-Vocabulary size: " << vocabf.size() << " types\n"; + cerr << "f-Corpus size: " << corpuse.size() << " sentences\n"; + cerr << "f-Vocabulary size: " << vocabe.size() << " types\n"; + assert(corpusf.size() == corpuse.size()); + + Model1 m1(conf["model1"].as()); + + PhraseConditionalBase pcb0(m1, conf["model1_interpolation_weight"].as(), vocabe.size()); + ConditionalParallelSegementationModel x(pcb0); + + ModelAndData posterior(x, corpuse, corpusf, vocabe, vocabf); + posterior.Sample(); + + TRule r1("[X] ||| x ||| l e ||| X=0"); + TRule r2("[X] ||| A ||| a d ||| X=0"); + TRule r3("[X] ||| n ||| e r ||| X=0"); + TRule r4("[X] ||| x A n ||| b l a g ||| X=0"); + + PhraseConditionalUninformativeBase u0(vocabe.size()); + + cerr << (pcb0(r1)*pcb0(r2)*pcb0(r3)) << endl; + cerr << (u0(r4)) << endl; + + return 0; +} + -- cgit v1.2.3 From 665badbdcc755183aa83414f6e86987f4d017393 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Thu, 29 Dec 2011 21:10:36 -0500 Subject: foo --- gi/pf/conditional_pseg.h | 155 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 155 insertions(+) create mode 100644 gi/pf/conditional_pseg.h (limited to 'gi/pf') diff --git a/gi/pf/conditional_pseg.h b/gi/pf/conditional_pseg.h new file mode 100644 index 00000000..edcdc813 --- /dev/null +++ b/gi/pf/conditional_pseg.h @@ -0,0 +1,155 @@ +#ifndef _CONDITIONAL_PSEG_H_ +#define _CONDITIONAL_PSEG_H_ + +#include +#include +#include +#include + +#include "prob.h" +#include "ccrp_nt.h" +#include "trule.h" +#include "base_measures.h" +#include "tdict.h" + +template +struct ConditionalTranslationModel { + explicit ConditionalTranslationModel(ConditionalBaseMeasure& rcp0) : + rp0(rcp0) {} + + void Summary() const { + std::cerr << "Number of conditioning contexts: " << r.size() << std::endl; + for (RuleModelHash::const_iterator it = r.begin(); it != r.end(); ++it) { + std::cerr << TD::GetString(it->first) << " \t(\\alpha = " << it->second.concentration() << ") --------------------------" << std::endl; + for (CCRP_NoTable::const_iterator i2 = it->second.begin(); i2 != it->second.end(); ++i2) + std::cerr << " " << i2->second << '\t' << i2->first << std::endl; + } + } + + void ResampleHyperparameters(MT19937* rng) { + for (RuleModelHash::iterator it = r.begin(); it != r.end(); ++it) + it->second.resample_hyperparameters(rng); + } + + int DecrementRule(const TRule& rule) { + RuleModelHash::iterator it = r.find(rule.f_); + assert(it != r.end()); + int count = it->second.decrement(rule); + if (count) { + if (it->second.num_customers() == 0) r.erase(it); + } + return count; + } + + int IncrementRule(const TRule& rule) { + RuleModelHash::iterator it = r.find(rule.f_); + if (it == r.end()) { + it = r.insert(make_pair(rule.f_, CCRP_NoTable(1.0, 1.0, 8.0))).first; + } + int count = it->second.increment(rule); + return count; + } + + void IncrementRules(const std::vector& rules) { + for (int i = 0; i < rules.size(); ++i) + IncrementRule(*rules[i]); + } + + void DecrementRules(const std::vector& rules) { + for (int i = 0; i < rules.size(); ++i) + DecrementRule(*rules[i]); + } + + prob_t RuleProbability(const TRule& rule) const { + prob_t p; + RuleModelHash::const_iterator it = r.find(rule.f_); + if (it == r.end()) { + p.logeq(log(rp0(rule))); + } else { + p.logeq(it->second.logprob(rule, log(rp0(rule)))); + } + return p; + } + + prob_t Likelihood() const { + prob_t p = prob_t::One(); + for (RuleModelHash::const_iterator it = r.begin(); it != r.end(); ++it) { + prob_t q; q.logeq(it->second.log_crp_prob()); + p *= q; + for (CCRP_NoTable::const_iterator i2 = it->second.begin(); i2 != it->second.end(); ++i2) + p *= rp0(i2->first); + } + return p; + } + + const ConditionalBaseMeasure& rp0; + typedef std::tr1::unordered_map, + CCRP_NoTable, + boost::hash > > RuleModelHash; + RuleModelHash r; +}; + +template +struct ConditionalParallelSegementationModel { + explicit ConditionalParallelSegementationModel(ConditionalBaseMeasure& rcp0) : + tmodel(rcp0), base(prob_t::One()), aligns(1,1) {} + + ConditionalTranslationModel tmodel; + + void DecrementRule(const TRule& rule) { + tmodel.DecrementRule(rule); + } + + void IncrementRule(const TRule& rule) { + tmodel.IncrementRule(rule); + } + + void IncrementRulesAndAlignments(const std::vector& rules) { + tmodel.IncrementRules(rules); + for (int i = 0; i < rules.size(); ++i) { + IncrementAlign(rules[i]->f_.size()); + } + } + + void DecrementRulesAndAlignments(const std::vector& rules) { + tmodel.DecrementRules(rules); + for (int i = 0; i < rules.size(); ++i) { + DecrementAlign(rules[i]->f_.size()); + } + } + + prob_t RuleProbability(const TRule& rule) const { + return tmodel.RuleProbability(rule); + } + + void IncrementAlign(unsigned span) { + if (aligns.increment(span)) { + // TODO + } + } + + void DecrementAlign(unsigned span) { + if (aligns.decrement(span)) { + // TODO + } + } + + prob_t AlignProbability(unsigned span) const { + prob_t p; + p.logeq(aligns.logprob(span, log_poisson(span, 1.0))); + return p; + } + + prob_t Likelihood() const { + prob_t p; p.logeq(aligns.log_crp_prob()); + p *= base; + p *= tmodel.Likelihood(); + return p; + } + + prob_t base; + CCRP_NoTable aligns; +}; + +#endif + -- cgit v1.2.3 From 5ea87bf5487f0bd9fef7385eb1812b0601b57a6e Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Thu, 29 Dec 2011 23:02:50 -0500 Subject: remove broken prior, add logging --- gi/pf/align-lexonly.cc | 36 ++---------------------------------- 1 file changed, 2 insertions(+), 34 deletions(-) (limited to 'gi/pf') diff --git a/gi/pf/align-lexonly.cc b/gi/pf/align-lexonly.cc index 91a3cfcf..7e48b25a 100644 --- a/gi/pf/align-lexonly.cc +++ b/gi/pf/align-lexonly.cc @@ -66,41 +66,9 @@ struct AlignedSentencePair { Array2D posterior; }; -struct HierarchicalUnigramBase { - explicit HierarchicalUnigramBase(const unsigned vocab_e_size) : r(5,5), u0(1.0 / vocab_e_size) {} - - // return p0 of rule.e_ - prob_t operator()(const TRule& rule) const { - prob_t p = prob_t::One(); - prob_t q; - for (unsigned i = 0; i < rule.e_.size(); ++i) { - q.logeq(r.logprob(rule.e_[i], log(u0))); - p *= q; - } - q.logeq(r.logprob(TD::Convert(""), log(u0))); - p *= q; - return p; - } - - void Increment(const TRule& rule) { - for (unsigned i = 0; i < rule.e_.size(); ++i) - r.increment(rule.e_[i]); - r.increment(TD::Convert("")); - } - - void Decrement(const TRule& rule) { - for (unsigned i = 0; i < rule.e_.size(); ++i) - r.decrement(rule.e_[i]); - r.decrement(TD::Convert("")); - } - - CCRP_NoTable r; - prob_t u0; -}; - struct HierarchicalWordBase { explicit HierarchicalWordBase(const unsigned vocab_e_size) : - base(prob_t::One()), r(15,15), u0(-log(vocab_e_size)) {} + base(prob_t::One()), r(25,25,10), u0(-log(vocab_e_size)) {} void ResampleHyperparameters(MT19937* rng) { r.resample_hyperparameters(rng); @@ -137,7 +105,7 @@ struct HierarchicalWordBase { } void Summary() const { - cerr << "NUMBER OF CUSTOMERS: " << r.num_customers() << endl; + cerr << "NUMBER OF CUSTOMERS: " << r.num_customers() << " (\\alpha=" << r.concentration() << ')' << endl; for (CCRP_NoTable >::const_iterator it = r.begin(); it != r.end(); ++it) cerr << " " << it->second << '\t' << TD::GetString(it->first) << endl; } -- cgit v1.2.3 From 71da76d47d5a6f988b56b5641f7296249cb85124 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Fri, 30 Dec 2011 19:23:32 +0000 Subject: logging corpus errors --- gi/pf/corpus.cc | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'gi/pf') diff --git a/gi/pf/corpus.cc b/gi/pf/corpus.cc index a408e7cf..cb6e4ed7 100644 --- a/gi/pf/corpus.cc +++ b/gi/pf/corpus.cc @@ -24,11 +24,11 @@ void ReadParallelCorpus(const string& filename, istream* in = rf.stream(); assert(*in); string line; + unsigned lc = 0; const WordID kDIV = TD::Convert("|||"); vector tmp; - while(*in) { - getline(*in, line); - if (line.empty() && !*in) break; + while(getline(*in, line)) { + ++lc; e->push_back(vector()); f->push_back(vector()); vector& le = e->back(); @@ -39,12 +39,17 @@ void ReadParallelCorpus(const string& filename, for (unsigned i = 0; i < tmp.size(); ++i) { const int cur = tmp[i]; if (isf) { - if (kDIV == cur) { isf = false; } else { + if (kDIV == cur) { + isf = false; + } else { lf.push_back(cur); vocab_f->insert(cur); } } else { - assert(cur != kDIV); + if (cur == kDIV) { + cerr << "ERROR in " << lc << ": " << line << endl << endl; + abort(); + } le.push_back(cur); vocab_e->insert(cur); } -- cgit v1.2.3 From 134228d946a3f119e88f23a5315fa7849d498ee4 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Sat, 31 Dec 2011 17:54:38 +0000 Subject: last change before adding wood&teh stuff --- gi/pf/align-lexonly.cc | 6 ++++-- gi/pf/base_measures.h | 11 +++++++++++ 2 files changed, 15 insertions(+), 2 deletions(-) (limited to 'gi/pf') diff --git a/gi/pf/align-lexonly.cc b/gi/pf/align-lexonly.cc index 7e48b25a..e9f1e7b6 100644 --- a/gi/pf/align-lexonly.cc +++ b/gi/pf/align-lexonly.cc @@ -117,10 +117,12 @@ struct HierarchicalWordBase { struct BasicLexicalAlignment { explicit BasicLexicalAlignment(const vector >& lets, + const unsigned words_e, const unsigned letters_e, vector* corp) : letters(lets), corpus(*corp), + //up0(words_e), //up0("en.chars.1gram", letters_e), //up0("en.words.1gram"), up0(letters_e), @@ -302,11 +304,11 @@ int main(int argc, char** argv) { ExtractLetters(vocabf, &letters, NULL); letters[TD::Convert("NULL")].clear(); - BasicLexicalAlignment x(letters, letset.size(), &corpus); + BasicLexicalAlignment x(letters, vocabe.size(), letset.size(), &corpus); x.InitializeRandom(); const unsigned samples = conf["samples"].as(); for (int i = 0; i < samples; ++i) { - for (int j = 431; j < 433; ++j) Debug(corpus[j]); + for (int j = 4995; j < 4997; ++j) Debug(corpus[j]); cerr << i << "\t" << x.tmodel.r.size() << "\t"; if (i % 10 == 0) x.ResampleHyperparemeters(); x.ResampleCorpus(); diff --git a/gi/pf/base_measures.h b/gi/pf/base_measures.h index fbd1c3ad..a4e9ac28 100644 --- a/gi/pf/base_measures.h +++ b/gi/pf/base_measures.h @@ -11,6 +11,7 @@ #include "trule.h" #include "prob.h" #include "tdict.h" +#include "sampler.h" inline double log_poisson(unsigned x, const double& lambda) { assert(lambda > 0.0); @@ -55,6 +56,11 @@ struct CompletelyUniformBase { prob_t operator()(const TRule&) const { return kUNIFORM; } + void Summary() const {} + void ResampleHyperparameters(MT19937*) {} + void Increment(const TRule&) {} + void Decrement(const TRule&) {} + prob_t Likelihood() const { return prob_t::One(); } const prob_t kUNIFORM; }; @@ -79,6 +85,11 @@ struct PhraseConditionalUninformativeBase { prob_t p0(const std::vector& vsrc, const std::vector& vtrg, int start_src, int start_trg) const; + void Summary() const {} + void ResampleHyperparameters(MT19937*) {} + void Increment(const TRule&) {} + void Decrement(const TRule&) {} + prob_t Likelihood() const { return prob_t::One(); } const prob_t kUNIFORM_TARGET; }; -- cgit v1.2.3 From d156a65ac638b574abfabfd78c949e122faada5d Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Wed, 11 Jan 2012 01:22:54 -0500 Subject: script to pull out candidate transliterations from a word-aligned parallel corpus --- gi/pf/guess-translits.pl | 71 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 71 insertions(+) create mode 100755 gi/pf/guess-translits.pl (limited to 'gi/pf') diff --git a/gi/pf/guess-translits.pl b/gi/pf/guess-translits.pl new file mode 100755 index 00000000..ab737121 --- /dev/null +++ b/gi/pf/guess-translits.pl @@ -0,0 +1,71 @@ +#!/usr/bin/perl -w +use strict; +use utf8; + +my $MIN_PMI = -3; + +my %fs; +my %es; +my %ef; + +die "Usage: $0 < input.utf8.txt\n" if scalar @ARGV > 0; + +binmode(STDIN,":utf8"); +binmode(STDOUT,":utf8"); +binmode(STDERR,":utf8"); + +my $tot = 0; +print STDERR "Reading alignments from STDIN ...\n"; +while() { + chomp; + my ($fsent, $esent, $alsent) = split / \|\|\| /; + die "Format should be 'foreign sentence ||| english sentence ||| 0-0 1-1 ...'\n" unless defined $fsent && defined $esent && defined $alsent; + + my @fws = split /\s+/, $fsent; + my @ews = split /\s+/, $esent; + my @as = split /\s+/, $alsent; + my %a2b; + my %b2a; + for my $ap (@as) { + my ($a,$b) = split /-/, $ap; + $a2b{$a}->{$b} = 1; + $b2a{$b}->{$a} = 1; + } + for my $a (keys %a2b) { + my $bref = $a2b{$a}; + next unless scalar keys %$bref < 2; + my $b = (keys %$bref)[0]; + next unless scalar keys %{$b2a{$b}} < 2; + my $f = $fws[$a]; + next unless defined $f; + next unless length($f) > 3; + my $e = $ews[$b]; + next unless defined $e; + next unless length($e) > 3; + + $ef{$f}->{$e}++; + $es{$e}++; + $fs{$f}++; + $tot++; + } +} +my $ltot = log($tot); +my $num = 0; +print STDERR "Extracting pairs for PMI > $MIN_PMI ...\n"; +for my $f (keys %fs) { + my $logf = log($fs{$f}); + my $esref = $ef{$f}; + for my $e (keys %$esref) { + my $loge = log($es{$e}); + my $ef = $esref->{$e}; + my $logef = log($ef); + my $pmi = $logef - ($loge + $logf); + next if $pmi < $MIN_PMI; + my @flets = split //, $f; + my @elets = split //, $e; + print "@flets ||| @elets\n"; + $num++; + } +} +print STDERR "Extracted $num pairs.\n"; +print STDERR "Recommend running:\n ../../training/model1 -t -99999 output.txt\n"; -- cgit v1.2.3 From 26d9ad04bd81508163d75c99726f970dd75f5127 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Mon, 23 Jan 2012 15:47:29 -0500 Subject: more alignment stuff --- gi/pf/Makefile.am | 4 +- gi/pf/align-lexonly-pyp.cc | 327 ++++++++++++++++++++++++++++++++++++++++++++ gi/pf/base_measures.cc | 47 +++++++ gi/pf/base_measures.h | 18 +++ gi/pf/conditional_pseg.h | 74 ++++++++++ word-aligner/stemmers/ur.pl | 38 +++++ 6 files changed, 507 insertions(+), 1 deletion(-) create mode 100644 gi/pf/align-lexonly-pyp.cc create mode 100755 word-aligner/stemmers/ur.pl (limited to 'gi/pf') diff --git a/gi/pf/Makefile.am b/gi/pf/Makefile.am index 7c8e89d0..28367e67 100644 --- a/gi/pf/Makefile.am +++ b/gi/pf/Makefile.am @@ -1,10 +1,12 @@ -bin_PROGRAMS = cbgi brat dpnaive pfbrat pfdist itg pfnaive condnaive align-lexonly +bin_PROGRAMS = cbgi brat dpnaive pfbrat pfdist itg pfnaive condnaive align-lexonly align-lexonly-pyp noinst_LIBRARIES = libpf.a libpf_a_SOURCES = base_measures.cc reachability.cc cfg_wfst_composer.cc corpus.cc unigrams.cc ngram_base.cc align_lexonly_SOURCES = align-lexonly.cc +align_lexonly_pyp_SOURCES = align-lexonly-pyp.cc + itg_SOURCES = itg.cc condnaive_SOURCES = condnaive.cc diff --git a/gi/pf/align-lexonly-pyp.cc b/gi/pf/align-lexonly-pyp.cc new file mode 100644 index 00000000..d2630a2b --- /dev/null +++ b/gi/pf/align-lexonly-pyp.cc @@ -0,0 +1,327 @@ +#include +#include +#include + +#include +#include +#include + +#include "array2d.h" +#include "base_measures.h" +#include "monotonic_pseg.h" +#include "conditional_pseg.h" +#include "trule.h" +#include "tdict.h" +#include "stringlib.h" +#include "filelib.h" +#include "dict.h" +#include "sampler.h" +#include "mfcr.h" +#include "corpus.h" +#include "ngram_base.h" + +using namespace std; +using namespace tr1; +namespace po = boost::program_options; + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { + po::options_description opts("Configuration options"); + opts.add_options() + ("samples,s",po::value()->default_value(1000),"Number of samples") + ("input,i",po::value(),"Read parallel data from") + ("random_seed,S",po::value(), "Random seed"); + po::options_description clo("Command line options"); + clo.add_options() + ("config", po::value(), "Configuration file") + ("help,h", "Print this help message and exit"); + po::options_description dconfig_options, dcmdline_options; + dconfig_options.add(opts); + dcmdline_options.add(opts).add(clo); + + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + if (conf->count("config")) { + ifstream config((*conf)["config"].as().c_str()); + po::store(po::parse_config_file(config, dconfig_options), *conf); + } + po::notify(*conf); + + if (conf->count("help") || (conf->count("input") == 0)) { + cerr << dcmdline_options << endl; + exit(1); + } +} + +shared_ptr prng; + +struct LexicalAlignment { + unsigned char src_index; + bool is_transliteration; + vector > derivation; +}; + +struct AlignedSentencePair { + vector src; + vector trg; + vector a; + Array2D posterior; +}; + +struct HierarchicalWordBase { + explicit HierarchicalWordBase(const unsigned vocab_e_size) : + base(prob_t::One()), r(1,1,1,25,25), u0(-log(vocab_e_size)), l(1,1.0), v(1, 0.0) {} + + void ResampleHyperparameters(MT19937* rng) { + r.resample_hyperparameters(rng); + } + + inline double logp0(const vector& s) const { + return s.size() * u0; + } + + // return p0 of rule.e_ + prob_t operator()(const TRule& rule) const { + v[0] = exp(logp0(rule.e_)); + return prob_t(r.prob(rule.e_, v, l)); + } + + void Increment(const TRule& rule) { + v[0] = exp(logp0(rule.e_)); + if (r.increment(rule.e_, v, l, &*prng).count) { + base *= prob_t(v[0] * l[0]); + } + } + + void Decrement(const TRule& rule) { + if (r.decrement(rule.e_, &*prng).count) { + base /= prob_t(exp(logp0(rule.e_))); + } + } + + prob_t Likelihood() const { + prob_t p; p.logeq(r.log_crp_prob()); + p *= base; + return p; + } + + void Summary() const { + cerr << "NUMBER OF CUSTOMERS: " << r.num_customers() << " (d=" << r.d() << ",\\alpha=" << r.alpha() << ')' << endl; + for (MFCR >::const_iterator it = r.begin(); it != r.end(); ++it) + cerr << " " << it->second.total_dish_count_ << " (on " << it->second.table_counts_.size() << " tables)" << TD::GetString(it->first) << endl; + } + + prob_t base; + MFCR > r; + const double u0; + const vector l; + mutable vector v; +}; + +struct BasicLexicalAlignment { + explicit BasicLexicalAlignment(const vector >& lets, + const unsigned words_e, + const unsigned letters_e, + vector* corp) : + letters(lets), + corpus(*corp), + //up0(words_e), + //up0("en.chars.1gram", letters_e), + //up0("en.words.1gram"), + up0(letters_e), + //up0("en.chars.2gram"), + tmodel(up0) { + } + + void InstantiateRule(const WordID src, + const WordID trg, + TRule* rule) const { + static const WordID kX = TD::Convert("X") * -1; + rule->lhs_ = kX; + rule->e_ = letters[trg]; + rule->f_ = letters[src]; + } + + void InitializeRandom() { + const WordID kNULL = TD::Convert("NULL"); + cerr << "Initializing with random alignments ...\n"; + for (unsigned i = 0; i < corpus.size(); ++i) { + AlignedSentencePair& asp = corpus[i]; + asp.a.resize(asp.trg.size()); + for (unsigned j = 0; j < asp.trg.size(); ++j) { + const unsigned char a_j = prng->next() * (1 + asp.src.size()); + const WordID f_a_j = (a_j ? asp.src[a_j - 1] : kNULL); + TRule r; + InstantiateRule(f_a_j, asp.trg[j], &r); + asp.a[j].is_transliteration = false; + asp.a[j].src_index = a_j; + if (tmodel.IncrementRule(r, &*prng)) + up0.Increment(r); + } + } + cerr << " LLH = " << Likelihood() << endl; + } + + prob_t Likelihood() const { + prob_t p = tmodel.Likelihood(); + p *= up0.Likelihood(); + return p; + } + + void ResampleHyperparemeters() { + cerr << " LLH_prev = " << Likelihood() << flush; + tmodel.ResampleHyperparameters(&*prng); + up0.ResampleHyperparameters(&*prng); + cerr << "\tLLH_post = " << Likelihood() << endl; + } + + void ResampleCorpus(); + + const vector >& letters; // spelling dictionary + vector& corpus; + //PhraseConditionalUninformativeBase up0; + //PhraseConditionalUninformativeUnigramBase up0; + //UnigramWordBase up0; + //HierarchicalUnigramBase up0; + HierarchicalWordBase up0; + //CompletelyUniformBase up0; + //FixedNgramBase up0; + //ConditionalTranslationModel tmodel; + //ConditionalTranslationModel tmodel; + //ConditionalTranslationModel tmodel; + //ConditionalTranslationModel tmodel; + MConditionalTranslationModel tmodel; + //ConditionalTranslationModel tmodel; + //ConditionalTranslationModel tmodel; +}; + +void BasicLexicalAlignment::ResampleCorpus() { + static const WordID kNULL = TD::Convert("NULL"); + for (unsigned i = 0; i < corpus.size(); ++i) { + AlignedSentencePair& asp = corpus[i]; + SampleSet ss; ss.resize(asp.src.size() + 1); + for (unsigned j = 0; j < asp.trg.size(); ++j) { + TRule r; + unsigned char& a_j = asp.a[j].src_index; + WordID f_a_j = (a_j ? asp.src[a_j - 1] : kNULL); + InstantiateRule(f_a_j, asp.trg[j], &r); + if (tmodel.DecrementRule(r, &*prng)) + up0.Decrement(r); + + for (unsigned prop_a_j = 0; prop_a_j <= asp.src.size(); ++prop_a_j) { + const WordID prop_f = (prop_a_j ? asp.src[prop_a_j - 1] : kNULL); + InstantiateRule(prop_f, asp.trg[j], &r); + ss[prop_a_j] = tmodel.RuleProbability(r); + } + a_j = prng->SelectSample(ss); + f_a_j = (a_j ? asp.src[a_j - 1] : kNULL); + InstantiateRule(f_a_j, asp.trg[j], &r); + if (tmodel.IncrementRule(r, &*prng)) + up0.Increment(r); + } + } + cerr << " LLH = " << tmodel.Likelihood() << endl; +} + +void ExtractLetters(const set& v, vector >* l, set* letset = NULL) { + for (set::const_iterator it = v.begin(); it != v.end(); ++it) { + vector& letters = (*l)[*it]; + if (letters.size()) continue; // if e and f have the same word + + const string& w = TD::Convert(*it); + + size_t cur = 0; + while (cur < w.size()) { + const size_t len = UTF8Len(w[cur]); + letters.push_back(TD::Convert(w.substr(cur, len))); + if (letset) letset->insert(letters.back()); + cur += len; + } + } +} + +void Debug(const AlignedSentencePair& asp) { + cerr << TD::GetString(asp.src) << endl << TD::GetString(asp.trg) << endl; + Array2D a(asp.src.size(), asp.trg.size()); + for (unsigned j = 0; j < asp.trg.size(); ++j) + if (asp.a[j].src_index) a(asp.a[j].src_index - 1, j) = true; + cerr << a << endl; +} + +void AddSample(AlignedSentencePair* asp) { + for (unsigned j = 0; j < asp->trg.size(); ++j) + asp->posterior(asp->a[j].src_index, j)++; +} + +void WriteAlignments(const AlignedSentencePair& asp) { + bool first = true; + for (unsigned j = 0; j < asp.trg.size(); ++j) { + int src_index = -1; + int mc = -1; + for (unsigned i = 0; i <= asp.src.size(); ++i) { + if (asp.posterior(i, j) > mc) { + mc = asp.posterior(i, j); + src_index = i; + } + } + + if (src_index) { + if (first) first = false; else cout << ' '; + cout << (src_index - 1) << '-' << j; + } + } + cout << endl; +} + +int main(int argc, char** argv) { + po::variables_map conf; + InitCommandLine(argc, argv, &conf); + + if (conf.count("random_seed")) + prng.reset(new MT19937(conf["random_seed"].as())); + else + prng.reset(new MT19937); +// MT19937& rng = *prng; + + vector > corpuse, corpusf; + set vocabe, vocabf; + corpus::ReadParallelCorpus(conf["input"].as(), &corpusf, &corpuse, &vocabf, &vocabe); + cerr << "f-Corpus size: " << corpusf.size() << " sentences\n"; + cerr << "f-Vocabulary size: " << vocabf.size() << " types\n"; + cerr << "f-Corpus size: " << corpuse.size() << " sentences\n"; + cerr << "f-Vocabulary size: " << vocabe.size() << " types\n"; + assert(corpusf.size() == corpuse.size()); + + vector corpus(corpuse.size()); + for (unsigned i = 0; i < corpuse.size(); ++i) { + corpus[i].src.swap(corpusf[i]); + corpus[i].trg.swap(corpuse[i]); + corpus[i].posterior.resize(corpus[i].src.size() + 1, corpus[i].trg.size()); + } + corpusf.clear(); corpuse.clear(); + + vocabf.insert(TD::Convert("NULL")); + vector > letters(TD::NumWords()); + set letset; + ExtractLetters(vocabe, &letters, &letset); + ExtractLetters(vocabf, &letters, NULL); + letters[TD::Convert("NULL")].clear(); + + BasicLexicalAlignment x(letters, vocabe.size(), letset.size(), &corpus); + x.InitializeRandom(); + const unsigned samples = conf["samples"].as(); + for (int i = 0; i < samples; ++i) { + for (int j = 65; j < 67; ++j) Debug(corpus[j]); + cerr << i << "\t" << x.tmodel.r.size() << "\t"; + if (i % 10 == 0) x.ResampleHyperparemeters(); + x.ResampleCorpus(); + if (i > (samples / 5) && (i % 10 == 9)) for (int j = 0; j < corpus.size(); ++j) AddSample(&corpus[j]); + } + for (unsigned i = 0; i < corpus.size(); ++i) + WriteAlignments(corpus[i]); + //ModelAndData posterior(x, &corpus, vocabe, vocabf); + x.tmodel.Summary(); + x.up0.Summary(); + + //posterior.Sample(); + + return 0; +} diff --git a/gi/pf/base_measures.cc b/gi/pf/base_measures.cc index 97b4e698..7894d3e7 100644 --- a/gi/pf/base_measures.cc +++ b/gi/pf/base_measures.cc @@ -6,6 +6,53 @@ using namespace std; +TableLookupBase::TableLookupBase(const string& fname) { + cerr << "TableLookupBase reading from " << fname << " ..." << endl; + ReadFile rf(fname); + istream& in = *rf.stream(); + string line; + unsigned lc = 0; + const WordID kDIV = TD::Convert("|||"); + vector tmp; + vector le, lf; + TRule x; + x.lhs_ = -TD::Convert("X"); + bool flag = false; + while(getline(in, line)) { + ++lc; + if (lc % 1000000 == 0) { cerr << " [" << lc << ']' << endl; flag = false; } + else if (lc % 25000 == 0) { cerr << '.' << flush; flag = true; } + tmp.clear(); + TD::ConvertSentence(line, &tmp); + x.f_.clear(); + x.e_.clear(); + size_t pos = 0; + int cc = 0; + while(pos < tmp.size()) { + const WordID cur = tmp[pos++]; + if (cur == kDIV) { + ++cc; + } else if (cc == 0) { + x.f_.push_back(cur); + } else if (cc == 1) { + x.e_.push_back(cur); + } else if (cc == 2) { + table[x] = atof(TD::Convert(cur)); + ++cc; + } else { + if (flag) cerr << endl; + cerr << "Bad format in " << lc << ": " << line << endl; abort(); + } + } + if (cc != 3) { + if (flag) cerr << endl; + cerr << "Bad format in " << lc << ": " << line << endl; abort(); + } + } + if (flag) cerr << endl; + cerr << " read " << lc << " entries\n"; +} + prob_t PhraseConditionalUninformativeUnigramBase::p0(const vector& vsrc, const vector& vtrg, int start_src, int start_trg) const { diff --git a/gi/pf/base_measures.h b/gi/pf/base_measures.h index a4e9ac28..7214aa22 100644 --- a/gi/pf/base_measures.h +++ b/gi/pf/base_measures.h @@ -72,6 +72,24 @@ struct UnigramWordBase { const UnigramWordModel un; }; +struct RuleHasher { + size_t operator()(const TRule& r) const { + return hash_value(r); + } +}; + +struct TableLookupBase { + TableLookupBase(const std::string& fname); + + prob_t operator()(const TRule& rule) const { + const std::tr1::unordered_map::const_iterator it = table.find(rule); + assert(it != table.end()); + return it->second; + } + + std::tr1::unordered_map table; +}; + struct PhraseConditionalUninformativeBase { explicit PhraseConditionalUninformativeBase(const unsigned vocab_e_size) : kUNIFORM_TARGET(1.0 / vocab_e_size) { diff --git a/gi/pf/conditional_pseg.h b/gi/pf/conditional_pseg.h index edcdc813..db951d15 100644 --- a/gi/pf/conditional_pseg.h +++ b/gi/pf/conditional_pseg.h @@ -8,10 +8,84 @@ #include "prob.h" #include "ccrp_nt.h" +#include "mfcr.h" #include "trule.h" #include "base_measures.h" #include "tdict.h" +template +struct MConditionalTranslationModel { + explicit MConditionalTranslationModel(ConditionalBaseMeasure& rcp0) : + rp0(rcp0), lambdas(1, 1.0), p0s(1) {} + + void Summary() const { + std::cerr << "Number of conditioning contexts: " << r.size() << std::endl; + for (RuleModelHash::const_iterator it = r.begin(); it != r.end(); ++it) { + std::cerr << TD::GetString(it->first) << " \t(d=" << it->second.d() << ",\\alpha = " << it->second.alpha() << ") --------------------------" << std::endl; + for (MFCR::const_iterator i2 = it->second.begin(); i2 != it->second.end(); ++i2) + std::cerr << " " << -1 << '\t' << i2->first << std::endl; + } + } + + void ResampleHyperparameters(MT19937* rng) { + for (RuleModelHash::iterator it = r.begin(); it != r.end(); ++it) + it->second.resample_hyperparameters(rng); + } + + int DecrementRule(const TRule& rule, MT19937* rng) { + RuleModelHash::iterator it = r.find(rule.f_); + assert(it != r.end()); + const TableCount delta = it->second.decrement(rule, rng); + if (delta.count) { + if (it->second.num_customers() == 0) r.erase(it); + } + return delta.count; + } + + int IncrementRule(const TRule& rule, MT19937* rng) { + RuleModelHash::iterator it = r.find(rule.f_); + if (it == r.end()) { + it = r.insert(make_pair(rule.f_, MFCR(1, 1.0, 1.0, 1.0, 1.0, 1e-9, 4.0))).first; + } + p0s[0] = rp0(rule).as_float(); + TableCount delta = it->second.increment(rule, p0s, lambdas, rng); + return delta.count; + } + + prob_t RuleProbability(const TRule& rule) const { + prob_t p; + RuleModelHash::const_iterator it = r.find(rule.f_); + if (it == r.end()) { + p.logeq(log(rp0(rule))); + } else { + p0s[0] = rp0(rule).as_float(); + p = prob_t(it->second.prob(rule, p0s, lambdas)); + } + return p; + } + + prob_t Likelihood() const { + prob_t p = prob_t::One(); +#if 0 + for (RuleModelHash::const_iterator it = r.begin(); it != r.end(); ++it) { + prob_t q; q.logeq(it->second.log_crp_prob()); + p *= q; + for (CCRP_NoTable::const_iterator i2 = it->second.begin(); i2 != it->second.end(); ++i2) + p *= rp0(i2->first); + } +#endif + return p; + } + + const ConditionalBaseMeasure& rp0; + typedef std::tr1::unordered_map, + MFCR, + boost::hash > > RuleModelHash; + RuleModelHash r; + std::vector lambdas; + mutable std::vector p0s; +}; + template struct ConditionalTranslationModel { explicit ConditionalTranslationModel(ConditionalBaseMeasure& rcp0) : diff --git a/word-aligner/stemmers/ur.pl b/word-aligner/stemmers/ur.pl new file mode 100755 index 00000000..3a4f5a45 --- /dev/null +++ b/word-aligner/stemmers/ur.pl @@ -0,0 +1,38 @@ +#!/usr/bin/perl -w + +use strict; +use utf8; + +binmode(STDIN, ":utf8"); +binmode(STDOUT,":utf8"); + +my $vocab = undef; +if (scalar @ARGV > 0) { + die "Only allow --vocab" unless ($ARGV[0] eq '--vocab' && scalar @ARGV == 1); + $vocab = 1; +} + +my %dict; +while() { + chomp; + my @words = split /\s+/; + my @out = (); + for my $w (@words) { + my $tw = $dict{$w}; + if (!defined $tw) { + my $el = 4; + if ($w =~ /^(al|Al)/) { $el++; } + if ($el > length($w)) { $el = length($w); } + $tw = substr $w, 0, $el; + $dict{$w} = $tw; + } + push @out, $tw; + } + if ($vocab) { + die "Expected exactly one word per line with --vocab: $_" unless scalar @out == 1; + print "$_ @out\n"; + } else { + print "@out\n"; + } +} + -- cgit v1.2.3 From 4c2360119def2fb624d2691b355b1908c511f004 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Tue, 24 Jan 2012 22:26:44 -0500 Subject: more models --- gi/pf/align-lexonly.cc | 14 +++++++---- gi/pf/base_measures.cc | 2 +- gi/pf/base_measures.h | 27 ++++++++++++++++++++- training/model1.cc | 64 +++++++++++++++++++++++++++++++++++++++++++++++--- 4 files changed, 98 insertions(+), 9 deletions(-) (limited to 'gi/pf') diff --git a/gi/pf/align-lexonly.cc b/gi/pf/align-lexonly.cc index e9f1e7b6..76e2e009 100644 --- a/gi/pf/align-lexonly.cc +++ b/gi/pf/align-lexonly.cc @@ -122,10 +122,11 @@ struct BasicLexicalAlignment { vector* corp) : letters(lets), corpus(*corp), + up0("fr-en.10k.translit-base.txt.gz"), //up0(words_e), //up0("en.chars.1gram", letters_e), //up0("en.words.1gram"), - up0(letters_e), + //up0(letters_e), //up0("en.chars.2gram"), tmodel(up0) { } @@ -180,14 +181,18 @@ struct BasicLexicalAlignment { //PhraseConditionalUninformativeUnigramBase up0; //UnigramWordBase up0; //HierarchicalUnigramBase up0; - HierarchicalWordBase up0; + TableLookupBase up0; + //HierarchicalWordBase up0; + //PoissonUniformUninformativeBase up0; //CompletelyUniformBase up0; //FixedNgramBase up0; //ConditionalTranslationModel tmodel; //ConditionalTranslationModel tmodel; //ConditionalTranslationModel tmodel; //ConditionalTranslationModel tmodel; - ConditionalTranslationModel tmodel; + //ConditionalTranslationModel tmodel; + //ConditionalTranslationModel tmodel; + ConditionalTranslationModel tmodel; //ConditionalTranslationModel tmodel; //ConditionalTranslationModel tmodel; }; @@ -222,6 +227,7 @@ void BasicLexicalAlignment::ResampleCorpus() { void ExtractLetters(const set& v, vector >* l, set* letset = NULL) { for (set::const_iterator it = v.begin(); it != v.end(); ++it) { + if (*it >= l->size()) { l->resize(*it + 1); } vector& letters = (*l)[*it]; if (letters.size()) continue; // if e and f have the same word @@ -308,7 +314,7 @@ int main(int argc, char** argv) { x.InitializeRandom(); const unsigned samples = conf["samples"].as(); for (int i = 0; i < samples; ++i) { - for (int j = 4995; j < 4997; ++j) Debug(corpus[j]); + for (int j = 395; j < 397; ++j) Debug(corpus[j]); cerr << i << "\t" << x.tmodel.r.size() << "\t"; if (i % 10 == 0) x.ResampleHyperparemeters(); x.ResampleCorpus(); diff --git a/gi/pf/base_measures.cc b/gi/pf/base_measures.cc index 7894d3e7..4b1863fa 100644 --- a/gi/pf/base_measures.cc +++ b/gi/pf/base_measures.cc @@ -37,7 +37,7 @@ TableLookupBase::TableLookupBase(const string& fname) { } else if (cc == 1) { x.e_.push_back(cur); } else if (cc == 2) { - table[x] = atof(TD::Convert(cur)); + table[x].logeq(atof(TD::Convert(cur))); ++cc; } else { if (flag) cerr << endl; diff --git a/gi/pf/base_measures.h b/gi/pf/base_measures.h index 7214aa22..b0495bfd 100644 --- a/gi/pf/base_measures.h +++ b/gi/pf/base_measures.h @@ -51,6 +51,22 @@ struct Model1 { std::vector > ttable; }; +struct PoissonUniformUninformativeBase { + explicit PoissonUniformUninformativeBase(const unsigned ves) : kUNIFORM(1.0 / ves) {} + prob_t operator()(const TRule& r) const { + prob_t p; p.logeq(log_poisson(r.e_.size(), 1.0)); + prob_t q = kUNIFORM; q.poweq(r.e_.size()); + p *= q; + return p; + } + void Summary() const {} + void ResampleHyperparameters(MT19937*) {} + void Increment(const TRule&) {} + void Decrement(const TRule&) {} + prob_t Likelihood() const { return prob_t::One(); } + const prob_t kUNIFORM; +}; + struct CompletelyUniformBase { explicit CompletelyUniformBase(const unsigned ves) : kUNIFORM(1.0 / ves) {} prob_t operator()(const TRule&) const { @@ -83,10 +99,19 @@ struct TableLookupBase { prob_t operator()(const TRule& rule) const { const std::tr1::unordered_map::const_iterator it = table.find(rule); - assert(it != table.end()); + if (it == table.end()) { + std::cerr << rule << " not found\n"; + abort(); + } return it->second; } + void ResampleHyperparameters(MT19937*) {} + void Increment(const TRule&) {} + void Decrement(const TRule&) {} + prob_t Likelihood() const { return prob_t::One(); } + void Summary() const {} + std::tr1::unordered_map table; }; diff --git a/training/model1.cc b/training/model1.cc index 346c0033..40249aa3 100644 --- a/training/model1.cc +++ b/training/model1.cc @@ -14,6 +14,11 @@ namespace po = boost::program_options; using namespace std; +inline double log_poisson(unsigned x, const double& lambda) { + assert(lambda > 0.0); + return log(lambda) * x - lgamma(x + 1) - lambda; +} + bool InitCommandLine(int argc, char** argv, po::variables_map* conf) { po::options_description opts("Configuration options"); opts.add_options() @@ -25,6 +30,7 @@ bool InitCommandLine(int argc, char** argv, po::variables_map* conf) { ("diagonal_tension,T", po::value()->default_value(4.0), "How sharp or flat around the diagonal is the alignment distribution (<1 = flat >1 = sharp)") ("prob_align_null", po::value()->default_value(0.08), "When --favor_diagonal is set, what's the probability of a null alignment?") ("variational_bayes,v","Add a symmetric Dirichlet prior and infer VB estimate of weights") + ("testset,x", po::value(), "After training completes, compute the log likelihood of this set of sentence pairs under the learned model") ("alpha,a", po::value()->default_value(0.01), "Hyperparameter for optional Dirichlet prior") ("no_add_viterbi,V","Do not add Viterbi alignment points (may generate a grammar where some training sentence pairs are unreachable)"); po::options_description clo("Command line options"); @@ -63,6 +69,8 @@ int main(int argc, char** argv) { const bool write_alignments = (conf.count("write_alignments") > 0); const double diagonal_tension = conf["diagonal_tension"].as(); const double prob_align_null = conf["prob_align_null"].as(); + string testset; + if (conf.count("testset")) testset = conf["testset"].as(); const double prob_align_not_null = 1.0 - prob_align_null; const double alpha = conf["alpha"].as(); const bool favor_diagonal = conf.count("favor_diagonal"); @@ -73,6 +81,8 @@ int main(int argc, char** argv) { TTable tt; TTable::Word2Word2Double was_viterbi; + double tot_len_ratio = 0; + double mean_srclen_multiplier = 0; for (int iter = 0; iter < ITERATIONS; ++iter) { const bool final_iteration = (iter == (ITERATIONS - 1)); cerr << "ITERATION " << (iter + 1) << (final_iteration ? " (FINAL)" : "") << endl; @@ -83,13 +93,13 @@ int main(int argc, char** argv) { int lc = 0; bool flag = false; string line; + string ssrc, strg; while(true) { getline(in, line); if (!in) break; ++lc; if (lc % 1000 == 0) { cerr << '.'; flag = true; } if (lc %50000 == 0) { cerr << " [" << lc << "]\n" << flush; flag = false; } - string ssrc, strg; ParseTranslatorInput(line, &ssrc, &strg); Lattice src, trg; LatticeTools::ConvertTextToLattice(ssrc, &src); @@ -99,9 +109,10 @@ int main(int argc, char** argv) { assert(src.size() > 0); assert(trg.size() > 0); } + if (iter == 0) + tot_len_ratio += static_cast(trg.size()) / static_cast(src.size()); denom += trg.size(); vector probs(src.size() + 1); - const double src_logprob = -log(src.size() + 1); bool first_al = true; // used for write_alignments for (int j = 0; j < trg.size(); ++j) { const WordID& f_j = trg[j][0].label; @@ -156,7 +167,7 @@ int main(int argc, char** argv) { for (int i = 1; i <= src.size(); ++i) tt.Increment(src[i-1][0].label, f_j, probs[i] / sum); } - likelihood += log(sum) + src_logprob; + likelihood += log(sum); } if (write_alignments && final_iteration) cout << endl; } @@ -165,6 +176,10 @@ int main(int argc, char** argv) { double base2_likelihood = likelihood / log(2); if (flag) { cerr << endl; } + if (iter == 0) { + mean_srclen_multiplier = tot_len_ratio / lc; + cerr << "expected target length = source length * " << mean_srclen_multiplier << endl; + } cerr << " log_e likelihood: " << likelihood << endl; cerr << " log_2 likelihood: " << base2_likelihood << endl; cerr << " cross entropy: " << (-base2_likelihood / denom) << endl; @@ -176,6 +191,49 @@ int main(int argc, char** argv) { tt.Normalize(); } } + if (testset.size()) { + ReadFile rf(testset); + istream& in = *rf.stream(); + int lc = 0; + double tlp = 0; + string ssrc, strg, line; + while (getline(in, line)) { + ++lc; + ParseTranslatorInput(line, &ssrc, &strg); + Lattice src, trg; + LatticeTools::ConvertTextToLattice(ssrc, &src); + LatticeTools::ConvertTextToLattice(strg, &trg); + double log_prob = log_poisson(trg.size(), 0.05 + src.size() * mean_srclen_multiplier); + + // compute likelihood + for (int j = 0; j < trg.size(); ++j) { + const WordID& f_j = trg[j][0].label; + double sum = 0; + const double j_over_ts = double(j) / trg.size(); + double prob_a_i = 1.0 / (src.size() + use_null); // uniform (model 1) + if (use_null) { + if (favor_diagonal) prob_a_i = prob_align_null; + sum += tt.prob(kNULL, f_j) * prob_a_i; + } + double az = 0; + if (favor_diagonal) { + for (int ta = 0; ta < src.size(); ++ta) + az += exp(-fabs(double(ta) / src.size() - j_over_ts) * diagonal_tension); + az /= prob_align_not_null; + } + for (int i = 1; i <= src.size(); ++i) { + if (favor_diagonal) + prob_a_i = exp(-fabs(double(i) / src.size() - j_over_ts) * diagonal_tension) / az; + sum += tt.prob(src[i-1][0].label, f_j) * prob_a_i; + } + log_prob += log(sum); + } + tlp += log_prob; + cerr << ssrc << " ||| " << strg << " ||| " << log_prob << endl; + } + cerr << "TOTAL LOG PROB " << tlp << endl; + } + if (write_alignments) return 0; for (TTable::Word2Word2Double::iterator ei = tt.ttable.begin(); ei != tt.ttable.end(); ++ei) { -- cgit v1.2.3 From 481a120564fdb73c8c6833e2102acb533683261c Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Fri, 27 Jan 2012 02:31:00 -0500 Subject: migrate mert to the new scorer interface --- gi/pf/base_distributions.cc | 241 ++++++++++++++++++++++++++++++++++++++++ gi/pf/base_distributions.h | 261 ++++++++++++++++++++++++++++++++++++++++++++ gi/pf/base_measures.cc | 241 ---------------------------------------- gi/pf/base_measures.h | 247 ----------------------------------------- mteval/ns.cc | 4 + mteval/ns.h | 10 +- vest/ces.cc | 42 +++---- vest/ces.h | 10 +- vest/dist-vest.pl | 4 +- vest/error_surface.cc | 11 +- vest/error_surface.h | 6 +- vest/line_optimizer.cc | 20 ++-- vest/line_optimizer.h | 2 + vest/lo_test.cc | 21 ++-- vest/mr_vest_map.cc | 16 +-- vest/mr_vest_reduce.cc | 34 +++--- 16 files changed, 602 insertions(+), 568 deletions(-) create mode 100644 gi/pf/base_distributions.cc create mode 100644 gi/pf/base_distributions.h delete mode 100644 gi/pf/base_measures.cc delete mode 100644 gi/pf/base_measures.h (limited to 'gi/pf') diff --git a/gi/pf/base_distributions.cc b/gi/pf/base_distributions.cc new file mode 100644 index 00000000..4b1863fa --- /dev/null +++ b/gi/pf/base_distributions.cc @@ -0,0 +1,241 @@ +#include "base_measures.h" + +#include + +#include "filelib.h" + +using namespace std; + +TableLookupBase::TableLookupBase(const string& fname) { + cerr << "TableLookupBase reading from " << fname << " ..." << endl; + ReadFile rf(fname); + istream& in = *rf.stream(); + string line; + unsigned lc = 0; + const WordID kDIV = TD::Convert("|||"); + vector tmp; + vector le, lf; + TRule x; + x.lhs_ = -TD::Convert("X"); + bool flag = false; + while(getline(in, line)) { + ++lc; + if (lc % 1000000 == 0) { cerr << " [" << lc << ']' << endl; flag = false; } + else if (lc % 25000 == 0) { cerr << '.' << flush; flag = true; } + tmp.clear(); + TD::ConvertSentence(line, &tmp); + x.f_.clear(); + x.e_.clear(); + size_t pos = 0; + int cc = 0; + while(pos < tmp.size()) { + const WordID cur = tmp[pos++]; + if (cur == kDIV) { + ++cc; + } else if (cc == 0) { + x.f_.push_back(cur); + } else if (cc == 1) { + x.e_.push_back(cur); + } else if (cc == 2) { + table[x].logeq(atof(TD::Convert(cur))); + ++cc; + } else { + if (flag) cerr << endl; + cerr << "Bad format in " << lc << ": " << line << endl; abort(); + } + } + if (cc != 3) { + if (flag) cerr << endl; + cerr << "Bad format in " << lc << ": " << line << endl; abort(); + } + } + if (flag) cerr << endl; + cerr << " read " << lc << " entries\n"; +} + +prob_t PhraseConditionalUninformativeUnigramBase::p0(const vector& vsrc, + const vector& vtrg, + int start_src, int start_trg) const { + const int flen = vsrc.size() - start_src; + const int elen = vtrg.size() - start_trg; + prob_t p; + p.logeq(log_poisson(elen, flen + 0.01)); // elen | flen ~Pois(flen + 0.01) + //p.logeq(log_poisson(elen, 1)); // elen | flen ~Pois(flen + 0.01) + for (int i = 0; i < elen; ++i) + p *= u(vtrg[i + start_trg]); // draw e_i ~Uniform + return p; +} + +prob_t PhraseConditionalUninformativeBase::p0(const vector& vsrc, + const vector& vtrg, + int start_src, int start_trg) const { + const int flen = vsrc.size() - start_src; + const int elen = vtrg.size() - start_trg; + prob_t p; + //p.logeq(log_poisson(elen, flen + 0.01)); // elen | flen ~Pois(flen + 0.01) + p.logeq(log_poisson(elen, 1)); // elen | flen ~Pois(flen + 0.01) + for (int i = 0; i < elen; ++i) + p *= kUNIFORM_TARGET; // draw e_i ~Uniform + return p; +} + +void Model1::LoadModel1(const string& fname) { + cerr << "Loading Model 1 parameters from " << fname << " ..." << endl; + ReadFile rf(fname); + istream& in = *rf.stream(); + string line; + unsigned lc = 0; + while(getline(in, line)) { + ++lc; + int cur = 0; + int start = 0; + while(cur < line.size() && line[cur] != ' ') { ++cur; } + assert(cur != line.size()); + line[cur] = 0; + const WordID src = TD::Convert(&line[0]); + ++cur; + start = cur; + while(cur < line.size() && line[cur] != ' ') { ++cur; } + assert(cur != line.size()); + line[cur] = 0; + WordID trg = TD::Convert(&line[start]); + const double logprob = strtod(&line[cur + 1], NULL); + if (src >= ttable.size()) ttable.resize(src + 1); + ttable[src][trg].logeq(logprob); + } + cerr << " read " << lc << " parameters.\n"; +} + +prob_t PhraseConditionalBase::p0(const vector& vsrc, + const vector& vtrg, + int start_src, int start_trg) const { + const int flen = vsrc.size() - start_src; + const int elen = vtrg.size() - start_trg; + prob_t uniform_src_alignment; uniform_src_alignment.logeq(-log(flen + 1)); + prob_t p; + p.logeq(log_poisson(elen, flen + 0.01)); // elen | flen ~Pois(flen + 0.01) + for (int i = 0; i < elen; ++i) { // for each position i in e-RHS + const WordID trg = vtrg[i + start_trg]; + prob_t tp = prob_t::Zero(); + for (int j = -1; j < flen; ++j) { + const WordID src = j < 0 ? 0 : vsrc[j + start_src]; + tp += kM1MIXTURE * model1(src, trg); + tp += kUNIFORM_MIXTURE * kUNIFORM_TARGET; + } + tp *= uniform_src_alignment; // draw a_i ~uniform + p *= tp; // draw e_i ~Model1(f_a_i) / uniform + } + if (p.is_0()) { + cerr << "Zero! " << vsrc << "\nTRG=" << vtrg << endl; + abort(); + } + return p; +} + +prob_t PhraseJointBase::p0(const vector& vsrc, + const vector& vtrg, + int start_src, int start_trg) const { + const int flen = vsrc.size() - start_src; + const int elen = vtrg.size() - start_trg; + prob_t uniform_src_alignment; uniform_src_alignment.logeq(-log(flen + 1)); + prob_t p; + p.logeq(log_poisson(flen, 1.0)); // flen ~Pois(1) + // elen | flen ~Pois(flen + 0.01) + prob_t ptrglen; ptrglen.logeq(log_poisson(elen, flen + 0.01)); + p *= ptrglen; + p *= kUNIFORM_SOURCE.pow(flen); // each f in F ~Uniform + for (int i = 0; i < elen; ++i) { // for each position i in E + const WordID trg = vtrg[i + start_trg]; + prob_t tp = prob_t::Zero(); + for (int j = -1; j < flen; ++j) { + const WordID src = j < 0 ? 0 : vsrc[j + start_src]; + tp += kM1MIXTURE * model1(src, trg); + tp += kUNIFORM_MIXTURE * kUNIFORM_TARGET; + } + tp *= uniform_src_alignment; // draw a_i ~uniform + p *= tp; // draw e_i ~Model1(f_a_i) / uniform + } + if (p.is_0()) { + cerr << "Zero! " << vsrc << "\nTRG=" << vtrg << endl; + abort(); + } + return p; +} + +prob_t PhraseJointBase_BiDir::p0(const vector& vsrc, + const vector& vtrg, + int start_src, int start_trg) const { + const int flen = vsrc.size() - start_src; + const int elen = vtrg.size() - start_trg; + prob_t uniform_src_alignment; uniform_src_alignment.logeq(-log(flen + 1)); + prob_t uniform_trg_alignment; uniform_trg_alignment.logeq(-log(elen + 1)); + + prob_t p1; + p1.logeq(log_poisson(flen, 1.0)); // flen ~Pois(1) + // elen | flen ~Pois(flen + 0.01) + prob_t ptrglen; ptrglen.logeq(log_poisson(elen, flen + 0.01)); + p1 *= ptrglen; + p1 *= kUNIFORM_SOURCE.pow(flen); // each f in F ~Uniform + for (int i = 0; i < elen; ++i) { // for each position i in E + const WordID trg = vtrg[i + start_trg]; + prob_t tp = prob_t::Zero(); + for (int j = -1; j < flen; ++j) { + const WordID src = j < 0 ? 0 : vsrc[j + start_src]; + tp += kM1MIXTURE * model1(src, trg); + tp += kUNIFORM_MIXTURE * kUNIFORM_TARGET; + } + tp *= uniform_src_alignment; // draw a_i ~uniform + p1 *= tp; // draw e_i ~Model1(f_a_i) / uniform + } + if (p1.is_0()) { + cerr << "Zero! " << vsrc << "\nTRG=" << vtrg << endl; + abort(); + } + + prob_t p2; + p2.logeq(log_poisson(elen, 1.0)); // elen ~Pois(1) + // flen | elen ~Pois(flen + 0.01) + prob_t psrclen; psrclen.logeq(log_poisson(flen, elen + 0.01)); + p2 *= psrclen; + p2 *= kUNIFORM_TARGET.pow(elen); // each f in F ~Uniform + for (int i = 0; i < flen; ++i) { // for each position i in E + const WordID src = vsrc[i + start_src]; + prob_t tp = prob_t::Zero(); + for (int j = -1; j < elen; ++j) { + const WordID trg = j < 0 ? 0 : vtrg[j + start_trg]; + tp += kM1MIXTURE * invmodel1(trg, src); + tp += kUNIFORM_MIXTURE * kUNIFORM_SOURCE; + } + tp *= uniform_trg_alignment; // draw a_i ~uniform + p2 *= tp; // draw e_i ~Model1(f_a_i) / uniform + } + if (p2.is_0()) { + cerr << "Zero! " << vsrc << "\nTRG=" << vtrg << endl; + abort(); + } + + static const prob_t kHALF(0.5); + return (p1 + p2) * kHALF; +} + +JumpBase::JumpBase() : p(200) { + for (unsigned src_len = 1; src_len < 200; ++src_len) { + map& cpd = p[src_len]; + int min_jump = 1 - src_len; + int max_jump = src_len; + prob_t z; + for (int j = min_jump; j <= max_jump; ++j) { + prob_t& cp = cpd[j]; + if (j < 0) + cp.logeq(log_poisson(1.5-j, 1)); + else if (j > 0) + cp.logeq(log_poisson(j, 1)); + cp.poweq(0.2); + z += cp; + } + for (int j = min_jump; j <= max_jump; ++j) { + cpd[j] /= z; + } + } +} + diff --git a/gi/pf/base_distributions.h b/gi/pf/base_distributions.h new file mode 100644 index 00000000..a23ac32b --- /dev/null +++ b/gi/pf/base_distributions.h @@ -0,0 +1,261 @@ +#ifndef _BASE_MEASURES_H_ +#define _BASE_MEASURES_H_ + +#include +#include +#include +#include +#include +#include + +#include "unigrams.h" +#include "trule.h" +#include "prob.h" +#include "tdict.h" +#include "sampler.h" + +inline double log_poisson(unsigned x, const double& lambda) { + assert(lambda > 0.0); + return log(lambda) * x - lgamma(x + 1) - lambda; +} + +inline double log_binom_coeff(unsigned n, unsigned k) { + assert(n >= k); + if (n == k) return 0.0; + return lgamma(n + 1) - lgamma(k + 1) - lgamma(n - k + 1); +} + +// http://en.wikipedia.org/wiki/Negative_binomial_distribution +inline double log_negative_binom(unsigned x, unsigned r, double p) { + assert(p > 0.0); + assert(p < 1.0); + return log_binom_coeff(x + r - 1, x) + r * log(1 - p) + x * log(p); +} + +inline std::ostream& operator<<(std::ostream& os, const std::vector& p) { + os << '['; + for (int i = 0; i < p.size(); ++i) + os << (i==0 ? "" : " ") << TD::Convert(p[i]); + return os << ']'; +} + +struct Model1 { + explicit Model1(const std::string& fname) : + kNULL(TD::Convert("")), + kZERO() { + LoadModel1(fname); + } + + void LoadModel1(const std::string& fname); + + // returns prob 0 if src or trg is not found + const prob_t& operator()(WordID src, WordID trg) const { + if (src == 0) src = kNULL; + if (src < ttable.size()) { + const std::map& cpd = ttable[src]; + const std::map::const_iterator it = cpd.find(trg); + if (it != cpd.end()) + return it->second; + } + return kZERO; + } + + const WordID kNULL; + const prob_t kZERO; + std::vector > ttable; +}; + +struct PoissonUniformUninformativeBase { + explicit PoissonUniformUninformativeBase(const unsigned ves) : kUNIFORM(1.0 / ves) {} + prob_t operator()(const TRule& r) const { + prob_t p; p.logeq(log_poisson(r.e_.size(), 1.0)); + prob_t q = kUNIFORM; q.poweq(r.e_.size()); + p *= q; + return p; + } + void Summary() const {} + void ResampleHyperparameters(MT19937*) {} + void Increment(const TRule&) {} + void Decrement(const TRule&) {} + prob_t Likelihood() const { return prob_t::One(); } + const prob_t kUNIFORM; +}; + +struct CompletelyUniformBase { + explicit CompletelyUniformBase(const unsigned ves) : kUNIFORM(1.0 / ves) {} + prob_t operator()(const TRule&) const { + return kUNIFORM; + } + void Summary() const {} + void ResampleHyperparameters(MT19937*) {} + void Increment(const TRule&) {} + void Decrement(const TRule&) {} + prob_t Likelihood() const { return prob_t::One(); } + const prob_t kUNIFORM; +}; + +struct UnigramWordBase { + explicit UnigramWordBase(const std::string& fname) : un(fname) {} + prob_t operator()(const TRule& r) const { + return un(r.e_); + } + const UnigramWordModel un; +}; + +struct RuleHasher { + size_t operator()(const TRule& r) const { + return hash_value(r); + } +}; + +struct TableLookupBase { + TableLookupBase(const std::string& fname); + + prob_t operator()(const TRule& rule) const { + const std::tr1::unordered_map::const_iterator it = table.find(rule); + if (it == table.end()) { + std::cerr << rule << " not found\n"; + abort(); + } + return it->second; + } + + void ResampleHyperparameters(MT19937*) {} + void Increment(const TRule&) {} + void Decrement(const TRule&) {} + prob_t Likelihood() const { return prob_t::One(); } + void Summary() const {} + + std::tr1::unordered_map table; +}; + +struct PhraseConditionalUninformativeBase { + explicit PhraseConditionalUninformativeBase(const unsigned vocab_e_size) : + kUNIFORM_TARGET(1.0 / vocab_e_size) { + assert(vocab_e_size > 0); + } + + // return p0 of rule.e_ | rule.f_ + prob_t operator()(const TRule& rule) const { + return p0(rule.f_, rule.e_, 0, 0); + } + + prob_t p0(const std::vector& vsrc, const std::vector& vtrg, int start_src, int start_trg) const; + + void Summary() const {} + void ResampleHyperparameters(MT19937*) {} + void Increment(const TRule&) {} + void Decrement(const TRule&) {} + prob_t Likelihood() const { return prob_t::One(); } + const prob_t kUNIFORM_TARGET; +}; + +struct PhraseConditionalUninformativeUnigramBase { + explicit PhraseConditionalUninformativeUnigramBase(const std::string& file, const unsigned vocab_e_size) : u(file, vocab_e_size) {} + + // return p0 of rule.e_ | rule.f_ + prob_t operator()(const TRule& rule) const { + return p0(rule.f_, rule.e_, 0, 0); + } + + prob_t p0(const std::vector& vsrc, const std::vector& vtrg, int start_src, int start_trg) const; + + const UnigramModel u; +}; + +struct PhraseConditionalBase { + explicit PhraseConditionalBase(const Model1& m1, const double m1mixture, const unsigned vocab_e_size) : + model1(m1), + kM1MIXTURE(m1mixture), + kUNIFORM_MIXTURE(1.0 - m1mixture), + kUNIFORM_TARGET(1.0 / vocab_e_size) { + assert(m1mixture >= 0.0 && m1mixture <= 1.0); + assert(vocab_e_size > 0); + } + + // return p0 of rule.e_ | rule.f_ + prob_t operator()(const TRule& rule) const { + return p0(rule.f_, rule.e_, 0, 0); + } + + prob_t p0(const std::vector& vsrc, const std::vector& vtrg, int start_src, int start_trg) const; + + const Model1& model1; + const prob_t kM1MIXTURE; // Model 1 mixture component + const prob_t kUNIFORM_MIXTURE; // uniform mixture component + const prob_t kUNIFORM_TARGET; +}; + +struct PhraseJointBase { + explicit PhraseJointBase(const Model1& m1, const double m1mixture, const unsigned vocab_e_size, const unsigned vocab_f_size) : + model1(m1), + kM1MIXTURE(m1mixture), + kUNIFORM_MIXTURE(1.0 - m1mixture), + kUNIFORM_SOURCE(1.0 / vocab_f_size), + kUNIFORM_TARGET(1.0 / vocab_e_size) { + assert(m1mixture >= 0.0 && m1mixture <= 1.0); + assert(vocab_e_size > 0); + } + + // return p0 of rule.e_ , rule.f_ + prob_t operator()(const TRule& rule) const { + return p0(rule.f_, rule.e_, 0, 0); + } + + prob_t p0(const std::vector& vsrc, const std::vector& vtrg, int start_src, int start_trg) const; + + const Model1& model1; + const prob_t kM1MIXTURE; // Model 1 mixture component + const prob_t kUNIFORM_MIXTURE; // uniform mixture component + const prob_t kUNIFORM_SOURCE; + const prob_t kUNIFORM_TARGET; +}; + +struct PhraseJointBase_BiDir { + explicit PhraseJointBase_BiDir(const Model1& m1, + const Model1& im1, + const double m1mixture, + const unsigned vocab_e_size, + const unsigned vocab_f_size) : + model1(m1), + invmodel1(im1), + kM1MIXTURE(m1mixture), + kUNIFORM_MIXTURE(1.0 - m1mixture), + kUNIFORM_SOURCE(1.0 / vocab_f_size), + kUNIFORM_TARGET(1.0 / vocab_e_size) { + assert(m1mixture >= 0.0 && m1mixture <= 1.0); + assert(vocab_e_size > 0); + } + + // return p0 of rule.e_ , rule.f_ + prob_t operator()(const TRule& rule) const { + return p0(rule.f_, rule.e_, 0, 0); + } + + prob_t p0(const std::vector& vsrc, const std::vector& vtrg, int start_src, int start_trg) const; + + const Model1& model1; + const Model1& invmodel1; + const prob_t kM1MIXTURE; // Model 1 mixture component + const prob_t kUNIFORM_MIXTURE; // uniform mixture component + const prob_t kUNIFORM_SOURCE; + const prob_t kUNIFORM_TARGET; +}; + +// base distribution for jump size multinomials +// basically p(0) = 0 and then, p(1) is max, and then +// you drop as you move to the max jump distance +struct JumpBase { + JumpBase(); + + const prob_t& operator()(int jump, unsigned src_len) const { + assert(jump != 0); + const std::map::const_iterator it = p[src_len].find(jump); + assert(it != p[src_len].end()); + return it->second; + } + std::vector > p; +}; + + +#endif diff --git a/gi/pf/base_measures.cc b/gi/pf/base_measures.cc deleted file mode 100644 index 4b1863fa..00000000 --- a/gi/pf/base_measures.cc +++ /dev/null @@ -1,241 +0,0 @@ -#include "base_measures.h" - -#include - -#include "filelib.h" - -using namespace std; - -TableLookupBase::TableLookupBase(const string& fname) { - cerr << "TableLookupBase reading from " << fname << " ..." << endl; - ReadFile rf(fname); - istream& in = *rf.stream(); - string line; - unsigned lc = 0; - const WordID kDIV = TD::Convert("|||"); - vector tmp; - vector le, lf; - TRule x; - x.lhs_ = -TD::Convert("X"); - bool flag = false; - while(getline(in, line)) { - ++lc; - if (lc % 1000000 == 0) { cerr << " [" << lc << ']' << endl; flag = false; } - else if (lc % 25000 == 0) { cerr << '.' << flush; flag = true; } - tmp.clear(); - TD::ConvertSentence(line, &tmp); - x.f_.clear(); - x.e_.clear(); - size_t pos = 0; - int cc = 0; - while(pos < tmp.size()) { - const WordID cur = tmp[pos++]; - if (cur == kDIV) { - ++cc; - } else if (cc == 0) { - x.f_.push_back(cur); - } else if (cc == 1) { - x.e_.push_back(cur); - } else if (cc == 2) { - table[x].logeq(atof(TD::Convert(cur))); - ++cc; - } else { - if (flag) cerr << endl; - cerr << "Bad format in " << lc << ": " << line << endl; abort(); - } - } - if (cc != 3) { - if (flag) cerr << endl; - cerr << "Bad format in " << lc << ": " << line << endl; abort(); - } - } - if (flag) cerr << endl; - cerr << " read " << lc << " entries\n"; -} - -prob_t PhraseConditionalUninformativeUnigramBase::p0(const vector& vsrc, - const vector& vtrg, - int start_src, int start_trg) const { - const int flen = vsrc.size() - start_src; - const int elen = vtrg.size() - start_trg; - prob_t p; - p.logeq(log_poisson(elen, flen + 0.01)); // elen | flen ~Pois(flen + 0.01) - //p.logeq(log_poisson(elen, 1)); // elen | flen ~Pois(flen + 0.01) - for (int i = 0; i < elen; ++i) - p *= u(vtrg[i + start_trg]); // draw e_i ~Uniform - return p; -} - -prob_t PhraseConditionalUninformativeBase::p0(const vector& vsrc, - const vector& vtrg, - int start_src, int start_trg) const { - const int flen = vsrc.size() - start_src; - const int elen = vtrg.size() - start_trg; - prob_t p; - //p.logeq(log_poisson(elen, flen + 0.01)); // elen | flen ~Pois(flen + 0.01) - p.logeq(log_poisson(elen, 1)); // elen | flen ~Pois(flen + 0.01) - for (int i = 0; i < elen; ++i) - p *= kUNIFORM_TARGET; // draw e_i ~Uniform - return p; -} - -void Model1::LoadModel1(const string& fname) { - cerr << "Loading Model 1 parameters from " << fname << " ..." << endl; - ReadFile rf(fname); - istream& in = *rf.stream(); - string line; - unsigned lc = 0; - while(getline(in, line)) { - ++lc; - int cur = 0; - int start = 0; - while(cur < line.size() && line[cur] != ' ') { ++cur; } - assert(cur != line.size()); - line[cur] = 0; - const WordID src = TD::Convert(&line[0]); - ++cur; - start = cur; - while(cur < line.size() && line[cur] != ' ') { ++cur; } - assert(cur != line.size()); - line[cur] = 0; - WordID trg = TD::Convert(&line[start]); - const double logprob = strtod(&line[cur + 1], NULL); - if (src >= ttable.size()) ttable.resize(src + 1); - ttable[src][trg].logeq(logprob); - } - cerr << " read " << lc << " parameters.\n"; -} - -prob_t PhraseConditionalBase::p0(const vector& vsrc, - const vector& vtrg, - int start_src, int start_trg) const { - const int flen = vsrc.size() - start_src; - const int elen = vtrg.size() - start_trg; - prob_t uniform_src_alignment; uniform_src_alignment.logeq(-log(flen + 1)); - prob_t p; - p.logeq(log_poisson(elen, flen + 0.01)); // elen | flen ~Pois(flen + 0.01) - for (int i = 0; i < elen; ++i) { // for each position i in e-RHS - const WordID trg = vtrg[i + start_trg]; - prob_t tp = prob_t::Zero(); - for (int j = -1; j < flen; ++j) { - const WordID src = j < 0 ? 0 : vsrc[j + start_src]; - tp += kM1MIXTURE * model1(src, trg); - tp += kUNIFORM_MIXTURE * kUNIFORM_TARGET; - } - tp *= uniform_src_alignment; // draw a_i ~uniform - p *= tp; // draw e_i ~Model1(f_a_i) / uniform - } - if (p.is_0()) { - cerr << "Zero! " << vsrc << "\nTRG=" << vtrg << endl; - abort(); - } - return p; -} - -prob_t PhraseJointBase::p0(const vector& vsrc, - const vector& vtrg, - int start_src, int start_trg) const { - const int flen = vsrc.size() - start_src; - const int elen = vtrg.size() - start_trg; - prob_t uniform_src_alignment; uniform_src_alignment.logeq(-log(flen + 1)); - prob_t p; - p.logeq(log_poisson(flen, 1.0)); // flen ~Pois(1) - // elen | flen ~Pois(flen + 0.01) - prob_t ptrglen; ptrglen.logeq(log_poisson(elen, flen + 0.01)); - p *= ptrglen; - p *= kUNIFORM_SOURCE.pow(flen); // each f in F ~Uniform - for (int i = 0; i < elen; ++i) { // for each position i in E - const WordID trg = vtrg[i + start_trg]; - prob_t tp = prob_t::Zero(); - for (int j = -1; j < flen; ++j) { - const WordID src = j < 0 ? 0 : vsrc[j + start_src]; - tp += kM1MIXTURE * model1(src, trg); - tp += kUNIFORM_MIXTURE * kUNIFORM_TARGET; - } - tp *= uniform_src_alignment; // draw a_i ~uniform - p *= tp; // draw e_i ~Model1(f_a_i) / uniform - } - if (p.is_0()) { - cerr << "Zero! " << vsrc << "\nTRG=" << vtrg << endl; - abort(); - } - return p; -} - -prob_t PhraseJointBase_BiDir::p0(const vector& vsrc, - const vector& vtrg, - int start_src, int start_trg) const { - const int flen = vsrc.size() - start_src; - const int elen = vtrg.size() - start_trg; - prob_t uniform_src_alignment; uniform_src_alignment.logeq(-log(flen + 1)); - prob_t uniform_trg_alignment; uniform_trg_alignment.logeq(-log(elen + 1)); - - prob_t p1; - p1.logeq(log_poisson(flen, 1.0)); // flen ~Pois(1) - // elen | flen ~Pois(flen + 0.01) - prob_t ptrglen; ptrglen.logeq(log_poisson(elen, flen + 0.01)); - p1 *= ptrglen; - p1 *= kUNIFORM_SOURCE.pow(flen); // each f in F ~Uniform - for (int i = 0; i < elen; ++i) { // for each position i in E - const WordID trg = vtrg[i + start_trg]; - prob_t tp = prob_t::Zero(); - for (int j = -1; j < flen; ++j) { - const WordID src = j < 0 ? 0 : vsrc[j + start_src]; - tp += kM1MIXTURE * model1(src, trg); - tp += kUNIFORM_MIXTURE * kUNIFORM_TARGET; - } - tp *= uniform_src_alignment; // draw a_i ~uniform - p1 *= tp; // draw e_i ~Model1(f_a_i) / uniform - } - if (p1.is_0()) { - cerr << "Zero! " << vsrc << "\nTRG=" << vtrg << endl; - abort(); - } - - prob_t p2; - p2.logeq(log_poisson(elen, 1.0)); // elen ~Pois(1) - // flen | elen ~Pois(flen + 0.01) - prob_t psrclen; psrclen.logeq(log_poisson(flen, elen + 0.01)); - p2 *= psrclen; - p2 *= kUNIFORM_TARGET.pow(elen); // each f in F ~Uniform - for (int i = 0; i < flen; ++i) { // for each position i in E - const WordID src = vsrc[i + start_src]; - prob_t tp = prob_t::Zero(); - for (int j = -1; j < elen; ++j) { - const WordID trg = j < 0 ? 0 : vtrg[j + start_trg]; - tp += kM1MIXTURE * invmodel1(trg, src); - tp += kUNIFORM_MIXTURE * kUNIFORM_SOURCE; - } - tp *= uniform_trg_alignment; // draw a_i ~uniform - p2 *= tp; // draw e_i ~Model1(f_a_i) / uniform - } - if (p2.is_0()) { - cerr << "Zero! " << vsrc << "\nTRG=" << vtrg << endl; - abort(); - } - - static const prob_t kHALF(0.5); - return (p1 + p2) * kHALF; -} - -JumpBase::JumpBase() : p(200) { - for (unsigned src_len = 1; src_len < 200; ++src_len) { - map& cpd = p[src_len]; - int min_jump = 1 - src_len; - int max_jump = src_len; - prob_t z; - for (int j = min_jump; j <= max_jump; ++j) { - prob_t& cp = cpd[j]; - if (j < 0) - cp.logeq(log_poisson(1.5-j, 1)); - else if (j > 0) - cp.logeq(log_poisson(j, 1)); - cp.poweq(0.2); - z += cp; - } - for (int j = min_jump; j <= max_jump; ++j) { - cpd[j] /= z; - } - } -} - diff --git a/gi/pf/base_measures.h b/gi/pf/base_measures.h deleted file mode 100644 index b0495bfd..00000000 --- a/gi/pf/base_measures.h +++ /dev/null @@ -1,247 +0,0 @@ -#ifndef _BASE_MEASURES_H_ -#define _BASE_MEASURES_H_ - -#include -#include -#include -#include -#include - -#include "unigrams.h" -#include "trule.h" -#include "prob.h" -#include "tdict.h" -#include "sampler.h" - -inline double log_poisson(unsigned x, const double& lambda) { - assert(lambda > 0.0); - return log(lambda) * x - lgamma(x + 1) - lambda; -} - -inline std::ostream& operator<<(std::ostream& os, const std::vector& p) { - os << '['; - for (int i = 0; i < p.size(); ++i) - os << (i==0 ? "" : " ") << TD::Convert(p[i]); - return os << ']'; -} - -struct Model1 { - explicit Model1(const std::string& fname) : - kNULL(TD::Convert("")), - kZERO() { - LoadModel1(fname); - } - - void LoadModel1(const std::string& fname); - - // returns prob 0 if src or trg is not found - const prob_t& operator()(WordID src, WordID trg) const { - if (src == 0) src = kNULL; - if (src < ttable.size()) { - const std::map& cpd = ttable[src]; - const std::map::const_iterator it = cpd.find(trg); - if (it != cpd.end()) - return it->second; - } - return kZERO; - } - - const WordID kNULL; - const prob_t kZERO; - std::vector > ttable; -}; - -struct PoissonUniformUninformativeBase { - explicit PoissonUniformUninformativeBase(const unsigned ves) : kUNIFORM(1.0 / ves) {} - prob_t operator()(const TRule& r) const { - prob_t p; p.logeq(log_poisson(r.e_.size(), 1.0)); - prob_t q = kUNIFORM; q.poweq(r.e_.size()); - p *= q; - return p; - } - void Summary() const {} - void ResampleHyperparameters(MT19937*) {} - void Increment(const TRule&) {} - void Decrement(const TRule&) {} - prob_t Likelihood() const { return prob_t::One(); } - const prob_t kUNIFORM; -}; - -struct CompletelyUniformBase { - explicit CompletelyUniformBase(const unsigned ves) : kUNIFORM(1.0 / ves) {} - prob_t operator()(const TRule&) const { - return kUNIFORM; - } - void Summary() const {} - void ResampleHyperparameters(MT19937*) {} - void Increment(const TRule&) {} - void Decrement(const TRule&) {} - prob_t Likelihood() const { return prob_t::One(); } - const prob_t kUNIFORM; -}; - -struct UnigramWordBase { - explicit UnigramWordBase(const std::string& fname) : un(fname) {} - prob_t operator()(const TRule& r) const { - return un(r.e_); - } - const UnigramWordModel un; -}; - -struct RuleHasher { - size_t operator()(const TRule& r) const { - return hash_value(r); - } -}; - -struct TableLookupBase { - TableLookupBase(const std::string& fname); - - prob_t operator()(const TRule& rule) const { - const std::tr1::unordered_map::const_iterator it = table.find(rule); - if (it == table.end()) { - std::cerr << rule << " not found\n"; - abort(); - } - return it->second; - } - - void ResampleHyperparameters(MT19937*) {} - void Increment(const TRule&) {} - void Decrement(const TRule&) {} - prob_t Likelihood() const { return prob_t::One(); } - void Summary() const {} - - std::tr1::unordered_map table; -}; - -struct PhraseConditionalUninformativeBase { - explicit PhraseConditionalUninformativeBase(const unsigned vocab_e_size) : - kUNIFORM_TARGET(1.0 / vocab_e_size) { - assert(vocab_e_size > 0); - } - - // return p0 of rule.e_ | rule.f_ - prob_t operator()(const TRule& rule) const { - return p0(rule.f_, rule.e_, 0, 0); - } - - prob_t p0(const std::vector& vsrc, const std::vector& vtrg, int start_src, int start_trg) const; - - void Summary() const {} - void ResampleHyperparameters(MT19937*) {} - void Increment(const TRule&) {} - void Decrement(const TRule&) {} - prob_t Likelihood() const { return prob_t::One(); } - const prob_t kUNIFORM_TARGET; -}; - -struct PhraseConditionalUninformativeUnigramBase { - explicit PhraseConditionalUninformativeUnigramBase(const std::string& file, const unsigned vocab_e_size) : u(file, vocab_e_size) {} - - // return p0 of rule.e_ | rule.f_ - prob_t operator()(const TRule& rule) const { - return p0(rule.f_, rule.e_, 0, 0); - } - - prob_t p0(const std::vector& vsrc, const std::vector& vtrg, int start_src, int start_trg) const; - - const UnigramModel u; -}; - -struct PhraseConditionalBase { - explicit PhraseConditionalBase(const Model1& m1, const double m1mixture, const unsigned vocab_e_size) : - model1(m1), - kM1MIXTURE(m1mixture), - kUNIFORM_MIXTURE(1.0 - m1mixture), - kUNIFORM_TARGET(1.0 / vocab_e_size) { - assert(m1mixture >= 0.0 && m1mixture <= 1.0); - assert(vocab_e_size > 0); - } - - // return p0 of rule.e_ | rule.f_ - prob_t operator()(const TRule& rule) const { - return p0(rule.f_, rule.e_, 0, 0); - } - - prob_t p0(const std::vector& vsrc, const std::vector& vtrg, int start_src, int start_trg) const; - - const Model1& model1; - const prob_t kM1MIXTURE; // Model 1 mixture component - const prob_t kUNIFORM_MIXTURE; // uniform mixture component - const prob_t kUNIFORM_TARGET; -}; - -struct PhraseJointBase { - explicit PhraseJointBase(const Model1& m1, const double m1mixture, const unsigned vocab_e_size, const unsigned vocab_f_size) : - model1(m1), - kM1MIXTURE(m1mixture), - kUNIFORM_MIXTURE(1.0 - m1mixture), - kUNIFORM_SOURCE(1.0 / vocab_f_size), - kUNIFORM_TARGET(1.0 / vocab_e_size) { - assert(m1mixture >= 0.0 && m1mixture <= 1.0); - assert(vocab_e_size > 0); - } - - // return p0 of rule.e_ , rule.f_ - prob_t operator()(const TRule& rule) const { - return p0(rule.f_, rule.e_, 0, 0); - } - - prob_t p0(const std::vector& vsrc, const std::vector& vtrg, int start_src, int start_trg) const; - - const Model1& model1; - const prob_t kM1MIXTURE; // Model 1 mixture component - const prob_t kUNIFORM_MIXTURE; // uniform mixture component - const prob_t kUNIFORM_SOURCE; - const prob_t kUNIFORM_TARGET; -}; - -struct PhraseJointBase_BiDir { - explicit PhraseJointBase_BiDir(const Model1& m1, - const Model1& im1, - const double m1mixture, - const unsigned vocab_e_size, - const unsigned vocab_f_size) : - model1(m1), - invmodel1(im1), - kM1MIXTURE(m1mixture), - kUNIFORM_MIXTURE(1.0 - m1mixture), - kUNIFORM_SOURCE(1.0 / vocab_f_size), - kUNIFORM_TARGET(1.0 / vocab_e_size) { - assert(m1mixture >= 0.0 && m1mixture <= 1.0); - assert(vocab_e_size > 0); - } - - // return p0 of rule.e_ , rule.f_ - prob_t operator()(const TRule& rule) const { - return p0(rule.f_, rule.e_, 0, 0); - } - - prob_t p0(const std::vector& vsrc, const std::vector& vtrg, int start_src, int start_trg) const; - - const Model1& model1; - const Model1& invmodel1; - const prob_t kM1MIXTURE; // Model 1 mixture component - const prob_t kUNIFORM_MIXTURE; // uniform mixture component - const prob_t kUNIFORM_SOURCE; - const prob_t kUNIFORM_TARGET; -}; - -// base distribution for jump size multinomials -// basically p(0) = 0 and then, p(1) is max, and then -// you drop as you move to the max jump distance -struct JumpBase { - JumpBase(); - - const prob_t& operator()(int jump, unsigned src_len) const { - assert(jump != 0); - const std::map::const_iterator it = p[src_len].find(jump); - assert(it != p[src_len].end()); - return it->second; - } - std::vector > p; -}; - - -#endif diff --git a/mteval/ns.cc b/mteval/ns.cc index 68c8deaa..da678b84 100644 --- a/mteval/ns.cc +++ b/mteval/ns.cc @@ -136,6 +136,10 @@ struct BleuSegmentEvaluator : public SegmentEvaluator { float* correct, // N elements reserved float* hyp, // N elements reserved bool clip_counts = true) const { + // clear clipping stats + for (typename NGramCountMap::iterator it = ngrams_.begin(); it != ngrams_.end(); ++it) + it->second.second = 0; + vector ngram(N); *correct *= 0; *hyp *= 0; diff --git a/mteval/ns.h b/mteval/ns.h index 622265db..d88c263b 100644 --- a/mteval/ns.h +++ b/mteval/ns.h @@ -6,6 +6,7 @@ #include #include #include "wordid.h" +#include class SufficientStats { public: @@ -43,6 +44,11 @@ class SufficientStats { bool operator==(const SufficientStats& other) const { return other.fields == fields; } + bool IsAdditiveIdentity() const { + for (unsigned i = 0; i < fields.size(); ++i) + if (fields[i]) return false; + return true; + } size_t size() const { return fields.size(); } float operator[](size_t i) const { if (i < fields.size()) return fields[i]; @@ -54,12 +60,12 @@ class SufficientStats { std::vector fields; }; -inline const SufficientStats& operator+(const SufficientStats& a, const SufficientStats& b) { +inline const SufficientStats operator+(const SufficientStats& a, const SufficientStats& b) { SufficientStats res(a); return res += b; } -inline const SufficientStats& operator-(const SufficientStats& a, const SufficientStats& b) { +inline const SufficientStats operator-(const SufficientStats& a, const SufficientStats& b) { SufficientStats res(a); return res -= b; } diff --git a/vest/ces.cc b/vest/ces.cc index 4ae6b695..cd89aa69 100644 --- a/vest/ces.cc +++ b/vest/ces.cc @@ -4,25 +4,32 @@ #include #include -#include "aligner.h" +// TODO, if AER is to be optimized again, we will need this +// #include "aligner.h" #include "lattice.h" #include "viterbi_envelope.h" #include "error_surface.h" +#include "ns.h" using boost::shared_ptr; using namespace std; const bool minimize_segments = true; // if adjacent segments have equal scores, merge them -void ComputeErrorSurface(const SentenceScorer& ss, const ViterbiEnvelope& ve, ErrorSurface* env, const ScoreType type, const Hypergraph& hg) { +void ComputeErrorSurface(const SegmentEvaluator& ss, + const ViterbiEnvelope& ve, + ErrorSurface* env, + const EvaluationMetric* metric, + const Hypergraph& hg) { vector prev_trans; const vector >& ienv = ve.GetSortedSegs(); env->resize(ienv.size()); - ScoreP prev_score; + SufficientStats prev_score; // defaults to 0 int j = 0; for (int i = 0; i < ienv.size(); ++i) { const Segment& seg = *ienv[i]; vector trans; +#if 0 if (type == AER) { vector edges(hg.edges_.size(), false); seg.CollectEdgesUsed(&edges); // get the set of edges in the viterbi @@ -46,34 +53,31 @@ void ComputeErrorSurface(const SentenceScorer& ss, const ViterbiEnvelope& ve, Er string tstr = os.str(); TD::ConvertSentence(tstr.substr(tstr.rfind(" ||| ") + 5), &trans); } else { +#endif seg.ConstructTranslation(&trans); - } - // cerr << "Scoring: " << TD::GetString(trans) << endl; + //} + //cerr << "Scoring: " << TD::GetString(trans) << endl; if (trans == prev_trans) { if (!minimize_segments) { - assert(prev_score); // if this fails, it means - // the decoder can generate null translations ErrorSegment& out = (*env)[j]; - out.delta = prev_score->GetZero(); + out.delta.fields.clear(); out.x = seg.x; ++j; } - // cerr << "Identical translation, skipping scoring\n"; + //cerr << "Identical translation, skipping scoring\n"; } else { - ScoreP score = ss.ScoreCandidate(trans); + SufficientStats score; + ss.Evaluate(trans, &score); // cerr << "score= " << score->ComputeScore() << "\n"; - ScoreP cur_delta_p = score->GetZero(); - Score* cur_delta = cur_delta_p.get(); - // just record the score diffs - if (!prev_score) - prev_score = score->GetZero(); - - score->Subtract(*prev_score, cur_delta); + //string x1; score.Encode(&x1); cerr << "STATS: " << x1 << endl; + const SufficientStats delta = score - prev_score; + //string x2; delta.Encode(&x2); cerr << "DELTA: " << x2 << endl; + //string xx; delta.Encode(&xx); cerr << xx << endl; prev_trans.swap(trans); prev_score = score; - if ((!minimize_segments) || (!cur_delta->IsAdditiveIdentity())) { + if ((!minimize_segments) || (!delta.IsAdditiveIdentity())) { ErrorSegment& out = (*env)[j]; - out.delta = cur_delta_p; + out.delta = delta; out.x = seg.x; ++j; } diff --git a/vest/ces.h b/vest/ces.h index 2f098990..e021e715 100644 --- a/vest/ces.h +++ b/vest/ces.h @@ -1,12 +1,16 @@ #ifndef _CES_H_ #define _CES_H_ -#include "scorer.h" - class ViterbiEnvelope; class Hypergraph; +class SegmentEvaluator; class ErrorSurface; +class EvaluationMetric; -void ComputeErrorSurface(const SentenceScorer& ss, const ViterbiEnvelope& ve, ErrorSurface* es, const ScoreType type, const Hypergraph& hg); +void ComputeErrorSurface(const SegmentEvaluator& ss, + const ViterbiEnvelope& ve, + ErrorSurface* es, + const EvaluationMetric* metric, + const Hypergraph& hg); #endif diff --git a/vest/dist-vest.pl b/vest/dist-vest.pl index c382a972..8cde748b 100755 --- a/vest/dist-vest.pl +++ b/vest/dist-vest.pl @@ -364,7 +364,7 @@ while (1){ $mapoutput =~ s/mapinput/mapoutput/; push @mapoutputs, "$dir/splag.$im1/$mapoutput"; $o2i{"$dir/splag.$im1/$mapoutput"} = "$dir/splag.$im1/$shard"; - my $script = "$MAPPER -s $srcFile -l $metric $refs_comma_sep < $dir/splag.$im1/$shard | sort -t \$'\\t' -k 1 > $dir/splag.$im1/$mapoutput"; + my $script = "$MAPPER -s $srcFile -m $metric $refs_comma_sep < $dir/splag.$im1/$shard | sort -t \$'\\t' -k 1 > $dir/splag.$im1/$mapoutput"; if ($use_make) { my $script_file = "$dir/scripts/map.$shard"; open F, ">$script_file" or die "Can't write $script_file: $!"; @@ -424,7 +424,7 @@ while (1){ print STDERR "Results for $tol/$til lines\n"; print STDERR "\nSORTING AND RUNNING VEST REDUCER\n"; print STDERR unchecked_output("date"); - $cmd="sort -t \$'\\t' -k 1 @mapoutputs | $REDUCER -l $metric > $dir/redoutput.$im1"; + $cmd="sort -t \$'\\t' -k 1 @mapoutputs | $REDUCER -m $metric > $dir/redoutput.$im1"; print STDERR "COMMAND:\n$cmd\n"; check_bash_call($cmd); $cmd="sort -nk3 $DIR_FLAG '-t|' $dir/redoutput.$im1 | head -1"; diff --git a/vest/error_surface.cc b/vest/error_surface.cc index 754aa8de..515b67f8 100644 --- a/vest/error_surface.cc +++ b/vest/error_surface.cc @@ -5,8 +5,7 @@ using namespace std; -ErrorSurface::~ErrorSurface() { -} +ErrorSurface::~ErrorSurface() {} void ErrorSurface::Serialize(std::string* out) const { const int segments = this->size(); @@ -15,8 +14,8 @@ void ErrorSurface::Serialize(std::string* out) const { for (int i = 0; i < segments; ++i) { const ErrorSegment& cur = (*this)[i]; string senc; - cur.delta->Encode(&senc); - assert(senc.size() < 256); + cur.delta.Encode(&senc); + assert(senc.size() < 1024); unsigned char len = senc.size(); os.write((const char*)&cur.x, sizeof(cur.x)); os.write((const char*)&len, sizeof(len)); @@ -25,7 +24,7 @@ void ErrorSurface::Serialize(std::string* out) const { *out = os.str(); } -void ErrorSurface::Deserialize(ScoreType type, const std::string& in) { +void ErrorSurface::Deserialize(const std::string& in) { istringstream is(in, ios::binary); int segments; is.read((char*)&segments, sizeof(segments)); @@ -37,7 +36,7 @@ void ErrorSurface::Deserialize(ScoreType type, const std::string& in) { is.read((char*)&len, sizeof(len)); string senc(len, '\0'); assert(senc.size() == len); is.read((char*)&senc[0], len); - cur.delta = SentenceScorer::CreateScoreFromString(type, senc); + cur.delta = SufficientStats(senc); } } diff --git a/vest/error_surface.h b/vest/error_surface.h index ad728cfa..bb65847b 100644 --- a/vest/error_surface.h +++ b/vest/error_surface.h @@ -4,13 +4,13 @@ #include #include -#include "scorer.h" +#include "ns.h" class Score; struct ErrorSegment { double x; - ScoreP delta; + SufficientStats delta; ErrorSegment() : x(0), delta() {} }; @@ -18,7 +18,7 @@ class ErrorSurface : public std::vector { public: ~ErrorSurface(); void Serialize(std::string* out) const; - void Deserialize(ScoreType type, const std::string& in); + void Deserialize(const std::string& in); }; #endif diff --git a/vest/line_optimizer.cc b/vest/line_optimizer.cc index 7303df8d..49443fbe 100644 --- a/vest/line_optimizer.cc +++ b/vest/line_optimizer.cc @@ -4,7 +4,7 @@ #include #include "sparse_vector.h" -#include "scorer.h" +#include "ns.h" using namespace std; @@ -18,6 +18,7 @@ struct IntervalComp { }; double LineOptimizer::LineOptimize( + const EvaluationMetric* metric, const vector& surfaces, const LineOptimizer::ScoreType type, float* best_score, @@ -32,8 +33,7 @@ double LineOptimizer::LineOptimize( } sort(all_ints.begin(), all_ints.end(), IntervalComp()); double last_boundary = all_ints.front()->x; - ScoreP accp = all_ints.front()->delta->GetZero(); - Score *acc=accp.get(); + SufficientStats acc; float& cur_best_score = *best_score; cur_best_score = (type == MAXIMIZE_SCORE ? -numeric_limits::max() : numeric_limits::max()); @@ -42,9 +42,8 @@ double LineOptimizer::LineOptimize( for (vector::iterator i = all_ints.begin(); i != all_ints.end(); ++i) { const ErrorSegment& seg = **i; - assert(seg.delta); if (seg.x - last_boundary > epsilon) { - float sco = acc->ComputeScore(); + float sco = metric->ComputeScore(acc); if ((type == MAXIMIZE_SCORE && sco > cur_best_score) || (type == MINIMIZE_SCORE && sco < cur_best_score) ) { cur_best_score = sco; @@ -54,16 +53,18 @@ double LineOptimizer::LineOptimize( } else { pos = last_boundary + (seg.x - last_boundary) / 2; } - // cerr << "NEW BEST: " << pos << " (score=" << cur_best_score << ")\n"; + //cerr << "NEW BEST: " << pos << " (score=" << cur_best_score << ")\n"; } - // string xx; acc->ScoreDetails(&xx); cerr << "---- " << xx; + // string xx = metric->DetailedScore(acc); cerr << "---- " << xx; // cerr << "---- s=" << sco << "\n"; last_boundary = seg.x; } // cerr << "x-boundary=" << seg.x << "\n"; - acc->PlusEquals(*seg.delta); + //string x2; acc.Encode(&x2); cerr << " ACC: " << x2 << endl; + //string x1; seg.delta.Encode(&x1); cerr << " DELTA: " << x1 << endl; + acc += seg.delta; } - float sco = acc->ComputeScore(); + float sco = metric->ComputeScore(acc); if ((type == MAXIMIZE_SCORE && sco > cur_best_score) || (type == MINIMIZE_SCORE && sco < cur_best_score) ) { cur_best_score = sco; @@ -107,3 +108,4 @@ void LineOptimizer::CreateOptimizationDirections( RandomUnitVector(features_to_optimize, &out[i], rng); cerr << "Generated " << out.size() << " total axes to optimize along.\n"; } + diff --git a/vest/line_optimizer.h b/vest/line_optimizer.h index 99a591f4..83819f41 100644 --- a/vest/line_optimizer.h +++ b/vest/line_optimizer.h @@ -7,6 +7,7 @@ #include "error_surface.h" #include "sampler.h" +class EvaluationMetric; class Weights; struct LineOptimizer { @@ -18,6 +19,7 @@ struct LineOptimizer { // merge all the error surfaces together into a global // error surface and find (the middle of) the best segment static double LineOptimize( + const EvaluationMetric* metric, const std::vector& envs, const LineOptimizer::ScoreType type, float* best_score, diff --git a/vest/lo_test.cc b/vest/lo_test.cc index f5638600..a67f65e1 100644 --- a/vest/lo_test.cc +++ b/vest/lo_test.cc @@ -5,6 +5,8 @@ #include #include +#include "ns.h" +#include "ns_docscorer.h" #include "ces.h" #include "fdict.h" #include "hg.h" @@ -15,7 +17,6 @@ #include "viterbi.h" #include "viterbi_envelope.h" #include "line_optimizer.h" -#include "scorer.h" using namespace std; using boost::shared_ptr; @@ -141,9 +142,6 @@ TEST_F(OptTest, TestS1) { TD::ConvertSentence(ref22, &refs2[1]); TD::ConvertSentence(ref32, &refs2[2]); TD::ConvertSentence(ref42, &refs2[3]); - ScoreType type = ScoreTypeFromString("ibm_bleu"); - ScorerP scorer1 = SentenceScorer::CreateSentenceScorer(type, refs1); - ScorerP scorer2 = SentenceScorer::CreateSentenceScorer(type, refs2); vector envs(2); RandomNumberGenerator rng; @@ -167,14 +165,17 @@ TEST_F(OptTest, TestS1) { envs[1] = Inside(hg2, NULL, wf); vector es(2); - ComputeErrorSurface(*scorer1, envs[0], &es[0], IBM_BLEU, hg); - ComputeErrorSurface(*scorer2, envs[1], &es[1], IBM_BLEU, hg2); + EvaluationMetric* metric = EvaluationMetric::Instance("IBM_BLEU"); + boost::shared_ptr scorer1 = metric->CreateSegmentEvaluator(refs1); + boost::shared_ptr scorer2 = metric->CreateSegmentEvaluator(refs2); + ComputeErrorSurface(*scorer1, envs[0], &es[0], metric, hg); + ComputeErrorSurface(*scorer2, envs[1], &es[1], metric, hg2); cerr << envs[0].size() << " " << envs[1].size() << endl; cerr << es[0].size() << " " << es[1].size() << endl; envs.clear(); clock_t t_env=clock(); float score; - double m = LineOptimizer::LineOptimize(es, LineOptimizer::MAXIMIZE_SCORE, &score); + double m = LineOptimizer::LineOptimize(metric,es, LineOptimizer::MAXIMIZE_SCORE, &score); clock_t t_opt=clock(); cerr << "line optimizer returned: " << m << " (SCORE=" << score << ")\n"; EXPECT_FLOAT_EQ(0.48719698, score); @@ -217,15 +218,15 @@ TEST_F(OptTest,TestZeroOrigin) { vector envs(1); envs[0] = Inside(hg, NULL, wf); - ScoreType type = ScoreTypeFromString("ibm_bleu"); vector > mr(4); TD::ConvertSentence("untitled", &mr[0]); TD::ConvertSentence("with no title", &mr[1]); TD::ConvertSentence("without a title", &mr[2]); TD::ConvertSentence("without title", &mr[3]); - ScorerP scorer1 = SentenceScorer::CreateSentenceScorer(type, mr); + EvaluationMetric* metric = EvaluationMetric::Instance("IBM_BLEU"); + boost::shared_ptr scorer1 = metric->CreateSegmentEvaluator(mr); vector es(1); - ComputeErrorSurface(*scorer1, envs[0], &es[0], IBM_BLEU, hg); + ComputeErrorSurface(*scorer1, envs[0], &es[0], metric, hg); } int main(int argc, char **argv) { diff --git a/vest/mr_vest_map.cc b/vest/mr_vest_map.cc index 71dda6d7..8f6e085d 100644 --- a/vest/mr_vest_map.cc +++ b/vest/mr_vest_map.cc @@ -6,11 +6,12 @@ #include #include +#include "ns.h" +#include "ns_docscorer.h" #include "ces.h" #include "filelib.h" #include "stringlib.h" #include "sparse_vector.h" -#include "scorer.h" #include "viterbi_envelope.h" #include "inside_outside.h" #include "error_surface.h" @@ -25,7 +26,7 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) { opts.add_options() ("reference,r",po::value >(), "[REQD] Reference translation (tokenized text)") ("source,s",po::value(), "Source file (ignored, except for AER)") - ("loss_function,l",po::value()->default_value("ibm_bleu"), "Loss function being optimized") + ("evaluation_metric,m",po::value()->default_value("ibm_bleu"), "Evaluation metric being optimized") ("input,i",po::value()->default_value("-"), "Input file to map (- is STDIN)") ("help,h", "Help"); po::options_description dcmdline_options; @@ -67,10 +68,10 @@ bool ReadSparseVectorString(const string& s, SparseVector* v) { int main(int argc, char** argv) { po::variables_map conf; InitCommandLine(argc, argv, &conf); - const string loss_function = conf["loss_function"].as(); - ScoreType type = ScoreTypeFromString(loss_function); - DocScorer ds(type, conf["reference"].as >(), conf["source"].as()); - cerr << "Loaded " << ds.size() << " references for scoring with " << loss_function << endl; + const string evaluation_metric = conf["evaluation_metric"].as(); + EvaluationMetric* metric = EvaluationMetric::Instance(evaluation_metric); + DocumentScorer ds(metric, conf["reference"].as >()); + cerr << "Loaded " << ds.size() << " references for scoring with " << evaluation_metric << endl; Hypergraph hg; string last_file; ReadFile in_read(conf["input"].as()); @@ -97,7 +98,8 @@ int main(int argc, char** argv) { ViterbiEnvelopeWeightFunction wf(origin, axis); ViterbiEnvelope ve = Inside(hg, NULL, wf); ErrorSurface es; - ComputeErrorSurface(*ds[sent_id], ve, &es, type, hg); + + ComputeErrorSurface(*ds[sent_id], ve, &es, metric, hg); //cerr << "Viterbi envelope has " << ve.size() << " segments\n"; // cerr << "Error surface has " << es.size() << " segments\n"; string val; diff --git a/vest/mr_vest_reduce.cc b/vest/mr_vest_reduce.cc index 3df52020..dda61f88 100644 --- a/vest/mr_vest_reduce.cc +++ b/vest/mr_vest_reduce.cc @@ -10,6 +10,7 @@ #include "error_surface.h" #include "line_optimizer.h" #include "b64tools.h" +#include "stringlib.h" using namespace std; namespace po = boost::program_options; @@ -17,12 +18,12 @@ namespace po = boost::program_options; void InitCommandLine(int argc, char** argv, po::variables_map* conf) { po::options_description opts("Configuration options"); opts.add_options() - ("loss_function,l",po::value(), "Loss function being optimized") + ("evaluation_metric,m",po::value(), "Evaluation metric (IBM_BLEU, etc.)") ("help,h", "Help"); po::options_description dcmdline_options; dcmdline_options.add(opts); po::store(parse_command_line(argc, argv, dcmdline_options), *conf); - bool flag = conf->count("loss_function") == 0; + bool flag = conf->count("evaluation_metric") == 0; if (flag || conf->count("help")) { cerr << dcmdline_options << endl; exit(1); @@ -32,30 +33,27 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) { int main(int argc, char** argv) { po::variables_map conf; InitCommandLine(argc, argv, &conf); - const string loss_function = conf["loss_function"].as(); - ScoreType type = ScoreTypeFromString(loss_function); + const string evaluation_metric = conf["evaluation_metric"].as(); LineOptimizer::ScoreType opt_type = LineOptimizer::MAXIMIZE_SCORE; - if (type == TER || type == AER) { + if (UppercaseString(evaluation_metric) == "TER") opt_type = LineOptimizer::MINIMIZE_SCORE; - } - string last_key; + EvaluationMetric* metric = EvaluationMetric::Instance(evaluation_metric); + vector esv; - while(cin) { - string line; - getline(cin, line); - if (line.empty()) continue; + string last_key, line, key, val; + while(getline(cin, line)) { size_t ks = line.find("\t"); assert(string::npos != ks); assert(ks > 2); - string key = line.substr(2, ks - 2); - string val = line.substr(ks + 1); + key = line.substr(2, ks - 2); + val = line.substr(ks + 1); if (key != last_key) { if (!last_key.empty()) { float score; - double x = LineOptimizer::LineOptimize(esv, opt_type, &score); + double x = LineOptimizer::LineOptimize(metric, esv, opt_type, &score); cout << last_key << "|" << x << "|" << score << endl; } - last_key = key; + last_key.swap(key); esv.clear(); } if (val.size() % 4 != 0) { @@ -68,13 +66,11 @@ int main(int argc, char** argv) { continue; } esv.push_back(ErrorSurface()); - esv.back().Deserialize(type, encoded); + esv.back().Deserialize(encoded); } if (!esv.empty()) { - // cerr << "ESV=" << esv.size() << endl; - // for (int i = 0; i < esv.size(); ++i) { cerr << esv[i].size() << endl; } float score; - double x = LineOptimizer::LineOptimize(esv, opt_type, &score); + double x = LineOptimizer::LineOptimize(metric, esv, opt_type, &score); cout << last_key << "|" << x << "|" << score << endl; } return 0; -- cgit v1.2.3 From e5831933364a4cab5084db49281a855baea8e09c Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Thu, 2 Feb 2012 12:33:13 -0500 Subject: fix broken build --- gi/pf/Makefile.am | 2 +- gi/pf/base_distributions.cc | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'gi/pf') diff --git a/gi/pf/Makefile.am b/gi/pf/Makefile.am index 28367e67..8d43f36d 100644 --- a/gi/pf/Makefile.am +++ b/gi/pf/Makefile.am @@ -1,7 +1,7 @@ bin_PROGRAMS = cbgi brat dpnaive pfbrat pfdist itg pfnaive condnaive align-lexonly align-lexonly-pyp noinst_LIBRARIES = libpf.a -libpf_a_SOURCES = base_measures.cc reachability.cc cfg_wfst_composer.cc corpus.cc unigrams.cc ngram_base.cc +libpf_a_SOURCES = base_distributions.cc reachability.cc cfg_wfst_composer.cc corpus.cc unigrams.cc ngram_base.cc align_lexonly_SOURCES = align-lexonly.cc diff --git a/gi/pf/base_distributions.cc b/gi/pf/base_distributions.cc index 4b1863fa..d362fd76 100644 --- a/gi/pf/base_distributions.cc +++ b/gi/pf/base_distributions.cc @@ -1,4 +1,4 @@ -#include "base_measures.h" +#include "base_distributions.h" #include -- cgit v1.2.3 From d91750f35d4d7edfc77a589ae92100d523068ad7 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Fri, 3 Feb 2012 21:11:40 -0500 Subject: fix broken build --- gi/pf/align-lexonly-pyp.cc | 2 +- gi/pf/align-lexonly.cc | 2 +- gi/pf/conditional_pseg.h | 2 +- gi/pf/condnaive.cc | 2 +- gi/pf/dpnaive.cc | 2 +- gi/pf/monotonic_pseg.h | 2 +- gi/pf/pfdist.cc | 2 +- gi/pf/pfnaive.cc | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) (limited to 'gi/pf') diff --git a/gi/pf/align-lexonly-pyp.cc b/gi/pf/align-lexonly-pyp.cc index d2630a2b..e24cb457 100644 --- a/gi/pf/align-lexonly-pyp.cc +++ b/gi/pf/align-lexonly-pyp.cc @@ -7,7 +7,7 @@ #include #include "array2d.h" -#include "base_measures.h" +#include "base_distributions.h" #include "monotonic_pseg.h" #include "conditional_pseg.h" #include "trule.h" diff --git a/gi/pf/align-lexonly.cc b/gi/pf/align-lexonly.cc index 76e2e009..8c1d689f 100644 --- a/gi/pf/align-lexonly.cc +++ b/gi/pf/align-lexonly.cc @@ -7,7 +7,7 @@ #include #include "array2d.h" -#include "base_measures.h" +#include "base_distributions.h" #include "monotonic_pseg.h" #include "conditional_pseg.h" #include "trule.h" diff --git a/gi/pf/conditional_pseg.h b/gi/pf/conditional_pseg.h index db951d15..0aa5e8e0 100644 --- a/gi/pf/conditional_pseg.h +++ b/gi/pf/conditional_pseg.h @@ -10,7 +10,7 @@ #include "ccrp_nt.h" #include "mfcr.h" #include "trule.h" -#include "base_measures.h" +#include "base_distributions.h" #include "tdict.h" template diff --git a/gi/pf/condnaive.cc b/gi/pf/condnaive.cc index 52ddbbfe..3ea88016 100644 --- a/gi/pf/condnaive.cc +++ b/gi/pf/condnaive.cc @@ -6,7 +6,7 @@ #include #include -#include "base_measures.h" +#include "base_distributions.h" #include "monotonic_pseg.h" #include "conditional_pseg.h" #include "trule.h" diff --git a/gi/pf/dpnaive.cc b/gi/pf/dpnaive.cc index db1c43c7..469dff5c 100644 --- a/gi/pf/dpnaive.cc +++ b/gi/pf/dpnaive.cc @@ -6,7 +6,7 @@ #include #include -#include "base_measures.h" +#include "base_distributions.h" #include "monotonic_pseg.h" #include "trule.h" #include "tdict.h" diff --git a/gi/pf/monotonic_pseg.h b/gi/pf/monotonic_pseg.h index 301aa6d8..10d171fe 100644 --- a/gi/pf/monotonic_pseg.h +++ b/gi/pf/monotonic_pseg.h @@ -6,7 +6,7 @@ #include "prob.h" #include "ccrp_nt.h" #include "trule.h" -#include "base_measures.h" +#include "base_distributions.h" template struct MonotonicParallelSegementationModel { diff --git a/gi/pf/pfdist.cc b/gi/pf/pfdist.cc index aae5f798..ef08a165 100644 --- a/gi/pf/pfdist.cc +++ b/gi/pf/pfdist.cc @@ -7,7 +7,7 @@ #include #include "pf.h" -#include "base_measures.h" +#include "base_distributions.h" #include "reachability.h" #include "viterbi.h" #include "hg.h" diff --git a/gi/pf/pfnaive.cc b/gi/pf/pfnaive.cc index 728ec00d..acba9d22 100644 --- a/gi/pf/pfnaive.cc +++ b/gi/pf/pfnaive.cc @@ -7,7 +7,7 @@ #include #include "pf.h" -#include "base_measures.h" +#include "base_distributions.h" #include "monotonic_pseg.h" #include "reachability.h" #include "viterbi.h" -- cgit v1.2.3 From 648fd70ec05997003e801e113d825c84e55e01ca Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Wed, 8 Feb 2012 16:22:55 -0500 Subject: move widely duplicated math functions into m.h header --- .gitignore | 1 + gi/pf/base_distributions.cc | 22 +++++------ gi/pf/base_distributions.h | 21 +--------- gi/pf/conditional_pseg.h | 3 +- gi/pf/pfdist.cc | 6 +-- gi/pf/pfnaive.cc | 4 +- phrasinator/gibbs_train_plm.cc | 8 +--- utils/Makefile.am | 5 ++- utils/m.h | 89 ++++++++++++++++++++++++++++++++++++++++++ utils/m_test.cc | 75 +++++++++++++++++++++++++++++++++++ utils/mfcr.h | 22 ++--------- 11 files changed, 194 insertions(+), 62 deletions(-) create mode 100644 utils/m.h create mode 100644 utils/m_test.cc (limited to 'gi/pf') diff --git a/.gitignore b/.gitignore index ab8bf2c7..4f75d153 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ mira/kbest_mira +utils/m_test sa-extract/calignment.c sa-extract/calignment.so sa-extract/cdat.c diff --git a/gi/pf/base_distributions.cc b/gi/pf/base_distributions.cc index d362fd76..d9761005 100644 --- a/gi/pf/base_distributions.cc +++ b/gi/pf/base_distributions.cc @@ -59,7 +59,7 @@ prob_t PhraseConditionalUninformativeUnigramBase::p0(const vector& vsrc, const int flen = vsrc.size() - start_src; const int elen = vtrg.size() - start_trg; prob_t p; - p.logeq(log_poisson(elen, flen + 0.01)); // elen | flen ~Pois(flen + 0.01) + p.logeq(Md::log_poisson(elen, flen + 0.01)); // elen | flen ~Pois(flen + 0.01) //p.logeq(log_poisson(elen, 1)); // elen | flen ~Pois(flen + 0.01) for (int i = 0; i < elen; ++i) p *= u(vtrg[i + start_trg]); // draw e_i ~Uniform @@ -73,7 +73,7 @@ prob_t PhraseConditionalUninformativeBase::p0(const vector& vsrc, const int elen = vtrg.size() - start_trg; prob_t p; //p.logeq(log_poisson(elen, flen + 0.01)); // elen | flen ~Pois(flen + 0.01) - p.logeq(log_poisson(elen, 1)); // elen | flen ~Pois(flen + 0.01) + p.logeq(Md::log_poisson(elen, 1)); // elen | flen ~Pois(flen + 0.01) for (int i = 0; i < elen; ++i) p *= kUNIFORM_TARGET; // draw e_i ~Uniform return p; @@ -113,7 +113,7 @@ prob_t PhraseConditionalBase::p0(const vector& vsrc, const int elen = vtrg.size() - start_trg; prob_t uniform_src_alignment; uniform_src_alignment.logeq(-log(flen + 1)); prob_t p; - p.logeq(log_poisson(elen, flen + 0.01)); // elen | flen ~Pois(flen + 0.01) + p.logeq(Md::log_poisson(elen, flen + 0.01)); // elen | flen ~Pois(flen + 0.01) for (int i = 0; i < elen; ++i) { // for each position i in e-RHS const WordID trg = vtrg[i + start_trg]; prob_t tp = prob_t::Zero(); @@ -139,9 +139,9 @@ prob_t PhraseJointBase::p0(const vector& vsrc, const int elen = vtrg.size() - start_trg; prob_t uniform_src_alignment; uniform_src_alignment.logeq(-log(flen + 1)); prob_t p; - p.logeq(log_poisson(flen, 1.0)); // flen ~Pois(1) + p.logeq(Md::log_poisson(flen, 1.0)); // flen ~Pois(1) // elen | flen ~Pois(flen + 0.01) - prob_t ptrglen; ptrglen.logeq(log_poisson(elen, flen + 0.01)); + prob_t ptrglen; ptrglen.logeq(Md::log_poisson(elen, flen + 0.01)); p *= ptrglen; p *= kUNIFORM_SOURCE.pow(flen); // each f in F ~Uniform for (int i = 0; i < elen; ++i) { // for each position i in E @@ -171,9 +171,9 @@ prob_t PhraseJointBase_BiDir::p0(const vector& vsrc, prob_t uniform_trg_alignment; uniform_trg_alignment.logeq(-log(elen + 1)); prob_t p1; - p1.logeq(log_poisson(flen, 1.0)); // flen ~Pois(1) + p1.logeq(Md::log_poisson(flen, 1.0)); // flen ~Pois(1) // elen | flen ~Pois(flen + 0.01) - prob_t ptrglen; ptrglen.logeq(log_poisson(elen, flen + 0.01)); + prob_t ptrglen; ptrglen.logeq(Md::log_poisson(elen, flen + 0.01)); p1 *= ptrglen; p1 *= kUNIFORM_SOURCE.pow(flen); // each f in F ~Uniform for (int i = 0; i < elen; ++i) { // for each position i in E @@ -193,9 +193,9 @@ prob_t PhraseJointBase_BiDir::p0(const vector& vsrc, } prob_t p2; - p2.logeq(log_poisson(elen, 1.0)); // elen ~Pois(1) + p2.logeq(Md::log_poisson(elen, 1.0)); // elen ~Pois(1) // flen | elen ~Pois(flen + 0.01) - prob_t psrclen; psrclen.logeq(log_poisson(flen, elen + 0.01)); + prob_t psrclen; psrclen.logeq(Md::log_poisson(flen, elen + 0.01)); p2 *= psrclen; p2 *= kUNIFORM_TARGET.pow(elen); // each f in F ~Uniform for (int i = 0; i < flen; ++i) { // for each position i in E @@ -227,9 +227,9 @@ JumpBase::JumpBase() : p(200) { for (int j = min_jump; j <= max_jump; ++j) { prob_t& cp = cpd[j]; if (j < 0) - cp.logeq(log_poisson(1.5-j, 1)); + cp.logeq(Md::log_poisson(1.5-j, 1)); else if (j > 0) - cp.logeq(log_poisson(j, 1)); + cp.logeq(Md::log_poisson(j, 1)); cp.poweq(0.2); z += cp; } diff --git a/gi/pf/base_distributions.h b/gi/pf/base_distributions.h index a23ac32b..0d597c5c 100644 --- a/gi/pf/base_distributions.h +++ b/gi/pf/base_distributions.h @@ -13,24 +13,7 @@ #include "prob.h" #include "tdict.h" #include "sampler.h" - -inline double log_poisson(unsigned x, const double& lambda) { - assert(lambda > 0.0); - return log(lambda) * x - lgamma(x + 1) - lambda; -} - -inline double log_binom_coeff(unsigned n, unsigned k) { - assert(n >= k); - if (n == k) return 0.0; - return lgamma(n + 1) - lgamma(k + 1) - lgamma(n - k + 1); -} - -// http://en.wikipedia.org/wiki/Negative_binomial_distribution -inline double log_negative_binom(unsigned x, unsigned r, double p) { - assert(p > 0.0); - assert(p < 1.0); - return log_binom_coeff(x + r - 1, x) + r * log(1 - p) + x * log(p); -} +#include "m.h" inline std::ostream& operator<<(std::ostream& os, const std::vector& p) { os << '['; @@ -68,7 +51,7 @@ struct Model1 { struct PoissonUniformUninformativeBase { explicit PoissonUniformUninformativeBase(const unsigned ves) : kUNIFORM(1.0 / ves) {} prob_t operator()(const TRule& r) const { - prob_t p; p.logeq(log_poisson(r.e_.size(), 1.0)); + prob_t p; p.logeq(Md::log_poisson(r.e_.size(), 1.0)); prob_t q = kUNIFORM; q.poweq(r.e_.size()); p *= q; return p; diff --git a/gi/pf/conditional_pseg.h b/gi/pf/conditional_pseg.h index 0aa5e8e0..2e9e38fc 100644 --- a/gi/pf/conditional_pseg.h +++ b/gi/pf/conditional_pseg.h @@ -6,6 +6,7 @@ #include #include +#include "m.h" #include "prob.h" #include "ccrp_nt.h" #include "mfcr.h" @@ -210,7 +211,7 @@ struct ConditionalParallelSegementationModel { prob_t AlignProbability(unsigned span) const { prob_t p; - p.logeq(aligns.logprob(span, log_poisson(span, 1.0))); + p.logeq(aligns.logprob(span, Md::log_poisson(span, 1.0))); return p; } diff --git a/gi/pf/pfdist.cc b/gi/pf/pfdist.cc index ef08a165..3d578db2 100644 --- a/gi/pf/pfdist.cc +++ b/gi/pf/pfdist.cc @@ -315,7 +315,7 @@ struct BackwardEstimate { for (int i = 0; i < src_cov.size(); ++i) if (!src_cov[i]) r.push_back(src_[i]); const prob_t uniform_alignment(1.0 / r.size()); - e.logeq(log_poisson(trg_.size() - trg_cov, r.size() - 1)); // p(trg len remaining | src len remaining) + e.logeq(Md::log_poisson(trg_.size() - trg_cov, r.size() - 1)); // p(trg len remaining | src len remaining) for (unsigned j = trg_cov; j < trg_.size(); ++j) { prob_t p; for (unsigned i = 0; i < r.size(); ++i) @@ -352,7 +352,7 @@ struct BackwardEstimateSym { if (!src_cov[i]) r.push_back(src_[i]); r.push_back(0); // NULL word const prob_t uniform_alignment(1.0 / r.size()); - e.logeq(log_poisson(trg_.size() - trg_cov, r.size() - 1)); // p(trg len remaining | src len remaining) + e.logeq(Md::log_poisson(trg_.size() - trg_cov, r.size() - 1)); // p(trg len remaining | src len remaining) for (unsigned j = trg_cov; j < trg_.size(); ++j) { prob_t p; for (unsigned i = 0; i < r.size(); ++i) @@ -367,7 +367,7 @@ struct BackwardEstimateSym { r.pop_back(); const prob_t inv_uniform(1.0 / (trg_.size() - trg_cov + 1.0)); prob_t inv; - inv.logeq(log_poisson(r.size(), trg_.size() - trg_cov)); + inv.logeq(Md::log_poisson(r.size(), trg_.size() - trg_cov)); for (unsigned i = 0; i < r.size(); ++i) { prob_t p; for (unsigned j = trg_cov - 1; j < trg_.size(); ++j) diff --git a/gi/pf/pfnaive.cc b/gi/pf/pfnaive.cc index acba9d22..e1a53f5c 100644 --- a/gi/pf/pfnaive.cc +++ b/gi/pf/pfnaive.cc @@ -77,7 +77,7 @@ struct BackwardEstimateSym { r.push_back(src_[i]); r.push_back(0); // NULL word const prob_t uniform_alignment(1.0 / r.size()); - e.logeq(log_poisson(trg_.size() - trg_cov, r.size() - 1)); // p(trg len remaining | src len remaining) + e.logeq(Md::log_poisson(trg_.size() - trg_cov, r.size() - 1)); // p(trg len remaining | src len remaining) for (unsigned j = trg_cov; j < trg_.size(); ++j) { prob_t p; for (unsigned i = 0; i < r.size(); ++i) @@ -92,7 +92,7 @@ struct BackwardEstimateSym { r.pop_back(); const prob_t inv_uniform(1.0 / (trg_.size() - trg_cov + 1.0)); prob_t inv; - inv.logeq(log_poisson(r.size(), trg_.size() - trg_cov)); + inv.logeq(Md::log_poisson(r.size(), trg_.size() - trg_cov)); for (unsigned i = 0; i < r.size(); ++i) { prob_t p; for (unsigned j = trg_cov - 1; j < trg_.size(); ++j) diff --git a/phrasinator/gibbs_train_plm.cc b/phrasinator/gibbs_train_plm.cc index 29b3d7ea..66b46011 100644 --- a/phrasinator/gibbs_train_plm.cc +++ b/phrasinator/gibbs_train_plm.cc @@ -8,6 +8,7 @@ #include "dict.h" #include "sampler.h" #include "ccrp.h" +#include "m.h" using namespace std; using namespace std::tr1; @@ -95,11 +96,6 @@ void ReadCorpus(const string& filename, vector >* c, set* vocab if (in != &cin) delete in; } -double log_poisson(unsigned x, const double& lambda) { - assert(lambda > 0.0); - return log(lambda) * x - lgamma(x + 1) - lambda; -} - struct UniphraseLM { UniphraseLM(const vector >& corpus, const set& vocab, @@ -128,7 +124,7 @@ struct UniphraseLM { double log_p0(const vector& phrase) const { double len_logprob; if (use_poisson_) - len_logprob = log_poisson(phrase.size(), 1.0); + len_logprob = Md::log_poisson(phrase.size(), 1.0); else len_logprob = log(1 - p_end_) * (phrase.size() -1) + log(p_end_); return log(uniform_word_) * phrase.size() + len_logprob; diff --git a/utils/Makefile.am b/utils/Makefile.am index 3e559c75..a1ea8270 100644 --- a/utils/Makefile.am +++ b/utils/Makefile.am @@ -7,11 +7,12 @@ TESTS = ts phmt mfcr_test if HAVE_GTEST noinst_PROGRAMS += \ dict_test \ + m_test \ weights_test \ logval_test \ small_vector_test -TESTS += small_vector_test logval_test weights_test dict_test +TESTS += small_vector_test logval_test weights_test dict_test m_test endif reconstruct_weights_SOURCES = reconstruct_weights.cc @@ -38,6 +39,8 @@ endif phmt_SOURCES = phmt.cc ts_SOURCES = ts.cc +m_test_SOURCES = m_test.cc +m_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) dict_test_SOURCES = dict_test.cc dict_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) mfcr_test_SOURCES = mfcr_test.cc diff --git a/utils/m.h b/utils/m.h new file mode 100644 index 00000000..b25248c2 --- /dev/null +++ b/utils/m.h @@ -0,0 +1,89 @@ +#ifndef _M_H_ +#define _M_H_ + +#include +#include + +template +struct M { + // support [0, 1, 2 ...) + static inline F log_poisson(unsigned x, const F& lambda) { + assert(lambda > 0.0); + return std::log(lambda) * x - lgamma(x + 1) - lambda; + } + + // support [0, 1, 2 ...) + static inline F log_geometric(unsigned x, const F& p) { + assert(p > 0.0); + assert(p < 1.0); + return std::log(1 - p) * x + std::log(p); + } + + // log of the binomial coefficient + static inline F log_binom_coeff(unsigned n, unsigned k) { + assert(n >= k); + if (n == k) return 0.0; + return lgamma(n + 1) - lgamma(k + 1) - lgamma(n - k + 1); + } + + // http://en.wikipedia.org/wiki/Negative_binomial_distribution + // support [0, 1, 2 ...) + static inline F log_negative_binom(unsigned x, unsigned r, const F& p) { + assert(p > 0.0); + assert(p < 1.0); + return log_binom_coeff(x + r - 1u, x) + r * std::log(F(1) - p) + x * std::log(p); + } + + // this is the Beta function, *not* the beta probability density + // http://mathworld.wolfram.com/BetaFunction.html + static inline F log_beta_fn(const F& x, const F& y) { + return lgamma(x) + lgamma(y) - lgamma(x + y); + } + + // support x >= 0.0 + static F log_gamma_density(const F& x, const F& shape, const F& rate) { + assert(x >= 0.0); + assert(shape > 0.0); + assert(rate > 0.0); + return (shape-1)*std::log(x) - shape*std::log(rate) - x/rate - lgamma(shape); + } + + // this is the Beta *density* p(x ; alpha, beta) + // support x \in (0,1) + static inline F log_beta_density(const F& x, const F& alpha, const F& beta) { + assert(x > 0.0); + assert(x < 1.0); + assert(alpha > 0.0); + assert(beta > 0.0); + return (alpha-1)*std::log(x)+(beta-1)*std::log(1-x) - log_beta_fn(alpha, beta); + } + + // note: this has been adapted so that 0 is in the support of the distribution + // support [0, 1, 2 ...) + static inline F log_yule_simon(unsigned x, const F& rho) { + assert(rho > 0.0); + return std::log(rho) + log_beta_fn(x + 1, rho + 1); + } + + // see http://www.gatsby.ucl.ac.uk/~ywteh/research/compling/hpylm.pdf + // when y=1, sometimes written x^{\overline{n}} or x^{(n)} "Pochhammer symbol" + static inline F log_generalized_factorial(const F& x, const F& n, const F& y = 1.0) { + assert(x > 0.0); + assert(y >= 0.0); + assert(n > 0.0); + if (!n) return 0.0; + if (y == F(1)) { + return lgamma(x + n) - lgamma(x); + } else if (y) { + return n * std::log(y) + lgamma(x/y + n) - lgamma(x/y); + } else { // y == 0.0 + return n * std::log(x); + } + } + +}; + +typedef M Md; +typedef M Mf; + +#endif diff --git a/utils/m_test.cc b/utils/m_test.cc new file mode 100644 index 00000000..fca8f895 --- /dev/null +++ b/utils/m_test.cc @@ -0,0 +1,75 @@ +#include "m.h" + +#include +#include +#include + +using namespace std; + +class MTest : public testing::Test { + public: + MTest() {} + protected: + virtual void SetUp() { } + virtual void TearDown() { } +}; + +TEST_F(MTest, Poisson) { + double prev = 1.0; + double tot = 0; + for (int i = 0; i < 10; ++i) { + double p = Md::log_poisson(i, 0.99); + cerr << "p(i=" << i << ") = " << exp(p) << endl; + EXPECT_LT(p, prev); + tot += exp(p); + prev = p; + } + cerr << " tot=" << tot << endl; + EXPECT_LE(tot, 1.0); +} + +TEST_F(MTest, YuleSimon) { + double prev = 1.0; + double tot = 0; + for (int i = 0; i < 10; ++i) { + double p = Md::log_yule_simon(i, 1.0); + cerr << "p(i=" << i << ") = " << exp(p) << endl; + EXPECT_LT(p, prev); + tot += exp(p); + prev = p; + } + cerr << " tot=" << tot << endl; + EXPECT_LE(tot, 1.0); +} + +TEST_F(MTest, LogGeometric) { + double prev = 1.0; + double tot = 0; + for (int i = 0; i < 10; ++i) { + double p = Md::log_geometric(i, 0.5); + cerr << "p(i=" << i << ") = " << exp(p) << endl; + EXPECT_LT(p, prev); + tot += exp(p); + prev = p; + } + cerr << " tot=" << tot << endl; + EXPECT_LE(tot, 1.0); +} + +TEST_F(MTest, GeneralizedFactorial) { + for (double i = 0.3; i < 10000; i += 0.4) { + double a = Md::log_generalized_factorial(1.0, i); + double b = lgamma(1.0 + i); + EXPECT_FLOAT_EQ(a,b); + } + double gf_3_6 = 3.0 * 4.0 * 5.0 * 6.0 * 7.0 * 8.0; + EXPECT_FLOAT_EQ(Md::log_generalized_factorial(3.0, 6.0), std::log(gf_3_6)); + double gf_314_6 = 3.14 * 4.14 * 5.14 * 6.14 * 7.14 * 8.14; + EXPECT_FLOAT_EQ(Md::log_generalized_factorial(3.14, 6.0), std::log(gf_314_6)); +} + +int main(int argc, char** argv) { + testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} + diff --git a/utils/mfcr.h b/utils/mfcr.h index 3eb133fc..396d0205 100644 --- a/utils/mfcr.h +++ b/utils/mfcr.h @@ -12,6 +12,7 @@ #include #include "sampler.h" #include "slice_sampler.h" +#include "m.h" struct TableCount { TableCount() : count(), floor() {} @@ -218,31 +219,14 @@ class MFCR { return log_crp_prob(d_, alpha_); } - static double log_beta_density(const double& x, const double& alpha, const double& beta) { - assert(x > 0.0); - assert(x < 1.0); - assert(alpha > 0.0); - assert(beta > 0.0); - const double lp = (alpha-1)*log(x)+(beta-1)*log(1-x)+lgamma(alpha+beta)-lgamma(alpha)-lgamma(beta); - return lp; - } - - static double log_gamma_density(const double& x, const double& shape, const double& rate) { - assert(x >= 0.0); - assert(shape > 0.0); - assert(rate > 0.0); - const double lp = (shape-1)*log(x) - shape*log(rate) - x/rate - lgamma(shape); - return lp; - } - // taken from http://en.wikipedia.org/wiki/Chinese_restaurant_process // does not include draws from G_w's double log_crp_prob(const double& d, const double& alpha) const { double lp = 0.0; if (has_d_prior()) - lp = log_beta_density(d, d_prior_alpha_, d_prior_beta_); + lp = Md::log_beta_density(d, d_prior_alpha_, d_prior_beta_); if (has_alpha_prior()) - lp += log_gamma_density(alpha, alpha_prior_shape_, alpha_prior_rate_); + lp += Md::log_gamma_density(alpha, alpha_prior_shape_, alpha_prior_rate_); assert(lp <= 0.0); if (num_customers_) { if (d > 0.0) { -- cgit v1.2.3 From f15bbfbf105ff873e89a8bdf55e845f3ac7b030e Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Fri, 10 Feb 2012 18:31:05 +0000 Subject: better error checking --- gi/pf/guess-translits.pl | 1 + 1 file changed, 1 insertion(+) (limited to 'gi/pf') diff --git a/gi/pf/guess-translits.pl b/gi/pf/guess-translits.pl index ab737121..aafec13a 100755 --- a/gi/pf/guess-translits.pl +++ b/gi/pf/guess-translits.pl @@ -28,6 +28,7 @@ while() { my %b2a; for my $ap (@as) { my ($a,$b) = split /-/, $ap; + die "BAD INPUT: $_\n" unless defined $a && defined $b; $a2b{$a}->{$b} = 1; $b2a{$b}->{$a} = 1; } -- cgit v1.2.3 From 9007216a43c5572c2c343a1700ac79fb35b7d82f Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Sat, 25 Feb 2012 21:22:27 -0500 Subject: really slow hiero lm --- gi/pf/Makefile.am | 4 +- gi/pf/hierolm.cc | 309 +++++++++++++++++++++++++++++++++++++++++++++ phrasinator/ccrp.h | 294 ------------------------------------------- utils/ccrp.h | 340 ++++++++++++++++++++++++++++++++++++++++++++++++++ utils/ccrp_onetable.h | 12 ++ utils/sampler.h | 2 +- 6 files changed, 665 insertions(+), 296 deletions(-) create mode 100644 gi/pf/hierolm.cc delete mode 100644 phrasinator/ccrp.h create mode 100644 utils/ccrp.h (limited to 'gi/pf') diff --git a/gi/pf/Makefile.am b/gi/pf/Makefile.am index 8d43f36d..ed5b6fd3 100644 --- a/gi/pf/Makefile.am +++ b/gi/pf/Makefile.am @@ -1,4 +1,4 @@ -bin_PROGRAMS = cbgi brat dpnaive pfbrat pfdist itg pfnaive condnaive align-lexonly align-lexonly-pyp +bin_PROGRAMS = cbgi brat dpnaive pfbrat pfdist itg pfnaive condnaive align-lexonly align-lexonly-pyp hierolm noinst_LIBRARIES = libpf.a libpf_a_SOURCES = base_distributions.cc reachability.cc cfg_wfst_composer.cc corpus.cc unigrams.cc ngram_base.cc @@ -9,6 +9,8 @@ align_lexonly_pyp_SOURCES = align-lexonly-pyp.cc itg_SOURCES = itg.cc +hierolm_SOURCES = hierolm.cc + condnaive_SOURCES = condnaive.cc dpnaive_SOURCES = dpnaive.cc diff --git a/gi/pf/hierolm.cc b/gi/pf/hierolm.cc new file mode 100644 index 00000000..afb12fef --- /dev/null +++ b/gi/pf/hierolm.cc @@ -0,0 +1,309 @@ +#include +#include +#include + +#include +#include +#include + +#include "inside_outside.h" +#include "hg.h" +#include "bottom_up_parser.h" +#include "fdict.h" +#include "grammar.h" +#include "m.h" +#include "trule.h" +#include "tdict.h" +#include "filelib.h" +#include "dict.h" +#include "sampler.h" +#include "ccrp.h" +#include "ccrp_onetable.h" + +using namespace std; +using namespace tr1; +namespace po = boost::program_options; + +shared_ptr prng; + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { + po::options_description opts("Configuration options"); + opts.add_options() + ("samples,s",po::value()->default_value(1000),"Number of samples") + ("input,i",po::value(),"Read parallel data from") + ("random_seed,S",po::value(), "Random seed"); + po::options_description clo("Command line options"); + clo.add_options() + ("config", po::value(), "Configuration file") + ("help,h", "Print this help message and exit"); + po::options_description dconfig_options, dcmdline_options; + dconfig_options.add(opts); + dcmdline_options.add(opts).add(clo); + + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + if (conf->count("config")) { + ifstream config((*conf)["config"].as().c_str()); + po::store(po::parse_config_file(config, dconfig_options), *conf); + } + po::notify(*conf); + + if (conf->count("help") || (conf->count("input") == 0)) { + cerr << dcmdline_options << endl; + exit(1); + } +} + +void ReadCorpus(const string& filename, + vector >* e, + set* vocab_e) { + e->clear(); + vocab_e->clear(); + istream* in; + if (filename == "-") + in = &cin; + else + in = new ifstream(filename.c_str()); + assert(*in); + string line; + while(*in) { + getline(*in, line); + if (line.empty() && !*in) break; + e->push_back(vector()); + vector& le = e->back(); + TD::ConvertSentence(line, &le); + for (unsigned i = 0; i < le.size(); ++i) + vocab_e->insert(le[i]); + } + if (in != &cin) delete in; +} + +struct Grid { + // a b c d e + // 0 - 0 - - + vector grid; +}; + +struct BaseRuleModel { + explicit BaseRuleModel(unsigned term_size, + unsigned nonterm_size = 1) : + unif_term(1.0 / term_size), + unif_nonterm(1.0 / nonterm_size) {} + prob_t operator()(const TRule& r) const { + prob_t p; p.logeq(Md::log_poisson(1.0, r.f_.size())); + const prob_t term_prob((2.0 + 0.01*r.f_.size()) / (r.f_.size() + 2)); + const prob_t nonterm_prob(1.0 - term_prob.as_float()); + for (unsigned i = 0; i < r.f_.size(); ++i) { + if (r.f_[i] <= 0) { // nonterminal + p *= nonterm_prob; + p *= unif_nonterm; + } else { // terminal + p *= term_prob; + p *= unif_term; + } + } + return p; + } + const prob_t unif_term, unif_nonterm; +}; + +struct HieroLMModel { + explicit HieroLMModel(unsigned vocab_size) : p0(vocab_size), x(1,1,1,1) {} + + prob_t Prob(const TRule& r) const { + return x.probT(r, p0(r)); + } + + int Increment(const TRule& r, MT19937* rng) { + return x.incrementT(r, p0(r), rng); + // return x.increment(r); + } + + int Decrement(const TRule& r, MT19937* rng) { + return x.decrement(r, rng); + //return x.decrement(r); + } + + prob_t Likelihood() const { + prob_t p; + p.logeq(x.log_crp_prob()); + for (CCRP::const_iterator it = x.begin(); it != x.end(); ++it) { + prob_t tp = p0(it->first); + tp.poweq(it->second.table_counts_.size()); + p *= tp; + } + //for (CCRP_OneTable::const_iterator it = x.begin(); it != x.end(); ++it) + // p *= p0(it->first); + return p; + } + + void ResampleHyperparameters(MT19937* rng) { + x.resample_hyperparameters(rng); + cerr << " d=" << x.discount() << ", alpha=" << x.concentration() << endl; + } + + const BaseRuleModel p0; + CCRP x; + //CCRP_OneTable x; +}; + +vector tofreelist; + +HieroLMModel* plm; + +struct NPGrammarIter : public GrammarIter, public RuleBin { + NPGrammarIter() : arity() { tofreelist.push_back(this); } + NPGrammarIter(const TRulePtr& inr, const int a, int symbol) : arity(a + (symbol < 0 ? 1 : 0)) { + if (inr) { + r.reset(new TRule(*inr)); + } else { + static const int kLHS = -TD::Convert("X"); + r.reset(new TRule); + r->lhs_ = kLHS; + } + TRule& rr = *r; + rr.f_.push_back(symbol); + rr.e_.push_back(symbol < 0 ? (1-int(arity)) : symbol); + tofreelist.push_back(this); + } + virtual int GetNumRules() const { + if (r) return 1; else return 0; + } + virtual TRulePtr GetIthRule(int) const { + return r; + } + virtual int Arity() const { + return arity; + } + virtual const RuleBin* GetRules() const { + if (!r) return NULL; else return this; + } + virtual const GrammarIter* Extend(int symbol) const { + return new NPGrammarIter(r, arity, symbol); + } + const unsigned char arity; + TRulePtr r; +}; + +struct NPGrammar : public Grammar { + virtual const GrammarIter* GetRoot() const { + return new NPGrammarIter; + } +}; + +void SampleDerivation(const Hypergraph& hg, MT19937* rng, vector* sampled_deriv, HieroLMModel* plm) { + HieroLMModel& lm = *plm; + vector node_probs; + const prob_t total_prob = Inside(hg, &node_probs); + queue q; + q.push(hg.nodes_.size() - 3); + while(!q.empty()) { + unsigned cur_node_id = q.front(); +// cerr << "NODE=" << cur_node_id << endl; + q.pop(); + const Hypergraph::Node& node = hg.nodes_[cur_node_id]; + const unsigned num_in_edges = node.in_edges_.size(); + unsigned sampled_edge = 0; + if (num_in_edges == 1) { + sampled_edge = node.in_edges_[0]; + } else { + //prob_t z; + assert(num_in_edges > 1); + SampleSet ss; + for (unsigned j = 0; j < num_in_edges; ++j) { + const Hypergraph::Edge& edge = hg.edges_[node.in_edges_[j]]; + prob_t p = edge.edge_prob_; + for (unsigned k = 0; k < edge.tail_nodes_.size(); ++k) + p *= node_probs[edge.tail_nodes_[k]]; + ss.add(p); +// cerr << log(ss[j]) << " ||| " << edge.rule_->AsString() << endl; + //z += p; + } +// for (unsigned j = 0; j < num_in_edges; ++j) { +// const Hypergraph::Edge& edge = hg.edges_[node.in_edges_[j]]; +// cerr << exp(log(ss[j] / z)) << " ||| " << edge.rule_->AsString() << endl; +// } +// cerr << " --- \n"; + sampled_edge = node.in_edges_[rng->SelectSample(ss)]; + } + sampled_deriv->push_back(sampled_edge); + const Hypergraph::Edge& edge = hg.edges_[sampled_edge]; + for (unsigned j = 0; j < edge.tail_nodes_.size(); ++j) { + q.push(edge.tail_nodes_[j]); + } + } + for (unsigned i = 0; i < sampled_deriv->size(); ++i) { + cerr << *hg.edges_[(*sampled_deriv)[i]].rule_ << endl; + } +} + +void IncrementDerivation(const Hypergraph& hg, const vector& d, HieroLMModel* plm, MT19937* rng) { + for (unsigned i = 0; i < d.size(); ++i) + plm->Increment(*hg.edges_[d[i]].rule_, rng); +} + +void DecrementDerivation(const Hypergraph& hg, const vector& d, HieroLMModel* plm, MT19937* rng) { + for (unsigned i = 0; i < d.size(); ++i) + plm->Decrement(*hg.edges_[d[i]].rule_, rng); +} + +int main(int argc, char** argv) { + po::variables_map conf; + vector grammars; + grammars.push_back(GrammarPtr(new NPGrammar)); + + InitCommandLine(argc, argv, &conf); + const unsigned samples = conf["samples"].as(); + + if (conf.count("random_seed")) + prng.reset(new MT19937(conf["random_seed"].as())); + else + prng.reset(new MT19937); + MT19937& rng = *prng; + + vector > corpuse; + set vocabe; + cerr << "Reading corpus...\n"; + ReadCorpus(conf["input"].as(), &corpuse, &vocabe); + cerr << "E-corpus size: " << corpuse.size() << " sentences\t (" << vocabe.size() << " word types)\n"; + HieroLMModel lm(vocabe.size()); + + plm = &lm; + ExhaustiveBottomUpParser parser("X", grammars); + + Hypergraph hg; + const int kX = -TD::Convert("X"); + const int kLP = FD::Convert("LogProb"); + SparseVector v; v.set_value(kLP, 1.0); + vector > derivs(corpuse.size()); + for (int SS=0; SS < samples; ++SS) { + for (int ci = 0; ci < corpuse.size(); ++ci) { + vector& src = corpuse[ci]; + Lattice lat(src.size()); + for (unsigned i = 0; i < src.size(); ++i) + lat[i].push_back(LatticeArc(src[i], 0.0, 1)); + cerr << TD::GetString(src) << endl; + hg.clear(); + parser.Parse(lat, &hg); // exhaustive parse + DecrementDerivation(hg, derivs[ci], &lm, &rng); + for (unsigned i = 0; i < hg.edges_.size(); ++i) { + TRule& r = *hg.edges_[i].rule_; + if (r.lhs_ == kX) + hg.edges_[i].edge_prob_ = lm.Prob(r); + } + vector d; + SampleDerivation(hg, &rng, &d, &lm); + derivs[ci] = d; + IncrementDerivation(hg, derivs[ci], &lm, &rng); + if (tofreelist.size() > 100000) { + cerr << "Freeing ... "; + for (unsigned i = 0; i < tofreelist.size(); ++i) + delete tofreelist[i]; + tofreelist.clear(); + cerr << "Freed.\n"; + } + } + cerr << "LLH=" << lm.Likelihood() << endl; + } + return 0; +} + diff --git a/phrasinator/ccrp.h b/phrasinator/ccrp.h deleted file mode 100644 index 9acf12ab..00000000 --- a/phrasinator/ccrp.h +++ /dev/null @@ -1,294 +0,0 @@ -#ifndef _CCRP_H_ -#define _CCRP_H_ - -#include -#include -#include -#include -#include -#include -#include -#include -#include "sampler.h" -#include "slice_sampler.h" - -// Chinese restaurant process (Pitman-Yor parameters) with table tracking. - -template > -class CCRP { - public: - CCRP(double disc, double conc) : - num_tables_(), - num_customers_(), - discount_(disc), - concentration_(conc), - discount_prior_alpha_(std::numeric_limits::quiet_NaN()), - discount_prior_beta_(std::numeric_limits::quiet_NaN()), - concentration_prior_shape_(std::numeric_limits::quiet_NaN()), - concentration_prior_rate_(std::numeric_limits::quiet_NaN()) {} - - CCRP(double d_alpha, double d_beta, double c_shape, double c_rate, double d = 0.1, double c = 10.0) : - num_tables_(), - num_customers_(), - discount_(d), - concentration_(c), - discount_prior_alpha_(d_alpha), - discount_prior_beta_(d_beta), - concentration_prior_shape_(c_shape), - concentration_prior_rate_(c_rate) {} - - double discount() const { return discount_; } - double concentration() const { return concentration_; } - - bool has_discount_prior() const { - return !std::isnan(discount_prior_alpha_); - } - - bool has_concentration_prior() const { - return !std::isnan(concentration_prior_shape_); - } - - void clear() { - num_tables_ = 0; - num_customers_ = 0; - dish_locs_.clear(); - } - - unsigned num_tables() const { - return num_tables_; - } - - unsigned num_tables(const Dish& dish) const { - const typename std::tr1::unordered_map::const_iterator it = dish_locs_.find(dish); - if (it == dish_locs_.end()) return 0; - return it->second.table_counts_.size(); - } - - unsigned num_customers() const { - return num_customers_; - } - - unsigned num_customers(const Dish& dish) const { - const typename std::tr1::unordered_map::const_iterator it = dish_locs_.find(dish); - if (it == dish_locs_.end()) return 0; - return it->total_dish_count_; - } - - // returns +1 or 0 indicating whether a new table was opened - int increment(const Dish& dish, const double& p0, MT19937* rng) { - DishLocations& loc = dish_locs_[dish]; - bool share_table = false; - if (loc.total_dish_count_) { - const double p_empty = (concentration_ + num_tables_ * discount_) * p0; - const double p_share = (loc.total_dish_count_ - loc.table_counts_.size() * discount_); - share_table = rng->SelectSample(p_empty, p_share); - } - if (share_table) { - double r = rng->next() * (loc.total_dish_count_ - loc.table_counts_.size() * discount_); - for (typename std::list::iterator ti = loc.table_counts_.begin(); - ti != loc.table_counts_.end(); ++ti) { - r -= (*ti - discount_); - if (r <= 0.0) { - ++(*ti); - break; - } - } - if (r > 0.0) { - std::cerr << "Serious error: r=" << r << std::endl; - Print(&std::cerr); - assert(r <= 0.0); - } - } else { - loc.table_counts_.push_back(1u); - ++num_tables_; - } - ++loc.total_dish_count_; - ++num_customers_; - return (share_table ? 0 : 1); - } - - // returns -1 or 0, indicating whether a table was closed - int decrement(const Dish& dish, MT19937* rng) { - DishLocations& loc = dish_locs_[dish]; - assert(loc.total_dish_count_); - if (loc.total_dish_count_ == 1) { - dish_locs_.erase(dish); - --num_tables_; - --num_customers_; - return -1; - } else { - int delta = 0; - // sample customer to remove UNIFORMLY. that is, do NOT use the discount - // here. if you do, it will introduce (unwanted) bias! - double r = rng->next() * loc.total_dish_count_; - --loc.total_dish_count_; - for (typename std::list::iterator ti = loc.table_counts_.begin(); - ti != loc.table_counts_.end(); ++ti) { - r -= *ti; - if (r <= 0.0) { - if ((--(*ti)) == 0) { - --num_tables_; - delta = -1; - loc.table_counts_.erase(ti); - } - break; - } - } - if (r > 0.0) { - std::cerr << "Serious error: r=" << r << std::endl; - Print(&std::cerr); - assert(r <= 0.0); - } - --num_customers_; - return delta; - } - } - - double prob(const Dish& dish, const double& p0) const { - const typename std::tr1::unordered_map::const_iterator it = dish_locs_.find(dish); - const double r = num_tables_ * discount_ + concentration_; - if (it == dish_locs_.end()) { - return r * p0 / (num_customers_ + concentration_); - } else { - return (it->second.total_dish_count_ - discount_ * it->second.table_counts_.size() + r * p0) / - (num_customers_ + concentration_); - } - } - - double log_crp_prob() const { - return log_crp_prob(discount_, concentration_); - } - - static double log_beta_density(const double& x, const double& alpha, const double& beta) { - assert(x > 0.0); - assert(x < 1.0); - assert(alpha > 0.0); - assert(beta > 0.0); - const double lp = (alpha-1)*log(x)+(beta-1)*log(1-x)+lgamma(alpha+beta)-lgamma(alpha)-lgamma(beta); - return lp; - } - - static double log_gamma_density(const double& x, const double& shape, const double& rate) { - assert(x >= 0.0); - assert(shape > 0.0); - assert(rate > 0.0); - const double lp = (shape-1)*log(x) - shape*log(rate) - x/rate - lgamma(shape); - return lp; - } - - // taken from http://en.wikipedia.org/wiki/Chinese_restaurant_process - // does not include P_0's - double log_crp_prob(const double& discount, const double& concentration) const { - double lp = 0.0; - if (has_discount_prior()) - lp = log_beta_density(discount, discount_prior_alpha_, discount_prior_beta_); - if (has_concentration_prior()) - lp += log_gamma_density(concentration, concentration_prior_shape_, concentration_prior_rate_); - assert(lp <= 0.0); - if (num_customers_) { - if (discount > 0.0) { - const double r = lgamma(1.0 - discount); - lp += lgamma(concentration) - lgamma(concentration + num_customers_) - + num_tables_ * log(discount) + lgamma(concentration / discount + num_tables_) - - lgamma(concentration / discount); - assert(std::isfinite(lp)); - for (typename std::tr1::unordered_map::const_iterator it = dish_locs_.begin(); - it != dish_locs_.end(); ++it) { - const DishLocations& cur = it->second; - for (std::list::const_iterator ti = cur.table_counts_.begin(); ti != cur.table_counts_.end(); ++ti) { - lp += lgamma(*ti - discount) - r; - } - } - } else { - assert(!"not implemented yet"); - } - } - assert(std::isfinite(lp)); - return lp; - } - - void resample_hyperparameters(MT19937* rng, const unsigned nloop = 5, const unsigned niterations = 10) { - assert(has_discount_prior() || has_concentration_prior()); - DiscountResampler dr(*this); - ConcentrationResampler cr(*this); - for (int iter = 0; iter < nloop; ++iter) { - if (has_concentration_prior()) { - concentration_ = slice_sampler1d(cr, concentration_, *rng, 0.0, - std::numeric_limits::infinity(), 0.0, niterations, 100*niterations); - } - if (has_discount_prior()) { - discount_ = slice_sampler1d(dr, discount_, *rng, std::numeric_limits::min(), - 1.0, 0.0, niterations, 100*niterations); - } - } - concentration_ = slice_sampler1d(cr, concentration_, *rng, 0.0, - std::numeric_limits::infinity(), 0.0, niterations, 100*niterations); - } - - struct DiscountResampler { - DiscountResampler(const CCRP& crp) : crp_(crp) {} - const CCRP& crp_; - double operator()(const double& proposed_discount) const { - return crp_.log_crp_prob(proposed_discount, crp_.concentration_); - } - }; - - struct ConcentrationResampler { - ConcentrationResampler(const CCRP& crp) : crp_(crp) {} - const CCRP& crp_; - double operator()(const double& proposed_concentration) const { - return crp_.log_crp_prob(crp_.discount_, proposed_concentration); - } - }; - - struct DishLocations { - DishLocations() : total_dish_count_() {} - unsigned total_dish_count_; // customers at all tables with this dish - std::list table_counts_; // list<> gives O(1) deletion and insertion, which we want - // .size() is the number of tables for this dish - }; - - void Print(std::ostream* out) const { - std::cerr << "PYP(d=" << discount_ << ",c=" << concentration_ << ") customers=" << num_customers_ << std::endl; - for (typename std::tr1::unordered_map::const_iterator it = dish_locs_.begin(); - it != dish_locs_.end(); ++it) { - (*out) << it->first << " (" << it->second.total_dish_count_ << " on " << it->second.table_counts_.size() << " tables): "; - for (typename std::list::const_iterator i = it->second.table_counts_.begin(); - i != it->second.table_counts_.end(); ++i) { - (*out) << " " << *i; - } - (*out) << std::endl; - } - } - - typedef typename std::tr1::unordered_map::const_iterator const_iterator; - const_iterator begin() const { - return dish_locs_.begin(); - } - const_iterator end() const { - return dish_locs_.end(); - } - - unsigned num_tables_; - unsigned num_customers_; - std::tr1::unordered_map dish_locs_; - - double discount_; - double concentration_; - - // optional beta prior on discount_ (NaN if no prior) - double discount_prior_alpha_; - double discount_prior_beta_; - - // optional gamma prior on concentration_ (NaN if no prior) - double concentration_prior_shape_; - double concentration_prior_rate_; -}; - -template -std::ostream& operator<<(std::ostream& o, const CCRP& c) { - c.Print(&o); - return o; -} - -#endif diff --git a/utils/ccrp.h b/utils/ccrp.h new file mode 100644 index 00000000..1a9e3ed5 --- /dev/null +++ b/utils/ccrp.h @@ -0,0 +1,340 @@ +#ifndef _CCRP_H_ +#define _CCRP_H_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "sampler.h" +#include "slice_sampler.h" + +// Chinese restaurant process (Pitman-Yor parameters) with table tracking. + +template > +class CCRP { + public: + CCRP(double disc, double conc) : + num_tables_(), + num_customers_(), + discount_(disc), + concentration_(conc), + discount_prior_alpha_(std::numeric_limits::quiet_NaN()), + discount_prior_beta_(std::numeric_limits::quiet_NaN()), + concentration_prior_shape_(std::numeric_limits::quiet_NaN()), + concentration_prior_rate_(std::numeric_limits::quiet_NaN()) {} + + CCRP(double d_alpha, double d_beta, double c_shape, double c_rate, double d = 0.9, double c = 1.0) : + num_tables_(), + num_customers_(), + discount_(d), + concentration_(c), + discount_prior_alpha_(d_alpha), + discount_prior_beta_(d_beta), + concentration_prior_shape_(c_shape), + concentration_prior_rate_(c_rate) {} + + double discount() const { return discount_; } + double concentration() const { return concentration_; } + + bool has_discount_prior() const { + return !std::isnan(discount_prior_alpha_); + } + + bool has_concentration_prior() const { + return !std::isnan(concentration_prior_shape_); + } + + void clear() { + num_tables_ = 0; + num_customers_ = 0; + dish_locs_.clear(); + } + + unsigned num_tables() const { + return num_tables_; + } + + unsigned num_tables(const Dish& dish) const { + const typename std::tr1::unordered_map::const_iterator it = dish_locs_.find(dish); + if (it == dish_locs_.end()) return 0; + return it->second.table_counts_.size(); + } + + unsigned num_customers() const { + return num_customers_; + } + + unsigned num_customers(const Dish& dish) const { + const typename std::tr1::unordered_map::const_iterator it = dish_locs_.find(dish); + if (it == dish_locs_.end()) return 0; + return it->total_dish_count_; + } + + // returns +1 or 0 indicating whether a new table was opened + int increment(const Dish& dish, const double& p0, MT19937* rng) { + DishLocations& loc = dish_locs_[dish]; + bool share_table = false; + if (loc.total_dish_count_) { + const double p_empty = (concentration_ + num_tables_ * discount_) * p0; + const double p_share = (loc.total_dish_count_ - loc.table_counts_.size() * discount_); + share_table = rng->SelectSample(p_empty, p_share); + } + if (share_table) { + double r = rng->next() * (loc.total_dish_count_ - loc.table_counts_.size() * discount_); + for (typename std::list::iterator ti = loc.table_counts_.begin(); + ti != loc.table_counts_.end(); ++ti) { + r -= (*ti - discount_); + if (r <= 0.0) { + ++(*ti); + break; + } + } + if (r > 0.0) { + std::cerr << "Serious error: r=" << r << std::endl; + Print(&std::cerr); + assert(r <= 0.0); + } + } else { + loc.table_counts_.push_back(1u); + ++num_tables_; + } + ++loc.total_dish_count_; + ++num_customers_; + return (share_table ? 0 : 1); + } + + // returns +1 or 0 indicating whether a new table was opened + template + int incrementT(const Dish& dish, const T& p0, MT19937* rng) { + DishLocations& loc = dish_locs_[dish]; + bool share_table = false; + if (loc.total_dish_count_) { + const T p_empty = T(concentration_ + num_tables_ * discount_) * p0; + const T p_share = T(loc.total_dish_count_ - loc.table_counts_.size() * discount_); + share_table = rng->SelectSample(p_empty, p_share); + } + if (share_table) { + double r = rng->next() * (loc.total_dish_count_ - loc.table_counts_.size() * discount_); + for (typename std::list::iterator ti = loc.table_counts_.begin(); + ti != loc.table_counts_.end(); ++ti) { + r -= (*ti - discount_); + if (r <= 0.0) { + ++(*ti); + break; + } + } + if (r > 0.0) { + std::cerr << "Serious error: r=" << r << std::endl; + Print(&std::cerr); + assert(r <= 0.0); + } + } else { + loc.table_counts_.push_back(1u); + ++num_tables_; + } + ++loc.total_dish_count_; + ++num_customers_; + return (share_table ? 0 : 1); + } + + // returns -1 or 0, indicating whether a table was closed + int decrement(const Dish& dish, MT19937* rng) { + DishLocations& loc = dish_locs_[dish]; + assert(loc.total_dish_count_); + if (loc.total_dish_count_ == 1) { + dish_locs_.erase(dish); + --num_tables_; + --num_customers_; + return -1; + } else { + int delta = 0; + // sample customer to remove UNIFORMLY. that is, do NOT use the discount + // here. if you do, it will introduce (unwanted) bias! + double r = rng->next() * loc.total_dish_count_; + --loc.total_dish_count_; + for (typename std::list::iterator ti = loc.table_counts_.begin(); + ti != loc.table_counts_.end(); ++ti) { + r -= *ti; + if (r <= 0.0) { + if ((--(*ti)) == 0) { + --num_tables_; + delta = -1; + loc.table_counts_.erase(ti); + } + break; + } + } + if (r > 0.0) { + std::cerr << "Serious error: r=" << r << std::endl; + Print(&std::cerr); + assert(r <= 0.0); + } + --num_customers_; + return delta; + } + } + + double prob(const Dish& dish, const double& p0) const { + const typename std::tr1::unordered_map::const_iterator it = dish_locs_.find(dish); + const double r = num_tables_ * discount_ + concentration_; + if (it == dish_locs_.end()) { + return r * p0 / (num_customers_ + concentration_); + } else { + return (it->second.total_dish_count_ - discount_ * it->second.table_counts_.size() + r * p0) / + (num_customers_ + concentration_); + } + } + + template + T probT(const Dish& dish, const T& p0) const { + const typename std::tr1::unordered_map::const_iterator it = dish_locs_.find(dish); + const T r = T(num_tables_ * discount_ + concentration_); + if (it == dish_locs_.end()) { + return r * p0 / T(num_customers_ + concentration_); + } else { + return (T(it->second.total_dish_count_ - discount_ * it->second.table_counts_.size()) + r * p0) / + T(num_customers_ + concentration_); + } + } + + double log_crp_prob() const { + return log_crp_prob(discount_, concentration_); + } + + static double log_beta_density(const double& x, const double& alpha, const double& beta) { + assert(x > 0.0); + assert(x < 1.0); + assert(alpha > 0.0); + assert(beta > 0.0); + const double lp = (alpha-1)*log(x)+(beta-1)*log(1-x)+lgamma(alpha+beta)-lgamma(alpha)-lgamma(beta); + return lp; + } + + static double log_gamma_density(const double& x, const double& shape, const double& rate) { + assert(x >= 0.0); + assert(shape > 0.0); + assert(rate > 0.0); + const double lp = (shape-1)*log(x) - shape*log(rate) - x/rate - lgamma(shape); + return lp; + } + + // taken from http://en.wikipedia.org/wiki/Chinese_restaurant_process + // does not include P_0's + double log_crp_prob(const double& discount, const double& concentration) const { + double lp = 0.0; + if (has_discount_prior()) + lp = log_beta_density(discount, discount_prior_alpha_, discount_prior_beta_); + if (has_concentration_prior()) + lp += log_gamma_density(concentration, concentration_prior_shape_, concentration_prior_rate_); + assert(lp <= 0.0); + if (num_customers_) { + if (discount > 0.0) { + const double r = lgamma(1.0 - discount); + lp += lgamma(concentration) - lgamma(concentration + num_customers_) + + num_tables_ * log(discount) + lgamma(concentration / discount + num_tables_) + - lgamma(concentration / discount); + assert(std::isfinite(lp)); + for (typename std::tr1::unordered_map::const_iterator it = dish_locs_.begin(); + it != dish_locs_.end(); ++it) { + const DishLocations& cur = it->second; + for (std::list::const_iterator ti = cur.table_counts_.begin(); ti != cur.table_counts_.end(); ++ti) { + lp += lgamma(*ti - discount) - r; + } + } + } else { + assert(!"not implemented yet"); + } + } + assert(std::isfinite(lp)); + return lp; + } + + void resample_hyperparameters(MT19937* rng, const unsigned nloop = 5, const unsigned niterations = 10) { + assert(has_discount_prior() || has_concentration_prior()); + DiscountResampler dr(*this); + ConcentrationResampler cr(*this); + for (int iter = 0; iter < nloop; ++iter) { + if (has_concentration_prior()) { + concentration_ = slice_sampler1d(cr, concentration_, *rng, 0.0, + std::numeric_limits::infinity(), 0.0, niterations, 100*niterations); + } + if (has_discount_prior()) { + discount_ = slice_sampler1d(dr, discount_, *rng, std::numeric_limits::min(), + 1.0, 0.0, niterations, 100*niterations); + } + } + concentration_ = slice_sampler1d(cr, concentration_, *rng, 0.0, + std::numeric_limits::infinity(), 0.0, niterations, 100*niterations); + } + + struct DiscountResampler { + DiscountResampler(const CCRP& crp) : crp_(crp) {} + const CCRP& crp_; + double operator()(const double& proposed_discount) const { + return crp_.log_crp_prob(proposed_discount, crp_.concentration_); + } + }; + + struct ConcentrationResampler { + ConcentrationResampler(const CCRP& crp) : crp_(crp) {} + const CCRP& crp_; + double operator()(const double& proposed_concentration) const { + return crp_.log_crp_prob(crp_.discount_, proposed_concentration); + } + }; + + struct DishLocations { + DishLocations() : total_dish_count_() {} + unsigned total_dish_count_; // customers at all tables with this dish + std::list table_counts_; // list<> gives O(1) deletion and insertion, which we want + // .size() is the number of tables for this dish + }; + + void Print(std::ostream* out) const { + std::cerr << "PYP(d=" << discount_ << ",c=" << concentration_ << ") customers=" << num_customers_ << std::endl; + for (typename std::tr1::unordered_map::const_iterator it = dish_locs_.begin(); + it != dish_locs_.end(); ++it) { + (*out) << it->first << " (" << it->second.total_dish_count_ << " on " << it->second.table_counts_.size() << " tables): "; + for (typename std::list::const_iterator i = it->second.table_counts_.begin(); + i != it->second.table_counts_.end(); ++i) { + (*out) << " " << *i; + } + (*out) << std::endl; + } + } + + typedef typename std::tr1::unordered_map::const_iterator const_iterator; + const_iterator begin() const { + return dish_locs_.begin(); + } + const_iterator end() const { + return dish_locs_.end(); + } + + unsigned num_tables_; + unsigned num_customers_; + std::tr1::unordered_map dish_locs_; + + double discount_; + double concentration_; + + // optional beta prior on discount_ (NaN if no prior) + double discount_prior_alpha_; + double discount_prior_beta_; + + // optional gamma prior on concentration_ (NaN if no prior) + double concentration_prior_shape_; + double concentration_prior_rate_; +}; + +template +std::ostream& operator<<(std::ostream& o, const CCRP& c) { + c.Print(&o); + return o; +} + +#endif diff --git a/utils/ccrp_onetable.h b/utils/ccrp_onetable.h index a868af9a..b63737d1 100644 --- a/utils/ccrp_onetable.h +++ b/utils/ccrp_onetable.h @@ -117,6 +117,18 @@ class CCRP_OneTable { } } + template + T probT(const Dish& dish, const T& p0) const { + const typename DishMapType::const_iterator it = dish_counts_.find(dish); + const T r(num_tables_ * discount_ + concentration_); + if (it == dish_counts_.end()) { + return r * p0 / T(num_customers_ + concentration_); + } else { + return (T(it->second - discount_) + r * p0) / + T(num_customers_ + concentration_); + } + } + double log_crp_prob() const { return log_crp_prob(discount_, concentration_); } diff --git a/utils/sampler.h b/utils/sampler.h index 153e7ef1..22c873d4 100644 --- a/utils/sampler.h +++ b/utils/sampler.h @@ -48,7 +48,7 @@ struct RandomNumberGenerator { template size_t SelectSample(const F& a, const F& b, double T = 1.0) { if (T == 1.0) { - if (this->next() > (a / (a + b))) return 1; else return 0; + if (F(this->next()) > (a / (a + b))) return 1; else return 0; } else { assert(!"not implemented"); } -- cgit v1.2.3 From d87220030b82fed860efee40487502e9ee8f0651 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Mon, 27 Feb 2012 02:19:34 +0000 Subject: generic bayesian cfg learner with a bunch of cfg grammar types --- .gitignore | 1 + decoder/trule.cc | 16 +-- gi/pf/Makefile.am | 4 +- gi/pf/hierolm.cc | 309 ----------------------------------------- gi/pf/learn_cfg.cc | 394 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 398 insertions(+), 326 deletions(-) delete mode 100644 gi/pf/hierolm.cc create mode 100644 gi/pf/learn_cfg.cc (limited to 'gi/pf') diff --git a/.gitignore b/.gitignore index 327f7261..28d5a60a 100644 --- a/.gitignore +++ b/.gitignore @@ -57,6 +57,7 @@ training/mpi_extract_reachable klm/lm/build_binary extools/extractor_monolingual gi/pf/.deps +gi/pf/learn_cfg gi/pf/brat gi/pf/cbgi gi/pf/dpnaive diff --git a/decoder/trule.cc b/decoder/trule.cc index 40235542..141b8faa 100644 --- a/decoder/trule.cc +++ b/decoder/trule.cc @@ -232,16 +232,6 @@ void TRule::ComputeArity() { arity_ = 1 - min; } -static string AnonymousStrVar(int i) { - string res("[v]"); - if(!(i <= 0 && i >= -8)) { - cerr << "Can't handle more than 9 non-terminals: index=" << (-i) << endl; - abort(); - } - res[1] = '1' - i; - return res; -} - string TRule::AsString(bool verbose) const { ostringstream os; int idx = 0; @@ -259,15 +249,11 @@ string TRule::AsString(bool verbose) const { } } os << " ||| "; - if (idx > 9) { - cerr << "Too many non-terminals!\n partial: " << os.str() << endl; - exit(1); - } for (int i =0; i -#include -#include - -#include -#include -#include - -#include "inside_outside.h" -#include "hg.h" -#include "bottom_up_parser.h" -#include "fdict.h" -#include "grammar.h" -#include "m.h" -#include "trule.h" -#include "tdict.h" -#include "filelib.h" -#include "dict.h" -#include "sampler.h" -#include "ccrp.h" -#include "ccrp_onetable.h" - -using namespace std; -using namespace tr1; -namespace po = boost::program_options; - -shared_ptr prng; - -void InitCommandLine(int argc, char** argv, po::variables_map* conf) { - po::options_description opts("Configuration options"); - opts.add_options() - ("samples,s",po::value()->default_value(1000),"Number of samples") - ("input,i",po::value(),"Read parallel data from") - ("random_seed,S",po::value(), "Random seed"); - po::options_description clo("Command line options"); - clo.add_options() - ("config", po::value(), "Configuration file") - ("help,h", "Print this help message and exit"); - po::options_description dconfig_options, dcmdline_options; - dconfig_options.add(opts); - dcmdline_options.add(opts).add(clo); - - po::store(parse_command_line(argc, argv, dcmdline_options), *conf); - if (conf->count("config")) { - ifstream config((*conf)["config"].as().c_str()); - po::store(po::parse_config_file(config, dconfig_options), *conf); - } - po::notify(*conf); - - if (conf->count("help") || (conf->count("input") == 0)) { - cerr << dcmdline_options << endl; - exit(1); - } -} - -void ReadCorpus(const string& filename, - vector >* e, - set* vocab_e) { - e->clear(); - vocab_e->clear(); - istream* in; - if (filename == "-") - in = &cin; - else - in = new ifstream(filename.c_str()); - assert(*in); - string line; - while(*in) { - getline(*in, line); - if (line.empty() && !*in) break; - e->push_back(vector()); - vector& le = e->back(); - TD::ConvertSentence(line, &le); - for (unsigned i = 0; i < le.size(); ++i) - vocab_e->insert(le[i]); - } - if (in != &cin) delete in; -} - -struct Grid { - // a b c d e - // 0 - 0 - - - vector grid; -}; - -struct BaseRuleModel { - explicit BaseRuleModel(unsigned term_size, - unsigned nonterm_size = 1) : - unif_term(1.0 / term_size), - unif_nonterm(1.0 / nonterm_size) {} - prob_t operator()(const TRule& r) const { - prob_t p; p.logeq(Md::log_poisson(1.0, r.f_.size())); - const prob_t term_prob((2.0 + 0.01*r.f_.size()) / (r.f_.size() + 2)); - const prob_t nonterm_prob(1.0 - term_prob.as_float()); - for (unsigned i = 0; i < r.f_.size(); ++i) { - if (r.f_[i] <= 0) { // nonterminal - p *= nonterm_prob; - p *= unif_nonterm; - } else { // terminal - p *= term_prob; - p *= unif_term; - } - } - return p; - } - const prob_t unif_term, unif_nonterm; -}; - -struct HieroLMModel { - explicit HieroLMModel(unsigned vocab_size) : p0(vocab_size), x(1,1,1,1) {} - - prob_t Prob(const TRule& r) const { - return x.probT(r, p0(r)); - } - - int Increment(const TRule& r, MT19937* rng) { - return x.incrementT(r, p0(r), rng); - // return x.increment(r); - } - - int Decrement(const TRule& r, MT19937* rng) { - return x.decrement(r, rng); - //return x.decrement(r); - } - - prob_t Likelihood() const { - prob_t p; - p.logeq(x.log_crp_prob()); - for (CCRP::const_iterator it = x.begin(); it != x.end(); ++it) { - prob_t tp = p0(it->first); - tp.poweq(it->second.table_counts_.size()); - p *= tp; - } - //for (CCRP_OneTable::const_iterator it = x.begin(); it != x.end(); ++it) - // p *= p0(it->first); - return p; - } - - void ResampleHyperparameters(MT19937* rng) { - x.resample_hyperparameters(rng); - cerr << " d=" << x.discount() << ", alpha=" << x.concentration() << endl; - } - - const BaseRuleModel p0; - CCRP x; - //CCRP_OneTable x; -}; - -vector tofreelist; - -HieroLMModel* plm; - -struct NPGrammarIter : public GrammarIter, public RuleBin { - NPGrammarIter() : arity() { tofreelist.push_back(this); } - NPGrammarIter(const TRulePtr& inr, const int a, int symbol) : arity(a + (symbol < 0 ? 1 : 0)) { - if (inr) { - r.reset(new TRule(*inr)); - } else { - static const int kLHS = -TD::Convert("X"); - r.reset(new TRule); - r->lhs_ = kLHS; - } - TRule& rr = *r; - rr.f_.push_back(symbol); - rr.e_.push_back(symbol < 0 ? (1-int(arity)) : symbol); - tofreelist.push_back(this); - } - virtual int GetNumRules() const { - if (r) return 1; else return 0; - } - virtual TRulePtr GetIthRule(int) const { - return r; - } - virtual int Arity() const { - return arity; - } - virtual const RuleBin* GetRules() const { - if (!r) return NULL; else return this; - } - virtual const GrammarIter* Extend(int symbol) const { - return new NPGrammarIter(r, arity, symbol); - } - const unsigned char arity; - TRulePtr r; -}; - -struct NPGrammar : public Grammar { - virtual const GrammarIter* GetRoot() const { - return new NPGrammarIter; - } -}; - -void SampleDerivation(const Hypergraph& hg, MT19937* rng, vector* sampled_deriv, HieroLMModel* plm) { - HieroLMModel& lm = *plm; - vector node_probs; - const prob_t total_prob = Inside(hg, &node_probs); - queue q; - q.push(hg.nodes_.size() - 3); - while(!q.empty()) { - unsigned cur_node_id = q.front(); -// cerr << "NODE=" << cur_node_id << endl; - q.pop(); - const Hypergraph::Node& node = hg.nodes_[cur_node_id]; - const unsigned num_in_edges = node.in_edges_.size(); - unsigned sampled_edge = 0; - if (num_in_edges == 1) { - sampled_edge = node.in_edges_[0]; - } else { - //prob_t z; - assert(num_in_edges > 1); - SampleSet ss; - for (unsigned j = 0; j < num_in_edges; ++j) { - const Hypergraph::Edge& edge = hg.edges_[node.in_edges_[j]]; - prob_t p = edge.edge_prob_; - for (unsigned k = 0; k < edge.tail_nodes_.size(); ++k) - p *= node_probs[edge.tail_nodes_[k]]; - ss.add(p); -// cerr << log(ss[j]) << " ||| " << edge.rule_->AsString() << endl; - //z += p; - } -// for (unsigned j = 0; j < num_in_edges; ++j) { -// const Hypergraph::Edge& edge = hg.edges_[node.in_edges_[j]]; -// cerr << exp(log(ss[j] / z)) << " ||| " << edge.rule_->AsString() << endl; -// } -// cerr << " --- \n"; - sampled_edge = node.in_edges_[rng->SelectSample(ss)]; - } - sampled_deriv->push_back(sampled_edge); - const Hypergraph::Edge& edge = hg.edges_[sampled_edge]; - for (unsigned j = 0; j < edge.tail_nodes_.size(); ++j) { - q.push(edge.tail_nodes_[j]); - } - } - for (unsigned i = 0; i < sampled_deriv->size(); ++i) { - cerr << *hg.edges_[(*sampled_deriv)[i]].rule_ << endl; - } -} - -void IncrementDerivation(const Hypergraph& hg, const vector& d, HieroLMModel* plm, MT19937* rng) { - for (unsigned i = 0; i < d.size(); ++i) - plm->Increment(*hg.edges_[d[i]].rule_, rng); -} - -void DecrementDerivation(const Hypergraph& hg, const vector& d, HieroLMModel* plm, MT19937* rng) { - for (unsigned i = 0; i < d.size(); ++i) - plm->Decrement(*hg.edges_[d[i]].rule_, rng); -} - -int main(int argc, char** argv) { - po::variables_map conf; - vector grammars; - grammars.push_back(GrammarPtr(new NPGrammar)); - - InitCommandLine(argc, argv, &conf); - const unsigned samples = conf["samples"].as(); - - if (conf.count("random_seed")) - prng.reset(new MT19937(conf["random_seed"].as())); - else - prng.reset(new MT19937); - MT19937& rng = *prng; - - vector > corpuse; - set vocabe; - cerr << "Reading corpus...\n"; - ReadCorpus(conf["input"].as(), &corpuse, &vocabe); - cerr << "E-corpus size: " << corpuse.size() << " sentences\t (" << vocabe.size() << " word types)\n"; - HieroLMModel lm(vocabe.size()); - - plm = &lm; - ExhaustiveBottomUpParser parser("X", grammars); - - Hypergraph hg; - const int kX = -TD::Convert("X"); - const int kLP = FD::Convert("LogProb"); - SparseVector v; v.set_value(kLP, 1.0); - vector > derivs(corpuse.size()); - for (int SS=0; SS < samples; ++SS) { - for (int ci = 0; ci < corpuse.size(); ++ci) { - vector& src = corpuse[ci]; - Lattice lat(src.size()); - for (unsigned i = 0; i < src.size(); ++i) - lat[i].push_back(LatticeArc(src[i], 0.0, 1)); - cerr << TD::GetString(src) << endl; - hg.clear(); - parser.Parse(lat, &hg); // exhaustive parse - DecrementDerivation(hg, derivs[ci], &lm, &rng); - for (unsigned i = 0; i < hg.edges_.size(); ++i) { - TRule& r = *hg.edges_[i].rule_; - if (r.lhs_ == kX) - hg.edges_[i].edge_prob_ = lm.Prob(r); - } - vector d; - SampleDerivation(hg, &rng, &d, &lm); - derivs[ci] = d; - IncrementDerivation(hg, derivs[ci], &lm, &rng); - if (tofreelist.size() > 100000) { - cerr << "Freeing ... "; - for (unsigned i = 0; i < tofreelist.size(); ++i) - delete tofreelist[i]; - tofreelist.clear(); - cerr << "Freed.\n"; - } - } - cerr << "LLH=" << lm.Likelihood() << endl; - } - return 0; -} - diff --git a/gi/pf/learn_cfg.cc b/gi/pf/learn_cfg.cc new file mode 100644 index 00000000..3d202816 --- /dev/null +++ b/gi/pf/learn_cfg.cc @@ -0,0 +1,394 @@ +#include +#include +#include + +#include +#include +#include + +#include "inside_outside.h" +#include "hg.h" +#include "bottom_up_parser.h" +#include "fdict.h" +#include "grammar.h" +#include "m.h" +#include "trule.h" +#include "tdict.h" +#include "filelib.h" +#include "dict.h" +#include "sampler.h" +#include "ccrp.h" +#include "ccrp_onetable.h" + +using namespace std; +using namespace tr1; +namespace po = boost::program_options; + +shared_ptr prng; +vector nt_vocab; +vector nt_id_to_index; +static unsigned kMAX_RULE_SIZE = 0; +static unsigned kMAX_ARITY = 0; +static bool kALLOW_MIXED = true; // allow rules with mixed terminals and NTs + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { + po::options_description opts("Configuration options"); + opts.add_options() + ("samples,s",po::value()->default_value(1000),"Number of samples") + ("input,i",po::value(),"Read parallel data from") + ("max_rule_size,m", po::value()->default_value(0), "Maximum rule size (0 for unlimited)") + ("max_arity,a", po::value()->default_value(0), "Maximum number of nonterminals in a rule (0 for unlimited)") + ("no_mixed_rules,M", "Do not mix terminals and nonterminals in a rule RHS") + ("nonterminals,n", po::value()->default_value(1), "Size of nonterminal vocabulary") + ("random_seed,S",po::value(), "Random seed"); + po::options_description clo("Command line options"); + clo.add_options() + ("config", po::value(), "Configuration file") + ("help,h", "Print this help message and exit"); + po::options_description dconfig_options, dcmdline_options; + dconfig_options.add(opts); + dcmdline_options.add(opts).add(clo); + + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + if (conf->count("config")) { + ifstream config((*conf)["config"].as().c_str()); + po::store(po::parse_config_file(config, dconfig_options), *conf); + } + po::notify(*conf); + + if (conf->count("help") || (conf->count("input") == 0)) { + cerr << dcmdline_options << endl; + exit(1); + } +} + +unsigned ReadCorpus(const string& filename, + vector >* e, + set* vocab_e) { + e->clear(); + vocab_e->clear(); + istream* in; + if (filename == "-") + in = &cin; + else + in = new ifstream(filename.c_str()); + assert(*in); + string line; + unsigned toks = 0; + while(*in) { + getline(*in, line); + if (line.empty() && !*in) break; + e->push_back(vector()); + vector& le = e->back(); + TD::ConvertSentence(line, &le); + for (unsigned i = 0; i < le.size(); ++i) + vocab_e->insert(le[i]); + toks += le.size(); + } + if (in != &cin) delete in; + return toks; +} + +struct Grid { + // a b c d e + // 0 - 0 - - + vector grid; +}; + +struct BaseRuleModel { + explicit BaseRuleModel(unsigned term_size, + unsigned nonterm_size = 1) : + unif_term(1.0 / term_size), + unif_nonterm(1.0 / nonterm_size) {} + prob_t operator()(const TRule& r) const { + prob_t p; p.logeq(Md::log_poisson(1.0, r.f_.size())); + const prob_t term_prob((2.0 + 0.01*r.f_.size()) / (r.f_.size() + 2)); + const prob_t nonterm_prob(1.0 - term_prob.as_float()); + for (unsigned i = 0; i < r.f_.size(); ++i) { + if (r.f_[i] <= 0) { // nonterminal + p *= nonterm_prob; + p *= unif_nonterm; + } else { // terminal + p *= term_prob; + p *= unif_term; + } + } + return p; + } + const prob_t unif_term, unif_nonterm; +}; + +struct HieroLMModel { + explicit HieroLMModel(unsigned vocab_size, unsigned num_nts = 1) : p0(vocab_size, num_nts), nts(num_nts, CCRP(1,1,1,1)) {} + + prob_t Prob(const TRule& r) const { + return nts[nt_id_to_index[-r.lhs_]].probT(r, p0(r)); + } + + int Increment(const TRule& r, MT19937* rng) { + return nts[nt_id_to_index[-r.lhs_]].incrementT(r, p0(r), rng); + // return x.increment(r); + } + + int Decrement(const TRule& r, MT19937* rng) { + return nts[nt_id_to_index[-r.lhs_]].decrement(r, rng); + //return x.decrement(r); + } + + prob_t Likelihood() const { + prob_t p = prob_t::One(); + for (unsigned i = 0; i < nts.size(); ++i) { + prob_t q; q.logeq(nts[i].log_crp_prob()); + p *= q; + for (CCRP::const_iterator it = nts[i].begin(); it != nts[i].end(); ++it) { + prob_t tp = p0(it->first); + tp.poweq(it->second.table_counts_.size()); + p *= tp; + } + } + //for (CCRP_OneTable::const_iterator it = x.begin(); it != x.end(); ++it) + // p *= p0(it->first); + return p; + } + + void ResampleHyperparameters(MT19937* rng) { + for (unsigned i = 0; i < nts.size(); ++i) + nts[i].resample_hyperparameters(rng); + cerr << " d=" << nts[0].discount() << ", alpha=" << nts[0].concentration() << endl; + } + + const BaseRuleModel p0; + vector > nts; + //CCRP_OneTable x; +}; + +vector tofreelist; + +HieroLMModel* plm; + +struct NPGrammarIter : public GrammarIter, public RuleBin { + NPGrammarIter() : arity() { tofreelist.push_back(this); } + NPGrammarIter(const TRulePtr& inr, const int a, int symbol) : arity(a) { + if (inr) { + r.reset(new TRule(*inr)); + } else { + r.reset(new TRule); + } + TRule& rr = *r; + rr.lhs_ = nt_vocab[0]; + rr.f_.push_back(symbol); + rr.e_.push_back(symbol < 0 ? (1-int(arity)) : symbol); + tofreelist.push_back(this); + } + inline static unsigned NextArity(int cur_a, int symbol) { + return cur_a + (symbol <= 0 ? 1 : 0); + } + virtual int GetNumRules() const { + if (r) return nt_vocab.size(); else return 0; + } + virtual TRulePtr GetIthRule(int i) const { + if (i == 0) return r; + TRulePtr nr(new TRule(*r)); + nr->lhs_ = nt_vocab[i]; + return nr; + } + virtual int Arity() const { + return arity; + } + virtual const RuleBin* GetRules() const { + if (!r) return NULL; else return this; + } + virtual const GrammarIter* Extend(int symbol) const { + const int next_arity = NextArity(arity, symbol); + if (kMAX_ARITY && next_arity > kMAX_ARITY) + return NULL; + if (!kALLOW_MIXED && r) { + bool t1 = r->f_.front() <= 0; + bool t2 = symbol <= 0; + if (t1 != t2) return NULL; + } + if (!kMAX_RULE_SIZE || !r || (r->f_.size() < kMAX_RULE_SIZE)) + return new NPGrammarIter(r, next_arity, symbol); + else + return NULL; + } + const unsigned char arity; + TRulePtr r; +}; + +struct NPGrammar : public Grammar { + virtual const GrammarIter* GetRoot() const { + return new NPGrammarIter; + } +}; + +prob_t TotalProb(const Hypergraph& hg) { + return Inside(hg); +} + +void SampleDerivation(const Hypergraph& hg, MT19937* rng, vector* sampled_deriv) { + vector node_probs; + Inside(hg, &node_probs); + queue q; + q.push(hg.nodes_.size() - 2); + while(!q.empty()) { + unsigned cur_node_id = q.front(); +// cerr << "NODE=" << cur_node_id << endl; + q.pop(); + const Hypergraph::Node& node = hg.nodes_[cur_node_id]; + const unsigned num_in_edges = node.in_edges_.size(); + unsigned sampled_edge = 0; + if (num_in_edges == 1) { + sampled_edge = node.in_edges_[0]; + } else { + //prob_t z; + assert(num_in_edges > 1); + SampleSet ss; + for (unsigned j = 0; j < num_in_edges; ++j) { + const Hypergraph::Edge& edge = hg.edges_[node.in_edges_[j]]; + prob_t p = edge.edge_prob_; + for (unsigned k = 0; k < edge.tail_nodes_.size(); ++k) + p *= node_probs[edge.tail_nodes_[k]]; + ss.add(p); +// cerr << log(ss[j]) << " ||| " << edge.rule_->AsString() << endl; + //z += p; + } +// for (unsigned j = 0; j < num_in_edges; ++j) { +// const Hypergraph::Edge& edge = hg.edges_[node.in_edges_[j]]; +// cerr << exp(log(ss[j] / z)) << " ||| " << edge.rule_->AsString() << endl; +// } +// cerr << " --- \n"; + sampled_edge = node.in_edges_[rng->SelectSample(ss)]; + } + sampled_deriv->push_back(sampled_edge); + const Hypergraph::Edge& edge = hg.edges_[sampled_edge]; + for (unsigned j = 0; j < edge.tail_nodes_.size(); ++j) { + q.push(edge.tail_nodes_[j]); + } + } + for (unsigned i = 0; i < sampled_deriv->size(); ++i) { + cerr << *hg.edges_[(*sampled_deriv)[i]].rule_ << endl; + } +} + +void IncrementDerivation(const Hypergraph& hg, const vector& d, HieroLMModel* plm, MT19937* rng) { + for (unsigned i = 0; i < d.size(); ++i) + plm->Increment(*hg.edges_[d[i]].rule_, rng); +} + +void DecrementDerivation(const Hypergraph& hg, const vector& d, HieroLMModel* plm, MT19937* rng) { + for (unsigned i = 0; i < d.size(); ++i) + plm->Decrement(*hg.edges_[d[i]].rule_, rng); +} + +int main(int argc, char** argv) { + po::variables_map conf; + + InitCommandLine(argc, argv, &conf); + nt_vocab.resize(conf["nonterminals"].as()); + assert(nt_vocab.size() > 0); + assert(nt_vocab.size() < 26); + { + string nt = "X"; + for (unsigned i = 0; i < nt_vocab.size(); ++i) { + if (nt_vocab.size() > 1) nt[0] = ('A' + i); + int pid = TD::Convert(nt); + nt_vocab[i] = -pid; + if (pid >= nt_id_to_index.size()) { + nt_id_to_index.resize(pid + 1, -1); + } + nt_id_to_index[pid] = i; + } + } + vector grammars; + grammars.push_back(GrammarPtr(new NPGrammar)); + + const unsigned samples = conf["samples"].as(); + kMAX_RULE_SIZE = conf["max_rule_size"].as(); + if (kMAX_RULE_SIZE == 1) { + cerr << "Invalid maximum rule size: must be 0 or >1\n"; + return 1; + } + kMAX_ARITY = conf["max_arity"].as(); + if (kMAX_ARITY == 1) { + cerr << "Invalid maximum arity: must be 0 or >1\n"; + return 1; + } + kALLOW_MIXED = !conf.count("no_mixed_rules"); + + if (conf.count("random_seed")) + prng.reset(new MT19937(conf["random_seed"].as())); + else + prng.reset(new MT19937); + MT19937& rng = *prng; + vector > corpuse; + set vocabe; + cerr << "Reading corpus...\n"; + const unsigned toks = ReadCorpus(conf["input"].as(), &corpuse, &vocabe); + cerr << "E-corpus size: " << corpuse.size() << " sentences\t (" << vocabe.size() << " word types)\n"; + HieroLMModel lm(vocabe.size(), nt_vocab.size()); + + plm = &lm; + ExhaustiveBottomUpParser parser(TD::Convert(-nt_vocab[0]), grammars); + + Hypergraph hg; + const int kGoal = -TD::Convert("Goal"); + const int kLP = FD::Convert("LogProb"); + SparseVector v; v.set_value(kLP, 1.0); + vector > derivs(corpuse.size()); + vector cl(corpuse.size()); + for (int ci = 0; ci < corpuse.size(); ++ci) { + vector& src = corpuse[ci]; + Lattice& lat = cl[ci]; + lat.resize(src.size()); + for (unsigned i = 0; i < src.size(); ++i) + lat[i].push_back(LatticeArc(src[i], 0.0, 1)); + } + for (int SS=0; SS < samples; ++SS) { + const bool is_last = ((samples - 1) == SS); + prob_t dlh = prob_t::One(); + for (int ci = 0; ci < corpuse.size(); ++ci) { + const vector& src = corpuse[ci]; + const Lattice& lat = cl[ci]; + cerr << TD::GetString(src) << endl; + hg.clear(); + parser.Parse(lat, &hg); // exhaustive parse + vector& d = derivs[ci]; + if (!is_last) DecrementDerivation(hg, d, &lm, &rng); + for (unsigned i = 0; i < hg.edges_.size(); ++i) { + TRule& r = *hg.edges_[i].rule_; + if (r.lhs_ == kGoal) + hg.edges_[i].edge_prob_ = prob_t::One(); + else + hg.edges_[i].edge_prob_ = lm.Prob(r); + } + if (!is_last) { + d.clear(); + SampleDerivation(hg, &rng, &d); + IncrementDerivation(hg, derivs[ci], &lm, &rng); + } else { + prob_t p = TotalProb(hg); + dlh *= p; + cerr << " p(sentence) = " << log(p) << "\t" << log(dlh) << endl; + } + if (tofreelist.size() > 200000) { + cerr << "Freeing ... "; + for (unsigned i = 0; i < tofreelist.size(); ++i) + delete tofreelist[i]; + tofreelist.clear(); + cerr << "Freed.\n"; + } + } + double llh = log(lm.Likelihood()); + cerr << "LLH=" << llh << "\tENTROPY=" << (-llh / log(2) / toks) << "\tPPL=" << pow(2, -llh / log(2) / toks) << endl; + if (SS % 10 == 9) lm.ResampleHyperparameters(&rng); + if (is_last) { + double z = log(dlh); + cerr << "TOTAL_PROB=" << z << "\tENTROPY=" << (-z / log(2) / toks) << "\tPPL=" << pow(2, -z / log(2) / toks) << endl; + } + } + for (unsigned i = 0; i < nt_vocab.size(); ++i) + cerr << lm.nts[i] << endl; + return 0; +} + -- cgit v1.2.3 From d4c89a181635b8ca1286bcc8887dfb368000cc6f Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Mon, 27 Feb 2012 02:40:00 +0000 Subject: fix base distribution, partially --- gi/pf/learn_cfg.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'gi/pf') diff --git a/gi/pf/learn_cfg.cc b/gi/pf/learn_cfg.cc index 3d202816..6e574035 100644 --- a/gi/pf/learn_cfg.cc +++ b/gi/pf/learn_cfg.cc @@ -106,10 +106,10 @@ struct BaseRuleModel { const prob_t nonterm_prob(1.0 - term_prob.as_float()); for (unsigned i = 0; i < r.f_.size(); ++i) { if (r.f_[i] <= 0) { // nonterminal - p *= nonterm_prob; + if (kALLOW_MIXED) p *= nonterm_prob; p *= unif_nonterm; } else { // terminal - p *= term_prob; + if (kALLOW_MIXED) p *= term_prob; p *= unif_term; } } -- cgit v1.2.3 From 1f0ded1e7f59b13d7512111dd910d0f4b2f82d02 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Tue, 28 Feb 2012 00:47:20 -0500 Subject: optional hierarchical prior --- gi/pf/learn_cfg.cc | 46 ++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 40 insertions(+), 6 deletions(-) (limited to 'gi/pf') diff --git a/gi/pf/learn_cfg.cc b/gi/pf/learn_cfg.cc index 6e574035..b2ca029a 100644 --- a/gi/pf/learn_cfg.cc +++ b/gi/pf/learn_cfg.cc @@ -30,6 +30,7 @@ vector nt_id_to_index; static unsigned kMAX_RULE_SIZE = 0; static unsigned kMAX_ARITY = 0; static bool kALLOW_MIXED = true; // allow rules with mixed terminals and NTs +static bool kHIERARCHICAL_PRIOR = false; void InitCommandLine(int argc, char** argv, po::variables_map* conf) { po::options_description opts("Configuration options"); @@ -40,11 +41,12 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) { ("max_arity,a", po::value()->default_value(0), "Maximum number of nonterminals in a rule (0 for unlimited)") ("no_mixed_rules,M", "Do not mix terminals and nonterminals in a rule RHS") ("nonterminals,n", po::value()->default_value(1), "Size of nonterminal vocabulary") + ("hierarchical_prior,h", "Use hierarchical prior") ("random_seed,S",po::value(), "Random seed"); po::options_description clo("Command line options"); clo.add_options() ("config", po::value(), "Configuration file") - ("help,h", "Print this help message and exit"); + ("help", "Print this help message and exit"); po::options_description dconfig_options, dcmdline_options; dconfig_options.add(opts); dcmdline_options.add(opts).add(clo); @@ -119,19 +121,35 @@ struct BaseRuleModel { }; struct HieroLMModel { - explicit HieroLMModel(unsigned vocab_size, unsigned num_nts = 1) : p0(vocab_size, num_nts), nts(num_nts, CCRP(1,1,1,1)) {} + explicit HieroLMModel(unsigned vocab_size, unsigned num_nts = 1) : + base(vocab_size, num_nts), + q0(1,1,1,1), + nts(num_nts, CCRP(1,1,1,1)) {} prob_t Prob(const TRule& r) const { return nts[nt_id_to_index[-r.lhs_]].probT(r, p0(r)); } + inline prob_t p0(const TRule& r) const { + if (kHIERARCHICAL_PRIOR) + return q0.probT(r, base(r)); + else + return base(r); + } + int Increment(const TRule& r, MT19937* rng) { - return nts[nt_id_to_index[-r.lhs_]].incrementT(r, p0(r), rng); + const int delta = nts[nt_id_to_index[-r.lhs_]].incrementT(r, p0(r), rng); + if (kHIERARCHICAL_PRIOR && delta) + q0.incrementT(r, base(r), rng); + return delta; // return x.increment(r); } int Decrement(const TRule& r, MT19937* rng) { - return nts[nt_id_to_index[-r.lhs_]].decrement(r, rng); + const int delta = nts[nt_id_to_index[-r.lhs_]].decrement(r, rng); + if (kHIERARCHICAL_PRIOR && delta) + q0.decrement(r, rng); + return delta; //return x.decrement(r); } @@ -146,18 +164,32 @@ struct HieroLMModel { p *= tp; } } + if (kHIERARCHICAL_PRIOR) { + prob_t q; q.logeq(q0.log_crp_prob()); + p *= q; + for (CCRP::const_iterator it = q0.begin(); it != q0.end(); ++it) { + prob_t tp = base(it->first); + tp.poweq(it->second.table_counts_.size()); + p *= tp; + } + } //for (CCRP_OneTable::const_iterator it = x.begin(); it != x.end(); ++it) - // p *= p0(it->first); + // p *= base(it->first); return p; } void ResampleHyperparameters(MT19937* rng) { for (unsigned i = 0; i < nts.size(); ++i) nts[i].resample_hyperparameters(rng); + if (kHIERARCHICAL_PRIOR) { + q0.resample_hyperparameters(rng); + cerr << "[base d=" << q0.discount() << ", alpha=" << q0.discount() << "]"; + } cerr << " d=" << nts[0].discount() << ", alpha=" << nts[0].concentration() << endl; } - const BaseRuleModel p0; + const BaseRuleModel base; + CCRP q0; vector > nts; //CCRP_OneTable x; }; @@ -316,6 +348,8 @@ int main(int argc, char** argv) { } kALLOW_MIXED = !conf.count("no_mixed_rules"); + kHIERARCHICAL_PRIOR = conf.count("hierarchical_prior"); + if (conf.count("random_seed")) prng.reset(new MT19937(conf["random_seed"].as())); else -- cgit v1.2.3 From e0507d1aa96c6b1348e6a202beb95f63d8662258 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Sat, 3 Mar 2012 03:24:53 -0500 Subject: PYP language model (Teh 2006) --- decoder/fst_translator.cc | 5 +- gi/pf/Makefile.am | 4 +- gi/pf/pyp_lm.cc | 150 ++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 157 insertions(+), 2 deletions(-) create mode 100644 gi/pf/pyp_lm.cc (limited to 'gi/pf') diff --git a/decoder/fst_translator.cc b/decoder/fst_translator.cc index 38dbd717..074de4c9 100644 --- a/decoder/fst_translator.cc +++ b/decoder/fst_translator.cc @@ -30,7 +30,10 @@ struct FSTTranslatorImpl { if (input.find("{\"rules\"") == 0) { istringstream is(input); Hypergraph src_cfg_hg; - assert(HypergraphIO::ReadFromJSON(&is, &src_cfg_hg)); + if (!HypergraphIO::ReadFromJSON(&is, &src_cfg_hg)) { + cerr << "Failed to read HG from JSON.\n"; + abort(); + } if (add_pass_through_rules) { SparseVector feats; feats.set_value(FD::Convert("PassThrough"), 1); diff --git a/gi/pf/Makefile.am b/gi/pf/Makefile.am index 0cf0bc63..7cf9c14d 100644 --- a/gi/pf/Makefile.am +++ b/gi/pf/Makefile.am @@ -1,4 +1,4 @@ -bin_PROGRAMS = cbgi brat dpnaive pfbrat pfdist itg pfnaive condnaive align-lexonly align-lexonly-pyp learn_cfg +bin_PROGRAMS = cbgi brat dpnaive pfbrat pfdist itg pfnaive condnaive align-lexonly align-lexonly-pyp learn_cfg pyp_lm noinst_LIBRARIES = libpf.a libpf_a_SOURCES = base_distributions.cc reachability.cc cfg_wfst_composer.cc corpus.cc unigrams.cc ngram_base.cc @@ -9,6 +9,8 @@ align_lexonly_pyp_SOURCES = align-lexonly-pyp.cc itg_SOURCES = itg.cc +pyp_lm_SOURCES = pyp_lm.cc + learn_cfg_SOURCES = learn_cfg.cc condnaive_SOURCES = condnaive.cc diff --git a/gi/pf/pyp_lm.cc b/gi/pf/pyp_lm.cc new file mode 100644 index 00000000..2837e33c --- /dev/null +++ b/gi/pf/pyp_lm.cc @@ -0,0 +1,150 @@ +#include +#include +#include + +#include +#include +#include + +#include "corpus_tools.h" +#include "m.h" +#include "tdict.h" +#include "sampler.h" +#include "ccrp.h" +#include "ccrp_onetable.h" + +using namespace std; +using namespace tr1; +namespace po = boost::program_options; + +shared_ptr prng; + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { + po::options_description opts("Configuration options"); + opts.add_options() + ("samples,s",po::value()->default_value(1000),"Number of samples") + ("input,i",po::value(),"Read data from") + ("random_seed,S",po::value(), "Random seed"); + po::options_description clo("Command line options"); + clo.add_options() + ("config", po::value(), "Configuration file") + ("help", "Print this help message and exit"); + po::options_description dconfig_options, dcmdline_options; + dconfig_options.add(opts); + dcmdline_options.add(opts).add(clo); + + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + if (conf->count("config")) { + ifstream config((*conf)["config"].as().c_str()); + po::store(po::parse_config_file(config, dconfig_options), *conf); + } + po::notify(*conf); + + if (conf->count("help") || (conf->count("input") == 0)) { + cerr << dcmdline_options << endl; + exit(1); + } +} + +template struct PYPLM; + +// uniform base distribution +template<> struct PYPLM<0> { + PYPLM(unsigned vs) : p0(1.0 / vs) {} + void increment(WordID w, const vector& context, MT19937* rng) const {} + void decrement(WordID w, const vector& context, MT19937* rng) const {} + double prob(WordID w, const vector& context) const { return p0; } + const double p0; +}; + +// represents an N-gram LM +template struct PYPLM { + PYPLM(unsigned vs) : backoff(vs) {} + void increment(WordID w, const vector& context, MT19937* rng) { + const double bo = backoff.prob(w, context); + static vector lookup(N-1); + for (unsigned i = 0; i < N-1; ++i) + lookup[i] = context[context.size() - 1 - i]; + typename unordered_map, CCRP, boost::hash > >::iterator it = p.find(lookup); + if (it == p.end()) + it = p.insert(make_pair(lookup, CCRP(1,1,1,1))).first; + if (it->second.increment(w, bo, rng)) + backoff.increment(w, context, rng); + } + void decrement(WordID w, const vector& context, MT19937* rng) { + static vector lookup(N-1); + for (unsigned i = 0; i < N-1; ++i) + lookup[i] = context[context.size() - 1 - i]; + typename unordered_map, CCRP, boost::hash > >::iterator it = p.find(lookup); + assert(it != p.end()); + if (it->second.decrement(w, rng)) + backoff.decrement(w, context, rng); + } + double prob(WordID w, const vector& context) const { + const double bo = backoff.prob(w, context); + static vector lookup(N-1); + for (unsigned i = 0; i < N-1; ++i) + lookup[i] = context[context.size() - 1 - i]; + typename unordered_map, CCRP, boost::hash > >::const_iterator it = p.find(lookup); + if (it == p.end()) return bo; + return it->second.prob(w, bo); + } + PYPLM backoff; + unordered_map, CCRP, boost::hash > > p; +}; + +int main(int argc, char** argv) { + po::variables_map conf; + + InitCommandLine(argc, argv, &conf); + const unsigned samples = conf["samples"].as(); + if (conf.count("random_seed")) + prng.reset(new MT19937(conf["random_seed"].as())); + else + prng.reset(new MT19937); + MT19937& rng = *prng; + vector > corpuse; + set vocabe; + const WordID kEOS = TD::Convert(""); + cerr << "Reading corpus...\n"; + CorpusTools::ReadFromFile(conf["input"].as(), &corpuse, &vocabe); + cerr << "E-corpus size: " << corpuse.size() << " sentences\t (" << vocabe.size() << " word types)\n"; +#define kORDER 5 + PYPLM lm(vocabe.size()); + vector ctx(kORDER - 1, TD::Convert("")); + int mci = corpuse.size() * 99 / 100; + for (int SS=0; SS < samples; ++SS) { + for (int ci = 0; ci < mci; ++ci) { + ctx.resize(kORDER - 1); + const vector& s = corpuse[ci]; + for (int i = 0; i <= s.size(); ++i) { + WordID w = (i < s.size() ? s[i] : kEOS); + if (SS > 0) lm.decrement(w, ctx, &rng); + lm.increment(w, ctx, &rng); + ctx.push_back(w); + } + if (SS > 0) lm.decrement(kEOS, ctx, &rng); + lm.increment(kEOS, ctx, &rng); + } + } + double llh = 0; + unsigned cnt = 0; + for (int ci = mci; ci < corpuse.size(); ++ci) { + ctx.resize(kORDER - 1); + const vector& s = corpuse[ci]; + for (int i = 0; i <= s.size(); ++i) { + WordID w = (i < s.size() ? s[i] : kEOS); + double lp = log(lm.prob(w, ctx)) / log(2); + cerr << "p(" << TD::Convert(w) << " | " << TD::GetString(ctx) << ") = " << lp << endl; + ctx.push_back(w); + llh -= lp; + cnt++; + } + } + cerr << " Log_10 prob: " << (llh * log(2) / log(10)) << endl; + cerr << " Count: " << (cnt) << endl; + cerr << "Cross-entropy: " << (llh / cnt) << endl; + cerr << " Perplexity: " << pow(2, llh / cnt) << endl; + return 0; +} + -- cgit v1.2.3 From 2579dd24d3833823527e688196276c2fab381b37 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Sat, 3 Mar 2012 17:16:58 -0500 Subject: pyp lm, fixed hyperparameters inference --- gi/pf/align-lexonly-pyp.cc | 2 +- gi/pf/align-lexonly.cc | 2 +- gi/pf/brat.cc | 2 +- gi/pf/conditional_pseg.h | 4 +- gi/pf/learn_cfg.cc | 4 +- gi/pf/pfbrat.cc | 2 +- gi/pf/pyp_lm.cc | 70 ++++++++++++++++++++++++++++--- phrasinator/gibbs_train_plm.cc | 2 +- utils/ccrp.h | 95 ++++++++++++++++++------------------------ utils/ccrp_nt.h | 52 +++++++++++------------ utils/ccrp_onetable.h | 70 +++++++++++++++---------------- utils/mfcr.h | 58 +++++++++++++------------- 12 files changed, 203 insertions(+), 160 deletions(-) (limited to 'gi/pf') diff --git a/gi/pf/align-lexonly-pyp.cc b/gi/pf/align-lexonly-pyp.cc index e24cb457..4ce7cf62 100644 --- a/gi/pf/align-lexonly-pyp.cc +++ b/gi/pf/align-lexonly-pyp.cc @@ -104,7 +104,7 @@ struct HierarchicalWordBase { } void Summary() const { - cerr << "NUMBER OF CUSTOMERS: " << r.num_customers() << " (d=" << r.d() << ",\\alpha=" << r.alpha() << ')' << endl; + cerr << "NUMBER OF CUSTOMERS: " << r.num_customers() << " (d=" << r.discount() << ",\\alpha=" << r.alpha() << ')' << endl; for (MFCR >::const_iterator it = r.begin(); it != r.end(); ++it) cerr << " " << it->second.total_dish_count_ << " (on " << it->second.table_counts_.size() << " tables)" << TD::GetString(it->first) << endl; } diff --git a/gi/pf/align-lexonly.cc b/gi/pf/align-lexonly.cc index 8c1d689f..dbc9dc07 100644 --- a/gi/pf/align-lexonly.cc +++ b/gi/pf/align-lexonly.cc @@ -105,7 +105,7 @@ struct HierarchicalWordBase { } void Summary() const { - cerr << "NUMBER OF CUSTOMERS: " << r.num_customers() << " (\\alpha=" << r.concentration() << ')' << endl; + cerr << "NUMBER OF CUSTOMERS: " << r.num_customers() << " (\\alpha=" << r.alpha() << ')' << endl; for (CCRP_NoTable >::const_iterator it = r.begin(); it != r.end(); ++it) cerr << " " << it->second << '\t' << TD::GetString(it->first) << endl; } diff --git a/gi/pf/brat.cc b/gi/pf/brat.cc index 7b60ef23..c2c52760 100644 --- a/gi/pf/brat.cc +++ b/gi/pf/brat.cc @@ -191,7 +191,7 @@ struct UniphraseLM { void ResampleHyperparameters(MT19937* rng) { phrases_.resample_hyperparameters(rng); gen_.resample_hyperparameters(rng); - cerr << " " << phrases_.concentration(); + cerr << " " << phrases_.alpha(); } CCRP_NoTable > phrases_; diff --git a/gi/pf/conditional_pseg.h b/gi/pf/conditional_pseg.h index 2e9e38fc..f9841cbf 100644 --- a/gi/pf/conditional_pseg.h +++ b/gi/pf/conditional_pseg.h @@ -22,7 +22,7 @@ struct MConditionalTranslationModel { void Summary() const { std::cerr << "Number of conditioning contexts: " << r.size() << std::endl; for (RuleModelHash::const_iterator it = r.begin(); it != r.end(); ++it) { - std::cerr << TD::GetString(it->first) << " \t(d=" << it->second.d() << ",\\alpha = " << it->second.alpha() << ") --------------------------" << std::endl; + std::cerr << TD::GetString(it->first) << " \t(d=" << it->second.discount() << ",\\alpha = " << it->second.alpha() << ") --------------------------" << std::endl; for (MFCR::const_iterator i2 = it->second.begin(); i2 != it->second.end(); ++i2) std::cerr << " " << -1 << '\t' << i2->first << std::endl; } @@ -95,7 +95,7 @@ struct ConditionalTranslationModel { void Summary() const { std::cerr << "Number of conditioning contexts: " << r.size() << std::endl; for (RuleModelHash::const_iterator it = r.begin(); it != r.end(); ++it) { - std::cerr << TD::GetString(it->first) << " \t(\\alpha = " << it->second.concentration() << ") --------------------------" << std::endl; + std::cerr << TD::GetString(it->first) << " \t(\\alpha = " << it->second.alpha() << ") --------------------------" << std::endl; for (CCRP_NoTable::const_iterator i2 = it->second.begin(); i2 != it->second.end(); ++i2) std::cerr << " " << i2->second << '\t' << i2->first << std::endl; } diff --git a/gi/pf/learn_cfg.cc b/gi/pf/learn_cfg.cc index b2ca029a..5b748311 100644 --- a/gi/pf/learn_cfg.cc +++ b/gi/pf/learn_cfg.cc @@ -183,9 +183,9 @@ struct HieroLMModel { nts[i].resample_hyperparameters(rng); if (kHIERARCHICAL_PRIOR) { q0.resample_hyperparameters(rng); - cerr << "[base d=" << q0.discount() << ", alpha=" << q0.discount() << "]"; + cerr << "[base d=" << q0.discount() << ", alpha=" << q0.alpha() << "]"; } - cerr << " d=" << nts[0].discount() << ", alpha=" << nts[0].concentration() << endl; + cerr << " d=" << nts[0].discount() << ", alpha=" << nts[0].alpha() << endl; } const BaseRuleModel base; diff --git a/gi/pf/pfbrat.cc b/gi/pf/pfbrat.cc index 7b60ef23..c2c52760 100644 --- a/gi/pf/pfbrat.cc +++ b/gi/pf/pfbrat.cc @@ -191,7 +191,7 @@ struct UniphraseLM { void ResampleHyperparameters(MT19937* rng) { phrases_.resample_hyperparameters(rng); gen_.resample_hyperparameters(rng); - cerr << " " << phrases_.concentration(); + cerr << " " << phrases_.alpha(); } CCRP_NoTable > phrases_; diff --git a/gi/pf/pyp_lm.cc b/gi/pf/pyp_lm.cc index 2837e33c..0d85536c 100644 --- a/gi/pf/pyp_lm.cc +++ b/gi/pf/pyp_lm.cc @@ -50,16 +50,19 @@ template struct PYPLM; // uniform base distribution template<> struct PYPLM<0> { - PYPLM(unsigned vs) : p0(1.0 / vs) {} - void increment(WordID w, const vector& context, MT19937* rng) const {} - void decrement(WordID w, const vector& context, MT19937* rng) const {} + PYPLM(unsigned vs) : p0(1.0 / vs), draws() {} + void increment(WordID w, const vector& context, MT19937* rng) { ++draws; } + void decrement(WordID w, const vector& context, MT19937* rng) { --draws; assert(draws >= 0); } double prob(WordID w, const vector& context) const { return p0; } + void resample_hyperparameters(MT19937* rng, const unsigned nloop, const unsigned niterations) {} + double log_likelihood() const { return draws * log(p0); } const double p0; + int draws; }; // represents an N-gram LM template struct PYPLM { - PYPLM(unsigned vs) : backoff(vs) {} + PYPLM(unsigned vs) : backoff(vs), d(0.8), alpha(1.0) {} void increment(WordID w, const vector& context, MT19937* rng) { const double bo = backoff.prob(w, context); static vector lookup(N-1); @@ -67,7 +70,7 @@ template struct PYPLM { lookup[i] = context[context.size() - 1 - i]; typename unordered_map, CCRP, boost::hash > >::iterator it = p.find(lookup); if (it == p.end()) - it = p.insert(make_pair(lookup, CCRP(1,1,1,1))).first; + it = p.insert(make_pair(lookup, CCRP(d,alpha))).first; if (it->second.increment(w, bo, rng)) backoff.increment(w, context, rng); } @@ -89,7 +92,58 @@ template struct PYPLM { if (it == p.end()) return bo; return it->second.prob(w, bo); } + + double log_likelihood() const { + return log_likelihood(d, alpha) + backoff.log_likelihood(); + } + + double log_likelihood(const double& dd, const double& aa) const { + if (aa <= -dd) return -std::numeric_limits::infinity(); + double llh = Md::log_beta_density(dd, 1, 1) + Md::log_gamma_density(aa, 1, 1); + typename unordered_map, CCRP, boost::hash > >::const_iterator it; + for (it = p.begin(); it != p.end(); ++it) + llh += it->second.log_crp_prob(dd, aa); + return llh; + } + + struct DiscountResampler { + DiscountResampler(const PYPLM& m) : m_(m) {} + const PYPLM& m_; + double operator()(const double& proposed_discount) const { + return m_.log_likelihood(proposed_discount, m_.alpha); + } + }; + + struct AlphaResampler { + AlphaResampler(const PYPLM& m) : m_(m) {} + const PYPLM& m_; + double operator()(const double& proposed_alpha) const { + return m_.log_likelihood(m_.d, proposed_alpha); + } + }; + + void resample_hyperparameters(MT19937* rng, const unsigned nloop = 5, const unsigned niterations = 10) { + DiscountResampler dr(*this); + AlphaResampler ar(*this); + for (int iter = 0; iter < nloop; ++iter) { + alpha = slice_sampler1d(ar, alpha, *rng, 0.0, + std::numeric_limits::infinity(), 0.0, niterations, 100*niterations); + d = slice_sampler1d(dr, d, *rng, std::numeric_limits::min(), + 1.0, 0.0, niterations, 100*niterations); + } + alpha = slice_sampler1d(ar, alpha, *rng, 0.0, + std::numeric_limits::infinity(), 0.0, niterations, 100*niterations); + typename unordered_map, CCRP, boost::hash > >::iterator it; + cerr << "PYPLM<" << N << ">(d=" << d << ",a=" << alpha << ") = " << log_likelihood(d, alpha) << endl; + for (it = p.begin(); it != p.end(); ++it) { + it->second.set_discount(d); + it->second.set_alpha(alpha); + } + backoff.resample_hyperparameters(rng, nloop, niterations); + } + PYPLM backoff; + double d, alpha; unordered_map, CCRP, boost::hash > > p; }; @@ -109,7 +163,7 @@ int main(int argc, char** argv) { cerr << "Reading corpus...\n"; CorpusTools::ReadFromFile(conf["input"].as(), &corpuse, &vocabe); cerr << "E-corpus size: " << corpuse.size() << " sentences\t (" << vocabe.size() << " word types)\n"; -#define kORDER 5 +#define kORDER 3 PYPLM lm(vocabe.size()); vector ctx(kORDER - 1, TD::Convert("")); int mci = corpuse.size() * 99 / 100; @@ -126,6 +180,10 @@ int main(int argc, char** argv) { if (SS > 0) lm.decrement(kEOS, ctx, &rng); lm.increment(kEOS, ctx, &rng); } + if (SS % 10 == 9) { + cerr << " [LLH=" << lm.log_likelihood() << "]" << endl; + if (SS % 20 == 19) lm.resample_hyperparameters(&rng); + } else { cerr << '.' << flush; } } double llh = 0; unsigned cnt = 0; diff --git a/phrasinator/gibbs_train_plm.cc b/phrasinator/gibbs_train_plm.cc index 66b46011..54861dcb 100644 --- a/phrasinator/gibbs_train_plm.cc +++ b/phrasinator/gibbs_train_plm.cc @@ -252,7 +252,7 @@ struct UniphraseLM { void ResampleHyperparameters(MT19937* rng) { phrases_.resample_hyperparameters(rng); gen_.resample_hyperparameters(rng); - cerr << " d=" << phrases_.discount() << ",c=" << phrases_.concentration(); + cerr << " d=" << phrases_.discount() << ",a=" << phrases_.alpha(); } CCRP > phrases_; diff --git a/utils/ccrp.h b/utils/ccrp.h index 1a9e3ed5..d9a38089 100644 --- a/utils/ccrp.h +++ b/utils/ccrp.h @@ -17,35 +17,37 @@ template > class CCRP { public: - CCRP(double disc, double conc) : + CCRP(double disc, double alpha) : num_tables_(), num_customers_(), discount_(disc), - concentration_(conc), + alpha_(alpha), discount_prior_alpha_(std::numeric_limits::quiet_NaN()), discount_prior_beta_(std::numeric_limits::quiet_NaN()), - concentration_prior_shape_(std::numeric_limits::quiet_NaN()), - concentration_prior_rate_(std::numeric_limits::quiet_NaN()) {} + alpha_prior_shape_(std::numeric_limits::quiet_NaN()), + alpha_prior_rate_(std::numeric_limits::quiet_NaN()) {} CCRP(double d_alpha, double d_beta, double c_shape, double c_rate, double d = 0.9, double c = 1.0) : num_tables_(), num_customers_(), discount_(d), - concentration_(c), + alpha_(c), discount_prior_alpha_(d_alpha), discount_prior_beta_(d_beta), - concentration_prior_shape_(c_shape), - concentration_prior_rate_(c_rate) {} + alpha_prior_shape_(c_shape), + alpha_prior_rate_(c_rate) {} double discount() const { return discount_; } - double concentration() const { return concentration_; } + double alpha() const { return alpha_; } + void set_discount(double d) { discount_ = d; } + void set_alpha(double a) { alpha_ = a; } bool has_discount_prior() const { return !std::isnan(discount_prior_alpha_); } - bool has_concentration_prior() const { - return !std::isnan(concentration_prior_shape_); + bool has_alpha_prior() const { + return !std::isnan(alpha_prior_shape_); } void clear() { @@ -79,7 +81,7 @@ class CCRP { DishLocations& loc = dish_locs_[dish]; bool share_table = false; if (loc.total_dish_count_) { - const double p_empty = (concentration_ + num_tables_ * discount_) * p0; + const double p_empty = (alpha_ + num_tables_ * discount_) * p0; const double p_share = (loc.total_dish_count_ - loc.table_counts_.size() * discount_); share_table = rng->SelectSample(p_empty, p_share); } @@ -113,7 +115,7 @@ class CCRP { DishLocations& loc = dish_locs_[dish]; bool share_table = false; if (loc.total_dish_count_) { - const T p_empty = T(concentration_ + num_tables_ * discount_) * p0; + const T p_empty = T(alpha_ + num_tables_ * discount_) * p0; const T p_share = T(loc.total_dish_count_ - loc.table_counts_.size() * discount_); share_table = rng->SelectSample(p_empty, p_share); } @@ -180,63 +182,46 @@ class CCRP { double prob(const Dish& dish, const double& p0) const { const typename std::tr1::unordered_map::const_iterator it = dish_locs_.find(dish); - const double r = num_tables_ * discount_ + concentration_; + const double r = num_tables_ * discount_ + alpha_; if (it == dish_locs_.end()) { - return r * p0 / (num_customers_ + concentration_); + return r * p0 / (num_customers_ + alpha_); } else { return (it->second.total_dish_count_ - discount_ * it->second.table_counts_.size() + r * p0) / - (num_customers_ + concentration_); + (num_customers_ + alpha_); } } template T probT(const Dish& dish, const T& p0) const { const typename std::tr1::unordered_map::const_iterator it = dish_locs_.find(dish); - const T r = T(num_tables_ * discount_ + concentration_); + const T r = T(num_tables_ * discount_ + alpha_); if (it == dish_locs_.end()) { - return r * p0 / T(num_customers_ + concentration_); + return r * p0 / T(num_customers_ + alpha_); } else { return (T(it->second.total_dish_count_ - discount_ * it->second.table_counts_.size()) + r * p0) / - T(num_customers_ + concentration_); + T(num_customers_ + alpha_); } } double log_crp_prob() const { - return log_crp_prob(discount_, concentration_); - } - - static double log_beta_density(const double& x, const double& alpha, const double& beta) { - assert(x > 0.0); - assert(x < 1.0); - assert(alpha > 0.0); - assert(beta > 0.0); - const double lp = (alpha-1)*log(x)+(beta-1)*log(1-x)+lgamma(alpha+beta)-lgamma(alpha)-lgamma(beta); - return lp; - } - - static double log_gamma_density(const double& x, const double& shape, const double& rate) { - assert(x >= 0.0); - assert(shape > 0.0); - assert(rate > 0.0); - const double lp = (shape-1)*log(x) - shape*log(rate) - x/rate - lgamma(shape); - return lp; + return log_crp_prob(discount_, alpha_); } // taken from http://en.wikipedia.org/wiki/Chinese_restaurant_process // does not include P_0's - double log_crp_prob(const double& discount, const double& concentration) const { + double log_crp_prob(const double& discount, const double& alpha) const { double lp = 0.0; if (has_discount_prior()) - lp = log_beta_density(discount, discount_prior_alpha_, discount_prior_beta_); - if (has_concentration_prior()) - lp += log_gamma_density(concentration, concentration_prior_shape_, concentration_prior_rate_); + lp = Md::log_beta_density(discount, discount_prior_alpha_, discount_prior_beta_); + if (has_alpha_prior()) + lp += Md::log_gamma_density(alpha, alpha_prior_shape_, alpha_prior_rate_); assert(lp <= 0.0); if (num_customers_) { if (discount > 0.0) { const double r = lgamma(1.0 - discount); - lp += lgamma(concentration) - lgamma(concentration + num_customers_) - + num_tables_ * log(discount) + lgamma(concentration / discount + num_tables_) - - lgamma(concentration / discount); + lp += lgamma(alpha) - lgamma(alpha + num_customers_) + + num_tables_ * log(discount) + lgamma(alpha / discount + num_tables_) + - lgamma(alpha / discount); assert(std::isfinite(lp)); for (typename std::tr1::unordered_map::const_iterator it = dish_locs_.begin(); it != dish_locs_.end(); ++it) { @@ -254,12 +239,12 @@ class CCRP { } void resample_hyperparameters(MT19937* rng, const unsigned nloop = 5, const unsigned niterations = 10) { - assert(has_discount_prior() || has_concentration_prior()); + assert(has_discount_prior() || has_alpha_prior()); DiscountResampler dr(*this); ConcentrationResampler cr(*this); for (int iter = 0; iter < nloop; ++iter) { - if (has_concentration_prior()) { - concentration_ = slice_sampler1d(cr, concentration_, *rng, 0.0, + if (has_alpha_prior()) { + alpha_ = slice_sampler1d(cr, alpha_, *rng, 0.0, std::numeric_limits::infinity(), 0.0, niterations, 100*niterations); } if (has_discount_prior()) { @@ -267,7 +252,7 @@ class CCRP { 1.0, 0.0, niterations, 100*niterations); } } - concentration_ = slice_sampler1d(cr, concentration_, *rng, 0.0, + alpha_ = slice_sampler1d(cr, alpha_, *rng, 0.0, std::numeric_limits::infinity(), 0.0, niterations, 100*niterations); } @@ -275,15 +260,15 @@ class CCRP { DiscountResampler(const CCRP& crp) : crp_(crp) {} const CCRP& crp_; double operator()(const double& proposed_discount) const { - return crp_.log_crp_prob(proposed_discount, crp_.concentration_); + return crp_.log_crp_prob(proposed_discount, crp_.alpha_); } }; struct ConcentrationResampler { ConcentrationResampler(const CCRP& crp) : crp_(crp) {} const CCRP& crp_; - double operator()(const double& proposed_concentration) const { - return crp_.log_crp_prob(crp_.discount_, proposed_concentration); + double operator()(const double& proposed_alpha) const { + return crp_.log_crp_prob(crp_.discount_, proposed_alpha); } }; @@ -295,7 +280,7 @@ class CCRP { }; void Print(std::ostream* out) const { - std::cerr << "PYP(d=" << discount_ << ",c=" << concentration_ << ") customers=" << num_customers_ << std::endl; + std::cerr << "PYP(d=" << discount_ << ",c=" << alpha_ << ") customers=" << num_customers_ << std::endl; for (typename std::tr1::unordered_map::const_iterator it = dish_locs_.begin(); it != dish_locs_.end(); ++it) { (*out) << it->first << " (" << it->second.total_dish_count_ << " on " << it->second.table_counts_.size() << " tables): "; @@ -320,15 +305,15 @@ class CCRP { std::tr1::unordered_map dish_locs_; double discount_; - double concentration_; + double alpha_; // optional beta prior on discount_ (NaN if no prior) double discount_prior_alpha_; double discount_prior_beta_; - // optional gamma prior on concentration_ (NaN if no prior) - double concentration_prior_shape_; - double concentration_prior_rate_; + // optional gamma prior on alpha_ (NaN if no prior) + double alpha_prior_shape_; + double alpha_prior_rate_; }; template diff --git a/utils/ccrp_nt.h b/utils/ccrp_nt.h index 63b6f4c2..79321493 100644 --- a/utils/ccrp_nt.h +++ b/utils/ccrp_nt.h @@ -18,20 +18,20 @@ class CCRP_NoTable { public: explicit CCRP_NoTable(double conc) : num_customers_(), - concentration_(conc), - concentration_prior_shape_(std::numeric_limits::quiet_NaN()), - concentration_prior_rate_(std::numeric_limits::quiet_NaN()) {} + alpha_(conc), + alpha_prior_shape_(std::numeric_limits::quiet_NaN()), + alpha_prior_rate_(std::numeric_limits::quiet_NaN()) {} CCRP_NoTable(double c_shape, double c_rate, double c = 10.0) : num_customers_(), - concentration_(c), - concentration_prior_shape_(c_shape), - concentration_prior_rate_(c_rate) {} + alpha_(c), + alpha_prior_shape_(c_shape), + alpha_prior_rate_(c_rate) {} - double concentration() const { return concentration_; } + double alpha() const { return alpha_; } - bool has_concentration_prior() const { - return !std::isnan(concentration_prior_shape_); + bool has_alpha_prior() const { + return !std::isnan(alpha_prior_shape_); } void clear() { @@ -73,16 +73,16 @@ class CCRP_NoTable { double prob(const Dish& dish, const double& p0) const { const unsigned at_table = num_customers(dish); - return (at_table + p0 * concentration_) / (num_customers_ + concentration_); + return (at_table + p0 * alpha_) / (num_customers_ + alpha_); } double logprob(const Dish& dish, const double& logp0) const { const unsigned at_table = num_customers(dish); - return log(at_table + exp(logp0 + log(concentration_))) - log(num_customers_ + concentration_); + return log(at_table + exp(logp0 + log(alpha_))) - log(num_customers_ + alpha_); } double log_crp_prob() const { - return log_crp_prob(concentration_); + return log_crp_prob(alpha_); } static double log_gamma_density(const double& x, const double& shape, const double& rate) { @@ -95,14 +95,14 @@ class CCRP_NoTable { // taken from http://en.wikipedia.org/wiki/Chinese_restaurant_process // does not include P_0's - double log_crp_prob(const double& concentration) const { + double log_crp_prob(const double& alpha) const { double lp = 0.0; - if (has_concentration_prior()) - lp += log_gamma_density(concentration, concentration_prior_shape_, concentration_prior_rate_); + if (has_alpha_prior()) + lp += log_gamma_density(alpha, alpha_prior_shape_, alpha_prior_rate_); assert(lp <= 0.0); if (num_customers_) { - lp += lgamma(concentration) - lgamma(concentration + num_customers_) + - custs_.size() * log(concentration); + lp += lgamma(alpha) - lgamma(alpha + num_customers_) + + custs_.size() * log(alpha); assert(std::isfinite(lp)); for (typename std::tr1::unordered_map::const_iterator it = custs_.begin(); it != custs_.end(); ++it) { @@ -114,10 +114,10 @@ class CCRP_NoTable { } void resample_hyperparameters(MT19937* rng, const unsigned nloop = 5, const unsigned niterations = 10) { - assert(has_concentration_prior()); + assert(has_alpha_prior()); ConcentrationResampler cr(*this); for (int iter = 0; iter < nloop; ++iter) { - concentration_ = slice_sampler1d(cr, concentration_, *rng, 0.0, + alpha_ = slice_sampler1d(cr, alpha_, *rng, 0.0, std::numeric_limits::infinity(), 0.0, niterations, 100*niterations); } } @@ -125,13 +125,13 @@ class CCRP_NoTable { struct ConcentrationResampler { ConcentrationResampler(const CCRP_NoTable& crp) : crp_(crp) {} const CCRP_NoTable& crp_; - double operator()(const double& proposed_concentration) const { - return crp_.log_crp_prob(proposed_concentration); + double operator()(const double& proposed_alpha) const { + return crp_.log_crp_prob(proposed_alpha); } }; void Print(std::ostream* out) const { - (*out) << "DP(alpha=" << concentration_ << ") customers=" << num_customers_ << std::endl; + (*out) << "DP(alpha=" << alpha_ << ") customers=" << num_customers_ << std::endl; int cc = 0; for (typename std::tr1::unordered_map::const_iterator it = custs_.begin(); it != custs_.end(); ++it) { @@ -153,11 +153,11 @@ class CCRP_NoTable { return custs_.end(); } - double concentration_; + double alpha_; - // optional gamma prior on concentration_ (NaN if no prior) - double concentration_prior_shape_; - double concentration_prior_rate_; + // optional gamma prior on alpha_ (NaN if no prior) + double alpha_prior_shape_; + double alpha_prior_rate_; }; template diff --git a/utils/ccrp_onetable.h b/utils/ccrp_onetable.h index b63737d1..1fe01b0e 100644 --- a/utils/ccrp_onetable.h +++ b/utils/ccrp_onetable.h @@ -21,33 +21,33 @@ class CCRP_OneTable { num_tables_(), num_customers_(), discount_(disc), - concentration_(conc), + alpha_(conc), discount_prior_alpha_(std::numeric_limits::quiet_NaN()), discount_prior_beta_(std::numeric_limits::quiet_NaN()), - concentration_prior_shape_(std::numeric_limits::quiet_NaN()), - concentration_prior_rate_(std::numeric_limits::quiet_NaN()) {} + alpha_prior_shape_(std::numeric_limits::quiet_NaN()), + alpha_prior_rate_(std::numeric_limits::quiet_NaN()) {} CCRP_OneTable(double d_alpha, double d_beta, double c_shape, double c_rate, double d = 0.9, double c = 1.0) : num_tables_(), num_customers_(), discount_(d), - concentration_(c), + alpha_(c), discount_prior_alpha_(d_alpha), discount_prior_beta_(d_beta), - concentration_prior_shape_(c_shape), - concentration_prior_rate_(c_rate) {} + alpha_prior_shape_(c_shape), + alpha_prior_rate_(c_rate) {} double discount() const { return discount_; } - double concentration() const { return concentration_; } - void set_concentration(double c) { concentration_ = c; } + double alpha() const { return alpha_; } + void set_alpha(double c) { alpha_ = c; } void set_discount(double d) { discount_ = d; } bool has_discount_prior() const { return !std::isnan(discount_prior_alpha_); } - bool has_concentration_prior() const { - return !std::isnan(concentration_prior_shape_); + bool has_alpha_prior() const { + return !std::isnan(alpha_prior_shape_); } void clear() { @@ -108,29 +108,29 @@ class CCRP_OneTable { double prob(const Dish& dish, const double& p0) const { const typename DishMapType::const_iterator it = dish_counts_.find(dish); - const double r = num_tables_ * discount_ + concentration_; + const double r = num_tables_ * discount_ + alpha_; if (it == dish_counts_.end()) { - return r * p0 / (num_customers_ + concentration_); + return r * p0 / (num_customers_ + alpha_); } else { return (it->second - discount_ + r * p0) / - (num_customers_ + concentration_); + (num_customers_ + alpha_); } } template T probT(const Dish& dish, const T& p0) const { const typename DishMapType::const_iterator it = dish_counts_.find(dish); - const T r(num_tables_ * discount_ + concentration_); + const T r(num_tables_ * discount_ + alpha_); if (it == dish_counts_.end()) { - return r * p0 / T(num_customers_ + concentration_); + return r * p0 / T(num_customers_ + alpha_); } else { return (T(it->second - discount_) + r * p0) / - T(num_customers_ + concentration_); + T(num_customers_ + alpha_); } } double log_crp_prob() const { - return log_crp_prob(discount_, concentration_); + return log_crp_prob(discount_, alpha_); } static double log_beta_density(const double& x, const double& alpha, const double& beta) { @@ -152,19 +152,19 @@ class CCRP_OneTable { // taken from http://en.wikipedia.org/wiki/Chinese_restaurant_process // does not include P_0's - double log_crp_prob(const double& discount, const double& concentration) const { + double log_crp_prob(const double& discount, const double& alpha) const { double lp = 0.0; if (has_discount_prior()) lp = log_beta_density(discount, discount_prior_alpha_, discount_prior_beta_); - if (has_concentration_prior()) - lp += log_gamma_density(concentration, concentration_prior_shape_, concentration_prior_rate_); + if (has_alpha_prior()) + lp += log_gamma_density(alpha, alpha_prior_shape_, alpha_prior_rate_); assert(lp <= 0.0); if (num_customers_) { if (discount > 0.0) { const double r = lgamma(1.0 - discount); - lp += lgamma(concentration) - lgamma(concentration + num_customers_) - + num_tables_ * log(discount) + lgamma(concentration / discount + num_tables_) - - lgamma(concentration / discount); + lp += lgamma(alpha) - lgamma(alpha + num_customers_) + + num_tables_ * log(discount) + lgamma(alpha / discount + num_tables_) + - lgamma(alpha / discount); assert(std::isfinite(lp)); for (typename DishMapType::const_iterator it = dish_counts_.begin(); it != dish_counts_.end(); ++it) { @@ -180,12 +180,12 @@ class CCRP_OneTable { } void resample_hyperparameters(MT19937* rng, const unsigned nloop = 5, const unsigned niterations = 10) { - assert(has_discount_prior() || has_concentration_prior()); + assert(has_discount_prior() || has_alpha_prior()); DiscountResampler dr(*this); ConcentrationResampler cr(*this); for (int iter = 0; iter < nloop; ++iter) { - if (has_concentration_prior()) { - concentration_ = slice_sampler1d(cr, concentration_, *rng, 0.0, + if (has_alpha_prior()) { + alpha_ = slice_sampler1d(cr, alpha_, *rng, 0.0, std::numeric_limits::infinity(), 0.0, niterations, 100*niterations); } if (has_discount_prior()) { @@ -193,7 +193,7 @@ class CCRP_OneTable { 1.0, 0.0, niterations, 100*niterations); } } - concentration_ = slice_sampler1d(cr, concentration_, *rng, 0.0, + alpha_ = slice_sampler1d(cr, alpha_, *rng, 0.0, std::numeric_limits::infinity(), 0.0, niterations, 100*niterations); } @@ -201,20 +201,20 @@ class CCRP_OneTable { DiscountResampler(const CCRP_OneTable& crp) : crp_(crp) {} const CCRP_OneTable& crp_; double operator()(const double& proposed_discount) const { - return crp_.log_crp_prob(proposed_discount, crp_.concentration_); + return crp_.log_crp_prob(proposed_discount, crp_.alpha_); } }; struct ConcentrationResampler { ConcentrationResampler(const CCRP_OneTable& crp) : crp_(crp) {} const CCRP_OneTable& crp_; - double operator()(const double& proposed_concentration) const { - return crp_.log_crp_prob(crp_.discount_, proposed_concentration); + double operator()(const double& proposed_alpha) const { + return crp_.log_crp_prob(crp_.discount_, proposed_alpha); } }; void Print(std::ostream* out) const { - (*out) << "PYP(d=" << discount_ << ",c=" << concentration_ << ") customers=" << num_customers_ << std::endl; + (*out) << "PYP(d=" << discount_ << ",c=" << alpha_ << ") customers=" << num_customers_ << std::endl; for (typename DishMapType::const_iterator it = dish_counts_.begin(); it != dish_counts_.end(); ++it) { (*out) << " " << it->first << " = " << it->second << std::endl; } @@ -233,15 +233,15 @@ class CCRP_OneTable { DishMapType dish_counts_; double discount_; - double concentration_; + double alpha_; // optional beta prior on discount_ (NaN if no prior) double discount_prior_alpha_; double discount_prior_beta_; - // optional gamma prior on concentration_ (NaN if no prior) - double concentration_prior_shape_; - double concentration_prior_rate_; + // optional gamma prior on alpha_ (NaN if no prior) + double alpha_prior_shape_; + double alpha_prior_rate_; }; template diff --git a/utils/mfcr.h b/utils/mfcr.h index 396d0205..df988f51 100644 --- a/utils/mfcr.h +++ b/utils/mfcr.h @@ -43,29 +43,29 @@ class MFCR { num_floors_(num_floors), num_tables_(), num_customers_(), - d_(d), + discount_(d), alpha_(alpha), - d_prior_alpha_(std::numeric_limits::quiet_NaN()), - d_prior_beta_(std::numeric_limits::quiet_NaN()), + discount_prior_alpha_(std::numeric_limits::quiet_NaN()), + discount_prior_beta_(std::numeric_limits::quiet_NaN()), alpha_prior_shape_(std::numeric_limits::quiet_NaN()), alpha_prior_rate_(std::numeric_limits::quiet_NaN()) {} - MFCR(unsigned num_floors, double d_alpha, double d_beta, double alpha_shape, double alpha_rate, double d = 0.9, double alpha = 10.0) : + MFCR(unsigned num_floors, double discount_alpha, double discount_beta, double alpha_shape, double alpha_rate, double d = 0.9, double alpha = 10.0) : num_floors_(num_floors), num_tables_(), num_customers_(), - d_(d), + discount_(d), alpha_(alpha), - d_prior_alpha_(d_alpha), - d_prior_beta_(d_beta), + discount_prior_alpha_(discount_alpha), + discount_prior_beta_(discount_beta), alpha_prior_shape_(alpha_shape), alpha_prior_rate_(alpha_rate) {} - double d() const { return d_; } + double discount() const { return discount_; } double alpha() const { return alpha_; } - bool has_d_prior() const { - return !std::isnan(d_prior_alpha_); + bool has_discount_prior() const { + return !std::isnan(discount_prior_alpha_); } bool has_alpha_prior() const { @@ -122,15 +122,15 @@ class MFCR { int floor = -1; bool share_table = false; if (loc.total_dish_count_) { - const double p_empty = (alpha_ + num_tables_ * d_) * marg_p0; - const double p_share = (loc.total_dish_count_ - loc.table_counts_.size() * d_); + const double p_empty = (alpha_ + num_tables_ * discount_) * marg_p0; + const double p_share = (loc.total_dish_count_ - loc.table_counts_.size() * discount_); share_table = rng->SelectSample(p_empty, p_share); } if (share_table) { - double r = rng->next() * (loc.total_dish_count_ - loc.table_counts_.size() * d_); + double r = rng->next() * (loc.total_dish_count_ - loc.table_counts_.size() * discount_); for (typename std::list::iterator ti = loc.table_counts_.begin(); ti != loc.table_counts_.end(); ++ti) { - r -= ti->count - d_; + r -= ti->count - discount_; if (r <= 0.0) { ++ti->count; floor = ti->floor; @@ -206,25 +206,25 @@ class MFCR { const double marg_p0 = std::inner_product(p0s.begin(), p0s.end(), lambdas.begin(), 0.0); assert(marg_p0 <= 1.0); const typename std::tr1::unordered_map::const_iterator it = dish_locs_.find(dish); - const double r = num_tables_ * d_ + alpha_; + const double r = num_tables_ * discount_ + alpha_; if (it == dish_locs_.end()) { return r * marg_p0 / (num_customers_ + alpha_); } else { - return (it->second.total_dish_count_ - d_ * it->second.table_counts_.size() + r * marg_p0) / + return (it->second.total_dish_count_ - discount_ * it->second.table_counts_.size() + r * marg_p0) / (num_customers_ + alpha_); } } double log_crp_prob() const { - return log_crp_prob(d_, alpha_); + return log_crp_prob(discount_, alpha_); } // taken from http://en.wikipedia.org/wiki/Chinese_restaurant_process // does not include draws from G_w's double log_crp_prob(const double& d, const double& alpha) const { double lp = 0.0; - if (has_d_prior()) - lp = Md::log_beta_density(d, d_prior_alpha_, d_prior_beta_); + if (has_discount_prior()) + lp = Md::log_beta_density(d, discount_prior_alpha_, discount_prior_beta_); if (has_alpha_prior()) lp += Md::log_gamma_density(alpha, alpha_prior_shape_, alpha_prior_rate_); assert(lp <= 0.0); @@ -251,7 +251,7 @@ class MFCR { } void resample_hyperparameters(MT19937* rng, const unsigned nloop = 5, const unsigned niterations = 10) { - assert(has_d_prior() || has_alpha_prior()); + assert(has_discount_prior() || has_alpha_prior()); DiscountResampler dr(*this); ConcentrationResampler cr(*this); for (int iter = 0; iter < nloop; ++iter) { @@ -259,8 +259,8 @@ class MFCR { alpha_ = slice_sampler1d(cr, alpha_, *rng, 0.0, std::numeric_limits::infinity(), 0.0, niterations, 100*niterations); } - if (has_d_prior()) { - d_ = slice_sampler1d(dr, d_, *rng, std::numeric_limits::min(), + if (has_discount_prior()) { + discount_ = slice_sampler1d(dr, discount_, *rng, std::numeric_limits::min(), 1.0, 0.0, niterations, 100*niterations); } } @@ -279,8 +279,8 @@ class MFCR { struct ConcentrationResampler { ConcentrationResampler(const MFCR& crp) : crp_(crp) {} const MFCR& crp_; - double operator()(const double& proposed_alpha) const { - return crp_.log_crp_prob(crp_.d_, proposed_alpha); + double operator()(const double& proposediscount_alpha) const { + return crp_.log_crp_prob(crp_.discount_, proposediscount_alpha); } }; @@ -292,7 +292,7 @@ class MFCR { }; void Print(std::ostream* out) const { - (*out) << "MFCR(d=" << d_ << ",alpha=" << alpha_ << ") customers=" << num_customers_ << std::endl; + (*out) << "MFCR(d=" << discount_ << ",alpha=" << alpha_ << ") customers=" << num_customers_ << std::endl; for (typename std::tr1::unordered_map::const_iterator it = dish_locs_.begin(); it != dish_locs_.end(); ++it) { (*out) << it->first << " (" << it->second.total_dish_count_ << " on " << it->second.table_counts_.size() << " tables): "; @@ -317,12 +317,12 @@ class MFCR { unsigned num_customers_; std::tr1::unordered_map dish_locs_; - double d_; + double discount_; double alpha_; - // optional beta prior on d_ (NaN if no prior) - double d_prior_alpha_; - double d_prior_beta_; + // optional beta prior on discount_ (NaN if no prior) + double discount_prior_alpha_; + double discount_prior_beta_; // optional gamma prior on alpha_ (NaN if no prior) double alpha_prior_shape_; -- cgit v1.2.3 From 3c918889d86fe1deaa5d26162bf85865f1aa33bd Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Sun, 4 Mar 2012 14:33:11 -0500 Subject: clean up pyp lm code --- gi/pf/pyp_lm.cc | 85 ++++++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 60 insertions(+), 25 deletions(-) (limited to 'gi/pf') diff --git a/gi/pf/pyp_lm.cc b/gi/pf/pyp_lm.cc index 0d85536c..88dfcc7c 100644 --- a/gi/pf/pyp_lm.cc +++ b/gi/pf/pyp_lm.cc @@ -11,7 +11,14 @@ #include "tdict.h" #include "sampler.h" #include "ccrp.h" -#include "ccrp_onetable.h" + +// A not very memory-efficient implementation of an N-gram LM based on PYPs +// as described in Y.-W. Teh. (2006) A Hierarchical Bayesian Language Model +// based on Pitman-Yor Processes. In Proc. ACL. + +// I use templates to handle the recursive formalation of the prior, so +// the order of the model has to be specified here, at compile time: +#define kORDER 3 using namespace std; using namespace tr1; @@ -22,8 +29,13 @@ shared_ptr prng; void InitCommandLine(int argc, char** argv, po::variables_map* conf) { po::options_description opts("Configuration options"); opts.add_options() - ("samples,s",po::value()->default_value(1000),"Number of samples") - ("input,i",po::value(),"Read data from") + ("samples,s",po::value()->default_value(300),"Number of samples") + ("train,i",po::value(),"Training data file") + ("test,T",po::value(),"Test data file") + ("discount_prior_a,a",po::value()->default_value(1.0), "discount ~ Beta(a,b): a=this") + ("discount_prior_b,b",po::value()->default_value(1.0), "discount ~ Beta(a,b): b=this") + ("strength_prior_s,s",po::value()->default_value(1.0), "strength ~ Gamma(s,r): s=this") + ("strength_prior_r,r",po::value()->default_value(1.0), "strength ~ Gamma(s,r): r=this") ("random_seed,S",po::value(), "Random seed"); po::options_description clo("Command line options"); clo.add_options() @@ -40,7 +52,7 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) { } po::notify(*conf); - if (conf->count("help") || (conf->count("input") == 0)) { + if (conf->count("help") || (conf->count("train") == 0)) { cerr << dcmdline_options << endl; exit(1); } @@ -48,13 +60,13 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) { template struct PYPLM; -// uniform base distribution +// uniform base distribution (0-gram model) template<> struct PYPLM<0> { - PYPLM(unsigned vs) : p0(1.0 / vs), draws() {} - void increment(WordID w, const vector& context, MT19937* rng) { ++draws; } - void decrement(WordID w, const vector& context, MT19937* rng) { --draws; assert(draws >= 0); } - double prob(WordID w, const vector& context) const { return p0; } - void resample_hyperparameters(MT19937* rng, const unsigned nloop, const unsigned niterations) {} + PYPLM(unsigned vs, double, double, double, double) : p0(1.0 / vs), draws() {} + void increment(WordID, const vector&, MT19937*) { ++draws; } + void decrement(WordID, const vector&, MT19937*) { --draws; assert(draws >= 0); } + double prob(WordID, const vector&) const { return p0; } + void resample_hyperparameters(MT19937*, const unsigned, const unsigned) {} double log_likelihood() const { return draws * log(p0); } const double p0; int draws; @@ -62,10 +74,13 @@ template<> struct PYPLM<0> { // represents an N-gram LM template struct PYPLM { - PYPLM(unsigned vs) : backoff(vs), d(0.8), alpha(1.0) {} + PYPLM(unsigned vs, double da, double db, double ss, double sr) : + backoff(vs, da, db, ss, sr), + discount_a(da), discount_b(db), + strength_s(ss), strength_r(sr), + d(0.8), alpha(1.0), lookup(N-1) {} void increment(WordID w, const vector& context, MT19937* rng) { const double bo = backoff.prob(w, context); - static vector lookup(N-1); for (unsigned i = 0; i < N-1; ++i) lookup[i] = context[context.size() - 1 - i]; typename unordered_map, CCRP, boost::hash > >::iterator it = p.find(lookup); @@ -75,7 +90,6 @@ template struct PYPLM { backoff.increment(w, context, rng); } void decrement(WordID w, const vector& context, MT19937* rng) { - static vector lookup(N-1); for (unsigned i = 0; i < N-1; ++i) lookup[i] = context[context.size() - 1 - i]; typename unordered_map, CCRP, boost::hash > >::iterator it = p.find(lookup); @@ -85,7 +99,6 @@ template struct PYPLM { } double prob(WordID w, const vector& context) const { const double bo = backoff.prob(w, context); - static vector lookup(N-1); for (unsigned i = 0; i < N-1; ++i) lookup[i] = context[context.size() - 1 - i]; typename unordered_map, CCRP, boost::hash > >::const_iterator it = p.find(lookup); @@ -99,7 +112,9 @@ template struct PYPLM { double log_likelihood(const double& dd, const double& aa) const { if (aa <= -dd) return -std::numeric_limits::infinity(); - double llh = Md::log_beta_density(dd, 1, 1) + Md::log_gamma_density(aa, 1, 1); + //double llh = Md::log_beta_density(dd, 10, 3) + Md::log_gamma_density(aa, 1, 1); + double llh = Md::log_beta_density(dd, discount_a, discount_b) + + Md::log_gamma_density(aa, strength_s, strength_r); typename unordered_map, CCRP, boost::hash > >::const_iterator it; for (it = p.begin(); it != p.end(); ++it) llh += it->second.log_crp_prob(dd, aa); @@ -143,7 +158,9 @@ template struct PYPLM { } PYPLM backoff; + double discount_a, discount_b, strength_s, strength_r; double d, alpha; + mutable vector lookup; // thread-local unordered_map, CCRP, boost::hash > > p; }; @@ -161,14 +178,21 @@ int main(int argc, char** argv) { set vocabe; const WordID kEOS = TD::Convert(""); cerr << "Reading corpus...\n"; - CorpusTools::ReadFromFile(conf["input"].as(), &corpuse, &vocabe); + CorpusTools::ReadFromFile(conf["train"].as(), &corpuse, &vocabe); cerr << "E-corpus size: " << corpuse.size() << " sentences\t (" << vocabe.size() << " word types)\n"; -#define kORDER 3 - PYPLM lm(vocabe.size()); + vector > test; + if (conf.count("test")) + CorpusTools::ReadFromFile(conf["test"].as(), &test); + else + test = corpuse; + PYPLM lm(vocabe.size(), + conf["discount_prior_a"].as(), + conf["discount_prior_b"].as(), + conf["strength_prior_s"].as(), + conf["strength_prior_r"].as()); vector ctx(kORDER - 1, TD::Convert("")); - int mci = corpuse.size() * 99 / 100; for (int SS=0; SS < samples; ++SS) { - for (int ci = 0; ci < mci; ++ci) { + for (int ci = 0; ci < corpuse.size(); ++ci) { ctx.resize(kORDER - 1); const vector& s = corpuse[ci]; for (int i = 0; i <= s.size(); ++i) { @@ -187,22 +211,33 @@ int main(int argc, char** argv) { } double llh = 0; unsigned cnt = 0; - for (int ci = mci; ci < corpuse.size(); ++ci) { + unsigned oovs = 0; + for (int ci = 0; ci < test.size(); ++ci) { ctx.resize(kORDER - 1); - const vector& s = corpuse[ci]; + const vector& s = test[ci]; for (int i = 0; i <= s.size(); ++i) { WordID w = (i < s.size() ? s[i] : kEOS); double lp = log(lm.prob(w, ctx)) / log(2); - cerr << "p(" << TD::Convert(w) << " | " << TD::GetString(ctx) << ") = " << lp << endl; + if (i < s.size() && vocabe.count(w) == 0) { + cerr << "**OOV "; + ++oovs; + lp = 0; + } + cerr << "p(" << TD::Convert(w) << " |"; + for (int j = ctx.size() + 1 - kORDER; j < ctx.size(); ++j) + cerr << ' ' << TD::Convert(ctx[j]); + cerr << ") = " << lp << endl; ctx.push_back(w); llh -= lp; cnt++; } } - cerr << " Log_10 prob: " << (llh * log(2) / log(10)) << endl; - cerr << " Count: " << (cnt) << endl; + cerr << " Log_10 prob: " << (-llh * log(2) / log(10)) << endl; + cerr << " Count: " << cnt << endl; + cerr << " OOVs: " << oovs << endl; cerr << "Cross-entropy: " << (llh / cnt) << endl; cerr << " Perplexity: " << pow(2, llh / cnt) << endl; return 0; } + -- cgit v1.2.3 From 1fce37b69630269b93cfeca237675f7b2fc66ca4 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Sun, 4 Mar 2012 23:26:17 +0000 Subject: fix parameter name clash --- gi/pf/pyp_lm.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'gi/pf') diff --git a/gi/pf/pyp_lm.cc b/gi/pf/pyp_lm.cc index 88dfcc7c..e5c44c8b 100644 --- a/gi/pf/pyp_lm.cc +++ b/gi/pf/pyp_lm.cc @@ -29,7 +29,7 @@ shared_ptr prng; void InitCommandLine(int argc, char** argv, po::variables_map* conf) { po::options_description opts("Configuration options"); opts.add_options() - ("samples,s",po::value()->default_value(300),"Number of samples") + ("samples,n",po::value()->default_value(300),"Number of samples") ("train,i",po::value(),"Training data file") ("test,T",po::value(),"Test data file") ("discount_prior_a,a",po::value()->default_value(1.0), "discount ~ Beta(a,b): a=this") -- cgit v1.2.3 From ce58cb44771a5194b71682d1602abe2fef9e6f13 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Mon, 5 Mar 2012 14:51:04 -0500 Subject: support strength=0 PYPs, final notation clean-up --- gi/pf/align-lexonly-pyp.cc | 2 +- gi/pf/conditional_pseg.h | 2 +- gi/pf/learn_cfg.cc | 4 +- gi/pf/pyp_lm.cc | 22 ++++----- phrasinator/gibbs_train_plm.cc | 2 +- utils/ccrp.h | 106 ++++++++++++++++++++++------------------- utils/mfcr.h | 105 ++++++++++++++++++++++------------------ 7 files changed, 131 insertions(+), 112 deletions(-) (limited to 'gi/pf') diff --git a/gi/pf/align-lexonly-pyp.cc b/gi/pf/align-lexonly-pyp.cc index 4ce7cf62..87f7f6b5 100644 --- a/gi/pf/align-lexonly-pyp.cc +++ b/gi/pf/align-lexonly-pyp.cc @@ -104,7 +104,7 @@ struct HierarchicalWordBase { } void Summary() const { - cerr << "NUMBER OF CUSTOMERS: " << r.num_customers() << " (d=" << r.discount() << ",\\alpha=" << r.alpha() << ')' << endl; + cerr << "NUMBER OF CUSTOMERS: " << r.num_customers() << " (d=" << r.discount() << ",s=" << r.strength() << ')' << endl; for (MFCR >::const_iterator it = r.begin(); it != r.end(); ++it) cerr << " " << it->second.total_dish_count_ << " (on " << it->second.table_counts_.size() << " tables)" << TD::GetString(it->first) << endl; } diff --git a/gi/pf/conditional_pseg.h b/gi/pf/conditional_pseg.h index f9841cbf..86403d8d 100644 --- a/gi/pf/conditional_pseg.h +++ b/gi/pf/conditional_pseg.h @@ -22,7 +22,7 @@ struct MConditionalTranslationModel { void Summary() const { std::cerr << "Number of conditioning contexts: " << r.size() << std::endl; for (RuleModelHash::const_iterator it = r.begin(); it != r.end(); ++it) { - std::cerr << TD::GetString(it->first) << " \t(d=" << it->second.discount() << ",\\alpha = " << it->second.alpha() << ") --------------------------" << std::endl; + std::cerr << TD::GetString(it->first) << " \t(d=" << it->second.discount() << ",s=" << it->second.strength() << ") --------------------------" << std::endl; for (MFCR::const_iterator i2 = it->second.begin(); i2 != it->second.end(); ++i2) std::cerr << " " << -1 << '\t' << i2->first << std::endl; } diff --git a/gi/pf/learn_cfg.cc b/gi/pf/learn_cfg.cc index 5b748311..bf157828 100644 --- a/gi/pf/learn_cfg.cc +++ b/gi/pf/learn_cfg.cc @@ -183,9 +183,9 @@ struct HieroLMModel { nts[i].resample_hyperparameters(rng); if (kHIERARCHICAL_PRIOR) { q0.resample_hyperparameters(rng); - cerr << "[base d=" << q0.discount() << ", alpha=" << q0.alpha() << "]"; + cerr << "[base d=" << q0.discount() << ", s=" << q0.strength() << "]"; } - cerr << " d=" << nts[0].discount() << ", alpha=" << nts[0].alpha() << endl; + cerr << " d=" << nts[0].discount() << ", s=" << nts[0].strength() << endl; } const BaseRuleModel base; diff --git a/gi/pf/pyp_lm.cc b/gi/pf/pyp_lm.cc index e5c44c8b..7ebada13 100644 --- a/gi/pf/pyp_lm.cc +++ b/gi/pf/pyp_lm.cc @@ -78,14 +78,14 @@ template struct PYPLM { backoff(vs, da, db, ss, sr), discount_a(da), discount_b(db), strength_s(ss), strength_r(sr), - d(0.8), alpha(1.0), lookup(N-1) {} + d(0.8), strength(1.0), lookup(N-1) {} void increment(WordID w, const vector& context, MT19937* rng) { const double bo = backoff.prob(w, context); for (unsigned i = 0; i < N-1; ++i) lookup[i] = context[context.size() - 1 - i]; typename unordered_map, CCRP, boost::hash > >::iterator it = p.find(lookup); if (it == p.end()) - it = p.insert(make_pair(lookup, CCRP(d,alpha))).first; + it = p.insert(make_pair(lookup, CCRP(d,strength))).first; if (it->second.increment(w, bo, rng)) backoff.increment(w, context, rng); } @@ -107,7 +107,7 @@ template struct PYPLM { } double log_likelihood() const { - return log_likelihood(d, alpha) + backoff.log_likelihood(); + return log_likelihood(d, strength) + backoff.log_likelihood(); } double log_likelihood(const double& dd, const double& aa) const { @@ -125,15 +125,15 @@ template struct PYPLM { DiscountResampler(const PYPLM& m) : m_(m) {} const PYPLM& m_; double operator()(const double& proposed_discount) const { - return m_.log_likelihood(proposed_discount, m_.alpha); + return m_.log_likelihood(proposed_discount, m_.strength); } }; struct AlphaResampler { AlphaResampler(const PYPLM& m) : m_(m) {} const PYPLM& m_; - double operator()(const double& proposed_alpha) const { - return m_.log_likelihood(m_.d, proposed_alpha); + double operator()(const double& proposed_strength) const { + return m_.log_likelihood(m_.d, proposed_strength); } }; @@ -141,25 +141,25 @@ template struct PYPLM { DiscountResampler dr(*this); AlphaResampler ar(*this); for (int iter = 0; iter < nloop; ++iter) { - alpha = slice_sampler1d(ar, alpha, *rng, 0.0, + strength = slice_sampler1d(ar, strength, *rng, 0.0, std::numeric_limits::infinity(), 0.0, niterations, 100*niterations); d = slice_sampler1d(dr, d, *rng, std::numeric_limits::min(), 1.0, 0.0, niterations, 100*niterations); } - alpha = slice_sampler1d(ar, alpha, *rng, 0.0, + strength = slice_sampler1d(ar, strength, *rng, 0.0, std::numeric_limits::infinity(), 0.0, niterations, 100*niterations); typename unordered_map, CCRP, boost::hash > >::iterator it; - cerr << "PYPLM<" << N << ">(d=" << d << ",a=" << alpha << ") = " << log_likelihood(d, alpha) << endl; + cerr << "PYPLM<" << N << ">(d=" << d << ",a=" << strength << ") = " << log_likelihood(d, strength) << endl; for (it = p.begin(); it != p.end(); ++it) { it->second.set_discount(d); - it->second.set_alpha(alpha); + it->second.set_strength(strength); } backoff.resample_hyperparameters(rng, nloop, niterations); } PYPLM backoff; double discount_a, discount_b, strength_s, strength_r; - double d, alpha; + double d, strength; mutable vector lookup; // thread-local unordered_map, CCRP, boost::hash > > p; }; diff --git a/phrasinator/gibbs_train_plm.cc b/phrasinator/gibbs_train_plm.cc index 54861dcb..3b99e1b6 100644 --- a/phrasinator/gibbs_train_plm.cc +++ b/phrasinator/gibbs_train_plm.cc @@ -252,7 +252,7 @@ struct UniphraseLM { void ResampleHyperparameters(MT19937* rng) { phrases_.resample_hyperparameters(rng); gen_.resample_hyperparameters(rng); - cerr << " d=" << phrases_.discount() << ",a=" << phrases_.alpha(); + cerr << " d=" << phrases_.discount() << ",s=" << phrases_.strength(); } CCRP > phrases_; diff --git a/utils/ccrp.h b/utils/ccrp.h index c883c027..5f9db7a6 100644 --- a/utils/ccrp.h +++ b/utils/ccrp.h @@ -18,27 +18,27 @@ template > class CCRP { public: - CCRP(double disc, double alpha) : + CCRP(double disc, double strength) : num_tables_(), num_customers_(), discount_(disc), - alpha_(alpha), - discount_prior_alpha_(std::numeric_limits::quiet_NaN()), + strength_(strength), + discount_prior_strength_(std::numeric_limits::quiet_NaN()), discount_prior_beta_(std::numeric_limits::quiet_NaN()), - alpha_prior_shape_(std::numeric_limits::quiet_NaN()), - alpha_prior_rate_(std::numeric_limits::quiet_NaN()) { + strength_prior_shape_(std::numeric_limits::quiet_NaN()), + strength_prior_rate_(std::numeric_limits::quiet_NaN()) { check_hyperparameters(); } - CCRP(double d_alpha, double d_beta, double c_shape, double c_rate, double d = 0.9, double c = 1.0) : + CCRP(double d_strength, double d_beta, double c_shape, double c_rate, double d = 0.9, double c = 1.0) : num_tables_(), num_customers_(), discount_(d), - alpha_(c), - discount_prior_alpha_(d_alpha), + strength_(c), + discount_prior_strength_(d_strength), discount_prior_beta_(d_beta), - alpha_prior_shape_(c_shape), - alpha_prior_rate_(c_rate) { + strength_prior_shape_(c_shape), + strength_prior_rate_(c_rate) { check_hyperparameters(); } @@ -47,23 +47,23 @@ class CCRP { std::cerr << "Bad discount: " << discount_ << std::endl; abort(); } - if (alpha_ <= -discount_) { - std::cerr << "Bad strength: " << alpha_ << " (discount=" << discount_ << ")" << std::endl; + if (strength_ <= -discount_) { + std::cerr << "Bad strength: " << strength_ << " (discount=" << discount_ << ")" << std::endl; abort(); } } double discount() const { return discount_; } - double alpha() const { return alpha_; } + double strength() const { return strength_; } void set_discount(double d) { discount_ = d; check_hyperparameters(); } - void set_alpha(double a) { alpha_ = a; check_hyperparameters(); } + void set_strength(double a) { strength_ = a; check_hyperparameters(); } bool has_discount_prior() const { - return !std::isnan(discount_prior_alpha_); + return !std::isnan(discount_prior_strength_); } - bool has_alpha_prior() const { - return !std::isnan(alpha_prior_shape_); + bool has_strength_prior() const { + return !std::isnan(strength_prior_shape_); } void clear() { @@ -97,7 +97,7 @@ class CCRP { DishLocations& loc = dish_locs_[dish]; bool share_table = false; if (loc.total_dish_count_) { - const double p_empty = (alpha_ + num_tables_ * discount_) * p0; + const double p_empty = (strength_ + num_tables_ * discount_) * p0; const double p_share = (loc.total_dish_count_ - loc.table_counts_.size() * discount_); share_table = rng->SelectSample(p_empty, p_share); } @@ -131,7 +131,7 @@ class CCRP { DishLocations& loc = dish_locs_[dish]; bool share_table = false; if (loc.total_dish_count_) { - const T p_empty = T(alpha_ + num_tables_ * discount_) * p0; + const T p_empty = T(strength_ + num_tables_ * discount_) * p0; const T p_share = T(loc.total_dish_count_ - loc.table_counts_.size() * discount_); share_table = rng->SelectSample(p_empty, p_share); } @@ -198,47 +198,47 @@ class CCRP { double prob(const Dish& dish, const double& p0) const { const typename std::tr1::unordered_map::const_iterator it = dish_locs_.find(dish); - const double r = num_tables_ * discount_ + alpha_; + const double r = num_tables_ * discount_ + strength_; if (it == dish_locs_.end()) { - return r * p0 / (num_customers_ + alpha_); + return r * p0 / (num_customers_ + strength_); } else { return (it->second.total_dish_count_ - discount_ * it->second.table_counts_.size() + r * p0) / - (num_customers_ + alpha_); + (num_customers_ + strength_); } } template T probT(const Dish& dish, const T& p0) const { const typename std::tr1::unordered_map::const_iterator it = dish_locs_.find(dish); - const T r = T(num_tables_ * discount_ + alpha_); + const T r = T(num_tables_ * discount_ + strength_); if (it == dish_locs_.end()) { - return r * p0 / T(num_customers_ + alpha_); + return r * p0 / T(num_customers_ + strength_); } else { return (T(it->second.total_dish_count_ - discount_ * it->second.table_counts_.size()) + r * p0) / - T(num_customers_ + alpha_); + T(num_customers_ + strength_); } } double log_crp_prob() const { - return log_crp_prob(discount_, alpha_); + return log_crp_prob(discount_, strength_); } // taken from http://en.wikipedia.org/wiki/Chinese_restaurant_process // does not include P_0's - double log_crp_prob(const double& discount, const double& alpha) const { + double log_crp_prob(const double& discount, const double& strength) const { double lp = 0.0; if (has_discount_prior()) - lp = Md::log_beta_density(discount, discount_prior_alpha_, discount_prior_beta_); - if (has_alpha_prior()) - lp += Md::log_gamma_density(alpha + discount, alpha_prior_shape_, alpha_prior_rate_); + lp = Md::log_beta_density(discount, discount_prior_strength_, discount_prior_beta_); + if (has_strength_prior()) + lp += Md::log_gamma_density(strength + discount, strength_prior_shape_, strength_prior_rate_); assert(lp <= 0.0); if (num_customers_) { if (discount > 0.0) { const double r = lgamma(1.0 - discount); - if (alpha) - lp += lgamma(alpha) - lgamma(alpha / discount); - lp += - lgamma(alpha + num_customers_) - + num_tables_ * log(discount) + lgamma(alpha / discount + num_tables_); + if (strength) + lp += lgamma(strength) - lgamma(strength / discount); + lp += - lgamma(strength + num_customers_) + + num_tables_ * log(discount) + lgamma(strength / discount + num_tables_); assert(std::isfinite(lp)); for (typename std::tr1::unordered_map::const_iterator it = dish_locs_.begin(); it != dish_locs_.end(); ++it) { @@ -247,8 +247,16 @@ class CCRP { lp += lgamma(*ti - discount) - r; } } + } else if (!discount) { // discount == 0.0 + lp += lgamma(strength) + num_tables_ * log(strength) - lgamma(strength + num_tables_); + assert(std::isfinite(lp)); + for (typename std::tr1::unordered_map::const_iterator it = dish_locs_.begin(); + it != dish_locs_.end(); ++it) { + const DishLocations& cur = it->second; + lp += lgamma(cur.table_counts_.size()); + } } else { - assert(!"not implemented yet"); + assert(!"discount less than 0 detected!"); } } assert(std::isfinite(lp)); @@ -256,22 +264,22 @@ class CCRP { } void resample_hyperparameters(MT19937* rng, const unsigned nloop = 5, const unsigned niterations = 10) { - assert(has_discount_prior() || has_alpha_prior()); + assert(has_discount_prior() || has_strength_prior()); DiscountResampler dr(*this); StrengthResampler sr(*this); for (int iter = 0; iter < nloop; ++iter) { - if (has_alpha_prior()) { - alpha_ = slice_sampler1d(sr, alpha_, *rng, -discount_, + if (has_strength_prior()) { + strength_ = slice_sampler1d(sr, strength_, *rng, -discount_, std::numeric_limits::infinity(), 0.0, niterations, 100*niterations); } if (has_discount_prior()) { double min_discount = std::numeric_limits::min(); - if (alpha_ < 0.0) min_discount = -alpha_; + if (strength_ < 0.0) min_discount = -strength_; discount_ = slice_sampler1d(dr, discount_, *rng, min_discount, 1.0, 0.0, niterations, 100*niterations); } } - alpha_ = slice_sampler1d(sr, alpha_, *rng, -discount_, + strength_ = slice_sampler1d(sr, strength_, *rng, -discount_, std::numeric_limits::infinity(), 0.0, niterations, 100*niterations); } @@ -279,15 +287,15 @@ class CCRP { DiscountResampler(const CCRP& crp) : crp_(crp) {} const CCRP& crp_; double operator()(const double& proposed_discount) const { - return crp_.log_crp_prob(proposed_discount, crp_.alpha_); + return crp_.log_crp_prob(proposed_discount, crp_.strength_); } }; struct StrengthResampler { StrengthResampler(const CCRP& crp) : crp_(crp) {} const CCRP& crp_; - double operator()(const double& proposed_alpha) const { - return crp_.log_crp_prob(crp_.discount_, proposed_alpha); + double operator()(const double& proposed_strength) const { + return crp_.log_crp_prob(crp_.discount_, proposed_strength); } }; @@ -299,7 +307,7 @@ class CCRP { }; void Print(std::ostream* out) const { - std::cerr << "PYP(d=" << discount_ << ",c=" << alpha_ << ") customers=" << num_customers_ << std::endl; + std::cerr << "PYP(d=" << discount_ << ",c=" << strength_ << ") customers=" << num_customers_ << std::endl; for (typename std::tr1::unordered_map::const_iterator it = dish_locs_.begin(); it != dish_locs_.end(); ++it) { (*out) << it->first << " (" << it->second.total_dish_count_ << " on " << it->second.table_counts_.size() << " tables): "; @@ -324,15 +332,15 @@ class CCRP { std::tr1::unordered_map dish_locs_; double discount_; - double alpha_; + double strength_; // optional beta prior on discount_ (NaN if no prior) - double discount_prior_alpha_; + double discount_prior_strength_; double discount_prior_beta_; - // optional gamma prior on alpha_ (NaN if no prior) - double alpha_prior_shape_; - double alpha_prior_rate_; + // optional gamma prior on strength_ (NaN if no prior) + double strength_prior_shape_; + double strength_prior_rate_; }; template diff --git a/utils/mfcr.h b/utils/mfcr.h index df988f51..aeaf599d 100644 --- a/utils/mfcr.h +++ b/utils/mfcr.h @@ -39,37 +39,37 @@ template > class MFCR { public: - MFCR(unsigned num_floors, double d, double alpha) : + MFCR(unsigned num_floors, double d, double strength) : num_floors_(num_floors), num_tables_(), num_customers_(), discount_(d), - alpha_(alpha), - discount_prior_alpha_(std::numeric_limits::quiet_NaN()), + strength_(strength), + discount_prior_strength_(std::numeric_limits::quiet_NaN()), discount_prior_beta_(std::numeric_limits::quiet_NaN()), - alpha_prior_shape_(std::numeric_limits::quiet_NaN()), - alpha_prior_rate_(std::numeric_limits::quiet_NaN()) {} + strength_prior_shape_(std::numeric_limits::quiet_NaN()), + strength_prior_rate_(std::numeric_limits::quiet_NaN()) {} - MFCR(unsigned num_floors, double discount_alpha, double discount_beta, double alpha_shape, double alpha_rate, double d = 0.9, double alpha = 10.0) : + MFCR(unsigned num_floors, double discount_strength, double discount_beta, double strength_shape, double strength_rate, double d = 0.9, double strength = 10.0) : num_floors_(num_floors), num_tables_(), num_customers_(), discount_(d), - alpha_(alpha), - discount_prior_alpha_(discount_alpha), + strength_(strength), + discount_prior_strength_(discount_strength), discount_prior_beta_(discount_beta), - alpha_prior_shape_(alpha_shape), - alpha_prior_rate_(alpha_rate) {} + strength_prior_shape_(strength_shape), + strength_prior_rate_(strength_rate) {} double discount() const { return discount_; } - double alpha() const { return alpha_; } + double strength() const { return strength_; } bool has_discount_prior() const { - return !std::isnan(discount_prior_alpha_); + return !std::isnan(discount_prior_strength_); } - bool has_alpha_prior() const { - return !std::isnan(alpha_prior_shape_); + bool has_strength_prior() const { + return !std::isnan(strength_prior_shape_); } void clear() { @@ -122,7 +122,7 @@ class MFCR { int floor = -1; bool share_table = false; if (loc.total_dish_count_) { - const double p_empty = (alpha_ + num_tables_ * discount_) * marg_p0; + const double p_empty = (strength_ + num_tables_ * discount_) * marg_p0; const double p_share = (loc.total_dish_count_ - loc.table_counts_.size() * discount_); share_table = rng->SelectSample(p_empty, p_share); } @@ -206,44 +206,53 @@ class MFCR { const double marg_p0 = std::inner_product(p0s.begin(), p0s.end(), lambdas.begin(), 0.0); assert(marg_p0 <= 1.0); const typename std::tr1::unordered_map::const_iterator it = dish_locs_.find(dish); - const double r = num_tables_ * discount_ + alpha_; + const double r = num_tables_ * discount_ + strength_; if (it == dish_locs_.end()) { - return r * marg_p0 / (num_customers_ + alpha_); + return r * marg_p0 / (num_customers_ + strength_); } else { return (it->second.total_dish_count_ - discount_ * it->second.table_counts_.size() + r * marg_p0) / - (num_customers_ + alpha_); + (num_customers_ + strength_); } } double log_crp_prob() const { - return log_crp_prob(discount_, alpha_); + return log_crp_prob(discount_, strength_); } // taken from http://en.wikipedia.org/wiki/Chinese_restaurant_process // does not include draws from G_w's - double log_crp_prob(const double& d, const double& alpha) const { + double log_crp_prob(const double& discount, const double& strength) const { double lp = 0.0; if (has_discount_prior()) - lp = Md::log_beta_density(d, discount_prior_alpha_, discount_prior_beta_); - if (has_alpha_prior()) - lp += Md::log_gamma_density(alpha, alpha_prior_shape_, alpha_prior_rate_); + lp = Md::log_beta_density(discount, discount_prior_strength_, discount_prior_beta_); + if (has_strength_prior()) + lp += Md::log_gamma_density(strength + discount, strength_prior_shape_, strength_prior_rate_); assert(lp <= 0.0); if (num_customers_) { - if (d > 0.0) { - const double r = lgamma(1.0 - d); - lp += lgamma(alpha) - lgamma(alpha + num_customers_) - + num_tables_ * log(d) + lgamma(alpha / d + num_tables_) - - lgamma(alpha / d); + if (discount > 0.0) { + const double r = lgamma(1.0 - discount); + if (strength) + lp += lgamma(strength) - lgamma(strength / discount); + lp += - lgamma(strength + num_customers_) + + num_tables_ * log(discount) + lgamma(strength / discount + num_tables_); assert(std::isfinite(lp)); for (typename std::tr1::unordered_map::const_iterator it = dish_locs_.begin(); it != dish_locs_.end(); ++it) { const DishLocations& cur = it->second; for (std::list::const_iterator ti = cur.table_counts_.begin(); ti != cur.table_counts_.end(); ++ti) { - lp += lgamma(ti->count - d) - r; + lp += lgamma(ti->count - discount) - r; } } + } else if (!discount) { // discount == 0.0 + lp += lgamma(strength) + num_tables_ * log(strength) - lgamma(strength + num_tables_); + assert(std::isfinite(lp)); + for (typename std::tr1::unordered_map::const_iterator it = dish_locs_.begin(); + it != dish_locs_.end(); ++it) { + const DishLocations& cur = it->second; + lp += lgamma(cur.table_counts_.size()); + } } else { - assert(!"not implemented yet"); + assert(!"discount less than 0 detected!"); } } assert(std::isfinite(lp)); @@ -251,20 +260,22 @@ class MFCR { } void resample_hyperparameters(MT19937* rng, const unsigned nloop = 5, const unsigned niterations = 10) { - assert(has_discount_prior() || has_alpha_prior()); + assert(has_discount_prior() || has_strength_prior()); DiscountResampler dr(*this); - ConcentrationResampler cr(*this); + StrengthResampler sr(*this); for (int iter = 0; iter < nloop; ++iter) { - if (has_alpha_prior()) { - alpha_ = slice_sampler1d(cr, alpha_, *rng, 0.0, + if (has_strength_prior()) { + strength_ = slice_sampler1d(sr, strength_, *rng, -discount_, std::numeric_limits::infinity(), 0.0, niterations, 100*niterations); } if (has_discount_prior()) { - discount_ = slice_sampler1d(dr, discount_, *rng, std::numeric_limits::min(), + double min_discount = std::numeric_limits::min(); + if (strength_ < 0.0) min_discount = -strength_; + discount_ = slice_sampler1d(dr, discount_, *rng, min_discount, 1.0, 0.0, niterations, 100*niterations); } } - alpha_ = slice_sampler1d(cr, alpha_, *rng, 0.0, + strength_ = slice_sampler1d(sr, strength_, *rng, -discount_, std::numeric_limits::infinity(), 0.0, niterations, 100*niterations); } @@ -272,15 +283,15 @@ class MFCR { DiscountResampler(const MFCR& crp) : crp_(crp) {} const MFCR& crp_; double operator()(const double& proposed_d) const { - return crp_.log_crp_prob(proposed_d, crp_.alpha_); + return crp_.log_crp_prob(proposed_d, crp_.strength_); } }; - struct ConcentrationResampler { - ConcentrationResampler(const MFCR& crp) : crp_(crp) {} + struct StrengthResampler { + StrengthResampler(const MFCR& crp) : crp_(crp) {} const MFCR& crp_; - double operator()(const double& proposediscount_alpha) const { - return crp_.log_crp_prob(crp_.discount_, proposediscount_alpha); + double operator()(const double& proposediscount_strength) const { + return crp_.log_crp_prob(crp_.discount_, proposediscount_strength); } }; @@ -292,7 +303,7 @@ class MFCR { }; void Print(std::ostream* out) const { - (*out) << "MFCR(d=" << discount_ << ",alpha=" << alpha_ << ") customers=" << num_customers_ << std::endl; + (*out) << "MFCR(d=" << discount_ << ",strength=" << strength_ << ") customers=" << num_customers_ << std::endl; for (typename std::tr1::unordered_map::const_iterator it = dish_locs_.begin(); it != dish_locs_.end(); ++it) { (*out) << it->first << " (" << it->second.total_dish_count_ << " on " << it->second.table_counts_.size() << " tables): "; @@ -318,15 +329,15 @@ class MFCR { std::tr1::unordered_map dish_locs_; double discount_; - double alpha_; + double strength_; // optional beta prior on discount_ (NaN if no prior) - double discount_prior_alpha_; + double discount_prior_strength_; double discount_prior_beta_; - // optional gamma prior on alpha_ (NaN if no prior) - double alpha_prior_shape_; - double alpha_prior_rate_; + // optional gamma prior on strength_ (NaN if no prior) + double strength_prior_shape_; + double strength_prior_rate_; }; template -- cgit v1.2.3 From 2048ac9943e2695a75b5f0303ca869e66ee32202 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Mon, 5 Mar 2012 16:06:45 -0500 Subject: use template parameter inference to figure out what type to use for probability computations, templatatize number of floors in MFCR rather than compile-time set --- gi/pf/align-lexonly-pyp.cc | 20 +++++++------- gi/pf/conditional_pseg.h | 22 +++++++-------- gi/pf/learn_cfg.cc | 8 +++--- utils/ccrp.h | 48 ++------------------------------ utils/mfcr.h | 68 ++++++++++++++++++++++++---------------------- utils/mfcr_test.cc | 10 +++---- 6 files changed, 68 insertions(+), 108 deletions(-) (limited to 'gi/pf') diff --git a/gi/pf/align-lexonly-pyp.cc b/gi/pf/align-lexonly-pyp.cc index 87f7f6b5..ac0590e0 100644 --- a/gi/pf/align-lexonly-pyp.cc +++ b/gi/pf/align-lexonly-pyp.cc @@ -68,7 +68,7 @@ struct AlignedSentencePair { struct HierarchicalWordBase { explicit HierarchicalWordBase(const unsigned vocab_e_size) : - base(prob_t::One()), r(1,1,1,25,25), u0(-log(vocab_e_size)), l(1,1.0), v(1, 0.0) {} + base(prob_t::One()), r(1,1,1,1), u0(-log(vocab_e_size)), l(1,prob_t::One()), v(1, prob_t::Zero()) {} void ResampleHyperparameters(MT19937* rng) { r.resample_hyperparameters(rng); @@ -80,14 +80,14 @@ struct HierarchicalWordBase { // return p0 of rule.e_ prob_t operator()(const TRule& rule) const { - v[0] = exp(logp0(rule.e_)); - return prob_t(r.prob(rule.e_, v, l)); + v[0].logeq(logp0(rule.e_)); + return r.prob(rule.e_, v.begin(), l.begin()); } void Increment(const TRule& rule) { - v[0] = exp(logp0(rule.e_)); - if (r.increment(rule.e_, v, l, &*prng).count) { - base *= prob_t(v[0] * l[0]); + v[0].logeq(logp0(rule.e_)); + if (r.increment(rule.e_, v.begin(), l.begin(), &*prng).count) { + base *= v[0] * l[0]; } } @@ -105,15 +105,15 @@ struct HierarchicalWordBase { void Summary() const { cerr << "NUMBER OF CUSTOMERS: " << r.num_customers() << " (d=" << r.discount() << ",s=" << r.strength() << ')' << endl; - for (MFCR >::const_iterator it = r.begin(); it != r.end(); ++it) + for (MFCR<1,vector >::const_iterator it = r.begin(); it != r.end(); ++it) cerr << " " << it->second.total_dish_count_ << " (on " << it->second.table_counts_.size() << " tables)" << TD::GetString(it->first) << endl; } prob_t base; - MFCR > r; + MFCR<1,vector > r; const double u0; - const vector l; - mutable vector v; + const vector l; + mutable vector v; }; struct BasicLexicalAlignment { diff --git a/gi/pf/conditional_pseg.h b/gi/pf/conditional_pseg.h index 86403d8d..ef73e332 100644 --- a/gi/pf/conditional_pseg.h +++ b/gi/pf/conditional_pseg.h @@ -17,13 +17,13 @@ template struct MConditionalTranslationModel { explicit MConditionalTranslationModel(ConditionalBaseMeasure& rcp0) : - rp0(rcp0), lambdas(1, 1.0), p0s(1) {} + rp0(rcp0), lambdas(1, prob_t::One()), p0s(1) {} void Summary() const { std::cerr << "Number of conditioning contexts: " << r.size() << std::endl; for (RuleModelHash::const_iterator it = r.begin(); it != r.end(); ++it) { std::cerr << TD::GetString(it->first) << " \t(d=" << it->second.discount() << ",s=" << it->second.strength() << ") --------------------------" << std::endl; - for (MFCR::const_iterator i2 = it->second.begin(); i2 != it->second.end(); ++i2) + for (MFCR<1,TRule>::const_iterator i2 = it->second.begin(); i2 != it->second.end(); ++i2) std::cerr << " " << -1 << '\t' << i2->first << std::endl; } } @@ -46,10 +46,10 @@ struct MConditionalTranslationModel { int IncrementRule(const TRule& rule, MT19937* rng) { RuleModelHash::iterator it = r.find(rule.f_); if (it == r.end()) { - it = r.insert(make_pair(rule.f_, MFCR(1, 1.0, 1.0, 1.0, 1.0, 1e-9, 4.0))).first; + it = r.insert(make_pair(rule.f_, MFCR<1,TRule>(1.0, 1.0, 1.0, 1.0, 1e-9, 4.0))).first; } - p0s[0] = rp0(rule).as_float(); - TableCount delta = it->second.increment(rule, p0s, lambdas, rng); + p0s[0] = rp0(rule); + TableCount delta = it->second.increment(rule, p0s.begin(), lambdas.begin(), rng); return delta.count; } @@ -57,10 +57,10 @@ struct MConditionalTranslationModel { prob_t p; RuleModelHash::const_iterator it = r.find(rule.f_); if (it == r.end()) { - p.logeq(log(rp0(rule))); + p = rp0(rule); } else { - p0s[0] = rp0(rule).as_float(); - p = prob_t(it->second.prob(rule, p0s, lambdas)); + p0s[0] = rp0(rule); + p = it->second.prob(rule, p0s.begin(), lambdas.begin()); } return p; } @@ -80,11 +80,11 @@ struct MConditionalTranslationModel { const ConditionalBaseMeasure& rp0; typedef std::tr1::unordered_map, - MFCR, + MFCR<1, TRule>, boost::hash > > RuleModelHash; RuleModelHash r; - std::vector lambdas; - mutable std::vector p0s; + std::vector lambdas; + mutable std::vector p0s; }; template diff --git a/gi/pf/learn_cfg.cc b/gi/pf/learn_cfg.cc index bf157828..ed1772bf 100644 --- a/gi/pf/learn_cfg.cc +++ b/gi/pf/learn_cfg.cc @@ -127,20 +127,20 @@ struct HieroLMModel { nts(num_nts, CCRP(1,1,1,1)) {} prob_t Prob(const TRule& r) const { - return nts[nt_id_to_index[-r.lhs_]].probT(r, p0(r)); + return nts[nt_id_to_index[-r.lhs_]].prob(r, p0(r)); } inline prob_t p0(const TRule& r) const { if (kHIERARCHICAL_PRIOR) - return q0.probT(r, base(r)); + return q0.prob(r, base(r)); else return base(r); } int Increment(const TRule& r, MT19937* rng) { - const int delta = nts[nt_id_to_index[-r.lhs_]].incrementT(r, p0(r), rng); + const int delta = nts[nt_id_to_index[-r.lhs_]].increment(r, p0(r), rng); if (kHIERARCHICAL_PRIOR && delta) - q0.incrementT(r, base(r), rng); + q0.increment(r, base(r), rng); return delta; // return x.increment(r); } diff --git a/utils/ccrp.h b/utils/ccrp.h index 5f9db7a6..e24130ac 100644 --- a/utils/ccrp.h +++ b/utils/ccrp.h @@ -92,42 +92,9 @@ class CCRP { return it->total_dish_count_; } - // returns +1 or 0 indicating whether a new table was opened - int increment(const Dish& dish, const double& p0, MT19937* rng) { - DishLocations& loc = dish_locs_[dish]; - bool share_table = false; - if (loc.total_dish_count_) { - const double p_empty = (strength_ + num_tables_ * discount_) * p0; - const double p_share = (loc.total_dish_count_ - loc.table_counts_.size() * discount_); - share_table = rng->SelectSample(p_empty, p_share); - } - if (share_table) { - double r = rng->next() * (loc.total_dish_count_ - loc.table_counts_.size() * discount_); - for (typename std::list::iterator ti = loc.table_counts_.begin(); - ti != loc.table_counts_.end(); ++ti) { - r -= (*ti - discount_); - if (r <= 0.0) { - ++(*ti); - break; - } - } - if (r > 0.0) { - std::cerr << "Serious error: r=" << r << std::endl; - Print(&std::cerr); - assert(r <= 0.0); - } - } else { - loc.table_counts_.push_back(1u); - ++num_tables_; - } - ++loc.total_dish_count_; - ++num_customers_; - return (share_table ? 0 : 1); - } - // returns +1 or 0 indicating whether a new table was opened template - int incrementT(const Dish& dish, const T& p0, MT19937* rng) { + int increment(const Dish& dish, const T& p0, MT19937* rng) { DishLocations& loc = dish_locs_[dish]; bool share_table = false; if (loc.total_dish_count_) { @@ -196,19 +163,8 @@ class CCRP { } } - double prob(const Dish& dish, const double& p0) const { - const typename std::tr1::unordered_map::const_iterator it = dish_locs_.find(dish); - const double r = num_tables_ * discount_ + strength_; - if (it == dish_locs_.end()) { - return r * p0 / (num_customers_ + strength_); - } else { - return (it->second.total_dish_count_ - discount_ * it->second.table_counts_.size() + r * p0) / - (num_customers_ + strength_); - } - } - template - T probT(const Dish& dish, const T& p0) const { + T prob(const Dish& dish, const T& p0) const { const typename std::tr1::unordered_map::const_iterator it = dish_locs_.find(dish); const T r = T(num_tables_ * discount_ + strength_); if (it == dish_locs_.end()) { diff --git a/utils/mfcr.h b/utils/mfcr.h index aeaf599d..6cc0ebf1 100644 --- a/utils/mfcr.h +++ b/utils/mfcr.h @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include "sampler.h" @@ -35,12 +36,11 @@ std::ostream& operator<<(std::ostream& o, const TableCount& tc) { // referenced therein. // http://www.aclweb.org/anthology/P/P09/P09-2085.pdf // -template > +template > class MFCR { public: - MFCR(unsigned num_floors, double d, double strength) : - num_floors_(num_floors), + MFCR(double d, double strength) : num_tables_(), num_customers_(), discount_(d), @@ -50,8 +50,7 @@ class MFCR { strength_prior_shape_(std::numeric_limits::quiet_NaN()), strength_prior_rate_(std::numeric_limits::quiet_NaN()) {} - MFCR(unsigned num_floors, double discount_strength, double discount_beta, double strength_shape, double strength_rate, double d = 0.9, double strength = 10.0) : - num_floors_(num_floors), + MFCR(double discount_strength, double discount_beta, double strength_shape, double strength_rate, double d = 0.9, double strength = 10.0) : num_tables_(), num_customers_(), discount_(d), @@ -111,22 +110,22 @@ class MFCR { } // returns (delta, floor) indicating whether a new table (delta) was opened and on which floor - TableCount increment(const Dish& dish, const std::vector& p0s, const std::vector& lambdas, MT19937* rng) { - assert(p0s.size() == num_floors_); - assert(lambdas.size() == num_floors_); - + template + TableCount increment(const Dish& dish, InputIterator p0s, InputIterator2 lambdas, MT19937* rng) { DishLocations& loc = dish_locs_[dish]; // marg_p0 = marginal probability of opening a new table on any floor with label dish - const double marg_p0 = std::inner_product(p0s.begin(), p0s.end(), lambdas.begin(), 0.0); - assert(marg_p0 <= 1.0); + typedef typename std::iterator_traits::value_type F; + const F marg_p0 = std::inner_product(p0s, p0s + Floors, lambdas, F(0.0)); + assert(marg_p0 <= F(1.0001)); int floor = -1; bool share_table = false; if (loc.total_dish_count_) { - const double p_empty = (strength_ + num_tables_ * discount_) * marg_p0; - const double p_share = (loc.total_dish_count_ - loc.table_counts_.size() * discount_); + const F p_empty = F(strength_ + num_tables_ * discount_) * marg_p0; + const F p_share = F(loc.total_dish_count_ - loc.table_counts_.size() * discount_); share_table = rng->SelectSample(p_empty, p_share); } if (share_table) { + // this can be done with doubles since P0 (which may be tiny) is not involved double r = rng->next() * (loc.total_dish_count_ - loc.table_counts_.size() * discount_); for (typename std::list::iterator ti = loc.table_counts_.begin(); ti != loc.table_counts_.end(); ++ti) { @@ -143,12 +142,18 @@ class MFCR { assert(r <= 0.0); } } else { // sit at currently empty table -- must sample what floor - double r = rng->next() * marg_p0; - for (unsigned i = 0; i < p0s.size(); ++i) { - r -= p0s[i] * lambdas[i]; - if (r <= 0.0) { - floor = i; - break; + if (Floors == 1) { + floor = 0; + } else { + F r = F(rng->next()) * marg_p0; + for (unsigned i = 0; i < Floors; ++i) { + r -= (*p0s) * (*lambdas); + ++p0s; + ++lambdas; + if (r <= F(0.0)) { + floor = i; + break; + } } } assert(floor >= 0); @@ -200,18 +205,18 @@ class MFCR { return TableCount(delta, floor); } - double prob(const Dish& dish, const std::vector& p0s, const std::vector& lambdas) const { - assert(p0s.size() == num_floors_); - assert(lambdas.size() == num_floors_); - const double marg_p0 = std::inner_product(p0s.begin(), p0s.end(), lambdas.begin(), 0.0); - assert(marg_p0 <= 1.0); + template + typename std::iterator_traits::value_type prob(const Dish& dish, InputIterator p0s, InputIterator2 lambdas) const { + typedef typename std::iterator_traits::value_type F; + const F marg_p0 = std::inner_product(p0s, p0s + Floors, lambdas, F(0.0)); + assert(marg_p0 <= F(1.0001)); const typename std::tr1::unordered_map::const_iterator it = dish_locs_.find(dish); - const double r = num_tables_ * discount_ + strength_; + const F r = F(num_tables_ * discount_ + strength_); if (it == dish_locs_.end()) { - return r * marg_p0 / (num_customers_ + strength_); + return r * marg_p0 / F(num_customers_ + strength_); } else { - return (it->second.total_dish_count_ - discount_ * it->second.table_counts_.size() + r * marg_p0) / - (num_customers_ + strength_); + return (F(it->second.total_dish_count_ - discount_ * it->second.table_counts_.size()) + F(r * marg_p0)) / + F(num_customers_ + strength_); } } @@ -303,7 +308,7 @@ class MFCR { }; void Print(std::ostream* out) const { - (*out) << "MFCR(d=" << discount_ << ",strength=" << strength_ << ") customers=" << num_customers_ << std::endl; + (*out) << "MFCR<" << Floors << ">(d=" << discount_ << ",strength=" << strength_ << ") customers=" << num_customers_ << std::endl; for (typename std::tr1::unordered_map::const_iterator it = dish_locs_.begin(); it != dish_locs_.end(); ++it) { (*out) << it->first << " (" << it->second.total_dish_count_ << " on " << it->second.table_counts_.size() << " tables): "; @@ -323,7 +328,6 @@ class MFCR { return dish_locs_.end(); } - unsigned num_floors_; unsigned num_tables_; unsigned num_customers_; std::tr1::unordered_map dish_locs_; @@ -340,8 +344,8 @@ class MFCR { double strength_prior_rate_; }; -template -std::ostream& operator<<(std::ostream& o, const MFCR& c) { +template +std::ostream& operator<<(std::ostream& o, const MFCR& c) { c.Print(&o); return o; } diff --git a/utils/mfcr_test.cc b/utils/mfcr_test.cc index 7c45a37c..cc886335 100644 --- a/utils/mfcr_test.cc +++ b/utils/mfcr_test.cc @@ -9,7 +9,7 @@ using namespace std; void test_exch(MT19937* rng) { - MFCR crp(2, 0.5, 3.0); + MFCR<2, int> crp(0.5, 3.0); vector lambdas(2); vector p0s(2); lambdas[0] = 0.2; @@ -22,23 +22,23 @@ void test_exch(MT19937* rng) { double xt = 0; int cust = 10; vector hist(cust + 1, 0), hist2(cust + 1, 0); - for (int i = 0; i < cust; ++i) { crp.increment(1, p0s, lambdas, rng); } + for (int i = 0; i < cust; ++i) { crp.increment(1, p0s.begin(), lambdas.begin(), rng); } const int samples = 100000; const bool simulate = true; for (int k = 0; k < samples; ++k) { if (!simulate) { crp.clear(); - for (int i = 0; i < cust; ++i) { crp.increment(1, p0s, lambdas, rng); } + for (int i = 0; i < cust; ++i) { crp.increment(1, p0s.begin(), lambdas.begin(), rng); } } else { int da = rng->next() * cust; bool a = rng->next() < 0.45; if (a) { - for (int i = 0; i < da; ++i) { crp.increment(1, p0s, lambdas, rng); } + for (int i = 0; i < da; ++i) { crp.increment(1, p0s.begin(), lambdas.begin(), rng); } for (int i = 0; i < da; ++i) { crp.decrement(1, rng); } xt += 1.0; } else { for (int i = 0; i < da; ++i) { crp.decrement(1, rng); } - for (int i = 0; i < da; ++i) { crp.increment(1, p0s, lambdas, rng); } + for (int i = 0; i < da; ++i) { crp.increment(1, p0s.begin(), lambdas.begin(), rng); } } } int c = crp.num_tables(1); -- cgit v1.2.3 From de34b1493df93169c991a1828f951ca5abc00cae Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Mon, 5 Mar 2012 21:36:07 -0500 Subject: tie hyperparameters for translation distributions; support theta < 0 for PYPLM --- gi/pf/align-lexonly-pyp.cc | 13 ++++----- gi/pf/conditional_pseg.h | 68 ++++++++++++++++++++++++++++++++++++---------- gi/pf/pyp_lm.cc | 12 ++++---- utils/ccrp.h | 4 +-- utils/mfcr.h | 19 +++++++++++-- 5 files changed, 84 insertions(+), 32 deletions(-) (limited to 'gi/pf') diff --git a/gi/pf/align-lexonly-pyp.cc b/gi/pf/align-lexonly-pyp.cc index ac0590e0..13a3a487 100644 --- a/gi/pf/align-lexonly-pyp.cc +++ b/gi/pf/align-lexonly-pyp.cc @@ -68,14 +68,14 @@ struct AlignedSentencePair { struct HierarchicalWordBase { explicit HierarchicalWordBase(const unsigned vocab_e_size) : - base(prob_t::One()), r(1,1,1,1), u0(-log(vocab_e_size)), l(1,prob_t::One()), v(1, prob_t::Zero()) {} + base(prob_t::One()), r(1,1,1,1,0.66,50.0), u0(-log(vocab_e_size)), l(1,prob_t::One()), v(1, prob_t::Zero()) {} void ResampleHyperparameters(MT19937* rng) { r.resample_hyperparameters(rng); } inline double logp0(const vector& s) const { - return s.size() * u0; + return Md::log_poisson(s.size(), 7.5) + s.size() * u0; } // return p0 of rule.e_ @@ -106,7 +106,7 @@ struct HierarchicalWordBase { void Summary() const { cerr << "NUMBER OF CUSTOMERS: " << r.num_customers() << " (d=" << r.discount() << ",s=" << r.strength() << ')' << endl; for (MFCR<1,vector >::const_iterator it = r.begin(); it != r.end(); ++it) - cerr << " " << it->second.total_dish_count_ << " (on " << it->second.table_counts_.size() << " tables)" << TD::GetString(it->first) << endl; + cerr << " " << it->second.total_dish_count_ << " (on " << it->second.table_counts_.size() << " tables) " << TD::GetString(it->first) << endl; } prob_t base; @@ -167,10 +167,9 @@ struct BasicLexicalAlignment { } void ResampleHyperparemeters() { - cerr << " LLH_prev = " << Likelihood() << flush; tmodel.ResampleHyperparameters(&*prng); up0.ResampleHyperparameters(&*prng); - cerr << "\tLLH_post = " << Likelihood() << endl; + cerr << " (base d=" << up0.r.discount() << ",s=" << up0.r.strength() << ")\n"; } void ResampleCorpus(); @@ -218,7 +217,7 @@ void BasicLexicalAlignment::ResampleCorpus() { up0.Increment(r); } } - cerr << " LLH = " << tmodel.Likelihood() << endl; + cerr << " LLH = " << Likelihood() << endl; } void ExtractLetters(const set& v, vector >* l, set* letset = NULL) { @@ -311,7 +310,7 @@ int main(int argc, char** argv) { for (int i = 0; i < samples; ++i) { for (int j = 65; j < 67; ++j) Debug(corpus[j]); cerr << i << "\t" << x.tmodel.r.size() << "\t"; - if (i % 10 == 0) x.ResampleHyperparemeters(); + if (i % 7 == 6) x.ResampleHyperparemeters(); x.ResampleCorpus(); if (i > (samples / 5) && (i % 10 == 9)) for (int j = 0; j < corpus.size(); ++j) AddSample(&corpus[j]); } diff --git a/gi/pf/conditional_pseg.h b/gi/pf/conditional_pseg.h index ef73e332..8202778b 100644 --- a/gi/pf/conditional_pseg.h +++ b/gi/pf/conditional_pseg.h @@ -17,21 +17,66 @@ template struct MConditionalTranslationModel { explicit MConditionalTranslationModel(ConditionalBaseMeasure& rcp0) : - rp0(rcp0), lambdas(1, prob_t::One()), p0s(1) {} + rp0(rcp0), d(0.5), strength(1.0), lambdas(1, prob_t::One()), p0s(1) {} void Summary() const { std::cerr << "Number of conditioning contexts: " << r.size() << std::endl; for (RuleModelHash::const_iterator it = r.begin(); it != r.end(); ++it) { std::cerr << TD::GetString(it->first) << " \t(d=" << it->second.discount() << ",s=" << it->second.strength() << ") --------------------------" << std::endl; for (MFCR<1,TRule>::const_iterator i2 = it->second.begin(); i2 != it->second.end(); ++i2) - std::cerr << " " << -1 << '\t' << i2->first << std::endl; + std::cerr << " " << i2->second.total_dish_count_ << '\t' << i2->first << std::endl; } } + double log_likelihood(const double& dd, const double& aa) const { + if (aa <= -dd) return -std::numeric_limits::infinity(); + //double llh = Md::log_beta_density(dd, 10, 3) + Md::log_gamma_density(aa, 1, 1); + double llh = Md::log_beta_density(dd, 1, 1) + + Md::log_gamma_density(dd + aa, 1, 1); + typename std::tr1::unordered_map, MFCR<1,TRule>, boost::hash > >::const_iterator it; + for (it = r.begin(); it != r.end(); ++it) + llh += it->second.log_crp_prob(dd, aa); + return llh; + } + + struct DiscountResampler { + DiscountResampler(const MConditionalTranslationModel& m) : m_(m) {} + const MConditionalTranslationModel& m_; + double operator()(const double& proposed_discount) const { + return m_.log_likelihood(proposed_discount, m_.strength); + } + }; + + struct AlphaResampler { + AlphaResampler(const MConditionalTranslationModel& m) : m_(m) {} + const MConditionalTranslationModel& m_; + double operator()(const double& proposed_strength) const { + return m_.log_likelihood(m_.d, proposed_strength); + } + }; + void ResampleHyperparameters(MT19937* rng) { - for (RuleModelHash::iterator it = r.begin(); it != r.end(); ++it) - it->second.resample_hyperparameters(rng); - } + const unsigned nloop = 5; + const unsigned niterations = 10; + DiscountResampler dr(*this); + AlphaResampler ar(*this); + for (int iter = 0; iter < nloop; ++iter) { + strength = slice_sampler1d(ar, strength, *rng, -d + std::numeric_limits::min(), + std::numeric_limits::infinity(), 0.0, niterations, 100*niterations); + double min_discount = std::numeric_limits::min(); + if (strength < 0.0) min_discount -= strength; + d = slice_sampler1d(dr, d, *rng, min_discount, + 1.0, 0.0, niterations, 100*niterations); + } + strength = slice_sampler1d(ar, strength, *rng, -d, + std::numeric_limits::infinity(), 0.0, niterations, 100*niterations); + typename std::tr1::unordered_map, MFCR<1,TRule>, boost::hash > >::iterator it; + std::cerr << "MConditionalTranslationModel(d=" << d << ",s=" << strength << ") = " << log_likelihood(d, strength) << std::endl; + for (it = r.begin(); it != r.end(); ++it) { + it->second.set_discount(d); + it->second.set_strength(strength); + } + } int DecrementRule(const TRule& rule, MT19937* rng) { RuleModelHash::iterator it = r.find(rule.f_); @@ -46,7 +91,7 @@ struct MConditionalTranslationModel { int IncrementRule(const TRule& rule, MT19937* rng) { RuleModelHash::iterator it = r.find(rule.f_); if (it == r.end()) { - it = r.insert(make_pair(rule.f_, MFCR<1,TRule>(1.0, 1.0, 1.0, 1.0, 1e-9, 4.0))).first; + it = r.insert(make_pair(rule.f_, MFCR<1,TRule>(d, strength))).first; } p0s[0] = rp0(rule); TableCount delta = it->second.increment(rule, p0s.begin(), lambdas.begin(), rng); @@ -66,15 +111,7 @@ struct MConditionalTranslationModel { } prob_t Likelihood() const { - prob_t p = prob_t::One(); -#if 0 - for (RuleModelHash::const_iterator it = r.begin(); it != r.end(); ++it) { - prob_t q; q.logeq(it->second.log_crp_prob()); - p *= q; - for (CCRP_NoTable::const_iterator i2 = it->second.begin(); i2 != it->second.end(); ++i2) - p *= rp0(i2->first); - } -#endif + prob_t p; p.logeq(log_likelihood(d, strength)); return p; } @@ -83,6 +120,7 @@ struct MConditionalTranslationModel { MFCR<1, TRule>, boost::hash > > RuleModelHash; RuleModelHash r; + double d, strength; std::vector lambdas; mutable std::vector p0s; }; diff --git a/gi/pf/pyp_lm.cc b/gi/pf/pyp_lm.cc index 7ebada13..104f356b 100644 --- a/gi/pf/pyp_lm.cc +++ b/gi/pf/pyp_lm.cc @@ -18,7 +18,7 @@ // I use templates to handle the recursive formalation of the prior, so // the order of the model has to be specified here, at compile time: -#define kORDER 3 +#define kORDER 4 using namespace std; using namespace tr1; @@ -114,7 +114,7 @@ template struct PYPLM { if (aa <= -dd) return -std::numeric_limits::infinity(); //double llh = Md::log_beta_density(dd, 10, 3) + Md::log_gamma_density(aa, 1, 1); double llh = Md::log_beta_density(dd, discount_a, discount_b) + - Md::log_gamma_density(aa, strength_s, strength_r); + Md::log_gamma_density(aa + dd, strength_s, strength_r); typename unordered_map, CCRP, boost::hash > >::const_iterator it; for (it = p.begin(); it != p.end(); ++it) llh += it->second.log_crp_prob(dd, aa); @@ -141,12 +141,14 @@ template struct PYPLM { DiscountResampler dr(*this); AlphaResampler ar(*this); for (int iter = 0; iter < nloop; ++iter) { - strength = slice_sampler1d(ar, strength, *rng, 0.0, + strength = slice_sampler1d(ar, strength, *rng, -d + std::numeric_limits::min(), std::numeric_limits::infinity(), 0.0, niterations, 100*niterations); - d = slice_sampler1d(dr, d, *rng, std::numeric_limits::min(), + double min_discount = std::numeric_limits::min(); + if (strength < 0.0) min_discount -= strength; + d = slice_sampler1d(dr, d, *rng, min_discount, 1.0, 0.0, niterations, 100*niterations); } - strength = slice_sampler1d(ar, strength, *rng, 0.0, + strength = slice_sampler1d(ar, strength, *rng, -d + std::numeric_limits::min(), std::numeric_limits::infinity(), 0.0, niterations, 100*niterations); typename unordered_map, CCRP, boost::hash > >::iterator it; cerr << "PYPLM<" << N << ">(d=" << d << ",a=" << strength << ") = " << log_likelihood(d, strength) << endl; diff --git a/utils/ccrp.h b/utils/ccrp.h index e24130ac..439d7e1e 100644 --- a/utils/ccrp.h +++ b/utils/ccrp.h @@ -225,12 +225,12 @@ class CCRP { StrengthResampler sr(*this); for (int iter = 0; iter < nloop; ++iter) { if (has_strength_prior()) { - strength_ = slice_sampler1d(sr, strength_, *rng, -discount_, + strength_ = slice_sampler1d(sr, strength_, *rng, -discount_ + std::numeric_limits::min(), std::numeric_limits::infinity(), 0.0, niterations, 100*niterations); } if (has_discount_prior()) { double min_discount = std::numeric_limits::min(); - if (strength_ < 0.0) min_discount = -strength_; + if (strength_ < 0.0) min_discount -= strength_; discount_ = slice_sampler1d(dr, discount_, *rng, min_discount, 1.0, 0.0, niterations, 100*niterations); } diff --git a/utils/mfcr.h b/utils/mfcr.h index 6cc0ebf1..886f01ef 100644 --- a/utils/mfcr.h +++ b/utils/mfcr.h @@ -48,7 +48,7 @@ class MFCR { discount_prior_strength_(std::numeric_limits::quiet_NaN()), discount_prior_beta_(std::numeric_limits::quiet_NaN()), strength_prior_shape_(std::numeric_limits::quiet_NaN()), - strength_prior_rate_(std::numeric_limits::quiet_NaN()) {} + strength_prior_rate_(std::numeric_limits::quiet_NaN()) { check_hyperparameters(); } MFCR(double discount_strength, double discount_beta, double strength_shape, double strength_rate, double d = 0.9, double strength = 10.0) : num_tables_(), @@ -58,10 +58,23 @@ class MFCR { discount_prior_strength_(discount_strength), discount_prior_beta_(discount_beta), strength_prior_shape_(strength_shape), - strength_prior_rate_(strength_rate) {} + strength_prior_rate_(strength_rate) { check_hyperparameters(); } + + void check_hyperparameters() { + if (discount_ < 0.0 || discount_ >= 1.0) { + std::cerr << "Bad discount: " << discount_ << std::endl; + abort(); + } + if (strength_ <= -discount_) { + std::cerr << "Bad strength: " << strength_ << " (discount=" << discount_ << ")" << std::endl; + abort(); + } + } double discount() const { return discount_; } double strength() const { return strength_; } + void set_discount(double d) { discount_ = d; check_hyperparameters(); } + void set_strength(double a) { strength_ = a; check_hyperparameters(); } bool has_discount_prior() const { return !std::isnan(discount_prior_strength_); @@ -275,7 +288,7 @@ class MFCR { } if (has_discount_prior()) { double min_discount = std::numeric_limits::min(); - if (strength_ < 0.0) min_discount = -strength_; + if (strength_ < 0.0) min_discount -= strength_; discount_ = slice_sampler1d(dr, discount_, *rng, min_discount, 1.0, 0.0, niterations, 100*niterations); } -- cgit v1.2.3 From 7fd9fe26f00cf31a7b407364399d37b4eaf04eba Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Wed, 7 Mar 2012 20:25:53 -0500 Subject: lattice builder --- gi/pf/Makefile.am | 7 +- gi/pf/align-tl.cc | 334 ++++++++++++++++++++++++++++++++++++++++++++++ gi/pf/conditional_pseg.h | 11 +- gi/pf/nuisance_test.cc | 161 ++++++++++++++++++++++ gi/pf/transliterations.cc | 193 +++++++++++++++++++++++++++ gi/pf/transliterations.h | 20 +++ 6 files changed, 723 insertions(+), 3 deletions(-) create mode 100644 gi/pf/align-tl.cc create mode 100644 gi/pf/nuisance_test.cc create mode 100644 gi/pf/transliterations.cc create mode 100644 gi/pf/transliterations.h (limited to 'gi/pf') diff --git a/gi/pf/Makefile.am b/gi/pf/Makefile.am index 7cf9c14d..5e89f02a 100644 --- a/gi/pf/Makefile.am +++ b/gi/pf/Makefile.am @@ -1,12 +1,17 @@ -bin_PROGRAMS = cbgi brat dpnaive pfbrat pfdist itg pfnaive condnaive align-lexonly align-lexonly-pyp learn_cfg pyp_lm +bin_PROGRAMS = cbgi brat dpnaive pfbrat pfdist itg pfnaive condnaive align-lexonly align-lexonly-pyp learn_cfg pyp_lm nuisance_test align-tl noinst_LIBRARIES = libpf.a + libpf_a_SOURCES = base_distributions.cc reachability.cc cfg_wfst_composer.cc corpus.cc unigrams.cc ngram_base.cc +nuisance_test_SOURCES = nuisance_test.cc transliterations.cc + align_lexonly_SOURCES = align-lexonly.cc align_lexonly_pyp_SOURCES = align-lexonly-pyp.cc +align_tl_SOURCES = align-tl.cc transliterations.cc + itg_SOURCES = itg.cc pyp_lm_SOURCES = pyp_lm.cc diff --git a/gi/pf/align-tl.cc b/gi/pf/align-tl.cc new file mode 100644 index 00000000..0e0454e5 --- /dev/null +++ b/gi/pf/align-tl.cc @@ -0,0 +1,334 @@ +#include +#include +#include + +#include +#include +#include + +#include "array2d.h" +#include "base_distributions.h" +#include "monotonic_pseg.h" +#include "conditional_pseg.h" +#include "trule.h" +#include "tdict.h" +#include "stringlib.h" +#include "filelib.h" +#include "dict.h" +#include "sampler.h" +#include "mfcr.h" +#include "corpus.h" +#include "ngram_base.h" +#include "transliterations.h" + +using namespace std; +using namespace tr1; +namespace po = boost::program_options; + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { + po::options_description opts("Configuration options"); + opts.add_options() + ("samples,s",po::value()->default_value(1000),"Number of samples") + ("input,i",po::value(),"Read parallel data from") + ("random_seed,S",po::value(), "Random seed"); + po::options_description clo("Command line options"); + clo.add_options() + ("config", po::value(), "Configuration file") + ("help,h", "Print this help message and exit"); + po::options_description dconfig_options, dcmdline_options; + dconfig_options.add(opts); + dcmdline_options.add(opts).add(clo); + + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + if (conf->count("config")) { + ifstream config((*conf)["config"].as().c_str()); + po::store(po::parse_config_file(config, dconfig_options), *conf); + } + po::notify(*conf); + + if (conf->count("help") || (conf->count("input") == 0)) { + cerr << dcmdline_options << endl; + exit(1); + } +} + +shared_ptr prng; + +struct LexicalAlignment { + unsigned char src_index; + bool is_transliteration; + vector > derivation; +}; + +struct AlignedSentencePair { + vector src; + vector trg; + vector a; + Array2D posterior; +}; + +struct HierarchicalWordBase { + explicit HierarchicalWordBase(const unsigned vocab_e_size) : + base(prob_t::One()), r(1,1,1,1,0.66,50.0), u0(-log(vocab_e_size)), l(1,prob_t::One()), v(1, prob_t::Zero()) {} + + void ResampleHyperparameters(MT19937* rng) { + r.resample_hyperparameters(rng); + } + + inline double logp0(const vector& s) const { + return Md::log_poisson(s.size(), 7.5) + s.size() * u0; + } + + // return p0 of rule.e_ + prob_t operator()(const TRule& rule) const { + v[0].logeq(logp0(rule.e_)); + return r.prob(rule.e_, v.begin(), l.begin()); + } + + void Increment(const TRule& rule) { + v[0].logeq(logp0(rule.e_)); + if (r.increment(rule.e_, v.begin(), l.begin(), &*prng).count) { + base *= v[0] * l[0]; + } + } + + void Decrement(const TRule& rule) { + if (r.decrement(rule.e_, &*prng).count) { + base /= prob_t(exp(logp0(rule.e_))); + } + } + + prob_t Likelihood() const { + prob_t p; p.logeq(r.log_crp_prob()); + p *= base; + return p; + } + + void Summary() const { + cerr << "NUMBER OF CUSTOMERS: " << r.num_customers() << " (d=" << r.discount() << ",s=" << r.strength() << ')' << endl; + for (MFCR<1,vector >::const_iterator it = r.begin(); it != r.end(); ++it) + cerr << " " << it->second.total_dish_count_ << " (on " << it->second.table_counts_.size() << " tables) " << TD::GetString(it->first) << endl; + } + + prob_t base; + MFCR<1,vector > r; + const double u0; + const vector l; + mutable vector v; +}; + +struct BasicLexicalAlignment { + explicit BasicLexicalAlignment(const vector >& lets, + const unsigned words_e, + const unsigned letters_e, + vector* corp) : + letters(lets), + corpus(*corp), + //up0(words_e), + //up0("en.chars.1gram", letters_e), + //up0("en.words.1gram"), + up0(letters_e), + //up0("en.chars.2gram"), + tmodel(up0) { + } + + void InstantiateRule(const WordID src, + const WordID trg, + TRule* rule) const { + static const WordID kX = TD::Convert("X") * -1; + rule->lhs_ = kX; + rule->e_ = letters[trg]; + rule->f_ = letters[src]; + } + + void InitializeRandom() { + const WordID kNULL = TD::Convert("NULL"); + cerr << "Initializing with random alignments ...\n"; + for (unsigned i = 0; i < corpus.size(); ++i) { + AlignedSentencePair& asp = corpus[i]; + asp.a.resize(asp.trg.size()); + for (unsigned j = 0; j < asp.trg.size(); ++j) { + const unsigned char a_j = prng->next() * (1 + asp.src.size()); + const WordID f_a_j = (a_j ? asp.src[a_j - 1] : kNULL); + TRule r; + InstantiateRule(f_a_j, asp.trg[j], &r); + asp.a[j].is_transliteration = false; + asp.a[j].src_index = a_j; + if (tmodel.IncrementRule(r, &*prng)) + up0.Increment(r); + } + } + cerr << " LLH = " << Likelihood() << endl; + } + + prob_t Likelihood() const { + prob_t p = tmodel.Likelihood(); + p *= up0.Likelihood(); + return p; + } + + void ResampleHyperparemeters() { + tmodel.ResampleHyperparameters(&*prng); + up0.ResampleHyperparameters(&*prng); + cerr << " (base d=" << up0.r.discount() << ",s=" << up0.r.strength() << ")\n"; + } + + void ResampleCorpus(); + + const vector >& letters; // spelling dictionary + vector& corpus; + //PhraseConditionalUninformativeBase up0; + //PhraseConditionalUninformativeUnigramBase up0; + //UnigramWordBase up0; + //HierarchicalUnigramBase up0; + HierarchicalWordBase up0; + //CompletelyUniformBase up0; + //FixedNgramBase up0; + //ConditionalTranslationModel tmodel; + //ConditionalTranslationModel tmodel; + //ConditionalTranslationModel tmodel; + //ConditionalTranslationModel tmodel; + MConditionalTranslationModel tmodel; + //ConditionalTranslationModel tmodel; + //ConditionalTranslationModel tmodel; +}; + +void BasicLexicalAlignment::ResampleCorpus() { + static const WordID kNULL = TD::Convert("NULL"); + for (unsigned i = 0; i < corpus.size(); ++i) { + AlignedSentencePair& asp = corpus[i]; + SampleSet ss; ss.resize(asp.src.size() + 1); + for (unsigned j = 0; j < asp.trg.size(); ++j) { + TRule r; + unsigned char& a_j = asp.a[j].src_index; + WordID f_a_j = (a_j ? asp.src[a_j - 1] : kNULL); + InstantiateRule(f_a_j, asp.trg[j], &r); + if (tmodel.DecrementRule(r, &*prng)) + up0.Decrement(r); + + for (unsigned prop_a_j = 0; prop_a_j <= asp.src.size(); ++prop_a_j) { + const WordID prop_f = (prop_a_j ? asp.src[prop_a_j - 1] : kNULL); + InstantiateRule(prop_f, asp.trg[j], &r); + ss[prop_a_j] = tmodel.RuleProbability(r); + } + a_j = prng->SelectSample(ss); + f_a_j = (a_j ? asp.src[a_j - 1] : kNULL); + InstantiateRule(f_a_j, asp.trg[j], &r); + if (tmodel.IncrementRule(r, &*prng)) + up0.Increment(r); + } + } + cerr << " LLH = " << Likelihood() << endl; +} + +void ExtractLetters(const set& v, vector >* l, set* letset = NULL) { + for (set::const_iterator it = v.begin(); it != v.end(); ++it) { + vector& letters = (*l)[*it]; + if (letters.size()) continue; // if e and f have the same word + + const string& w = TD::Convert(*it); + + size_t cur = 0; + while (cur < w.size()) { + const size_t len = UTF8Len(w[cur]); + letters.push_back(TD::Convert(w.substr(cur, len))); + if (letset) letset->insert(letters.back()); + cur += len; + } + } +} + +void Debug(const AlignedSentencePair& asp) { + cerr << TD::GetString(asp.src) << endl << TD::GetString(asp.trg) << endl; + Array2D a(asp.src.size(), asp.trg.size()); + for (unsigned j = 0; j < asp.trg.size(); ++j) + if (asp.a[j].src_index) a(asp.a[j].src_index - 1, j) = true; + cerr << a << endl; +} + +void AddSample(AlignedSentencePair* asp) { + for (unsigned j = 0; j < asp->trg.size(); ++j) + asp->posterior(asp->a[j].src_index, j)++; +} + +void WriteAlignments(const AlignedSentencePair& asp) { + bool first = true; + for (unsigned j = 0; j < asp.trg.size(); ++j) { + int src_index = -1; + int mc = -1; + for (unsigned i = 0; i <= asp.src.size(); ++i) { + if (asp.posterior(i, j) > mc) { + mc = asp.posterior(i, j); + src_index = i; + } + } + + if (src_index) { + if (first) first = false; else cout << ' '; + cout << (src_index - 1) << '-' << j; + } + } + cout << endl; +} + +int main(int argc, char** argv) { + po::variables_map conf; + InitCommandLine(argc, argv, &conf); + + if (conf.count("random_seed")) + prng.reset(new MT19937(conf["random_seed"].as())); + else + prng.reset(new MT19937); +// MT19937& rng = *prng; + + vector > corpuse, corpusf; + set vocabe, vocabf; + corpus::ReadParallelCorpus(conf["input"].as(), &corpusf, &corpuse, &vocabf, &vocabe); + cerr << "f-Corpus size: " << corpusf.size() << " sentences\n"; + cerr << "f-Vocabulary size: " << vocabf.size() << " types\n"; + cerr << "f-Corpus size: " << corpuse.size() << " sentences\n"; + cerr << "f-Vocabulary size: " << vocabe.size() << " types\n"; + assert(corpusf.size() == corpuse.size()); + + vector corpus(corpuse.size()); + for (unsigned i = 0; i < corpuse.size(); ++i) { + corpus[i].src.swap(corpusf[i]); + corpus[i].trg.swap(corpuse[i]); + corpus[i].posterior.resize(corpus[i].src.size() + 1, corpus[i].trg.size()); + } + corpusf.clear(); corpuse.clear(); + + vocabf.insert(TD::Convert("NULL")); + vector > letters(TD::NumWords()); + set letset; + ExtractLetters(vocabe, &letters, &letset); + ExtractLetters(vocabf, &letters, NULL); + letters[TD::Convert("NULL")].clear(); + + Transliterations tl; + + // TODO CONFIGURE THIS + int min_trans_src = 4; + + cerr << "Initializing transliteration DPs ...\n"; + for (int i = 0; i < corpus.size(); ++i) { + const vector& src = corpus[i].src; + const vector& trg = corpus[i].trg; + cerr << '.' << flush; + if (i % 80 == 79) cerr << endl; + for (int j = 0; j < src.size(); ++j) { + const vector& src_let = letters[src[j]]; + for (int k = 0; k < trg.size(); ++k) { + const vector& trg_let = letters[trg[k]]; + if (src_let.size() < min_trans_src) + tl.Forbid(src[j], trg[k]); + else + tl.Initialize(src[j], src_let, trg[k], trg_let); + } + } + } + cerr << endl; + tl.GraphSummary(); + + return 0; +} diff --git a/gi/pf/conditional_pseg.h b/gi/pf/conditional_pseg.h index 8202778b..81ddb206 100644 --- a/gi/pf/conditional_pseg.h +++ b/gi/pf/conditional_pseg.h @@ -56,6 +56,12 @@ struct MConditionalTranslationModel { }; void ResampleHyperparameters(MT19937* rng) { + typename std::tr1::unordered_map, MFCR<1,TRule>, boost::hash > >::iterator it; +#if 1 + for (it = r.begin(); it != r.end(); ++it) { + it->second.resample_hyperparameters(rng); + } +#else const unsigned nloop = 5; const unsigned niterations = 10; DiscountResampler dr(*this); @@ -70,12 +76,12 @@ struct MConditionalTranslationModel { } strength = slice_sampler1d(ar, strength, *rng, -d, std::numeric_limits::infinity(), 0.0, niterations, 100*niterations); - typename std::tr1::unordered_map, MFCR<1,TRule>, boost::hash > >::iterator it; std::cerr << "MConditionalTranslationModel(d=" << d << ",s=" << strength << ") = " << log_likelihood(d, strength) << std::endl; for (it = r.begin(); it != r.end(); ++it) { it->second.set_discount(d); it->second.set_strength(strength); } +#endif } int DecrementRule(const TRule& rule, MT19937* rng) { @@ -91,7 +97,8 @@ struct MConditionalTranslationModel { int IncrementRule(const TRule& rule, MT19937* rng) { RuleModelHash::iterator it = r.find(rule.f_); if (it == r.end()) { - it = r.insert(make_pair(rule.f_, MFCR<1,TRule>(d, strength))).first; + //it = r.insert(make_pair(rule.f_, MFCR<1,TRule>(d, strength))).first; + it = r.insert(make_pair(rule.f_, MFCR<1,TRule>(1,1,1,1,0.6, -0.12))).first; } p0s[0] = rp0(rule); TableCount delta = it->second.increment(rule, p0s.begin(), lambdas.begin(), rng); diff --git a/gi/pf/nuisance_test.cc b/gi/pf/nuisance_test.cc new file mode 100644 index 00000000..0f44fe95 --- /dev/null +++ b/gi/pf/nuisance_test.cc @@ -0,0 +1,161 @@ +#include "ccrp.h" + +#include +#include + +#include "tdict.h" +#include "transliterations.h" + +using namespace std; + +MT19937 rng; + +ostream& operator<<(ostream&os, const vector& v) { + os << '[' << v[0]; + if (v.size() == 2) os << ' ' << v[1]; + return os << ']'; +} + +struct Base { + Base() : llh(), v(2), v1(1), v2(1), crp(0.25, 0.5) {} + inline double p0(const vector& x) const { + double p = 0.75; + if (x.size() == 2) p = 0.25; + p *= 1.0 / 3.0; + if (x.size() == 2) p *= 1.0 / 3.0; + return p; + } + double est_deriv_prob(int a, int b, int seg) const { + assert(a > 0 && a < 4); // a \in {1,2,3} + assert(b > 0 && b < 4); // b \in {1,2,3} + assert(seg == 0 || seg == 1); // seg \in {0,1} + if (seg == 0) { + v[0] = a; + v[1] = b; + return crp.prob(v, p0(v)); + } else { + v1[0] = a; + v2[0] = b; + return crp.prob(v1, p0(v1)) * crp.prob(v2, p0(v2)); + } + } + double est_marginal_prob(int a, int b) const { + return est_deriv_prob(a,b,0) + est_deriv_prob(a,b,1); + } + int increment(int a, int b, double* pw = NULL) { + double p1 = est_deriv_prob(a, b, 0); + double p2 = est_deriv_prob(a, b, 1); + //p1 = 0.5; p2 = 0.5; + int seg = rng.SelectSample(p1,p2); + double tmp = 0; + if (!pw) pw = &tmp; + double& w = *pw; + if (seg == 0) { + v[0] = a; + v[1] = b; + w = crp.prob(v, p0(v)) / p1; + if (crp.increment(v, p0(v), &rng)) { + llh += log(p0(v)); + } + } else { + v1[0] = a; + w = crp.prob(v1, p0(v1)) / p2; + if (crp.increment(v1, p0(v1), &rng)) { + llh += log(p0(v1)); + } + v2[0] = b; + w *= crp.prob(v2, p0(v2)); + if (crp.increment(v2, p0(v2), &rng)) { + llh += log(p0(v2)); + } + } + return seg; + } + void increment(int a, int b, int seg) { + if (seg == 0) { + v[0] = a; + v[1] = b; + if (crp.increment(v, p0(v), &rng)) { + llh += log(p0(v)); + } + } else { + v1[0] = a; + if (crp.increment(v1, p0(v1), &rng)) { + llh += log(p0(v1)); + } + v2[0] = b; + if (crp.increment(v2, p0(v2), &rng)) { + llh += log(p0(v2)); + } + } + } + void decrement(int a, int b, int seg) { + if (seg == 0) { + v[0] = a; + v[1] = b; + if (crp.decrement(v, &rng)) { + llh -= log(p0(v)); + } + } else { + v1[0] = a; + if (crp.decrement(v1, &rng)) { + llh -= log(p0(v1)); + } + v2[0] = b; + if (crp.decrement(v2, &rng)) { + llh -= log(p0(v2)); + } + } + } + double log_likelihood() const { + return llh + crp.log_crp_prob(); + } + double llh; + mutable vector v, v1, v2; + CCRP > crp; +}; + +int main(int argc, char** argv) { + double tl = 0; + const int ITERS = 1000; + const int PARTICLES = 20; + const int DATAPOINTS = 50; + WordID x = TD::Convert("souvenons"); + WordID y = TD::Convert("remember"); + vector src; TD::ConvertSentence("s o u v e n o n s", &src); + vector trg; TD::ConvertSentence("r e m e m b e r", &trg); + Transliterations xx; + xx.Initialize(x, src, y, trg); + return 1; + + for (int j = 0; j < ITERS; ++j) { + Base b; + vector segs(DATAPOINTS); + SampleSet ss; + vector sss; + for (int i = 0; i < DATAPOINTS; i++) { + ss.clear(); + sss.clear(); + int x = ((i / 10) % 3) + 1; + int y = (i % 3) + 1; + //double ep = b.est_marginal_prob(x,y); + //cerr << "est p(" << x << "," << y << ") = " << ep << endl; + for (int n = 0; n < PARTICLES; ++n) { + double w; + int seg = b.increment(x,y,&w); + //cerr << seg << " w=" << w << endl; + ss.add(w); + sss.push_back(seg); + b.decrement(x,y,seg); + } + int seg = sss[rng.SelectSample(ss)]; + b.increment(x, y, seg); + //cerr << "Selected: " << seg << endl; + //return 1; + segs[i] = seg; + } + tl += b.log_likelihood(); + } + cerr << "LLH=" << tl / ITERS << endl; +} + diff --git a/gi/pf/transliterations.cc b/gi/pf/transliterations.cc new file mode 100644 index 00000000..6e0c2e93 --- /dev/null +++ b/gi/pf/transliterations.cc @@ -0,0 +1,193 @@ +#include "transliterations.h" + +#include +#include +#include + +#include "grammar.h" +#include "bottom_up_parser.h" +#include "hg.h" +#include "hg_intersect.h" +#include "filelib.h" +#include "ccrp.h" +#include "m.h" +#include "lattice.h" +#include "verbose.h" + +using namespace std; +using namespace std::tr1; + +static WordID kX; +static int kMAX_SRC_SIZE = 0; +static vector > cur_trg_chunks; + +vector tlttofreelist; + +static void InitTargetChunks(int max_size, const vector& trg) { + cur_trg_chunks.clear(); + vector tmp; + unordered_set, boost::hash > > u; + for (int len = 1; len <= max_size; ++len) { + int end = trg.size() + 1; + end -= len; + for (int i = 0; i < end; ++i) { + tmp.clear(); + for (int j = 0; j < len; ++j) + tmp.push_back(trg[i + j]); + if (u.insert(tmp).second) cur_trg_chunks.push_back(tmp); + } + } +} + +struct TransliterationGrammarIter : public GrammarIter, public RuleBin { + TransliterationGrammarIter() { tlttofreelist.push_back(this); } + TransliterationGrammarIter(const TRulePtr& inr, int symbol) { + if (inr) { + r.reset(new TRule(*inr)); + } else { + r.reset(new TRule); + } + TRule& rr = *r; + rr.lhs_ = kX; + rr.f_.push_back(symbol); + tlttofreelist.push_back(this); + } + virtual int GetNumRules() const { + if (!r) return 0; + return cur_trg_chunks.size(); + } + virtual TRulePtr GetIthRule(int i) const { + TRulePtr nr(new TRule(*r)); + nr->e_ = cur_trg_chunks[i]; + //cerr << nr->AsString() << endl; + return nr; + } + virtual int Arity() const { + return 0; + } + virtual const RuleBin* GetRules() const { + if (!r) return NULL; else return this; + } + virtual const GrammarIter* Extend(int symbol) const { + if (symbol <= 0) return NULL; + if (!r || !kMAX_SRC_SIZE || r->f_.size() < kMAX_SRC_SIZE) + return new TransliterationGrammarIter(r, symbol); + else + return NULL; + } + TRulePtr r; +}; + +struct TransliterationGrammar : public Grammar { + virtual const GrammarIter* GetRoot() const { + return new TransliterationGrammarIter; + } + virtual bool HasRuleForSpan(int, int, int distance) const { + return (distance < kMAX_SRC_SIZE); + } +}; + +struct TInfo { + TInfo() : initialized(false) {} + bool initialized; + Hypergraph lattice; // may be empty if transliteration is not possible + prob_t est_prob; // will be zero if not possible +}; + +struct TransliterationsImpl { + TransliterationsImpl() { + kX = TD::Convert("X")*-1; + kMAX_SRC_SIZE = 4; + grammars.push_back(GrammarPtr(new TransliterationGrammar)); + grammars.push_back(GrammarPtr(new GlueGrammar("S", "X"))); + SetSilent(true); + } + + void Initialize(WordID src, const vector& src_lets, WordID trg, const vector& trg_lets) { + if (src >= graphs.size()) graphs.resize(src + 1); + if (graphs[src][trg].initialized) return; + int kMAX_TRG_SIZE = 4; + InitTargetChunks(kMAX_TRG_SIZE, trg_lets); + ExhaustiveBottomUpParser parser("S", grammars); + Lattice lat(src_lets.size()), tlat(trg_lets.size()); + for (unsigned i = 0; i < src_lets.size(); ++i) + lat[i].push_back(LatticeArc(src_lets[i], 0.0, 1)); + for (unsigned i = 0; i < trg_lets.size(); ++i) + tlat[i].push_back(LatticeArc(trg_lets[i], 0.0, 1)); + //cerr << "Creating lattice for: " << TD::Convert(src) << " --> " << TD::Convert(trg) << endl; + //cerr << "'" << TD::GetString(src_lets) << "' --> " << TD::GetString(trg_lets) << endl; + if (!parser.Parse(lat, &graphs[src][trg].lattice)) { + //cerr << "Failed to parse " << TD::GetString(src_lets) << endl; + abort(); + } + if (HG::Intersect(tlat, &graphs[src][trg].lattice)) { + graphs[src][trg].est_prob = prob_t(1e-4); + } else { + graphs[src][trg].lattice.clear(); + //cerr << "Failed to intersect " << TD::GetString(src_lets) << " ||| " << TD::GetString(trg_lets) << endl; + graphs[src][trg].est_prob = prob_t::Zero(); + } + for (unsigned i = 0; i < tlttofreelist.size(); ++i) + delete tlttofreelist[i]; + tlttofreelist.clear(); + //cerr << "Number of paths: " << graphs[src][trg].lattice.NumberOfPaths() << endl; + graphs[src][trg].initialized = true; + } + + const prob_t& EstimateProbability(WordID src, WordID trg) const { + assert(src < graphs.size()); + const unordered_map& um = graphs[src]; + const unordered_map::const_iterator it = um.find(trg); + assert(it != um.end()); + assert(it->second.initialized); + return it->second.est_prob; + } + + void Forbid(WordID src, WordID trg) { + if (src >= graphs.size()) graphs.resize(src + 1); + graphs[src][trg].est_prob = prob_t::Zero(); + graphs[src][trg].initialized = true; + } + + void GraphSummary() const { + double tlp = 0; + int tt = 0; + for (int i = 0; i < graphs.size(); ++i) { + const unordered_map& um = graphs[i]; + unordered_map::const_iterator it; + for (it = um.begin(); it != um.end(); ++it) { + if (it->second.lattice.empty()) continue; + //cerr << TD::Convert(i) << " --> " << TD::Convert(it->first) << ": " << it->second.lattice.NumberOfPaths() << endl; + tlp += log(it->second.lattice.NumberOfPaths()); + tt++; + } + } + tlp /= tt; + cerr << "E[log paths] = " << tlp << endl; + cerr << "exp(E[log paths]) = " << exp(tlp) << endl; + } + + vector > graphs; + vector grammars; +}; + +Transliterations::Transliterations() : pimpl_(new TransliterationsImpl) {} +Transliterations::~Transliterations() { delete pimpl_; } + +void Transliterations::Initialize(WordID src, const vector& src_lets, WordID trg, const vector& trg_lets) { + pimpl_->Initialize(src, src_lets, trg, trg_lets); +} + +prob_t Transliterations::EstimateProbability(WordID src, WordID trg) const { + return pimpl_->EstimateProbability(src,trg); +} + +void Transliterations::Forbid(WordID src, WordID trg) { + pimpl_->Forbid(src, trg); +} + +void Transliterations::GraphSummary() const { + pimpl_->GraphSummary(); +} + + diff --git a/gi/pf/transliterations.h b/gi/pf/transliterations.h new file mode 100644 index 00000000..a548aacf --- /dev/null +++ b/gi/pf/transliterations.h @@ -0,0 +1,20 @@ +#ifndef _TRANSLITERATIONS_H_ +#define _TRANSLITERATIONS_H_ + +#include +#include "wordid.h" +#include "prob.h" + +struct TransliterationsImpl; +struct Transliterations { + explicit Transliterations(); + ~Transliterations(); + void Initialize(WordID src, const std::vector& src_lets, WordID trg, const std::vector& trg_lets); + void Forbid(WordID src, WordID trg); + void GraphSummary() const; + prob_t EstimateProbability(WordID src, WordID trg) const; + TransliterationsImpl* pimpl_; +}; + +#endif + -- cgit v1.2.3 From e2998fc79c9dd549b1c1bad537fdf1052276f82c Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Thu, 8 Mar 2012 01:46:32 -0500 Subject: simple context feature for tagger --- decoder/Makefile.am | 1 + decoder/cdec_ff.cc | 2 + decoder/ff_context.cc | 99 +++++++++++++++++++++++ decoder/ff_context.h | 23 ++++++ gi/pf/align-tl.cc | 6 +- gi/pf/reachability.cc | 2 + gi/pf/reachability.h | 6 +- gi/pf/transliterations.cc | 198 ++++++++++++++-------------------------------- gi/pf/transliterations.h | 5 +- 9 files changed, 194 insertions(+), 148 deletions(-) create mode 100644 decoder/ff_context.cc create mode 100644 decoder/ff_context.h (limited to 'gi/pf') diff --git a/decoder/Makefile.am b/decoder/Makefile.am index 30eaf04d..a00b18af 100644 --- a/decoder/Makefile.am +++ b/decoder/Makefile.am @@ -63,6 +63,7 @@ libcdec_a_SOURCES = \ ff.cc \ ff_rules.cc \ ff_wordset.cc \ + ff_context.cc \ ff_charset.cc \ ff_lm.cc \ ff_klm.cc \ diff --git a/decoder/cdec_ff.cc b/decoder/cdec_ff.cc index 4ce5749e..b516c386 100644 --- a/decoder/cdec_ff.cc +++ b/decoder/cdec_ff.cc @@ -1,6 +1,7 @@ #include #include "ff.h" +#include "ff_context.h" #include "ff_spans.h" #include "ff_lm.h" #include "ff_klm.h" @@ -42,6 +43,7 @@ void register_feature_functions() { #endif ff_registry.Register("SpanFeatures", new FFFactory()); ff_registry.Register("NgramFeatures", new FFFactory()); + ff_registry.Register("RuleContextFeatures", new FFFactory()); ff_registry.Register("RuleIdentityFeatures", new FFFactory()); ff_registry.Register("SourceSyntaxFeatures", new FFFactory); ff_registry.Register("SourceSpanSizeFeatures", new FFFactory); diff --git a/decoder/ff_context.cc b/decoder/ff_context.cc new file mode 100644 index 00000000..19f9a413 --- /dev/null +++ b/decoder/ff_context.cc @@ -0,0 +1,99 @@ +#include "ff_context.h" + +#include +#include +#include + +#include "filelib.h" +#include "stringlib.h" +#include "sentence_metadata.h" +#include "lattice.h" +#include "fdict.h" +#include "verbose.h" + +using namespace std; + +namespace { + string Escape(const string& x) { + string y = x; + for (int i = 0; i < y.size(); ++i) { + if (y[i] == '=') y[i]='_'; + if (y[i] == ';') y[i]='_'; + } + return y; + } +} + +RuleContextFeatures::RuleContextFeatures(const std::string& param) { + kSOS = TD::Convert(""); + kEOS = TD::Convert(""); + + // TODO param lets you pass in a string from the cdec.ini file +} + +void RuleContextFeatures::PrepareForInput(const SentenceMetadata& smeta) { + const Lattice& sl = smeta.GetSourceLattice(); + current_input.resize(sl.size()); + for (unsigned i = 0; i < sl.size(); ++i) { + if (sl[i].size() != 1) { + cerr << "Context features not supported with lattice inputs!\nid=" << smeta.GetSentenceId() << endl; + abort(); + } + current_input[i] = sl[i][0].label; + } +} + +void RuleContextFeatures::TraversalFeaturesImpl(const SentenceMetadata& smeta, + const Hypergraph::Edge& edge, + const vector& ant_contexts, + SparseVector* features, + SparseVector* estimated_features, + void* context) const { + const TRule& rule = *edge.rule_; + + if (rule.Arity() != 0 || // arity = 0, no nonterminals + rule.e_.size() != 1) return; // size = 1, predicted label is a single token + + + // you can see the current label "for free" + const WordID cur_label = rule.e_[0]; + // (if you want to see more labels, you have to be very careful, and muck + // about with contexts and ant_contexts) + + // but... you can look at as much of the source as you want! + const int from_src_index = edge.i_; // start of the span in the input being labeled + const int to_src_index = edge.j_; // end of the span in the input + // (note: in the case of tagging the size of the spans being labeled will + // always be 1, but in other formalisms, you can have bigger spans.) + + // this is the current token being labeled: + const WordID cur_input = current_input[from_src_index]; + + // let's get the previous token in the input (may be to the left of the start + // of the sentence!) + WordID prev_input = kSOS; + if (from_src_index > 0) { prev_input = current_input[from_src_index - 1]; } + // let's get the next token (may be to the left of the start of the sentence!) + WordID next_input = kEOS; + if (to_src_index < current_input.size()) { next_input = current_input[to_src_index]; } + + // now, build a feature string + ostringstream os; + // TD::Convert converts from the internal integer representation of a token + // to the actual token + os << "C1:" << TD::Convert(prev_input) << '_' + << TD::Convert(cur_input) << '|' << TD::Convert(cur_label); + // C1 is just to prevent a name clash + + // pick a value + double fval = 1.0; // can be any real value + + // add it to the feature vector FD::Convert converts the feature string to a + // feature int, Escape makes sure the feature string doesn't have any bad + // symbols that could confuse a parser somewhere + features->add_value(FD::Convert(Escape(os.str())), fval); + // that's it! + + // create more features if you like... +} + diff --git a/decoder/ff_context.h b/decoder/ff_context.h new file mode 100644 index 00000000..0d22b027 --- /dev/null +++ b/decoder/ff_context.h @@ -0,0 +1,23 @@ +#ifndef _FF_CONTEXT_H_ +#define _FF_CONTEXT_H_ + +#include +#include "ff.h" + +class RuleContextFeatures : public FeatureFunction { + public: + RuleContextFeatures(const std::string& param); + protected: + virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta, + const Hypergraph::Edge& edge, + const std::vector& ant_contexts, + SparseVector* features, + SparseVector* estimated_features, + void* context) const; + virtual void PrepareForInput(const SentenceMetadata& smeta); + private: + std::vector current_input; + WordID kSOS, kEOS; +}; + +#endif diff --git a/gi/pf/align-tl.cc b/gi/pf/align-tl.cc index 0e0454e5..6bb8c886 100644 --- a/gi/pf/align-tl.cc +++ b/gi/pf/align-tl.cc @@ -310,18 +310,16 @@ int main(int argc, char** argv) { // TODO CONFIGURE THIS int min_trans_src = 4; - cerr << "Initializing transliteration DPs ...\n"; + cerr << "Initializing transliteration graph structures ...\n"; for (int i = 0; i < corpus.size(); ++i) { const vector& src = corpus[i].src; const vector& trg = corpus[i].trg; - cerr << '.' << flush; - if (i % 80 == 79) cerr << endl; for (int j = 0; j < src.size(); ++j) { const vector& src_let = letters[src[j]]; for (int k = 0; k < trg.size(); ++k) { const vector& trg_let = letters[trg[k]]; if (src_let.size() < min_trans_src) - tl.Forbid(src[j], trg[k]); + tl.Forbid(src[j], src_let, trg[k], trg_let); else tl.Initialize(src[j], src_let, trg[k], trg_let); } diff --git a/gi/pf/reachability.cc b/gi/pf/reachability.cc index 73dd8d39..70fb76da 100644 --- a/gi/pf/reachability.cc +++ b/gi/pf/reachability.cc @@ -47,6 +47,7 @@ void Reachability::ComputeReachability(int srclen, int trglen, int src_max_phras r[prevs[k].prev_src_covered][prevs[k].prev_trg_covered] = true; int src_delta = i - prevs[k].prev_src_covered; edges[prevs[k].prev_src_covered][prevs[k].prev_trg_covered][src_delta][j - prevs[k].prev_trg_covered] = true; + valid_deltas[prevs[k].prev_src_covered][prevs[k].prev_trg_covered].push_back(make_pair(src_delta,j - prevs[k].prev_trg_covered)); short &msd = max_src_delta[prevs[k].prev_src_covered][prevs[k].prev_trg_covered]; if (src_delta > msd) msd = src_delta; } @@ -56,6 +57,7 @@ void Reachability::ComputeReachability(int srclen, int trglen, int src_max_phras assert(!edges[0][0][0][1]); assert(!edges[0][0][0][0]); assert(max_src_delta[0][0] > 0); + cerr << "Sentence with length (" << srclen << ',' << trglen << ") has " << valid_deltas[0][0].size() << " out edges in its root node\n"; //cerr << "First cell contains " << b[0][0].size() << " forward pointers\n"; //for (int i = 0; i < b[0][0].size(); ++i) { // cerr << " -> (" << b[0][0][i].next_src_covered << "," << b[0][0][i].next_trg_covered << ")\n"; diff --git a/gi/pf/reachability.h b/gi/pf/reachability.h index 98450ec1..fb2f4965 100644 --- a/gi/pf/reachability.h +++ b/gi/pf/reachability.h @@ -12,12 +12,14 @@ // currently forbids 0 -> n and n -> 0 alignments struct Reachability { - boost::multi_array edges; // edges[src_covered][trg_covered][x][trg_delta] is this edge worth exploring? + boost::multi_array edges; // edges[src_covered][trg_covered][src_delta][trg_delta] is this edge worth exploring? boost::multi_array max_src_delta; // msd[src_covered][trg_covered] -- the largest src delta that's valid + boost::multi_array >, 2> valid_deltas; // valid_deltas[src_covered][trg_covered] list of valid transitions leaving a particular node Reachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len) : edges(boost::extents[srclen][trglen][src_max_phrase_len+1][trg_max_phrase_len+1]), - max_src_delta(boost::extents[srclen][trglen]) { + max_src_delta(boost::extents[srclen][trglen]), + valid_deltas(boost::extents[srclen][trglen]) { ComputeReachability(srclen, trglen, src_max_phrase_len, trg_max_phrase_len); } diff --git a/gi/pf/transliterations.cc b/gi/pf/transliterations.cc index 6e0c2e93..e29334fd 100644 --- a/gi/pf/transliterations.cc +++ b/gi/pf/transliterations.cc @@ -2,173 +2,92 @@ #include #include -#include -#include "grammar.h" -#include "bottom_up_parser.h" -#include "hg.h" -#include "hg_intersect.h" +#include "boost/shared_ptr.hpp" + #include "filelib.h" #include "ccrp.h" #include "m.h" -#include "lattice.h" -#include "verbose.h" +#include "reachability.h" using namespace std; using namespace std::tr1; -static WordID kX; -static int kMAX_SRC_SIZE = 0; -static vector > cur_trg_chunks; - -vector tlttofreelist; - -static void InitTargetChunks(int max_size, const vector& trg) { - cur_trg_chunks.clear(); - vector tmp; - unordered_set, boost::hash > > u; - for (int len = 1; len <= max_size; ++len) { - int end = trg.size() + 1; - end -= len; - for (int i = 0; i < end; ++i) { - tmp.clear(); - for (int j = 0; j < len; ++j) - tmp.push_back(trg[i + j]); - if (u.insert(tmp).second) cur_trg_chunks.push_back(tmp); - } - } -} - -struct TransliterationGrammarIter : public GrammarIter, public RuleBin { - TransliterationGrammarIter() { tlttofreelist.push_back(this); } - TransliterationGrammarIter(const TRulePtr& inr, int symbol) { - if (inr) { - r.reset(new TRule(*inr)); - } else { - r.reset(new TRule); - } - TRule& rr = *r; - rr.lhs_ = kX; - rr.f_.push_back(symbol); - tlttofreelist.push_back(this); - } - virtual int GetNumRules() const { - if (!r) return 0; - return cur_trg_chunks.size(); - } - virtual TRulePtr GetIthRule(int i) const { - TRulePtr nr(new TRule(*r)); - nr->e_ = cur_trg_chunks[i]; - //cerr << nr->AsString() << endl; - return nr; - } - virtual int Arity() const { - return 0; - } - virtual const RuleBin* GetRules() const { - if (!r) return NULL; else return this; - } - virtual const GrammarIter* Extend(int symbol) const { - if (symbol <= 0) return NULL; - if (!r || !kMAX_SRC_SIZE || r->f_.size() < kMAX_SRC_SIZE) - return new TransliterationGrammarIter(r, symbol); - else - return NULL; - } - TRulePtr r; -}; - -struct TransliterationGrammar : public Grammar { - virtual const GrammarIter* GetRoot() const { - return new TransliterationGrammarIter; - } - virtual bool HasRuleForSpan(int, int, int distance) const { - return (distance < kMAX_SRC_SIZE); - } -}; - -struct TInfo { - TInfo() : initialized(false) {} +struct GraphStructure { + GraphStructure() : initialized(false) {} + boost::shared_ptr r; bool initialized; - Hypergraph lattice; // may be empty if transliteration is not possible - prob_t est_prob; // will be zero if not possible }; struct TransliterationsImpl { TransliterationsImpl() { - kX = TD::Convert("X")*-1; - kMAX_SRC_SIZE = 4; - grammars.push_back(GrammarPtr(new TransliterationGrammar)); - grammars.push_back(GrammarPtr(new GlueGrammar("S", "X"))); - SetSilent(true); } void Initialize(WordID src, const vector& src_lets, WordID trg, const vector& trg_lets) { - if (src >= graphs.size()) graphs.resize(src + 1); - if (graphs[src][trg].initialized) return; - int kMAX_TRG_SIZE = 4; - InitTargetChunks(kMAX_TRG_SIZE, trg_lets); - ExhaustiveBottomUpParser parser("S", grammars); - Lattice lat(src_lets.size()), tlat(trg_lets.size()); - for (unsigned i = 0; i < src_lets.size(); ++i) - lat[i].push_back(LatticeArc(src_lets[i], 0.0, 1)); - for (unsigned i = 0; i < trg_lets.size(); ++i) - tlat[i].push_back(LatticeArc(trg_lets[i], 0.0, 1)); - //cerr << "Creating lattice for: " << TD::Convert(src) << " --> " << TD::Convert(trg) << endl; - //cerr << "'" << TD::GetString(src_lets) << "' --> " << TD::GetString(trg_lets) << endl; - if (!parser.Parse(lat, &graphs[src][trg].lattice)) { - //cerr << "Failed to parse " << TD::GetString(src_lets) << endl; - abort(); - } - if (HG::Intersect(tlat, &graphs[src][trg].lattice)) { - graphs[src][trg].est_prob = prob_t(1e-4); + const size_t src_len = src_lets.size(); + const size_t trg_len = trg_lets.size(); + if (src_len >= graphs.size()) graphs.resize(src_len + 1); + if (trg_len >= graphs[src_len].size()) graphs[src_len].resize(trg_len + 1); + if (graphs[src_len][trg_len].initialized) return; + graphs[src_len][trg_len].r.reset(new Reachability(src_len, trg_len, 4, 4)); + +#if 0 + if (HG::Intersect(tlat, &hg)) { + // TODO } else { - graphs[src][trg].lattice.clear(); - //cerr << "Failed to intersect " << TD::GetString(src_lets) << " ||| " << TD::GetString(trg_lets) << endl; - graphs[src][trg].est_prob = prob_t::Zero(); + cerr << "No transliteration lattice possible for src_len=" << src_len << " trg_len=" << trg_len << endl; + hg.clear(); } - for (unsigned i = 0; i < tlttofreelist.size(); ++i) - delete tlttofreelist[i]; - tlttofreelist.clear(); //cerr << "Number of paths: " << graphs[src][trg].lattice.NumberOfPaths() << endl; - graphs[src][trg].initialized = true; +#endif + graphs[src_len][trg_len].initialized = true; } - const prob_t& EstimateProbability(WordID src, WordID trg) const { - assert(src < graphs.size()); - const unordered_map& um = graphs[src]; - const unordered_map::const_iterator it = um.find(trg); - assert(it != um.end()); - assert(it->second.initialized); - return it->second.est_prob; + void Forbid(WordID src, const vector& src_lets, WordID trg, const vector& trg_lets) { + const size_t src_len = src_lets.size(); + const size_t trg_len = trg_lets.size(); + if (src_len >= graphs.size()) graphs.resize(src_len + 1); + if (trg_len >= graphs[src_len].size()) graphs[src_len].resize(trg_len + 1); + graphs[src_len][trg_len].r.reset(); + graphs[src_len][trg_len].initialized = true; } - void Forbid(WordID src, WordID trg) { - if (src >= graphs.size()) graphs.resize(src + 1); - graphs[src][trg].est_prob = prob_t::Zero(); - graphs[src][trg].initialized = true; + prob_t EstimateProbability(WordID s, const vector& src, WordID t, const vector& trg) const { + assert(src.size() < graphs.size()); + const vector& tv = graphs[src.size()]; + assert(trg.size() < tv.size()); + const GraphStructure& gs = tv[trg.size()]; + // TODO: do prob + return prob_t::Zero(); } void GraphSummary() const { - double tlp = 0; - int tt = 0; + double to = 0; + double tn = 0; + double tt = 0; for (int i = 0; i < graphs.size(); ++i) { - const unordered_map& um = graphs[i]; - unordered_map::const_iterator it; - for (it = um.begin(); it != um.end(); ++it) { - if (it->second.lattice.empty()) continue; - //cerr << TD::Convert(i) << " --> " << TD::Convert(it->first) << ": " << it->second.lattice.NumberOfPaths() << endl; - tlp += log(it->second.lattice.NumberOfPaths()); + const vector& vt = graphs[i]; + for (int j = 0; j < vt.size(); ++j) { + const GraphStructure& gs = vt[j]; + if (!gs.r) continue; tt++; + for (int k = 0; k < i; ++k) { + for (int l = 0; l < j; ++l) { + size_t c = gs.r->valid_deltas[k][l].size(); + if (c) { + tn += 1; + to += c; + } + } + } } } - tlp /= tt; - cerr << "E[log paths] = " << tlp << endl; - cerr << "exp(E[log paths]) = " << exp(tlp) << endl; + cerr << " Average nodes = " << (tn / tt) << endl; + cerr << "Average out-degree = " << (to / tn) << endl; + cerr << " Unique structures = " << tt << endl; } - vector > graphs; - vector grammars; + vector > graphs; // graphs[src_len][trg_len] }; Transliterations::Transliterations() : pimpl_(new TransliterationsImpl) {} @@ -178,16 +97,15 @@ void Transliterations::Initialize(WordID src, const vector& src_lets, Wo pimpl_->Initialize(src, src_lets, trg, trg_lets); } -prob_t Transliterations::EstimateProbability(WordID src, WordID trg) const { - return pimpl_->EstimateProbability(src,trg); +prob_t Transliterations::EstimateProbability(WordID s, const vector& src, WordID t, const vector& trg) const { + return pimpl_->EstimateProbability(s, src,t, trg); } -void Transliterations::Forbid(WordID src, WordID trg) { - pimpl_->Forbid(src, trg); +void Transliterations::Forbid(WordID src, const vector& src_lets, WordID trg, const vector& trg_lets) { + pimpl_->Forbid(src, src_lets, trg, trg_lets); } void Transliterations::GraphSummary() const { pimpl_->GraphSummary(); } - diff --git a/gi/pf/transliterations.h b/gi/pf/transliterations.h index a548aacf..76eb2a05 100644 --- a/gi/pf/transliterations.h +++ b/gi/pf/transliterations.h @@ -10,9 +10,10 @@ struct Transliterations { explicit Transliterations(); ~Transliterations(); void Initialize(WordID src, const std::vector& src_lets, WordID trg, const std::vector& trg_lets); - void Forbid(WordID src, WordID trg); + void Forbid(WordID src, const std::vector& src_lets, WordID trg, const std::vector& trg_lets); void GraphSummary() const; - prob_t EstimateProbability(WordID src, WordID trg) const; + prob_t EstimateProbability(WordID s, const std::vector& src, WordID t, const std::vector& trg) const; + private: TransliterationsImpl* pimpl_; }; -- cgit v1.2.3 From 9a8256604686a9283e7afce04e6feaab4922dd45 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Thu, 8 Mar 2012 13:32:41 -0500 Subject: tl stuff --- gi/pf/Makefile.am | 8 +++-- gi/pf/align-tl.cc | 8 +++-- gi/pf/reachability.cc | 17 +++++++--- gi/pf/reachability.h | 4 +++ gi/pf/transliterations.cc | 82 ++++++++++++++++++++++++++++++++++------------- gi/pf/transliterations.h | 3 +- 6 files changed, 88 insertions(+), 34 deletions(-) (limited to 'gi/pf') diff --git a/gi/pf/Makefile.am b/gi/pf/Makefile.am index 5e89f02a..9888a70b 100644 --- a/gi/pf/Makefile.am +++ b/gi/pf/Makefile.am @@ -2,15 +2,17 @@ bin_PROGRAMS = cbgi brat dpnaive pfbrat pfdist itg pfnaive condnaive align-lexon noinst_LIBRARIES = libpf.a -libpf_a_SOURCES = base_distributions.cc reachability.cc cfg_wfst_composer.cc corpus.cc unigrams.cc ngram_base.cc +libpf_a_SOURCES = base_distributions.cc reachability.cc cfg_wfst_composer.cc corpus.cc unigrams.cc ngram_base.cc transliterations.cc -nuisance_test_SOURCES = nuisance_test.cc transliterations.cc +nuisance_test_SOURCES = nuisance_test.cc +nuisance_test_LDADD = libpf.a align_lexonly_SOURCES = align-lexonly.cc align_lexonly_pyp_SOURCES = align-lexonly-pyp.cc -align_tl_SOURCES = align-tl.cc transliterations.cc +align_tl_SOURCES = align-tl.cc +align_tl_LDADD = libpf.a itg_SOURCES = itg.cc diff --git a/gi/pf/align-tl.cc b/gi/pf/align-tl.cc index 6bb8c886..fe8950b5 100644 --- a/gi/pf/align-tl.cc +++ b/gi/pf/align-tl.cc @@ -305,7 +305,10 @@ int main(int argc, char** argv) { ExtractLetters(vocabf, &letters, NULL); letters[TD::Convert("NULL")].clear(); - Transliterations tl; + // TODO configure this + int max_src_chunk = 4; + int max_trg_chunk = 4; + Transliterations tl(max_src_chunk, max_trg_chunk); // TODO CONFIGURE THIS int min_trans_src = 4; @@ -318,10 +321,9 @@ int main(int argc, char** argv) { const vector& src_let = letters[src[j]]; for (int k = 0; k < trg.size(); ++k) { const vector& trg_let = letters[trg[k]]; + tl.Initialize(src[j], src_let, trg[k], trg_let); if (src_let.size() < min_trans_src) tl.Forbid(src[j], src_let, trg[k], trg_let); - else - tl.Initialize(src[j], src_let, trg[k], trg_let); } } } diff --git a/gi/pf/reachability.cc b/gi/pf/reachability.cc index 70fb76da..59bc6ace 100644 --- a/gi/pf/reachability.cc +++ b/gi/pf/reachability.cc @@ -39,6 +39,7 @@ void Reachability::ComputeReachability(int srclen, int trglen, int src_max_phras typedef boost::multi_array rarray_type; rarray_type r(boost::extents[srclen + 1][trglen + 1]); r[srclen][trglen] = true; + nodes = 0; for (int i = srclen; i >= 0; --i) { for (int j = trglen; j >= 0; --j) { vector& prevs = a[i][j]; @@ -57,10 +58,16 @@ void Reachability::ComputeReachability(int srclen, int trglen, int src_max_phras assert(!edges[0][0][0][1]); assert(!edges[0][0][0][0]); assert(max_src_delta[0][0] > 0); - cerr << "Sentence with length (" << srclen << ',' << trglen << ") has " << valid_deltas[0][0].size() << " out edges in its root node\n"; - //cerr << "First cell contains " << b[0][0].size() << " forward pointers\n"; - //for (int i = 0; i < b[0][0].size(); ++i) { - // cerr << " -> (" << b[0][0][i].next_src_covered << "," << b[0][0][i].next_trg_covered << ")\n"; - //} + nodes = 0; + for (int i = 0; i < srclen; ++i) { + for (int j = 0; j < trglen; ++j) { + if (valid_deltas[i][j].size() > 0) { + node_addresses[i][j] = nodes++; + } else { + node_addresses[i][j] = -1; + } + } + } + cerr << "Sequence pair with lengths (" << srclen << ',' << trglen << ") has " << valid_deltas[0][0].size() << " out edges in its root node, " << nodes << " nodes in total, and outside estimate matrix will require " << sizeof(float)*nodes << " bytes\n"; } diff --git a/gi/pf/reachability.h b/gi/pf/reachability.h index fb2f4965..1e22c76a 100644 --- a/gi/pf/reachability.h +++ b/gi/pf/reachability.h @@ -12,13 +12,17 @@ // currently forbids 0 -> n and n -> 0 alignments struct Reachability { + unsigned nodes; boost::multi_array edges; // edges[src_covered][trg_covered][src_delta][trg_delta] is this edge worth exploring? boost::multi_array max_src_delta; // msd[src_covered][trg_covered] -- the largest src delta that's valid + boost::multi_array node_addresses; // na[src_covered][trg_covered] -- the index of the node in a one-dimensional array (of size "nodes") boost::multi_array >, 2> valid_deltas; // valid_deltas[src_covered][trg_covered] list of valid transitions leaving a particular node Reachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len) : + nodes(), edges(boost::extents[srclen][trglen][src_max_phrase_len+1][trg_max_phrase_len+1]), max_src_delta(boost::extents[srclen][trglen]), + node_addresses(boost::extents[srclen][trglen]), valid_deltas(boost::extents[srclen][trglen]) { ComputeReachability(srclen, trglen, src_max_phrase_len, trg_max_phrase_len); } diff --git a/gi/pf/transliterations.cc b/gi/pf/transliterations.cc index e29334fd..61e95b82 100644 --- a/gi/pf/transliterations.cc +++ b/gi/pf/transliterations.cc @@ -14,42 +14,75 @@ using namespace std; using namespace std::tr1; struct GraphStructure { - GraphStructure() : initialized(false) {} - boost::shared_ptr r; - bool initialized; + GraphStructure() : r() {} + // leak memory - these are basically static + const Reachability* r; + bool IsReachable() const { return r->nodes > 0; } +}; + +struct BackwardEstimates { + BackwardEstimates() : gs(), backward() {} + explicit BackwardEstimates(const GraphStructure& g) : + gs(&g), backward() { + if (g.r->nodes > 0) + backward = new float[g.r->nodes]; + } + // leak memory, these are static + + // returns an estimate of the marginal probability + double MarginalEstimate() const { + if (!backward) return 0; + return backward[0]; + } + + // returns an backward estimate + double operator()(int src_covered, int trg_covered) const { + if (!backward) return 0; + int ind = gs->r->node_addresses[src_covered][trg_covered]; + if (ind < 0) return 0; + return backward[ind]; + } + private: + const GraphStructure* gs; + float* backward; }; struct TransliterationsImpl { - TransliterationsImpl() { + TransliterationsImpl(int max_src, int max_trg) : + kMAX_SRC_CHUNK(max_src), + kMAX_TRG_CHUNK(max_trg), + tot_pairs() { } void Initialize(WordID src, const vector& src_lets, WordID trg, const vector& trg_lets) { const size_t src_len = src_lets.size(); const size_t trg_len = trg_lets.size(); + + // init graph structure if (src_len >= graphs.size()) graphs.resize(src_len + 1); if (trg_len >= graphs[src_len].size()) graphs[src_len].resize(trg_len + 1); - if (graphs[src_len][trg_len].initialized) return; - graphs[src_len][trg_len].r.reset(new Reachability(src_len, trg_len, 4, 4)); - -#if 0 - if (HG::Intersect(tlat, &hg)) { - // TODO - } else { - cerr << "No transliteration lattice possible for src_len=" << src_len << " trg_len=" << trg_len << endl; - hg.clear(); - } - //cerr << "Number of paths: " << graphs[src][trg].lattice.NumberOfPaths() << endl; -#endif - graphs[src_len][trg_len].initialized = true; + GraphStructure& gs = graphs[src_len][trg_len]; + if (!gs.r) + gs.r = new Reachability(src_len, trg_len, kMAX_SRC_CHUNK, kMAX_TRG_CHUNK); + const Reachability& r = *gs.r; + + // init backward estimates + if (src >= bes.size()) bes.resize(src + 1); + unordered_map::iterator it = bes[src].find(trg); + if (it != bes[src].end()) return; // already initialized + + it = bes[src].insert(make_pair(trg, BackwardEstimates(gs))).first; + BackwardEstimates& b = it->second; + if (!gs.r->nodes) return; // not derivable subject to length constraints + + // TODO + tot_pairs++; } void Forbid(WordID src, const vector& src_lets, WordID trg, const vector& trg_lets) { const size_t src_len = src_lets.size(); const size_t trg_len = trg_lets.size(); - if (src_len >= graphs.size()) graphs.resize(src_len + 1); - if (trg_len >= graphs[src_len].size()) graphs[src_len].resize(trg_len + 1); - graphs[src_len][trg_len].r.reset(); - graphs[src_len][trg_len].initialized = true; + // TODO } prob_t EstimateProbability(WordID s, const vector& src, WordID t, const vector& trg) const { @@ -85,12 +118,17 @@ struct TransliterationsImpl { cerr << " Average nodes = " << (tn / tt) << endl; cerr << "Average out-degree = " << (to / tn) << endl; cerr << " Unique structures = " << tt << endl; + cerr << " Unique pairs = " << tot_pairs << endl; } + const int kMAX_SRC_CHUNK; + const int kMAX_TRG_CHUNK; + unsigned tot_pairs; vector > graphs; // graphs[src_len][trg_len] + vector > bes; // bes[src][trg] }; -Transliterations::Transliterations() : pimpl_(new TransliterationsImpl) {} +Transliterations::Transliterations(int max_src, int max_trg) : pimpl_(new TransliterationsImpl(max_src, max_trg)) {} Transliterations::~Transliterations() { delete pimpl_; } void Transliterations::Initialize(WordID src, const vector& src_lets, WordID trg, const vector& trg_lets) { diff --git a/gi/pf/transliterations.h b/gi/pf/transliterations.h index 76eb2a05..e025547e 100644 --- a/gi/pf/transliterations.h +++ b/gi/pf/transliterations.h @@ -7,7 +7,8 @@ struct TransliterationsImpl; struct Transliterations { - explicit Transliterations(); + // max_src and max_trg indicate how big the transliteration phrases can be + explicit Transliterations(int max_src, int max_trg); ~Transliterations(); void Initialize(WordID src, const std::vector& src_lets, WordID trg, const std::vector& trg_lets); void Forbid(WordID src, const std::vector& src_lets, WordID trg, const std::vector& trg_lets); -- cgit v1.2.3 From 301106af9c13285ff252c618848eaa54b9b0e490 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Thu, 8 Mar 2012 18:47:23 +0000 Subject: fix link error on linux --- gi/pf/Makefile.am | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'gi/pf') diff --git a/gi/pf/Makefile.am b/gi/pf/Makefile.am index 9888a70b..94364c3d 100644 --- a/gi/pf/Makefile.am +++ b/gi/pf/Makefile.am @@ -5,14 +5,14 @@ noinst_LIBRARIES = libpf.a libpf_a_SOURCES = base_distributions.cc reachability.cc cfg_wfst_composer.cc corpus.cc unigrams.cc ngram_base.cc transliterations.cc nuisance_test_SOURCES = nuisance_test.cc -nuisance_test_LDADD = libpf.a +nuisance_test_LDADD = libpf.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a -lz align_lexonly_SOURCES = align-lexonly.cc align_lexonly_pyp_SOURCES = align-lexonly-pyp.cc align_tl_SOURCES = align-tl.cc -align_tl_LDADD = libpf.a +align_tl_LDADD = libpf.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a -lz itg_SOURCES = itg.cc -- cgit v1.2.3 From 78bf1457f606dd3880c2bc912201c4945d3f85b4 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Thu, 8 Mar 2012 14:29:42 -0500 Subject: moar --- gi/pf/align-tl.cc | 15 +++++++++------ gi/pf/reachability.cc | 9 +++++---- gi/pf/reachability.h | 8 +++++--- gi/pf/transliterations.cc | 14 ++++++++++---- gi/pf/transliterations.h | 3 ++- 5 files changed, 31 insertions(+), 18 deletions(-) (limited to 'gi/pf') diff --git a/gi/pf/align-tl.cc b/gi/pf/align-tl.cc index fe8950b5..fc9b7ca5 100644 --- a/gi/pf/align-tl.cc +++ b/gi/pf/align-tl.cc @@ -30,6 +30,10 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) { opts.add_options() ("samples,s",po::value()->default_value(1000),"Number of samples") ("input,i",po::value(),"Read parallel data from") + ("max_src_chunk", po::value()->default_value(4), "Maximum size of translitered chunk in source") + ("max_trg_chunk", po::value()->default_value(4), "Maximum size of translitered chunk in target") + ("min_transliterated_src_length", po::value()->default_value(3), "Minimum length of source words considered for transliteration") + ("filter_ratio", po::value()->default_value(0.66), "Filter ratio: basically, if the lengths differ by less than this ratio, mark the pair as non-transliteratable") ("random_seed,S",po::value(), "Random seed"); po::options_description clo("Command line options"); clo.add_options() @@ -306,12 +310,11 @@ int main(int argc, char** argv) { letters[TD::Convert("NULL")].clear(); // TODO configure this - int max_src_chunk = 4; - int max_trg_chunk = 4; - Transliterations tl(max_src_chunk, max_trg_chunk); - - // TODO CONFIGURE THIS - int min_trans_src = 4; + const int max_src_chunk = conf["max_src_chunk"].as(); + const int max_trg_chunk = conf["max_trg_chunk"].as(); + const double filter_rat = conf["filter_ratio"].as(); + const int min_trans_src = conf["min_transliterated_src_length"].as(); + Transliterations tl(max_src_chunk, max_trg_chunk, filter_rat); cerr << "Initializing transliteration graph structures ...\n"; for (int i = 0; i < corpus.size(); ++i) { diff --git a/gi/pf/reachability.cc b/gi/pf/reachability.cc index 59bc6ace..c10000f2 100644 --- a/gi/pf/reachability.cc +++ b/gi/pf/reachability.cc @@ -12,7 +12,7 @@ struct SState { int prev_trg_covered; }; -void Reachability::ComputeReachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len) { +void Reachability::ComputeReachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len, double filter_ratio) { typedef boost::multi_array, 2> array_type; array_type a(boost::extents[srclen + 1][trglen + 1]); a[0][0].push_back(SState()); @@ -30,9 +30,10 @@ void Reachability::ComputeReachability(int srclen, int trglen, int src_max_phras } } a[0][0].clear(); - //cerr << "Final cell contains " << a[srclen][trglen].size() << " back pointers\n"; - if (a[srclen][trglen].size() == 0) { - cerr << "Sentence with length (" << srclen << ',' << trglen << ") violates reachability constraints\n"; + //cerr << srclen << "," << trglen << ": Final cell contains " << a[srclen][trglen].size() << " back pointers\n"; + size_t min_allowed = (src_max_phrase_len + 1) * (trg_max_phrase_len + 1) * (filter_ratio * filter_ratio); + if (a[srclen][trglen].size() < min_allowed) { + cerr << "Sequence pair with lengths (" << srclen << ',' << trglen << ") violates reachability constraint of min indegree " << min_allowed << " with " << a[srclen][trglen].size() << " in edges\n"; return; } diff --git a/gi/pf/reachability.h b/gi/pf/reachability.h index 1e22c76a..03967d44 100644 --- a/gi/pf/reachability.h +++ b/gi/pf/reachability.h @@ -18,17 +18,19 @@ struct Reachability { boost::multi_array node_addresses; // na[src_covered][trg_covered] -- the index of the node in a one-dimensional array (of size "nodes") boost::multi_array >, 2> valid_deltas; // valid_deltas[src_covered][trg_covered] list of valid transitions leaving a particular node - Reachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len) : + // filter_ratio says if the number of outgoing edges from the first cell is less than + // src_max * trg_max * filter_rat^2 then mark as non reachable + Reachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len, double filter_ratio = 0.0) : nodes(), edges(boost::extents[srclen][trglen][src_max_phrase_len+1][trg_max_phrase_len+1]), max_src_delta(boost::extents[srclen][trglen]), node_addresses(boost::extents[srclen][trglen]), valid_deltas(boost::extents[srclen][trglen]) { - ComputeReachability(srclen, trglen, src_max_phrase_len, trg_max_phrase_len); + ComputeReachability(srclen, trglen, src_max_phrase_len, trg_max_phrase_len, filter_ratio); } private: - void ComputeReachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len); + void ComputeReachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len, double filter_ratio); }; #endif diff --git a/gi/pf/transliterations.cc b/gi/pf/transliterations.cc index 61e95b82..8ea4ebd2 100644 --- a/gi/pf/transliterations.cc +++ b/gi/pf/transliterations.cc @@ -48,10 +48,11 @@ struct BackwardEstimates { }; struct TransliterationsImpl { - TransliterationsImpl(int max_src, int max_trg) : + TransliterationsImpl(int max_src, int max_trg, double fr) : kMAX_SRC_CHUNK(max_src), kMAX_TRG_CHUNK(max_trg), - tot_pairs() { + kFILTER_RATIO(fr), + tot_pairs(), tot_mem() { } void Initialize(WordID src, const vector& src_lets, WordID trg, const vector& trg_lets) { @@ -63,7 +64,7 @@ struct TransliterationsImpl { if (trg_len >= graphs[src_len].size()) graphs[src_len].resize(trg_len + 1); GraphStructure& gs = graphs[src_len][trg_len]; if (!gs.r) - gs.r = new Reachability(src_len, trg_len, kMAX_SRC_CHUNK, kMAX_TRG_CHUNK); + gs.r = new Reachability(src_len, trg_len, kMAX_SRC_CHUNK, kMAX_TRG_CHUNK, kFILTER_RATIO); const Reachability& r = *gs.r; // init backward estimates @@ -77,6 +78,7 @@ struct TransliterationsImpl { // TODO tot_pairs++; + tot_mem += sizeof(float) * gs.r->nodes; } void Forbid(WordID src, const vector& src_lets, WordID trg, const vector& trg_lets) { @@ -119,16 +121,20 @@ struct TransliterationsImpl { cerr << "Average out-degree = " << (to / tn) << endl; cerr << " Unique structures = " << tt << endl; cerr << " Unique pairs = " << tot_pairs << endl; + cerr << " BEs size = " << (tot_mem / (1024.0*1024.0)) << " MB" << endl; } const int kMAX_SRC_CHUNK; const int kMAX_TRG_CHUNK; + const double kFILTER_RATIO; unsigned tot_pairs; + size_t tot_mem; vector > graphs; // graphs[src_len][trg_len] vector > bes; // bes[src][trg] }; -Transliterations::Transliterations(int max_src, int max_trg) : pimpl_(new TransliterationsImpl(max_src, max_trg)) {} +Transliterations::Transliterations(int max_src, int max_trg, double fr) : + pimpl_(new TransliterationsImpl(max_src, max_trg, fr)) {} Transliterations::~Transliterations() { delete pimpl_; } void Transliterations::Initialize(WordID src, const vector& src_lets, WordID trg, const vector& trg_lets) { diff --git a/gi/pf/transliterations.h b/gi/pf/transliterations.h index e025547e..ea9f9d3f 100644 --- a/gi/pf/transliterations.h +++ b/gi/pf/transliterations.h @@ -8,7 +8,8 @@ struct TransliterationsImpl; struct Transliterations { // max_src and max_trg indicate how big the transliteration phrases can be - explicit Transliterations(int max_src, int max_trg); + // see reachability.h for information about filter_ratio + explicit Transliterations(int max_src, int max_trg, double filter_ratio); ~Transliterations(); void Initialize(WordID src, const std::vector& src_lets, WordID trg, const std::vector& trg_lets); void Forbid(WordID src, const std::vector& src_lets, WordID trg, const std::vector& trg_lets); -- cgit v1.2.3 From 113317266853abff2e1c0c3e889017d0eee55c93 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Fri, 9 Mar 2012 22:23:50 -0500 Subject: moar --- gi/pf/Makefile.am | 3 +- gi/pf/align-lexonly-pyp.cc | 207 ++++++++++------------------------------- gi/pf/align-tl.cc | 18 ++-- gi/pf/backward.cc | 89 ++++++++++++++++++ gi/pf/backward.h | 33 +++++++ gi/pf/base_distributions.h | 8 +- gi/pf/guess-translits.pl | 2 +- gi/pf/nuisance_test.cc | 6 +- gi/pf/pyp_lm.cc | 2 +- gi/pf/pyp_tm.cc | 113 +++++++++++++++++++++++ gi/pf/pyp_tm.h | 34 +++++++ gi/pf/pyp_word_model.cc | 20 ++++ gi/pf/pyp_word_model.h | 58 ++++++++++++ gi/pf/reachability.cc | 8 +- gi/pf/reachability.h | 8 +- gi/pf/transliterations.cc | 223 ++++++++++++++++++++++++++++++++++++++++----- gi/pf/transliterations.h | 3 +- utils/ccrp_nt.h | 17 ++-- 18 files changed, 628 insertions(+), 224 deletions(-) create mode 100644 gi/pf/backward.cc create mode 100644 gi/pf/backward.h create mode 100644 gi/pf/pyp_tm.cc create mode 100644 gi/pf/pyp_tm.h create mode 100644 gi/pf/pyp_word_model.cc create mode 100644 gi/pf/pyp_word_model.h (limited to 'gi/pf') diff --git a/gi/pf/Makefile.am b/gi/pf/Makefile.am index 94364c3d..4ce72ba1 100644 --- a/gi/pf/Makefile.am +++ b/gi/pf/Makefile.am @@ -2,7 +2,7 @@ bin_PROGRAMS = cbgi brat dpnaive pfbrat pfdist itg pfnaive condnaive align-lexon noinst_LIBRARIES = libpf.a -libpf_a_SOURCES = base_distributions.cc reachability.cc cfg_wfst_composer.cc corpus.cc unigrams.cc ngram_base.cc transliterations.cc +libpf_a_SOURCES = base_distributions.cc reachability.cc cfg_wfst_composer.cc corpus.cc unigrams.cc ngram_base.cc transliterations.cc backward.cc pyp_word_model.cc pyp_tm.cc nuisance_test_SOURCES = nuisance_test.cc nuisance_test_LDADD = libpf.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a -lz @@ -10,6 +10,7 @@ nuisance_test_LDADD = libpf.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mtev align_lexonly_SOURCES = align-lexonly.cc align_lexonly_pyp_SOURCES = align-lexonly-pyp.cc +align_lexonly_pyp_LDADD = libpf.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a -lz align_tl_SOURCES = align-tl.cc align_tl_LDADD = libpf.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a -lz diff --git a/gi/pf/align-lexonly-pyp.cc b/gi/pf/align-lexonly-pyp.cc index 13a3a487..d68a4b8f 100644 --- a/gi/pf/align-lexonly-pyp.cc +++ b/gi/pf/align-lexonly-pyp.cc @@ -1,27 +1,18 @@ #include -#include #include -#include #include #include -#include "array2d.h" -#include "base_distributions.h" -#include "monotonic_pseg.h" -#include "conditional_pseg.h" -#include "trule.h" #include "tdict.h" #include "stringlib.h" #include "filelib.h" -#include "dict.h" +#include "array2d.h" #include "sampler.h" -#include "mfcr.h" #include "corpus.h" -#include "ngram_base.h" +#include "pyp_tm.h" using namespace std; -using namespace tr1; namespace po = boost::program_options; void InitCommandLine(int argc, char** argv, po::variables_map* conf) { @@ -51,7 +42,7 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) { } } -shared_ptr prng; +MT19937* prng; struct LexicalAlignment { unsigned char src_index; @@ -66,159 +57,59 @@ struct AlignedSentencePair { Array2D posterior; }; -struct HierarchicalWordBase { - explicit HierarchicalWordBase(const unsigned vocab_e_size) : - base(prob_t::One()), r(1,1,1,1,0.66,50.0), u0(-log(vocab_e_size)), l(1,prob_t::One()), v(1, prob_t::Zero()) {} - - void ResampleHyperparameters(MT19937* rng) { - r.resample_hyperparameters(rng); - } - - inline double logp0(const vector& s) const { - return Md::log_poisson(s.size(), 7.5) + s.size() * u0; - } - - // return p0 of rule.e_ - prob_t operator()(const TRule& rule) const { - v[0].logeq(logp0(rule.e_)); - return r.prob(rule.e_, v.begin(), l.begin()); - } - - void Increment(const TRule& rule) { - v[0].logeq(logp0(rule.e_)); - if (r.increment(rule.e_, v.begin(), l.begin(), &*prng).count) { - base *= v[0] * l[0]; - } - } - - void Decrement(const TRule& rule) { - if (r.decrement(rule.e_, &*prng).count) { - base /= prob_t(exp(logp0(rule.e_))); - } - } - - prob_t Likelihood() const { - prob_t p; p.logeq(r.log_crp_prob()); - p *= base; - return p; +struct Aligner { + Aligner(const vector >& lets, int num_letters, vector* c) : + corpus(*c), + model(lets, num_letters), + kNULL(TD::Convert("NULL")) { + assert(lets[kNULL].size() == 0); } - void Summary() const { - cerr << "NUMBER OF CUSTOMERS: " << r.num_customers() << " (d=" << r.discount() << ",s=" << r.strength() << ')' << endl; - for (MFCR<1,vector >::const_iterator it = r.begin(); it != r.end(); ++it) - cerr << " " << it->second.total_dish_count_ << " (on " << it->second.table_counts_.size() << " tables) " << TD::GetString(it->first) << endl; - } - - prob_t base; - MFCR<1,vector > r; - const double u0; - const vector l; - mutable vector v; -}; - -struct BasicLexicalAlignment { - explicit BasicLexicalAlignment(const vector >& lets, - const unsigned words_e, - const unsigned letters_e, - vector* corp) : - letters(lets), - corpus(*corp), - //up0(words_e), - //up0("en.chars.1gram", letters_e), - //up0("en.words.1gram"), - up0(letters_e), - //up0("en.chars.2gram"), - tmodel(up0) { - } + vector& corpus; + PYPLexicalTranslation model; + const WordID kNULL; - void InstantiateRule(const WordID src, - const WordID trg, - TRule* rule) const { - static const WordID kX = TD::Convert("X") * -1; - rule->lhs_ = kX; - rule->e_ = letters[trg]; - rule->f_ = letters[src]; + void ResampleHyperparameters() { + model.ResampleHyperparameters(prng); } void InitializeRandom() { - const WordID kNULL = TD::Convert("NULL"); cerr << "Initializing with random alignments ...\n"; for (unsigned i = 0; i < corpus.size(); ++i) { AlignedSentencePair& asp = corpus[i]; asp.a.resize(asp.trg.size()); for (unsigned j = 0; j < asp.trg.size(); ++j) { - const unsigned char a_j = prng->next() * (1 + asp.src.size()); + unsigned char& a_j = asp.a[j].src_index; + a_j = prng->next() * (1 + asp.src.size()); const WordID f_a_j = (a_j ? asp.src[a_j - 1] : kNULL); - TRule r; - InstantiateRule(f_a_j, asp.trg[j], &r); - asp.a[j].is_transliteration = false; - asp.a[j].src_index = a_j; - if (tmodel.IncrementRule(r, &*prng)) - up0.Increment(r); + model.Increment(f_a_j, asp.trg[j], &*prng); } } - cerr << " LLH = " << Likelihood() << endl; - } - - prob_t Likelihood() const { - prob_t p = tmodel.Likelihood(); - p *= up0.Likelihood(); - return p; - } - - void ResampleHyperparemeters() { - tmodel.ResampleHyperparameters(&*prng); - up0.ResampleHyperparameters(&*prng); - cerr << " (base d=" << up0.r.discount() << ",s=" << up0.r.strength() << ")\n"; + cerr << "Corpus intialized randomly. LLH = " << model.Likelihood() << endl; } - void ResampleCorpus(); - - const vector >& letters; // spelling dictionary - vector& corpus; - //PhraseConditionalUninformativeBase up0; - //PhraseConditionalUninformativeUnigramBase up0; - //UnigramWordBase up0; - //HierarchicalUnigramBase up0; - HierarchicalWordBase up0; - //CompletelyUniformBase up0; - //FixedNgramBase up0; - //ConditionalTranslationModel tmodel; - //ConditionalTranslationModel tmodel; - //ConditionalTranslationModel tmodel; - //ConditionalTranslationModel tmodel; - MConditionalTranslationModel tmodel; - //ConditionalTranslationModel tmodel; - //ConditionalTranslationModel tmodel; -}; - -void BasicLexicalAlignment::ResampleCorpus() { - static const WordID kNULL = TD::Convert("NULL"); - for (unsigned i = 0; i < corpus.size(); ++i) { - AlignedSentencePair& asp = corpus[i]; - SampleSet ss; ss.resize(asp.src.size() + 1); - for (unsigned j = 0; j < asp.trg.size(); ++j) { - TRule r; - unsigned char& a_j = asp.a[j].src_index; - WordID f_a_j = (a_j ? asp.src[a_j - 1] : kNULL); - InstantiateRule(f_a_j, asp.trg[j], &r); - if (tmodel.DecrementRule(r, &*prng)) - up0.Decrement(r); - - for (unsigned prop_a_j = 0; prop_a_j <= asp.src.size(); ++prop_a_j) { - const WordID prop_f = (prop_a_j ? asp.src[prop_a_j - 1] : kNULL); - InstantiateRule(prop_f, asp.trg[j], &r); - ss[prop_a_j] = tmodel.RuleProbability(r); + void ResampleCorpus() { + for (unsigned i = 0; i < corpus.size(); ++i) { + AlignedSentencePair& asp = corpus[i]; + SampleSet ss; ss.resize(asp.src.size() + 1); + for (unsigned j = 0; j < asp.trg.size(); ++j) { + unsigned char& a_j = asp.a[j].src_index; + const WordID e_j = asp.trg[j]; + WordID f_a_j = (a_j ? asp.src[a_j - 1] : kNULL); + model.Decrement(f_a_j, e_j, prng); + + for (unsigned prop_a_j = 0; prop_a_j <= asp.src.size(); ++prop_a_j) { + const WordID prop_f = (prop_a_j ? asp.src[prop_a_j - 1] : kNULL); + ss[prop_a_j] = model.Prob(prop_f, e_j); + } + a_j = prng->SelectSample(ss); + f_a_j = (a_j ? asp.src[a_j - 1] : kNULL); + model.Increment(f_a_j, e_j, prng); } - a_j = prng->SelectSample(ss); - f_a_j = (a_j ? asp.src[a_j - 1] : kNULL); - InstantiateRule(f_a_j, asp.trg[j], &r); - if (tmodel.IncrementRule(r, &*prng)) - up0.Increment(r); } + cerr << "LLH = " << model.Likelihood() << " " << model.UniqueConditioningContexts() << endl; } - cerr << " LLH = " << Likelihood() << endl; -} +}; void ExtractLetters(const set& v, vector >* l, set* letset = NULL) { for (set::const_iterator it = v.begin(); it != v.end(); ++it) { @@ -240,8 +131,10 @@ void ExtractLetters(const set& v, vector >* l, set a(asp.src.size(), asp.trg.size()); - for (unsigned j = 0; j < asp.trg.size(); ++j) + for (unsigned j = 0; j < asp.trg.size(); ++j) { + assert(asp.a[j].src_index <= asp.src.size()); if (asp.a[j].src_index) a(asp.a[j].src_index - 1, j) = true; + } cerr << a << endl; } @@ -275,10 +168,9 @@ int main(int argc, char** argv) { InitCommandLine(argc, argv, &conf); if (conf.count("random_seed")) - prng.reset(new MT19937(conf["random_seed"].as())); + prng = new MT19937(conf["random_seed"].as()); else - prng.reset(new MT19937); -// MT19937& rng = *prng; + prng = new MT19937; vector > corpuse, corpusf; set vocabe, vocabf; @@ -304,23 +196,18 @@ int main(int argc, char** argv) { ExtractLetters(vocabf, &letters, NULL); letters[TD::Convert("NULL")].clear(); - BasicLexicalAlignment x(letters, vocabe.size(), letset.size(), &corpus); - x.InitializeRandom(); + Aligner aligner(letters, letset.size(), &corpus); + aligner.InitializeRandom(); + const unsigned samples = conf["samples"].as(); for (int i = 0; i < samples; ++i) { for (int j = 65; j < 67; ++j) Debug(corpus[j]); - cerr << i << "\t" << x.tmodel.r.size() << "\t"; - if (i % 7 == 6) x.ResampleHyperparemeters(); - x.ResampleCorpus(); + if (i % 7 == 6) aligner.ResampleHyperparameters(); + aligner.ResampleCorpus(); if (i > (samples / 5) && (i % 10 == 9)) for (int j = 0; j < corpus.size(); ++j) AddSample(&corpus[j]); } for (unsigned i = 0; i < corpus.size(); ++i) WriteAlignments(corpus[i]); - //ModelAndData posterior(x, &corpus, vocabe, vocabf); - x.tmodel.Summary(); - x.up0.Summary(); - - //posterior.Sample(); return 0; } diff --git a/gi/pf/align-tl.cc b/gi/pf/align-tl.cc index fc9b7ca5..cbe8c6c8 100644 --- a/gi/pf/align-tl.cc +++ b/gi/pf/align-tl.cc @@ -6,6 +6,7 @@ #include #include +#include "backward.h" #include "array2d.h" #include "base_distributions.h" #include "monotonic_pseg.h" @@ -30,10 +31,11 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) { opts.add_options() ("samples,s",po::value()->default_value(1000),"Number of samples") ("input,i",po::value(),"Read parallel data from") + ("s2t", po::value(), "character level source-to-target prior transliteration probabilities") + ("t2s", po::value(), "character level target-to-source prior transliteration probabilities") ("max_src_chunk", po::value()->default_value(4), "Maximum size of translitered chunk in source") ("max_trg_chunk", po::value()->default_value(4), "Maximum size of translitered chunk in target") - ("min_transliterated_src_length", po::value()->default_value(3), "Minimum length of source words considered for transliteration") - ("filter_ratio", po::value()->default_value(0.66), "Filter ratio: basically, if the lengths differ by less than this ratio, mark the pair as non-transliteratable") + ("expected_src_to_trg_ratio", po::value()->default_value(1.0), "If a word is transliterated, what is the expected length ratio from source to target?") ("random_seed,S",po::value(), "Random seed"); po::options_description clo("Command line options"); clo.add_options() @@ -303,7 +305,7 @@ int main(int argc, char** argv) { corpusf.clear(); corpuse.clear(); vocabf.insert(TD::Convert("NULL")); - vector > letters(TD::NumWords()); + vector > letters(TD::NumWords() + 1); set letset; ExtractLetters(vocabe, &letters, &letset); ExtractLetters(vocabf, &letters, NULL); @@ -312,9 +314,9 @@ int main(int argc, char** argv) { // TODO configure this const int max_src_chunk = conf["max_src_chunk"].as(); const int max_trg_chunk = conf["max_trg_chunk"].as(); - const double filter_rat = conf["filter_ratio"].as(); - const int min_trans_src = conf["min_transliterated_src_length"].as(); - Transliterations tl(max_src_chunk, max_trg_chunk, filter_rat); + const double s2t_rat = conf["expected_src_to_trg_ratio"].as(); + const BackwardEstimator be(conf["s2t"].as(), conf["t2s"].as()); + Transliterations tl(max_src_chunk, max_trg_chunk, s2t_rat, be); cerr << "Initializing transliteration graph structures ...\n"; for (int i = 0; i < corpus.size(); ++i) { @@ -325,8 +327,8 @@ int main(int argc, char** argv) { for (int k = 0; k < trg.size(); ++k) { const vector& trg_let = letters[trg[k]]; tl.Initialize(src[j], src_let, trg[k], trg_let); - if (src_let.size() < min_trans_src) - tl.Forbid(src[j], src_let, trg[k], trg_let); + //if (src_let.size() < min_trans_src) + // tl.Forbid(src[j], src_let, trg[k], trg_let); } } } diff --git a/gi/pf/backward.cc b/gi/pf/backward.cc new file mode 100644 index 00000000..b92629fd --- /dev/null +++ b/gi/pf/backward.cc @@ -0,0 +1,89 @@ +#include "backward.h" + +#include +#include + +#include "array2d.h" +#include "reachability.h" +#include "base_distributions.h" + +using namespace std; + +BackwardEstimator::BackwardEstimator(const string& s2t, + const string& t2s) : m1(new Model1(s2t)), m1inv(new Model1(t2s)) {} + +BackwardEstimator::~BackwardEstimator() { + delete m1; m1 = NULL; + delete m1inv; m1inv = NULL; +} + +float BackwardEstimator::ComputeBackwardProb(const std::vector& src, + const std::vector& trg, + unsigned src_covered, + unsigned trg_covered, + double s2t_ratio) const { + if (src_covered == src.size() || trg_covered == trg.size()) { + assert(src_covered == src.size()); + assert(trg_covered == trg.size()); + return 0; + } + static const WordID kNULL = TD::Convert(""); + const prob_t uniform_alignment(1.0 / (src.size() - src_covered + 1)); + // TODO factor in expected length ratio + prob_t e; e.logeq(Md::log_poisson(trg.size() - trg_covered, (src.size() - src_covered) * s2t_ratio)); // p(trg len remaining | src len remaining) + for (unsigned j = trg_covered; j < trg.size(); ++j) { + prob_t p = (*m1)(kNULL, trg[j]) + prob_t(1e-12); + for (unsigned i = src_covered; i < src.size(); ++i) + p += (*m1)(src[i], trg[j]); + if (p.is_0()) { + cerr << "ERROR: p(" << TD::Convert(trg[j]) << " | " << TD::GetString(src) << ") = 0!\n"; + assert(!"failed"); + } + p *= uniform_alignment; + e *= p; + } + // TODO factor in expected length ratio + const prob_t inv_uniform(1.0 / (trg.size() - trg_covered + 1.0)); + prob_t inv; + inv.logeq(Md::log_poisson(src.size() - src_covered, (trg.size() - trg_covered) / s2t_ratio)); + for (unsigned i = src_covered; i < src.size(); ++i) { + prob_t p = (*m1inv)(kNULL, src[i]) + prob_t(1e-12); + for (unsigned j = trg_covered; j < trg.size(); ++j) + p += (*m1inv)(trg[j], src[i]); + if (p.is_0()) { + cerr << "ERROR: p_inv(" << TD::Convert(src[i]) << " | " << TD::GetString(trg) << ") = 0!\n"; + assert(!"failed"); + } + p *= inv_uniform; + inv *= p; + } + return (log(e) + log(inv)) / 2; +} + +void BackwardEstimator::InitializeGrid(const vector& src, + const vector& trg, + const Reachability& r, + double s2t_ratio, + float* grid) const { + queue > q; + q.push(make_pair(0,0)); + Array2D done(src.size()+1, trg.size()+1, false); + //cerr << TD::GetString(src) << " ||| " << TD::GetString(trg) << endl; + while(!q.empty()) { + const pair n = q.front(); + q.pop(); + if (done(n.first,n.second)) continue; + done(n.first,n.second) = true; + + float lp = ComputeBackwardProb(src, trg, n.first, n.second, s2t_ratio); + if (n.first == 0 && n.second == 0) grid[0] = lp; + //cerr << " " << n.first << "," << n.second << "\t" << lp << endl; + + if (n.first == src.size() || n.second == trg.size()) continue; + const vector >& edges = r.valid_deltas[n.first][n.second]; + for (int i = 0; i < edges.size(); ++i) + q.push(make_pair(n.first + edges[i].first, n.second + edges[i].second)); + } + //static int cc = 0; ++cc; if (cc == 80) exit(1); +} + diff --git a/gi/pf/backward.h b/gi/pf/backward.h new file mode 100644 index 00000000..e67eff0c --- /dev/null +++ b/gi/pf/backward.h @@ -0,0 +1,33 @@ +#ifndef _BACKWARD_H_ +#define _BACKWARD_H_ + +#include +#include +#include "wordid.h" + +struct Reachability; +struct Model1; + +struct BackwardEstimator { + BackwardEstimator(const std::string& s2t, + const std::string& t2s); + ~BackwardEstimator(); + + void InitializeGrid(const std::vector& src, + const std::vector& trg, + const Reachability& r, + double src2trg_ratio, + float* grid) const; + + private: + float ComputeBackwardProb(const std::vector& src, + const std::vector& trg, + unsigned src_covered, + unsigned trg_covered, + double src2trg_ratio) const; + + Model1* m1; + Model1* m1inv; +}; + +#endif diff --git a/gi/pf/base_distributions.h b/gi/pf/base_distributions.h index 0d597c5c..84dacdf2 100644 --- a/gi/pf/base_distributions.h +++ b/gi/pf/base_distributions.h @@ -14,13 +14,7 @@ #include "tdict.h" #include "sampler.h" #include "m.h" - -inline std::ostream& operator<<(std::ostream& os, const std::vector& p) { - os << '['; - for (int i = 0; i < p.size(); ++i) - os << (i==0 ? "" : " ") << TD::Convert(p[i]); - return os << ']'; -} +#include "os_phrase.h" struct Model1 { explicit Model1(const std::string& fname) : diff --git a/gi/pf/guess-translits.pl b/gi/pf/guess-translits.pl index aafec13a..d00c2168 100755 --- a/gi/pf/guess-translits.pl +++ b/gi/pf/guess-translits.pl @@ -69,4 +69,4 @@ for my $f (keys %fs) { } } print STDERR "Extracted $num pairs.\n"; -print STDERR "Recommend running:\n ../../training/model1 -t -99999 output.txt\n"; +print STDERR "Recommend running:\n ../../training/model1 -v -d -t -99999 output.txt\n"; diff --git a/gi/pf/nuisance_test.cc b/gi/pf/nuisance_test.cc index 0f44fe95..fc0af9cb 100644 --- a/gi/pf/nuisance_test.cc +++ b/gi/pf/nuisance_test.cc @@ -124,9 +124,9 @@ int main(int argc, char** argv) { WordID y = TD::Convert("remember"); vector src; TD::ConvertSentence("s o u v e n o n s", &src); vector trg; TD::ConvertSentence("r e m e m b e r", &trg); - Transliterations xx; - xx.Initialize(x, src, y, trg); - return 1; +// Transliterations xx; +// xx.Initialize(x, src, y, trg); +// return 1; for (int j = 0; j < ITERS; ++j) { Base b; diff --git a/gi/pf/pyp_lm.cc b/gi/pf/pyp_lm.cc index 104f356b..52e6be2c 100644 --- a/gi/pf/pyp_lm.cc +++ b/gi/pf/pyp_lm.cc @@ -18,7 +18,7 @@ // I use templates to handle the recursive formalation of the prior, so // the order of the model has to be specified here, at compile time: -#define kORDER 4 +#define kORDER 3 using namespace std; using namespace tr1; diff --git a/gi/pf/pyp_tm.cc b/gi/pf/pyp_tm.cc new file mode 100644 index 00000000..94cbe7c3 --- /dev/null +++ b/gi/pf/pyp_tm.cc @@ -0,0 +1,113 @@ +#include "pyp_tm.h" + +#include +#include +#include + +#include "base_distributions.h" +#include "monotonic_pseg.h" +#include "conditional_pseg.h" +#include "tdict.h" +#include "ccrp.h" +#include "pyp_word_model.h" + +using namespace std; +using namespace std::tr1; + +template +struct ConditionalPYPWordModel { + ConditionalPYPWordModel(Base* b) : base(*b) {} + + void Summary() const { + cerr << "Number of conditioning contexts: " << r.size() << endl; + for (RuleModelHash::const_iterator it = r.begin(); it != r.end(); ++it) { + cerr << TD::Convert(it->first) << " \tPYP(d=" << it->second.discount() << ",s=" << it->second.strength() << ") --------------------------" << endl; + for (CCRP >::const_iterator i2 = it->second.begin(); i2 != it->second.end(); ++i2) + cerr << " " << i2->second.total_dish_count_ << '\t' << TD::GetString(i2->first) << endl; + } + } + + void ResampleHyperparameters(MT19937* rng) { + for (RuleModelHash::iterator it = r.begin(); it != r.end(); ++it) + it->second.resample_hyperparameters(rng); + } + + prob_t Prob(const WordID src, const vector& trglets) const { + RuleModelHash::const_iterator it = r.find(src); + if (it == r.end()) { + return base(trglets); + } else { + return it->second.prob(trglets, base(trglets)); + } + } + + void Increment(const WordID src, const vector& trglets, MT19937* rng) { + RuleModelHash::iterator it = r.find(src); + if (it == r.end()) + it = r.insert(make_pair(src, CCRP >(1,1,1,1,0.5,1.0))).first; + if (it->second.increment(trglets, base(trglets), rng)) + base.Increment(trglets, rng); + } + + void Decrement(const WordID src, const vector& trglets, MT19937* rng) { + RuleModelHash::iterator it = r.find(src); + assert(it != r.end()); + if (it->second.decrement(trglets, rng)) { + base.Decrement(trglets, rng); + if (it->second.num_customers() == 0) + r.erase(it); + } + } + + prob_t Likelihood() const { + prob_t p = prob_t::One(); + for (RuleModelHash::const_iterator it = r.begin(); it != r.end(); ++it) { + prob_t q; q.logeq(it->second.log_crp_prob()); + p *= q; + } + return p; + } + + unsigned UniqueConditioningContexts() const { + return r.size(); + } + + Base& base; + typedef unordered_map > > RuleModelHash; + RuleModelHash r; +}; + +PYPLexicalTranslation::PYPLexicalTranslation(const vector >& lets, + const unsigned num_letters) : + letters(lets), + up0(new PYPWordModel(num_letters)), + tmodel(new ConditionalPYPWordModel(up0)), + kX(-TD::Convert("X")) {} + +prob_t PYPLexicalTranslation::Likelihood() const { + prob_t p = up0->Likelihood(); + p *= tmodel->Likelihood(); + return p; +} + +void PYPLexicalTranslation::ResampleHyperparameters(MT19937* rng) { + tmodel->ResampleHyperparameters(rng); + up0->ResampleHyperparameters(rng); +} + +unsigned PYPLexicalTranslation::UniqueConditioningContexts() const { + return tmodel->UniqueConditioningContexts(); +} + +prob_t PYPLexicalTranslation::Prob(WordID src, WordID trg) const { + return tmodel->Prob(src, letters[trg]); +} + +void PYPLexicalTranslation::Increment(WordID src, WordID trg, MT19937* rng) { + tmodel->Increment(src, letters[trg], rng); +} + +void PYPLexicalTranslation::Decrement(WordID src, WordID trg, MT19937* rng) { + tmodel->Decrement(src, letters[trg], rng); +} + diff --git a/gi/pf/pyp_tm.h b/gi/pf/pyp_tm.h new file mode 100644 index 00000000..fa0fb28f --- /dev/null +++ b/gi/pf/pyp_tm.h @@ -0,0 +1,34 @@ +#ifndef PYP_LEX_TRANS +#define PYP_LEX_TRANS + +#include +#include "wordid.h" +#include "prob.h" +#include "sampler.h" + +struct TRule; +struct PYPWordModel; +template struct ConditionalPYPWordModel; + +struct PYPLexicalTranslation { + explicit PYPLexicalTranslation(const std::vector >& lets, + const unsigned num_letters); + + prob_t Likelihood() const; + + void ResampleHyperparameters(MT19937* rng); + prob_t Prob(WordID src, WordID trg) const; // return p(trg | src) + void Summary() const; + void Increment(WordID src, WordID trg, MT19937* rng); + void Decrement(WordID src, WordID trg, MT19937* rng); + unsigned UniqueConditioningContexts() const; + + private: + const std::vector >& letters; // spelling dictionary + PYPWordModel* up0; // base distribuction (model English word) + ConditionalPYPWordModel* tmodel; // translation distributions + // (model English word | French word) + const WordID kX; +}; + +#endif diff --git a/gi/pf/pyp_word_model.cc b/gi/pf/pyp_word_model.cc new file mode 100644 index 00000000..12df4abf --- /dev/null +++ b/gi/pf/pyp_word_model.cc @@ -0,0 +1,20 @@ +#include "pyp_word_model.h" + +#include + +using namespace std; + +void PYPWordModel::ResampleHyperparameters(MT19937* rng) { + r.resample_hyperparameters(rng); + cerr << " PYPWordModel(d=" << r.discount() << ",s=" << r.strength() << ")\n"; +} + +void PYPWordModel::Summary() const { + cerr << "PYPWordModel: generations=" << r.num_customers() + << " PYP(d=" << r.discount() << ",s=" << r.strength() << ')' << endl; + for (CCRP >::const_iterator it = r.begin(); it != r.end(); ++it) + cerr << " " << it->second.total_dish_count_ + << " (on " << it->second.table_counts_.size() << " tables) " + << TD::GetString(it->first) << endl; +} + diff --git a/gi/pf/pyp_word_model.h b/gi/pf/pyp_word_model.h new file mode 100644 index 00000000..800a4fd7 --- /dev/null +++ b/gi/pf/pyp_word_model.h @@ -0,0 +1,58 @@ +#ifndef _PYP_WORD_MODEL_H_ +#define _PYP_WORD_MODEL_H_ + +#include +#include +#include +#include "prob.h" +#include "ccrp.h" +#include "m.h" +#include "tdict.h" +#include "os_phrase.h" + +// PYP(d,s,poisson-uniform) represented as a CRP +struct PYPWordModel { + explicit PYPWordModel(const unsigned vocab_e_size, const double mean_len = 7.5) : + base(prob_t::One()), r(1,1,1,1,0.66,50.0), u0(-std::log(vocab_e_size)), mean_length(mean_len) {} + + void ResampleHyperparameters(MT19937* rng); + + inline prob_t operator()(const std::vector& s) const { + return r.prob(s, p0(s)); + } + + inline void Increment(const std::vector& s, MT19937* rng) { + if (r.increment(s, p0(s), rng)) + base *= p0(s); + } + + inline void Decrement(const std::vector& s, MT19937 *rng) { + if (r.decrement(s, rng)) + base /= p0(s); + } + + inline prob_t Likelihood() const { + prob_t p; p.logeq(r.log_crp_prob()); + p *= base; + return p; + } + + void Summary() const; + + private: + inline double logp0(const std::vector& s) const { + return Md::log_poisson(s.size(), mean_length) + s.size() * u0; + } + + inline prob_t p0(const std::vector& s) const { + prob_t p; p.logeq(logp0(s)); + return p; + } + + prob_t base; // keeps track of the draws from the base distribution + CCRP > r; + const double u0; // uniform log prob of generating a letter + const double mean_length; // mean length of a word in the base distribution +}; + +#endif diff --git a/gi/pf/reachability.cc b/gi/pf/reachability.cc index c10000f2..7d0d04ac 100644 --- a/gi/pf/reachability.cc +++ b/gi/pf/reachability.cc @@ -12,7 +12,7 @@ struct SState { int prev_trg_covered; }; -void Reachability::ComputeReachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len, double filter_ratio) { +void Reachability::ComputeReachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len) { typedef boost::multi_array, 2> array_type; array_type a(boost::extents[srclen + 1][trglen + 1]); a[0][0].push_back(SState()); @@ -31,9 +31,9 @@ void Reachability::ComputeReachability(int srclen, int trglen, int src_max_phras } a[0][0].clear(); //cerr << srclen << "," << trglen << ": Final cell contains " << a[srclen][trglen].size() << " back pointers\n"; - size_t min_allowed = (src_max_phrase_len + 1) * (trg_max_phrase_len + 1) * (filter_ratio * filter_ratio); - if (a[srclen][trglen].size() < min_allowed) { - cerr << "Sequence pair with lengths (" << srclen << ',' << trglen << ") violates reachability constraint of min indegree " << min_allowed << " with " << a[srclen][trglen].size() << " in edges\n"; + if (a[srclen][trglen].empty()) { + cerr << "Sequence pair with lengths (" << srclen << ',' << trglen << ") violates reachability constraints\n"; + nodes = 0; return; } diff --git a/gi/pf/reachability.h b/gi/pf/reachability.h index 03967d44..1e22c76a 100644 --- a/gi/pf/reachability.h +++ b/gi/pf/reachability.h @@ -18,19 +18,17 @@ struct Reachability { boost::multi_array node_addresses; // na[src_covered][trg_covered] -- the index of the node in a one-dimensional array (of size "nodes") boost::multi_array >, 2> valid_deltas; // valid_deltas[src_covered][trg_covered] list of valid transitions leaving a particular node - // filter_ratio says if the number of outgoing edges from the first cell is less than - // src_max * trg_max * filter_rat^2 then mark as non reachable - Reachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len, double filter_ratio = 0.0) : + Reachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len) : nodes(), edges(boost::extents[srclen][trglen][src_max_phrase_len+1][trg_max_phrase_len+1]), max_src_delta(boost::extents[srclen][trglen]), node_addresses(boost::extents[srclen][trglen]), valid_deltas(boost::extents[srclen][trglen]) { - ComputeReachability(srclen, trglen, src_max_phrase_len, trg_max_phrase_len, filter_ratio); + ComputeReachability(srclen, trglen, src_max_phrase_len, trg_max_phrase_len); } private: - void ComputeReachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len, double filter_ratio); + void ComputeReachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len); }; #endif diff --git a/gi/pf/transliterations.cc b/gi/pf/transliterations.cc index 8ea4ebd2..2200715e 100644 --- a/gi/pf/transliterations.cc +++ b/gi/pf/transliterations.cc @@ -5,14 +5,173 @@ #include "boost/shared_ptr.hpp" +#include "backward.h" #include "filelib.h" -#include "ccrp.h" +#include "tdict.h" +#include "trule.h" +#include "filelib.h" +#include "ccrp_nt.h" #include "m.h" #include "reachability.h" using namespace std; using namespace std::tr1; +struct TruncatedConditionalLengthModel { + TruncatedConditionalLengthModel(unsigned max_src_size, unsigned max_trg_size, double expected_src_to_trg_ratio) : + plens(max_src_size+1, vector(max_trg_size+1, 0.0)) { + for (unsigned i = 1; i <= max_src_size; ++i) { + prob_t z = prob_t::Zero(); + for (unsigned j = 1; j <= max_trg_size; ++j) + z += (plens[i][j] = prob_t(0.01 + exp(Md::log_poisson(j, i * expected_src_to_trg_ratio)))); + for (unsigned j = 1; j <= max_trg_size; ++j) + plens[i][j] /= z; + //for (unsigned j = 1; j <= max_trg_size; ++j) + // cerr << "P(trg_len=" << j << " | src_len=" << i << ") = " << plens[i][j] << endl; + } + } + + // return p(tlen | slen) for *chunks* not full words + inline const prob_t& operator()(int slen, int tlen) const { + return plens[slen][tlen]; + } + + vector > plens; +}; + +struct CondBaseDist { + CondBaseDist(unsigned max_src_size, unsigned max_trg_size, double expected_src_to_trg_ratio) : + tclm(max_src_size, max_trg_size, expected_src_to_trg_ratio) {} + + prob_t operator()(const vector& src, unsigned sf, unsigned st, + const vector& trg, unsigned tf, unsigned tt) const { + prob_t p = tclm(st - sf, tt - tf); // target len | source length ~ TCLM(source len) + assert(!"not impl"); + return p; + } + inline prob_t operator()(const vector& src, const vector& trg) const { + return (*this)(src, 0, src.size(), trg, 0, trg.size()); + } + TruncatedConditionalLengthModel tclm; +}; + +// represents transliteration phrase probabilities, e.g. +// p( a l - | A l ) , p( o | A w ) , ... +struct TransliterationChunkConditionalModel { + explicit TransliterationChunkConditionalModel(const CondBaseDist& pp0) : + d(0.0), + strength(1.0), + rp0(pp0) { + } + + void Summary() const { + std::cerr << "Number of conditioning contexts: " << r.size() << std::endl; + for (RuleModelHash::const_iterator it = r.begin(); it != r.end(); ++it) { + std::cerr << TD::GetString(it->first) << " \t(\\alpha = " << it->second.alpha() << ") --------------------------" << std::endl; + for (CCRP_NoTable::const_iterator i2 = it->second.begin(); i2 != it->second.end(); ++i2) + std::cerr << " " << i2->second << '\t' << i2->first << std::endl; + } + } + + int DecrementRule(const TRule& rule) { + RuleModelHash::iterator it = r.find(rule.f_); + assert(it != r.end()); + int count = it->second.decrement(rule); + if (count) { + if (it->second.num_customers() == 0) r.erase(it); + } + return count; + } + + int IncrementRule(const TRule& rule) { + RuleModelHash::iterator it = r.find(rule.f_); + if (it == r.end()) { + it = r.insert(make_pair(rule.f_, CCRP_NoTable(strength))).first; + } + int count = it->second.increment(rule); + return count; + } + + void IncrementRules(const std::vector& rules) { + for (int i = 0; i < rules.size(); ++i) + IncrementRule(*rules[i]); + } + + void DecrementRules(const std::vector& rules) { + for (int i = 0; i < rules.size(); ++i) + DecrementRule(*rules[i]); + } + + prob_t RuleProbability(const TRule& rule) const { + prob_t p; + RuleModelHash::const_iterator it = r.find(rule.f_); + if (it == r.end()) { + p = rp0(rule.f_, rule.e_); + } else { + p = it->second.prob(rule, rp0(rule.f_, rule.e_)); + } + return p; + } + + double LogLikelihood(const double& dd, const double& aa) const { + if (aa <= -dd) return -std::numeric_limits::infinity(); + //double llh = Md::log_beta_density(dd, 10, 3) + Md::log_gamma_density(aa, 1, 1); + double llh = //Md::log_beta_density(dd, 1, 1) + + Md::log_gamma_density(dd + aa, 1, 1); + typename std::tr1::unordered_map, CCRP_NoTable, boost::hash > >::const_iterator it; + for (it = r.begin(); it != r.end(); ++it) + llh += it->second.log_crp_prob(aa); + return llh; + } + + struct AlphaResampler { + AlphaResampler(const TransliterationChunkConditionalModel& m) : m_(m) {} + const TransliterationChunkConditionalModel& m_; + double operator()(const double& proposed_strength) const { + return m_.LogLikelihood(m_.d, proposed_strength); + } + }; + + void ResampleHyperparameters(MT19937* rng) { + typename std::tr1::unordered_map, CCRP_NoTable, boost::hash > >::iterator it; + //const unsigned nloop = 5; + const unsigned niterations = 10; + //DiscountResampler dr(*this); + AlphaResampler ar(*this); +#if 0 + for (int iter = 0; iter < nloop; ++iter) { + strength = slice_sampler1d(ar, strength, *rng, -d + std::numeric_limits::min(), + std::numeric_limits::infinity(), 0.0, niterations, 100*niterations); + double min_discount = std::numeric_limits::min(); + if (strength < 0.0) min_discount -= strength; + d = slice_sampler1d(dr, d, *rng, min_discount, + 1.0, 0.0, niterations, 100*niterations); + } +#endif + strength = slice_sampler1d(ar, strength, *rng, -d, + std::numeric_limits::infinity(), 0.0, niterations, 100*niterations); + std::cerr << "CTMModel(alpha=" << strength << ") = " << LogLikelihood(d, strength) << std::endl; + for (it = r.begin(); it != r.end(); ++it) { +#if 0 + it->second.set_discount(d); +#endif + it->second.set_alpha(strength); + } + } + + prob_t Likelihood() const { + prob_t p; p.logeq(LogLikelihood(d, strength)); + return p; + } + + const CondBaseDist& rp0; + typedef std::tr1::unordered_map, + CCRP_NoTable, + boost::hash > > RuleModelHash; + RuleModelHash r; + double d, strength; +}; + struct GraphStructure { GraphStructure() : r() {} // leak memory - these are basically static @@ -20,9 +179,9 @@ struct GraphStructure { bool IsReachable() const { return r->nodes > 0; } }; -struct BackwardEstimates { - BackwardEstimates() : gs(), backward() {} - explicit BackwardEstimates(const GraphStructure& g) : +struct ProbabilityEstimates { + ProbabilityEstimates() : gs(), backward() {} + explicit ProbabilityEstimates(const GraphStructure& g) : gs(&g), backward() { if (g.r->nodes > 0) backward = new float[g.r->nodes]; @@ -36,24 +195,32 @@ struct BackwardEstimates { } // returns an backward estimate - double operator()(int src_covered, int trg_covered) const { + double Backward(int src_covered, int trg_covered) const { if (!backward) return 0; int ind = gs->r->node_addresses[src_covered][trg_covered]; if (ind < 0) return 0; return backward[ind]; } + + prob_t estp; + float* backward; private: const GraphStructure* gs; - float* backward; }; struct TransliterationsImpl { - TransliterationsImpl(int max_src, int max_trg, double fr) : + TransliterationsImpl(int max_src, int max_trg, double sr, const BackwardEstimator& b) : + cp0(max_src, max_trg, sr), + tccm(cp0), + be(b), kMAX_SRC_CHUNK(max_src), kMAX_TRG_CHUNK(max_trg), - kFILTER_RATIO(fr), + kS2T_RATIO(sr), tot_pairs(), tot_mem() { } + const CondBaseDist cp0; + TransliterationChunkConditionalModel tccm; + const BackwardEstimator& be; void Initialize(WordID src, const vector& src_lets, WordID trg, const vector& trg_lets) { const size_t src_len = src_lets.size(); @@ -63,20 +230,29 @@ struct TransliterationsImpl { if (src_len >= graphs.size()) graphs.resize(src_len + 1); if (trg_len >= graphs[src_len].size()) graphs[src_len].resize(trg_len + 1); GraphStructure& gs = graphs[src_len][trg_len]; - if (!gs.r) - gs.r = new Reachability(src_len, trg_len, kMAX_SRC_CHUNK, kMAX_TRG_CHUNK, kFILTER_RATIO); + if (!gs.r) { + double rat = exp(fabs(log(trg_len / (src_len * kS2T_RATIO)))); + if (rat > 1.5 || (rat > 2.4 && src_len < 6)) { + cerr << " ** Forbidding transliterations of size " << src_len << "," << trg_len << ": " << rat << endl; + gs.r = new Reachability(src_len, trg_len, 0, 0); + } else { + gs.r = new Reachability(src_len, trg_len, kMAX_SRC_CHUNK, kMAX_TRG_CHUNK); + } + } + const Reachability& r = *gs.r; // init backward estimates - if (src >= bes.size()) bes.resize(src + 1); - unordered_map::iterator it = bes[src].find(trg); - if (it != bes[src].end()) return; // already initialized + if (src >= ests.size()) ests.resize(src + 1); + unordered_map::iterator it = ests[src].find(trg); + if (it != ests[src].end()) return; // already initialized - it = bes[src].insert(make_pair(trg, BackwardEstimates(gs))).first; - BackwardEstimates& b = it->second; + it = ests[src].insert(make_pair(trg, ProbabilityEstimates(gs))).first; + ProbabilityEstimates& est = it->second; if (!gs.r->nodes) return; // not derivable subject to length constraints - // TODO + be.InitializeGrid(src_lets, trg_lets, r, kS2T_RATIO, est.backward); + cerr << TD::GetString(src_lets) << " ||| " << TD::GetString(trg_lets) << " ||| " << (est.backward[0] / trg_lets.size()) << endl; tot_pairs++; tot_mem += sizeof(float) * gs.r->nodes; } @@ -92,8 +268,11 @@ struct TransliterationsImpl { const vector& tv = graphs[src.size()]; assert(trg.size() < tv.size()); const GraphStructure& gs = tv[trg.size()]; - // TODO: do prob - return prob_t::Zero(); + if (gs.r->nodes == 0) + return prob_t::Zero(); + const unordered_map::const_iterator it = ests[s].find(t); + assert(it != ests[s].end()); + return it->second.estp; } void GraphSummary() const { @@ -126,15 +305,15 @@ struct TransliterationsImpl { const int kMAX_SRC_CHUNK; const int kMAX_TRG_CHUNK; - const double kFILTER_RATIO; + const double kS2T_RATIO; unsigned tot_pairs; size_t tot_mem; vector > graphs; // graphs[src_len][trg_len] - vector > bes; // bes[src][trg] + vector > ests; // ests[src][trg] }; -Transliterations::Transliterations(int max_src, int max_trg, double fr) : - pimpl_(new TransliterationsImpl(max_src, max_trg, fr)) {} +Transliterations::Transliterations(int max_src, int max_trg, double sr, const BackwardEstimator& be) : + pimpl_(new TransliterationsImpl(max_src, max_trg, sr, be)) {} Transliterations::~Transliterations() { delete pimpl_; } void Transliterations::Initialize(WordID src, const vector& src_lets, WordID trg, const vector& trg_lets) { diff --git a/gi/pf/transliterations.h b/gi/pf/transliterations.h index ea9f9d3f..49d14684 100644 --- a/gi/pf/transliterations.h +++ b/gi/pf/transliterations.h @@ -5,11 +5,12 @@ #include "wordid.h" #include "prob.h" +struct BackwardEstimator; struct TransliterationsImpl; struct Transliterations { // max_src and max_trg indicate how big the transliteration phrases can be // see reachability.h for information about filter_ratio - explicit Transliterations(int max_src, int max_trg, double filter_ratio); + explicit Transliterations(int max_src, int max_trg, double s2t_rat, const BackwardEstimator& be); ~Transliterations(); void Initialize(WordID src, const std::vector& src_lets, WordID trg, const std::vector& trg_lets); void Forbid(WordID src, const std::vector& src_lets, WordID trg, const std::vector& trg_lets); diff --git a/utils/ccrp_nt.h b/utils/ccrp_nt.h index 79321493..6efbfc78 100644 --- a/utils/ccrp_nt.h +++ b/utils/ccrp_nt.h @@ -11,6 +11,7 @@ #include #include "sampler.h" #include "slice_sampler.h" +#include "m.h" // Chinese restaurant process (1 parameter) template > @@ -29,6 +30,7 @@ class CCRP_NoTable { alpha_prior_rate_(c_rate) {} double alpha() const { return alpha_; } + void set_alpha(const double& alpha) { alpha_ = alpha; assert(alpha_ > 0.0); } bool has_alpha_prior() const { return !std::isnan(alpha_prior_shape_); @@ -71,9 +73,10 @@ class CCRP_NoTable { return table_diff; } - double prob(const Dish& dish, const double& p0) const { + template + F prob(const Dish& dish, const F& p0) const { const unsigned at_table = num_customers(dish); - return (at_table + p0 * alpha_) / (num_customers_ + alpha_); + return (F(at_table) + p0 * F(alpha_)) / F(num_customers_ + alpha_); } double logprob(const Dish& dish, const double& logp0) const { @@ -85,20 +88,12 @@ class CCRP_NoTable { return log_crp_prob(alpha_); } - static double log_gamma_density(const double& x, const double& shape, const double& rate) { - assert(x >= 0.0); - assert(shape > 0.0); - assert(rate > 0.0); - const double lp = (shape-1)*log(x) - shape*log(rate) - x/rate - lgamma(shape); - return lp; - } - // taken from http://en.wikipedia.org/wiki/Chinese_restaurant_process // does not include P_0's double log_crp_prob(const double& alpha) const { double lp = 0.0; if (has_alpha_prior()) - lp += log_gamma_density(alpha, alpha_prior_shape_, alpha_prior_rate_); + lp += Md::log_gamma_density(alpha, alpha_prior_shape_, alpha_prior_rate_); assert(lp <= 0.0); if (num_customers_) { lp += lgamma(alpha) - lgamma(alpha + num_customers_) + -- cgit v1.2.3 From 600ff8e60086c5cc197fe302bfcea113ebd15565 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Fri, 9 Mar 2012 22:27:12 -0500 Subject: forgotten file --- gi/pf/os_phrase.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100644 gi/pf/os_phrase.h (limited to 'gi/pf') diff --git a/gi/pf/os_phrase.h b/gi/pf/os_phrase.h new file mode 100644 index 00000000..dfe40cb1 --- /dev/null +++ b/gi/pf/os_phrase.h @@ -0,0 +1,15 @@ +#ifndef _OS_PHRASE_H_ +#define _OS_PHRASE_H_ + +#include +#include +#include "tdict.h" + +inline std::ostream& operator<<(std::ostream& os, const std::vector& p) { + os << '['; + for (int i = 0; i < p.size(); ++i) + os << (i==0 ? "" : " ") << TD::Convert(p[i]); + return os << ']'; +} + +#endif -- cgit v1.2.3 From ef614a1d968aebbf463ed57876fee395b4c24635 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Fri, 9 Mar 2012 23:13:09 -0500 Subject: logging after alignment --- gi/pf/align-lexonly-pyp.cc | 1 + gi/pf/pyp_tm.cc | 7 +++++-- gi/pf/pyp_word_model.h | 2 +- utils/ccrp.h | 1 + 4 files changed, 8 insertions(+), 3 deletions(-) (limited to 'gi/pf') diff --git a/gi/pf/align-lexonly-pyp.cc b/gi/pf/align-lexonly-pyp.cc index d68a4b8f..4a1d1db6 100644 --- a/gi/pf/align-lexonly-pyp.cc +++ b/gi/pf/align-lexonly-pyp.cc @@ -208,6 +208,7 @@ int main(int argc, char** argv) { } for (unsigned i = 0; i < corpus.size(); ++i) WriteAlignments(corpus[i]); + aligner.model.Summary(); return 0; } diff --git a/gi/pf/pyp_tm.cc b/gi/pf/pyp_tm.cc index 94cbe7c3..b5262f47 100644 --- a/gi/pf/pyp_tm.cc +++ b/gi/pf/pyp_tm.cc @@ -54,8 +54,6 @@ struct ConditionalPYPWordModel { assert(it != r.end()); if (it->second.decrement(trglets, rng)) { base.Decrement(trglets, rng); - if (it->second.num_customers() == 0) - r.erase(it); } } @@ -84,6 +82,11 @@ PYPLexicalTranslation::PYPLexicalTranslation(const vector >& lets tmodel(new ConditionalPYPWordModel(up0)), kX(-TD::Convert("X")) {} +void PYPLexicalTranslation::Summary() const { + tmodel->Summary(); + up0->Summary(); +} + prob_t PYPLexicalTranslation::Likelihood() const { prob_t p = up0->Likelihood(); p *= tmodel->Likelihood(); diff --git a/gi/pf/pyp_word_model.h b/gi/pf/pyp_word_model.h index 800a4fd7..ff366865 100644 --- a/gi/pf/pyp_word_model.h +++ b/gi/pf/pyp_word_model.h @@ -12,7 +12,7 @@ // PYP(d,s,poisson-uniform) represented as a CRP struct PYPWordModel { - explicit PYPWordModel(const unsigned vocab_e_size, const double mean_len = 7.5) : + explicit PYPWordModel(const unsigned vocab_e_size, const double mean_len = 5) : base(prob_t::One()), r(1,1,1,1,0.66,50.0), u0(-std::log(vocab_e_size)), mean_length(mean_len) {} void ResampleHyperparameters(MT19937* rng); diff --git a/utils/ccrp.h b/utils/ccrp.h index 439d7e1e..4a8b80e7 100644 --- a/utils/ccrp.h +++ b/utils/ccrp.h @@ -221,6 +221,7 @@ class CCRP { void resample_hyperparameters(MT19937* rng, const unsigned nloop = 5, const unsigned niterations = 10) { assert(has_discount_prior() || has_strength_prior()); + if (num_customers() == 0) return; DiscountResampler dr(*this); StrengthResampler sr(*this); for (int iter = 0; iter < nloop; ++iter) { -- cgit v1.2.3 From e320dd47380f8f3a628073f926a56e4321146ebd Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Sat, 10 Mar 2012 00:00:27 -0500 Subject: use quasi model 2 instead of uniform alignments --- gi/pf/align-lexonly-pyp.cc | 6 ++++++ gi/pf/quasi_model2.h | 46 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 52 insertions(+) create mode 100644 gi/pf/quasi_model2.h (limited to 'gi/pf') diff --git a/gi/pf/align-lexonly-pyp.cc b/gi/pf/align-lexonly-pyp.cc index 4a1d1db6..0c90b6ce 100644 --- a/gi/pf/align-lexonly-pyp.cc +++ b/gi/pf/align-lexonly-pyp.cc @@ -11,6 +11,7 @@ #include "sampler.h" #include "corpus.h" #include "pyp_tm.h" +#include "quasi_model2.h" using namespace std; namespace po = boost::program_options; @@ -61,12 +62,14 @@ struct Aligner { Aligner(const vector >& lets, int num_letters, vector* c) : corpus(*c), model(lets, num_letters), + paj(4, 0.08), kNULL(TD::Convert("NULL")) { assert(lets[kNULL].size() == 0); } vector& corpus; PYPLexicalTranslation model; + const QuasiModel2 paj; const WordID kNULL; void ResampleHyperparameters() { @@ -83,6 +86,7 @@ struct Aligner { a_j = prng->next() * (1 + asp.src.size()); const WordID f_a_j = (a_j ? asp.src[a_j - 1] : kNULL); model.Increment(f_a_j, asp.trg[j], &*prng); + // TODO factor in alignment prob } } cerr << "Corpus intialized randomly. LLH = " << model.Likelihood() << endl; @@ -101,6 +105,8 @@ struct Aligner { for (unsigned prop_a_j = 0; prop_a_j <= asp.src.size(); ++prop_a_j) { const WordID prop_f = (prop_a_j ? asp.src[prop_a_j - 1] : kNULL); ss[prop_a_j] = model.Prob(prop_f, e_j); + // TODO configurable + ss[prop_a_j] *= paj.Pa_j(prop_a_j, j, asp.src.size(), asp.trg.size()); } a_j = prng->SelectSample(ss); f_a_j = (a_j ? asp.src[a_j - 1] : kNULL); diff --git a/gi/pf/quasi_model2.h b/gi/pf/quasi_model2.h new file mode 100644 index 00000000..0095289f --- /dev/null +++ b/gi/pf/quasi_model2.h @@ -0,0 +1,46 @@ +#ifndef _QUASI_MODEL2_H_ +#define _QUASI_MODEL2_H_ + +#include +#include +#include "prob.h" +#include "array2d.h" + +struct QuasiModel2 { + explicit QuasiModel2(double alpha, double pnull = 0.1) : + alpha_(alpha), + pnull_(pnull), + pnotnull_(1 - pnull), + z_(1000,1000) {} + // a_j = 0 => NULL; src_len does *not* include null + prob_t Pa_j(unsigned a_j, unsigned j, unsigned src_len, unsigned trg_len) const { + if (!a_j) return pnull_; + std::vector& zv = z_(src_len, trg_len); + if (zv.size() == 0) + zv.resize(trg_len); + + prob_t& z = zv[j]; + if (z.is_0()) z = ComputeZ(j, src_len, trg_len); + + prob_t p; + p.logeq(-fabs(double(a_j - 1) / src_len - double(j) / trg_len) * alpha_); + p *= pnotnull_; + p /= z; + return p; + } + private: + prob_t ComputeZ(unsigned j, unsigned src_len, unsigned trg_len) const { + prob_t p, z = prob_t::Zero(); + for (int a_j = 1; a_j <= src_len; ++a_j) { + p.logeq(-fabs(double(a_j - 1) / src_len - double(j) / trg_len) * alpha_); + z += p; + } + return z; + } + double alpha_; + const prob_t pnull_; + const prob_t pnotnull_; + mutable Array2D > z_; +}; + +#endif -- cgit v1.2.3 From de136247bdedb960dc0f317cd65b28c02a441532 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Sat, 10 Mar 2012 01:08:23 -0500 Subject: tie params --- gi/pf/pyp_lm.cc | 66 +++++++++------------------------------- gi/pf/pyp_tm.cc | 2 ++ gi/pf/tied_resampler.h | 82 ++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 99 insertions(+), 51 deletions(-) create mode 100644 gi/pf/tied_resampler.h (limited to 'gi/pf') diff --git a/gi/pf/pyp_lm.cc b/gi/pf/pyp_lm.cc index 52e6be2c..85635b8f 100644 --- a/gi/pf/pyp_lm.cc +++ b/gi/pf/pyp_lm.cc @@ -11,6 +11,7 @@ #include "tdict.h" #include "sampler.h" #include "ccrp.h" +#include "tied_resampler.h" // A not very memory-efficient implementation of an N-gram LM based on PYPs // as described in Y.-W. Teh. (2006) A Hierarchical Bayesian Language Model @@ -66,7 +67,7 @@ template<> struct PYPLM<0> { void increment(WordID, const vector&, MT19937*) { ++draws; } void decrement(WordID, const vector&, MT19937*) { --draws; assert(draws >= 0); } double prob(WordID, const vector&) const { return p0; } - void resample_hyperparameters(MT19937*, const unsigned, const unsigned) {} + void resample_hyperparameters(MT19937*) {} double log_likelihood() const { return draws * log(p0); } const double p0; int draws; @@ -76,16 +77,17 @@ template<> struct PYPLM<0> { template struct PYPLM { PYPLM(unsigned vs, double da, double db, double ss, double sr) : backoff(vs, da, db, ss, sr), - discount_a(da), discount_b(db), - strength_s(ss), strength_r(sr), - d(0.8), strength(1.0), lookup(N-1) {} + tr(da, db, ss, sr, 0.8, 1.0), + lookup(N-1) {} void increment(WordID w, const vector& context, MT19937* rng) { const double bo = backoff.prob(w, context); for (unsigned i = 0; i < N-1; ++i) lookup[i] = context[context.size() - 1 - i]; typename unordered_map, CCRP, boost::hash > >::iterator it = p.find(lookup); - if (it == p.end()) - it = p.insert(make_pair(lookup, CCRP(d,strength))).first; + if (it == p.end()) { + it = p.insert(make_pair(lookup, CCRP(0.5,1))).first; + tr.Add(&it->second); // add to resampler + } if (it->second.increment(w, bo, rng)) backoff.increment(w, context, rng); } @@ -107,59 +109,21 @@ template struct PYPLM { } double log_likelihood() const { - return log_likelihood(d, strength) + backoff.log_likelihood(); - } - - double log_likelihood(const double& dd, const double& aa) const { - if (aa <= -dd) return -std::numeric_limits::infinity(); - //double llh = Md::log_beta_density(dd, 10, 3) + Md::log_gamma_density(aa, 1, 1); - double llh = Md::log_beta_density(dd, discount_a, discount_b) + - Md::log_gamma_density(aa + dd, strength_s, strength_r); + double llh = backoff.log_likelihood(); typename unordered_map, CCRP, boost::hash > >::const_iterator it; for (it = p.begin(); it != p.end(); ++it) - llh += it->second.log_crp_prob(dd, aa); + llh += it->second.log_crp_prob(); + // TODO parametric likelihood from TiedResampler return llh; } - struct DiscountResampler { - DiscountResampler(const PYPLM& m) : m_(m) {} - const PYPLM& m_; - double operator()(const double& proposed_discount) const { - return m_.log_likelihood(proposed_discount, m_.strength); - } - }; - - struct AlphaResampler { - AlphaResampler(const PYPLM& m) : m_(m) {} - const PYPLM& m_; - double operator()(const double& proposed_strength) const { - return m_.log_likelihood(m_.d, proposed_strength); - } - }; - - void resample_hyperparameters(MT19937* rng, const unsigned nloop = 5, const unsigned niterations = 10) { - DiscountResampler dr(*this); - AlphaResampler ar(*this); - for (int iter = 0; iter < nloop; ++iter) { - strength = slice_sampler1d(ar, strength, *rng, -d + std::numeric_limits::min(), - std::numeric_limits::infinity(), 0.0, niterations, 100*niterations); - double min_discount = std::numeric_limits::min(); - if (strength < 0.0) min_discount -= strength; - d = slice_sampler1d(dr, d, *rng, min_discount, - 1.0, 0.0, niterations, 100*niterations); - } - strength = slice_sampler1d(ar, strength, *rng, -d + std::numeric_limits::min(), - std::numeric_limits::infinity(), 0.0, niterations, 100*niterations); - typename unordered_map, CCRP, boost::hash > >::iterator it; - cerr << "PYPLM<" << N << ">(d=" << d << ",a=" << strength << ") = " << log_likelihood(d, strength) << endl; - for (it = p.begin(); it != p.end(); ++it) { - it->second.set_discount(d); - it->second.set_strength(strength); - } - backoff.resample_hyperparameters(rng, nloop, niterations); + void resample_hyperparameters(MT19937* rng) { + tr.ResampleHyperparameters(rng); + backoff.resample_hyperparameters(rng); } PYPLM backoff; + TiedResampler > tr; double discount_a, discount_b, strength_s, strength_r; double d, strength; mutable vector lookup; // thread-local diff --git a/gi/pf/pyp_tm.cc b/gi/pf/pyp_tm.cc index b5262f47..73104fe9 100644 --- a/gi/pf/pyp_tm.cc +++ b/gi/pf/pyp_tm.cc @@ -11,6 +11,8 @@ #include "ccrp.h" #include "pyp_word_model.h" +#include "tied_resampler.h" + using namespace std; using namespace std::tr1; diff --git a/gi/pf/tied_resampler.h b/gi/pf/tied_resampler.h new file mode 100644 index 00000000..208fb9c7 --- /dev/null +++ b/gi/pf/tied_resampler.h @@ -0,0 +1,82 @@ +#ifndef _TIED_RESAMPLER_H_ +#define _TIED_RESAMPLER_H_ + +#include +#include "sampler.h" +#include "slice_sampler.h" +#include "m.h" + +template +struct TiedResampler { + explicit TiedResampler(double da, double db, double ss, double sr, double d=0.5, double s=1.0) : + d_alpha(da), + d_beta(db), + s_shape(ss), + s_rate(sr), + discount(d), + strength(s) {} + + void Add(CRP* crp) { + crps.insert(crp); + crp->set_discount(discount); + crp->set_strength(strength); + assert(!crp->has_discount_prior()); + assert(!crp->has_strength_prior()); + } + + void Remove(CRP* crp) { + crps.erase(crp); + } + + double LogLikelihood(double d, double s) const { + if (s <= -d) return -std::numeric_limits::infinity(); + double llh = Md::log_beta_density(d, d_alpha, d_beta) + + Md::log_gamma_density(d + s, s_shape, s_rate); + for (typename std::set::iterator it = crps.begin(); it != crps.end(); ++it) + llh += (*it)->log_crp_prob(d, s); + return llh; + } + + struct DiscountResampler { + DiscountResampler(const TiedResampler& m) : m_(m) {} + const TiedResampler& m_; + double operator()(const double& proposed_discount) const { + return m_.LogLikelihood(proposed_discount, m_.strength); + } + }; + + struct AlphaResampler { + AlphaResampler(const TiedResampler& m) : m_(m) {} + const TiedResampler& m_; + double operator()(const double& proposed_strength) const { + return m_.LogLikelihood(m_.discount, proposed_strength); + } + }; + + void ResampleHyperparameters(MT19937* rng, const unsigned nloop = 5, const unsigned niterations = 10) { + const DiscountResampler dr(*this); + const AlphaResampler ar(*this); + for (int iter = 0; iter < nloop; ++iter) { + strength = slice_sampler1d(ar, strength, *rng, -discount + std::numeric_limits::min(), + std::numeric_limits::infinity(), 0.0, niterations, 100*niterations); + double min_discount = std::numeric_limits::min(); + if (strength < 0.0) min_discount -= strength; + discount = slice_sampler1d(dr, discount, *rng, min_discount, + 1.0, 0.0, niterations, 100*niterations); + } + strength = slice_sampler1d(ar, strength, *rng, -discount + std::numeric_limits::min(), + std::numeric_limits::infinity(), 0.0, niterations, 100*niterations); + std::cerr << "TiedCRPs(d=" << discount << ",s=" + << strength << ") = " << LogLikelihood(discount, strength) << std::endl; + for (typename std::set::iterator it = crps.begin(); it != crps.end(); ++it) { + (*it)->set_discount(discount); + (*it)->set_strength(strength); + } + } + private: + std::set crps; + const double d_alpha, d_beta, s_shape, s_rate; + double discount, strength; +}; + +#endif -- cgit v1.2.3 From 38f28be7cd2bada87ebad78994e3c938e10c2cce Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Sat, 10 Mar 2012 12:56:15 -0500 Subject: ready to infer alignment parameters --- gi/pf/Makefile.am | 4 +- gi/pf/align-lexonly-pyp.cc | 22 ++- gi/pf/align-lexonly.cc | 332 --------------------------------------------- gi/pf/pyp_tm.cc | 6 +- gi/pf/quasi_model2.h | 115 ++++++++++++---- gi/pf/tied_resampler.h | 31 +++++ 6 files changed, 143 insertions(+), 367 deletions(-) delete mode 100644 gi/pf/align-lexonly.cc (limited to 'gi/pf') diff --git a/gi/pf/Makefile.am b/gi/pf/Makefile.am index 4ce72ba1..f9c979d0 100644 --- a/gi/pf/Makefile.am +++ b/gi/pf/Makefile.am @@ -1,4 +1,4 @@ -bin_PROGRAMS = cbgi brat dpnaive pfbrat pfdist itg pfnaive condnaive align-lexonly align-lexonly-pyp learn_cfg pyp_lm nuisance_test align-tl +bin_PROGRAMS = cbgi brat dpnaive pfbrat pfdist itg pfnaive condnaive align-lexonly-pyp learn_cfg pyp_lm nuisance_test align-tl noinst_LIBRARIES = libpf.a @@ -7,8 +7,6 @@ libpf_a_SOURCES = base_distributions.cc reachability.cc cfg_wfst_composer.cc cor nuisance_test_SOURCES = nuisance_test.cc nuisance_test_LDADD = libpf.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a -lz -align_lexonly_SOURCES = align-lexonly.cc - align_lexonly_pyp_SOURCES = align-lexonly-pyp.cc align_lexonly_pyp_LDADD = libpf.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a -lz diff --git a/gi/pf/align-lexonly-pyp.cc b/gi/pf/align-lexonly-pyp.cc index 0c90b6ce..68cb9192 100644 --- a/gi/pf/align-lexonly-pyp.cc +++ b/gi/pf/align-lexonly-pyp.cc @@ -61,15 +61,15 @@ struct AlignedSentencePair { struct Aligner { Aligner(const vector >& lets, int num_letters, vector* c) : corpus(*c), + paj_model(4, 0.08), model(lets, num_letters), - paj(4, 0.08), kNULL(TD::Convert("NULL")) { assert(lets[kNULL].size() == 0); } vector& corpus; + QuasiModel2 paj_model; PYPLexicalTranslation model; - const QuasiModel2 paj; const WordID kNULL; void ResampleHyperparameters() { @@ -86,10 +86,12 @@ struct Aligner { a_j = prng->next() * (1 + asp.src.size()); const WordID f_a_j = (a_j ? asp.src[a_j - 1] : kNULL); model.Increment(f_a_j, asp.trg[j], &*prng); - // TODO factor in alignment prob + paj_model.Increment(a_j, j, asp.src.size(), asp.trg.size()); } } - cerr << "Corpus intialized randomly. LLH = " << model.Likelihood() << endl; + cerr << "Corpus intialized randomly." << endl; + cerr << "LLH = " << Likelihood() << " \t(Amodel=" << paj_model.Likelihood() + << " TModel=" << model.Likelihood() << ") contexts=" << model.UniqueConditioningContexts() << endl; } void ResampleCorpus() { @@ -101,19 +103,25 @@ struct Aligner { const WordID e_j = asp.trg[j]; WordID f_a_j = (a_j ? asp.src[a_j - 1] : kNULL); model.Decrement(f_a_j, e_j, prng); + paj_model.Decrement(a_j, j, asp.src.size(), asp.trg.size()); for (unsigned prop_a_j = 0; prop_a_j <= asp.src.size(); ++prop_a_j) { const WordID prop_f = (prop_a_j ? asp.src[prop_a_j - 1] : kNULL); ss[prop_a_j] = model.Prob(prop_f, e_j); - // TODO configurable - ss[prop_a_j] *= paj.Pa_j(prop_a_j, j, asp.src.size(), asp.trg.size()); + ss[prop_a_j] *= paj_model.Prob(prop_a_j, j, asp.src.size(), asp.trg.size()); } a_j = prng->SelectSample(ss); f_a_j = (a_j ? asp.src[a_j - 1] : kNULL); model.Increment(f_a_j, e_j, prng); + paj_model.Increment(a_j, j, asp.src.size(), asp.trg.size()); } } - cerr << "LLH = " << model.Likelihood() << " " << model.UniqueConditioningContexts() << endl; + cerr << "LLH = " << Likelihood() << " \t(Amodel=" << paj_model.Likelihood() + << " TModel=" << model.Likelihood() << ") contexts=" << model.UniqueConditioningContexts() << endl; + } + + prob_t Likelihood() const { + return model.Likelihood() * paj_model.Likelihood(); } }; diff --git a/gi/pf/align-lexonly.cc b/gi/pf/align-lexonly.cc deleted file mode 100644 index dbc9dc07..00000000 --- a/gi/pf/align-lexonly.cc +++ /dev/null @@ -1,332 +0,0 @@ -#include -#include -#include - -#include -#include -#include - -#include "array2d.h" -#include "base_distributions.h" -#include "monotonic_pseg.h" -#include "conditional_pseg.h" -#include "trule.h" -#include "tdict.h" -#include "stringlib.h" -#include "filelib.h" -#include "dict.h" -#include "sampler.h" -#include "ccrp_nt.h" -#include "corpus.h" -#include "ngram_base.h" - -using namespace std; -using namespace tr1; -namespace po = boost::program_options; - -void InitCommandLine(int argc, char** argv, po::variables_map* conf) { - po::options_description opts("Configuration options"); - opts.add_options() - ("samples,s",po::value()->default_value(1000),"Number of samples") - ("input,i",po::value(),"Read parallel data from") - ("random_seed,S",po::value(), "Random seed"); - po::options_description clo("Command line options"); - clo.add_options() - ("config", po::value(), "Configuration file") - ("help,h", "Print this help message and exit"); - po::options_description dconfig_options, dcmdline_options; - dconfig_options.add(opts); - dcmdline_options.add(opts).add(clo); - - po::store(parse_command_line(argc, argv, dcmdline_options), *conf); - if (conf->count("config")) { - ifstream config((*conf)["config"].as().c_str()); - po::store(po::parse_config_file(config, dconfig_options), *conf); - } - po::notify(*conf); - - if (conf->count("help") || (conf->count("input") == 0)) { - cerr << dcmdline_options << endl; - exit(1); - } -} - -shared_ptr prng; - -struct LexicalAlignment { - unsigned char src_index; - bool is_transliteration; - vector > derivation; -}; - -struct AlignedSentencePair { - vector src; - vector trg; - vector a; - Array2D posterior; -}; - -struct HierarchicalWordBase { - explicit HierarchicalWordBase(const unsigned vocab_e_size) : - base(prob_t::One()), r(25,25,10), u0(-log(vocab_e_size)) {} - - void ResampleHyperparameters(MT19937* rng) { - r.resample_hyperparameters(rng); - } - - inline double logp0(const vector& s) const { - return s.size() * u0; - } - - // return p0 of rule.e_ - prob_t operator()(const TRule& rule) const { - prob_t p; p.logeq(r.logprob(rule.e_, logp0(rule.e_))); - return p; - } - - void Increment(const TRule& rule) { - if (r.increment(rule.e_)) { - prob_t p; p.logeq(logp0(rule.e_)); - base *= p; - } - } - - void Decrement(const TRule& rule) { - if (r.decrement(rule.e_)) { - prob_t p; p.logeq(logp0(rule.e_)); - base /= p; - } - } - - prob_t Likelihood() const { - prob_t p; p.logeq(r.log_crp_prob()); - p *= base; - return p; - } - - void Summary() const { - cerr << "NUMBER OF CUSTOMERS: " << r.num_customers() << " (\\alpha=" << r.alpha() << ')' << endl; - for (CCRP_NoTable >::const_iterator it = r.begin(); it != r.end(); ++it) - cerr << " " << it->second << '\t' << TD::GetString(it->first) << endl; - } - - prob_t base; - CCRP_NoTable > r; - const double u0; -}; - -struct BasicLexicalAlignment { - explicit BasicLexicalAlignment(const vector >& lets, - const unsigned words_e, - const unsigned letters_e, - vector* corp) : - letters(lets), - corpus(*corp), - up0("fr-en.10k.translit-base.txt.gz"), - //up0(words_e), - //up0("en.chars.1gram", letters_e), - //up0("en.words.1gram"), - //up0(letters_e), - //up0("en.chars.2gram"), - tmodel(up0) { - } - - void InstantiateRule(const WordID src, - const WordID trg, - TRule* rule) const { - static const WordID kX = TD::Convert("X") * -1; - rule->lhs_ = kX; - rule->e_ = letters[trg]; - rule->f_ = letters[src]; - } - - void InitializeRandom() { - const WordID kNULL = TD::Convert("NULL"); - cerr << "Initializing with random alignments ...\n"; - for (unsigned i = 0; i < corpus.size(); ++i) { - AlignedSentencePair& asp = corpus[i]; - asp.a.resize(asp.trg.size()); - for (unsigned j = 0; j < asp.trg.size(); ++j) { - const unsigned char a_j = prng->next() * (1 + asp.src.size()); - const WordID f_a_j = (a_j ? asp.src[a_j - 1] : kNULL); - TRule r; - InstantiateRule(f_a_j, asp.trg[j], &r); - asp.a[j].is_transliteration = false; - asp.a[j].src_index = a_j; - if (tmodel.IncrementRule(r)) - up0.Increment(r); - } - } - cerr << " LLH = " << Likelihood() << endl; - } - - prob_t Likelihood() const { - prob_t p = tmodel.Likelihood(); - p *= up0.Likelihood(); - return p; - } - - void ResampleHyperparemeters() { - cerr << " LLH_prev = " << Likelihood() << flush; - tmodel.ResampleHyperparameters(&*prng); - up0.ResampleHyperparameters(&*prng); - cerr << "\tLLH_post = " << Likelihood() << endl; - } - - void ResampleCorpus(); - - const vector >& letters; // spelling dictionary - vector& corpus; - //PhraseConditionalUninformativeBase up0; - //PhraseConditionalUninformativeUnigramBase up0; - //UnigramWordBase up0; - //HierarchicalUnigramBase up0; - TableLookupBase up0; - //HierarchicalWordBase up0; - //PoissonUniformUninformativeBase up0; - //CompletelyUniformBase up0; - //FixedNgramBase up0; - //ConditionalTranslationModel tmodel; - //ConditionalTranslationModel tmodel; - //ConditionalTranslationModel tmodel; - //ConditionalTranslationModel tmodel; - //ConditionalTranslationModel tmodel; - //ConditionalTranslationModel tmodel; - ConditionalTranslationModel tmodel; - //ConditionalTranslationModel tmodel; - //ConditionalTranslationModel tmodel; -}; - -void BasicLexicalAlignment::ResampleCorpus() { - static const WordID kNULL = TD::Convert("NULL"); - for (unsigned i = 0; i < corpus.size(); ++i) { - AlignedSentencePair& asp = corpus[i]; - SampleSet ss; ss.resize(asp.src.size() + 1); - for (unsigned j = 0; j < asp.trg.size(); ++j) { - TRule r; - unsigned char& a_j = asp.a[j].src_index; - WordID f_a_j = (a_j ? asp.src[a_j - 1] : kNULL); - InstantiateRule(f_a_j, asp.trg[j], &r); - if (tmodel.DecrementRule(r)) - up0.Decrement(r); - - for (unsigned prop_a_j = 0; prop_a_j <= asp.src.size(); ++prop_a_j) { - const WordID prop_f = (prop_a_j ? asp.src[prop_a_j - 1] : kNULL); - InstantiateRule(prop_f, asp.trg[j], &r); - ss[prop_a_j] = tmodel.RuleProbability(r); - } - a_j = prng->SelectSample(ss); - f_a_j = (a_j ? asp.src[a_j - 1] : kNULL); - InstantiateRule(f_a_j, asp.trg[j], &r); - if (tmodel.IncrementRule(r)) - up0.Increment(r); - } - } - cerr << " LLH = " << tmodel.Likelihood() << endl; -} - -void ExtractLetters(const set& v, vector >* l, set* letset = NULL) { - for (set::const_iterator it = v.begin(); it != v.end(); ++it) { - if (*it >= l->size()) { l->resize(*it + 1); } - vector& letters = (*l)[*it]; - if (letters.size()) continue; // if e and f have the same word - - const string& w = TD::Convert(*it); - - size_t cur = 0; - while (cur < w.size()) { - const size_t len = UTF8Len(w[cur]); - letters.push_back(TD::Convert(w.substr(cur, len))); - if (letset) letset->insert(letters.back()); - cur += len; - } - } -} - -void Debug(const AlignedSentencePair& asp) { - cerr << TD::GetString(asp.src) << endl << TD::GetString(asp.trg) << endl; - Array2D a(asp.src.size(), asp.trg.size()); - for (unsigned j = 0; j < asp.trg.size(); ++j) - if (asp.a[j].src_index) a(asp.a[j].src_index - 1, j) = true; - cerr << a << endl; -} - -void AddSample(AlignedSentencePair* asp) { - for (unsigned j = 0; j < asp->trg.size(); ++j) - asp->posterior(asp->a[j].src_index, j)++; -} - -void WriteAlignments(const AlignedSentencePair& asp) { - bool first = true; - for (unsigned j = 0; j < asp.trg.size(); ++j) { - int src_index = -1; - int mc = -1; - for (unsigned i = 0; i <= asp.src.size(); ++i) { - if (asp.posterior(i, j) > mc) { - mc = asp.posterior(i, j); - src_index = i; - } - } - - if (src_index) { - if (first) first = false; else cout << ' '; - cout << (src_index - 1) << '-' << j; - } - } - cout << endl; -} - -int main(int argc, char** argv) { - po::variables_map conf; - InitCommandLine(argc, argv, &conf); - - if (conf.count("random_seed")) - prng.reset(new MT19937(conf["random_seed"].as())); - else - prng.reset(new MT19937); -// MT19937& rng = *prng; - - vector > corpuse, corpusf; - set vocabe, vocabf; - corpus::ReadParallelCorpus(conf["input"].as(), &corpusf, &corpuse, &vocabf, &vocabe); - cerr << "f-Corpus size: " << corpusf.size() << " sentences\n"; - cerr << "f-Vocabulary size: " << vocabf.size() << " types\n"; - cerr << "f-Corpus size: " << corpuse.size() << " sentences\n"; - cerr << "f-Vocabulary size: " << vocabe.size() << " types\n"; - assert(corpusf.size() == corpuse.size()); - - vector corpus(corpuse.size()); - for (unsigned i = 0; i < corpuse.size(); ++i) { - corpus[i].src.swap(corpusf[i]); - corpus[i].trg.swap(corpuse[i]); - corpus[i].posterior.resize(corpus[i].src.size() + 1, corpus[i].trg.size()); - } - corpusf.clear(); corpuse.clear(); - - vocabf.insert(TD::Convert("NULL")); - vector > letters(TD::NumWords()); - set letset; - ExtractLetters(vocabe, &letters, &letset); - ExtractLetters(vocabf, &letters, NULL); - letters[TD::Convert("NULL")].clear(); - - BasicLexicalAlignment x(letters, vocabe.size(), letset.size(), &corpus); - x.InitializeRandom(); - const unsigned samples = conf["samples"].as(); - for (int i = 0; i < samples; ++i) { - for (int j = 395; j < 397; ++j) Debug(corpus[j]); - cerr << i << "\t" << x.tmodel.r.size() << "\t"; - if (i % 10 == 0) x.ResampleHyperparemeters(); - x.ResampleCorpus(); - if (i > (samples / 5) && (i % 10 == 9)) for (int j = 0; j < corpus.size(); ++j) AddSample(&corpus[j]); - } - for (unsigned i = 0; i < corpus.size(); ++i) - WriteAlignments(corpus[i]); - //ModelAndData posterior(x, &corpus, vocabe, vocabf); - x.tmodel.Summary(); - x.up0.Summary(); - - //posterior.Sample(); - - return 0; -} diff --git a/gi/pf/pyp_tm.cc b/gi/pf/pyp_tm.cc index 73104fe9..bf5a6497 100644 --- a/gi/pf/pyp_tm.cc +++ b/gi/pf/pyp_tm.cc @@ -10,7 +10,6 @@ #include "tdict.h" #include "ccrp.h" #include "pyp_word_model.h" - #include "tied_resampler.h" using namespace std; @@ -18,7 +17,7 @@ using namespace std::tr1; template struct ConditionalPYPWordModel { - ConditionalPYPWordModel(Base* b) : base(*b) {} + ConditionalPYPWordModel(Base* b) : base(*b), btr(3) {} void Summary() const { cerr << "Number of conditioning contexts: " << r.size() << endl; @@ -32,6 +31,7 @@ struct ConditionalPYPWordModel { void ResampleHyperparameters(MT19937* rng) { for (RuleModelHash::iterator it = r.begin(); it != r.end(); ++it) it->second.resample_hyperparameters(rng); + btr.ResampleHyperparameters(rng); } prob_t Prob(const WordID src, const vector& trglets) const { @@ -72,7 +72,9 @@ struct ConditionalPYPWordModel { return r.size(); } + // TODO tie PYP hyperparameters based on source word frequency bins Base& base; + BinTiedResampler > > btr; typedef unordered_map > > RuleModelHash; RuleModelHash r; }; diff --git a/gi/pf/quasi_model2.h b/gi/pf/quasi_model2.h index 0095289f..8ec0a400 100644 --- a/gi/pf/quasi_model2.h +++ b/gi/pf/quasi_model2.h @@ -3,44 +3,113 @@ #include #include +#include +#include "boost/functional.hpp" #include "prob.h" #include "array2d.h" +struct AlignmentObservation { + AlignmentObservation() : src_len(), trg_len(), j(), a_j() {} + AlignmentObservation(unsigned sl, unsigned tl, unsigned tw, unsigned sw) : + src_len(sl), trg_len(tl), j(tw), a_j(sw) {} + unsigned short src_len; + unsigned short trg_len; + unsigned short j; + unsigned short a_j; +}; + +inline size_t hash_value(const AlignmentObservation& o) { + return reinterpret_cast(o); +} + +inline bool operator==(const AlignmentObservation& a, const AlignmentObservation& b) { + return hash_value(a) == hash_value(b); +} + struct QuasiModel2 { explicit QuasiModel2(double alpha, double pnull = 0.1) : alpha_(alpha), pnull_(pnull), - pnotnull_(1 - pnull), - z_(1000,1000) {} + pnotnull_(1 - pnull) {} + // a_j = 0 => NULL; src_len does *not* include null - prob_t Pa_j(unsigned a_j, unsigned j, unsigned src_len, unsigned trg_len) const { + prob_t Prob(unsigned a_j, unsigned j, unsigned src_len, unsigned trg_len) const { if (!a_j) return pnull_; - std::vector& zv = z_(src_len, trg_len); - if (zv.size() == 0) - zv.resize(trg_len); - - prob_t& z = zv[j]; - if (z.is_0()) z = ComputeZ(j, src_len, trg_len); - - prob_t p; - p.logeq(-fabs(double(a_j - 1) / src_len - double(j) / trg_len) * alpha_); - p *= pnotnull_; - p /= z; + return pnotnull_ * + prob_t(UnnormalizedProb(a_j, j, src_len, trg_len, alpha_) / GetOrComputeZ(j, src_len, trg_len)); + } + + void Increment(unsigned a_j, unsigned j, unsigned src_len, unsigned trg_len) { + assert(a_j <= src_len); + assert(j < trg_len); + ++obs_[AlignmentObservation(src_len, trg_len, j, a_j)]; + } + + void Decrement(unsigned a_j, unsigned j, unsigned src_len, unsigned trg_len) { + const AlignmentObservation ao(src_len, trg_len, j, a_j); + int &cc = obs_[ao]; + assert(cc > 0); + --cc; + if (!cc) obs_.erase(ao); + } + + prob_t Likelihood() const { + return Likelihood(alpha_, pnull_.as_float()); + } + + prob_t Likelihood(double alpha, double ppnull) const { + const prob_t pnull(ppnull); + const prob_t pnotnull(1 - ppnull); + + prob_t p = prob_t::One(); + for (ObsCount::const_iterator it = obs_.begin(); it != obs_.end(); ++it) { + const AlignmentObservation& ao = it->first; + if (ao.a_j) { + double u = UnnormalizedProb(ao.a_j, ao.j, ao.src_len, ao.trg_len, alpha); + double z = ComputeZ(ao.j, ao.src_len, ao.trg_len, alpha); + prob_t pa(u / z); + pa *= pnotnull; + pa.poweq(it->second); + p *= pa; + } else { + p *= pnull.pow(it->second); + } + } return p; } + private: - prob_t ComputeZ(unsigned j, unsigned src_len, unsigned trg_len) const { - prob_t p, z = prob_t::Zero(); - for (int a_j = 1; a_j <= src_len; ++a_j) { - p.logeq(-fabs(double(a_j - 1) / src_len - double(j) / trg_len) * alpha_); - z += p; - } + static double UnnormalizedProb(unsigned a_j, unsigned j, unsigned src_len, unsigned trg_len, double alpha) { + return exp(-fabs(double(a_j - 1) / src_len - double(j) / trg_len) * alpha); + } + + static double ComputeZ(unsigned j, unsigned src_len, unsigned trg_len, double alpha) { + double z = 0; + for (int a_j = 1; a_j <= src_len; ++a_j) + z += UnnormalizedProb(a_j, j, src_len, trg_len, alpha); return z; } + + const double& GetOrComputeZ(unsigned j, unsigned src_len, unsigned trg_len) const { + if (src_len >= zcache_.size()) + zcache_.resize(src_len + 1); + if (trg_len >= zcache_[src_len].size()) + zcache_[src_len].resize(trg_len + 1); + std::vector& zv = zcache_[src_len][trg_len]; + if (zv.size() == 0) + zv.resize(trg_len); + double& z = zv[j]; + if (!z) + z = ComputeZ(j, src_len, trg_len, alpha_); + return z; + } + double alpha_; - const prob_t pnull_; - const prob_t pnotnull_; - mutable Array2D > z_; + prob_t pnull_; + prob_t pnotnull_; + mutable std::vector > > zcache_; + typedef std::tr1::unordered_map > ObsCount; + ObsCount obs_; }; #endif diff --git a/gi/pf/tied_resampler.h b/gi/pf/tied_resampler.h index 208fb9c7..5a262f9d 100644 --- a/gi/pf/tied_resampler.h +++ b/gi/pf/tied_resampler.h @@ -2,6 +2,7 @@ #define _TIED_RESAMPLER_H_ #include +#include #include "sampler.h" #include "slice_sampler.h" #include "m.h" @@ -28,6 +29,10 @@ struct TiedResampler { crps.erase(crp); } + size_t size() const { + return crps.size(); + } + double LogLikelihood(double d, double s) const { if (s <= -d) return -std::numeric_limits::infinity(); double llh = Md::log_beta_density(d, d_alpha, d_beta) + @@ -54,6 +59,7 @@ struct TiedResampler { }; void ResampleHyperparameters(MT19937* rng, const unsigned nloop = 5, const unsigned niterations = 10) { + if (size() == 0) { std::cerr << "EMPTY - not resampling\n"; return; } const DiscountResampler dr(*this); const AlphaResampler ar(*this); for (int iter = 0; iter < nloop; ++iter) { @@ -79,4 +85,29 @@ struct TiedResampler { double discount, strength; }; +// split according to some criterion +template +struct BinTiedResampler { + explicit BinTiedResampler(unsigned nbins) : + resamplers(nbins, TiedResampler(1,1,1,1)) {} + + void Add(unsigned bin, CRP* crp) { + resamplers[bin].Add(crp); + } + + void Remove(unsigned bin, CRP* crp) { + resamplers[bin].Remove(crp); + } + + void ResampleHyperparameters(MT19937* rng) { + for (unsigned i = 0; i < resamplers.size(); ++i) { + std::cerr << "BIN " << i << " (" << resamplers[i].size() << " CRPs): " << std::flush; + resamplers[i].ResampleHyperparameters(rng); + } + } + + private: + std::vector > resamplers; +}; + #endif -- cgit v1.2.3 From 289f96779e665ba24adca3461a624c68aa37bd99 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Sat, 10 Mar 2012 14:10:04 -0500 Subject: do Bayesian inference on quasimodel2 hyperparameters --- gi/pf/align-lexonly-pyp.cc | 5 ++-- gi/pf/pyp_lm.cc | 2 +- gi/pf/pyp_tm.cc | 11 +++++---- gi/pf/quasi_model2.h | 57 +++++++++++++++++++++++++++++++++++++++++++--- gi/pf/tied_resampler.h | 11 +++++++++ 5 files changed, 75 insertions(+), 11 deletions(-) (limited to 'gi/pf') diff --git a/gi/pf/align-lexonly-pyp.cc b/gi/pf/align-lexonly-pyp.cc index 68cb9192..6c054753 100644 --- a/gi/pf/align-lexonly-pyp.cc +++ b/gi/pf/align-lexonly-pyp.cc @@ -74,6 +74,7 @@ struct Aligner { void ResampleHyperparameters() { model.ResampleHyperparameters(prng); + paj_model.ResampleHyperparameters(prng); } void InitializeRandom() { @@ -216,9 +217,9 @@ int main(int argc, char** argv) { const unsigned samples = conf["samples"].as(); for (int i = 0; i < samples; ++i) { for (int j = 65; j < 67; ++j) Debug(corpus[j]); - if (i % 7 == 6) aligner.ResampleHyperparameters(); + if (i % 10 == 9) aligner.ResampleHyperparameters(); aligner.ResampleCorpus(); - if (i > (samples / 5) && (i % 10 == 9)) for (int j = 0; j < corpus.size(); ++j) AddSample(&corpus[j]); + if (i > (samples / 5) && (i % 6 == 5)) for (int j = 0; j < corpus.size(); ++j) AddSample(&corpus[j]); } for (unsigned i = 0; i < corpus.size(); ++i) WriteAlignments(corpus[i]); diff --git a/gi/pf/pyp_lm.cc b/gi/pf/pyp_lm.cc index 85635b8f..91029688 100644 --- a/gi/pf/pyp_lm.cc +++ b/gi/pf/pyp_lm.cc @@ -113,7 +113,7 @@ template struct PYPLM { typename unordered_map, CCRP, boost::hash > >::const_iterator it; for (it = p.begin(); it != p.end(); ++it) llh += it->second.log_crp_prob(); - // TODO parametric likelihood from TiedResampler + llh += tr.LogLikelihood(); return llh; } diff --git a/gi/pf/pyp_tm.cc b/gi/pf/pyp_tm.cc index bf5a6497..34ef0ba2 100644 --- a/gi/pf/pyp_tm.cc +++ b/gi/pf/pyp_tm.cc @@ -17,7 +17,7 @@ using namespace std::tr1; template struct ConditionalPYPWordModel { - ConditionalPYPWordModel(Base* b) : base(*b), btr(3) {} + ConditionalPYPWordModel(Base* b) : base(*b), btr(2) {} void Summary() const { cerr << "Number of conditioning contexts: " << r.size() << endl; @@ -29,8 +29,6 @@ struct ConditionalPYPWordModel { } void ResampleHyperparameters(MT19937* rng) { - for (RuleModelHash::iterator it = r.begin(); it != r.end(); ++it) - it->second.resample_hyperparameters(rng); btr.ResampleHyperparameters(rng); } @@ -45,8 +43,11 @@ struct ConditionalPYPWordModel { void Increment(const WordID src, const vector& trglets, MT19937* rng) { RuleModelHash::iterator it = r.find(src); - if (it == r.end()) - it = r.insert(make_pair(src, CCRP >(1,1,1,1,0.5,1.0))).first; + if (it == r.end()) { + it = r.insert(make_pair(src, CCRP >(0.5,1.0))).first; + static const WordID kNULL = TD::Convert("NULL"); + btr.Add(src == kNULL ? 0 : 1, &it->second); + } if (it->second.increment(trglets, base(trglets), rng)) base.Increment(trglets, rng); } diff --git a/gi/pf/quasi_model2.h b/gi/pf/quasi_model2.h index 8ec0a400..588c8f84 100644 --- a/gi/pf/quasi_model2.h +++ b/gi/pf/quasi_model2.h @@ -7,6 +7,8 @@ #include "boost/functional.hpp" #include "prob.h" #include "array2d.h" +#include "slice_sampler.h" +#include "m.h" struct AlignmentObservation { AlignmentObservation() : src_len(), trg_len(), j(), a_j() {} @@ -53,6 +55,37 @@ struct QuasiModel2 { if (!cc) obs_.erase(ao); } + struct PNullResampler { + PNullResampler(const QuasiModel2& m) : m_(m) {} + const QuasiModel2& m_; + double operator()(const double& proposed_pnull) const { + return log(m_.Likelihood(m_.alpha_, proposed_pnull)); + } + }; + + struct AlphaResampler { + AlphaResampler(const QuasiModel2& m) : m_(m) {} + const QuasiModel2& m_; + double operator()(const double& proposed_alpha) const { + return log(m_.Likelihood(proposed_alpha, m_.pnull_.as_float())); + } + }; + + void ResampleHyperparameters(MT19937* rng, const unsigned nloop = 5, const unsigned niterations = 10) { + const PNullResampler dr(*this); + const AlphaResampler ar(*this); + for (unsigned i = 0; i < nloop; ++i) { + double pnull = slice_sampler1d(dr, pnull_.as_float(), *rng, 0.00000001, + 1.0, 0.0, niterations, 100*niterations); + pnull_ = prob_t(pnull); + alpha_ = slice_sampler1d(ar, alpha_, *rng, 0.00000001, + std::numeric_limits::infinity(), 0.0, niterations, 100*niterations); + } + std::cerr << "QuasiModel2(alpha=" << alpha_ << ",p_null=" + << pnull_.as_float() << ") = " << Likelihood() << std::endl; + zcache_.clear(); + } + prob_t Likelihood() const { return Likelihood(alpha_, pnull_.as_float()); } @@ -61,12 +94,17 @@ struct QuasiModel2 { const prob_t pnull(ppnull); const prob_t pnotnull(1 - ppnull); - prob_t p = prob_t::One(); + prob_t p; + p.logeq(Md::log_gamma_density(alpha, 0.1, 25)); // TODO configure + assert(!p.is_0()); + prob_t prob_of_ppnull; prob_of_ppnull.logeq(Md::log_beta_density(ppnull, 2, 10)); + assert(!prob_of_ppnull.is_0()); + p *= prob_of_ppnull; for (ObsCount::const_iterator it = obs_.begin(); it != obs_.end(); ++it) { const AlignmentObservation& ao = it->first; if (ao.a_j) { - double u = UnnormalizedProb(ao.a_j, ao.j, ao.src_len, ao.trg_len, alpha); - double z = ComputeZ(ao.j, ao.src_len, ao.trg_len, alpha); + prob_t u = XUnnormalizedProb(ao.a_j, ao.j, ao.src_len, ao.trg_len, alpha); + prob_t z = XComputeZ(ao.j, ao.src_len, ao.trg_len, alpha); prob_t pa(u / z); pa *= pnotnull; pa.poweq(it->second); @@ -79,6 +117,19 @@ struct QuasiModel2 { } private: + static prob_t XUnnormalizedProb(unsigned a_j, unsigned j, unsigned src_len, unsigned trg_len, double alpha) { + prob_t p; + p.logeq(-fabs(double(a_j - 1) / src_len - double(j) / trg_len) * alpha); + return p; + } + + static prob_t XComputeZ(unsigned j, unsigned src_len, unsigned trg_len, double alpha) { + prob_t z = prob_t::Zero(); + for (int a_j = 1; a_j <= src_len; ++a_j) + z += XUnnormalizedProb(a_j, j, src_len, trg_len, alpha); + return z; + } + static double UnnormalizedProb(unsigned a_j, unsigned j, unsigned src_len, unsigned trg_len, double alpha) { return exp(-fabs(double(a_j - 1) / src_len - double(j) / trg_len) * alpha); } diff --git a/gi/pf/tied_resampler.h b/gi/pf/tied_resampler.h index 5a262f9d..6f45fbce 100644 --- a/gi/pf/tied_resampler.h +++ b/gi/pf/tied_resampler.h @@ -42,6 +42,10 @@ struct TiedResampler { return llh; } + double LogLikelihood() const { + return LogLikelihood(discount, strength); + } + struct DiscountResampler { DiscountResampler(const TiedResampler& m) : m_(m) {} const TiedResampler& m_; @@ -106,6 +110,13 @@ struct BinTiedResampler { } } + double LogLikelihood() const { + double llh = 0; + for (unsigned i = 0; i < resamplers.size(); ++i) + llh += resamplers[i].LogLikelihood(); + return llh; + } + private: std::vector > resamplers; }; -- cgit v1.2.3 From dfbc278c1057555fda9312291c8024049e00b7d8 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Sat, 10 Mar 2012 16:42:12 -0500 Subject: frequency-based binning --- decoder/Makefile.am | 1 - decoder/ff_csplit.cc | 2 +- decoder/freqdict.cc | 29 ----------------------------- decoder/freqdict.h | 37 ++++++++++++++++++++++++++++++++----- gi/pf/align-lexonly-pyp.cc | 24 +++++++++++++++++------- gi/pf/make-freq-bins.pl | 26 ++++++++++++++++++++++++++ gi/pf/pyp_tm.cc | 24 +++++++++++++++++------- gi/pf/pyp_tm.h | 7 ++++--- 8 files changed, 97 insertions(+), 53 deletions(-) delete mode 100644 decoder/freqdict.cc create mode 100755 gi/pf/make-freq-bins.pl (limited to 'gi/pf') diff --git a/decoder/Makefile.am b/decoder/Makefile.am index a00b18af..ec51d643 100644 --- a/decoder/Makefile.am +++ b/decoder/Makefile.am @@ -76,7 +76,6 @@ libcdec_a_SOURCES = \ ff_source_syntax.cc \ ff_bleu.cc \ ff_factory.cc \ - freqdict.cc \ lexalign.cc \ lextrans.cc \ tagger.cc \ diff --git a/decoder/ff_csplit.cc b/decoder/ff_csplit.cc index 3991d38f..c9ed996c 100644 --- a/decoder/ff_csplit.cc +++ b/decoder/ff_csplit.cc @@ -72,7 +72,7 @@ struct BasicCSplitFeaturesImpl { const int fl1_; const int fl2_; const int bad_; - FreqDict freq_dict_; + FreqDict freq_dict_; set bad_words_; }; diff --git a/decoder/freqdict.cc b/decoder/freqdict.cc deleted file mode 100644 index 9e25d346..00000000 --- a/decoder/freqdict.cc +++ /dev/null @@ -1,29 +0,0 @@ -#include -#include -#include -#include "freqdict.h" -#include "tdict.h" -#include "filelib.h" - -using namespace std; - -void FreqDict::Load(const std::string& fname) { - cerr << "Reading word frequencies: " << fname << endl; - ReadFile rf(fname); - istream& ifs = *rf.stream(); - int cc=0; - while (ifs) { - std::string word; - ifs >> word; - if (word.size() == 0) continue; - if (word[0] == '#') continue; - double count = 0; - ifs >> count; - assert(count > 0.0); // use -log(f) - counts_[TD::Convert(word)]=count; - ++cc; - if (cc % 10000 == 0) { std::cerr << "."; } - } - std::cerr << "\n"; - std::cerr << "Loaded " << cc << " words\n"; -} diff --git a/decoder/freqdict.h b/decoder/freqdict.h index 9acf0c33..4e03fadd 100644 --- a/decoder/freqdict.h +++ b/decoder/freqdict.h @@ -1,20 +1,47 @@ #ifndef _FREQDICT_H_ #define _FREQDICT_H_ +#include #include #include #include "wordid.h" +#include "filelib.h" +#include "tdict.h" +template class FreqDict { public: - void Load(const std::string& fname); - float LookUp(const WordID& word) const { - std::map::const_iterator i = counts_.find(word); - if (i == counts_.end()) return 0; + FreqDict() : max_() {} + T Max() const { return max_; } + void Load(const std::string& fname) { + std::cerr << "Reading word statistics from: " << fname << std::endl; + ReadFile rf(fname); + std::istream& ifs = *rf.stream(); + int cc=0; + std::string word; + while (ifs) { + ifs >> word; + if (word.size() == 0) continue; + if (word[0] == '#') continue; + T count = 0; + ifs >> count; + if (count > max_) max_ = count; + counts_[TD::Convert(word)]=count; + ++cc; + if (cc % 10000 == 0) { std::cerr << "."; } + } + std::cerr << "\n"; + std::cerr << "Loaded " << cc << " words\n"; + } + + T LookUp(const WordID& word) const { + typename std::map::const_iterator i = counts_.find(word); + if (i == counts_.end()) return T(); return i->second; } private: - std::map counts_; + T max_; + std::map counts_; }; #endif diff --git a/gi/pf/align-lexonly-pyp.cc b/gi/pf/align-lexonly-pyp.cc index 6c054753..942dcf51 100644 --- a/gi/pf/align-lexonly-pyp.cc +++ b/gi/pf/align-lexonly-pyp.cc @@ -20,6 +20,9 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) { po::options_description opts("Configuration options"); opts.add_options() ("samples,s",po::value()->default_value(1000),"Number of samples") + ("infer_alignment_hyperparameters,I", "Infer alpha and p_null, otherwise fixed values will be assumed") + ("p_null,0", po::value()->default_value(0.08), "probability of aligning to null") + ("align_alpha,a", po::value()->default_value(4.0), "how 'tight' is the bias toward be along the diagonal?") ("input,i",po::value(),"Read parallel data from") ("random_seed,S",po::value(), "Random seed"); po::options_description clo("Command line options"); @@ -59,9 +62,13 @@ struct AlignedSentencePair { }; struct Aligner { - Aligner(const vector >& lets, int num_letters, vector* c) : + Aligner(const vector >& lets, + int num_letters, + const po::variables_map& conf, + vector* c) : corpus(*c), - paj_model(4, 0.08), + paj_model(conf["align_alpha"].as(), conf["p_null"].as()), + infer_paj(conf.count("infer_alignment_hyperparameters") > 0), model(lets, num_letters), kNULL(TD::Convert("NULL")) { assert(lets[kNULL].size() == 0); @@ -69,12 +76,13 @@ struct Aligner { vector& corpus; QuasiModel2 paj_model; + const bool infer_paj; PYPLexicalTranslation model; const WordID kNULL; void ResampleHyperparameters() { model.ResampleHyperparameters(prng); - paj_model.ResampleHyperparameters(prng); + if (infer_paj) paj_model.ResampleHyperparameters(prng); } void InitializeRandom() { @@ -117,8 +125,6 @@ struct Aligner { paj_model.Increment(a_j, j, asp.src.size(), asp.trg.size()); } } - cerr << "LLH = " << Likelihood() << " \t(Amodel=" << paj_model.Likelihood() - << " TModel=" << model.Likelihood() << ") contexts=" << model.UniqueConditioningContexts() << endl; } prob_t Likelihood() const { @@ -211,13 +217,17 @@ int main(int argc, char** argv) { ExtractLetters(vocabf, &letters, NULL); letters[TD::Convert("NULL")].clear(); - Aligner aligner(letters, letset.size(), &corpus); + Aligner aligner(letters, letset.size(), conf, &corpus); aligner.InitializeRandom(); const unsigned samples = conf["samples"].as(); for (int i = 0; i < samples; ++i) { for (int j = 65; j < 67; ++j) Debug(corpus[j]); - if (i % 10 == 9) aligner.ResampleHyperparameters(); + if (i % 10 == 9) { + aligner.ResampleHyperparameters(); + cerr << "LLH = " << aligner.Likelihood() << " \t(Amodel=" << aligner.paj_model.Likelihood() + << " TModel=" << aligner.model.Likelihood() << ") contexts=" << aligner.model.UniqueConditioningContexts() << endl; + } aligner.ResampleCorpus(); if (i > (samples / 5) && (i % 6 == 5)) for (int j = 0; j < corpus.size(); ++j) AddSample(&corpus[j]); } diff --git a/gi/pf/make-freq-bins.pl b/gi/pf/make-freq-bins.pl new file mode 100755 index 00000000..fdcd3555 --- /dev/null +++ b/gi/pf/make-freq-bins.pl @@ -0,0 +1,26 @@ +#!/usr/bin/perl -w +use strict; + +my $BASE = 6; +my $CUTOFF = 3; + +my %d; +my $num = 0; +while(<>){ + chomp; + my @words = split /\s+/; + for my $w (@words) {$d{$w}++; $num++;} +} + +my @vocab = sort {$d{$b} <=> $d{$a}} keys %d; + +for (my $i=0; $i #include -#include "base_distributions.h" -#include "monotonic_pseg.h" -#include "conditional_pseg.h" #include "tdict.h" #include "ccrp.h" #include "pyp_word_model.h" @@ -15,9 +12,19 @@ using namespace std; using namespace std::tr1; -template +struct FreqBinner { + FreqBinner(const std::string& fname) { fd_.Load(fname); } + unsigned NumberOfBins() const { return fd_.Max() + 1; } + unsigned Bin(const WordID& w) const { return fd_.LookUp(w); } + FreqDict fd_; +}; + +template struct ConditionalPYPWordModel { - ConditionalPYPWordModel(Base* b) : base(*b), btr(2) {} + ConditionalPYPWordModel(Base* b, const Binner* bnr = NULL) : + base(*b), + binner(bnr), + btr(binner ? binner->NumberOfBins() + 1u : 2u) {} void Summary() const { cerr << "Number of conditioning contexts: " << r.size() << endl; @@ -46,7 +53,9 @@ struct ConditionalPYPWordModel { if (it == r.end()) { it = r.insert(make_pair(src, CCRP >(0.5,1.0))).first; static const WordID kNULL = TD::Convert("NULL"); - btr.Add(src == kNULL ? 0 : 1, &it->second); + unsigned bin = (src == kNULL ? 0 : 1); + if (binner && bin) { bin = binner->Bin(src) + 1; } + btr.Add(bin, &it->second); } if (it->second.increment(trglets, base(trglets), rng)) base.Increment(trglets, rng); @@ -75,6 +84,7 @@ struct ConditionalPYPWordModel { // TODO tie PYP hyperparameters based on source word frequency bins Base& base; + const Binner* binner; BinTiedResampler > > btr; typedef unordered_map > > RuleModelHash; RuleModelHash r; @@ -84,7 +94,7 @@ PYPLexicalTranslation::PYPLexicalTranslation(const vector >& lets const unsigned num_letters) : letters(lets), up0(new PYPWordModel(num_letters)), - tmodel(new ConditionalPYPWordModel(up0)), + tmodel(new ConditionalPYPWordModel(up0, new FreqBinner("10k.freq"))), kX(-TD::Convert("X")) {} void PYPLexicalTranslation::Summary() const { diff --git a/gi/pf/pyp_tm.h b/gi/pf/pyp_tm.h index fa0fb28f..63e7c96d 100644 --- a/gi/pf/pyp_tm.h +++ b/gi/pf/pyp_tm.h @@ -5,10 +5,11 @@ #include "wordid.h" #include "prob.h" #include "sampler.h" +#include "freqdict.h" -struct TRule; +struct FreqBinner; struct PYPWordModel; -template struct ConditionalPYPWordModel; +template struct ConditionalPYPWordModel; struct PYPLexicalTranslation { explicit PYPLexicalTranslation(const std::vector >& lets, @@ -26,7 +27,7 @@ struct PYPLexicalTranslation { private: const std::vector >& letters; // spelling dictionary PYPWordModel* up0; // base distribuction (model English word) - ConditionalPYPWordModel* tmodel; // translation distributions + ConditionalPYPWordModel* tmodel; // translation distributions // (model English word | French word) const WordID kX; }; -- cgit v1.2.3