From aac3ef3e3fdf636406fc61a40096cee6381e5461 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Thu, 29 Dec 2011 21:08:30 -0500 Subject: lexical alignment samplers --- gi/pf/Makefile.am | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'gi/pf/Makefile.am') diff --git a/gi/pf/Makefile.am b/gi/pf/Makefile.am index 42758939..7c8e89d0 100644 --- a/gi/pf/Makefile.am +++ b/gi/pf/Makefile.am @@ -1,10 +1,14 @@ -bin_PROGRAMS = cbgi brat dpnaive pfbrat pfdist itg pfnaive +bin_PROGRAMS = cbgi brat dpnaive pfbrat pfdist itg pfnaive condnaive align-lexonly noinst_LIBRARIES = libpf.a -libpf_a_SOURCES = base_measures.cc reachability.cc cfg_wfst_composer.cc corpus.cc +libpf_a_SOURCES = base_measures.cc reachability.cc cfg_wfst_composer.cc corpus.cc unigrams.cc ngram_base.cc + +align_lexonly_SOURCES = align-lexonly.cc itg_SOURCES = itg.cc +condnaive_SOURCES = condnaive.cc + dpnaive_SOURCES = dpnaive.cc pfdist_SOURCES = pfdist.cc @@ -17,5 +21,6 @@ brat_SOURCES = brat.cc pfbrat_SOURCES = pfbrat.cc -AM_CPPFLAGS = -W -Wall -Wno-sign-compare -funroll-loops -I$(top_srcdir)/utils $(GTEST_CPPFLAGS) -I$(top_srcdir)/decoder -AM_LDFLAGS = libpf.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/utils/libutils.a -lz +AM_CPPFLAGS = -W -Wall -Wno-sign-compare -funroll-loops -I$(top_srcdir)/utils $(GTEST_CPPFLAGS) -I$(top_srcdir)/decoder -I$(top_srcdir)/klm + +AM_LDFLAGS = libpf.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a $(top_srcdir)/utils/libutils.a -lz -- cgit v1.2.3 From 26d9ad04bd81508163d75c99726f970dd75f5127 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Mon, 23 Jan 2012 15:47:29 -0500 Subject: more alignment stuff --- gi/pf/Makefile.am | 4 +- gi/pf/align-lexonly-pyp.cc | 327 ++++++++++++++++++++++++++++++++++++++++++++ gi/pf/base_measures.cc | 47 +++++++ gi/pf/base_measures.h | 18 +++ gi/pf/conditional_pseg.h | 74 ++++++++++ word-aligner/stemmers/ur.pl | 38 +++++ 6 files changed, 507 insertions(+), 1 deletion(-) create mode 100644 gi/pf/align-lexonly-pyp.cc create mode 100755 word-aligner/stemmers/ur.pl (limited to 'gi/pf/Makefile.am') diff --git a/gi/pf/Makefile.am b/gi/pf/Makefile.am index 7c8e89d0..28367e67 100644 --- a/gi/pf/Makefile.am +++ b/gi/pf/Makefile.am @@ -1,10 +1,12 @@ -bin_PROGRAMS = cbgi brat dpnaive pfbrat pfdist itg pfnaive condnaive align-lexonly +bin_PROGRAMS = cbgi brat dpnaive pfbrat pfdist itg pfnaive condnaive align-lexonly align-lexonly-pyp noinst_LIBRARIES = libpf.a libpf_a_SOURCES = base_measures.cc reachability.cc cfg_wfst_composer.cc corpus.cc unigrams.cc ngram_base.cc align_lexonly_SOURCES = align-lexonly.cc +align_lexonly_pyp_SOURCES = align-lexonly-pyp.cc + itg_SOURCES = itg.cc condnaive_SOURCES = condnaive.cc diff --git a/gi/pf/align-lexonly-pyp.cc b/gi/pf/align-lexonly-pyp.cc new file mode 100644 index 00000000..d2630a2b --- /dev/null +++ b/gi/pf/align-lexonly-pyp.cc @@ -0,0 +1,327 @@ +#include +#include +#include + +#include +#include +#include + +#include "array2d.h" +#include "base_measures.h" +#include "monotonic_pseg.h" +#include "conditional_pseg.h" +#include "trule.h" +#include "tdict.h" +#include "stringlib.h" +#include "filelib.h" +#include "dict.h" +#include "sampler.h" +#include "mfcr.h" +#include "corpus.h" +#include "ngram_base.h" + +using namespace std; +using namespace tr1; +namespace po = boost::program_options; + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { + po::options_description opts("Configuration options"); + opts.add_options() + ("samples,s",po::value()->default_value(1000),"Number of samples") + ("input,i",po::value(),"Read parallel data from") + ("random_seed,S",po::value(), "Random seed"); + po::options_description clo("Command line options"); + clo.add_options() + ("config", po::value(), "Configuration file") + ("help,h", "Print this help message and exit"); + po::options_description dconfig_options, dcmdline_options; + dconfig_options.add(opts); + dcmdline_options.add(opts).add(clo); + + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + if (conf->count("config")) { + ifstream config((*conf)["config"].as().c_str()); + po::store(po::parse_config_file(config, dconfig_options), *conf); + } + po::notify(*conf); + + if (conf->count("help") || (conf->count("input") == 0)) { + cerr << dcmdline_options << endl; + exit(1); + } +} + +shared_ptr prng; + +struct LexicalAlignment { + unsigned char src_index; + bool is_transliteration; + vector > derivation; +}; + +struct AlignedSentencePair { + vector src; + vector trg; + vector a; + Array2D posterior; +}; + +struct HierarchicalWordBase { + explicit HierarchicalWordBase(const unsigned vocab_e_size) : + base(prob_t::One()), r(1,1,1,25,25), u0(-log(vocab_e_size)), l(1,1.0), v(1, 0.0) {} + + void ResampleHyperparameters(MT19937* rng) { + r.resample_hyperparameters(rng); + } + + inline double logp0(const vector& s) const { + return s.size() * u0; + } + + // return p0 of rule.e_ + prob_t operator()(const TRule& rule) const { + v[0] = exp(logp0(rule.e_)); + return prob_t(r.prob(rule.e_, v, l)); + } + + void Increment(const TRule& rule) { + v[0] = exp(logp0(rule.e_)); + if (r.increment(rule.e_, v, l, &*prng).count) { + base *= prob_t(v[0] * l[0]); + } + } + + void Decrement(const TRule& rule) { + if (r.decrement(rule.e_, &*prng).count) { + base /= prob_t(exp(logp0(rule.e_))); + } + } + + prob_t Likelihood() const { + prob_t p; p.logeq(r.log_crp_prob()); + p *= base; + return p; + } + + void Summary() const { + cerr << "NUMBER OF CUSTOMERS: " << r.num_customers() << " (d=" << r.d() << ",\\alpha=" << r.alpha() << ')' << endl; + for (MFCR >::const_iterator it = r.begin(); it != r.end(); ++it) + cerr << " " << it->second.total_dish_count_ << " (on " << it->second.table_counts_.size() << " tables)" << TD::GetString(it->first) << endl; + } + + prob_t base; + MFCR > r; + const double u0; + const vector l; + mutable vector v; +}; + +struct BasicLexicalAlignment { + explicit BasicLexicalAlignment(const vector >& lets, + const unsigned words_e, + const unsigned letters_e, + vector* corp) : + letters(lets), + corpus(*corp), + //up0(words_e), + //up0("en.chars.1gram", letters_e), + //up0("en.words.1gram"), + up0(letters_e), + //up0("en.chars.2gram"), + tmodel(up0) { + } + + void InstantiateRule(const WordID src, + const WordID trg, + TRule* rule) const { + static const WordID kX = TD::Convert("X") * -1; + rule->lhs_ = kX; + rule->e_ = letters[trg]; + rule->f_ = letters[src]; + } + + void InitializeRandom() { + const WordID kNULL = TD::Convert("NULL"); + cerr << "Initializing with random alignments ...\n"; + for (unsigned i = 0; i < corpus.size(); ++i) { + AlignedSentencePair& asp = corpus[i]; + asp.a.resize(asp.trg.size()); + for (unsigned j = 0; j < asp.trg.size(); ++j) { + const unsigned char a_j = prng->next() * (1 + asp.src.size()); + const WordID f_a_j = (a_j ? asp.src[a_j - 1] : kNULL); + TRule r; + InstantiateRule(f_a_j, asp.trg[j], &r); + asp.a[j].is_transliteration = false; + asp.a[j].src_index = a_j; + if (tmodel.IncrementRule(r, &*prng)) + up0.Increment(r); + } + } + cerr << " LLH = " << Likelihood() << endl; + } + + prob_t Likelihood() const { + prob_t p = tmodel.Likelihood(); + p *= up0.Likelihood(); + return p; + } + + void ResampleHyperparemeters() { + cerr << " LLH_prev = " << Likelihood() << flush; + tmodel.ResampleHyperparameters(&*prng); + up0.ResampleHyperparameters(&*prng); + cerr << "\tLLH_post = " << Likelihood() << endl; + } + + void ResampleCorpus(); + + const vector >& letters; // spelling dictionary + vector& corpus; + //PhraseConditionalUninformativeBase up0; + //PhraseConditionalUninformativeUnigramBase up0; + //UnigramWordBase up0; + //HierarchicalUnigramBase up0; + HierarchicalWordBase up0; + //CompletelyUniformBase up0; + //FixedNgramBase up0; + //ConditionalTranslationModel tmodel; + //ConditionalTranslationModel tmodel; + //ConditionalTranslationModel tmodel; + //ConditionalTranslationModel tmodel; + MConditionalTranslationModel tmodel; + //ConditionalTranslationModel tmodel; + //ConditionalTranslationModel tmodel; +}; + +void BasicLexicalAlignment::ResampleCorpus() { + static const WordID kNULL = TD::Convert("NULL"); + for (unsigned i = 0; i < corpus.size(); ++i) { + AlignedSentencePair& asp = corpus[i]; + SampleSet ss; ss.resize(asp.src.size() + 1); + for (unsigned j = 0; j < asp.trg.size(); ++j) { + TRule r; + unsigned char& a_j = asp.a[j].src_index; + WordID f_a_j = (a_j ? asp.src[a_j - 1] : kNULL); + InstantiateRule(f_a_j, asp.trg[j], &r); + if (tmodel.DecrementRule(r, &*prng)) + up0.Decrement(r); + + for (unsigned prop_a_j = 0; prop_a_j <= asp.src.size(); ++prop_a_j) { + const WordID prop_f = (prop_a_j ? asp.src[prop_a_j - 1] : kNULL); + InstantiateRule(prop_f, asp.trg[j], &r); + ss[prop_a_j] = tmodel.RuleProbability(r); + } + a_j = prng->SelectSample(ss); + f_a_j = (a_j ? asp.src[a_j - 1] : kNULL); + InstantiateRule(f_a_j, asp.trg[j], &r); + if (tmodel.IncrementRule(r, &*prng)) + up0.Increment(r); + } + } + cerr << " LLH = " << tmodel.Likelihood() << endl; +} + +void ExtractLetters(const set& v, vector >* l, set* letset = NULL) { + for (set::const_iterator it = v.begin(); it != v.end(); ++it) { + vector& letters = (*l)[*it]; + if (letters.size()) continue; // if e and f have the same word + + const string& w = TD::Convert(*it); + + size_t cur = 0; + while (cur < w.size()) { + const size_t len = UTF8Len(w[cur]); + letters.push_back(TD::Convert(w.substr(cur, len))); + if (letset) letset->insert(letters.back()); + cur += len; + } + } +} + +void Debug(const AlignedSentencePair& asp) { + cerr << TD::GetString(asp.src) << endl << TD::GetString(asp.trg) << endl; + Array2D a(asp.src.size(), asp.trg.size()); + for (unsigned j = 0; j < asp.trg.size(); ++j) + if (asp.a[j].src_index) a(asp.a[j].src_index - 1, j) = true; + cerr << a << endl; +} + +void AddSample(AlignedSentencePair* asp) { + for (unsigned j = 0; j < asp->trg.size(); ++j) + asp->posterior(asp->a[j].src_index, j)++; +} + +void WriteAlignments(const AlignedSentencePair& asp) { + bool first = true; + for (unsigned j = 0; j < asp.trg.size(); ++j) { + int src_index = -1; + int mc = -1; + for (unsigned i = 0; i <= asp.src.size(); ++i) { + if (asp.posterior(i, j) > mc) { + mc = asp.posterior(i, j); + src_index = i; + } + } + + if (src_index) { + if (first) first = false; else cout << ' '; + cout << (src_index - 1) << '-' << j; + } + } + cout << endl; +} + +int main(int argc, char** argv) { + po::variables_map conf; + InitCommandLine(argc, argv, &conf); + + if (conf.count("random_seed")) + prng.reset(new MT19937(conf["random_seed"].as())); + else + prng.reset(new MT19937); +// MT19937& rng = *prng; + + vector > corpuse, corpusf; + set vocabe, vocabf; + corpus::ReadParallelCorpus(conf["input"].as(), &corpusf, &corpuse, &vocabf, &vocabe); + cerr << "f-Corpus size: " << corpusf.size() << " sentences\n"; + cerr << "f-Vocabulary size: " << vocabf.size() << " types\n"; + cerr << "f-Corpus size: " << corpuse.size() << " sentences\n"; + cerr << "f-Vocabulary size: " << vocabe.size() << " types\n"; + assert(corpusf.size() == corpuse.size()); + + vector corpus(corpuse.size()); + for (unsigned i = 0; i < corpuse.size(); ++i) { + corpus[i].src.swap(corpusf[i]); + corpus[i].trg.swap(corpuse[i]); + corpus[i].posterior.resize(corpus[i].src.size() + 1, corpus[i].trg.size()); + } + corpusf.clear(); corpuse.clear(); + + vocabf.insert(TD::Convert("NULL")); + vector > letters(TD::NumWords()); + set letset; + ExtractLetters(vocabe, &letters, &letset); + ExtractLetters(vocabf, &letters, NULL); + letters[TD::Convert("NULL")].clear(); + + BasicLexicalAlignment x(letters, vocabe.size(), letset.size(), &corpus); + x.InitializeRandom(); + const unsigned samples = conf["samples"].as(); + for (int i = 0; i < samples; ++i) { + for (int j = 65; j < 67; ++j) Debug(corpus[j]); + cerr << i << "\t" << x.tmodel.r.size() << "\t"; + if (i % 10 == 0) x.ResampleHyperparemeters(); + x.ResampleCorpus(); + if (i > (samples / 5) && (i % 10 == 9)) for (int j = 0; j < corpus.size(); ++j) AddSample(&corpus[j]); + } + for (unsigned i = 0; i < corpus.size(); ++i) + WriteAlignments(corpus[i]); + //ModelAndData posterior(x, &corpus, vocabe, vocabf); + x.tmodel.Summary(); + x.up0.Summary(); + + //posterior.Sample(); + + return 0; +} diff --git a/gi/pf/base_measures.cc b/gi/pf/base_measures.cc index 97b4e698..7894d3e7 100644 --- a/gi/pf/base_measures.cc +++ b/gi/pf/base_measures.cc @@ -6,6 +6,53 @@ using namespace std; +TableLookupBase::TableLookupBase(const string& fname) { + cerr << "TableLookupBase reading from " << fname << " ..." << endl; + ReadFile rf(fname); + istream& in = *rf.stream(); + string line; + unsigned lc = 0; + const WordID kDIV = TD::Convert("|||"); + vector tmp; + vector le, lf; + TRule x; + x.lhs_ = -TD::Convert("X"); + bool flag = false; + while(getline(in, line)) { + ++lc; + if (lc % 1000000 == 0) { cerr << " [" << lc << ']' << endl; flag = false; } + else if (lc % 25000 == 0) { cerr << '.' << flush; flag = true; } + tmp.clear(); + TD::ConvertSentence(line, &tmp); + x.f_.clear(); + x.e_.clear(); + size_t pos = 0; + int cc = 0; + while(pos < tmp.size()) { + const WordID cur = tmp[pos++]; + if (cur == kDIV) { + ++cc; + } else if (cc == 0) { + x.f_.push_back(cur); + } else if (cc == 1) { + x.e_.push_back(cur); + } else if (cc == 2) { + table[x] = atof(TD::Convert(cur)); + ++cc; + } else { + if (flag) cerr << endl; + cerr << "Bad format in " << lc << ": " << line << endl; abort(); + } + } + if (cc != 3) { + if (flag) cerr << endl; + cerr << "Bad format in " << lc << ": " << line << endl; abort(); + } + } + if (flag) cerr << endl; + cerr << " read " << lc << " entries\n"; +} + prob_t PhraseConditionalUninformativeUnigramBase::p0(const vector& vsrc, const vector& vtrg, int start_src, int start_trg) const { diff --git a/gi/pf/base_measures.h b/gi/pf/base_measures.h index a4e9ac28..7214aa22 100644 --- a/gi/pf/base_measures.h +++ b/gi/pf/base_measures.h @@ -72,6 +72,24 @@ struct UnigramWordBase { const UnigramWordModel un; }; +struct RuleHasher { + size_t operator()(const TRule& r) const { + return hash_value(r); + } +}; + +struct TableLookupBase { + TableLookupBase(const std::string& fname); + + prob_t operator()(const TRule& rule) const { + const std::tr1::unordered_map::const_iterator it = table.find(rule); + assert(it != table.end()); + return it->second; + } + + std::tr1::unordered_map table; +}; + struct PhraseConditionalUninformativeBase { explicit PhraseConditionalUninformativeBase(const unsigned vocab_e_size) : kUNIFORM_TARGET(1.0 / vocab_e_size) { diff --git a/gi/pf/conditional_pseg.h b/gi/pf/conditional_pseg.h index edcdc813..db951d15 100644 --- a/gi/pf/conditional_pseg.h +++ b/gi/pf/conditional_pseg.h @@ -8,10 +8,84 @@ #include "prob.h" #include "ccrp_nt.h" +#include "mfcr.h" #include "trule.h" #include "base_measures.h" #include "tdict.h" +template +struct MConditionalTranslationModel { + explicit MConditionalTranslationModel(ConditionalBaseMeasure& rcp0) : + rp0(rcp0), lambdas(1, 1.0), p0s(1) {} + + void Summary() const { + std::cerr << "Number of conditioning contexts: " << r.size() << std::endl; + for (RuleModelHash::const_iterator it = r.begin(); it != r.end(); ++it) { + std::cerr << TD::GetString(it->first) << " \t(d=" << it->second.d() << ",\\alpha = " << it->second.alpha() << ") --------------------------" << std::endl; + for (MFCR::const_iterator i2 = it->second.begin(); i2 != it->second.end(); ++i2) + std::cerr << " " << -1 << '\t' << i2->first << std::endl; + } + } + + void ResampleHyperparameters(MT19937* rng) { + for (RuleModelHash::iterator it = r.begin(); it != r.end(); ++it) + it->second.resample_hyperparameters(rng); + } + + int DecrementRule(const TRule& rule, MT19937* rng) { + RuleModelHash::iterator it = r.find(rule.f_); + assert(it != r.end()); + const TableCount delta = it->second.decrement(rule, rng); + if (delta.count) { + if (it->second.num_customers() == 0) r.erase(it); + } + return delta.count; + } + + int IncrementRule(const TRule& rule, MT19937* rng) { + RuleModelHash::iterator it = r.find(rule.f_); + if (it == r.end()) { + it = r.insert(make_pair(rule.f_, MFCR(1, 1.0, 1.0, 1.0, 1.0, 1e-9, 4.0))).first; + } + p0s[0] = rp0(rule).as_float(); + TableCount delta = it->second.increment(rule, p0s, lambdas, rng); + return delta.count; + } + + prob_t RuleProbability(const TRule& rule) const { + prob_t p; + RuleModelHash::const_iterator it = r.find(rule.f_); + if (it == r.end()) { + p.logeq(log(rp0(rule))); + } else { + p0s[0] = rp0(rule).as_float(); + p = prob_t(it->second.prob(rule, p0s, lambdas)); + } + return p; + } + + prob_t Likelihood() const { + prob_t p = prob_t::One(); +#if 0 + for (RuleModelHash::const_iterator it = r.begin(); it != r.end(); ++it) { + prob_t q; q.logeq(it->second.log_crp_prob()); + p *= q; + for (CCRP_NoTable::const_iterator i2 = it->second.begin(); i2 != it->second.end(); ++i2) + p *= rp0(i2->first); + } +#endif + return p; + } + + const ConditionalBaseMeasure& rp0; + typedef std::tr1::unordered_map, + MFCR, + boost::hash > > RuleModelHash; + RuleModelHash r; + std::vector lambdas; + mutable std::vector p0s; +}; + template struct ConditionalTranslationModel { explicit ConditionalTranslationModel(ConditionalBaseMeasure& rcp0) : diff --git a/word-aligner/stemmers/ur.pl b/word-aligner/stemmers/ur.pl new file mode 100755 index 00000000..3a4f5a45 --- /dev/null +++ b/word-aligner/stemmers/ur.pl @@ -0,0 +1,38 @@ +#!/usr/bin/perl -w + +use strict; +use utf8; + +binmode(STDIN, ":utf8"); +binmode(STDOUT,":utf8"); + +my $vocab = undef; +if (scalar @ARGV > 0) { + die "Only allow --vocab" unless ($ARGV[0] eq '--vocab' && scalar @ARGV == 1); + $vocab = 1; +} + +my %dict; +while() { + chomp; + my @words = split /\s+/; + my @out = (); + for my $w (@words) { + my $tw = $dict{$w}; + if (!defined $tw) { + my $el = 4; + if ($w =~ /^(al|Al)/) { $el++; } + if ($el > length($w)) { $el = length($w); } + $tw = substr $w, 0, $el; + $dict{$w} = $tw; + } + push @out, $tw; + } + if ($vocab) { + die "Expected exactly one word per line with --vocab: $_" unless scalar @out == 1; + print "$_ @out\n"; + } else { + print "@out\n"; + } +} + -- cgit v1.2.3 From e5831933364a4cab5084db49281a855baea8e09c Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Thu, 2 Feb 2012 12:33:13 -0500 Subject: fix broken build --- gi/pf/Makefile.am | 2 +- gi/pf/base_distributions.cc | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'gi/pf/Makefile.am') diff --git a/gi/pf/Makefile.am b/gi/pf/Makefile.am index 28367e67..8d43f36d 100644 --- a/gi/pf/Makefile.am +++ b/gi/pf/Makefile.am @@ -1,7 +1,7 @@ bin_PROGRAMS = cbgi brat dpnaive pfbrat pfdist itg pfnaive condnaive align-lexonly align-lexonly-pyp noinst_LIBRARIES = libpf.a -libpf_a_SOURCES = base_measures.cc reachability.cc cfg_wfst_composer.cc corpus.cc unigrams.cc ngram_base.cc +libpf_a_SOURCES = base_distributions.cc reachability.cc cfg_wfst_composer.cc corpus.cc unigrams.cc ngram_base.cc align_lexonly_SOURCES = align-lexonly.cc diff --git a/gi/pf/base_distributions.cc b/gi/pf/base_distributions.cc index 4b1863fa..d362fd76 100644 --- a/gi/pf/base_distributions.cc +++ b/gi/pf/base_distributions.cc @@ -1,4 +1,4 @@ -#include "base_measures.h" +#include "base_distributions.h" #include -- cgit v1.2.3 From 9007216a43c5572c2c343a1700ac79fb35b7d82f Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Sat, 25 Feb 2012 21:22:27 -0500 Subject: really slow hiero lm --- gi/pf/Makefile.am | 4 +- gi/pf/hierolm.cc | 309 +++++++++++++++++++++++++++++++++++++++++++++ phrasinator/ccrp.h | 294 ------------------------------------------- utils/ccrp.h | 340 ++++++++++++++++++++++++++++++++++++++++++++++++++ utils/ccrp_onetable.h | 12 ++ utils/sampler.h | 2 +- 6 files changed, 665 insertions(+), 296 deletions(-) create mode 100644 gi/pf/hierolm.cc delete mode 100644 phrasinator/ccrp.h create mode 100644 utils/ccrp.h (limited to 'gi/pf/Makefile.am') diff --git a/gi/pf/Makefile.am b/gi/pf/Makefile.am index 8d43f36d..ed5b6fd3 100644 --- a/gi/pf/Makefile.am +++ b/gi/pf/Makefile.am @@ -1,4 +1,4 @@ -bin_PROGRAMS = cbgi brat dpnaive pfbrat pfdist itg pfnaive condnaive align-lexonly align-lexonly-pyp +bin_PROGRAMS = cbgi brat dpnaive pfbrat pfdist itg pfnaive condnaive align-lexonly align-lexonly-pyp hierolm noinst_LIBRARIES = libpf.a libpf_a_SOURCES = base_distributions.cc reachability.cc cfg_wfst_composer.cc corpus.cc unigrams.cc ngram_base.cc @@ -9,6 +9,8 @@ align_lexonly_pyp_SOURCES = align-lexonly-pyp.cc itg_SOURCES = itg.cc +hierolm_SOURCES = hierolm.cc + condnaive_SOURCES = condnaive.cc dpnaive_SOURCES = dpnaive.cc diff --git a/gi/pf/hierolm.cc b/gi/pf/hierolm.cc new file mode 100644 index 00000000..afb12fef --- /dev/null +++ b/gi/pf/hierolm.cc @@ -0,0 +1,309 @@ +#include +#include +#include + +#include +#include +#include + +#include "inside_outside.h" +#include "hg.h" +#include "bottom_up_parser.h" +#include "fdict.h" +#include "grammar.h" +#include "m.h" +#include "trule.h" +#include "tdict.h" +#include "filelib.h" +#include "dict.h" +#include "sampler.h" +#include "ccrp.h" +#include "ccrp_onetable.h" + +using namespace std; +using namespace tr1; +namespace po = boost::program_options; + +shared_ptr prng; + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { + po::options_description opts("Configuration options"); + opts.add_options() + ("samples,s",po::value()->default_value(1000),"Number of samples") + ("input,i",po::value(),"Read parallel data from") + ("random_seed,S",po::value(), "Random seed"); + po::options_description clo("Command line options"); + clo.add_options() + ("config", po::value(), "Configuration file") + ("help,h", "Print this help message and exit"); + po::options_description dconfig_options, dcmdline_options; + dconfig_options.add(opts); + dcmdline_options.add(opts).add(clo); + + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + if (conf->count("config")) { + ifstream config((*conf)["config"].as().c_str()); + po::store(po::parse_config_file(config, dconfig_options), *conf); + } + po::notify(*conf); + + if (conf->count("help") || (conf->count("input") == 0)) { + cerr << dcmdline_options << endl; + exit(1); + } +} + +void ReadCorpus(const string& filename, + vector >* e, + set* vocab_e) { + e->clear(); + vocab_e->clear(); + istream* in; + if (filename == "-") + in = &cin; + else + in = new ifstream(filename.c_str()); + assert(*in); + string line; + while(*in) { + getline(*in, line); + if (line.empty() && !*in) break; + e->push_back(vector()); + vector& le = e->back(); + TD::ConvertSentence(line, &le); + for (unsigned i = 0; i < le.size(); ++i) + vocab_e->insert(le[i]); + } + if (in != &cin) delete in; +} + +struct Grid { + // a b c d e + // 0 - 0 - - + vector grid; +}; + +struct BaseRuleModel { + explicit BaseRuleModel(unsigned term_size, + unsigned nonterm_size = 1) : + unif_term(1.0 / term_size), + unif_nonterm(1.0 / nonterm_size) {} + prob_t operator()(const TRule& r) const { + prob_t p; p.logeq(Md::log_poisson(1.0, r.f_.size())); + const prob_t term_prob((2.0 + 0.01*r.f_.size()) / (r.f_.size() + 2)); + const prob_t nonterm_prob(1.0 - term_prob.as_float()); + for (unsigned i = 0; i < r.f_.size(); ++i) { + if (r.f_[i] <= 0) { // nonterminal + p *= nonterm_prob; + p *= unif_nonterm; + } else { // terminal + p *= term_prob; + p *= unif_term; + } + } + return p; + } + const prob_t unif_term, unif_nonterm; +}; + +struct HieroLMModel { + explicit HieroLMModel(unsigned vocab_size) : p0(vocab_size), x(1,1,1,1) {} + + prob_t Prob(const TRule& r) const { + return x.probT(r, p0(r)); + } + + int Increment(const TRule& r, MT19937* rng) { + return x.incrementT(r, p0(r), rng); + // return x.increment(r); + } + + int Decrement(const TRule& r, MT19937* rng) { + return x.decrement(r, rng); + //return x.decrement(r); + } + + prob_t Likelihood() const { + prob_t p; + p.logeq(x.log_crp_prob()); + for (CCRP::const_iterator it = x.begin(); it != x.end(); ++it) { + prob_t tp = p0(it->first); + tp.poweq(it->second.table_counts_.size()); + p *= tp; + } + //for (CCRP_OneTable::const_iterator it = x.begin(); it != x.end(); ++it) + // p *= p0(it->first); + return p; + } + + void ResampleHyperparameters(MT19937* rng) { + x.resample_hyperparameters(rng); + cerr << " d=" << x.discount() << ", alpha=" << x.concentration() << endl; + } + + const BaseRuleModel p0; + CCRP x; + //CCRP_OneTable x; +}; + +vector tofreelist; + +HieroLMModel* plm; + +struct NPGrammarIter : public GrammarIter, public RuleBin { + NPGrammarIter() : arity() { tofreelist.push_back(this); } + NPGrammarIter(const TRulePtr& inr, const int a, int symbol) : arity(a + (symbol < 0 ? 1 : 0)) { + if (inr) { + r.reset(new TRule(*inr)); + } else { + static const int kLHS = -TD::Convert("X"); + r.reset(new TRule); + r->lhs_ = kLHS; + } + TRule& rr = *r; + rr.f_.push_back(symbol); + rr.e_.push_back(symbol < 0 ? (1-int(arity)) : symbol); + tofreelist.push_back(this); + } + virtual int GetNumRules() const { + if (r) return 1; else return 0; + } + virtual TRulePtr GetIthRule(int) const { + return r; + } + virtual int Arity() const { + return arity; + } + virtual const RuleBin* GetRules() const { + if (!r) return NULL; else return this; + } + virtual const GrammarIter* Extend(int symbol) const { + return new NPGrammarIter(r, arity, symbol); + } + const unsigned char arity; + TRulePtr r; +}; + +struct NPGrammar : public Grammar { + virtual const GrammarIter* GetRoot() const { + return new NPGrammarIter; + } +}; + +void SampleDerivation(const Hypergraph& hg, MT19937* rng, vector* sampled_deriv, HieroLMModel* plm) { + HieroLMModel& lm = *plm; + vector node_probs; + const prob_t total_prob = Inside(hg, &node_probs); + queue q; + q.push(hg.nodes_.size() - 3); + while(!q.empty()) { + unsigned cur_node_id = q.front(); +// cerr << "NODE=" << cur_node_id << endl; + q.pop(); + const Hypergraph::Node& node = hg.nodes_[cur_node_id]; + const unsigned num_in_edges = node.in_edges_.size(); + unsigned sampled_edge = 0; + if (num_in_edges == 1) { + sampled_edge = node.in_edges_[0]; + } else { + //prob_t z; + assert(num_in_edges > 1); + SampleSet ss; + for (unsigned j = 0; j < num_in_edges; ++j) { + const Hypergraph::Edge& edge = hg.edges_[node.in_edges_[j]]; + prob_t p = edge.edge_prob_; + for (unsigned k = 0; k < edge.tail_nodes_.size(); ++k) + p *= node_probs[edge.tail_nodes_[k]]; + ss.add(p); +// cerr << log(ss[j]) << " ||| " << edge.rule_->AsString() << endl; + //z += p; + } +// for (unsigned j = 0; j < num_in_edges; ++j) { +// const Hypergraph::Edge& edge = hg.edges_[node.in_edges_[j]]; +// cerr << exp(log(ss[j] / z)) << " ||| " << edge.rule_->AsString() << endl; +// } +// cerr << " --- \n"; + sampled_edge = node.in_edges_[rng->SelectSample(ss)]; + } + sampled_deriv->push_back(sampled_edge); + const Hypergraph::Edge& edge = hg.edges_[sampled_edge]; + for (unsigned j = 0; j < edge.tail_nodes_.size(); ++j) { + q.push(edge.tail_nodes_[j]); + } + } + for (unsigned i = 0; i < sampled_deriv->size(); ++i) { + cerr << *hg.edges_[(*sampled_deriv)[i]].rule_ << endl; + } +} + +void IncrementDerivation(const Hypergraph& hg, const vector& d, HieroLMModel* plm, MT19937* rng) { + for (unsigned i = 0; i < d.size(); ++i) + plm->Increment(*hg.edges_[d[i]].rule_, rng); +} + +void DecrementDerivation(const Hypergraph& hg, const vector& d, HieroLMModel* plm, MT19937* rng) { + for (unsigned i = 0; i < d.size(); ++i) + plm->Decrement(*hg.edges_[d[i]].rule_, rng); +} + +int main(int argc, char** argv) { + po::variables_map conf; + vector grammars; + grammars.push_back(GrammarPtr(new NPGrammar)); + + InitCommandLine(argc, argv, &conf); + const unsigned samples = conf["samples"].as(); + + if (conf.count("random_seed")) + prng.reset(new MT19937(conf["random_seed"].as())); + else + prng.reset(new MT19937); + MT19937& rng = *prng; + + vector > corpuse; + set vocabe; + cerr << "Reading corpus...\n"; + ReadCorpus(conf["input"].as(), &corpuse, &vocabe); + cerr << "E-corpus size: " << corpuse.size() << " sentences\t (" << vocabe.size() << " word types)\n"; + HieroLMModel lm(vocabe.size()); + + plm = &lm; + ExhaustiveBottomUpParser parser("X", grammars); + + Hypergraph hg; + const int kX = -TD::Convert("X"); + const int kLP = FD::Convert("LogProb"); + SparseVector v; v.set_value(kLP, 1.0); + vector > derivs(corpuse.size()); + for (int SS=0; SS < samples; ++SS) { + for (int ci = 0; ci < corpuse.size(); ++ci) { + vector& src = corpuse[ci]; + Lattice lat(src.size()); + for (unsigned i = 0; i < src.size(); ++i) + lat[i].push_back(LatticeArc(src[i], 0.0, 1)); + cerr << TD::GetString(src) << endl; + hg.clear(); + parser.Parse(lat, &hg); // exhaustive parse + DecrementDerivation(hg, derivs[ci], &lm, &rng); + for (unsigned i = 0; i < hg.edges_.size(); ++i) { + TRule& r = *hg.edges_[i].rule_; + if (r.lhs_ == kX) + hg.edges_[i].edge_prob_ = lm.Prob(r); + } + vector d; + SampleDerivation(hg, &rng, &d, &lm); + derivs[ci] = d; + IncrementDerivation(hg, derivs[ci], &lm, &rng); + if (tofreelist.size() > 100000) { + cerr << "Freeing ... "; + for (unsigned i = 0; i < tofreelist.size(); ++i) + delete tofreelist[i]; + tofreelist.clear(); + cerr << "Freed.\n"; + } + } + cerr << "LLH=" << lm.Likelihood() << endl; + } + return 0; +} + diff --git a/phrasinator/ccrp.h b/phrasinator/ccrp.h deleted file mode 100644 index 9acf12ab..00000000 --- a/phrasinator/ccrp.h +++ /dev/null @@ -1,294 +0,0 @@ -#ifndef _CCRP_H_ -#define _CCRP_H_ - -#include -#include -#include -#include -#include -#include -#include -#include -#include "sampler.h" -#include "slice_sampler.h" - -// Chinese restaurant process (Pitman-Yor parameters) with table tracking. - -template > -class CCRP { - public: - CCRP(double disc, double conc) : - num_tables_(), - num_customers_(), - discount_(disc), - concentration_(conc), - discount_prior_alpha_(std::numeric_limits::quiet_NaN()), - discount_prior_beta_(std::numeric_limits::quiet_NaN()), - concentration_prior_shape_(std::numeric_limits::quiet_NaN()), - concentration_prior_rate_(std::numeric_limits::quiet_NaN()) {} - - CCRP(double d_alpha, double d_beta, double c_shape, double c_rate, double d = 0.1, double c = 10.0) : - num_tables_(), - num_customers_(), - discount_(d), - concentration_(c), - discount_prior_alpha_(d_alpha), - discount_prior_beta_(d_beta), - concentration_prior_shape_(c_shape), - concentration_prior_rate_(c_rate) {} - - double discount() const { return discount_; } - double concentration() const { return concentration_; } - - bool has_discount_prior() const { - return !std::isnan(discount_prior_alpha_); - } - - bool has_concentration_prior() const { - return !std::isnan(concentration_prior_shape_); - } - - void clear() { - num_tables_ = 0; - num_customers_ = 0; - dish_locs_.clear(); - } - - unsigned num_tables() const { - return num_tables_; - } - - unsigned num_tables(const Dish& dish) const { - const typename std::tr1::unordered_map::const_iterator it = dish_locs_.find(dish); - if (it == dish_locs_.end()) return 0; - return it->second.table_counts_.size(); - } - - unsigned num_customers() const { - return num_customers_; - } - - unsigned num_customers(const Dish& dish) const { - const typename std::tr1::unordered_map::const_iterator it = dish_locs_.find(dish); - if (it == dish_locs_.end()) return 0; - return it->total_dish_count_; - } - - // returns +1 or 0 indicating whether a new table was opened - int increment(const Dish& dish, const double& p0, MT19937* rng) { - DishLocations& loc = dish_locs_[dish]; - bool share_table = false; - if (loc.total_dish_count_) { - const double p_empty = (concentration_ + num_tables_ * discount_) * p0; - const double p_share = (loc.total_dish_count_ - loc.table_counts_.size() * discount_); - share_table = rng->SelectSample(p_empty, p_share); - } - if (share_table) { - double r = rng->next() * (loc.total_dish_count_ - loc.table_counts_.size() * discount_); - for (typename std::list::iterator ti = loc.table_counts_.begin(); - ti != loc.table_counts_.end(); ++ti) { - r -= (*ti - discount_); - if (r <= 0.0) { - ++(*ti); - break; - } - } - if (r > 0.0) { - std::cerr << "Serious error: r=" << r << std::endl; - Print(&std::cerr); - assert(r <= 0.0); - } - } else { - loc.table_counts_.push_back(1u); - ++num_tables_; - } - ++loc.total_dish_count_; - ++num_customers_; - return (share_table ? 0 : 1); - } - - // returns -1 or 0, indicating whether a table was closed - int decrement(const Dish& dish, MT19937* rng) { - DishLocations& loc = dish_locs_[dish]; - assert(loc.total_dish_count_); - if (loc.total_dish_count_ == 1) { - dish_locs_.erase(dish); - --num_tables_; - --num_customers_; - return -1; - } else { - int delta = 0; - // sample customer to remove UNIFORMLY. that is, do NOT use the discount - // here. if you do, it will introduce (unwanted) bias! - double r = rng->next() * loc.total_dish_count_; - --loc.total_dish_count_; - for (typename std::list::iterator ti = loc.table_counts_.begin(); - ti != loc.table_counts_.end(); ++ti) { - r -= *ti; - if (r <= 0.0) { - if ((--(*ti)) == 0) { - --num_tables_; - delta = -1; - loc.table_counts_.erase(ti); - } - break; - } - } - if (r > 0.0) { - std::cerr << "Serious error: r=" << r << std::endl; - Print(&std::cerr); - assert(r <= 0.0); - } - --num_customers_; - return delta; - } - } - - double prob(const Dish& dish, const double& p0) const { - const typename std::tr1::unordered_map::const_iterator it = dish_locs_.find(dish); - const double r = num_tables_ * discount_ + concentration_; - if (it == dish_locs_.end()) { - return r * p0 / (num_customers_ + concentration_); - } else { - return (it->second.total_dish_count_ - discount_ * it->second.table_counts_.size() + r * p0) / - (num_customers_ + concentration_); - } - } - - double log_crp_prob() const { - return log_crp_prob(discount_, concentration_); - } - - static double log_beta_density(const double& x, const double& alpha, const double& beta) { - assert(x > 0.0); - assert(x < 1.0); - assert(alpha > 0.0); - assert(beta > 0.0); - const double lp = (alpha-1)*log(x)+(beta-1)*log(1-x)+lgamma(alpha+beta)-lgamma(alpha)-lgamma(beta); - return lp; - } - - static double log_gamma_density(const double& x, const double& shape, const double& rate) { - assert(x >= 0.0); - assert(shape > 0.0); - assert(rate > 0.0); - const double lp = (shape-1)*log(x) - shape*log(rate) - x/rate - lgamma(shape); - return lp; - } - - // taken from http://en.wikipedia.org/wiki/Chinese_restaurant_process - // does not include P_0's - double log_crp_prob(const double& discount, const double& concentration) const { - double lp = 0.0; - if (has_discount_prior()) - lp = log_beta_density(discount, discount_prior_alpha_, discount_prior_beta_); - if (has_concentration_prior()) - lp += log_gamma_density(concentration, concentration_prior_shape_, concentration_prior_rate_); - assert(lp <= 0.0); - if (num_customers_) { - if (discount > 0.0) { - const double r = lgamma(1.0 - discount); - lp += lgamma(concentration) - lgamma(concentration + num_customers_) - + num_tables_ * log(discount) + lgamma(concentration / discount + num_tables_) - - lgamma(concentration / discount); - assert(std::isfinite(lp)); - for (typename std::tr1::unordered_map::const_iterator it = dish_locs_.begin(); - it != dish_locs_.end(); ++it) { - const DishLocations& cur = it->second; - for (std::list::const_iterator ti = cur.table_counts_.begin(); ti != cur.table_counts_.end(); ++ti) { - lp += lgamma(*ti - discount) - r; - } - } - } else { - assert(!"not implemented yet"); - } - } - assert(std::isfinite(lp)); - return lp; - } - - void resample_hyperparameters(MT19937* rng, const unsigned nloop = 5, const unsigned niterations = 10) { - assert(has_discount_prior() || has_concentration_prior()); - DiscountResampler dr(*this); - ConcentrationResampler cr(*this); - for (int iter = 0; iter < nloop; ++iter) { - if (has_concentration_prior()) { - concentration_ = slice_sampler1d(cr, concentration_, *rng, 0.0, - std::numeric_limits::infinity(), 0.0, niterations, 100*niterations); - } - if (has_discount_prior()) { - discount_ = slice_sampler1d(dr, discount_, *rng, std::numeric_limits::min(), - 1.0, 0.0, niterations, 100*niterations); - } - } - concentration_ = slice_sampler1d(cr, concentration_, *rng, 0.0, - std::numeric_limits::infinity(), 0.0, niterations, 100*niterations); - } - - struct DiscountResampler { - DiscountResampler(const CCRP& crp) : crp_(crp) {} - const CCRP& crp_; - double operator()(const double& proposed_discount) const { - return crp_.log_crp_prob(proposed_discount, crp_.concentration_); - } - }; - - struct ConcentrationResampler { - ConcentrationResampler(const CCRP& crp) : crp_(crp) {} - const CCRP& crp_; - double operator()(const double& proposed_concentration) const { - return crp_.log_crp_prob(crp_.discount_, proposed_concentration); - } - }; - - struct DishLocations { - DishLocations() : total_dish_count_() {} - unsigned total_dish_count_; // customers at all tables with this dish - std::list table_counts_; // list<> gives O(1) deletion and insertion, which we want - // .size() is the number of tables for this dish - }; - - void Print(std::ostream* out) const { - std::cerr << "PYP(d=" << discount_ << ",c=" << concentration_ << ") customers=" << num_customers_ << std::endl; - for (typename std::tr1::unordered_map::const_iterator it = dish_locs_.begin(); - it != dish_locs_.end(); ++it) { - (*out) << it->first << " (" << it->second.total_dish_count_ << " on " << it->second.table_counts_.size() << " tables): "; - for (typename std::list::const_iterator i = it->second.table_counts_.begin(); - i != it->second.table_counts_.end(); ++i) { - (*out) << " " << *i; - } - (*out) << std::endl; - } - } - - typedef typename std::tr1::unordered_map::const_iterator const_iterator; - const_iterator begin() const { - return dish_locs_.begin(); - } - const_iterator end() const { - return dish_locs_.end(); - } - - unsigned num_tables_; - unsigned num_customers_; - std::tr1::unordered_map dish_locs_; - - double discount_; - double concentration_; - - // optional beta prior on discount_ (NaN if no prior) - double discount_prior_alpha_; - double discount_prior_beta_; - - // optional gamma prior on concentration_ (NaN if no prior) - double concentration_prior_shape_; - double concentration_prior_rate_; -}; - -template -std::ostream& operator<<(std::ostream& o, const CCRP& c) { - c.Print(&o); - return o; -} - -#endif diff --git a/utils/ccrp.h b/utils/ccrp.h new file mode 100644 index 00000000..1a9e3ed5 --- /dev/null +++ b/utils/ccrp.h @@ -0,0 +1,340 @@ +#ifndef _CCRP_H_ +#define _CCRP_H_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "sampler.h" +#include "slice_sampler.h" + +// Chinese restaurant process (Pitman-Yor parameters) with table tracking. + +template > +class CCRP { + public: + CCRP(double disc, double conc) : + num_tables_(), + num_customers_(), + discount_(disc), + concentration_(conc), + discount_prior_alpha_(std::numeric_limits::quiet_NaN()), + discount_prior_beta_(std::numeric_limits::quiet_NaN()), + concentration_prior_shape_(std::numeric_limits::quiet_NaN()), + concentration_prior_rate_(std::numeric_limits::quiet_NaN()) {} + + CCRP(double d_alpha, double d_beta, double c_shape, double c_rate, double d = 0.9, double c = 1.0) : + num_tables_(), + num_customers_(), + discount_(d), + concentration_(c), + discount_prior_alpha_(d_alpha), + discount_prior_beta_(d_beta), + concentration_prior_shape_(c_shape), + concentration_prior_rate_(c_rate) {} + + double discount() const { return discount_; } + double concentration() const { return concentration_; } + + bool has_discount_prior() const { + return !std::isnan(discount_prior_alpha_); + } + + bool has_concentration_prior() const { + return !std::isnan(concentration_prior_shape_); + } + + void clear() { + num_tables_ = 0; + num_customers_ = 0; + dish_locs_.clear(); + } + + unsigned num_tables() const { + return num_tables_; + } + + unsigned num_tables(const Dish& dish) const { + const typename std::tr1::unordered_map::const_iterator it = dish_locs_.find(dish); + if (it == dish_locs_.end()) return 0; + return it->second.table_counts_.size(); + } + + unsigned num_customers() const { + return num_customers_; + } + + unsigned num_customers(const Dish& dish) const { + const typename std::tr1::unordered_map::const_iterator it = dish_locs_.find(dish); + if (it == dish_locs_.end()) return 0; + return it->total_dish_count_; + } + + // returns +1 or 0 indicating whether a new table was opened + int increment(const Dish& dish, const double& p0, MT19937* rng) { + DishLocations& loc = dish_locs_[dish]; + bool share_table = false; + if (loc.total_dish_count_) { + const double p_empty = (concentration_ + num_tables_ * discount_) * p0; + const double p_share = (loc.total_dish_count_ - loc.table_counts_.size() * discount_); + share_table = rng->SelectSample(p_empty, p_share); + } + if (share_table) { + double r = rng->next() * (loc.total_dish_count_ - loc.table_counts_.size() * discount_); + for (typename std::list::iterator ti = loc.table_counts_.begin(); + ti != loc.table_counts_.end(); ++ti) { + r -= (*ti - discount_); + if (r <= 0.0) { + ++(*ti); + break; + } + } + if (r > 0.0) { + std::cerr << "Serious error: r=" << r << std::endl; + Print(&std::cerr); + assert(r <= 0.0); + } + } else { + loc.table_counts_.push_back(1u); + ++num_tables_; + } + ++loc.total_dish_count_; + ++num_customers_; + return (share_table ? 0 : 1); + } + + // returns +1 or 0 indicating whether a new table was opened + template + int incrementT(const Dish& dish, const T& p0, MT19937* rng) { + DishLocations& loc = dish_locs_[dish]; + bool share_table = false; + if (loc.total_dish_count_) { + const T p_empty = T(concentration_ + num_tables_ * discount_) * p0; + const T p_share = T(loc.total_dish_count_ - loc.table_counts_.size() * discount_); + share_table = rng->SelectSample(p_empty, p_share); + } + if (share_table) { + double r = rng->next() * (loc.total_dish_count_ - loc.table_counts_.size() * discount_); + for (typename std::list::iterator ti = loc.table_counts_.begin(); + ti != loc.table_counts_.end(); ++ti) { + r -= (*ti - discount_); + if (r <= 0.0) { + ++(*ti); + break; + } + } + if (r > 0.0) { + std::cerr << "Serious error: r=" << r << std::endl; + Print(&std::cerr); + assert(r <= 0.0); + } + } else { + loc.table_counts_.push_back(1u); + ++num_tables_; + } + ++loc.total_dish_count_; + ++num_customers_; + return (share_table ? 0 : 1); + } + + // returns -1 or 0, indicating whether a table was closed + int decrement(const Dish& dish, MT19937* rng) { + DishLocations& loc = dish_locs_[dish]; + assert(loc.total_dish_count_); + if (loc.total_dish_count_ == 1) { + dish_locs_.erase(dish); + --num_tables_; + --num_customers_; + return -1; + } else { + int delta = 0; + // sample customer to remove UNIFORMLY. that is, do NOT use the discount + // here. if you do, it will introduce (unwanted) bias! + double r = rng->next() * loc.total_dish_count_; + --loc.total_dish_count_; + for (typename std::list::iterator ti = loc.table_counts_.begin(); + ti != loc.table_counts_.end(); ++ti) { + r -= *ti; + if (r <= 0.0) { + if ((--(*ti)) == 0) { + --num_tables_; + delta = -1; + loc.table_counts_.erase(ti); + } + break; + } + } + if (r > 0.0) { + std::cerr << "Serious error: r=" << r << std::endl; + Print(&std::cerr); + assert(r <= 0.0); + } + --num_customers_; + return delta; + } + } + + double prob(const Dish& dish, const double& p0) const { + const typename std::tr1::unordered_map::const_iterator it = dish_locs_.find(dish); + const double r = num_tables_ * discount_ + concentration_; + if (it == dish_locs_.end()) { + return r * p0 / (num_customers_ + concentration_); + } else { + return (it->second.total_dish_count_ - discount_ * it->second.table_counts_.size() + r * p0) / + (num_customers_ + concentration_); + } + } + + template + T probT(const Dish& dish, const T& p0) const { + const typename std::tr1::unordered_map::const_iterator it = dish_locs_.find(dish); + const T r = T(num_tables_ * discount_ + concentration_); + if (it == dish_locs_.end()) { + return r * p0 / T(num_customers_ + concentration_); + } else { + return (T(it->second.total_dish_count_ - discount_ * it->second.table_counts_.size()) + r * p0) / + T(num_customers_ + concentration_); + } + } + + double log_crp_prob() const { + return log_crp_prob(discount_, concentration_); + } + + static double log_beta_density(const double& x, const double& alpha, const double& beta) { + assert(x > 0.0); + assert(x < 1.0); + assert(alpha > 0.0); + assert(beta > 0.0); + const double lp = (alpha-1)*log(x)+(beta-1)*log(1-x)+lgamma(alpha+beta)-lgamma(alpha)-lgamma(beta); + return lp; + } + + static double log_gamma_density(const double& x, const double& shape, const double& rate) { + assert(x >= 0.0); + assert(shape > 0.0); + assert(rate > 0.0); + const double lp = (shape-1)*log(x) - shape*log(rate) - x/rate - lgamma(shape); + return lp; + } + + // taken from http://en.wikipedia.org/wiki/Chinese_restaurant_process + // does not include P_0's + double log_crp_prob(const double& discount, const double& concentration) const { + double lp = 0.0; + if (has_discount_prior()) + lp = log_beta_density(discount, discount_prior_alpha_, discount_prior_beta_); + if (has_concentration_prior()) + lp += log_gamma_density(concentration, concentration_prior_shape_, concentration_prior_rate_); + assert(lp <= 0.0); + if (num_customers_) { + if (discount > 0.0) { + const double r = lgamma(1.0 - discount); + lp += lgamma(concentration) - lgamma(concentration + num_customers_) + + num_tables_ * log(discount) + lgamma(concentration / discount + num_tables_) + - lgamma(concentration / discount); + assert(std::isfinite(lp)); + for (typename std::tr1::unordered_map::const_iterator it = dish_locs_.begin(); + it != dish_locs_.end(); ++it) { + const DishLocations& cur = it->second; + for (std::list::const_iterator ti = cur.table_counts_.begin(); ti != cur.table_counts_.end(); ++ti) { + lp += lgamma(*ti - discount) - r; + } + } + } else { + assert(!"not implemented yet"); + } + } + assert(std::isfinite(lp)); + return lp; + } + + void resample_hyperparameters(MT19937* rng, const unsigned nloop = 5, const unsigned niterations = 10) { + assert(has_discount_prior() || has_concentration_prior()); + DiscountResampler dr(*this); + ConcentrationResampler cr(*this); + for (int iter = 0; iter < nloop; ++iter) { + if (has_concentration_prior()) { + concentration_ = slice_sampler1d(cr, concentration_, *rng, 0.0, + std::numeric_limits::infinity(), 0.0, niterations, 100*niterations); + } + if (has_discount_prior()) { + discount_ = slice_sampler1d(dr, discount_, *rng, std::numeric_limits::min(), + 1.0, 0.0, niterations, 100*niterations); + } + } + concentration_ = slice_sampler1d(cr, concentration_, *rng, 0.0, + std::numeric_limits::infinity(), 0.0, niterations, 100*niterations); + } + + struct DiscountResampler { + DiscountResampler(const CCRP& crp) : crp_(crp) {} + const CCRP& crp_; + double operator()(const double& proposed_discount) const { + return crp_.log_crp_prob(proposed_discount, crp_.concentration_); + } + }; + + struct ConcentrationResampler { + ConcentrationResampler(const CCRP& crp) : crp_(crp) {} + const CCRP& crp_; + double operator()(const double& proposed_concentration) const { + return crp_.log_crp_prob(crp_.discount_, proposed_concentration); + } + }; + + struct DishLocations { + DishLocations() : total_dish_count_() {} + unsigned total_dish_count_; // customers at all tables with this dish + std::list table_counts_; // list<> gives O(1) deletion and insertion, which we want + // .size() is the number of tables for this dish + }; + + void Print(std::ostream* out) const { + std::cerr << "PYP(d=" << discount_ << ",c=" << concentration_ << ") customers=" << num_customers_ << std::endl; + for (typename std::tr1::unordered_map::const_iterator it = dish_locs_.begin(); + it != dish_locs_.end(); ++it) { + (*out) << it->first << " (" << it->second.total_dish_count_ << " on " << it->second.table_counts_.size() << " tables): "; + for (typename std::list::const_iterator i = it->second.table_counts_.begin(); + i != it->second.table_counts_.end(); ++i) { + (*out) << " " << *i; + } + (*out) << std::endl; + } + } + + typedef typename std::tr1::unordered_map::const_iterator const_iterator; + const_iterator begin() const { + return dish_locs_.begin(); + } + const_iterator end() const { + return dish_locs_.end(); + } + + unsigned num_tables_; + unsigned num_customers_; + std::tr1::unordered_map dish_locs_; + + double discount_; + double concentration_; + + // optional beta prior on discount_ (NaN if no prior) + double discount_prior_alpha_; + double discount_prior_beta_; + + // optional gamma prior on concentration_ (NaN if no prior) + double concentration_prior_shape_; + double concentration_prior_rate_; +}; + +template +std::ostream& operator<<(std::ostream& o, const CCRP& c) { + c.Print(&o); + return o; +} + +#endif diff --git a/utils/ccrp_onetable.h b/utils/ccrp_onetable.h index a868af9a..b63737d1 100644 --- a/utils/ccrp_onetable.h +++ b/utils/ccrp_onetable.h @@ -117,6 +117,18 @@ class CCRP_OneTable { } } + template + T probT(const Dish& dish, const T& p0) const { + const typename DishMapType::const_iterator it = dish_counts_.find(dish); + const T r(num_tables_ * discount_ + concentration_); + if (it == dish_counts_.end()) { + return r * p0 / T(num_customers_ + concentration_); + } else { + return (T(it->second - discount_) + r * p0) / + T(num_customers_ + concentration_); + } + } + double log_crp_prob() const { return log_crp_prob(discount_, concentration_); } diff --git a/utils/sampler.h b/utils/sampler.h index 153e7ef1..22c873d4 100644 --- a/utils/sampler.h +++ b/utils/sampler.h @@ -48,7 +48,7 @@ struct RandomNumberGenerator { template size_t SelectSample(const F& a, const F& b, double T = 1.0) { if (T == 1.0) { - if (this->next() > (a / (a + b))) return 1; else return 0; + if (F(this->next()) > (a / (a + b))) return 1; else return 0; } else { assert(!"not implemented"); } -- cgit v1.2.3 From d87220030b82fed860efee40487502e9ee8f0651 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Mon, 27 Feb 2012 02:19:34 +0000 Subject: generic bayesian cfg learner with a bunch of cfg grammar types --- .gitignore | 1 + decoder/trule.cc | 16 +-- gi/pf/Makefile.am | 4 +- gi/pf/hierolm.cc | 309 ----------------------------------------- gi/pf/learn_cfg.cc | 394 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 398 insertions(+), 326 deletions(-) delete mode 100644 gi/pf/hierolm.cc create mode 100644 gi/pf/learn_cfg.cc (limited to 'gi/pf/Makefile.am') diff --git a/.gitignore b/.gitignore index 327f7261..28d5a60a 100644 --- a/.gitignore +++ b/.gitignore @@ -57,6 +57,7 @@ training/mpi_extract_reachable klm/lm/build_binary extools/extractor_monolingual gi/pf/.deps +gi/pf/learn_cfg gi/pf/brat gi/pf/cbgi gi/pf/dpnaive diff --git a/decoder/trule.cc b/decoder/trule.cc index 40235542..141b8faa 100644 --- a/decoder/trule.cc +++ b/decoder/trule.cc @@ -232,16 +232,6 @@ void TRule::ComputeArity() { arity_ = 1 - min; } -static string AnonymousStrVar(int i) { - string res("[v]"); - if(!(i <= 0 && i >= -8)) { - cerr << "Can't handle more than 9 non-terminals: index=" << (-i) << endl; - abort(); - } - res[1] = '1' - i; - return res; -} - string TRule::AsString(bool verbose) const { ostringstream os; int idx = 0; @@ -259,15 +249,11 @@ string TRule::AsString(bool verbose) const { } } os << " ||| "; - if (idx > 9) { - cerr << "Too many non-terminals!\n partial: " << os.str() << endl; - exit(1); - } for (int i =0; i -#include -#include - -#include -#include -#include - -#include "inside_outside.h" -#include "hg.h" -#include "bottom_up_parser.h" -#include "fdict.h" -#include "grammar.h" -#include "m.h" -#include "trule.h" -#include "tdict.h" -#include "filelib.h" -#include "dict.h" -#include "sampler.h" -#include "ccrp.h" -#include "ccrp_onetable.h" - -using namespace std; -using namespace tr1; -namespace po = boost::program_options; - -shared_ptr prng; - -void InitCommandLine(int argc, char** argv, po::variables_map* conf) { - po::options_description opts("Configuration options"); - opts.add_options() - ("samples,s",po::value()->default_value(1000),"Number of samples") - ("input,i",po::value(),"Read parallel data from") - ("random_seed,S",po::value(), "Random seed"); - po::options_description clo("Command line options"); - clo.add_options() - ("config", po::value(), "Configuration file") - ("help,h", "Print this help message and exit"); - po::options_description dconfig_options, dcmdline_options; - dconfig_options.add(opts); - dcmdline_options.add(opts).add(clo); - - po::store(parse_command_line(argc, argv, dcmdline_options), *conf); - if (conf->count("config")) { - ifstream config((*conf)["config"].as().c_str()); - po::store(po::parse_config_file(config, dconfig_options), *conf); - } - po::notify(*conf); - - if (conf->count("help") || (conf->count("input") == 0)) { - cerr << dcmdline_options << endl; - exit(1); - } -} - -void ReadCorpus(const string& filename, - vector >* e, - set* vocab_e) { - e->clear(); - vocab_e->clear(); - istream* in; - if (filename == "-") - in = &cin; - else - in = new ifstream(filename.c_str()); - assert(*in); - string line; - while(*in) { - getline(*in, line); - if (line.empty() && !*in) break; - e->push_back(vector()); - vector& le = e->back(); - TD::ConvertSentence(line, &le); - for (unsigned i = 0; i < le.size(); ++i) - vocab_e->insert(le[i]); - } - if (in != &cin) delete in; -} - -struct Grid { - // a b c d e - // 0 - 0 - - - vector grid; -}; - -struct BaseRuleModel { - explicit BaseRuleModel(unsigned term_size, - unsigned nonterm_size = 1) : - unif_term(1.0 / term_size), - unif_nonterm(1.0 / nonterm_size) {} - prob_t operator()(const TRule& r) const { - prob_t p; p.logeq(Md::log_poisson(1.0, r.f_.size())); - const prob_t term_prob((2.0 + 0.01*r.f_.size()) / (r.f_.size() + 2)); - const prob_t nonterm_prob(1.0 - term_prob.as_float()); - for (unsigned i = 0; i < r.f_.size(); ++i) { - if (r.f_[i] <= 0) { // nonterminal - p *= nonterm_prob; - p *= unif_nonterm; - } else { // terminal - p *= term_prob; - p *= unif_term; - } - } - return p; - } - const prob_t unif_term, unif_nonterm; -}; - -struct HieroLMModel { - explicit HieroLMModel(unsigned vocab_size) : p0(vocab_size), x(1,1,1,1) {} - - prob_t Prob(const TRule& r) const { - return x.probT(r, p0(r)); - } - - int Increment(const TRule& r, MT19937* rng) { - return x.incrementT(r, p0(r), rng); - // return x.increment(r); - } - - int Decrement(const TRule& r, MT19937* rng) { - return x.decrement(r, rng); - //return x.decrement(r); - } - - prob_t Likelihood() const { - prob_t p; - p.logeq(x.log_crp_prob()); - for (CCRP::const_iterator it = x.begin(); it != x.end(); ++it) { - prob_t tp = p0(it->first); - tp.poweq(it->second.table_counts_.size()); - p *= tp; - } - //for (CCRP_OneTable::const_iterator it = x.begin(); it != x.end(); ++it) - // p *= p0(it->first); - return p; - } - - void ResampleHyperparameters(MT19937* rng) { - x.resample_hyperparameters(rng); - cerr << " d=" << x.discount() << ", alpha=" << x.concentration() << endl; - } - - const BaseRuleModel p0; - CCRP x; - //CCRP_OneTable x; -}; - -vector tofreelist; - -HieroLMModel* plm; - -struct NPGrammarIter : public GrammarIter, public RuleBin { - NPGrammarIter() : arity() { tofreelist.push_back(this); } - NPGrammarIter(const TRulePtr& inr, const int a, int symbol) : arity(a + (symbol < 0 ? 1 : 0)) { - if (inr) { - r.reset(new TRule(*inr)); - } else { - static const int kLHS = -TD::Convert("X"); - r.reset(new TRule); - r->lhs_ = kLHS; - } - TRule& rr = *r; - rr.f_.push_back(symbol); - rr.e_.push_back(symbol < 0 ? (1-int(arity)) : symbol); - tofreelist.push_back(this); - } - virtual int GetNumRules() const { - if (r) return 1; else return 0; - } - virtual TRulePtr GetIthRule(int) const { - return r; - } - virtual int Arity() const { - return arity; - } - virtual const RuleBin* GetRules() const { - if (!r) return NULL; else return this; - } - virtual const GrammarIter* Extend(int symbol) const { - return new NPGrammarIter(r, arity, symbol); - } - const unsigned char arity; - TRulePtr r; -}; - -struct NPGrammar : public Grammar { - virtual const GrammarIter* GetRoot() const { - return new NPGrammarIter; - } -}; - -void SampleDerivation(const Hypergraph& hg, MT19937* rng, vector* sampled_deriv, HieroLMModel* plm) { - HieroLMModel& lm = *plm; - vector node_probs; - const prob_t total_prob = Inside(hg, &node_probs); - queue q; - q.push(hg.nodes_.size() - 3); - while(!q.empty()) { - unsigned cur_node_id = q.front(); -// cerr << "NODE=" << cur_node_id << endl; - q.pop(); - const Hypergraph::Node& node = hg.nodes_[cur_node_id]; - const unsigned num_in_edges = node.in_edges_.size(); - unsigned sampled_edge = 0; - if (num_in_edges == 1) { - sampled_edge = node.in_edges_[0]; - } else { - //prob_t z; - assert(num_in_edges > 1); - SampleSet ss; - for (unsigned j = 0; j < num_in_edges; ++j) { - const Hypergraph::Edge& edge = hg.edges_[node.in_edges_[j]]; - prob_t p = edge.edge_prob_; - for (unsigned k = 0; k < edge.tail_nodes_.size(); ++k) - p *= node_probs[edge.tail_nodes_[k]]; - ss.add(p); -// cerr << log(ss[j]) << " ||| " << edge.rule_->AsString() << endl; - //z += p; - } -// for (unsigned j = 0; j < num_in_edges; ++j) { -// const Hypergraph::Edge& edge = hg.edges_[node.in_edges_[j]]; -// cerr << exp(log(ss[j] / z)) << " ||| " << edge.rule_->AsString() << endl; -// } -// cerr << " --- \n"; - sampled_edge = node.in_edges_[rng->SelectSample(ss)]; - } - sampled_deriv->push_back(sampled_edge); - const Hypergraph::Edge& edge = hg.edges_[sampled_edge]; - for (unsigned j = 0; j < edge.tail_nodes_.size(); ++j) { - q.push(edge.tail_nodes_[j]); - } - } - for (unsigned i = 0; i < sampled_deriv->size(); ++i) { - cerr << *hg.edges_[(*sampled_deriv)[i]].rule_ << endl; - } -} - -void IncrementDerivation(const Hypergraph& hg, const vector& d, HieroLMModel* plm, MT19937* rng) { - for (unsigned i = 0; i < d.size(); ++i) - plm->Increment(*hg.edges_[d[i]].rule_, rng); -} - -void DecrementDerivation(const Hypergraph& hg, const vector& d, HieroLMModel* plm, MT19937* rng) { - for (unsigned i = 0; i < d.size(); ++i) - plm->Decrement(*hg.edges_[d[i]].rule_, rng); -} - -int main(int argc, char** argv) { - po::variables_map conf; - vector grammars; - grammars.push_back(GrammarPtr(new NPGrammar)); - - InitCommandLine(argc, argv, &conf); - const unsigned samples = conf["samples"].as(); - - if (conf.count("random_seed")) - prng.reset(new MT19937(conf["random_seed"].as())); - else - prng.reset(new MT19937); - MT19937& rng = *prng; - - vector > corpuse; - set vocabe; - cerr << "Reading corpus...\n"; - ReadCorpus(conf["input"].as(), &corpuse, &vocabe); - cerr << "E-corpus size: " << corpuse.size() << " sentences\t (" << vocabe.size() << " word types)\n"; - HieroLMModel lm(vocabe.size()); - - plm = &lm; - ExhaustiveBottomUpParser parser("X", grammars); - - Hypergraph hg; - const int kX = -TD::Convert("X"); - const int kLP = FD::Convert("LogProb"); - SparseVector v; v.set_value(kLP, 1.0); - vector > derivs(corpuse.size()); - for (int SS=0; SS < samples; ++SS) { - for (int ci = 0; ci < corpuse.size(); ++ci) { - vector& src = corpuse[ci]; - Lattice lat(src.size()); - for (unsigned i = 0; i < src.size(); ++i) - lat[i].push_back(LatticeArc(src[i], 0.0, 1)); - cerr << TD::GetString(src) << endl; - hg.clear(); - parser.Parse(lat, &hg); // exhaustive parse - DecrementDerivation(hg, derivs[ci], &lm, &rng); - for (unsigned i = 0; i < hg.edges_.size(); ++i) { - TRule& r = *hg.edges_[i].rule_; - if (r.lhs_ == kX) - hg.edges_[i].edge_prob_ = lm.Prob(r); - } - vector d; - SampleDerivation(hg, &rng, &d, &lm); - derivs[ci] = d; - IncrementDerivation(hg, derivs[ci], &lm, &rng); - if (tofreelist.size() > 100000) { - cerr << "Freeing ... "; - for (unsigned i = 0; i < tofreelist.size(); ++i) - delete tofreelist[i]; - tofreelist.clear(); - cerr << "Freed.\n"; - } - } - cerr << "LLH=" << lm.Likelihood() << endl; - } - return 0; -} - diff --git a/gi/pf/learn_cfg.cc b/gi/pf/learn_cfg.cc new file mode 100644 index 00000000..3d202816 --- /dev/null +++ b/gi/pf/learn_cfg.cc @@ -0,0 +1,394 @@ +#include +#include +#include + +#include +#include +#include + +#include "inside_outside.h" +#include "hg.h" +#include "bottom_up_parser.h" +#include "fdict.h" +#include "grammar.h" +#include "m.h" +#include "trule.h" +#include "tdict.h" +#include "filelib.h" +#include "dict.h" +#include "sampler.h" +#include "ccrp.h" +#include "ccrp_onetable.h" + +using namespace std; +using namespace tr1; +namespace po = boost::program_options; + +shared_ptr prng; +vector nt_vocab; +vector nt_id_to_index; +static unsigned kMAX_RULE_SIZE = 0; +static unsigned kMAX_ARITY = 0; +static bool kALLOW_MIXED = true; // allow rules with mixed terminals and NTs + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { + po::options_description opts("Configuration options"); + opts.add_options() + ("samples,s",po::value()->default_value(1000),"Number of samples") + ("input,i",po::value(),"Read parallel data from") + ("max_rule_size,m", po::value()->default_value(0), "Maximum rule size (0 for unlimited)") + ("max_arity,a", po::value()->default_value(0), "Maximum number of nonterminals in a rule (0 for unlimited)") + ("no_mixed_rules,M", "Do not mix terminals and nonterminals in a rule RHS") + ("nonterminals,n", po::value()->default_value(1), "Size of nonterminal vocabulary") + ("random_seed,S",po::value(), "Random seed"); + po::options_description clo("Command line options"); + clo.add_options() + ("config", po::value(), "Configuration file") + ("help,h", "Print this help message and exit"); + po::options_description dconfig_options, dcmdline_options; + dconfig_options.add(opts); + dcmdline_options.add(opts).add(clo); + + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + if (conf->count("config")) { + ifstream config((*conf)["config"].as().c_str()); + po::store(po::parse_config_file(config, dconfig_options), *conf); + } + po::notify(*conf); + + if (conf->count("help") || (conf->count("input") == 0)) { + cerr << dcmdline_options << endl; + exit(1); + } +} + +unsigned ReadCorpus(const string& filename, + vector >* e, + set* vocab_e) { + e->clear(); + vocab_e->clear(); + istream* in; + if (filename == "-") + in = &cin; + else + in = new ifstream(filename.c_str()); + assert(*in); + string line; + unsigned toks = 0; + while(*in) { + getline(*in, line); + if (line.empty() && !*in) break; + e->push_back(vector()); + vector& le = e->back(); + TD::ConvertSentence(line, &le); + for (unsigned i = 0; i < le.size(); ++i) + vocab_e->insert(le[i]); + toks += le.size(); + } + if (in != &cin) delete in; + return toks; +} + +struct Grid { + // a b c d e + // 0 - 0 - - + vector grid; +}; + +struct BaseRuleModel { + explicit BaseRuleModel(unsigned term_size, + unsigned nonterm_size = 1) : + unif_term(1.0 / term_size), + unif_nonterm(1.0 / nonterm_size) {} + prob_t operator()(const TRule& r) const { + prob_t p; p.logeq(Md::log_poisson(1.0, r.f_.size())); + const prob_t term_prob((2.0 + 0.01*r.f_.size()) / (r.f_.size() + 2)); + const prob_t nonterm_prob(1.0 - term_prob.as_float()); + for (unsigned i = 0; i < r.f_.size(); ++i) { + if (r.f_[i] <= 0) { // nonterminal + p *= nonterm_prob; + p *= unif_nonterm; + } else { // terminal + p *= term_prob; + p *= unif_term; + } + } + return p; + } + const prob_t unif_term, unif_nonterm; +}; + +struct HieroLMModel { + explicit HieroLMModel(unsigned vocab_size, unsigned num_nts = 1) : p0(vocab_size, num_nts), nts(num_nts, CCRP(1,1,1,1)) {} + + prob_t Prob(const TRule& r) const { + return nts[nt_id_to_index[-r.lhs_]].probT(r, p0(r)); + } + + int Increment(const TRule& r, MT19937* rng) { + return nts[nt_id_to_index[-r.lhs_]].incrementT(r, p0(r), rng); + // return x.increment(r); + } + + int Decrement(const TRule& r, MT19937* rng) { + return nts[nt_id_to_index[-r.lhs_]].decrement(r, rng); + //return x.decrement(r); + } + + prob_t Likelihood() const { + prob_t p = prob_t::One(); + for (unsigned i = 0; i < nts.size(); ++i) { + prob_t q; q.logeq(nts[i].log_crp_prob()); + p *= q; + for (CCRP::const_iterator it = nts[i].begin(); it != nts[i].end(); ++it) { + prob_t tp = p0(it->first); + tp.poweq(it->second.table_counts_.size()); + p *= tp; + } + } + //for (CCRP_OneTable::const_iterator it = x.begin(); it != x.end(); ++it) + // p *= p0(it->first); + return p; + } + + void ResampleHyperparameters(MT19937* rng) { + for (unsigned i = 0; i < nts.size(); ++i) + nts[i].resample_hyperparameters(rng); + cerr << " d=" << nts[0].discount() << ", alpha=" << nts[0].concentration() << endl; + } + + const BaseRuleModel p0; + vector > nts; + //CCRP_OneTable x; +}; + +vector tofreelist; + +HieroLMModel* plm; + +struct NPGrammarIter : public GrammarIter, public RuleBin { + NPGrammarIter() : arity() { tofreelist.push_back(this); } + NPGrammarIter(const TRulePtr& inr, const int a, int symbol) : arity(a) { + if (inr) { + r.reset(new TRule(*inr)); + } else { + r.reset(new TRule); + } + TRule& rr = *r; + rr.lhs_ = nt_vocab[0]; + rr.f_.push_back(symbol); + rr.e_.push_back(symbol < 0 ? (1-int(arity)) : symbol); + tofreelist.push_back(this); + } + inline static unsigned NextArity(int cur_a, int symbol) { + return cur_a + (symbol <= 0 ? 1 : 0); + } + virtual int GetNumRules() const { + if (r) return nt_vocab.size(); else return 0; + } + virtual TRulePtr GetIthRule(int i) const { + if (i == 0) return r; + TRulePtr nr(new TRule(*r)); + nr->lhs_ = nt_vocab[i]; + return nr; + } + virtual int Arity() const { + return arity; + } + virtual const RuleBin* GetRules() const { + if (!r) return NULL; else return this; + } + virtual const GrammarIter* Extend(int symbol) const { + const int next_arity = NextArity(arity, symbol); + if (kMAX_ARITY && next_arity > kMAX_ARITY) + return NULL; + if (!kALLOW_MIXED && r) { + bool t1 = r->f_.front() <= 0; + bool t2 = symbol <= 0; + if (t1 != t2) return NULL; + } + if (!kMAX_RULE_SIZE || !r || (r->f_.size() < kMAX_RULE_SIZE)) + return new NPGrammarIter(r, next_arity, symbol); + else + return NULL; + } + const unsigned char arity; + TRulePtr r; +}; + +struct NPGrammar : public Grammar { + virtual const GrammarIter* GetRoot() const { + return new NPGrammarIter; + } +}; + +prob_t TotalProb(const Hypergraph& hg) { + return Inside(hg); +} + +void SampleDerivation(const Hypergraph& hg, MT19937* rng, vector* sampled_deriv) { + vector node_probs; + Inside(hg, &node_probs); + queue q; + q.push(hg.nodes_.size() - 2); + while(!q.empty()) { + unsigned cur_node_id = q.front(); +// cerr << "NODE=" << cur_node_id << endl; + q.pop(); + const Hypergraph::Node& node = hg.nodes_[cur_node_id]; + const unsigned num_in_edges = node.in_edges_.size(); + unsigned sampled_edge = 0; + if (num_in_edges == 1) { + sampled_edge = node.in_edges_[0]; + } else { + //prob_t z; + assert(num_in_edges > 1); + SampleSet ss; + for (unsigned j = 0; j < num_in_edges; ++j) { + const Hypergraph::Edge& edge = hg.edges_[node.in_edges_[j]]; + prob_t p = edge.edge_prob_; + for (unsigned k = 0; k < edge.tail_nodes_.size(); ++k) + p *= node_probs[edge.tail_nodes_[k]]; + ss.add(p); +// cerr << log(ss[j]) << " ||| " << edge.rule_->AsString() << endl; + //z += p; + } +// for (unsigned j = 0; j < num_in_edges; ++j) { +// const Hypergraph::Edge& edge = hg.edges_[node.in_edges_[j]]; +// cerr << exp(log(ss[j] / z)) << " ||| " << edge.rule_->AsString() << endl; +// } +// cerr << " --- \n"; + sampled_edge = node.in_edges_[rng->SelectSample(ss)]; + } + sampled_deriv->push_back(sampled_edge); + const Hypergraph::Edge& edge = hg.edges_[sampled_edge]; + for (unsigned j = 0; j < edge.tail_nodes_.size(); ++j) { + q.push(edge.tail_nodes_[j]); + } + } + for (unsigned i = 0; i < sampled_deriv->size(); ++i) { + cerr << *hg.edges_[(*sampled_deriv)[i]].rule_ << endl; + } +} + +void IncrementDerivation(const Hypergraph& hg, const vector& d, HieroLMModel* plm, MT19937* rng) { + for (unsigned i = 0; i < d.size(); ++i) + plm->Increment(*hg.edges_[d[i]].rule_, rng); +} + +void DecrementDerivation(const Hypergraph& hg, const vector& d, HieroLMModel* plm, MT19937* rng) { + for (unsigned i = 0; i < d.size(); ++i) + plm->Decrement(*hg.edges_[d[i]].rule_, rng); +} + +int main(int argc, char** argv) { + po::variables_map conf; + + InitCommandLine(argc, argv, &conf); + nt_vocab.resize(conf["nonterminals"].as()); + assert(nt_vocab.size() > 0); + assert(nt_vocab.size() < 26); + { + string nt = "X"; + for (unsigned i = 0; i < nt_vocab.size(); ++i) { + if (nt_vocab.size() > 1) nt[0] = ('A' + i); + int pid = TD::Convert(nt); + nt_vocab[i] = -pid; + if (pid >= nt_id_to_index.size()) { + nt_id_to_index.resize(pid + 1, -1); + } + nt_id_to_index[pid] = i; + } + } + vector grammars; + grammars.push_back(GrammarPtr(new NPGrammar)); + + const unsigned samples = conf["samples"].as(); + kMAX_RULE_SIZE = conf["max_rule_size"].as(); + if (kMAX_RULE_SIZE == 1) { + cerr << "Invalid maximum rule size: must be 0 or >1\n"; + return 1; + } + kMAX_ARITY = conf["max_arity"].as(); + if (kMAX_ARITY == 1) { + cerr << "Invalid maximum arity: must be 0 or >1\n"; + return 1; + } + kALLOW_MIXED = !conf.count("no_mixed_rules"); + + if (conf.count("random_seed")) + prng.reset(new MT19937(conf["random_seed"].as())); + else + prng.reset(new MT19937); + MT19937& rng = *prng; + vector > corpuse; + set vocabe; + cerr << "Reading corpus...\n"; + const unsigned toks = ReadCorpus(conf["input"].as(), &corpuse, &vocabe); + cerr << "E-corpus size: " << corpuse.size() << " sentences\t (" << vocabe.size() << " word types)\n"; + HieroLMModel lm(vocabe.size(), nt_vocab.size()); + + plm = &lm; + ExhaustiveBottomUpParser parser(TD::Convert(-nt_vocab[0]), grammars); + + Hypergraph hg; + const int kGoal = -TD::Convert("Goal"); + const int kLP = FD::Convert("LogProb"); + SparseVector v; v.set_value(kLP, 1.0); + vector > derivs(corpuse.size()); + vector cl(corpuse.size()); + for (int ci = 0; ci < corpuse.size(); ++ci) { + vector& src = corpuse[ci]; + Lattice& lat = cl[ci]; + lat.resize(src.size()); + for (unsigned i = 0; i < src.size(); ++i) + lat[i].push_back(LatticeArc(src[i], 0.0, 1)); + } + for (int SS=0; SS < samples; ++SS) { + const bool is_last = ((samples - 1) == SS); + prob_t dlh = prob_t::One(); + for (int ci = 0; ci < corpuse.size(); ++ci) { + const vector& src = corpuse[ci]; + const Lattice& lat = cl[ci]; + cerr << TD::GetString(src) << endl; + hg.clear(); + parser.Parse(lat, &hg); // exhaustive parse + vector& d = derivs[ci]; + if (!is_last) DecrementDerivation(hg, d, &lm, &rng); + for (unsigned i = 0; i < hg.edges_.size(); ++i) { + TRule& r = *hg.edges_[i].rule_; + if (r.lhs_ == kGoal) + hg.edges_[i].edge_prob_ = prob_t::One(); + else + hg.edges_[i].edge_prob_ = lm.Prob(r); + } + if (!is_last) { + d.clear(); + SampleDerivation(hg, &rng, &d); + IncrementDerivation(hg, derivs[ci], &lm, &rng); + } else { + prob_t p = TotalProb(hg); + dlh *= p; + cerr << " p(sentence) = " << log(p) << "\t" << log(dlh) << endl; + } + if (tofreelist.size() > 200000) { + cerr << "Freeing ... "; + for (unsigned i = 0; i < tofreelist.size(); ++i) + delete tofreelist[i]; + tofreelist.clear(); + cerr << "Freed.\n"; + } + } + double llh = log(lm.Likelihood()); + cerr << "LLH=" << llh << "\tENTROPY=" << (-llh / log(2) / toks) << "\tPPL=" << pow(2, -llh / log(2) / toks) << endl; + if (SS % 10 == 9) lm.ResampleHyperparameters(&rng); + if (is_last) { + double z = log(dlh); + cerr << "TOTAL_PROB=" << z << "\tENTROPY=" << (-z / log(2) / toks) << "\tPPL=" << pow(2, -z / log(2) / toks) << endl; + } + } + for (unsigned i = 0; i < nt_vocab.size(); ++i) + cerr << lm.nts[i] << endl; + return 0; +} + -- cgit v1.2.3 From e0507d1aa96c6b1348e6a202beb95f63d8662258 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Sat, 3 Mar 2012 03:24:53 -0500 Subject: PYP language model (Teh 2006) --- decoder/fst_translator.cc | 5 +- gi/pf/Makefile.am | 4 +- gi/pf/pyp_lm.cc | 150 ++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 157 insertions(+), 2 deletions(-) create mode 100644 gi/pf/pyp_lm.cc (limited to 'gi/pf/Makefile.am') diff --git a/decoder/fst_translator.cc b/decoder/fst_translator.cc index 38dbd717..074de4c9 100644 --- a/decoder/fst_translator.cc +++ b/decoder/fst_translator.cc @@ -30,7 +30,10 @@ struct FSTTranslatorImpl { if (input.find("{\"rules\"") == 0) { istringstream is(input); Hypergraph src_cfg_hg; - assert(HypergraphIO::ReadFromJSON(&is, &src_cfg_hg)); + if (!HypergraphIO::ReadFromJSON(&is, &src_cfg_hg)) { + cerr << "Failed to read HG from JSON.\n"; + abort(); + } if (add_pass_through_rules) { SparseVector feats; feats.set_value(FD::Convert("PassThrough"), 1); diff --git a/gi/pf/Makefile.am b/gi/pf/Makefile.am index 0cf0bc63..7cf9c14d 100644 --- a/gi/pf/Makefile.am +++ b/gi/pf/Makefile.am @@ -1,4 +1,4 @@ -bin_PROGRAMS = cbgi brat dpnaive pfbrat pfdist itg pfnaive condnaive align-lexonly align-lexonly-pyp learn_cfg +bin_PROGRAMS = cbgi brat dpnaive pfbrat pfdist itg pfnaive condnaive align-lexonly align-lexonly-pyp learn_cfg pyp_lm noinst_LIBRARIES = libpf.a libpf_a_SOURCES = base_distributions.cc reachability.cc cfg_wfst_composer.cc corpus.cc unigrams.cc ngram_base.cc @@ -9,6 +9,8 @@ align_lexonly_pyp_SOURCES = align-lexonly-pyp.cc itg_SOURCES = itg.cc +pyp_lm_SOURCES = pyp_lm.cc + learn_cfg_SOURCES = learn_cfg.cc condnaive_SOURCES = condnaive.cc diff --git a/gi/pf/pyp_lm.cc b/gi/pf/pyp_lm.cc new file mode 100644 index 00000000..2837e33c --- /dev/null +++ b/gi/pf/pyp_lm.cc @@ -0,0 +1,150 @@ +#include +#include +#include + +#include +#include +#include + +#include "corpus_tools.h" +#include "m.h" +#include "tdict.h" +#include "sampler.h" +#include "ccrp.h" +#include "ccrp_onetable.h" + +using namespace std; +using namespace tr1; +namespace po = boost::program_options; + +shared_ptr prng; + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { + po::options_description opts("Configuration options"); + opts.add_options() + ("samples,s",po::value()->default_value(1000),"Number of samples") + ("input,i",po::value(),"Read data from") + ("random_seed,S",po::value(), "Random seed"); + po::options_description clo("Command line options"); + clo.add_options() + ("config", po::value(), "Configuration file") + ("help", "Print this help message and exit"); + po::options_description dconfig_options, dcmdline_options; + dconfig_options.add(opts); + dcmdline_options.add(opts).add(clo); + + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + if (conf->count("config")) { + ifstream config((*conf)["config"].as().c_str()); + po::store(po::parse_config_file(config, dconfig_options), *conf); + } + po::notify(*conf); + + if (conf->count("help") || (conf->count("input") == 0)) { + cerr << dcmdline_options << endl; + exit(1); + } +} + +template struct PYPLM; + +// uniform base distribution +template<> struct PYPLM<0> { + PYPLM(unsigned vs) : p0(1.0 / vs) {} + void increment(WordID w, const vector& context, MT19937* rng) const {} + void decrement(WordID w, const vector& context, MT19937* rng) const {} + double prob(WordID w, const vector& context) const { return p0; } + const double p0; +}; + +// represents an N-gram LM +template struct PYPLM { + PYPLM(unsigned vs) : backoff(vs) {} + void increment(WordID w, const vector& context, MT19937* rng) { + const double bo = backoff.prob(w, context); + static vector lookup(N-1); + for (unsigned i = 0; i < N-1; ++i) + lookup[i] = context[context.size() - 1 - i]; + typename unordered_map, CCRP, boost::hash > >::iterator it = p.find(lookup); + if (it == p.end()) + it = p.insert(make_pair(lookup, CCRP(1,1,1,1))).first; + if (it->second.increment(w, bo, rng)) + backoff.increment(w, context, rng); + } + void decrement(WordID w, const vector& context, MT19937* rng) { + static vector lookup(N-1); + for (unsigned i = 0; i < N-1; ++i) + lookup[i] = context[context.size() - 1 - i]; + typename unordered_map, CCRP, boost::hash > >::iterator it = p.find(lookup); + assert(it != p.end()); + if (it->second.decrement(w, rng)) + backoff.decrement(w, context, rng); + } + double prob(WordID w, const vector& context) const { + const double bo = backoff.prob(w, context); + static vector lookup(N-1); + for (unsigned i = 0; i < N-1; ++i) + lookup[i] = context[context.size() - 1 - i]; + typename unordered_map, CCRP, boost::hash > >::const_iterator it = p.find(lookup); + if (it == p.end()) return bo; + return it->second.prob(w, bo); + } + PYPLM backoff; + unordered_map, CCRP, boost::hash > > p; +}; + +int main(int argc, char** argv) { + po::variables_map conf; + + InitCommandLine(argc, argv, &conf); + const unsigned samples = conf["samples"].as(); + if (conf.count("random_seed")) + prng.reset(new MT19937(conf["random_seed"].as())); + else + prng.reset(new MT19937); + MT19937& rng = *prng; + vector > corpuse; + set vocabe; + const WordID kEOS = TD::Convert(""); + cerr << "Reading corpus...\n"; + CorpusTools::ReadFromFile(conf["input"].as(), &corpuse, &vocabe); + cerr << "E-corpus size: " << corpuse.size() << " sentences\t (" << vocabe.size() << " word types)\n"; +#define kORDER 5 + PYPLM lm(vocabe.size()); + vector ctx(kORDER - 1, TD::Convert("")); + int mci = corpuse.size() * 99 / 100; + for (int SS=0; SS < samples; ++SS) { + for (int ci = 0; ci < mci; ++ci) { + ctx.resize(kORDER - 1); + const vector& s = corpuse[ci]; + for (int i = 0; i <= s.size(); ++i) { + WordID w = (i < s.size() ? s[i] : kEOS); + if (SS > 0) lm.decrement(w, ctx, &rng); + lm.increment(w, ctx, &rng); + ctx.push_back(w); + } + if (SS > 0) lm.decrement(kEOS, ctx, &rng); + lm.increment(kEOS, ctx, &rng); + } + } + double llh = 0; + unsigned cnt = 0; + for (int ci = mci; ci < corpuse.size(); ++ci) { + ctx.resize(kORDER - 1); + const vector& s = corpuse[ci]; + for (int i = 0; i <= s.size(); ++i) { + WordID w = (i < s.size() ? s[i] : kEOS); + double lp = log(lm.prob(w, ctx)) / log(2); + cerr << "p(" << TD::Convert(w) << " | " << TD::GetString(ctx) << ") = " << lp << endl; + ctx.push_back(w); + llh -= lp; + cnt++; + } + } + cerr << " Log_10 prob: " << (llh * log(2) / log(10)) << endl; + cerr << " Count: " << (cnt) << endl; + cerr << "Cross-entropy: " << (llh / cnt) << endl; + cerr << " Perplexity: " << pow(2, llh / cnt) << endl; + return 0; +} + -- cgit v1.2.3 From 7fd9fe26f00cf31a7b407364399d37b4eaf04eba Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Wed, 7 Mar 2012 20:25:53 -0500 Subject: lattice builder --- gi/pf/Makefile.am | 7 +- gi/pf/align-tl.cc | 334 ++++++++++++++++++++++++++++++++++++++++++++++ gi/pf/conditional_pseg.h | 11 +- gi/pf/nuisance_test.cc | 161 ++++++++++++++++++++++ gi/pf/transliterations.cc | 193 +++++++++++++++++++++++++++ gi/pf/transliterations.h | 20 +++ 6 files changed, 723 insertions(+), 3 deletions(-) create mode 100644 gi/pf/align-tl.cc create mode 100644 gi/pf/nuisance_test.cc create mode 100644 gi/pf/transliterations.cc create mode 100644 gi/pf/transliterations.h (limited to 'gi/pf/Makefile.am') diff --git a/gi/pf/Makefile.am b/gi/pf/Makefile.am index 7cf9c14d..5e89f02a 100644 --- a/gi/pf/Makefile.am +++ b/gi/pf/Makefile.am @@ -1,12 +1,17 @@ -bin_PROGRAMS = cbgi brat dpnaive pfbrat pfdist itg pfnaive condnaive align-lexonly align-lexonly-pyp learn_cfg pyp_lm +bin_PROGRAMS = cbgi brat dpnaive pfbrat pfdist itg pfnaive condnaive align-lexonly align-lexonly-pyp learn_cfg pyp_lm nuisance_test align-tl noinst_LIBRARIES = libpf.a + libpf_a_SOURCES = base_distributions.cc reachability.cc cfg_wfst_composer.cc corpus.cc unigrams.cc ngram_base.cc +nuisance_test_SOURCES = nuisance_test.cc transliterations.cc + align_lexonly_SOURCES = align-lexonly.cc align_lexonly_pyp_SOURCES = align-lexonly-pyp.cc +align_tl_SOURCES = align-tl.cc transliterations.cc + itg_SOURCES = itg.cc pyp_lm_SOURCES = pyp_lm.cc diff --git a/gi/pf/align-tl.cc b/gi/pf/align-tl.cc new file mode 100644 index 00000000..0e0454e5 --- /dev/null +++ b/gi/pf/align-tl.cc @@ -0,0 +1,334 @@ +#include +#include +#include + +#include +#include +#include + +#include "array2d.h" +#include "base_distributions.h" +#include "monotonic_pseg.h" +#include "conditional_pseg.h" +#include "trule.h" +#include "tdict.h" +#include "stringlib.h" +#include "filelib.h" +#include "dict.h" +#include "sampler.h" +#include "mfcr.h" +#include "corpus.h" +#include "ngram_base.h" +#include "transliterations.h" + +using namespace std; +using namespace tr1; +namespace po = boost::program_options; + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { + po::options_description opts("Configuration options"); + opts.add_options() + ("samples,s",po::value()->default_value(1000),"Number of samples") + ("input,i",po::value(),"Read parallel data from") + ("random_seed,S",po::value(), "Random seed"); + po::options_description clo("Command line options"); + clo.add_options() + ("config", po::value(), "Configuration file") + ("help,h", "Print this help message and exit"); + po::options_description dconfig_options, dcmdline_options; + dconfig_options.add(opts); + dcmdline_options.add(opts).add(clo); + + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + if (conf->count("config")) { + ifstream config((*conf)["config"].as().c_str()); + po::store(po::parse_config_file(config, dconfig_options), *conf); + } + po::notify(*conf); + + if (conf->count("help") || (conf->count("input") == 0)) { + cerr << dcmdline_options << endl; + exit(1); + } +} + +shared_ptr prng; + +struct LexicalAlignment { + unsigned char src_index; + bool is_transliteration; + vector > derivation; +}; + +struct AlignedSentencePair { + vector src; + vector trg; + vector a; + Array2D posterior; +}; + +struct HierarchicalWordBase { + explicit HierarchicalWordBase(const unsigned vocab_e_size) : + base(prob_t::One()), r(1,1,1,1,0.66,50.0), u0(-log(vocab_e_size)), l(1,prob_t::One()), v(1, prob_t::Zero()) {} + + void ResampleHyperparameters(MT19937* rng) { + r.resample_hyperparameters(rng); + } + + inline double logp0(const vector& s) const { + return Md::log_poisson(s.size(), 7.5) + s.size() * u0; + } + + // return p0 of rule.e_ + prob_t operator()(const TRule& rule) const { + v[0].logeq(logp0(rule.e_)); + return r.prob(rule.e_, v.begin(), l.begin()); + } + + void Increment(const TRule& rule) { + v[0].logeq(logp0(rule.e_)); + if (r.increment(rule.e_, v.begin(), l.begin(), &*prng).count) { + base *= v[0] * l[0]; + } + } + + void Decrement(const TRule& rule) { + if (r.decrement(rule.e_, &*prng).count) { + base /= prob_t(exp(logp0(rule.e_))); + } + } + + prob_t Likelihood() const { + prob_t p; p.logeq(r.log_crp_prob()); + p *= base; + return p; + } + + void Summary() const { + cerr << "NUMBER OF CUSTOMERS: " << r.num_customers() << " (d=" << r.discount() << ",s=" << r.strength() << ')' << endl; + for (MFCR<1,vector >::const_iterator it = r.begin(); it != r.end(); ++it) + cerr << " " << it->second.total_dish_count_ << " (on " << it->second.table_counts_.size() << " tables) " << TD::GetString(it->first) << endl; + } + + prob_t base; + MFCR<1,vector > r; + const double u0; + const vector l; + mutable vector v; +}; + +struct BasicLexicalAlignment { + explicit BasicLexicalAlignment(const vector >& lets, + const unsigned words_e, + const unsigned letters_e, + vector* corp) : + letters(lets), + corpus(*corp), + //up0(words_e), + //up0("en.chars.1gram", letters_e), + //up0("en.words.1gram"), + up0(letters_e), + //up0("en.chars.2gram"), + tmodel(up0) { + } + + void InstantiateRule(const WordID src, + const WordID trg, + TRule* rule) const { + static const WordID kX = TD::Convert("X") * -1; + rule->lhs_ = kX; + rule->e_ = letters[trg]; + rule->f_ = letters[src]; + } + + void InitializeRandom() { + const WordID kNULL = TD::Convert("NULL"); + cerr << "Initializing with random alignments ...\n"; + for (unsigned i = 0; i < corpus.size(); ++i) { + AlignedSentencePair& asp = corpus[i]; + asp.a.resize(asp.trg.size()); + for (unsigned j = 0; j < asp.trg.size(); ++j) { + const unsigned char a_j = prng->next() * (1 + asp.src.size()); + const WordID f_a_j = (a_j ? asp.src[a_j - 1] : kNULL); + TRule r; + InstantiateRule(f_a_j, asp.trg[j], &r); + asp.a[j].is_transliteration = false; + asp.a[j].src_index = a_j; + if (tmodel.IncrementRule(r, &*prng)) + up0.Increment(r); + } + } + cerr << " LLH = " << Likelihood() << endl; + } + + prob_t Likelihood() const { + prob_t p = tmodel.Likelihood(); + p *= up0.Likelihood(); + return p; + } + + void ResampleHyperparemeters() { + tmodel.ResampleHyperparameters(&*prng); + up0.ResampleHyperparameters(&*prng); + cerr << " (base d=" << up0.r.discount() << ",s=" << up0.r.strength() << ")\n"; + } + + void ResampleCorpus(); + + const vector >& letters; // spelling dictionary + vector& corpus; + //PhraseConditionalUninformativeBase up0; + //PhraseConditionalUninformativeUnigramBase up0; + //UnigramWordBase up0; + //HierarchicalUnigramBase up0; + HierarchicalWordBase up0; + //CompletelyUniformBase up0; + //FixedNgramBase up0; + //ConditionalTranslationModel tmodel; + //ConditionalTranslationModel tmodel; + //ConditionalTranslationModel tmodel; + //ConditionalTranslationModel tmodel; + MConditionalTranslationModel tmodel; + //ConditionalTranslationModel tmodel; + //ConditionalTranslationModel tmodel; +}; + +void BasicLexicalAlignment::ResampleCorpus() { + static const WordID kNULL = TD::Convert("NULL"); + for (unsigned i = 0; i < corpus.size(); ++i) { + AlignedSentencePair& asp = corpus[i]; + SampleSet ss; ss.resize(asp.src.size() + 1); + for (unsigned j = 0; j < asp.trg.size(); ++j) { + TRule r; + unsigned char& a_j = asp.a[j].src_index; + WordID f_a_j = (a_j ? asp.src[a_j - 1] : kNULL); + InstantiateRule(f_a_j, asp.trg[j], &r); + if (tmodel.DecrementRule(r, &*prng)) + up0.Decrement(r); + + for (unsigned prop_a_j = 0; prop_a_j <= asp.src.size(); ++prop_a_j) { + const WordID prop_f = (prop_a_j ? asp.src[prop_a_j - 1] : kNULL); + InstantiateRule(prop_f, asp.trg[j], &r); + ss[prop_a_j] = tmodel.RuleProbability(r); + } + a_j = prng->SelectSample(ss); + f_a_j = (a_j ? asp.src[a_j - 1] : kNULL); + InstantiateRule(f_a_j, asp.trg[j], &r); + if (tmodel.IncrementRule(r, &*prng)) + up0.Increment(r); + } + } + cerr << " LLH = " << Likelihood() << endl; +} + +void ExtractLetters(const set& v, vector >* l, set* letset = NULL) { + for (set::const_iterator it = v.begin(); it != v.end(); ++it) { + vector& letters = (*l)[*it]; + if (letters.size()) continue; // if e and f have the same word + + const string& w = TD::Convert(*it); + + size_t cur = 0; + while (cur < w.size()) { + const size_t len = UTF8Len(w[cur]); + letters.push_back(TD::Convert(w.substr(cur, len))); + if (letset) letset->insert(letters.back()); + cur += len; + } + } +} + +void Debug(const AlignedSentencePair& asp) { + cerr << TD::GetString(asp.src) << endl << TD::GetString(asp.trg) << endl; + Array2D a(asp.src.size(), asp.trg.size()); + for (unsigned j = 0; j < asp.trg.size(); ++j) + if (asp.a[j].src_index) a(asp.a[j].src_index - 1, j) = true; + cerr << a << endl; +} + +void AddSample(AlignedSentencePair* asp) { + for (unsigned j = 0; j < asp->trg.size(); ++j) + asp->posterior(asp->a[j].src_index, j)++; +} + +void WriteAlignments(const AlignedSentencePair& asp) { + bool first = true; + for (unsigned j = 0; j < asp.trg.size(); ++j) { + int src_index = -1; + int mc = -1; + for (unsigned i = 0; i <= asp.src.size(); ++i) { + if (asp.posterior(i, j) > mc) { + mc = asp.posterior(i, j); + src_index = i; + } + } + + if (src_index) { + if (first) first = false; else cout << ' '; + cout << (src_index - 1) << '-' << j; + } + } + cout << endl; +} + +int main(int argc, char** argv) { + po::variables_map conf; + InitCommandLine(argc, argv, &conf); + + if (conf.count("random_seed")) + prng.reset(new MT19937(conf["random_seed"].as())); + else + prng.reset(new MT19937); +// MT19937& rng = *prng; + + vector > corpuse, corpusf; + set vocabe, vocabf; + corpus::ReadParallelCorpus(conf["input"].as(), &corpusf, &corpuse, &vocabf, &vocabe); + cerr << "f-Corpus size: " << corpusf.size() << " sentences\n"; + cerr << "f-Vocabulary size: " << vocabf.size() << " types\n"; + cerr << "f-Corpus size: " << corpuse.size() << " sentences\n"; + cerr << "f-Vocabulary size: " << vocabe.size() << " types\n"; + assert(corpusf.size() == corpuse.size()); + + vector corpus(corpuse.size()); + for (unsigned i = 0; i < corpuse.size(); ++i) { + corpus[i].src.swap(corpusf[i]); + corpus[i].trg.swap(corpuse[i]); + corpus[i].posterior.resize(corpus[i].src.size() + 1, corpus[i].trg.size()); + } + corpusf.clear(); corpuse.clear(); + + vocabf.insert(TD::Convert("NULL")); + vector > letters(TD::NumWords()); + set letset; + ExtractLetters(vocabe, &letters, &letset); + ExtractLetters(vocabf, &letters, NULL); + letters[TD::Convert("NULL")].clear(); + + Transliterations tl; + + // TODO CONFIGURE THIS + int min_trans_src = 4; + + cerr << "Initializing transliteration DPs ...\n"; + for (int i = 0; i < corpus.size(); ++i) { + const vector& src = corpus[i].src; + const vector& trg = corpus[i].trg; + cerr << '.' << flush; + if (i % 80 == 79) cerr << endl; + for (int j = 0; j < src.size(); ++j) { + const vector& src_let = letters[src[j]]; + for (int k = 0; k < trg.size(); ++k) { + const vector& trg_let = letters[trg[k]]; + if (src_let.size() < min_trans_src) + tl.Forbid(src[j], trg[k]); + else + tl.Initialize(src[j], src_let, trg[k], trg_let); + } + } + } + cerr << endl; + tl.GraphSummary(); + + return 0; +} diff --git a/gi/pf/conditional_pseg.h b/gi/pf/conditional_pseg.h index 8202778b..81ddb206 100644 --- a/gi/pf/conditional_pseg.h +++ b/gi/pf/conditional_pseg.h @@ -56,6 +56,12 @@ struct MConditionalTranslationModel { }; void ResampleHyperparameters(MT19937* rng) { + typename std::tr1::unordered_map, MFCR<1,TRule>, boost::hash > >::iterator it; +#if 1 + for (it = r.begin(); it != r.end(); ++it) { + it->second.resample_hyperparameters(rng); + } +#else const unsigned nloop = 5; const unsigned niterations = 10; DiscountResampler dr(*this); @@ -70,12 +76,12 @@ struct MConditionalTranslationModel { } strength = slice_sampler1d(ar, strength, *rng, -d, std::numeric_limits::infinity(), 0.0, niterations, 100*niterations); - typename std::tr1::unordered_map, MFCR<1,TRule>, boost::hash > >::iterator it; std::cerr << "MConditionalTranslationModel(d=" << d << ",s=" << strength << ") = " << log_likelihood(d, strength) << std::endl; for (it = r.begin(); it != r.end(); ++it) { it->second.set_discount(d); it->second.set_strength(strength); } +#endif } int DecrementRule(const TRule& rule, MT19937* rng) { @@ -91,7 +97,8 @@ struct MConditionalTranslationModel { int IncrementRule(const TRule& rule, MT19937* rng) { RuleModelHash::iterator it = r.find(rule.f_); if (it == r.end()) { - it = r.insert(make_pair(rule.f_, MFCR<1,TRule>(d, strength))).first; + //it = r.insert(make_pair(rule.f_, MFCR<1,TRule>(d, strength))).first; + it = r.insert(make_pair(rule.f_, MFCR<1,TRule>(1,1,1,1,0.6, -0.12))).first; } p0s[0] = rp0(rule); TableCount delta = it->second.increment(rule, p0s.begin(), lambdas.begin(), rng); diff --git a/gi/pf/nuisance_test.cc b/gi/pf/nuisance_test.cc new file mode 100644 index 00000000..0f44fe95 --- /dev/null +++ b/gi/pf/nuisance_test.cc @@ -0,0 +1,161 @@ +#include "ccrp.h" + +#include +#include + +#include "tdict.h" +#include "transliterations.h" + +using namespace std; + +MT19937 rng; + +ostream& operator<<(ostream&os, const vector& v) { + os << '[' << v[0]; + if (v.size() == 2) os << ' ' << v[1]; + return os << ']'; +} + +struct Base { + Base() : llh(), v(2), v1(1), v2(1), crp(0.25, 0.5) {} + inline double p0(const vector& x) const { + double p = 0.75; + if (x.size() == 2) p = 0.25; + p *= 1.0 / 3.0; + if (x.size() == 2) p *= 1.0 / 3.0; + return p; + } + double est_deriv_prob(int a, int b, int seg) const { + assert(a > 0 && a < 4); // a \in {1,2,3} + assert(b > 0 && b < 4); // b \in {1,2,3} + assert(seg == 0 || seg == 1); // seg \in {0,1} + if (seg == 0) { + v[0] = a; + v[1] = b; + return crp.prob(v, p0(v)); + } else { + v1[0] = a; + v2[0] = b; + return crp.prob(v1, p0(v1)) * crp.prob(v2, p0(v2)); + } + } + double est_marginal_prob(int a, int b) const { + return est_deriv_prob(a,b,0) + est_deriv_prob(a,b,1); + } + int increment(int a, int b, double* pw = NULL) { + double p1 = est_deriv_prob(a, b, 0); + double p2 = est_deriv_prob(a, b, 1); + //p1 = 0.5; p2 = 0.5; + int seg = rng.SelectSample(p1,p2); + double tmp = 0; + if (!pw) pw = &tmp; + double& w = *pw; + if (seg == 0) { + v[0] = a; + v[1] = b; + w = crp.prob(v, p0(v)) / p1; + if (crp.increment(v, p0(v), &rng)) { + llh += log(p0(v)); + } + } else { + v1[0] = a; + w = crp.prob(v1, p0(v1)) / p2; + if (crp.increment(v1, p0(v1), &rng)) { + llh += log(p0(v1)); + } + v2[0] = b; + w *= crp.prob(v2, p0(v2)); + if (crp.increment(v2, p0(v2), &rng)) { + llh += log(p0(v2)); + } + } + return seg; + } + void increment(int a, int b, int seg) { + if (seg == 0) { + v[0] = a; + v[1] = b; + if (crp.increment(v, p0(v), &rng)) { + llh += log(p0(v)); + } + } else { + v1[0] = a; + if (crp.increment(v1, p0(v1), &rng)) { + llh += log(p0(v1)); + } + v2[0] = b; + if (crp.increment(v2, p0(v2), &rng)) { + llh += log(p0(v2)); + } + } + } + void decrement(int a, int b, int seg) { + if (seg == 0) { + v[0] = a; + v[1] = b; + if (crp.decrement(v, &rng)) { + llh -= log(p0(v)); + } + } else { + v1[0] = a; + if (crp.decrement(v1, &rng)) { + llh -= log(p0(v1)); + } + v2[0] = b; + if (crp.decrement(v2, &rng)) { + llh -= log(p0(v2)); + } + } + } + double log_likelihood() const { + return llh + crp.log_crp_prob(); + } + double llh; + mutable vector v, v1, v2; + CCRP > crp; +}; + +int main(int argc, char** argv) { + double tl = 0; + const int ITERS = 1000; + const int PARTICLES = 20; + const int DATAPOINTS = 50; + WordID x = TD::Convert("souvenons"); + WordID y = TD::Convert("remember"); + vector src; TD::ConvertSentence("s o u v e n o n s", &src); + vector trg; TD::ConvertSentence("r e m e m b e r", &trg); + Transliterations xx; + xx.Initialize(x, src, y, trg); + return 1; + + for (int j = 0; j < ITERS; ++j) { + Base b; + vector segs(DATAPOINTS); + SampleSet ss; + vector sss; + for (int i = 0; i < DATAPOINTS; i++) { + ss.clear(); + sss.clear(); + int x = ((i / 10) % 3) + 1; + int y = (i % 3) + 1; + //double ep = b.est_marginal_prob(x,y); + //cerr << "est p(" << x << "," << y << ") = " << ep << endl; + for (int n = 0; n < PARTICLES; ++n) { + double w; + int seg = b.increment(x,y,&w); + //cerr << seg << " w=" << w << endl; + ss.add(w); + sss.push_back(seg); + b.decrement(x,y,seg); + } + int seg = sss[rng.SelectSample(ss)]; + b.increment(x, y, seg); + //cerr << "Selected: " << seg << endl; + //return 1; + segs[i] = seg; + } + tl += b.log_likelihood(); + } + cerr << "LLH=" << tl / ITERS << endl; +} + diff --git a/gi/pf/transliterations.cc b/gi/pf/transliterations.cc new file mode 100644 index 00000000..6e0c2e93 --- /dev/null +++ b/gi/pf/transliterations.cc @@ -0,0 +1,193 @@ +#include "transliterations.h" + +#include +#include +#include + +#include "grammar.h" +#include "bottom_up_parser.h" +#include "hg.h" +#include "hg_intersect.h" +#include "filelib.h" +#include "ccrp.h" +#include "m.h" +#include "lattice.h" +#include "verbose.h" + +using namespace std; +using namespace std::tr1; + +static WordID kX; +static int kMAX_SRC_SIZE = 0; +static vector > cur_trg_chunks; + +vector tlttofreelist; + +static void InitTargetChunks(int max_size, const vector& trg) { + cur_trg_chunks.clear(); + vector tmp; + unordered_set, boost::hash > > u; + for (int len = 1; len <= max_size; ++len) { + int end = trg.size() + 1; + end -= len; + for (int i = 0; i < end; ++i) { + tmp.clear(); + for (int j = 0; j < len; ++j) + tmp.push_back(trg[i + j]); + if (u.insert(tmp).second) cur_trg_chunks.push_back(tmp); + } + } +} + +struct TransliterationGrammarIter : public GrammarIter, public RuleBin { + TransliterationGrammarIter() { tlttofreelist.push_back(this); } + TransliterationGrammarIter(const TRulePtr& inr, int symbol) { + if (inr) { + r.reset(new TRule(*inr)); + } else { + r.reset(new TRule); + } + TRule& rr = *r; + rr.lhs_ = kX; + rr.f_.push_back(symbol); + tlttofreelist.push_back(this); + } + virtual int GetNumRules() const { + if (!r) return 0; + return cur_trg_chunks.size(); + } + virtual TRulePtr GetIthRule(int i) const { + TRulePtr nr(new TRule(*r)); + nr->e_ = cur_trg_chunks[i]; + //cerr << nr->AsString() << endl; + return nr; + } + virtual int Arity() const { + return 0; + } + virtual const RuleBin* GetRules() const { + if (!r) return NULL; else return this; + } + virtual const GrammarIter* Extend(int symbol) const { + if (symbol <= 0) return NULL; + if (!r || !kMAX_SRC_SIZE || r->f_.size() < kMAX_SRC_SIZE) + return new TransliterationGrammarIter(r, symbol); + else + return NULL; + } + TRulePtr r; +}; + +struct TransliterationGrammar : public Grammar { + virtual const GrammarIter* GetRoot() const { + return new TransliterationGrammarIter; + } + virtual bool HasRuleForSpan(int, int, int distance) const { + return (distance < kMAX_SRC_SIZE); + } +}; + +struct TInfo { + TInfo() : initialized(false) {} + bool initialized; + Hypergraph lattice; // may be empty if transliteration is not possible + prob_t est_prob; // will be zero if not possible +}; + +struct TransliterationsImpl { + TransliterationsImpl() { + kX = TD::Convert("X")*-1; + kMAX_SRC_SIZE = 4; + grammars.push_back(GrammarPtr(new TransliterationGrammar)); + grammars.push_back(GrammarPtr(new GlueGrammar("S", "X"))); + SetSilent(true); + } + + void Initialize(WordID src, const vector& src_lets, WordID trg, const vector& trg_lets) { + if (src >= graphs.size()) graphs.resize(src + 1); + if (graphs[src][trg].initialized) return; + int kMAX_TRG_SIZE = 4; + InitTargetChunks(kMAX_TRG_SIZE, trg_lets); + ExhaustiveBottomUpParser parser("S", grammars); + Lattice lat(src_lets.size()), tlat(trg_lets.size()); + for (unsigned i = 0; i < src_lets.size(); ++i) + lat[i].push_back(LatticeArc(src_lets[i], 0.0, 1)); + for (unsigned i = 0; i < trg_lets.size(); ++i) + tlat[i].push_back(LatticeArc(trg_lets[i], 0.0, 1)); + //cerr << "Creating lattice for: " << TD::Convert(src) << " --> " << TD::Convert(trg) << endl; + //cerr << "'" << TD::GetString(src_lets) << "' --> " << TD::GetString(trg_lets) << endl; + if (!parser.Parse(lat, &graphs[src][trg].lattice)) { + //cerr << "Failed to parse " << TD::GetString(src_lets) << endl; + abort(); + } + if (HG::Intersect(tlat, &graphs[src][trg].lattice)) { + graphs[src][trg].est_prob = prob_t(1e-4); + } else { + graphs[src][trg].lattice.clear(); + //cerr << "Failed to intersect " << TD::GetString(src_lets) << " ||| " << TD::GetString(trg_lets) << endl; + graphs[src][trg].est_prob = prob_t::Zero(); + } + for (unsigned i = 0; i < tlttofreelist.size(); ++i) + delete tlttofreelist[i]; + tlttofreelist.clear(); + //cerr << "Number of paths: " << graphs[src][trg].lattice.NumberOfPaths() << endl; + graphs[src][trg].initialized = true; + } + + const prob_t& EstimateProbability(WordID src, WordID trg) const { + assert(src < graphs.size()); + const unordered_map& um = graphs[src]; + const unordered_map::const_iterator it = um.find(trg); + assert(it != um.end()); + assert(it->second.initialized); + return it->second.est_prob; + } + + void Forbid(WordID src, WordID trg) { + if (src >= graphs.size()) graphs.resize(src + 1); + graphs[src][trg].est_prob = prob_t::Zero(); + graphs[src][trg].initialized = true; + } + + void GraphSummary() const { + double tlp = 0; + int tt = 0; + for (int i = 0; i < graphs.size(); ++i) { + const unordered_map& um = graphs[i]; + unordered_map::const_iterator it; + for (it = um.begin(); it != um.end(); ++it) { + if (it->second.lattice.empty()) continue; + //cerr << TD::Convert(i) << " --> " << TD::Convert(it->first) << ": " << it->second.lattice.NumberOfPaths() << endl; + tlp += log(it->second.lattice.NumberOfPaths()); + tt++; + } + } + tlp /= tt; + cerr << "E[log paths] = " << tlp << endl; + cerr << "exp(E[log paths]) = " << exp(tlp) << endl; + } + + vector > graphs; + vector grammars; +}; + +Transliterations::Transliterations() : pimpl_(new TransliterationsImpl) {} +Transliterations::~Transliterations() { delete pimpl_; } + +void Transliterations::Initialize(WordID src, const vector& src_lets, WordID trg, const vector& trg_lets) { + pimpl_->Initialize(src, src_lets, trg, trg_lets); +} + +prob_t Transliterations::EstimateProbability(WordID src, WordID trg) const { + return pimpl_->EstimateProbability(src,trg); +} + +void Transliterations::Forbid(WordID src, WordID trg) { + pimpl_->Forbid(src, trg); +} + +void Transliterations::GraphSummary() const { + pimpl_->GraphSummary(); +} + + diff --git a/gi/pf/transliterations.h b/gi/pf/transliterations.h new file mode 100644 index 00000000..a548aacf --- /dev/null +++ b/gi/pf/transliterations.h @@ -0,0 +1,20 @@ +#ifndef _TRANSLITERATIONS_H_ +#define _TRANSLITERATIONS_H_ + +#include +#include "wordid.h" +#include "prob.h" + +struct TransliterationsImpl; +struct Transliterations { + explicit Transliterations(); + ~Transliterations(); + void Initialize(WordID src, const std::vector& src_lets, WordID trg, const std::vector& trg_lets); + void Forbid(WordID src, WordID trg); + void GraphSummary() const; + prob_t EstimateProbability(WordID src, WordID trg) const; + TransliterationsImpl* pimpl_; +}; + +#endif + -- cgit v1.2.3 From 9a8256604686a9283e7afce04e6feaab4922dd45 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Thu, 8 Mar 2012 13:32:41 -0500 Subject: tl stuff --- gi/pf/Makefile.am | 8 +++-- gi/pf/align-tl.cc | 8 +++-- gi/pf/reachability.cc | 17 +++++++--- gi/pf/reachability.h | 4 +++ gi/pf/transliterations.cc | 82 ++++++++++++++++++++++++++++++++++------------- gi/pf/transliterations.h | 3 +- 6 files changed, 88 insertions(+), 34 deletions(-) (limited to 'gi/pf/Makefile.am') diff --git a/gi/pf/Makefile.am b/gi/pf/Makefile.am index 5e89f02a..9888a70b 100644 --- a/gi/pf/Makefile.am +++ b/gi/pf/Makefile.am @@ -2,15 +2,17 @@ bin_PROGRAMS = cbgi brat dpnaive pfbrat pfdist itg pfnaive condnaive align-lexon noinst_LIBRARIES = libpf.a -libpf_a_SOURCES = base_distributions.cc reachability.cc cfg_wfst_composer.cc corpus.cc unigrams.cc ngram_base.cc +libpf_a_SOURCES = base_distributions.cc reachability.cc cfg_wfst_composer.cc corpus.cc unigrams.cc ngram_base.cc transliterations.cc -nuisance_test_SOURCES = nuisance_test.cc transliterations.cc +nuisance_test_SOURCES = nuisance_test.cc +nuisance_test_LDADD = libpf.a align_lexonly_SOURCES = align-lexonly.cc align_lexonly_pyp_SOURCES = align-lexonly-pyp.cc -align_tl_SOURCES = align-tl.cc transliterations.cc +align_tl_SOURCES = align-tl.cc +align_tl_LDADD = libpf.a itg_SOURCES = itg.cc diff --git a/gi/pf/align-tl.cc b/gi/pf/align-tl.cc index 6bb8c886..fe8950b5 100644 --- a/gi/pf/align-tl.cc +++ b/gi/pf/align-tl.cc @@ -305,7 +305,10 @@ int main(int argc, char** argv) { ExtractLetters(vocabf, &letters, NULL); letters[TD::Convert("NULL")].clear(); - Transliterations tl; + // TODO configure this + int max_src_chunk = 4; + int max_trg_chunk = 4; + Transliterations tl(max_src_chunk, max_trg_chunk); // TODO CONFIGURE THIS int min_trans_src = 4; @@ -318,10 +321,9 @@ int main(int argc, char** argv) { const vector& src_let = letters[src[j]]; for (int k = 0; k < trg.size(); ++k) { const vector& trg_let = letters[trg[k]]; + tl.Initialize(src[j], src_let, trg[k], trg_let); if (src_let.size() < min_trans_src) tl.Forbid(src[j], src_let, trg[k], trg_let); - else - tl.Initialize(src[j], src_let, trg[k], trg_let); } } } diff --git a/gi/pf/reachability.cc b/gi/pf/reachability.cc index 70fb76da..59bc6ace 100644 --- a/gi/pf/reachability.cc +++ b/gi/pf/reachability.cc @@ -39,6 +39,7 @@ void Reachability::ComputeReachability(int srclen, int trglen, int src_max_phras typedef boost::multi_array rarray_type; rarray_type r(boost::extents[srclen + 1][trglen + 1]); r[srclen][trglen] = true; + nodes = 0; for (int i = srclen; i >= 0; --i) { for (int j = trglen; j >= 0; --j) { vector& prevs = a[i][j]; @@ -57,10 +58,16 @@ void Reachability::ComputeReachability(int srclen, int trglen, int src_max_phras assert(!edges[0][0][0][1]); assert(!edges[0][0][0][0]); assert(max_src_delta[0][0] > 0); - cerr << "Sentence with length (" << srclen << ',' << trglen << ") has " << valid_deltas[0][0].size() << " out edges in its root node\n"; - //cerr << "First cell contains " << b[0][0].size() << " forward pointers\n"; - //for (int i = 0; i < b[0][0].size(); ++i) { - // cerr << " -> (" << b[0][0][i].next_src_covered << "," << b[0][0][i].next_trg_covered << ")\n"; - //} + nodes = 0; + for (int i = 0; i < srclen; ++i) { + for (int j = 0; j < trglen; ++j) { + if (valid_deltas[i][j].size() > 0) { + node_addresses[i][j] = nodes++; + } else { + node_addresses[i][j] = -1; + } + } + } + cerr << "Sequence pair with lengths (" << srclen << ',' << trglen << ") has " << valid_deltas[0][0].size() << " out edges in its root node, " << nodes << " nodes in total, and outside estimate matrix will require " << sizeof(float)*nodes << " bytes\n"; } diff --git a/gi/pf/reachability.h b/gi/pf/reachability.h index fb2f4965..1e22c76a 100644 --- a/gi/pf/reachability.h +++ b/gi/pf/reachability.h @@ -12,13 +12,17 @@ // currently forbids 0 -> n and n -> 0 alignments struct Reachability { + unsigned nodes; boost::multi_array edges; // edges[src_covered][trg_covered][src_delta][trg_delta] is this edge worth exploring? boost::multi_array max_src_delta; // msd[src_covered][trg_covered] -- the largest src delta that's valid + boost::multi_array node_addresses; // na[src_covered][trg_covered] -- the index of the node in a one-dimensional array (of size "nodes") boost::multi_array >, 2> valid_deltas; // valid_deltas[src_covered][trg_covered] list of valid transitions leaving a particular node Reachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len) : + nodes(), edges(boost::extents[srclen][trglen][src_max_phrase_len+1][trg_max_phrase_len+1]), max_src_delta(boost::extents[srclen][trglen]), + node_addresses(boost::extents[srclen][trglen]), valid_deltas(boost::extents[srclen][trglen]) { ComputeReachability(srclen, trglen, src_max_phrase_len, trg_max_phrase_len); } diff --git a/gi/pf/transliterations.cc b/gi/pf/transliterations.cc index e29334fd..61e95b82 100644 --- a/gi/pf/transliterations.cc +++ b/gi/pf/transliterations.cc @@ -14,42 +14,75 @@ using namespace std; using namespace std::tr1; struct GraphStructure { - GraphStructure() : initialized(false) {} - boost::shared_ptr r; - bool initialized; + GraphStructure() : r() {} + // leak memory - these are basically static + const Reachability* r; + bool IsReachable() const { return r->nodes > 0; } +}; + +struct BackwardEstimates { + BackwardEstimates() : gs(), backward() {} + explicit BackwardEstimates(const GraphStructure& g) : + gs(&g), backward() { + if (g.r->nodes > 0) + backward = new float[g.r->nodes]; + } + // leak memory, these are static + + // returns an estimate of the marginal probability + double MarginalEstimate() const { + if (!backward) return 0; + return backward[0]; + } + + // returns an backward estimate + double operator()(int src_covered, int trg_covered) const { + if (!backward) return 0; + int ind = gs->r->node_addresses[src_covered][trg_covered]; + if (ind < 0) return 0; + return backward[ind]; + } + private: + const GraphStructure* gs; + float* backward; }; struct TransliterationsImpl { - TransliterationsImpl() { + TransliterationsImpl(int max_src, int max_trg) : + kMAX_SRC_CHUNK(max_src), + kMAX_TRG_CHUNK(max_trg), + tot_pairs() { } void Initialize(WordID src, const vector& src_lets, WordID trg, const vector& trg_lets) { const size_t src_len = src_lets.size(); const size_t trg_len = trg_lets.size(); + + // init graph structure if (src_len >= graphs.size()) graphs.resize(src_len + 1); if (trg_len >= graphs[src_len].size()) graphs[src_len].resize(trg_len + 1); - if (graphs[src_len][trg_len].initialized) return; - graphs[src_len][trg_len].r.reset(new Reachability(src_len, trg_len, 4, 4)); - -#if 0 - if (HG::Intersect(tlat, &hg)) { - // TODO - } else { - cerr << "No transliteration lattice possible for src_len=" << src_len << " trg_len=" << trg_len << endl; - hg.clear(); - } - //cerr << "Number of paths: " << graphs[src][trg].lattice.NumberOfPaths() << endl; -#endif - graphs[src_len][trg_len].initialized = true; + GraphStructure& gs = graphs[src_len][trg_len]; + if (!gs.r) + gs.r = new Reachability(src_len, trg_len, kMAX_SRC_CHUNK, kMAX_TRG_CHUNK); + const Reachability& r = *gs.r; + + // init backward estimates + if (src >= bes.size()) bes.resize(src + 1); + unordered_map::iterator it = bes[src].find(trg); + if (it != bes[src].end()) return; // already initialized + + it = bes[src].insert(make_pair(trg, BackwardEstimates(gs))).first; + BackwardEstimates& b = it->second; + if (!gs.r->nodes) return; // not derivable subject to length constraints + + // TODO + tot_pairs++; } void Forbid(WordID src, const vector& src_lets, WordID trg, const vector& trg_lets) { const size_t src_len = src_lets.size(); const size_t trg_len = trg_lets.size(); - if (src_len >= graphs.size()) graphs.resize(src_len + 1); - if (trg_len >= graphs[src_len].size()) graphs[src_len].resize(trg_len + 1); - graphs[src_len][trg_len].r.reset(); - graphs[src_len][trg_len].initialized = true; + // TODO } prob_t EstimateProbability(WordID s, const vector& src, WordID t, const vector& trg) const { @@ -85,12 +118,17 @@ struct TransliterationsImpl { cerr << " Average nodes = " << (tn / tt) << endl; cerr << "Average out-degree = " << (to / tn) << endl; cerr << " Unique structures = " << tt << endl; + cerr << " Unique pairs = " << tot_pairs << endl; } + const int kMAX_SRC_CHUNK; + const int kMAX_TRG_CHUNK; + unsigned tot_pairs; vector > graphs; // graphs[src_len][trg_len] + vector > bes; // bes[src][trg] }; -Transliterations::Transliterations() : pimpl_(new TransliterationsImpl) {} +Transliterations::Transliterations(int max_src, int max_trg) : pimpl_(new TransliterationsImpl(max_src, max_trg)) {} Transliterations::~Transliterations() { delete pimpl_; } void Transliterations::Initialize(WordID src, const vector& src_lets, WordID trg, const vector& trg_lets) { diff --git a/gi/pf/transliterations.h b/gi/pf/transliterations.h index 76eb2a05..e025547e 100644 --- a/gi/pf/transliterations.h +++ b/gi/pf/transliterations.h @@ -7,7 +7,8 @@ struct TransliterationsImpl; struct Transliterations { - explicit Transliterations(); + // max_src and max_trg indicate how big the transliteration phrases can be + explicit Transliterations(int max_src, int max_trg); ~Transliterations(); void Initialize(WordID src, const std::vector& src_lets, WordID trg, const std::vector& trg_lets); void Forbid(WordID src, const std::vector& src_lets, WordID trg, const std::vector& trg_lets); -- cgit v1.2.3 From 301106af9c13285ff252c618848eaa54b9b0e490 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Thu, 8 Mar 2012 18:47:23 +0000 Subject: fix link error on linux --- gi/pf/Makefile.am | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'gi/pf/Makefile.am') diff --git a/gi/pf/Makefile.am b/gi/pf/Makefile.am index 9888a70b..94364c3d 100644 --- a/gi/pf/Makefile.am +++ b/gi/pf/Makefile.am @@ -5,14 +5,14 @@ noinst_LIBRARIES = libpf.a libpf_a_SOURCES = base_distributions.cc reachability.cc cfg_wfst_composer.cc corpus.cc unigrams.cc ngram_base.cc transliterations.cc nuisance_test_SOURCES = nuisance_test.cc -nuisance_test_LDADD = libpf.a +nuisance_test_LDADD = libpf.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a -lz align_lexonly_SOURCES = align-lexonly.cc align_lexonly_pyp_SOURCES = align-lexonly-pyp.cc align_tl_SOURCES = align-tl.cc -align_tl_LDADD = libpf.a +align_tl_LDADD = libpf.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a -lz itg_SOURCES = itg.cc -- cgit v1.2.3 From 113317266853abff2e1c0c3e889017d0eee55c93 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Fri, 9 Mar 2012 22:23:50 -0500 Subject: moar --- gi/pf/Makefile.am | 3 +- gi/pf/align-lexonly-pyp.cc | 207 ++++++++++------------------------------- gi/pf/align-tl.cc | 18 ++-- gi/pf/backward.cc | 89 ++++++++++++++++++ gi/pf/backward.h | 33 +++++++ gi/pf/base_distributions.h | 8 +- gi/pf/guess-translits.pl | 2 +- gi/pf/nuisance_test.cc | 6 +- gi/pf/pyp_lm.cc | 2 +- gi/pf/pyp_tm.cc | 113 +++++++++++++++++++++++ gi/pf/pyp_tm.h | 34 +++++++ gi/pf/pyp_word_model.cc | 20 ++++ gi/pf/pyp_word_model.h | 58 ++++++++++++ gi/pf/reachability.cc | 8 +- gi/pf/reachability.h | 8 +- gi/pf/transliterations.cc | 223 ++++++++++++++++++++++++++++++++++++++++----- gi/pf/transliterations.h | 3 +- utils/ccrp_nt.h | 17 ++-- 18 files changed, 628 insertions(+), 224 deletions(-) create mode 100644 gi/pf/backward.cc create mode 100644 gi/pf/backward.h create mode 100644 gi/pf/pyp_tm.cc create mode 100644 gi/pf/pyp_tm.h create mode 100644 gi/pf/pyp_word_model.cc create mode 100644 gi/pf/pyp_word_model.h (limited to 'gi/pf/Makefile.am') diff --git a/gi/pf/Makefile.am b/gi/pf/Makefile.am index 94364c3d..4ce72ba1 100644 --- a/gi/pf/Makefile.am +++ b/gi/pf/Makefile.am @@ -2,7 +2,7 @@ bin_PROGRAMS = cbgi brat dpnaive pfbrat pfdist itg pfnaive condnaive align-lexon noinst_LIBRARIES = libpf.a -libpf_a_SOURCES = base_distributions.cc reachability.cc cfg_wfst_composer.cc corpus.cc unigrams.cc ngram_base.cc transliterations.cc +libpf_a_SOURCES = base_distributions.cc reachability.cc cfg_wfst_composer.cc corpus.cc unigrams.cc ngram_base.cc transliterations.cc backward.cc pyp_word_model.cc pyp_tm.cc nuisance_test_SOURCES = nuisance_test.cc nuisance_test_LDADD = libpf.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a -lz @@ -10,6 +10,7 @@ nuisance_test_LDADD = libpf.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mtev align_lexonly_SOURCES = align-lexonly.cc align_lexonly_pyp_SOURCES = align-lexonly-pyp.cc +align_lexonly_pyp_LDADD = libpf.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a -lz align_tl_SOURCES = align-tl.cc align_tl_LDADD = libpf.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a -lz diff --git a/gi/pf/align-lexonly-pyp.cc b/gi/pf/align-lexonly-pyp.cc index 13a3a487..d68a4b8f 100644 --- a/gi/pf/align-lexonly-pyp.cc +++ b/gi/pf/align-lexonly-pyp.cc @@ -1,27 +1,18 @@ #include -#include #include -#include #include #include -#include "array2d.h" -#include "base_distributions.h" -#include "monotonic_pseg.h" -#include "conditional_pseg.h" -#include "trule.h" #include "tdict.h" #include "stringlib.h" #include "filelib.h" -#include "dict.h" +#include "array2d.h" #include "sampler.h" -#include "mfcr.h" #include "corpus.h" -#include "ngram_base.h" +#include "pyp_tm.h" using namespace std; -using namespace tr1; namespace po = boost::program_options; void InitCommandLine(int argc, char** argv, po::variables_map* conf) { @@ -51,7 +42,7 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) { } } -shared_ptr prng; +MT19937* prng; struct LexicalAlignment { unsigned char src_index; @@ -66,159 +57,59 @@ struct AlignedSentencePair { Array2D posterior; }; -struct HierarchicalWordBase { - explicit HierarchicalWordBase(const unsigned vocab_e_size) : - base(prob_t::One()), r(1,1,1,1,0.66,50.0), u0(-log(vocab_e_size)), l(1,prob_t::One()), v(1, prob_t::Zero()) {} - - void ResampleHyperparameters(MT19937* rng) { - r.resample_hyperparameters(rng); - } - - inline double logp0(const vector& s) const { - return Md::log_poisson(s.size(), 7.5) + s.size() * u0; - } - - // return p0 of rule.e_ - prob_t operator()(const TRule& rule) const { - v[0].logeq(logp0(rule.e_)); - return r.prob(rule.e_, v.begin(), l.begin()); - } - - void Increment(const TRule& rule) { - v[0].logeq(logp0(rule.e_)); - if (r.increment(rule.e_, v.begin(), l.begin(), &*prng).count) { - base *= v[0] * l[0]; - } - } - - void Decrement(const TRule& rule) { - if (r.decrement(rule.e_, &*prng).count) { - base /= prob_t(exp(logp0(rule.e_))); - } - } - - prob_t Likelihood() const { - prob_t p; p.logeq(r.log_crp_prob()); - p *= base; - return p; +struct Aligner { + Aligner(const vector >& lets, int num_letters, vector* c) : + corpus(*c), + model(lets, num_letters), + kNULL(TD::Convert("NULL")) { + assert(lets[kNULL].size() == 0); } - void Summary() const { - cerr << "NUMBER OF CUSTOMERS: " << r.num_customers() << " (d=" << r.discount() << ",s=" << r.strength() << ')' << endl; - for (MFCR<1,vector >::const_iterator it = r.begin(); it != r.end(); ++it) - cerr << " " << it->second.total_dish_count_ << " (on " << it->second.table_counts_.size() << " tables) " << TD::GetString(it->first) << endl; - } - - prob_t base; - MFCR<1,vector > r; - const double u0; - const vector l; - mutable vector v; -}; - -struct BasicLexicalAlignment { - explicit BasicLexicalAlignment(const vector >& lets, - const unsigned words_e, - const unsigned letters_e, - vector* corp) : - letters(lets), - corpus(*corp), - //up0(words_e), - //up0("en.chars.1gram", letters_e), - //up0("en.words.1gram"), - up0(letters_e), - //up0("en.chars.2gram"), - tmodel(up0) { - } + vector& corpus; + PYPLexicalTranslation model; + const WordID kNULL; - void InstantiateRule(const WordID src, - const WordID trg, - TRule* rule) const { - static const WordID kX = TD::Convert("X") * -1; - rule->lhs_ = kX; - rule->e_ = letters[trg]; - rule->f_ = letters[src]; + void ResampleHyperparameters() { + model.ResampleHyperparameters(prng); } void InitializeRandom() { - const WordID kNULL = TD::Convert("NULL"); cerr << "Initializing with random alignments ...\n"; for (unsigned i = 0; i < corpus.size(); ++i) { AlignedSentencePair& asp = corpus[i]; asp.a.resize(asp.trg.size()); for (unsigned j = 0; j < asp.trg.size(); ++j) { - const unsigned char a_j = prng->next() * (1 + asp.src.size()); + unsigned char& a_j = asp.a[j].src_index; + a_j = prng->next() * (1 + asp.src.size()); const WordID f_a_j = (a_j ? asp.src[a_j - 1] : kNULL); - TRule r; - InstantiateRule(f_a_j, asp.trg[j], &r); - asp.a[j].is_transliteration = false; - asp.a[j].src_index = a_j; - if (tmodel.IncrementRule(r, &*prng)) - up0.Increment(r); + model.Increment(f_a_j, asp.trg[j], &*prng); } } - cerr << " LLH = " << Likelihood() << endl; - } - - prob_t Likelihood() const { - prob_t p = tmodel.Likelihood(); - p *= up0.Likelihood(); - return p; - } - - void ResampleHyperparemeters() { - tmodel.ResampleHyperparameters(&*prng); - up0.ResampleHyperparameters(&*prng); - cerr << " (base d=" << up0.r.discount() << ",s=" << up0.r.strength() << ")\n"; + cerr << "Corpus intialized randomly. LLH = " << model.Likelihood() << endl; } - void ResampleCorpus(); - - const vector >& letters; // spelling dictionary - vector& corpus; - //PhraseConditionalUninformativeBase up0; - //PhraseConditionalUninformativeUnigramBase up0; - //UnigramWordBase up0; - //HierarchicalUnigramBase up0; - HierarchicalWordBase up0; - //CompletelyUniformBase up0; - //FixedNgramBase up0; - //ConditionalTranslationModel tmodel; - //ConditionalTranslationModel tmodel; - //ConditionalTranslationModel tmodel; - //ConditionalTranslationModel tmodel; - MConditionalTranslationModel tmodel; - //ConditionalTranslationModel tmodel; - //ConditionalTranslationModel tmodel; -}; - -void BasicLexicalAlignment::ResampleCorpus() { - static const WordID kNULL = TD::Convert("NULL"); - for (unsigned i = 0; i < corpus.size(); ++i) { - AlignedSentencePair& asp = corpus[i]; - SampleSet ss; ss.resize(asp.src.size() + 1); - for (unsigned j = 0; j < asp.trg.size(); ++j) { - TRule r; - unsigned char& a_j = asp.a[j].src_index; - WordID f_a_j = (a_j ? asp.src[a_j - 1] : kNULL); - InstantiateRule(f_a_j, asp.trg[j], &r); - if (tmodel.DecrementRule(r, &*prng)) - up0.Decrement(r); - - for (unsigned prop_a_j = 0; prop_a_j <= asp.src.size(); ++prop_a_j) { - const WordID prop_f = (prop_a_j ? asp.src[prop_a_j - 1] : kNULL); - InstantiateRule(prop_f, asp.trg[j], &r); - ss[prop_a_j] = tmodel.RuleProbability(r); + void ResampleCorpus() { + for (unsigned i = 0; i < corpus.size(); ++i) { + AlignedSentencePair& asp = corpus[i]; + SampleSet ss; ss.resize(asp.src.size() + 1); + for (unsigned j = 0; j < asp.trg.size(); ++j) { + unsigned char& a_j = asp.a[j].src_index; + const WordID e_j = asp.trg[j]; + WordID f_a_j = (a_j ? asp.src[a_j - 1] : kNULL); + model.Decrement(f_a_j, e_j, prng); + + for (unsigned prop_a_j = 0; prop_a_j <= asp.src.size(); ++prop_a_j) { + const WordID prop_f = (prop_a_j ? asp.src[prop_a_j - 1] : kNULL); + ss[prop_a_j] = model.Prob(prop_f, e_j); + } + a_j = prng->SelectSample(ss); + f_a_j = (a_j ? asp.src[a_j - 1] : kNULL); + model.Increment(f_a_j, e_j, prng); } - a_j = prng->SelectSample(ss); - f_a_j = (a_j ? asp.src[a_j - 1] : kNULL); - InstantiateRule(f_a_j, asp.trg[j], &r); - if (tmodel.IncrementRule(r, &*prng)) - up0.Increment(r); } + cerr << "LLH = " << model.Likelihood() << " " << model.UniqueConditioningContexts() << endl; } - cerr << " LLH = " << Likelihood() << endl; -} +}; void ExtractLetters(const set& v, vector >* l, set* letset = NULL) { for (set::const_iterator it = v.begin(); it != v.end(); ++it) { @@ -240,8 +131,10 @@ void ExtractLetters(const set& v, vector >* l, set a(asp.src.size(), asp.trg.size()); - for (unsigned j = 0; j < asp.trg.size(); ++j) + for (unsigned j = 0; j < asp.trg.size(); ++j) { + assert(asp.a[j].src_index <= asp.src.size()); if (asp.a[j].src_index) a(asp.a[j].src_index - 1, j) = true; + } cerr << a << endl; } @@ -275,10 +168,9 @@ int main(int argc, char** argv) { InitCommandLine(argc, argv, &conf); if (conf.count("random_seed")) - prng.reset(new MT19937(conf["random_seed"].as())); + prng = new MT19937(conf["random_seed"].as()); else - prng.reset(new MT19937); -// MT19937& rng = *prng; + prng = new MT19937; vector > corpuse, corpusf; set vocabe, vocabf; @@ -304,23 +196,18 @@ int main(int argc, char** argv) { ExtractLetters(vocabf, &letters, NULL); letters[TD::Convert("NULL")].clear(); - BasicLexicalAlignment x(letters, vocabe.size(), letset.size(), &corpus); - x.InitializeRandom(); + Aligner aligner(letters, letset.size(), &corpus); + aligner.InitializeRandom(); + const unsigned samples = conf["samples"].as(); for (int i = 0; i < samples; ++i) { for (int j = 65; j < 67; ++j) Debug(corpus[j]); - cerr << i << "\t" << x.tmodel.r.size() << "\t"; - if (i % 7 == 6) x.ResampleHyperparemeters(); - x.ResampleCorpus(); + if (i % 7 == 6) aligner.ResampleHyperparameters(); + aligner.ResampleCorpus(); if (i > (samples / 5) && (i % 10 == 9)) for (int j = 0; j < corpus.size(); ++j) AddSample(&corpus[j]); } for (unsigned i = 0; i < corpus.size(); ++i) WriteAlignments(corpus[i]); - //ModelAndData posterior(x, &corpus, vocabe, vocabf); - x.tmodel.Summary(); - x.up0.Summary(); - - //posterior.Sample(); return 0; } diff --git a/gi/pf/align-tl.cc b/gi/pf/align-tl.cc index fc9b7ca5..cbe8c6c8 100644 --- a/gi/pf/align-tl.cc +++ b/gi/pf/align-tl.cc @@ -6,6 +6,7 @@ #include #include +#include "backward.h" #include "array2d.h" #include "base_distributions.h" #include "monotonic_pseg.h" @@ -30,10 +31,11 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) { opts.add_options() ("samples,s",po::value()->default_value(1000),"Number of samples") ("input,i",po::value(),"Read parallel data from") + ("s2t", po::value(), "character level source-to-target prior transliteration probabilities") + ("t2s", po::value(), "character level target-to-source prior transliteration probabilities") ("max_src_chunk", po::value()->default_value(4), "Maximum size of translitered chunk in source") ("max_trg_chunk", po::value()->default_value(4), "Maximum size of translitered chunk in target") - ("min_transliterated_src_length", po::value()->default_value(3), "Minimum length of source words considered for transliteration") - ("filter_ratio", po::value()->default_value(0.66), "Filter ratio: basically, if the lengths differ by less than this ratio, mark the pair as non-transliteratable") + ("expected_src_to_trg_ratio", po::value()->default_value(1.0), "If a word is transliterated, what is the expected length ratio from source to target?") ("random_seed,S",po::value(), "Random seed"); po::options_description clo("Command line options"); clo.add_options() @@ -303,7 +305,7 @@ int main(int argc, char** argv) { corpusf.clear(); corpuse.clear(); vocabf.insert(TD::Convert("NULL")); - vector > letters(TD::NumWords()); + vector > letters(TD::NumWords() + 1); set letset; ExtractLetters(vocabe, &letters, &letset); ExtractLetters(vocabf, &letters, NULL); @@ -312,9 +314,9 @@ int main(int argc, char** argv) { // TODO configure this const int max_src_chunk = conf["max_src_chunk"].as(); const int max_trg_chunk = conf["max_trg_chunk"].as(); - const double filter_rat = conf["filter_ratio"].as(); - const int min_trans_src = conf["min_transliterated_src_length"].as(); - Transliterations tl(max_src_chunk, max_trg_chunk, filter_rat); + const double s2t_rat = conf["expected_src_to_trg_ratio"].as(); + const BackwardEstimator be(conf["s2t"].as(), conf["t2s"].as()); + Transliterations tl(max_src_chunk, max_trg_chunk, s2t_rat, be); cerr << "Initializing transliteration graph structures ...\n"; for (int i = 0; i < corpus.size(); ++i) { @@ -325,8 +327,8 @@ int main(int argc, char** argv) { for (int k = 0; k < trg.size(); ++k) { const vector& trg_let = letters[trg[k]]; tl.Initialize(src[j], src_let, trg[k], trg_let); - if (src_let.size() < min_trans_src) - tl.Forbid(src[j], src_let, trg[k], trg_let); + //if (src_let.size() < min_trans_src) + // tl.Forbid(src[j], src_let, trg[k], trg_let); } } } diff --git a/gi/pf/backward.cc b/gi/pf/backward.cc new file mode 100644 index 00000000..b92629fd --- /dev/null +++ b/gi/pf/backward.cc @@ -0,0 +1,89 @@ +#include "backward.h" + +#include +#include + +#include "array2d.h" +#include "reachability.h" +#include "base_distributions.h" + +using namespace std; + +BackwardEstimator::BackwardEstimator(const string& s2t, + const string& t2s) : m1(new Model1(s2t)), m1inv(new Model1(t2s)) {} + +BackwardEstimator::~BackwardEstimator() { + delete m1; m1 = NULL; + delete m1inv; m1inv = NULL; +} + +float BackwardEstimator::ComputeBackwardProb(const std::vector& src, + const std::vector& trg, + unsigned src_covered, + unsigned trg_covered, + double s2t_ratio) const { + if (src_covered == src.size() || trg_covered == trg.size()) { + assert(src_covered == src.size()); + assert(trg_covered == trg.size()); + return 0; + } + static const WordID kNULL = TD::Convert(""); + const prob_t uniform_alignment(1.0 / (src.size() - src_covered + 1)); + // TODO factor in expected length ratio + prob_t e; e.logeq(Md::log_poisson(trg.size() - trg_covered, (src.size() - src_covered) * s2t_ratio)); // p(trg len remaining | src len remaining) + for (unsigned j = trg_covered; j < trg.size(); ++j) { + prob_t p = (*m1)(kNULL, trg[j]) + prob_t(1e-12); + for (unsigned i = src_covered; i < src.size(); ++i) + p += (*m1)(src[i], trg[j]); + if (p.is_0()) { + cerr << "ERROR: p(" << TD::Convert(trg[j]) << " | " << TD::GetString(src) << ") = 0!\n"; + assert(!"failed"); + } + p *= uniform_alignment; + e *= p; + } + // TODO factor in expected length ratio + const prob_t inv_uniform(1.0 / (trg.size() - trg_covered + 1.0)); + prob_t inv; + inv.logeq(Md::log_poisson(src.size() - src_covered, (trg.size() - trg_covered) / s2t_ratio)); + for (unsigned i = src_covered; i < src.size(); ++i) { + prob_t p = (*m1inv)(kNULL, src[i]) + prob_t(1e-12); + for (unsigned j = trg_covered; j < trg.size(); ++j) + p += (*m1inv)(trg[j], src[i]); + if (p.is_0()) { + cerr << "ERROR: p_inv(" << TD::Convert(src[i]) << " | " << TD::GetString(trg) << ") = 0!\n"; + assert(!"failed"); + } + p *= inv_uniform; + inv *= p; + } + return (log(e) + log(inv)) / 2; +} + +void BackwardEstimator::InitializeGrid(const vector& src, + const vector& trg, + const Reachability& r, + double s2t_ratio, + float* grid) const { + queue > q; + q.push(make_pair(0,0)); + Array2D done(src.size()+1, trg.size()+1, false); + //cerr << TD::GetString(src) << " ||| " << TD::GetString(trg) << endl; + while(!q.empty()) { + const pair n = q.front(); + q.pop(); + if (done(n.first,n.second)) continue; + done(n.first,n.second) = true; + + float lp = ComputeBackwardProb(src, trg, n.first, n.second, s2t_ratio); + if (n.first == 0 && n.second == 0) grid[0] = lp; + //cerr << " " << n.first << "," << n.second << "\t" << lp << endl; + + if (n.first == src.size() || n.second == trg.size()) continue; + const vector >& edges = r.valid_deltas[n.first][n.second]; + for (int i = 0; i < edges.size(); ++i) + q.push(make_pair(n.first + edges[i].first, n.second + edges[i].second)); + } + //static int cc = 0; ++cc; if (cc == 80) exit(1); +} + diff --git a/gi/pf/backward.h b/gi/pf/backward.h new file mode 100644 index 00000000..e67eff0c --- /dev/null +++ b/gi/pf/backward.h @@ -0,0 +1,33 @@ +#ifndef _BACKWARD_H_ +#define _BACKWARD_H_ + +#include +#include +#include "wordid.h" + +struct Reachability; +struct Model1; + +struct BackwardEstimator { + BackwardEstimator(const std::string& s2t, + const std::string& t2s); + ~BackwardEstimator(); + + void InitializeGrid(const std::vector& src, + const std::vector& trg, + const Reachability& r, + double src2trg_ratio, + float* grid) const; + + private: + float ComputeBackwardProb(const std::vector& src, + const std::vector& trg, + unsigned src_covered, + unsigned trg_covered, + double src2trg_ratio) const; + + Model1* m1; + Model1* m1inv; +}; + +#endif diff --git a/gi/pf/base_distributions.h b/gi/pf/base_distributions.h index 0d597c5c..84dacdf2 100644 --- a/gi/pf/base_distributions.h +++ b/gi/pf/base_distributions.h @@ -14,13 +14,7 @@ #include "tdict.h" #include "sampler.h" #include "m.h" - -inline std::ostream& operator<<(std::ostream& os, const std::vector& p) { - os << '['; - for (int i = 0; i < p.size(); ++i) - os << (i==0 ? "" : " ") << TD::Convert(p[i]); - return os << ']'; -} +#include "os_phrase.h" struct Model1 { explicit Model1(const std::string& fname) : diff --git a/gi/pf/guess-translits.pl b/gi/pf/guess-translits.pl index aafec13a..d00c2168 100755 --- a/gi/pf/guess-translits.pl +++ b/gi/pf/guess-translits.pl @@ -69,4 +69,4 @@ for my $f (keys %fs) { } } print STDERR "Extracted $num pairs.\n"; -print STDERR "Recommend running:\n ../../training/model1 -t -99999 output.txt\n"; +print STDERR "Recommend running:\n ../../training/model1 -v -d -t -99999 output.txt\n"; diff --git a/gi/pf/nuisance_test.cc b/gi/pf/nuisance_test.cc index 0f44fe95..fc0af9cb 100644 --- a/gi/pf/nuisance_test.cc +++ b/gi/pf/nuisance_test.cc @@ -124,9 +124,9 @@ int main(int argc, char** argv) { WordID y = TD::Convert("remember"); vector src; TD::ConvertSentence("s o u v e n o n s", &src); vector trg; TD::ConvertSentence("r e m e m b e r", &trg); - Transliterations xx; - xx.Initialize(x, src, y, trg); - return 1; +// Transliterations xx; +// xx.Initialize(x, src, y, trg); +// return 1; for (int j = 0; j < ITERS; ++j) { Base b; diff --git a/gi/pf/pyp_lm.cc b/gi/pf/pyp_lm.cc index 104f356b..52e6be2c 100644 --- a/gi/pf/pyp_lm.cc +++ b/gi/pf/pyp_lm.cc @@ -18,7 +18,7 @@ // I use templates to handle the recursive formalation of the prior, so // the order of the model has to be specified here, at compile time: -#define kORDER 4 +#define kORDER 3 using namespace std; using namespace tr1; diff --git a/gi/pf/pyp_tm.cc b/gi/pf/pyp_tm.cc new file mode 100644 index 00000000..94cbe7c3 --- /dev/null +++ b/gi/pf/pyp_tm.cc @@ -0,0 +1,113 @@ +#include "pyp_tm.h" + +#include +#include +#include + +#include "base_distributions.h" +#include "monotonic_pseg.h" +#include "conditional_pseg.h" +#include "tdict.h" +#include "ccrp.h" +#include "pyp_word_model.h" + +using namespace std; +using namespace std::tr1; + +template +struct ConditionalPYPWordModel { + ConditionalPYPWordModel(Base* b) : base(*b) {} + + void Summary() const { + cerr << "Number of conditioning contexts: " << r.size() << endl; + for (RuleModelHash::const_iterator it = r.begin(); it != r.end(); ++it) { + cerr << TD::Convert(it->first) << " \tPYP(d=" << it->second.discount() << ",s=" << it->second.strength() << ") --------------------------" << endl; + for (CCRP >::const_iterator i2 = it->second.begin(); i2 != it->second.end(); ++i2) + cerr << " " << i2->second.total_dish_count_ << '\t' << TD::GetString(i2->first) << endl; + } + } + + void ResampleHyperparameters(MT19937* rng) { + for (RuleModelHash::iterator it = r.begin(); it != r.end(); ++it) + it->second.resample_hyperparameters(rng); + } + + prob_t Prob(const WordID src, const vector& trglets) const { + RuleModelHash::const_iterator it = r.find(src); + if (it == r.end()) { + return base(trglets); + } else { + return it->second.prob(trglets, base(trglets)); + } + } + + void Increment(const WordID src, const vector& trglets, MT19937* rng) { + RuleModelHash::iterator it = r.find(src); + if (it == r.end()) + it = r.insert(make_pair(src, CCRP >(1,1,1,1,0.5,1.0))).first; + if (it->second.increment(trglets, base(trglets), rng)) + base.Increment(trglets, rng); + } + + void Decrement(const WordID src, const vector& trglets, MT19937* rng) { + RuleModelHash::iterator it = r.find(src); + assert(it != r.end()); + if (it->second.decrement(trglets, rng)) { + base.Decrement(trglets, rng); + if (it->second.num_customers() == 0) + r.erase(it); + } + } + + prob_t Likelihood() const { + prob_t p = prob_t::One(); + for (RuleModelHash::const_iterator it = r.begin(); it != r.end(); ++it) { + prob_t q; q.logeq(it->second.log_crp_prob()); + p *= q; + } + return p; + } + + unsigned UniqueConditioningContexts() const { + return r.size(); + } + + Base& base; + typedef unordered_map > > RuleModelHash; + RuleModelHash r; +}; + +PYPLexicalTranslation::PYPLexicalTranslation(const vector >& lets, + const unsigned num_letters) : + letters(lets), + up0(new PYPWordModel(num_letters)), + tmodel(new ConditionalPYPWordModel(up0)), + kX(-TD::Convert("X")) {} + +prob_t PYPLexicalTranslation::Likelihood() const { + prob_t p = up0->Likelihood(); + p *= tmodel->Likelihood(); + return p; +} + +void PYPLexicalTranslation::ResampleHyperparameters(MT19937* rng) { + tmodel->ResampleHyperparameters(rng); + up0->ResampleHyperparameters(rng); +} + +unsigned PYPLexicalTranslation::UniqueConditioningContexts() const { + return tmodel->UniqueConditioningContexts(); +} + +prob_t PYPLexicalTranslation::Prob(WordID src, WordID trg) const { + return tmodel->Prob(src, letters[trg]); +} + +void PYPLexicalTranslation::Increment(WordID src, WordID trg, MT19937* rng) { + tmodel->Increment(src, letters[trg], rng); +} + +void PYPLexicalTranslation::Decrement(WordID src, WordID trg, MT19937* rng) { + tmodel->Decrement(src, letters[trg], rng); +} + diff --git a/gi/pf/pyp_tm.h b/gi/pf/pyp_tm.h new file mode 100644 index 00000000..fa0fb28f --- /dev/null +++ b/gi/pf/pyp_tm.h @@ -0,0 +1,34 @@ +#ifndef PYP_LEX_TRANS +#define PYP_LEX_TRANS + +#include +#include "wordid.h" +#include "prob.h" +#include "sampler.h" + +struct TRule; +struct PYPWordModel; +template struct ConditionalPYPWordModel; + +struct PYPLexicalTranslation { + explicit PYPLexicalTranslation(const std::vector >& lets, + const unsigned num_letters); + + prob_t Likelihood() const; + + void ResampleHyperparameters(MT19937* rng); + prob_t Prob(WordID src, WordID trg) const; // return p(trg | src) + void Summary() const; + void Increment(WordID src, WordID trg, MT19937* rng); + void Decrement(WordID src, WordID trg, MT19937* rng); + unsigned UniqueConditioningContexts() const; + + private: + const std::vector >& letters; // spelling dictionary + PYPWordModel* up0; // base distribuction (model English word) + ConditionalPYPWordModel* tmodel; // translation distributions + // (model English word | French word) + const WordID kX; +}; + +#endif diff --git a/gi/pf/pyp_word_model.cc b/gi/pf/pyp_word_model.cc new file mode 100644 index 00000000..12df4abf --- /dev/null +++ b/gi/pf/pyp_word_model.cc @@ -0,0 +1,20 @@ +#include "pyp_word_model.h" + +#include + +using namespace std; + +void PYPWordModel::ResampleHyperparameters(MT19937* rng) { + r.resample_hyperparameters(rng); + cerr << " PYPWordModel(d=" << r.discount() << ",s=" << r.strength() << ")\n"; +} + +void PYPWordModel::Summary() const { + cerr << "PYPWordModel: generations=" << r.num_customers() + << " PYP(d=" << r.discount() << ",s=" << r.strength() << ')' << endl; + for (CCRP >::const_iterator it = r.begin(); it != r.end(); ++it) + cerr << " " << it->second.total_dish_count_ + << " (on " << it->second.table_counts_.size() << " tables) " + << TD::GetString(it->first) << endl; +} + diff --git a/gi/pf/pyp_word_model.h b/gi/pf/pyp_word_model.h new file mode 100644 index 00000000..800a4fd7 --- /dev/null +++ b/gi/pf/pyp_word_model.h @@ -0,0 +1,58 @@ +#ifndef _PYP_WORD_MODEL_H_ +#define _PYP_WORD_MODEL_H_ + +#include +#include +#include +#include "prob.h" +#include "ccrp.h" +#include "m.h" +#include "tdict.h" +#include "os_phrase.h" + +// PYP(d,s,poisson-uniform) represented as a CRP +struct PYPWordModel { + explicit PYPWordModel(const unsigned vocab_e_size, const double mean_len = 7.5) : + base(prob_t::One()), r(1,1,1,1,0.66,50.0), u0(-std::log(vocab_e_size)), mean_length(mean_len) {} + + void ResampleHyperparameters(MT19937* rng); + + inline prob_t operator()(const std::vector& s) const { + return r.prob(s, p0(s)); + } + + inline void Increment(const std::vector& s, MT19937* rng) { + if (r.increment(s, p0(s), rng)) + base *= p0(s); + } + + inline void Decrement(const std::vector& s, MT19937 *rng) { + if (r.decrement(s, rng)) + base /= p0(s); + } + + inline prob_t Likelihood() const { + prob_t p; p.logeq(r.log_crp_prob()); + p *= base; + return p; + } + + void Summary() const; + + private: + inline double logp0(const std::vector& s) const { + return Md::log_poisson(s.size(), mean_length) + s.size() * u0; + } + + inline prob_t p0(const std::vector& s) const { + prob_t p; p.logeq(logp0(s)); + return p; + } + + prob_t base; // keeps track of the draws from the base distribution + CCRP > r; + const double u0; // uniform log prob of generating a letter + const double mean_length; // mean length of a word in the base distribution +}; + +#endif diff --git a/gi/pf/reachability.cc b/gi/pf/reachability.cc index c10000f2..7d0d04ac 100644 --- a/gi/pf/reachability.cc +++ b/gi/pf/reachability.cc @@ -12,7 +12,7 @@ struct SState { int prev_trg_covered; }; -void Reachability::ComputeReachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len, double filter_ratio) { +void Reachability::ComputeReachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len) { typedef boost::multi_array, 2> array_type; array_type a(boost::extents[srclen + 1][trglen + 1]); a[0][0].push_back(SState()); @@ -31,9 +31,9 @@ void Reachability::ComputeReachability(int srclen, int trglen, int src_max_phras } a[0][0].clear(); //cerr << srclen << "," << trglen << ": Final cell contains " << a[srclen][trglen].size() << " back pointers\n"; - size_t min_allowed = (src_max_phrase_len + 1) * (trg_max_phrase_len + 1) * (filter_ratio * filter_ratio); - if (a[srclen][trglen].size() < min_allowed) { - cerr << "Sequence pair with lengths (" << srclen << ',' << trglen << ") violates reachability constraint of min indegree " << min_allowed << " with " << a[srclen][trglen].size() << " in edges\n"; + if (a[srclen][trglen].empty()) { + cerr << "Sequence pair with lengths (" << srclen << ',' << trglen << ") violates reachability constraints\n"; + nodes = 0; return; } diff --git a/gi/pf/reachability.h b/gi/pf/reachability.h index 03967d44..1e22c76a 100644 --- a/gi/pf/reachability.h +++ b/gi/pf/reachability.h @@ -18,19 +18,17 @@ struct Reachability { boost::multi_array node_addresses; // na[src_covered][trg_covered] -- the index of the node in a one-dimensional array (of size "nodes") boost::multi_array >, 2> valid_deltas; // valid_deltas[src_covered][trg_covered] list of valid transitions leaving a particular node - // filter_ratio says if the number of outgoing edges from the first cell is less than - // src_max * trg_max * filter_rat^2 then mark as non reachable - Reachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len, double filter_ratio = 0.0) : + Reachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len) : nodes(), edges(boost::extents[srclen][trglen][src_max_phrase_len+1][trg_max_phrase_len+1]), max_src_delta(boost::extents[srclen][trglen]), node_addresses(boost::extents[srclen][trglen]), valid_deltas(boost::extents[srclen][trglen]) { - ComputeReachability(srclen, trglen, src_max_phrase_len, trg_max_phrase_len, filter_ratio); + ComputeReachability(srclen, trglen, src_max_phrase_len, trg_max_phrase_len); } private: - void ComputeReachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len, double filter_ratio); + void ComputeReachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len); }; #endif diff --git a/gi/pf/transliterations.cc b/gi/pf/transliterations.cc index 8ea4ebd2..2200715e 100644 --- a/gi/pf/transliterations.cc +++ b/gi/pf/transliterations.cc @@ -5,14 +5,173 @@ #include "boost/shared_ptr.hpp" +#include "backward.h" #include "filelib.h" -#include "ccrp.h" +#include "tdict.h" +#include "trule.h" +#include "filelib.h" +#include "ccrp_nt.h" #include "m.h" #include "reachability.h" using namespace std; using namespace std::tr1; +struct TruncatedConditionalLengthModel { + TruncatedConditionalLengthModel(unsigned max_src_size, unsigned max_trg_size, double expected_src_to_trg_ratio) : + plens(max_src_size+1, vector(max_trg_size+1, 0.0)) { + for (unsigned i = 1; i <= max_src_size; ++i) { + prob_t z = prob_t::Zero(); + for (unsigned j = 1; j <= max_trg_size; ++j) + z += (plens[i][j] = prob_t(0.01 + exp(Md::log_poisson(j, i * expected_src_to_trg_ratio)))); + for (unsigned j = 1; j <= max_trg_size; ++j) + plens[i][j] /= z; + //for (unsigned j = 1; j <= max_trg_size; ++j) + // cerr << "P(trg_len=" << j << " | src_len=" << i << ") = " << plens[i][j] << endl; + } + } + + // return p(tlen | slen) for *chunks* not full words + inline const prob_t& operator()(int slen, int tlen) const { + return plens[slen][tlen]; + } + + vector > plens; +}; + +struct CondBaseDist { + CondBaseDist(unsigned max_src_size, unsigned max_trg_size, double expected_src_to_trg_ratio) : + tclm(max_src_size, max_trg_size, expected_src_to_trg_ratio) {} + + prob_t operator()(const vector& src, unsigned sf, unsigned st, + const vector& trg, unsigned tf, unsigned tt) const { + prob_t p = tclm(st - sf, tt - tf); // target len | source length ~ TCLM(source len) + assert(!"not impl"); + return p; + } + inline prob_t operator()(const vector& src, const vector& trg) const { + return (*this)(src, 0, src.size(), trg, 0, trg.size()); + } + TruncatedConditionalLengthModel tclm; +}; + +// represents transliteration phrase probabilities, e.g. +// p( a l - | A l ) , p( o | A w ) , ... +struct TransliterationChunkConditionalModel { + explicit TransliterationChunkConditionalModel(const CondBaseDist& pp0) : + d(0.0), + strength(1.0), + rp0(pp0) { + } + + void Summary() const { + std::cerr << "Number of conditioning contexts: " << r.size() << std::endl; + for (RuleModelHash::const_iterator it = r.begin(); it != r.end(); ++it) { + std::cerr << TD::GetString(it->first) << " \t(\\alpha = " << it->second.alpha() << ") --------------------------" << std::endl; + for (CCRP_NoTable::const_iterator i2 = it->second.begin(); i2 != it->second.end(); ++i2) + std::cerr << " " << i2->second << '\t' << i2->first << std::endl; + } + } + + int DecrementRule(const TRule& rule) { + RuleModelHash::iterator it = r.find(rule.f_); + assert(it != r.end()); + int count = it->second.decrement(rule); + if (count) { + if (it->second.num_customers() == 0) r.erase(it); + } + return count; + } + + int IncrementRule(const TRule& rule) { + RuleModelHash::iterator it = r.find(rule.f_); + if (it == r.end()) { + it = r.insert(make_pair(rule.f_, CCRP_NoTable(strength))).first; + } + int count = it->second.increment(rule); + return count; + } + + void IncrementRules(const std::vector& rules) { + for (int i = 0; i < rules.size(); ++i) + IncrementRule(*rules[i]); + } + + void DecrementRules(const std::vector& rules) { + for (int i = 0; i < rules.size(); ++i) + DecrementRule(*rules[i]); + } + + prob_t RuleProbability(const TRule& rule) const { + prob_t p; + RuleModelHash::const_iterator it = r.find(rule.f_); + if (it == r.end()) { + p = rp0(rule.f_, rule.e_); + } else { + p = it->second.prob(rule, rp0(rule.f_, rule.e_)); + } + return p; + } + + double LogLikelihood(const double& dd, const double& aa) const { + if (aa <= -dd) return -std::numeric_limits::infinity(); + //double llh = Md::log_beta_density(dd, 10, 3) + Md::log_gamma_density(aa, 1, 1); + double llh = //Md::log_beta_density(dd, 1, 1) + + Md::log_gamma_density(dd + aa, 1, 1); + typename std::tr1::unordered_map, CCRP_NoTable, boost::hash > >::const_iterator it; + for (it = r.begin(); it != r.end(); ++it) + llh += it->second.log_crp_prob(aa); + return llh; + } + + struct AlphaResampler { + AlphaResampler(const TransliterationChunkConditionalModel& m) : m_(m) {} + const TransliterationChunkConditionalModel& m_; + double operator()(const double& proposed_strength) const { + return m_.LogLikelihood(m_.d, proposed_strength); + } + }; + + void ResampleHyperparameters(MT19937* rng) { + typename std::tr1::unordered_map, CCRP_NoTable, boost::hash > >::iterator it; + //const unsigned nloop = 5; + const unsigned niterations = 10; + //DiscountResampler dr(*this); + AlphaResampler ar(*this); +#if 0 + for (int iter = 0; iter < nloop; ++iter) { + strength = slice_sampler1d(ar, strength, *rng, -d + std::numeric_limits::min(), + std::numeric_limits::infinity(), 0.0, niterations, 100*niterations); + double min_discount = std::numeric_limits::min(); + if (strength < 0.0) min_discount -= strength; + d = slice_sampler1d(dr, d, *rng, min_discount, + 1.0, 0.0, niterations, 100*niterations); + } +#endif + strength = slice_sampler1d(ar, strength, *rng, -d, + std::numeric_limits::infinity(), 0.0, niterations, 100*niterations); + std::cerr << "CTMModel(alpha=" << strength << ") = " << LogLikelihood(d, strength) << std::endl; + for (it = r.begin(); it != r.end(); ++it) { +#if 0 + it->second.set_discount(d); +#endif + it->second.set_alpha(strength); + } + } + + prob_t Likelihood() const { + prob_t p; p.logeq(LogLikelihood(d, strength)); + return p; + } + + const CondBaseDist& rp0; + typedef std::tr1::unordered_map, + CCRP_NoTable, + boost::hash > > RuleModelHash; + RuleModelHash r; + double d, strength; +}; + struct GraphStructure { GraphStructure() : r() {} // leak memory - these are basically static @@ -20,9 +179,9 @@ struct GraphStructure { bool IsReachable() const { return r->nodes > 0; } }; -struct BackwardEstimates { - BackwardEstimates() : gs(), backward() {} - explicit BackwardEstimates(const GraphStructure& g) : +struct ProbabilityEstimates { + ProbabilityEstimates() : gs(), backward() {} + explicit ProbabilityEstimates(const GraphStructure& g) : gs(&g), backward() { if (g.r->nodes > 0) backward = new float[g.r->nodes]; @@ -36,24 +195,32 @@ struct BackwardEstimates { } // returns an backward estimate - double operator()(int src_covered, int trg_covered) const { + double Backward(int src_covered, int trg_covered) const { if (!backward) return 0; int ind = gs->r->node_addresses[src_covered][trg_covered]; if (ind < 0) return 0; return backward[ind]; } + + prob_t estp; + float* backward; private: const GraphStructure* gs; - float* backward; }; struct TransliterationsImpl { - TransliterationsImpl(int max_src, int max_trg, double fr) : + TransliterationsImpl(int max_src, int max_trg, double sr, const BackwardEstimator& b) : + cp0(max_src, max_trg, sr), + tccm(cp0), + be(b), kMAX_SRC_CHUNK(max_src), kMAX_TRG_CHUNK(max_trg), - kFILTER_RATIO(fr), + kS2T_RATIO(sr), tot_pairs(), tot_mem() { } + const CondBaseDist cp0; + TransliterationChunkConditionalModel tccm; + const BackwardEstimator& be; void Initialize(WordID src, const vector& src_lets, WordID trg, const vector& trg_lets) { const size_t src_len = src_lets.size(); @@ -63,20 +230,29 @@ struct TransliterationsImpl { if (src_len >= graphs.size()) graphs.resize(src_len + 1); if (trg_len >= graphs[src_len].size()) graphs[src_len].resize(trg_len + 1); GraphStructure& gs = graphs[src_len][trg_len]; - if (!gs.r) - gs.r = new Reachability(src_len, trg_len, kMAX_SRC_CHUNK, kMAX_TRG_CHUNK, kFILTER_RATIO); + if (!gs.r) { + double rat = exp(fabs(log(trg_len / (src_len * kS2T_RATIO)))); + if (rat > 1.5 || (rat > 2.4 && src_len < 6)) { + cerr << " ** Forbidding transliterations of size " << src_len << "," << trg_len << ": " << rat << endl; + gs.r = new Reachability(src_len, trg_len, 0, 0); + } else { + gs.r = new Reachability(src_len, trg_len, kMAX_SRC_CHUNK, kMAX_TRG_CHUNK); + } + } + const Reachability& r = *gs.r; // init backward estimates - if (src >= bes.size()) bes.resize(src + 1); - unordered_map::iterator it = bes[src].find(trg); - if (it != bes[src].end()) return; // already initialized + if (src >= ests.size()) ests.resize(src + 1); + unordered_map::iterator it = ests[src].find(trg); + if (it != ests[src].end()) return; // already initialized - it = bes[src].insert(make_pair(trg, BackwardEstimates(gs))).first; - BackwardEstimates& b = it->second; + it = ests[src].insert(make_pair(trg, ProbabilityEstimates(gs))).first; + ProbabilityEstimates& est = it->second; if (!gs.r->nodes) return; // not derivable subject to length constraints - // TODO + be.InitializeGrid(src_lets, trg_lets, r, kS2T_RATIO, est.backward); + cerr << TD::GetString(src_lets) << " ||| " << TD::GetString(trg_lets) << " ||| " << (est.backward[0] / trg_lets.size()) << endl; tot_pairs++; tot_mem += sizeof(float) * gs.r->nodes; } @@ -92,8 +268,11 @@ struct TransliterationsImpl { const vector& tv = graphs[src.size()]; assert(trg.size() < tv.size()); const GraphStructure& gs = tv[trg.size()]; - // TODO: do prob - return prob_t::Zero(); + if (gs.r->nodes == 0) + return prob_t::Zero(); + const unordered_map::const_iterator it = ests[s].find(t); + assert(it != ests[s].end()); + return it->second.estp; } void GraphSummary() const { @@ -126,15 +305,15 @@ struct TransliterationsImpl { const int kMAX_SRC_CHUNK; const int kMAX_TRG_CHUNK; - const double kFILTER_RATIO; + const double kS2T_RATIO; unsigned tot_pairs; size_t tot_mem; vector > graphs; // graphs[src_len][trg_len] - vector > bes; // bes[src][trg] + vector > ests; // ests[src][trg] }; -Transliterations::Transliterations(int max_src, int max_trg, double fr) : - pimpl_(new TransliterationsImpl(max_src, max_trg, fr)) {} +Transliterations::Transliterations(int max_src, int max_trg, double sr, const BackwardEstimator& be) : + pimpl_(new TransliterationsImpl(max_src, max_trg, sr, be)) {} Transliterations::~Transliterations() { delete pimpl_; } void Transliterations::Initialize(WordID src, const vector& src_lets, WordID trg, const vector& trg_lets) { diff --git a/gi/pf/transliterations.h b/gi/pf/transliterations.h index ea9f9d3f..49d14684 100644 --- a/gi/pf/transliterations.h +++ b/gi/pf/transliterations.h @@ -5,11 +5,12 @@ #include "wordid.h" #include "prob.h" +struct BackwardEstimator; struct TransliterationsImpl; struct Transliterations { // max_src and max_trg indicate how big the transliteration phrases can be // see reachability.h for information about filter_ratio - explicit Transliterations(int max_src, int max_trg, double filter_ratio); + explicit Transliterations(int max_src, int max_trg, double s2t_rat, const BackwardEstimator& be); ~Transliterations(); void Initialize(WordID src, const std::vector& src_lets, WordID trg, const std::vector& trg_lets); void Forbid(WordID src, const std::vector& src_lets, WordID trg, const std::vector& trg_lets); diff --git a/utils/ccrp_nt.h b/utils/ccrp_nt.h index 79321493..6efbfc78 100644 --- a/utils/ccrp_nt.h +++ b/utils/ccrp_nt.h @@ -11,6 +11,7 @@ #include #include "sampler.h" #include "slice_sampler.h" +#include "m.h" // Chinese restaurant process (1 parameter) template > @@ -29,6 +30,7 @@ class CCRP_NoTable { alpha_prior_rate_(c_rate) {} double alpha() const { return alpha_; } + void set_alpha(const double& alpha) { alpha_ = alpha; assert(alpha_ > 0.0); } bool has_alpha_prior() const { return !std::isnan(alpha_prior_shape_); @@ -71,9 +73,10 @@ class CCRP_NoTable { return table_diff; } - double prob(const Dish& dish, const double& p0) const { + template + F prob(const Dish& dish, const F& p0) const { const unsigned at_table = num_customers(dish); - return (at_table + p0 * alpha_) / (num_customers_ + alpha_); + return (F(at_table) + p0 * F(alpha_)) / F(num_customers_ + alpha_); } double logprob(const Dish& dish, const double& logp0) const { @@ -85,20 +88,12 @@ class CCRP_NoTable { return log_crp_prob(alpha_); } - static double log_gamma_density(const double& x, const double& shape, const double& rate) { - assert(x >= 0.0); - assert(shape > 0.0); - assert(rate > 0.0); - const double lp = (shape-1)*log(x) - shape*log(rate) - x/rate - lgamma(shape); - return lp; - } - // taken from http://en.wikipedia.org/wiki/Chinese_restaurant_process // does not include P_0's double log_crp_prob(const double& alpha) const { double lp = 0.0; if (has_alpha_prior()) - lp += log_gamma_density(alpha, alpha_prior_shape_, alpha_prior_rate_); + lp += Md::log_gamma_density(alpha, alpha_prior_shape_, alpha_prior_rate_); assert(lp <= 0.0); if (num_customers_) { lp += lgamma(alpha) - lgamma(alpha + num_customers_) + -- cgit v1.2.3 From 38f28be7cd2bada87ebad78994e3c938e10c2cce Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Sat, 10 Mar 2012 12:56:15 -0500 Subject: ready to infer alignment parameters --- gi/pf/Makefile.am | 4 +- gi/pf/align-lexonly-pyp.cc | 22 ++- gi/pf/align-lexonly.cc | 332 --------------------------------------------- gi/pf/pyp_tm.cc | 6 +- gi/pf/quasi_model2.h | 115 ++++++++++++---- gi/pf/tied_resampler.h | 31 +++++ 6 files changed, 143 insertions(+), 367 deletions(-) delete mode 100644 gi/pf/align-lexonly.cc (limited to 'gi/pf/Makefile.am') diff --git a/gi/pf/Makefile.am b/gi/pf/Makefile.am index 4ce72ba1..f9c979d0 100644 --- a/gi/pf/Makefile.am +++ b/gi/pf/Makefile.am @@ -1,4 +1,4 @@ -bin_PROGRAMS = cbgi brat dpnaive pfbrat pfdist itg pfnaive condnaive align-lexonly align-lexonly-pyp learn_cfg pyp_lm nuisance_test align-tl +bin_PROGRAMS = cbgi brat dpnaive pfbrat pfdist itg pfnaive condnaive align-lexonly-pyp learn_cfg pyp_lm nuisance_test align-tl noinst_LIBRARIES = libpf.a @@ -7,8 +7,6 @@ libpf_a_SOURCES = base_distributions.cc reachability.cc cfg_wfst_composer.cc cor nuisance_test_SOURCES = nuisance_test.cc nuisance_test_LDADD = libpf.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a -lz -align_lexonly_SOURCES = align-lexonly.cc - align_lexonly_pyp_SOURCES = align-lexonly-pyp.cc align_lexonly_pyp_LDADD = libpf.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a -lz diff --git a/gi/pf/align-lexonly-pyp.cc b/gi/pf/align-lexonly-pyp.cc index 0c90b6ce..68cb9192 100644 --- a/gi/pf/align-lexonly-pyp.cc +++ b/gi/pf/align-lexonly-pyp.cc @@ -61,15 +61,15 @@ struct AlignedSentencePair { struct Aligner { Aligner(const vector >& lets, int num_letters, vector* c) : corpus(*c), + paj_model(4, 0.08), model(lets, num_letters), - paj(4, 0.08), kNULL(TD::Convert("NULL")) { assert(lets[kNULL].size() == 0); } vector& corpus; + QuasiModel2 paj_model; PYPLexicalTranslation model; - const QuasiModel2 paj; const WordID kNULL; void ResampleHyperparameters() { @@ -86,10 +86,12 @@ struct Aligner { a_j = prng->next() * (1 + asp.src.size()); const WordID f_a_j = (a_j ? asp.src[a_j - 1] : kNULL); model.Increment(f_a_j, asp.trg[j], &*prng); - // TODO factor in alignment prob + paj_model.Increment(a_j, j, asp.src.size(), asp.trg.size()); } } - cerr << "Corpus intialized randomly. LLH = " << model.Likelihood() << endl; + cerr << "Corpus intialized randomly." << endl; + cerr << "LLH = " << Likelihood() << " \t(Amodel=" << paj_model.Likelihood() + << " TModel=" << model.Likelihood() << ") contexts=" << model.UniqueConditioningContexts() << endl; } void ResampleCorpus() { @@ -101,19 +103,25 @@ struct Aligner { const WordID e_j = asp.trg[j]; WordID f_a_j = (a_j ? asp.src[a_j - 1] : kNULL); model.Decrement(f_a_j, e_j, prng); + paj_model.Decrement(a_j, j, asp.src.size(), asp.trg.size()); for (unsigned prop_a_j = 0; prop_a_j <= asp.src.size(); ++prop_a_j) { const WordID prop_f = (prop_a_j ? asp.src[prop_a_j - 1] : kNULL); ss[prop_a_j] = model.Prob(prop_f, e_j); - // TODO configurable - ss[prop_a_j] *= paj.Pa_j(prop_a_j, j, asp.src.size(), asp.trg.size()); + ss[prop_a_j] *= paj_model.Prob(prop_a_j, j, asp.src.size(), asp.trg.size()); } a_j = prng->SelectSample(ss); f_a_j = (a_j ? asp.src[a_j - 1] : kNULL); model.Increment(f_a_j, e_j, prng); + paj_model.Increment(a_j, j, asp.src.size(), asp.trg.size()); } } - cerr << "LLH = " << model.Likelihood() << " " << model.UniqueConditioningContexts() << endl; + cerr << "LLH = " << Likelihood() << " \t(Amodel=" << paj_model.Likelihood() + << " TModel=" << model.Likelihood() << ") contexts=" << model.UniqueConditioningContexts() << endl; + } + + prob_t Likelihood() const { + return model.Likelihood() * paj_model.Likelihood(); } }; diff --git a/gi/pf/align-lexonly.cc b/gi/pf/align-lexonly.cc deleted file mode 100644 index dbc9dc07..00000000 --- a/gi/pf/align-lexonly.cc +++ /dev/null @@ -1,332 +0,0 @@ -#include -#include -#include - -#include -#include -#include - -#include "array2d.h" -#include "base_distributions.h" -#include "monotonic_pseg.h" -#include "conditional_pseg.h" -#include "trule.h" -#include "tdict.h" -#include "stringlib.h" -#include "filelib.h" -#include "dict.h" -#include "sampler.h" -#include "ccrp_nt.h" -#include "corpus.h" -#include "ngram_base.h" - -using namespace std; -using namespace tr1; -namespace po = boost::program_options; - -void InitCommandLine(int argc, char** argv, po::variables_map* conf) { - po::options_description opts("Configuration options"); - opts.add_options() - ("samples,s",po::value()->default_value(1000),"Number of samples") - ("input,i",po::value(),"Read parallel data from") - ("random_seed,S",po::value(), "Random seed"); - po::options_description clo("Command line options"); - clo.add_options() - ("config", po::value(), "Configuration file") - ("help,h", "Print this help message and exit"); - po::options_description dconfig_options, dcmdline_options; - dconfig_options.add(opts); - dcmdline_options.add(opts).add(clo); - - po::store(parse_command_line(argc, argv, dcmdline_options), *conf); - if (conf->count("config")) { - ifstream config((*conf)["config"].as().c_str()); - po::store(po::parse_config_file(config, dconfig_options), *conf); - } - po::notify(*conf); - - if (conf->count("help") || (conf->count("input") == 0)) { - cerr << dcmdline_options << endl; - exit(1); - } -} - -shared_ptr prng; - -struct LexicalAlignment { - unsigned char src_index; - bool is_transliteration; - vector > derivation; -}; - -struct AlignedSentencePair { - vector src; - vector trg; - vector a; - Array2D posterior; -}; - -struct HierarchicalWordBase { - explicit HierarchicalWordBase(const unsigned vocab_e_size) : - base(prob_t::One()), r(25,25,10), u0(-log(vocab_e_size)) {} - - void ResampleHyperparameters(MT19937* rng) { - r.resample_hyperparameters(rng); - } - - inline double logp0(const vector& s) const { - return s.size() * u0; - } - - // return p0 of rule.e_ - prob_t operator()(const TRule& rule) const { - prob_t p; p.logeq(r.logprob(rule.e_, logp0(rule.e_))); - return p; - } - - void Increment(const TRule& rule) { - if (r.increment(rule.e_)) { - prob_t p; p.logeq(logp0(rule.e_)); - base *= p; - } - } - - void Decrement(const TRule& rule) { - if (r.decrement(rule.e_)) { - prob_t p; p.logeq(logp0(rule.e_)); - base /= p; - } - } - - prob_t Likelihood() const { - prob_t p; p.logeq(r.log_crp_prob()); - p *= base; - return p; - } - - void Summary() const { - cerr << "NUMBER OF CUSTOMERS: " << r.num_customers() << " (\\alpha=" << r.alpha() << ')' << endl; - for (CCRP_NoTable >::const_iterator it = r.begin(); it != r.end(); ++it) - cerr << " " << it->second << '\t' << TD::GetString(it->first) << endl; - } - - prob_t base; - CCRP_NoTable > r; - const double u0; -}; - -struct BasicLexicalAlignment { - explicit BasicLexicalAlignment(const vector >& lets, - const unsigned words_e, - const unsigned letters_e, - vector* corp) : - letters(lets), - corpus(*corp), - up0("fr-en.10k.translit-base.txt.gz"), - //up0(words_e), - //up0("en.chars.1gram", letters_e), - //up0("en.words.1gram"), - //up0(letters_e), - //up0("en.chars.2gram"), - tmodel(up0) { - } - - void InstantiateRule(const WordID src, - const WordID trg, - TRule* rule) const { - static const WordID kX = TD::Convert("X") * -1; - rule->lhs_ = kX; - rule->e_ = letters[trg]; - rule->f_ = letters[src]; - } - - void InitializeRandom() { - const WordID kNULL = TD::Convert("NULL"); - cerr << "Initializing with random alignments ...\n"; - for (unsigned i = 0; i < corpus.size(); ++i) { - AlignedSentencePair& asp = corpus[i]; - asp.a.resize(asp.trg.size()); - for (unsigned j = 0; j < asp.trg.size(); ++j) { - const unsigned char a_j = prng->next() * (1 + asp.src.size()); - const WordID f_a_j = (a_j ? asp.src[a_j - 1] : kNULL); - TRule r; - InstantiateRule(f_a_j, asp.trg[j], &r); - asp.a[j].is_transliteration = false; - asp.a[j].src_index = a_j; - if (tmodel.IncrementRule(r)) - up0.Increment(r); - } - } - cerr << " LLH = " << Likelihood() << endl; - } - - prob_t Likelihood() const { - prob_t p = tmodel.Likelihood(); - p *= up0.Likelihood(); - return p; - } - - void ResampleHyperparemeters() { - cerr << " LLH_prev = " << Likelihood() << flush; - tmodel.ResampleHyperparameters(&*prng); - up0.ResampleHyperparameters(&*prng); - cerr << "\tLLH_post = " << Likelihood() << endl; - } - - void ResampleCorpus(); - - const vector >& letters; // spelling dictionary - vector& corpus; - //PhraseConditionalUninformativeBase up0; - //PhraseConditionalUninformativeUnigramBase up0; - //UnigramWordBase up0; - //HierarchicalUnigramBase up0; - TableLookupBase up0; - //HierarchicalWordBase up0; - //PoissonUniformUninformativeBase up0; - //CompletelyUniformBase up0; - //FixedNgramBase up0; - //ConditionalTranslationModel tmodel; - //ConditionalTranslationModel tmodel; - //ConditionalTranslationModel tmodel; - //ConditionalTranslationModel tmodel; - //ConditionalTranslationModel tmodel; - //ConditionalTranslationModel tmodel; - ConditionalTranslationModel tmodel; - //ConditionalTranslationModel tmodel; - //ConditionalTranslationModel tmodel; -}; - -void BasicLexicalAlignment::ResampleCorpus() { - static const WordID kNULL = TD::Convert("NULL"); - for (unsigned i = 0; i < corpus.size(); ++i) { - AlignedSentencePair& asp = corpus[i]; - SampleSet ss; ss.resize(asp.src.size() + 1); - for (unsigned j = 0; j < asp.trg.size(); ++j) { - TRule r; - unsigned char& a_j = asp.a[j].src_index; - WordID f_a_j = (a_j ? asp.src[a_j - 1] : kNULL); - InstantiateRule(f_a_j, asp.trg[j], &r); - if (tmodel.DecrementRule(r)) - up0.Decrement(r); - - for (unsigned prop_a_j = 0; prop_a_j <= asp.src.size(); ++prop_a_j) { - const WordID prop_f = (prop_a_j ? asp.src[prop_a_j - 1] : kNULL); - InstantiateRule(prop_f, asp.trg[j], &r); - ss[prop_a_j] = tmodel.RuleProbability(r); - } - a_j = prng->SelectSample(ss); - f_a_j = (a_j ? asp.src[a_j - 1] : kNULL); - InstantiateRule(f_a_j, asp.trg[j], &r); - if (tmodel.IncrementRule(r)) - up0.Increment(r); - } - } - cerr << " LLH = " << tmodel.Likelihood() << endl; -} - -void ExtractLetters(const set& v, vector >* l, set* letset = NULL) { - for (set::const_iterator it = v.begin(); it != v.end(); ++it) { - if (*it >= l->size()) { l->resize(*it + 1); } - vector& letters = (*l)[*it]; - if (letters.size()) continue; // if e and f have the same word - - const string& w = TD::Convert(*it); - - size_t cur = 0; - while (cur < w.size()) { - const size_t len = UTF8Len(w[cur]); - letters.push_back(TD::Convert(w.substr(cur, len))); - if (letset) letset->insert(letters.back()); - cur += len; - } - } -} - -void Debug(const AlignedSentencePair& asp) { - cerr << TD::GetString(asp.src) << endl << TD::GetString(asp.trg) << endl; - Array2D a(asp.src.size(), asp.trg.size()); - for (unsigned j = 0; j < asp.trg.size(); ++j) - if (asp.a[j].src_index) a(asp.a[j].src_index - 1, j) = true; - cerr << a << endl; -} - -void AddSample(AlignedSentencePair* asp) { - for (unsigned j = 0; j < asp->trg.size(); ++j) - asp->posterior(asp->a[j].src_index, j)++; -} - -void WriteAlignments(const AlignedSentencePair& asp) { - bool first = true; - for (unsigned j = 0; j < asp.trg.size(); ++j) { - int src_index = -1; - int mc = -1; - for (unsigned i = 0; i <= asp.src.size(); ++i) { - if (asp.posterior(i, j) > mc) { - mc = asp.posterior(i, j); - src_index = i; - } - } - - if (src_index) { - if (first) first = false; else cout << ' '; - cout << (src_index - 1) << '-' << j; - } - } - cout << endl; -} - -int main(int argc, char** argv) { - po::variables_map conf; - InitCommandLine(argc, argv, &conf); - - if (conf.count("random_seed")) - prng.reset(new MT19937(conf["random_seed"].as())); - else - prng.reset(new MT19937); -// MT19937& rng = *prng; - - vector > corpuse, corpusf; - set vocabe, vocabf; - corpus::ReadParallelCorpus(conf["input"].as(), &corpusf, &corpuse, &vocabf, &vocabe); - cerr << "f-Corpus size: " << corpusf.size() << " sentences\n"; - cerr << "f-Vocabulary size: " << vocabf.size() << " types\n"; - cerr << "f-Corpus size: " << corpuse.size() << " sentences\n"; - cerr << "f-Vocabulary size: " << vocabe.size() << " types\n"; - assert(corpusf.size() == corpuse.size()); - - vector corpus(corpuse.size()); - for (unsigned i = 0; i < corpuse.size(); ++i) { - corpus[i].src.swap(corpusf[i]); - corpus[i].trg.swap(corpuse[i]); - corpus[i].posterior.resize(corpus[i].src.size() + 1, corpus[i].trg.size()); - } - corpusf.clear(); corpuse.clear(); - - vocabf.insert(TD::Convert("NULL")); - vector > letters(TD::NumWords()); - set letset; - ExtractLetters(vocabe, &letters, &letset); - ExtractLetters(vocabf, &letters, NULL); - letters[TD::Convert("NULL")].clear(); - - BasicLexicalAlignment x(letters, vocabe.size(), letset.size(), &corpus); - x.InitializeRandom(); - const unsigned samples = conf["samples"].as(); - for (int i = 0; i < samples; ++i) { - for (int j = 395; j < 397; ++j) Debug(corpus[j]); - cerr << i << "\t" << x.tmodel.r.size() << "\t"; - if (i % 10 == 0) x.ResampleHyperparemeters(); - x.ResampleCorpus(); - if (i > (samples / 5) && (i % 10 == 9)) for (int j = 0; j < corpus.size(); ++j) AddSample(&corpus[j]); - } - for (unsigned i = 0; i < corpus.size(); ++i) - WriteAlignments(corpus[i]); - //ModelAndData posterior(x, &corpus, vocabe, vocabf); - x.tmodel.Summary(); - x.up0.Summary(); - - //posterior.Sample(); - - return 0; -} diff --git a/gi/pf/pyp_tm.cc b/gi/pf/pyp_tm.cc index 73104fe9..bf5a6497 100644 --- a/gi/pf/pyp_tm.cc +++ b/gi/pf/pyp_tm.cc @@ -10,7 +10,6 @@ #include "tdict.h" #include "ccrp.h" #include "pyp_word_model.h" - #include "tied_resampler.h" using namespace std; @@ -18,7 +17,7 @@ using namespace std::tr1; template struct ConditionalPYPWordModel { - ConditionalPYPWordModel(Base* b) : base(*b) {} + ConditionalPYPWordModel(Base* b) : base(*b), btr(3) {} void Summary() const { cerr << "Number of conditioning contexts: " << r.size() << endl; @@ -32,6 +31,7 @@ struct ConditionalPYPWordModel { void ResampleHyperparameters(MT19937* rng) { for (RuleModelHash::iterator it = r.begin(); it != r.end(); ++it) it->second.resample_hyperparameters(rng); + btr.ResampleHyperparameters(rng); } prob_t Prob(const WordID src, const vector& trglets) const { @@ -72,7 +72,9 @@ struct ConditionalPYPWordModel { return r.size(); } + // TODO tie PYP hyperparameters based on source word frequency bins Base& base; + BinTiedResampler > > btr; typedef unordered_map > > RuleModelHash; RuleModelHash r; }; diff --git a/gi/pf/quasi_model2.h b/gi/pf/quasi_model2.h index 0095289f..8ec0a400 100644 --- a/gi/pf/quasi_model2.h +++ b/gi/pf/quasi_model2.h @@ -3,44 +3,113 @@ #include #include +#include +#include "boost/functional.hpp" #include "prob.h" #include "array2d.h" +struct AlignmentObservation { + AlignmentObservation() : src_len(), trg_len(), j(), a_j() {} + AlignmentObservation(unsigned sl, unsigned tl, unsigned tw, unsigned sw) : + src_len(sl), trg_len(tl), j(tw), a_j(sw) {} + unsigned short src_len; + unsigned short trg_len; + unsigned short j; + unsigned short a_j; +}; + +inline size_t hash_value(const AlignmentObservation& o) { + return reinterpret_cast(o); +} + +inline bool operator==(const AlignmentObservation& a, const AlignmentObservation& b) { + return hash_value(a) == hash_value(b); +} + struct QuasiModel2 { explicit QuasiModel2(double alpha, double pnull = 0.1) : alpha_(alpha), pnull_(pnull), - pnotnull_(1 - pnull), - z_(1000,1000) {} + pnotnull_(1 - pnull) {} + // a_j = 0 => NULL; src_len does *not* include null - prob_t Pa_j(unsigned a_j, unsigned j, unsigned src_len, unsigned trg_len) const { + prob_t Prob(unsigned a_j, unsigned j, unsigned src_len, unsigned trg_len) const { if (!a_j) return pnull_; - std::vector& zv = z_(src_len, trg_len); - if (zv.size() == 0) - zv.resize(trg_len); - - prob_t& z = zv[j]; - if (z.is_0()) z = ComputeZ(j, src_len, trg_len); - - prob_t p; - p.logeq(-fabs(double(a_j - 1) / src_len - double(j) / trg_len) * alpha_); - p *= pnotnull_; - p /= z; + return pnotnull_ * + prob_t(UnnormalizedProb(a_j, j, src_len, trg_len, alpha_) / GetOrComputeZ(j, src_len, trg_len)); + } + + void Increment(unsigned a_j, unsigned j, unsigned src_len, unsigned trg_len) { + assert(a_j <= src_len); + assert(j < trg_len); + ++obs_[AlignmentObservation(src_len, trg_len, j, a_j)]; + } + + void Decrement(unsigned a_j, unsigned j, unsigned src_len, unsigned trg_len) { + const AlignmentObservation ao(src_len, trg_len, j, a_j); + int &cc = obs_[ao]; + assert(cc > 0); + --cc; + if (!cc) obs_.erase(ao); + } + + prob_t Likelihood() const { + return Likelihood(alpha_, pnull_.as_float()); + } + + prob_t Likelihood(double alpha, double ppnull) const { + const prob_t pnull(ppnull); + const prob_t pnotnull(1 - ppnull); + + prob_t p = prob_t::One(); + for (ObsCount::const_iterator it = obs_.begin(); it != obs_.end(); ++it) { + const AlignmentObservation& ao = it->first; + if (ao.a_j) { + double u = UnnormalizedProb(ao.a_j, ao.j, ao.src_len, ao.trg_len, alpha); + double z = ComputeZ(ao.j, ao.src_len, ao.trg_len, alpha); + prob_t pa(u / z); + pa *= pnotnull; + pa.poweq(it->second); + p *= pa; + } else { + p *= pnull.pow(it->second); + } + } return p; } + private: - prob_t ComputeZ(unsigned j, unsigned src_len, unsigned trg_len) const { - prob_t p, z = prob_t::Zero(); - for (int a_j = 1; a_j <= src_len; ++a_j) { - p.logeq(-fabs(double(a_j - 1) / src_len - double(j) / trg_len) * alpha_); - z += p; - } + static double UnnormalizedProb(unsigned a_j, unsigned j, unsigned src_len, unsigned trg_len, double alpha) { + return exp(-fabs(double(a_j - 1) / src_len - double(j) / trg_len) * alpha); + } + + static double ComputeZ(unsigned j, unsigned src_len, unsigned trg_len, double alpha) { + double z = 0; + for (int a_j = 1; a_j <= src_len; ++a_j) + z += UnnormalizedProb(a_j, j, src_len, trg_len, alpha); return z; } + + const double& GetOrComputeZ(unsigned j, unsigned src_len, unsigned trg_len) const { + if (src_len >= zcache_.size()) + zcache_.resize(src_len + 1); + if (trg_len >= zcache_[src_len].size()) + zcache_[src_len].resize(trg_len + 1); + std::vector& zv = zcache_[src_len][trg_len]; + if (zv.size() == 0) + zv.resize(trg_len); + double& z = zv[j]; + if (!z) + z = ComputeZ(j, src_len, trg_len, alpha_); + return z; + } + double alpha_; - const prob_t pnull_; - const prob_t pnotnull_; - mutable Array2D > z_; + prob_t pnull_; + prob_t pnotnull_; + mutable std::vector > > zcache_; + typedef std::tr1::unordered_map > ObsCount; + ObsCount obs_; }; #endif diff --git a/gi/pf/tied_resampler.h b/gi/pf/tied_resampler.h index 208fb9c7..5a262f9d 100644 --- a/gi/pf/tied_resampler.h +++ b/gi/pf/tied_resampler.h @@ -2,6 +2,7 @@ #define _TIED_RESAMPLER_H_ #include +#include #include "sampler.h" #include "slice_sampler.h" #include "m.h" @@ -28,6 +29,10 @@ struct TiedResampler { crps.erase(crp); } + size_t size() const { + return crps.size(); + } + double LogLikelihood(double d, double s) const { if (s <= -d) return -std::numeric_limits::infinity(); double llh = Md::log_beta_density(d, d_alpha, d_beta) + @@ -54,6 +59,7 @@ struct TiedResampler { }; void ResampleHyperparameters(MT19937* rng, const unsigned nloop = 5, const unsigned niterations = 10) { + if (size() == 0) { std::cerr << "EMPTY - not resampling\n"; return; } const DiscountResampler dr(*this); const AlphaResampler ar(*this); for (int iter = 0; iter < nloop; ++iter) { @@ -79,4 +85,29 @@ struct TiedResampler { double discount, strength; }; +// split according to some criterion +template +struct BinTiedResampler { + explicit BinTiedResampler(unsigned nbins) : + resamplers(nbins, TiedResampler(1,1,1,1)) {} + + void Add(unsigned bin, CRP* crp) { + resamplers[bin].Add(crp); + } + + void Remove(unsigned bin, CRP* crp) { + resamplers[bin].Remove(crp); + } + + void ResampleHyperparameters(MT19937* rng) { + for (unsigned i = 0; i < resamplers.size(); ++i) { + std::cerr << "BIN " << i << " (" << resamplers[i].size() << " CRPs): " << std::flush; + resamplers[i].ResampleHyperparameters(rng); + } + } + + private: + std::vector > resamplers; +}; + #endif -- cgit v1.2.3