author     Patrick Simianer <p@simianer.de>    2012-03-13 09:24:47 +0100
committer  Patrick Simianer <p@simianer.de>    2012-03-13 09:24:47 +0100
commit     ef6085e558e26c8819f1735425761103021b6470 (patch)
tree       5cf70e4c48c64d838e1326b5a505c8c4061bff4a /gi/pf
parent     10a232656a0c882b3b955d2bcfac138ce11e8a2e (diff)
parent     dfbc278c1057555fda9312291c8024049e00b7d8 (diff)
merge with upstream
Diffstat (limited to 'gi/pf'): 37 files changed, 3654 insertions, 71 deletions
diff --git a/gi/pf/Makefile.am b/gi/pf/Makefile.am index 42758939..f9c979d0 100644 --- a/gi/pf/Makefile.am +++ b/gi/pf/Makefile.am @@ -1,10 +1,26 @@ -bin_PROGRAMS = cbgi brat dpnaive pfbrat pfdist itg pfnaive +bin_PROGRAMS = cbgi brat dpnaive pfbrat pfdist itg pfnaive condnaive align-lexonly-pyp learn_cfg pyp_lm nuisance_test align-tl noinst_LIBRARIES = libpf.a -libpf_a_SOURCES = base_measures.cc reachability.cc cfg_wfst_composer.cc corpus.cc + +libpf_a_SOURCES = base_distributions.cc reachability.cc cfg_wfst_composer.cc corpus.cc unigrams.cc ngram_base.cc transliterations.cc backward.cc pyp_word_model.cc pyp_tm.cc + +nuisance_test_SOURCES = nuisance_test.cc +nuisance_test_LDADD = libpf.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a -lz + +align_lexonly_pyp_SOURCES = align-lexonly-pyp.cc +align_lexonly_pyp_LDADD = libpf.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a -lz + +align_tl_SOURCES = align-tl.cc +align_tl_LDADD = libpf.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a -lz itg_SOURCES = itg.cc +pyp_lm_SOURCES = pyp_lm.cc + +learn_cfg_SOURCES = learn_cfg.cc + +condnaive_SOURCES = condnaive.cc + dpnaive_SOURCES = dpnaive.cc pfdist_SOURCES = pfdist.cc @@ -17,5 +33,6 @@ brat_SOURCES = brat.cc pfbrat_SOURCES = pfbrat.cc -AM_CPPFLAGS = -W -Wall -Wno-sign-compare -funroll-loops -I$(top_srcdir)/utils $(GTEST_CPPFLAGS) -I$(top_srcdir)/decoder -AM_LDFLAGS = libpf.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/utils/libutils.a -lz +AM_CPPFLAGS = -W -Wall -Wno-sign-compare -funroll-loops -I$(top_srcdir)/utils $(GTEST_CPPFLAGS) -I$(top_srcdir)/decoder -I$(top_srcdir)/klm + +AM_LDFLAGS = libpf.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a $(top_srcdir)/utils/libutils.a -lz diff --git a/gi/pf/align-lexonly-pyp.cc b/gi/pf/align-lexonly-pyp.cc new file mode 100644 index 00000000..942dcf51 --- /dev/null +++ b/gi/pf/align-lexonly-pyp.cc @@ -0,0 +1,239 @@ +#include <iostream> +#include <queue> + +#include <boost/program_options.hpp> +#include <boost/program_options/variables_map.hpp> + +#include "tdict.h" +#include "stringlib.h" +#include "filelib.h" +#include "array2d.h" +#include "sampler.h" +#include "corpus.h" +#include "pyp_tm.h" +#include "quasi_model2.h" + +using namespace std; +namespace po = boost::program_options; + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { + po::options_description opts("Configuration options"); + opts.add_options() + ("samples,s",po::value<unsigned>()->default_value(1000),"Number of samples") + ("infer_alignment_hyperparameters,I", "Infer alpha and p_null, otherwise fixed values will be assumed") + ("p_null,0", po::value<double>()->default_value(0.08), "probability of aligning to null") + ("align_alpha,a", po::value<double>()->default_value(4.0), "how 'tight' is the bias toward be along the diagonal?") + ("input,i",po::value<string>(),"Read parallel data from") + ("random_seed,S",po::value<uint32_t>(), "Random seed"); + po::options_description clo("Command line options"); + clo.add_options() + ("config", po::value<string>(), "Configuration file") + ("help,h", "Print this help message and exit"); + po::options_description dconfig_options, 
dcmdline_options; + dconfig_options.add(opts); + dcmdline_options.add(opts).add(clo); + + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + if (conf->count("config")) { + ifstream config((*conf)["config"].as<string>().c_str()); + po::store(po::parse_config_file(config, dconfig_options), *conf); + } + po::notify(*conf); + + if (conf->count("help") || (conf->count("input") == 0)) { + cerr << dcmdline_options << endl; + exit(1); + } +} + +MT19937* prng; + +struct LexicalAlignment { + unsigned char src_index; + bool is_transliteration; + vector<pair<short, short> > derivation; +}; + +struct AlignedSentencePair { + vector<WordID> src; + vector<WordID> trg; + vector<LexicalAlignment> a; + Array2D<short> posterior; +}; + +struct Aligner { + Aligner(const vector<vector<WordID> >& lets, + int num_letters, + const po::variables_map& conf, + vector<AlignedSentencePair>* c) : + corpus(*c), + paj_model(conf["align_alpha"].as<double>(), conf["p_null"].as<double>()), + infer_paj(conf.count("infer_alignment_hyperparameters") > 0), + model(lets, num_letters), + kNULL(TD::Convert("NULL")) { + assert(lets[kNULL].size() == 0); + } + + vector<AlignedSentencePair>& corpus; + QuasiModel2 paj_model; + const bool infer_paj; + PYPLexicalTranslation model; + const WordID kNULL; + + void ResampleHyperparameters() { + model.ResampleHyperparameters(prng); + if (infer_paj) paj_model.ResampleHyperparameters(prng); + } + + void InitializeRandom() { + cerr << "Initializing with random alignments ...\n"; + for (unsigned i = 0; i < corpus.size(); ++i) { + AlignedSentencePair& asp = corpus[i]; + asp.a.resize(asp.trg.size()); + for (unsigned j = 0; j < asp.trg.size(); ++j) { + unsigned char& a_j = asp.a[j].src_index; + a_j = prng->next() * (1 + asp.src.size()); + const WordID f_a_j = (a_j ? asp.src[a_j - 1] : kNULL); + model.Increment(f_a_j, asp.trg[j], &*prng); + paj_model.Increment(a_j, j, asp.src.size(), asp.trg.size()); + } + } + cerr << "Corpus intialized randomly." << endl; + cerr << "LLH = " << Likelihood() << " \t(Amodel=" << paj_model.Likelihood() + << " TModel=" << model.Likelihood() << ") contexts=" << model.UniqueConditioningContexts() << endl; + } + + void ResampleCorpus() { + for (unsigned i = 0; i < corpus.size(); ++i) { + AlignedSentencePair& asp = corpus[i]; + SampleSet<prob_t> ss; ss.resize(asp.src.size() + 1); + for (unsigned j = 0; j < asp.trg.size(); ++j) { + unsigned char& a_j = asp.a[j].src_index; + const WordID e_j = asp.trg[j]; + WordID f_a_j = (a_j ? asp.src[a_j - 1] : kNULL); + model.Decrement(f_a_j, e_j, prng); + paj_model.Decrement(a_j, j, asp.src.size(), asp.trg.size()); + + for (unsigned prop_a_j = 0; prop_a_j <= asp.src.size(); ++prop_a_j) { + const WordID prop_f = (prop_a_j ? asp.src[prop_a_j - 1] : kNULL); + ss[prop_a_j] = model.Prob(prop_f, e_j); + ss[prop_a_j] *= paj_model.Prob(prop_a_j, j, asp.src.size(), asp.trg.size()); + } + a_j = prng->SelectSample(ss); + f_a_j = (a_j ? 
asp.src[a_j - 1] : kNULL); + model.Increment(f_a_j, e_j, prng); + paj_model.Increment(a_j, j, asp.src.size(), asp.trg.size()); + } + } + } + + prob_t Likelihood() const { + return model.Likelihood() * paj_model.Likelihood(); + } +}; + +void ExtractLetters(const set<WordID>& v, vector<vector<WordID> >* l, set<WordID>* letset = NULL) { + for (set<WordID>::const_iterator it = v.begin(); it != v.end(); ++it) { + vector<WordID>& letters = (*l)[*it]; + if (letters.size()) continue; // if e and f have the same word + + const string& w = TD::Convert(*it); + + size_t cur = 0; + while (cur < w.size()) { + const size_t len = UTF8Len(w[cur]); + letters.push_back(TD::Convert(w.substr(cur, len))); + if (letset) letset->insert(letters.back()); + cur += len; + } + } +} + +void Debug(const AlignedSentencePair& asp) { + cerr << TD::GetString(asp.src) << endl << TD::GetString(asp.trg) << endl; + Array2D<bool> a(asp.src.size(), asp.trg.size()); + for (unsigned j = 0; j < asp.trg.size(); ++j) { + assert(asp.a[j].src_index <= asp.src.size()); + if (asp.a[j].src_index) a(asp.a[j].src_index - 1, j) = true; + } + cerr << a << endl; +} + +void AddSample(AlignedSentencePair* asp) { + for (unsigned j = 0; j < asp->trg.size(); ++j) + asp->posterior(asp->a[j].src_index, j)++; +} + +void WriteAlignments(const AlignedSentencePair& asp) { + bool first = true; + for (unsigned j = 0; j < asp.trg.size(); ++j) { + int src_index = -1; + int mc = -1; + for (unsigned i = 0; i <= asp.src.size(); ++i) { + if (asp.posterior(i, j) > mc) { + mc = asp.posterior(i, j); + src_index = i; + } + } + + if (src_index) { + if (first) first = false; else cout << ' '; + cout << (src_index - 1) << '-' << j; + } + } + cout << endl; +} + +int main(int argc, char** argv) { + po::variables_map conf; + InitCommandLine(argc, argv, &conf); + + if (conf.count("random_seed")) + prng = new MT19937(conf["random_seed"].as<uint32_t>()); + else + prng = new MT19937; + + vector<vector<int> > corpuse, corpusf; + set<int> vocabe, vocabf; + corpus::ReadParallelCorpus(conf["input"].as<string>(), &corpusf, &corpuse, &vocabf, &vocabe); + cerr << "f-Corpus size: " << corpusf.size() << " sentences\n"; + cerr << "f-Vocabulary size: " << vocabf.size() << " types\n"; + cerr << "f-Corpus size: " << corpuse.size() << " sentences\n"; + cerr << "f-Vocabulary size: " << vocabe.size() << " types\n"; + assert(corpusf.size() == corpuse.size()); + + vector<AlignedSentencePair> corpus(corpuse.size()); + for (unsigned i = 0; i < corpuse.size(); ++i) { + corpus[i].src.swap(corpusf[i]); + corpus[i].trg.swap(corpuse[i]); + corpus[i].posterior.resize(corpus[i].src.size() + 1, corpus[i].trg.size()); + } + corpusf.clear(); corpuse.clear(); + + vocabf.insert(TD::Convert("NULL")); + vector<vector<WordID> > letters(TD::NumWords()); + set<WordID> letset; + ExtractLetters(vocabe, &letters, &letset); + ExtractLetters(vocabf, &letters, NULL); + letters[TD::Convert("NULL")].clear(); + + Aligner aligner(letters, letset.size(), conf, &corpus); + aligner.InitializeRandom(); + + const unsigned samples = conf["samples"].as<unsigned>(); + for (int i = 0; i < samples; ++i) { + for (int j = 65; j < 67; ++j) Debug(corpus[j]); + if (i % 10 == 9) { + aligner.ResampleHyperparameters(); + cerr << "LLH = " << aligner.Likelihood() << " \t(Amodel=" << aligner.paj_model.Likelihood() + << " TModel=" << aligner.model.Likelihood() << ") contexts=" << aligner.model.UniqueConditioningContexts() << endl; + } + aligner.ResampleCorpus(); + if (i > (samples / 5) && (i % 6 == 5)) for (int j = 0; j < corpus.size(); ++j) 
AddSample(&corpus[j]); + } + for (unsigned i = 0; i < corpus.size(); ++i) + WriteAlignments(corpus[i]); + aligner.model.Summary(); + + return 0; +} diff --git a/gi/pf/align-tl.cc b/gi/pf/align-tl.cc new file mode 100644 index 00000000..cbe8c6c8 --- /dev/null +++ b/gi/pf/align-tl.cc @@ -0,0 +1,339 @@ +#include <iostream> +#include <tr1/memory> +#include <queue> + +#include <boost/multi_array.hpp> +#include <boost/program_options.hpp> +#include <boost/program_options/variables_map.hpp> + +#include "backward.h" +#include "array2d.h" +#include "base_distributions.h" +#include "monotonic_pseg.h" +#include "conditional_pseg.h" +#include "trule.h" +#include "tdict.h" +#include "stringlib.h" +#include "filelib.h" +#include "dict.h" +#include "sampler.h" +#include "mfcr.h" +#include "corpus.h" +#include "ngram_base.h" +#include "transliterations.h" + +using namespace std; +using namespace tr1; +namespace po = boost::program_options; + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { + po::options_description opts("Configuration options"); + opts.add_options() + ("samples,s",po::value<unsigned>()->default_value(1000),"Number of samples") + ("input,i",po::value<string>(),"Read parallel data from") + ("s2t", po::value<string>(), "character level source-to-target prior transliteration probabilities") + ("t2s", po::value<string>(), "character level target-to-source prior transliteration probabilities") + ("max_src_chunk", po::value<unsigned>()->default_value(4), "Maximum size of translitered chunk in source") + ("max_trg_chunk", po::value<unsigned>()->default_value(4), "Maximum size of translitered chunk in target") + ("expected_src_to_trg_ratio", po::value<double>()->default_value(1.0), "If a word is transliterated, what is the expected length ratio from source to target?") + ("random_seed,S",po::value<uint32_t>(), "Random seed"); + po::options_description clo("Command line options"); + clo.add_options() + ("config", po::value<string>(), "Configuration file") + ("help,h", "Print this help message and exit"); + po::options_description dconfig_options, dcmdline_options; + dconfig_options.add(opts); + dcmdline_options.add(opts).add(clo); + + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + if (conf->count("config")) { + ifstream config((*conf)["config"].as<string>().c_str()); + po::store(po::parse_config_file(config, dconfig_options), *conf); + } + po::notify(*conf); + + if (conf->count("help") || (conf->count("input") == 0)) { + cerr << dcmdline_options << endl; + exit(1); + } +} + +shared_ptr<MT19937> prng; + +struct LexicalAlignment { + unsigned char src_index; + bool is_transliteration; + vector<pair<short, short> > derivation; +}; + +struct AlignedSentencePair { + vector<WordID> src; + vector<WordID> trg; + vector<LexicalAlignment> a; + Array2D<short> posterior; +}; + +struct HierarchicalWordBase { + explicit HierarchicalWordBase(const unsigned vocab_e_size) : + base(prob_t::One()), r(1,1,1,1,0.66,50.0), u0(-log(vocab_e_size)), l(1,prob_t::One()), v(1, prob_t::Zero()) {} + + void ResampleHyperparameters(MT19937* rng) { + r.resample_hyperparameters(rng); + } + + inline double logp0(const vector<WordID>& s) const { + return Md::log_poisson(s.size(), 7.5) + s.size() * u0; + } + + // return p0 of rule.e_ + prob_t operator()(const TRule& rule) const { + v[0].logeq(logp0(rule.e_)); + return r.prob(rule.e_, v.begin(), l.begin()); + } + + void Increment(const TRule& rule) { + v[0].logeq(logp0(rule.e_)); + if (r.increment(rule.e_, v.begin(), l.begin(), 
&*prng).count) { + base *= v[0] * l[0]; + } + } + + void Decrement(const TRule& rule) { + if (r.decrement(rule.e_, &*prng).count) { + base /= prob_t(exp(logp0(rule.e_))); + } + } + + prob_t Likelihood() const { + prob_t p; p.logeq(r.log_crp_prob()); + p *= base; + return p; + } + + void Summary() const { + cerr << "NUMBER OF CUSTOMERS: " << r.num_customers() << " (d=" << r.discount() << ",s=" << r.strength() << ')' << endl; + for (MFCR<1,vector<WordID> >::const_iterator it = r.begin(); it != r.end(); ++it) + cerr << " " << it->second.total_dish_count_ << " (on " << it->second.table_counts_.size() << " tables) " << TD::GetString(it->first) << endl; + } + + prob_t base; + MFCR<1,vector<WordID> > r; + const double u0; + const vector<prob_t> l; + mutable vector<prob_t> v; +}; + +struct BasicLexicalAlignment { + explicit BasicLexicalAlignment(const vector<vector<WordID> >& lets, + const unsigned words_e, + const unsigned letters_e, + vector<AlignedSentencePair>* corp) : + letters(lets), + corpus(*corp), + //up0(words_e), + //up0("en.chars.1gram", letters_e), + //up0("en.words.1gram"), + up0(letters_e), + //up0("en.chars.2gram"), + tmodel(up0) { + } + + void InstantiateRule(const WordID src, + const WordID trg, + TRule* rule) const { + static const WordID kX = TD::Convert("X") * -1; + rule->lhs_ = kX; + rule->e_ = letters[trg]; + rule->f_ = letters[src]; + } + + void InitializeRandom() { + const WordID kNULL = TD::Convert("NULL"); + cerr << "Initializing with random alignments ...\n"; + for (unsigned i = 0; i < corpus.size(); ++i) { + AlignedSentencePair& asp = corpus[i]; + asp.a.resize(asp.trg.size()); + for (unsigned j = 0; j < asp.trg.size(); ++j) { + const unsigned char a_j = prng->next() * (1 + asp.src.size()); + const WordID f_a_j = (a_j ? asp.src[a_j - 1] : kNULL); + TRule r; + InstantiateRule(f_a_j, asp.trg[j], &r); + asp.a[j].is_transliteration = false; + asp.a[j].src_index = a_j; + if (tmodel.IncrementRule(r, &*prng)) + up0.Increment(r); + } + } + cerr << " LLH = " << Likelihood() << endl; + } + + prob_t Likelihood() const { + prob_t p = tmodel.Likelihood(); + p *= up0.Likelihood(); + return p; + } + + void ResampleHyperparemeters() { + tmodel.ResampleHyperparameters(&*prng); + up0.ResampleHyperparameters(&*prng); + cerr << " (base d=" << up0.r.discount() << ",s=" << up0.r.strength() << ")\n"; + } + + void ResampleCorpus(); + + const vector<vector<WordID> >& letters; // spelling dictionary + vector<AlignedSentencePair>& corpus; + //PhraseConditionalUninformativeBase up0; + //PhraseConditionalUninformativeUnigramBase up0; + //UnigramWordBase up0; + //HierarchicalUnigramBase up0; + HierarchicalWordBase up0; + //CompletelyUniformBase up0; + //FixedNgramBase up0; + //ConditionalTranslationModel<PhraseConditionalUninformativeBase> tmodel; + //ConditionalTranslationModel<PhraseConditionalUninformativeUnigramBase> tmodel; + //ConditionalTranslationModel<UnigramWordBase> tmodel; + //ConditionalTranslationModel<HierarchicalUnigramBase> tmodel; + MConditionalTranslationModel<HierarchicalWordBase> tmodel; + //ConditionalTranslationModel<FixedNgramBase> tmodel; + //ConditionalTranslationModel<CompletelyUniformBase> tmodel; +}; + +void BasicLexicalAlignment::ResampleCorpus() { + static const WordID kNULL = TD::Convert("NULL"); + for (unsigned i = 0; i < corpus.size(); ++i) { + AlignedSentencePair& asp = corpus[i]; + SampleSet<prob_t> ss; ss.resize(asp.src.size() + 1); + for (unsigned j = 0; j < asp.trg.size(); ++j) { + TRule r; + unsigned char& a_j = asp.a[j].src_index; + WordID f_a_j = (a_j ? 
asp.src[a_j - 1] : kNULL); + InstantiateRule(f_a_j, asp.trg[j], &r); + if (tmodel.DecrementRule(r, &*prng)) + up0.Decrement(r); + + for (unsigned prop_a_j = 0; prop_a_j <= asp.src.size(); ++prop_a_j) { + const WordID prop_f = (prop_a_j ? asp.src[prop_a_j - 1] : kNULL); + InstantiateRule(prop_f, asp.trg[j], &r); + ss[prop_a_j] = tmodel.RuleProbability(r); + } + a_j = prng->SelectSample(ss); + f_a_j = (a_j ? asp.src[a_j - 1] : kNULL); + InstantiateRule(f_a_j, asp.trg[j], &r); + if (tmodel.IncrementRule(r, &*prng)) + up0.Increment(r); + } + } + cerr << " LLH = " << Likelihood() << endl; +} + +void ExtractLetters(const set<WordID>& v, vector<vector<WordID> >* l, set<WordID>* letset = NULL) { + for (set<WordID>::const_iterator it = v.begin(); it != v.end(); ++it) { + vector<WordID>& letters = (*l)[*it]; + if (letters.size()) continue; // if e and f have the same word + + const string& w = TD::Convert(*it); + + size_t cur = 0; + while (cur < w.size()) { + const size_t len = UTF8Len(w[cur]); + letters.push_back(TD::Convert(w.substr(cur, len))); + if (letset) letset->insert(letters.back()); + cur += len; + } + } +} + +void Debug(const AlignedSentencePair& asp) { + cerr << TD::GetString(asp.src) << endl << TD::GetString(asp.trg) << endl; + Array2D<bool> a(asp.src.size(), asp.trg.size()); + for (unsigned j = 0; j < asp.trg.size(); ++j) + if (asp.a[j].src_index) a(asp.a[j].src_index - 1, j) = true; + cerr << a << endl; +} + +void AddSample(AlignedSentencePair* asp) { + for (unsigned j = 0; j < asp->trg.size(); ++j) + asp->posterior(asp->a[j].src_index, j)++; +} + +void WriteAlignments(const AlignedSentencePair& asp) { + bool first = true; + for (unsigned j = 0; j < asp.trg.size(); ++j) { + int src_index = -1; + int mc = -1; + for (unsigned i = 0; i <= asp.src.size(); ++i) { + if (asp.posterior(i, j) > mc) { + mc = asp.posterior(i, j); + src_index = i; + } + } + + if (src_index) { + if (first) first = false; else cout << ' '; + cout << (src_index - 1) << '-' << j; + } + } + cout << endl; +} + +int main(int argc, char** argv) { + po::variables_map conf; + InitCommandLine(argc, argv, &conf); + + if (conf.count("random_seed")) + prng.reset(new MT19937(conf["random_seed"].as<uint32_t>())); + else + prng.reset(new MT19937); +// MT19937& rng = *prng; + + vector<vector<int> > corpuse, corpusf; + set<int> vocabe, vocabf; + corpus::ReadParallelCorpus(conf["input"].as<string>(), &corpusf, &corpuse, &vocabf, &vocabe); + cerr << "f-Corpus size: " << corpusf.size() << " sentences\n"; + cerr << "f-Vocabulary size: " << vocabf.size() << " types\n"; + cerr << "f-Corpus size: " << corpuse.size() << " sentences\n"; + cerr << "f-Vocabulary size: " << vocabe.size() << " types\n"; + assert(corpusf.size() == corpuse.size()); + + vector<AlignedSentencePair> corpus(corpuse.size()); + for (unsigned i = 0; i < corpuse.size(); ++i) { + corpus[i].src.swap(corpusf[i]); + corpus[i].trg.swap(corpuse[i]); + corpus[i].posterior.resize(corpus[i].src.size() + 1, corpus[i].trg.size()); + } + corpusf.clear(); corpuse.clear(); + + vocabf.insert(TD::Convert("NULL")); + vector<vector<WordID> > letters(TD::NumWords() + 1); + set<WordID> letset; + ExtractLetters(vocabe, &letters, &letset); + ExtractLetters(vocabf, &letters, NULL); + letters[TD::Convert("NULL")].clear(); + + // TODO configure this + const int max_src_chunk = conf["max_src_chunk"].as<unsigned>(); + const int max_trg_chunk = conf["max_trg_chunk"].as<unsigned>(); + const double s2t_rat = conf["expected_src_to_trg_ratio"].as<double>(); + const BackwardEstimator 
be(conf["s2t"].as<string>(), conf["t2s"].as<string>()); + Transliterations tl(max_src_chunk, max_trg_chunk, s2t_rat, be); + + cerr << "Initializing transliteration graph structures ...\n"; + for (int i = 0; i < corpus.size(); ++i) { + const vector<int>& src = corpus[i].src; + const vector<int>& trg = corpus[i].trg; + for (int j = 0; j < src.size(); ++j) { + const vector<int>& src_let = letters[src[j]]; + for (int k = 0; k < trg.size(); ++k) { + const vector<int>& trg_let = letters[trg[k]]; + tl.Initialize(src[j], src_let, trg[k], trg_let); + //if (src_let.size() < min_trans_src) + // tl.Forbid(src[j], src_let, trg[k], trg_let); + } + } + } + cerr << endl; + tl.GraphSummary(); + + return 0; +} diff --git a/gi/pf/backward.cc b/gi/pf/backward.cc new file mode 100644 index 00000000..b92629fd --- /dev/null +++ b/gi/pf/backward.cc @@ -0,0 +1,89 @@ +#include "backward.h" + +#include <queue> +#include <utility> + +#include "array2d.h" +#include "reachability.h" +#include "base_distributions.h" + +using namespace std; + +BackwardEstimator::BackwardEstimator(const string& s2t, + const string& t2s) : m1(new Model1(s2t)), m1inv(new Model1(t2s)) {} + +BackwardEstimator::~BackwardEstimator() { + delete m1; m1 = NULL; + delete m1inv; m1inv = NULL; +} + +float BackwardEstimator::ComputeBackwardProb(const std::vector<WordID>& src, + const std::vector<WordID>& trg, + unsigned src_covered, + unsigned trg_covered, + double s2t_ratio) const { + if (src_covered == src.size() || trg_covered == trg.size()) { + assert(src_covered == src.size()); + assert(trg_covered == trg.size()); + return 0; + } + static const WordID kNULL = TD::Convert("<eps>"); + const prob_t uniform_alignment(1.0 / (src.size() - src_covered + 1)); + // TODO factor in expected length ratio + prob_t e; e.logeq(Md::log_poisson(trg.size() - trg_covered, (src.size() - src_covered) * s2t_ratio)); // p(trg len remaining | src len remaining) + for (unsigned j = trg_covered; j < trg.size(); ++j) { + prob_t p = (*m1)(kNULL, trg[j]) + prob_t(1e-12); + for (unsigned i = src_covered; i < src.size(); ++i) + p += (*m1)(src[i], trg[j]); + if (p.is_0()) { + cerr << "ERROR: p(" << TD::Convert(trg[j]) << " | " << TD::GetString(src) << ") = 0!\n"; + assert(!"failed"); + } + p *= uniform_alignment; + e *= p; + } + // TODO factor in expected length ratio + const prob_t inv_uniform(1.0 / (trg.size() - trg_covered + 1.0)); + prob_t inv; + inv.logeq(Md::log_poisson(src.size() - src_covered, (trg.size() - trg_covered) / s2t_ratio)); + for (unsigned i = src_covered; i < src.size(); ++i) { + prob_t p = (*m1inv)(kNULL, src[i]) + prob_t(1e-12); + for (unsigned j = trg_covered; j < trg.size(); ++j) + p += (*m1inv)(trg[j], src[i]); + if (p.is_0()) { + cerr << "ERROR: p_inv(" << TD::Convert(src[i]) << " | " << TD::GetString(trg) << ") = 0!\n"; + assert(!"failed"); + } + p *= inv_uniform; + inv *= p; + } + return (log(e) + log(inv)) / 2; +} + +void BackwardEstimator::InitializeGrid(const vector<WordID>& src, + const vector<WordID>& trg, + const Reachability& r, + double s2t_ratio, + float* grid) const { + queue<pair<int,int> > q; + q.push(make_pair(0,0)); + Array2D<bool> done(src.size()+1, trg.size()+1, false); + //cerr << TD::GetString(src) << " ||| " << TD::GetString(trg) << endl; + while(!q.empty()) { + const pair<int,int> n = q.front(); + q.pop(); + if (done(n.first,n.second)) continue; + done(n.first,n.second) = true; + + float lp = ComputeBackwardProb(src, trg, n.first, n.second, s2t_ratio); + if (n.first == 0 && n.second == 0) grid[0] = lp; + //cerr << " " << n.first 
<< "," << n.second << "\t" << lp << endl; + + if (n.first == src.size() || n.second == trg.size()) continue; + const vector<pair<short,short> >& edges = r.valid_deltas[n.first][n.second]; + for (int i = 0; i < edges.size(); ++i) + q.push(make_pair(n.first + edges[i].first, n.second + edges[i].second)); + } + //static int cc = 0; ++cc; if (cc == 80) exit(1); +} + diff --git a/gi/pf/backward.h b/gi/pf/backward.h new file mode 100644 index 00000000..e67eff0c --- /dev/null +++ b/gi/pf/backward.h @@ -0,0 +1,33 @@ +#ifndef _BACKWARD_H_ +#define _BACKWARD_H_ + +#include <vector> +#include <string> +#include "wordid.h" + +struct Reachability; +struct Model1; + +struct BackwardEstimator { + BackwardEstimator(const std::string& s2t, + const std::string& t2s); + ~BackwardEstimator(); + + void InitializeGrid(const std::vector<WordID>& src, + const std::vector<WordID>& trg, + const Reachability& r, + double src2trg_ratio, + float* grid) const; + + private: + float ComputeBackwardProb(const std::vector<WordID>& src, + const std::vector<WordID>& trg, + unsigned src_covered, + unsigned trg_covered, + double src2trg_ratio) const; + + Model1* m1; + Model1* m1inv; +}; + +#endif diff --git a/gi/pf/base_measures.cc b/gi/pf/base_distributions.cc index 8adb37d7..d9761005 100644 --- a/gi/pf/base_measures.cc +++ b/gi/pf/base_distributions.cc @@ -1,4 +1,4 @@ -#include "base_measures.h" +#include "base_distributions.h" #include <iostream> @@ -6,6 +6,79 @@ using namespace std; +TableLookupBase::TableLookupBase(const string& fname) { + cerr << "TableLookupBase reading from " << fname << " ..." << endl; + ReadFile rf(fname); + istream& in = *rf.stream(); + string line; + unsigned lc = 0; + const WordID kDIV = TD::Convert("|||"); + vector<WordID> tmp; + vector<int> le, lf; + TRule x; + x.lhs_ = -TD::Convert("X"); + bool flag = false; + while(getline(in, line)) { + ++lc; + if (lc % 1000000 == 0) { cerr << " [" << lc << ']' << endl; flag = false; } + else if (lc % 25000 == 0) { cerr << '.' 
<< flush; flag = true; } + tmp.clear(); + TD::ConvertSentence(line, &tmp); + x.f_.clear(); + x.e_.clear(); + size_t pos = 0; + int cc = 0; + while(pos < tmp.size()) { + const WordID cur = tmp[pos++]; + if (cur == kDIV) { + ++cc; + } else if (cc == 0) { + x.f_.push_back(cur); + } else if (cc == 1) { + x.e_.push_back(cur); + } else if (cc == 2) { + table[x].logeq(atof(TD::Convert(cur))); + ++cc; + } else { + if (flag) cerr << endl; + cerr << "Bad format in " << lc << ": " << line << endl; abort(); + } + } + if (cc != 3) { + if (flag) cerr << endl; + cerr << "Bad format in " << lc << ": " << line << endl; abort(); + } + } + if (flag) cerr << endl; + cerr << " read " << lc << " entries\n"; +} + +prob_t PhraseConditionalUninformativeUnigramBase::p0(const vector<WordID>& vsrc, + const vector<WordID>& vtrg, + int start_src, int start_trg) const { + const int flen = vsrc.size() - start_src; + const int elen = vtrg.size() - start_trg; + prob_t p; + p.logeq(Md::log_poisson(elen, flen + 0.01)); // elen | flen ~Pois(flen + 0.01) + //p.logeq(log_poisson(elen, 1)); // elen | flen ~Pois(flen + 0.01) + for (int i = 0; i < elen; ++i) + p *= u(vtrg[i + start_trg]); // draw e_i ~Uniform + return p; +} + +prob_t PhraseConditionalUninformativeBase::p0(const vector<WordID>& vsrc, + const vector<WordID>& vtrg, + int start_src, int start_trg) const { + const int flen = vsrc.size() - start_src; + const int elen = vtrg.size() - start_trg; + prob_t p; + //p.logeq(log_poisson(elen, flen + 0.01)); // elen | flen ~Pois(flen + 0.01) + p.logeq(Md::log_poisson(elen, 1)); // elen | flen ~Pois(flen + 0.01) + for (int i = 0; i < elen; ++i) + p *= kUNIFORM_TARGET; // draw e_i ~Uniform + return p; +} + void Model1::LoadModel1(const string& fname) { cerr << "Loading Model 1 parameters from " << fname << " ..." 
<< endl; ReadFile rf(fname); @@ -40,7 +113,7 @@ prob_t PhraseConditionalBase::p0(const vector<WordID>& vsrc, const int elen = vtrg.size() - start_trg; prob_t uniform_src_alignment; uniform_src_alignment.logeq(-log(flen + 1)); prob_t p; - p.logeq(log_poisson(elen, flen + 0.01)); // elen | flen ~Pois(flen + 0.01) + p.logeq(Md::log_poisson(elen, flen + 0.01)); // elen | flen ~Pois(flen + 0.01) for (int i = 0; i < elen; ++i) { // for each position i in e-RHS const WordID trg = vtrg[i + start_trg]; prob_t tp = prob_t::Zero(); @@ -66,9 +139,9 @@ prob_t PhraseJointBase::p0(const vector<WordID>& vsrc, const int elen = vtrg.size() - start_trg; prob_t uniform_src_alignment; uniform_src_alignment.logeq(-log(flen + 1)); prob_t p; - p.logeq(log_poisson(flen, 1.0)); // flen ~Pois(1) + p.logeq(Md::log_poisson(flen, 1.0)); // flen ~Pois(1) // elen | flen ~Pois(flen + 0.01) - prob_t ptrglen; ptrglen.logeq(log_poisson(elen, flen + 0.01)); + prob_t ptrglen; ptrglen.logeq(Md::log_poisson(elen, flen + 0.01)); p *= ptrglen; p *= kUNIFORM_SOURCE.pow(flen); // each f in F ~Uniform for (int i = 0; i < elen; ++i) { // for each position i in E @@ -98,9 +171,9 @@ prob_t PhraseJointBase_BiDir::p0(const vector<WordID>& vsrc, prob_t uniform_trg_alignment; uniform_trg_alignment.logeq(-log(elen + 1)); prob_t p1; - p1.logeq(log_poisson(flen, 1.0)); // flen ~Pois(1) + p1.logeq(Md::log_poisson(flen, 1.0)); // flen ~Pois(1) // elen | flen ~Pois(flen + 0.01) - prob_t ptrglen; ptrglen.logeq(log_poisson(elen, flen + 0.01)); + prob_t ptrglen; ptrglen.logeq(Md::log_poisson(elen, flen + 0.01)); p1 *= ptrglen; p1 *= kUNIFORM_SOURCE.pow(flen); // each f in F ~Uniform for (int i = 0; i < elen; ++i) { // for each position i in E @@ -120,9 +193,9 @@ prob_t PhraseJointBase_BiDir::p0(const vector<WordID>& vsrc, } prob_t p2; - p2.logeq(log_poisson(elen, 1.0)); // elen ~Pois(1) + p2.logeq(Md::log_poisson(elen, 1.0)); // elen ~Pois(1) // flen | elen ~Pois(flen + 0.01) - prob_t psrclen; psrclen.logeq(log_poisson(flen, elen + 0.01)); + prob_t psrclen; psrclen.logeq(Md::log_poisson(flen, elen + 0.01)); p2 *= psrclen; p2 *= kUNIFORM_TARGET.pow(elen); // each f in F ~Uniform for (int i = 0; i < flen; ++i) { // for each position i in E @@ -154,9 +227,9 @@ JumpBase::JumpBase() : p(200) { for (int j = min_jump; j <= max_jump; ++j) { prob_t& cp = cpd[j]; if (j < 0) - cp.logeq(log_poisson(1.5-j, 1)); + cp.logeq(Md::log_poisson(1.5-j, 1)); else if (j > 0) - cp.logeq(log_poisson(j, 1)); + cp.logeq(Md::log_poisson(j, 1)); cp.poweq(0.2); z += cp; } diff --git a/gi/pf/base_measures.h b/gi/pf/base_distributions.h index 7ce7e2e6..84dacdf2 100644 --- a/gi/pf/base_measures.h +++ b/gi/pf/base_distributions.h @@ -6,22 +6,15 @@ #include <string> #include <cmath> #include <iostream> +#include <cassert> +#include "unigrams.h" #include "trule.h" #include "prob.h" #include "tdict.h" - -inline double log_poisson(unsigned x, const double& lambda) { - assert(lambda > 0.0); - return log(lambda) * x - lgamma(x + 1) - lambda; -} - -inline std::ostream& operator<<(std::ostream& os, const std::vector<WordID>& p) { - os << '['; - for (int i = 0; i < p.size(); ++i) - os << (i==0 ? 
"" : " ") << TD::Convert(p[i]); - return os << ']'; -} +#include "sampler.h" +#include "m.h" +#include "os_phrase.h" struct Model1 { explicit Model1(const std::string& fname) : @@ -49,6 +42,104 @@ struct Model1 { std::vector<std::map<WordID, prob_t> > ttable; }; +struct PoissonUniformUninformativeBase { + explicit PoissonUniformUninformativeBase(const unsigned ves) : kUNIFORM(1.0 / ves) {} + prob_t operator()(const TRule& r) const { + prob_t p; p.logeq(Md::log_poisson(r.e_.size(), 1.0)); + prob_t q = kUNIFORM; q.poweq(r.e_.size()); + p *= q; + return p; + } + void Summary() const {} + void ResampleHyperparameters(MT19937*) {} + void Increment(const TRule&) {} + void Decrement(const TRule&) {} + prob_t Likelihood() const { return prob_t::One(); } + const prob_t kUNIFORM; +}; + +struct CompletelyUniformBase { + explicit CompletelyUniformBase(const unsigned ves) : kUNIFORM(1.0 / ves) {} + prob_t operator()(const TRule&) const { + return kUNIFORM; + } + void Summary() const {} + void ResampleHyperparameters(MT19937*) {} + void Increment(const TRule&) {} + void Decrement(const TRule&) {} + prob_t Likelihood() const { return prob_t::One(); } + const prob_t kUNIFORM; +}; + +struct UnigramWordBase { + explicit UnigramWordBase(const std::string& fname) : un(fname) {} + prob_t operator()(const TRule& r) const { + return un(r.e_); + } + const UnigramWordModel un; +}; + +struct RuleHasher { + size_t operator()(const TRule& r) const { + return hash_value(r); + } +}; + +struct TableLookupBase { + TableLookupBase(const std::string& fname); + + prob_t operator()(const TRule& rule) const { + const std::tr1::unordered_map<TRule,prob_t>::const_iterator it = table.find(rule); + if (it == table.end()) { + std::cerr << rule << " not found\n"; + abort(); + } + return it->second; + } + + void ResampleHyperparameters(MT19937*) {} + void Increment(const TRule&) {} + void Decrement(const TRule&) {} + prob_t Likelihood() const { return prob_t::One(); } + void Summary() const {} + + std::tr1::unordered_map<TRule,prob_t,RuleHasher> table; +}; + +struct PhraseConditionalUninformativeBase { + explicit PhraseConditionalUninformativeBase(const unsigned vocab_e_size) : + kUNIFORM_TARGET(1.0 / vocab_e_size) { + assert(vocab_e_size > 0); + } + + // return p0 of rule.e_ | rule.f_ + prob_t operator()(const TRule& rule) const { + return p0(rule.f_, rule.e_, 0, 0); + } + + prob_t p0(const std::vector<WordID>& vsrc, const std::vector<WordID>& vtrg, int start_src, int start_trg) const; + + void Summary() const {} + void ResampleHyperparameters(MT19937*) {} + void Increment(const TRule&) {} + void Decrement(const TRule&) {} + prob_t Likelihood() const { return prob_t::One(); } + const prob_t kUNIFORM_TARGET; +}; + +struct PhraseConditionalUninformativeUnigramBase { + explicit PhraseConditionalUninformativeUnigramBase(const std::string& file, const unsigned vocab_e_size) : u(file, vocab_e_size) {} + + // return p0 of rule.e_ | rule.f_ + prob_t operator()(const TRule& rule) const { + return p0(rule.f_, rule.e_, 0, 0); + } + + prob_t p0(const std::vector<WordID>& vsrc, const std::vector<WordID>& vtrg, int start_src, int start_trg) const; + + const UnigramModel u; +}; + struct PhraseConditionalBase { explicit PhraseConditionalBase(const Model1& m1, const double m1mixture, const unsigned vocab_e_size) : model1(m1), @@ -83,7 +174,7 @@ struct PhraseJointBase { assert(vocab_e_size > 0); } - // return p0 of rule.e_ | rule.f_ + // return p0 of rule.e_ , rule.f_ prob_t operator()(const TRule& rule) const { return p0(rule.f_, rule.e_, 0, 0); } 
@@ -113,7 +204,7 @@ struct PhraseJointBase_BiDir { assert(vocab_e_size > 0); } - // return p0 of rule.e_ | rule.f_ + // return p0 of rule.e_ , rule.f_ prob_t operator()(const TRule& rule) const { return p0(rule.f_, rule.e_, 0, 0); } diff --git a/gi/pf/brat.cc b/gi/pf/brat.cc index 7b60ef23..c2c52760 100644 --- a/gi/pf/brat.cc +++ b/gi/pf/brat.cc @@ -191,7 +191,7 @@ struct UniphraseLM { void ResampleHyperparameters(MT19937* rng) { phrases_.resample_hyperparameters(rng); gen_.resample_hyperparameters(rng); - cerr << " " << phrases_.concentration(); + cerr << " " << phrases_.alpha(); } CCRP_NoTable<vector<int> > phrases_; diff --git a/gi/pf/conditional_pseg.h b/gi/pf/conditional_pseg.h new file mode 100644 index 00000000..81ddb206 --- /dev/null +++ b/gi/pf/conditional_pseg.h @@ -0,0 +1,275 @@ +#ifndef _CONDITIONAL_PSEG_H_ +#define _CONDITIONAL_PSEG_H_ + +#include <vector> +#include <tr1/unordered_map> +#include <boost/functional/hash.hpp> +#include <iostream> + +#include "m.h" +#include "prob.h" +#include "ccrp_nt.h" +#include "mfcr.h" +#include "trule.h" +#include "base_distributions.h" +#include "tdict.h" + +template <typename ConditionalBaseMeasure> +struct MConditionalTranslationModel { + explicit MConditionalTranslationModel(ConditionalBaseMeasure& rcp0) : + rp0(rcp0), d(0.5), strength(1.0), lambdas(1, prob_t::One()), p0s(1) {} + + void Summary() const { + std::cerr << "Number of conditioning contexts: " << r.size() << std::endl; + for (RuleModelHash::const_iterator it = r.begin(); it != r.end(); ++it) { + std::cerr << TD::GetString(it->first) << " \t(d=" << it->second.discount() << ",s=" << it->second.strength() << ") --------------------------" << std::endl; + for (MFCR<1,TRule>::const_iterator i2 = it->second.begin(); i2 != it->second.end(); ++i2) + std::cerr << " " << i2->second.total_dish_count_ << '\t' << i2->first << std::endl; + } + } + + double log_likelihood(const double& dd, const double& aa) const { + if (aa <= -dd) return -std::numeric_limits<double>::infinity(); + //double llh = Md::log_beta_density(dd, 10, 3) + Md::log_gamma_density(aa, 1, 1); + double llh = Md::log_beta_density(dd, 1, 1) + + Md::log_gamma_density(dd + aa, 1, 1); + typename std::tr1::unordered_map<std::vector<WordID>, MFCR<1,TRule>, boost::hash<std::vector<WordID> > >::const_iterator it; + for (it = r.begin(); it != r.end(); ++it) + llh += it->second.log_crp_prob(dd, aa); + return llh; + } + + struct DiscountResampler { + DiscountResampler(const MConditionalTranslationModel& m) : m_(m) {} + const MConditionalTranslationModel& m_; + double operator()(const double& proposed_discount) const { + return m_.log_likelihood(proposed_discount, m_.strength); + } + }; + + struct AlphaResampler { + AlphaResampler(const MConditionalTranslationModel& m) : m_(m) {} + const MConditionalTranslationModel& m_; + double operator()(const double& proposed_strength) const { + return m_.log_likelihood(m_.d, proposed_strength); + } + }; + + void ResampleHyperparameters(MT19937* rng) { + typename std::tr1::unordered_map<std::vector<WordID>, MFCR<1,TRule>, boost::hash<std::vector<WordID> > >::iterator it; +#if 1 + for (it = r.begin(); it != r.end(); ++it) { + it->second.resample_hyperparameters(rng); + } +#else + const unsigned nloop = 5; + const unsigned niterations = 10; + DiscountResampler dr(*this); + AlphaResampler ar(*this); + for (int iter = 0; iter < nloop; ++iter) { + strength = slice_sampler1d(ar, strength, *rng, -d + std::numeric_limits<double>::min(), + std::numeric_limits<double>::infinity(), 0.0, niterations, 
100*niterations); + double min_discount = std::numeric_limits<double>::min(); + if (strength < 0.0) min_discount -= strength; + d = slice_sampler1d(dr, d, *rng, min_discount, + 1.0, 0.0, niterations, 100*niterations); + } + strength = slice_sampler1d(ar, strength, *rng, -d, + std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations); + std::cerr << "MConditionalTranslationModel(d=" << d << ",s=" << strength << ") = " << log_likelihood(d, strength) << std::endl; + for (it = r.begin(); it != r.end(); ++it) { + it->second.set_discount(d); + it->second.set_strength(strength); + } +#endif + } + + int DecrementRule(const TRule& rule, MT19937* rng) { + RuleModelHash::iterator it = r.find(rule.f_); + assert(it != r.end()); + const TableCount delta = it->second.decrement(rule, rng); + if (delta.count) { + if (it->second.num_customers() == 0) r.erase(it); + } + return delta.count; + } + + int IncrementRule(const TRule& rule, MT19937* rng) { + RuleModelHash::iterator it = r.find(rule.f_); + if (it == r.end()) { + //it = r.insert(make_pair(rule.f_, MFCR<1,TRule>(d, strength))).first; + it = r.insert(make_pair(rule.f_, MFCR<1,TRule>(1,1,1,1,0.6, -0.12))).first; + } + p0s[0] = rp0(rule); + TableCount delta = it->second.increment(rule, p0s.begin(), lambdas.begin(), rng); + return delta.count; + } + + prob_t RuleProbability(const TRule& rule) const { + prob_t p; + RuleModelHash::const_iterator it = r.find(rule.f_); + if (it == r.end()) { + p = rp0(rule); + } else { + p0s[0] = rp0(rule); + p = it->second.prob(rule, p0s.begin(), lambdas.begin()); + } + return p; + } + + prob_t Likelihood() const { + prob_t p; p.logeq(log_likelihood(d, strength)); + return p; + } + + const ConditionalBaseMeasure& rp0; + typedef std::tr1::unordered_map<std::vector<WordID>, + MFCR<1, TRule>, + boost::hash<std::vector<WordID> > > RuleModelHash; + RuleModelHash r; + double d, strength; + std::vector<prob_t> lambdas; + mutable std::vector<prob_t> p0s; +}; + +template <typename ConditionalBaseMeasure> +struct ConditionalTranslationModel { + explicit ConditionalTranslationModel(ConditionalBaseMeasure& rcp0) : + rp0(rcp0) {} + + void Summary() const { + std::cerr << "Number of conditioning contexts: " << r.size() << std::endl; + for (RuleModelHash::const_iterator it = r.begin(); it != r.end(); ++it) { + std::cerr << TD::GetString(it->first) << " \t(\\alpha = " << it->second.alpha() << ") --------------------------" << std::endl; + for (CCRP_NoTable<TRule>::const_iterator i2 = it->second.begin(); i2 != it->second.end(); ++i2) + std::cerr << " " << i2->second << '\t' << i2->first << std::endl; + } + } + + void ResampleHyperparameters(MT19937* rng) { + for (RuleModelHash::iterator it = r.begin(); it != r.end(); ++it) + it->second.resample_hyperparameters(rng); + } + + int DecrementRule(const TRule& rule) { + RuleModelHash::iterator it = r.find(rule.f_); + assert(it != r.end()); + int count = it->second.decrement(rule); + if (count) { + if (it->second.num_customers() == 0) r.erase(it); + } + return count; + } + + int IncrementRule(const TRule& rule) { + RuleModelHash::iterator it = r.find(rule.f_); + if (it == r.end()) { + it = r.insert(make_pair(rule.f_, CCRP_NoTable<TRule>(1.0, 1.0, 8.0))).first; + } + int count = it->second.increment(rule); + return count; + } + + void IncrementRules(const std::vector<TRulePtr>& rules) { + for (int i = 0; i < rules.size(); ++i) + IncrementRule(*rules[i]); + } + + void DecrementRules(const std::vector<TRulePtr>& rules) { + for (int i = 0; i < rules.size(); ++i) + 
DecrementRule(*rules[i]); + } + + prob_t RuleProbability(const TRule& rule) const { + prob_t p; + RuleModelHash::const_iterator it = r.find(rule.f_); + if (it == r.end()) { + p.logeq(log(rp0(rule))); + } else { + p.logeq(it->second.logprob(rule, log(rp0(rule)))); + } + return p; + } + + prob_t Likelihood() const { + prob_t p = prob_t::One(); + for (RuleModelHash::const_iterator it = r.begin(); it != r.end(); ++it) { + prob_t q; q.logeq(it->second.log_crp_prob()); + p *= q; + for (CCRP_NoTable<TRule>::const_iterator i2 = it->second.begin(); i2 != it->second.end(); ++i2) + p *= rp0(i2->first); + } + return p; + } + + const ConditionalBaseMeasure& rp0; + typedef std::tr1::unordered_map<std::vector<WordID>, + CCRP_NoTable<TRule>, + boost::hash<std::vector<WordID> > > RuleModelHash; + RuleModelHash r; +}; + +template <typename ConditionalBaseMeasure> +struct ConditionalParallelSegementationModel { + explicit ConditionalParallelSegementationModel(ConditionalBaseMeasure& rcp0) : + tmodel(rcp0), base(prob_t::One()), aligns(1,1) {} + + ConditionalTranslationModel<ConditionalBaseMeasure> tmodel; + + void DecrementRule(const TRule& rule) { + tmodel.DecrementRule(rule); + } + + void IncrementRule(const TRule& rule) { + tmodel.IncrementRule(rule); + } + + void IncrementRulesAndAlignments(const std::vector<TRulePtr>& rules) { + tmodel.IncrementRules(rules); + for (int i = 0; i < rules.size(); ++i) { + IncrementAlign(rules[i]->f_.size()); + } + } + + void DecrementRulesAndAlignments(const std::vector<TRulePtr>& rules) { + tmodel.DecrementRules(rules); + for (int i = 0; i < rules.size(); ++i) { + DecrementAlign(rules[i]->f_.size()); + } + } + + prob_t RuleProbability(const TRule& rule) const { + return tmodel.RuleProbability(rule); + } + + void IncrementAlign(unsigned span) { + if (aligns.increment(span)) { + // TODO + } + } + + void DecrementAlign(unsigned span) { + if (aligns.decrement(span)) { + // TODO + } + } + + prob_t AlignProbability(unsigned span) const { + prob_t p; + p.logeq(aligns.logprob(span, Md::log_poisson(span, 1.0))); + return p; + } + + prob_t Likelihood() const { + prob_t p; p.logeq(aligns.log_crp_prob()); + p *= base; + p *= tmodel.Likelihood(); + return p; + } + + prob_t base; + CCRP_NoTable<unsigned> aligns; +}; + +#endif + diff --git a/gi/pf/condnaive.cc b/gi/pf/condnaive.cc new file mode 100644 index 00000000..3ea88016 --- /dev/null +++ b/gi/pf/condnaive.cc @@ -0,0 +1,298 @@ +#include <iostream> +#include <tr1/memory> +#include <queue> + +#include <boost/multi_array.hpp> +#include <boost/program_options.hpp> +#include <boost/program_options/variables_map.hpp> + +#include "base_distributions.h" +#include "monotonic_pseg.h" +#include "conditional_pseg.h" +#include "trule.h" +#include "tdict.h" +#include "filelib.h" +#include "dict.h" +#include "sampler.h" +#include "ccrp_nt.h" +#include "corpus.h" + +using namespace std; +using namespace std::tr1; +namespace po = boost::program_options; + +static unsigned kMAX_SRC_PHRASE; +static unsigned kMAX_TRG_PHRASE; + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { + po::options_description opts("Configuration options"); + opts.add_options() + ("samples,s",po::value<unsigned>()->default_value(1000),"Number of samples") + ("input,i",po::value<string>(),"Read parallel data from") + ("max_src_phrase",po::value<unsigned>()->default_value(4),"Maximum length of source language phrases") + ("max_trg_phrase",po::value<unsigned>()->default_value(4),"Maximum length of target language phrases") + 
("model1,m",po::value<string>(),"Model 1 parameters (used in base distribution)") + ("model1_interpolation_weight",po::value<double>()->default_value(0.95),"Mixing proportion of model 1 with uniform target distribution") + ("random_seed,S",po::value<uint32_t>(), "Random seed"); + po::options_description clo("Command line options"); + clo.add_options() + ("config", po::value<string>(), "Configuration file") + ("help,h", "Print this help message and exit"); + po::options_description dconfig_options, dcmdline_options; + dconfig_options.add(opts); + dcmdline_options.add(opts).add(clo); + + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + if (conf->count("config")) { + ifstream config((*conf)["config"].as<string>().c_str()); + po::store(po::parse_config_file(config, dconfig_options), *conf); + } + po::notify(*conf); + + if (conf->count("help") || (conf->count("input") == 0)) { + cerr << dcmdline_options << endl; + exit(1); + } +} + +shared_ptr<MT19937> prng; + +struct ModelAndData { + explicit ModelAndData(ConditionalParallelSegementationModel<PhraseConditionalBase>& m, const vector<vector<int> >& ce, const vector<vector<int> >& cf, const set<int>& ve, const set<int>& vf) : + model(m), + rng(&*prng), + corpuse(ce), + corpusf(cf), + vocabe(ve), + vocabf(vf), + mh_samples(), + mh_rejects(), + kX(-TD::Convert("X")), + derivations(corpuse.size()) {} + + void ResampleHyperparameters() { + } + + void InstantiateRule(const pair<short,short>& from, + const pair<short,short>& to, + const vector<int>& sentf, + const vector<int>& sente, + TRule* rule) const { + rule->f_.clear(); + rule->e_.clear(); + rule->lhs_ = kX; + for (short i = from.first; i < to.first; ++i) + rule->f_.push_back(sentf[i]); + for (short i = from.second; i < to.second; ++i) + rule->e_.push_back(sente[i]); + } + + void DecrementDerivation(const vector<pair<short,short> >& d, const vector<int>& sentf, const vector<int>& sente) { + if (d.size() < 2) return; + TRule x; + for (int i = 1; i < d.size(); ++i) { + InstantiateRule(d[i], d[i-1], sentf, sente, &x); + model.DecrementRule(x); + model.DecrementAlign(x.f_.size()); + } + } + + void PrintDerivation(const vector<pair<short,short> >& d, const vector<int>& sentf, const vector<int>& sente) { + if (d.size() < 2) return; + TRule x; + for (int i = 1; i < d.size(); ++i) { + InstantiateRule(d[i], d[i-1], sentf, sente, &x); + cerr << i << '/' << (d.size() - 1) << ": " << x << endl; + } + } + + void IncrementDerivation(const vector<pair<short,short> >& d, const vector<int>& sentf, const vector<int>& sente) { + if (d.size() < 2) return; + TRule x; + for (int i = 1; i < d.size(); ++i) { + InstantiateRule(d[i], d[i-1], sentf, sente, &x); + model.IncrementRule(x); + model.IncrementAlign(x.f_.size()); + } + } + + prob_t Likelihood() const { + return model.Likelihood(); + } + + prob_t DerivationProposalProbability(const vector<pair<short,short> >& d, const vector<int>& sentf, const vector<int>& sente) const { + prob_t p = prob_t::One(); + TRule x; + for (int i = 1; i < d.size(); ++i) { + InstantiateRule(d[i], d[i-1], sentf, sente, &x); + p *= model.RuleProbability(x); + p *= model.AlignProbability(x.f_.size()); + } + return p; + } + + void Sample(); + + ConditionalParallelSegementationModel<PhraseConditionalBase>& model; + MT19937* rng; + const vector<vector<int> >& corpuse, corpusf; + const set<int>& vocabe, vocabf; + unsigned mh_samples, mh_rejects; + const int kX; + vector<vector<pair<short, short> > > derivations; +}; + +void ModelAndData::Sample() { + unsigned MAXK = 
kMAX_SRC_PHRASE; + unsigned MAXL = kMAX_TRG_PHRASE; + TRule x; + x.lhs_ = -TD::Convert("X"); + + for (int samples = 0; samples < 1000; ++samples) { + if (samples % 1 == 0 && samples > 0) { + //ResampleHyperparameters(); + cerr << " [" << samples << " LLH=" << log(Likelihood()) << " MH=" << ((double)mh_rejects / mh_samples) << "]\n"; + for (int i = 0; i < 10; ++i) { + cerr << "SENTENCE: " << TD::GetString(corpusf[i]) << " ||| " << TD::GetString(corpuse[i]) << endl; + PrintDerivation(derivations[i], corpusf[i], corpuse[i]); + } + static TRule xx("[X] ||| w n ||| s h ||| X=0"); + const CCRP_NoTable<TRule>& dcrp = model.tmodel.r.find(xx.f_)->second; + for (CCRP_NoTable<TRule>::const_iterator it = dcrp.begin(); it != dcrp.end(); ++it) { + cerr << "\t" << it->second << "\t" << it->first << endl; + } + } + cerr << '.' << flush; + for (int s = 0; s < corpuse.size(); ++s) { + const vector<int>& sentf = corpusf[s]; + const vector<int>& sente = corpuse[s]; +// cerr << " CUSTOMERS: " << rules.num_customers() << endl; +// cerr << "SENTENCE: " << TD::GetString(sentf) << " ||| " << TD::GetString(sente) << endl; + + vector<pair<short, short> >& deriv = derivations[s]; + const prob_t p_cur = Likelihood(); + DecrementDerivation(deriv, sentf, sente); + + boost::multi_array<prob_t, 2> a(boost::extents[sentf.size() + 1][sente.size() + 1]); + boost::multi_array<prob_t, 4> trans(boost::extents[sentf.size() + 1][sente.size() + 1][MAXK][MAXL]); + a[0][0] = prob_t::One(); + for (int i = 0; i < sentf.size(); ++i) { + for (int j = 0; j < sente.size(); ++j) { + const prob_t src_a = a[i][j]; + x.f_.clear(); + for (int k = 1; k <= MAXK; ++k) { + if (i + k > sentf.size()) break; + x.f_.push_back(sentf[i + k - 1]); + x.e_.clear(); + const prob_t p_span = model.AlignProbability(k); // prob of consuming this much source + for (int l = 1; l <= MAXL; ++l) { + if (j + l > sente.size()) break; + x.e_.push_back(sente[j + l - 1]); + trans[i][j][k - 1][l - 1] = model.RuleProbability(x) * p_span; + a[i + k][j + l] += src_a * trans[i][j][k - 1][l - 1]; + } + } + } + } +// cerr << "Inside: " << log(a[sentf.size()][sente.size()]) << endl; + const prob_t q_cur = DerivationProposalProbability(deriv, sentf, sente); + + vector<pair<short,short> > newderiv; + int cur_i = sentf.size(); + int cur_j = sente.size(); + while(cur_i > 0 && cur_j > 0) { + newderiv.push_back(pair<short,short>(cur_i, cur_j)); +// cerr << "NODE: (" << cur_i << "," << cur_j << ")\n"; + SampleSet<prob_t> ss; + vector<pair<short,short> > nexts; + for (int k = 1; k <= MAXK; ++k) { + const int hyp_i = cur_i - k; + if (hyp_i < 0) break; + for (int l = 1; l <= MAXL; ++l) { + const int hyp_j = cur_j - l; + if (hyp_j < 0) break; + const prob_t& inside = a[hyp_i][hyp_j]; + if (inside == prob_t::Zero()) continue; + const prob_t& transp = trans[hyp_i][hyp_j][k - 1][l - 1]; + if (transp == prob_t::Zero()) continue; + const prob_t p = inside * transp; + ss.add(p); + nexts.push_back(pair<short,short>(hyp_i, hyp_j)); +// cerr << " (" << hyp_i << "," << hyp_j << ") <--- " << log(p) << endl; + } + } +// cerr << " sample set has " << nexts.size() << " elements.\n"; + const int selected = rng->SelectSample(ss); + cur_i = nexts[selected].first; + cur_j = nexts[selected].second; + } + newderiv.push_back(pair<short,short>(0,0)); + const prob_t q_new = DerivationProposalProbability(newderiv, sentf, sente); + IncrementDerivation(newderiv, sentf, sente); +// cerr << "SANITY: " << q_new << " " <<log(DerivationProposalProbability(newderiv, sentf, sente)) << endl; + if (deriv.empty()) { deriv = 
newderiv; continue; } + ++mh_samples; + + if (deriv != newderiv) { + const prob_t p_new = Likelihood(); +// cerr << "p_cur=" << log(p_cur) << "\t p_new=" << log(p_new) << endl; +// cerr << "q_cur=" << log(q_cur) << "\t q_new=" << log(q_new) << endl; + if (!rng->AcceptMetropolisHastings(p_new, p_cur, q_new, q_cur)) { + ++mh_rejects; + DecrementDerivation(newderiv, sentf, sente); + IncrementDerivation(deriv, sentf, sente); + } else { +// cerr << " ACCEPT\n"; + deriv = newderiv; + } + } + } + } +} + +int main(int argc, char** argv) { + po::variables_map conf; + InitCommandLine(argc, argv, &conf); + kMAX_TRG_PHRASE = conf["max_trg_phrase"].as<unsigned>(); + kMAX_SRC_PHRASE = conf["max_src_phrase"].as<unsigned>(); + + if (!conf.count("model1")) { + cerr << argv[0] << "Please use --model1 to specify model 1 parameters\n"; + return 1; + } + if (conf.count("random_seed")) + prng.reset(new MT19937(conf["random_seed"].as<uint32_t>())); + else + prng.reset(new MT19937); +// MT19937& rng = *prng; + + vector<vector<int> > corpuse, corpusf; + set<int> vocabe, vocabf; + corpus::ReadParallelCorpus(conf["input"].as<string>(), &corpusf, &corpuse, &vocabf, &vocabe); + cerr << "f-Corpus size: " << corpusf.size() << " sentences\n"; + cerr << "f-Vocabulary size: " << vocabf.size() << " types\n"; + cerr << "f-Corpus size: " << corpuse.size() << " sentences\n"; + cerr << "f-Vocabulary size: " << vocabe.size() << " types\n"; + assert(corpusf.size() == corpuse.size()); + + Model1 m1(conf["model1"].as<string>()); + + PhraseConditionalBase pcb0(m1, conf["model1_interpolation_weight"].as<double>(), vocabe.size()); + ConditionalParallelSegementationModel<PhraseConditionalBase> x(pcb0); + + ModelAndData posterior(x, corpuse, corpusf, vocabe, vocabf); + posterior.Sample(); + + TRule r1("[X] ||| x ||| l e ||| X=0"); + TRule r2("[X] ||| A ||| a d ||| X=0"); + TRule r3("[X] ||| n ||| e r ||| X=0"); + TRule r4("[X] ||| x A n ||| b l a g ||| X=0"); + + PhraseConditionalUninformativeBase u0(vocabe.size()); + + cerr << (pcb0(r1)*pcb0(r2)*pcb0(r3)) << endl; + cerr << (u0(r4)) << endl; + + return 0; +} + diff --git a/gi/pf/corpus.cc b/gi/pf/corpus.cc index a408e7cf..cb6e4ed7 100644 --- a/gi/pf/corpus.cc +++ b/gi/pf/corpus.cc @@ -24,11 +24,11 @@ void ReadParallelCorpus(const string& filename, istream* in = rf.stream(); assert(*in); string line; + unsigned lc = 0; const WordID kDIV = TD::Convert("|||"); vector<WordID> tmp; - while(*in) { - getline(*in, line); - if (line.empty() && !*in) break; + while(getline(*in, line)) { + ++lc; e->push_back(vector<int>()); f->push_back(vector<int>()); vector<int>& le = e->back(); @@ -39,12 +39,17 @@ void ReadParallelCorpus(const string& filename, for (unsigned i = 0; i < tmp.size(); ++i) { const int cur = tmp[i]; if (isf) { - if (kDIV == cur) { isf = false; } else { + if (kDIV == cur) { + isf = false; + } else { lf.push_back(cur); vocab_f->insert(cur); } } else { - assert(cur != kDIV); + if (cur == kDIV) { + cerr << "ERROR in " << lc << ": " << line << endl << endl; + abort(); + } le.push_back(cur); vocab_e->insert(cur); } diff --git a/gi/pf/dpnaive.cc b/gi/pf/dpnaive.cc index db1c43c7..469dff5c 100644 --- a/gi/pf/dpnaive.cc +++ b/gi/pf/dpnaive.cc @@ -6,7 +6,7 @@ #include <boost/program_options.hpp> #include <boost/program_options/variables_map.hpp> -#include "base_measures.h" +#include "base_distributions.h" #include "monotonic_pseg.h" #include "trule.h" #include "tdict.h" diff --git a/gi/pf/guess-translits.pl b/gi/pf/guess-translits.pl new file mode 100755 index 00000000..d00c2168 --- 
/dev/null +++ b/gi/pf/guess-translits.pl @@ -0,0 +1,72 @@ +#!/usr/bin/perl -w +use strict; +use utf8; + +my $MIN_PMI = -3; + +my %fs; +my %es; +my %ef; + +die "Usage: $0 < input.utf8.txt\n" if scalar @ARGV > 0; + +binmode(STDIN,":utf8"); +binmode(STDOUT,":utf8"); +binmode(STDERR,":utf8"); + +my $tot = 0; +print STDERR "Reading alignments from STDIN ...\n"; +while(<STDIN>) { + chomp; + my ($fsent, $esent, $alsent) = split / \|\|\| /; + die "Format should be 'foreign sentence ||| english sentence ||| 0-0 1-1 ...'\n" unless defined $fsent && defined $esent && defined $alsent; + + my @fws = split /\s+/, $fsent; + my @ews = split /\s+/, $esent; + my @as = split /\s+/, $alsent; + my %a2b; + my %b2a; + for my $ap (@as) { + my ($a,$b) = split /-/, $ap; + die "BAD INPUT: $_\n" unless defined $a && defined $b; + $a2b{$a}->{$b} = 1; + $b2a{$b}->{$a} = 1; + } + for my $a (keys %a2b) { + my $bref = $a2b{$a}; + next unless scalar keys %$bref < 2; + my $b = (keys %$bref)[0]; + next unless scalar keys %{$b2a{$b}} < 2; + my $f = $fws[$a]; + next unless defined $f; + next unless length($f) > 3; + my $e = $ews[$b]; + next unless defined $e; + next unless length($e) > 3; + + $ef{$f}->{$e}++; + $es{$e}++; + $fs{$f}++; + $tot++; + } +} +my $ltot = log($tot); +my $num = 0; +print STDERR "Extracting pairs for PMI > $MIN_PMI ...\n"; +for my $f (keys %fs) { + my $logf = log($fs{$f}); + my $esref = $ef{$f}; + for my $e (keys %$esref) { + my $loge = log($es{$e}); + my $ef = $esref->{$e}; + my $logef = log($ef); + my $pmi = $logef - ($loge + $logf); + next if $pmi < $MIN_PMI; + my @flets = split //, $f; + my @elets = split //, $e; + print "@flets ||| @elets\n"; + $num++; + } +} +print STDERR "Extracted $num pairs.\n"; +print STDERR "Recommend running:\n ../../training/model1 -v -d -t -99999 output.txt\n"; diff --git a/gi/pf/itg.cc b/gi/pf/itg.cc index ac3c16a3..a38fe672 100644 --- a/gi/pf/itg.cc +++ b/gi/pf/itg.cc @@ -27,10 +27,67 @@ ostream& operator<<(ostream& os, const vector<WordID>& p) { return os << ']'; } -double log_poisson(unsigned x, const double& lambda) { - assert(lambda > 0.0); - return log(lambda) * x - lgamma(x + 1) - lambda; -} +struct UnigramModel { + explicit UnigramModel(const string& fname, unsigned vocab_size, double p0null = 0.05) : + use_uniform_(fname.size() == 0), + p0null_(p0null), + uniform_((1.0 - p0null) / vocab_size), + probs_(TD::NumWords() + 1) { + if (fname.size() > 0) LoadUnigrams(fname); + probs_[0] = p0null_; + } + +// +// \data\ +// ngram 1=9295 +// +// \1-grams: +// -3.191193 " + + void LoadUnigrams(const string& fname) { + cerr << "Loading unigram probabilities from " << fname << " ..." 
<< endl; + ReadFile rf(fname); + string line; + istream& in = *rf.stream(); + assert(in); + getline(in, line); + assert(line.empty()); + getline(in, line); + assert(line == "\\data\\"); + getline(in, line); + size_t pos = line.find("ngram 1="); + assert(pos == 0); + assert(line.size() > 8); + const size_t num_unigrams = atoi(&line[8]); + getline(in, line); + assert(line.empty()); + getline(in, line); + assert(line == "\\1-grams:"); + for (size_t i = 0; i < num_unigrams; ++i) { + getline(in, line); + assert(line.size() > 0); + pos = line.find('\t'); + assert(pos > 0); + assert(pos + 1 < line.size()); + const WordID w = TD::Convert(line.substr(pos + 1)); + line[pos] = 0; + float p = atof(&line[0]); + const prob_t pnon_null(1.0 - p0null_.as_float()); + if (w < probs_.size()) probs_[w].logeq(p * log(10) + log(pnon_null)); else abort(); + } + } + + const prob_t& operator()(const WordID& w) const { + if (!w) return p0null_; + if (use_uniform_) return uniform_; + return probs_[w]; + } + + const bool use_uniform_; + const prob_t p0null_; + const prob_t uniform_; + vector<prob_t> probs_; +}; struct Model1 { explicit Model1(const string& fname) : @@ -89,11 +146,11 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) { ("samples,s",po::value<unsigned>()->default_value(1000),"Number of samples") ("particles,p",po::value<unsigned>()->default_value(25),"Number of particles") ("input,i",po::value<string>(),"Read parallel data from") - ("max_src_phrase",po::value<unsigned>()->default_value(7),"Maximum length of source language phrases") - ("max_trg_phrase",po::value<unsigned>()->default_value(7),"Maximum length of target language phrases") ("model1,m",po::value<string>(),"Model 1 parameters (used in base distribution)") ("inverse_model1,M",po::value<string>(),"Inverse Model 1 parameters (used in backward estimate)") ("model1_interpolation_weight",po::value<double>()->default_value(0.95),"Mixing proportion of model 1 with uniform target distribution") + ("src_unigram,u",po::value<string>()->default_value(""),"Source unigram distribution; empty for uniform") + ("trg_unigram,U",po::value<string>()->default_value(""),"Target unigram distribution; empty for uniform") ("random_seed,S",po::value<uint32_t>(), "Random seed"); po::options_description clo("Command line options"); clo.add_options() @@ -165,11 +222,11 @@ void ReadParallelCorpus(const string& filename, int main(int argc, char** argv) { po::variables_map conf; InitCommandLine(argc, argv, &conf); - const size_t kMAX_TRG_PHRASE = conf["max_trg_phrase"].as<unsigned>(); - const size_t kMAX_SRC_PHRASE = conf["max_src_phrase"].as<unsigned>(); const unsigned particles = conf["particles"].as<unsigned>(); const unsigned samples = conf["samples"].as<unsigned>(); - + TD::Convert("<s>"); + TD::Convert("</s>"); + TD::Convert("<unk>"); if (!conf.count("model1")) { cerr << argv[0] << "Please use --model1 to specify model 1 parameters\n"; return 1; @@ -188,23 +245,28 @@ int main(int argc, char** argv) { cerr << "F-corpus size: " << corpusf.size() << " sentences\t (" << vocabf.size() << " word types)\n"; cerr << "E-corpus size: " << corpuse.size() << " sentences\t (" << vocabe.size() << " word types)\n"; assert(corpusf.size() == corpuse.size()); + UnigramModel src_unigram(conf["src_unigram"].as<string>(), vocabf.size()); + UnigramModel trg_unigram(conf["trg_unigram"].as<string>(), vocabe.size()); + const prob_t kHALF(0.5); + const string kEMPTY = "NULL"; const int kLHS = -TD::Convert("X"); Model1 m1(conf["model1"].as<string>()); Model1 
invm1(conf["inverse_model1"].as<string>()); for (int si = 0; si < conf["samples"].as<unsigned>(); ++si) { cerr << '.' << flush; for (int ci = 0; ci < corpusf.size(); ++ci) { - const vector<WordID>& src = corpusf[ci]; const vector<WordID>& trg = corpuse[ci]; - for (int i = 0; i < src.size(); ++i) { - for (int j = 0; j < trg.size(); ++j) { - const int eff_max_src = min(src.size() - i, kMAX_SRC_PHRASE); - for (int k = 0; k < eff_max_src; ++k) { - const int eff_max_trg = (k == 0 ? 1 : min(trg.size() - j, kMAX_TRG_PHRASE)); - for (int l = 0; l < eff_max_trg; ++l) { - } - } + const vector<WordID>& src = corpusf[ci]; + for (int i = 0; i <= trg.size(); ++i) { + const WordID e_i = i > 0 ? trg[i-1] : 0; + for (int j = 0; j <= src.size(); ++j) { + const WordID f_j = j > 0 ? src[j-1] : 0; + if (e_i == 0 && f_j == 0) continue; + prob_t je = kHALF * src_unigram(f_j) * m1(f_j,e_i) + kHALF * trg_unigram(e_i) * invm1(e_i,f_j); + cerr << "p( " << (e_i ? TD::Convert(e_i) : kEMPTY) << " , " << (f_j ? TD::Convert(f_j) : kEMPTY) << " ) = " << log(je) << endl; + if (e_i && f_j) + cout << "[X] ||| " << TD::Convert(f_j) << " ||| " << TD::Convert(e_i) << " ||| LogProb=" << log(je) << endl; } } } diff --git a/gi/pf/learn_cfg.cc b/gi/pf/learn_cfg.cc new file mode 100644 index 00000000..ed1772bf --- /dev/null +++ b/gi/pf/learn_cfg.cc @@ -0,0 +1,428 @@ +#include <iostream> +#include <tr1/memory> +#include <queue> + +#include <boost/functional.hpp> +#include <boost/program_options.hpp> +#include <boost/program_options/variables_map.hpp> + +#include "inside_outside.h" +#include "hg.h" +#include "bottom_up_parser.h" +#include "fdict.h" +#include "grammar.h" +#include "m.h" +#include "trule.h" +#include "tdict.h" +#include "filelib.h" +#include "dict.h" +#include "sampler.h" +#include "ccrp.h" +#include "ccrp_onetable.h" + +using namespace std; +using namespace tr1; +namespace po = boost::program_options; + +shared_ptr<MT19937> prng; +vector<int> nt_vocab; +vector<int> nt_id_to_index; +static unsigned kMAX_RULE_SIZE = 0; +static unsigned kMAX_ARITY = 0; +static bool kALLOW_MIXED = true; // allow rules with mixed terminals and NTs +static bool kHIERARCHICAL_PRIOR = false; + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { + po::options_description opts("Configuration options"); + opts.add_options() + ("samples,s",po::value<unsigned>()->default_value(1000),"Number of samples") + ("input,i",po::value<string>(),"Read parallel data from") + ("max_rule_size,m", po::value<unsigned>()->default_value(0), "Maximum rule size (0 for unlimited)") + ("max_arity,a", po::value<unsigned>()->default_value(0), "Maximum number of nonterminals in a rule (0 for unlimited)") + ("no_mixed_rules,M", "Do not mix terminals and nonterminals in a rule RHS") + ("nonterminals,n", po::value<unsigned>()->default_value(1), "Size of nonterminal vocabulary") + ("hierarchical_prior,h", "Use hierarchical prior") + ("random_seed,S",po::value<uint32_t>(), "Random seed"); + po::options_description clo("Command line options"); + clo.add_options() + ("config", po::value<string>(), "Configuration file") + ("help", "Print this help message and exit"); + po::options_description dconfig_options, dcmdline_options; + dconfig_options.add(opts); + dcmdline_options.add(opts).add(clo); + + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + if (conf->count("config")) { + ifstream config((*conf)["config"].as<string>().c_str()); + po::store(po::parse_config_file(config, dconfig_options), *conf); + } + po::notify(*conf); + + if 
(conf->count("help") || (conf->count("input") == 0)) { + cerr << dcmdline_options << endl; + exit(1); + } +} + +unsigned ReadCorpus(const string& filename, + vector<vector<WordID> >* e, + set<WordID>* vocab_e) { + e->clear(); + vocab_e->clear(); + istream* in; + if (filename == "-") + in = &cin; + else + in = new ifstream(filename.c_str()); + assert(*in); + string line; + unsigned toks = 0; + while(*in) { + getline(*in, line); + if (line.empty() && !*in) break; + e->push_back(vector<int>()); + vector<int>& le = e->back(); + TD::ConvertSentence(line, &le); + for (unsigned i = 0; i < le.size(); ++i) + vocab_e->insert(le[i]); + toks += le.size(); + } + if (in != &cin) delete in; + return toks; +} + +struct Grid { + // a b c d e + // 0 - 0 - - + vector<int> grid; +}; + +struct BaseRuleModel { + explicit BaseRuleModel(unsigned term_size, + unsigned nonterm_size = 1) : + unif_term(1.0 / term_size), + unif_nonterm(1.0 / nonterm_size) {} + prob_t operator()(const TRule& r) const { + prob_t p; p.logeq(Md::log_poisson(1.0, r.f_.size())); + const prob_t term_prob((2.0 + 0.01*r.f_.size()) / (r.f_.size() + 2)); + const prob_t nonterm_prob(1.0 - term_prob.as_float()); + for (unsigned i = 0; i < r.f_.size(); ++i) { + if (r.f_[i] <= 0) { // nonterminal + if (kALLOW_MIXED) p *= nonterm_prob; + p *= unif_nonterm; + } else { // terminal + if (kALLOW_MIXED) p *= term_prob; + p *= unif_term; + } + } + return p; + } + const prob_t unif_term, unif_nonterm; +}; + +struct HieroLMModel { + explicit HieroLMModel(unsigned vocab_size, unsigned num_nts = 1) : + base(vocab_size, num_nts), + q0(1,1,1,1), + nts(num_nts, CCRP<TRule>(1,1,1,1)) {} + + prob_t Prob(const TRule& r) const { + return nts[nt_id_to_index[-r.lhs_]].prob(r, p0(r)); + } + + inline prob_t p0(const TRule& r) const { + if (kHIERARCHICAL_PRIOR) + return q0.prob(r, base(r)); + else + return base(r); + } + + int Increment(const TRule& r, MT19937* rng) { + const int delta = nts[nt_id_to_index[-r.lhs_]].increment(r, p0(r), rng); + if (kHIERARCHICAL_PRIOR && delta) + q0.increment(r, base(r), rng); + return delta; + // return x.increment(r); + } + + int Decrement(const TRule& r, MT19937* rng) { + const int delta = nts[nt_id_to_index[-r.lhs_]].decrement(r, rng); + if (kHIERARCHICAL_PRIOR && delta) + q0.decrement(r, rng); + return delta; + //return x.decrement(r); + } + + prob_t Likelihood() const { + prob_t p = prob_t::One(); + for (unsigned i = 0; i < nts.size(); ++i) { + prob_t q; q.logeq(nts[i].log_crp_prob()); + p *= q; + for (CCRP<TRule>::const_iterator it = nts[i].begin(); it != nts[i].end(); ++it) { + prob_t tp = p0(it->first); + tp.poweq(it->second.table_counts_.size()); + p *= tp; + } + } + if (kHIERARCHICAL_PRIOR) { + prob_t q; q.logeq(q0.log_crp_prob()); + p *= q; + for (CCRP<TRule>::const_iterator it = q0.begin(); it != q0.end(); ++it) { + prob_t tp = base(it->first); + tp.poweq(it->second.table_counts_.size()); + p *= tp; + } + } + //for (CCRP_OneTable<TRule>::const_iterator it = x.begin(); it != x.end(); ++it) + // p *= base(it->first); + return p; + } + + void ResampleHyperparameters(MT19937* rng) { + for (unsigned i = 0; i < nts.size(); ++i) + nts[i].resample_hyperparameters(rng); + if (kHIERARCHICAL_PRIOR) { + q0.resample_hyperparameters(rng); + cerr << "[base d=" << q0.discount() << ", s=" << q0.strength() << "]"; + } + cerr << " d=" << nts[0].discount() << ", s=" << nts[0].strength() << endl; + } + + const BaseRuleModel base; + CCRP<TRule> q0; + vector<CCRP<TRule> > nts; + //CCRP_OneTable<TRule> x; +}; + +vector<GrammarIter* > tofreelist; + 
+HieroLMModel* plm; + +struct NPGrammarIter : public GrammarIter, public RuleBin { + NPGrammarIter() : arity() { tofreelist.push_back(this); } + NPGrammarIter(const TRulePtr& inr, const int a, int symbol) : arity(a) { + if (inr) { + r.reset(new TRule(*inr)); + } else { + r.reset(new TRule); + } + TRule& rr = *r; + rr.lhs_ = nt_vocab[0]; + rr.f_.push_back(symbol); + rr.e_.push_back(symbol < 0 ? (1-int(arity)) : symbol); + tofreelist.push_back(this); + } + inline static unsigned NextArity(int cur_a, int symbol) { + return cur_a + (symbol <= 0 ? 1 : 0); + } + virtual int GetNumRules() const { + if (r) return nt_vocab.size(); else return 0; + } + virtual TRulePtr GetIthRule(int i) const { + if (i == 0) return r; + TRulePtr nr(new TRule(*r)); + nr->lhs_ = nt_vocab[i]; + return nr; + } + virtual int Arity() const { + return arity; + } + virtual const RuleBin* GetRules() const { + if (!r) return NULL; else return this; + } + virtual const GrammarIter* Extend(int symbol) const { + const int next_arity = NextArity(arity, symbol); + if (kMAX_ARITY && next_arity > kMAX_ARITY) + return NULL; + if (!kALLOW_MIXED && r) { + bool t1 = r->f_.front() <= 0; + bool t2 = symbol <= 0; + if (t1 != t2) return NULL; + } + if (!kMAX_RULE_SIZE || !r || (r->f_.size() < kMAX_RULE_SIZE)) + return new NPGrammarIter(r, next_arity, symbol); + else + return NULL; + } + const unsigned char arity; + TRulePtr r; +}; + +struct NPGrammar : public Grammar { + virtual const GrammarIter* GetRoot() const { + return new NPGrammarIter; + } +}; + +prob_t TotalProb(const Hypergraph& hg) { + return Inside<prob_t, EdgeProb>(hg); +} + +void SampleDerivation(const Hypergraph& hg, MT19937* rng, vector<unsigned>* sampled_deriv) { + vector<prob_t> node_probs; + Inside<prob_t, EdgeProb>(hg, &node_probs); + queue<unsigned> q; + q.push(hg.nodes_.size() - 2); + while(!q.empty()) { + unsigned cur_node_id = q.front(); +// cerr << "NODE=" << cur_node_id << endl; + q.pop(); + const Hypergraph::Node& node = hg.nodes_[cur_node_id]; + const unsigned num_in_edges = node.in_edges_.size(); + unsigned sampled_edge = 0; + if (num_in_edges == 1) { + sampled_edge = node.in_edges_[0]; + } else { + //prob_t z; + assert(num_in_edges > 1); + SampleSet<prob_t> ss; + for (unsigned j = 0; j < num_in_edges; ++j) { + const Hypergraph::Edge& edge = hg.edges_[node.in_edges_[j]]; + prob_t p = edge.edge_prob_; + for (unsigned k = 0; k < edge.tail_nodes_.size(); ++k) + p *= node_probs[edge.tail_nodes_[k]]; + ss.add(p); +// cerr << log(ss[j]) << " ||| " << edge.rule_->AsString() << endl; + //z += p; + } +// for (unsigned j = 0; j < num_in_edges; ++j) { +// const Hypergraph::Edge& edge = hg.edges_[node.in_edges_[j]]; +// cerr << exp(log(ss[j] / z)) << " ||| " << edge.rule_->AsString() << endl; +// } +// cerr << " --- \n"; + sampled_edge = node.in_edges_[rng->SelectSample(ss)]; + } + sampled_deriv->push_back(sampled_edge); + const Hypergraph::Edge& edge = hg.edges_[sampled_edge]; + for (unsigned j = 0; j < edge.tail_nodes_.size(); ++j) { + q.push(edge.tail_nodes_[j]); + } + } + for (unsigned i = 0; i < sampled_deriv->size(); ++i) { + cerr << *hg.edges_[(*sampled_deriv)[i]].rule_ << endl; + } +} + +void IncrementDerivation(const Hypergraph& hg, const vector<unsigned>& d, HieroLMModel* plm, MT19937* rng) { + for (unsigned i = 0; i < d.size(); ++i) + plm->Increment(*hg.edges_[d[i]].rule_, rng); +} + +void DecrementDerivation(const Hypergraph& hg, const vector<unsigned>& d, HieroLMModel* plm, MT19937* rng) { + for (unsigned i = 0; i < d.size(); ++i) + 
plm->Decrement(*hg.edges_[d[i]].rule_, rng); +} + +int main(int argc, char** argv) { + po::variables_map conf; + + InitCommandLine(argc, argv, &conf); + nt_vocab.resize(conf["nonterminals"].as<unsigned>()); + assert(nt_vocab.size() > 0); + assert(nt_vocab.size() < 26); + { + string nt = "X"; + for (unsigned i = 0; i < nt_vocab.size(); ++i) { + if (nt_vocab.size() > 1) nt[0] = ('A' + i); + int pid = TD::Convert(nt); + nt_vocab[i] = -pid; + if (pid >= nt_id_to_index.size()) { + nt_id_to_index.resize(pid + 1, -1); + } + nt_id_to_index[pid] = i; + } + } + vector<GrammarPtr> grammars; + grammars.push_back(GrammarPtr(new NPGrammar)); + + const unsigned samples = conf["samples"].as<unsigned>(); + kMAX_RULE_SIZE = conf["max_rule_size"].as<unsigned>(); + if (kMAX_RULE_SIZE == 1) { + cerr << "Invalid maximum rule size: must be 0 or >1\n"; + return 1; + } + kMAX_ARITY = conf["max_arity"].as<unsigned>(); + if (kMAX_ARITY == 1) { + cerr << "Invalid maximum arity: must be 0 or >1\n"; + return 1; + } + kALLOW_MIXED = !conf.count("no_mixed_rules"); + + kHIERARCHICAL_PRIOR = conf.count("hierarchical_prior"); + + if (conf.count("random_seed")) + prng.reset(new MT19937(conf["random_seed"].as<uint32_t>())); + else + prng.reset(new MT19937); + MT19937& rng = *prng; + vector<vector<WordID> > corpuse; + set<WordID> vocabe; + cerr << "Reading corpus...\n"; + const unsigned toks = ReadCorpus(conf["input"].as<string>(), &corpuse, &vocabe); + cerr << "E-corpus size: " << corpuse.size() << " sentences\t (" << vocabe.size() << " word types)\n"; + HieroLMModel lm(vocabe.size(), nt_vocab.size()); + + plm = &lm; + ExhaustiveBottomUpParser parser(TD::Convert(-nt_vocab[0]), grammars); + + Hypergraph hg; + const int kGoal = -TD::Convert("Goal"); + const int kLP = FD::Convert("LogProb"); + SparseVector<double> v; v.set_value(kLP, 1.0); + vector<vector<unsigned> > derivs(corpuse.size()); + vector<Lattice> cl(corpuse.size()); + for (int ci = 0; ci < corpuse.size(); ++ci) { + vector<int>& src = corpuse[ci]; + Lattice& lat = cl[ci]; + lat.resize(src.size()); + for (unsigned i = 0; i < src.size(); ++i) + lat[i].push_back(LatticeArc(src[i], 0.0, 1)); + } + for (int SS=0; SS < samples; ++SS) { + const bool is_last = ((samples - 1) == SS); + prob_t dlh = prob_t::One(); + for (int ci = 0; ci < corpuse.size(); ++ci) { + const vector<int>& src = corpuse[ci]; + const Lattice& lat = cl[ci]; + cerr << TD::GetString(src) << endl; + hg.clear(); + parser.Parse(lat, &hg); // exhaustive parse + vector<unsigned>& d = derivs[ci]; + if (!is_last) DecrementDerivation(hg, d, &lm, &rng); + for (unsigned i = 0; i < hg.edges_.size(); ++i) { + TRule& r = *hg.edges_[i].rule_; + if (r.lhs_ == kGoal) + hg.edges_[i].edge_prob_ = prob_t::One(); + else + hg.edges_[i].edge_prob_ = lm.Prob(r); + } + if (!is_last) { + d.clear(); + SampleDerivation(hg, &rng, &d); + IncrementDerivation(hg, derivs[ci], &lm, &rng); + } else { + prob_t p = TotalProb(hg); + dlh *= p; + cerr << " p(sentence) = " << log(p) << "\t" << log(dlh) << endl; + } + if (tofreelist.size() > 200000) { + cerr << "Freeing ... 
"; + for (unsigned i = 0; i < tofreelist.size(); ++i) + delete tofreelist[i]; + tofreelist.clear(); + cerr << "Freed.\n"; + } + } + double llh = log(lm.Likelihood()); + cerr << "LLH=" << llh << "\tENTROPY=" << (-llh / log(2) / toks) << "\tPPL=" << pow(2, -llh / log(2) / toks) << endl; + if (SS % 10 == 9) lm.ResampleHyperparameters(&rng); + if (is_last) { + double z = log(dlh); + cerr << "TOTAL_PROB=" << z << "\tENTROPY=" << (-z / log(2) / toks) << "\tPPL=" << pow(2, -z / log(2) / toks) << endl; + } + } + for (unsigned i = 0; i < nt_vocab.size(); ++i) + cerr << lm.nts[i] << endl; + return 0; +} + diff --git a/gi/pf/make-freq-bins.pl b/gi/pf/make-freq-bins.pl new file mode 100755 index 00000000..fdcd3555 --- /dev/null +++ b/gi/pf/make-freq-bins.pl @@ -0,0 +1,26 @@ +#!/usr/bin/perl -w +use strict; + +my $BASE = 6; +my $CUTOFF = 3; + +my %d; +my $num = 0; +while(<>){ + chomp; + my @words = split /\s+/; + for my $w (@words) {$d{$w}++; $num++;} +} + +my @vocab = sort {$d{$b} <=> $d{$a}} keys %d; + +for (my $i=0; $i<scalar @vocab; $i++) { + my $most = $d{$vocab[$i]}; + my $least = 1; + + my $nl = -int(log($most / $num) / log($BASE) + $CUTOFF); + if ($nl < 0) { $nl = 0; } + print "$vocab[$i] $nl\n" +} + + diff --git a/gi/pf/monotonic_pseg.h b/gi/pf/monotonic_pseg.h index 301aa6d8..10d171fe 100644 --- a/gi/pf/monotonic_pseg.h +++ b/gi/pf/monotonic_pseg.h @@ -6,7 +6,7 @@ #include "prob.h" #include "ccrp_nt.h" #include "trule.h" -#include "base_measures.h" +#include "base_distributions.h" template <typename BaseMeasure> struct MonotonicParallelSegementationModel { diff --git a/gi/pf/ngram_base.cc b/gi/pf/ngram_base.cc new file mode 100644 index 00000000..1299f06f --- /dev/null +++ b/gi/pf/ngram_base.cc @@ -0,0 +1,69 @@ +#include "ngram_base.h" + +#include "lm/model.hh" +#include "tdict.h" + +using namespace std; + +namespace { +struct GICSVMapper : public lm::EnumerateVocab { + GICSVMapper(vector<lm::WordIndex>* out) : out_(out), kLM_UNKNOWN_TOKEN(0) { out_->clear(); } + void Add(lm::WordIndex index, const StringPiece &str) { + const WordID cdec_id = TD::Convert(str.as_string()); + if (cdec_id >= out_->size()) + out_->resize(cdec_id + 1, kLM_UNKNOWN_TOKEN); + (*out_)[cdec_id] = index; + } + vector<lm::WordIndex>* out_; + const lm::WordIndex kLM_UNKNOWN_TOKEN; +}; +} + +struct FixedNgramBaseImpl { + FixedNgramBaseImpl(const string& param) { + GICSVMapper vm(&cdec2klm_map_); + lm::ngram::Config conf; + conf.enumerate_vocab = &vm; + cerr << "Reading character LM from " << param << endl; + model = new lm::ngram::ProbingModel(param.c_str(), conf); + order = model->Order(); + kEOS = MapWord(TD::Convert("</s>")); + assert(kEOS > 0); + } + + lm::WordIndex MapWord(const WordID w) const { + if (w < cdec2klm_map_.size()) return cdec2klm_map_[w]; + return 0; + } + + ~FixedNgramBaseImpl() { delete model; } + + prob_t StringProbability(const vector<WordID>& s) const { + lm::ngram::State state = model->BeginSentenceState(); + double prob = 0; + for (unsigned i = 0; i < s.size(); ++i) { + const lm::ngram::State scopy(state); + prob += model->Score(scopy, MapWord(s[i]), state); + } + const lm::ngram::State scopy(state); + prob += model->Score(scopy, kEOS, state); + prob_t p; p.logeq(prob * log(10)); + return p; + } + + lm::ngram::ProbingModel* model; + unsigned order; + vector<lm::WordIndex> cdec2klm_map_; + lm::WordIndex kEOS; +}; + +FixedNgramBase::~FixedNgramBase() { delete impl; } + +FixedNgramBase::FixedNgramBase(const string& lmfname) { + impl = new FixedNgramBaseImpl(lmfname); +} + +prob_t 
FixedNgramBase::StringProbability(const vector<WordID>& s) const { + return impl->StringProbability(s); +} + diff --git a/gi/pf/ngram_base.h b/gi/pf/ngram_base.h new file mode 100644 index 00000000..4ea999f3 --- /dev/null +++ b/gi/pf/ngram_base.h @@ -0,0 +1,25 @@ +#ifndef _NGRAM_BASE_H_ +#define _NGRAM_BASE_H_ + +#include <string> +#include <vector> +#include "trule.h" +#include "wordid.h" +#include "prob.h" + +struct FixedNgramBaseImpl; +struct FixedNgramBase { + FixedNgramBase(const std::string& lmfname); + ~FixedNgramBase(); + prob_t StringProbability(const std::vector<WordID>& s) const; + + prob_t operator()(const TRule& rule) const { + return StringProbability(rule.e_); + } + + private: + FixedNgramBaseImpl* impl; + +}; + +#endif diff --git a/gi/pf/nuisance_test.cc b/gi/pf/nuisance_test.cc new file mode 100644 index 00000000..fc0af9cb --- /dev/null +++ b/gi/pf/nuisance_test.cc @@ -0,0 +1,161 @@ +#include "ccrp.h" + +#include <vector> +#include <iostream> + +#include "tdict.h" +#include "transliterations.h" + +using namespace std; + +MT19937 rng; + +ostream& operator<<(ostream&os, const vector<int>& v) { + os << '[' << v[0]; + if (v.size() == 2) os << ' ' << v[1]; + return os << ']'; +} + +struct Base { + Base() : llh(), v(2), v1(1), v2(1), crp(0.25, 0.5) {} + inline double p0(const vector<int>& x) const { + double p = 0.75; + if (x.size() == 2) p = 0.25; + p *= 1.0 / 3.0; + if (x.size() == 2) p *= 1.0 / 3.0; + return p; + } + double est_deriv_prob(int a, int b, int seg) const { + assert(a > 0 && a < 4); // a \in {1,2,3} + assert(b > 0 && b < 4); // b \in {1,2,3} + assert(seg == 0 || seg == 1); // seg \in {0,1} + if (seg == 0) { + v[0] = a; + v[1] = b; + return crp.prob(v, p0(v)); + } else { + v1[0] = a; + v2[0] = b; + return crp.prob(v1, p0(v1)) * crp.prob(v2, p0(v2)); + } + } + double est_marginal_prob(int a, int b) const { + return est_deriv_prob(a,b,0) + est_deriv_prob(a,b,1); + } + int increment(int a, int b, double* pw = NULL) { + double p1 = est_deriv_prob(a, b, 0); + double p2 = est_deriv_prob(a, b, 1); + //p1 = 0.5; p2 = 0.5; + int seg = rng.SelectSample(p1,p2); + double tmp = 0; + if (!pw) pw = &tmp; + double& w = *pw; + if (seg == 0) { + v[0] = a; + v[1] = b; + w = crp.prob(v, p0(v)) / p1; + if (crp.increment(v, p0(v), &rng)) { + llh += log(p0(v)); + } + } else { + v1[0] = a; + w = crp.prob(v1, p0(v1)) / p2; + if (crp.increment(v1, p0(v1), &rng)) { + llh += log(p0(v1)); + } + v2[0] = b; + w *= crp.prob(v2, p0(v2)); + if (crp.increment(v2, p0(v2), &rng)) { + llh += log(p0(v2)); + } + } + return seg; + } + void increment(int a, int b, int seg) { + if (seg == 0) { + v[0] = a; + v[1] = b; + if (crp.increment(v, p0(v), &rng)) { + llh += log(p0(v)); + } + } else { + v1[0] = a; + if (crp.increment(v1, p0(v1), &rng)) { + llh += log(p0(v1)); + } + v2[0] = b; + if (crp.increment(v2, p0(v2), &rng)) { + llh += log(p0(v2)); + } + } + } + void decrement(int a, int b, int seg) { + if (seg == 0) { + v[0] = a; + v[1] = b; + if (crp.decrement(v, &rng)) { + llh -= log(p0(v)); + } + } else { + v1[0] = a; + if (crp.decrement(v1, &rng)) { + llh -= log(p0(v1)); + } + v2[0] = b; + if (crp.decrement(v2, &rng)) { + llh -= log(p0(v2)); + } + } + } + double log_likelihood() const { + return llh + crp.log_crp_prob(); + } + double llh; + mutable vector<int> v, v1, v2; + CCRP<vector<int> > crp; +}; + +int main(int argc, char** argv) { + double tl = 0; + const int ITERS = 1000; + const int PARTICLES = 20; + const int DATAPOINTS = 50; + WordID x = TD::Convert("souvenons"); + WordID y = 
TD::Convert("remember"); + vector<WordID> src; TD::ConvertSentence("s o u v e n o n s", &src); + vector<WordID> trg; TD::ConvertSentence("r e m e m b e r", &trg); +// Transliterations xx; +// xx.Initialize(x, src, y, trg); +// return 1; + + for (int j = 0; j < ITERS; ++j) { + Base b; + vector<int> segs(DATAPOINTS); + SampleSet<double> ss; + vector<int> sss; + for (int i = 0; i < DATAPOINTS; i++) { + ss.clear(); + sss.clear(); + int x = ((i / 10) % 3) + 1; + int y = (i % 3) + 1; + //double ep = b.est_marginal_prob(x,y); + //cerr << "est p(" << x << "," << y << ") = " << ep << endl; + for (int n = 0; n < PARTICLES; ++n) { + double w; + int seg = b.increment(x,y,&w); + //cerr << seg << " w=" << w << endl; + ss.add(w); + sss.push_back(seg); + b.decrement(x,y,seg); + } + int seg = sss[rng.SelectSample(ss)]; + b.increment(x, y, seg); + //cerr << "Selected: " << seg << endl; + //return 1; + segs[i] = seg; + } + tl += b.log_likelihood(); + } + cerr << "LLH=" << tl / ITERS << endl; +} + diff --git a/gi/pf/os_phrase.h b/gi/pf/os_phrase.h new file mode 100644 index 00000000..dfe40cb1 --- /dev/null +++ b/gi/pf/os_phrase.h @@ -0,0 +1,15 @@ +#ifndef _OS_PHRASE_H_ +#define _OS_PHRASE_H_ + +#include <iostream> +#include <vector> +#include "tdict.h" + +inline std::ostream& operator<<(std::ostream& os, const std::vector<WordID>& p) { + os << '['; + for (int i = 0; i < p.size(); ++i) + os << (i==0 ? "" : " ") << TD::Convert(p[i]); + return os << ']'; +} + +#endif diff --git a/gi/pf/pfbrat.cc b/gi/pf/pfbrat.cc index 7b60ef23..c2c52760 100644 --- a/gi/pf/pfbrat.cc +++ b/gi/pf/pfbrat.cc @@ -191,7 +191,7 @@ struct UniphraseLM { void ResampleHyperparameters(MT19937* rng) { phrases_.resample_hyperparameters(rng); gen_.resample_hyperparameters(rng); - cerr << " " << phrases_.concentration(); + cerr << " " << phrases_.alpha(); } CCRP_NoTable<vector<int> > phrases_; diff --git a/gi/pf/pfdist.cc b/gi/pf/pfdist.cc index aae5f798..3d578db2 100644 --- a/gi/pf/pfdist.cc +++ b/gi/pf/pfdist.cc @@ -7,7 +7,7 @@ #include <boost/program_options/variables_map.hpp> #include "pf.h" -#include "base_measures.h" +#include "base_distributions.h" #include "reachability.h" #include "viterbi.h" #include "hg.h" @@ -315,7 +315,7 @@ struct BackwardEstimate { for (int i = 0; i < src_cov.size(); ++i) if (!src_cov[i]) r.push_back(src_[i]); const prob_t uniform_alignment(1.0 / r.size()); - e.logeq(log_poisson(trg_.size() - trg_cov, r.size() - 1)); // p(trg len remaining | src len remaining) + e.logeq(Md::log_poisson(trg_.size() - trg_cov, r.size() - 1)); // p(trg len remaining | src len remaining) for (unsigned j = trg_cov; j < trg_.size(); ++j) { prob_t p; for (unsigned i = 0; i < r.size(); ++i) @@ -352,7 +352,7 @@ struct BackwardEstimateSym { if (!src_cov[i]) r.push_back(src_[i]); r.push_back(0); // NULL word const prob_t uniform_alignment(1.0 / r.size()); - e.logeq(log_poisson(trg_.size() - trg_cov, r.size() - 1)); // p(trg len remaining | src len remaining) + e.logeq(Md::log_poisson(trg_.size() - trg_cov, r.size() - 1)); // p(trg len remaining | src len remaining) for (unsigned j = trg_cov; j < trg_.size(); ++j) { prob_t p; for (unsigned i = 0; i < r.size(); ++i) @@ -367,7 +367,7 @@ struct BackwardEstimateSym { r.pop_back(); const prob_t inv_uniform(1.0 / (trg_.size() - trg_cov + 1.0)); prob_t inv; - inv.logeq(log_poisson(r.size(), trg_.size() - trg_cov)); + inv.logeq(Md::log_poisson(r.size(), trg_.size() - trg_cov)); for (unsigned i = 0; i < r.size(); ++i) { prob_t p; for (unsigned j = trg_cov - 1; j < trg_.size(); ++j) diff --git 
a/gi/pf/pfnaive.cc b/gi/pf/pfnaive.cc index 728ec00d..e1a53f5c 100644 --- a/gi/pf/pfnaive.cc +++ b/gi/pf/pfnaive.cc @@ -7,7 +7,7 @@ #include <boost/program_options/variables_map.hpp> #include "pf.h" -#include "base_measures.h" +#include "base_distributions.h" #include "monotonic_pseg.h" #include "reachability.h" #include "viterbi.h" @@ -77,7 +77,7 @@ struct BackwardEstimateSym { r.push_back(src_[i]); r.push_back(0); // NULL word const prob_t uniform_alignment(1.0 / r.size()); - e.logeq(log_poisson(trg_.size() - trg_cov, r.size() - 1)); // p(trg len remaining | src len remaining) + e.logeq(Md::log_poisson(trg_.size() - trg_cov, r.size() - 1)); // p(trg len remaining | src len remaining) for (unsigned j = trg_cov; j < trg_.size(); ++j) { prob_t p; for (unsigned i = 0; i < r.size(); ++i) @@ -92,7 +92,7 @@ struct BackwardEstimateSym { r.pop_back(); const prob_t inv_uniform(1.0 / (trg_.size() - trg_cov + 1.0)); prob_t inv; - inv.logeq(log_poisson(r.size(), trg_.size() - trg_cov)); + inv.logeq(Md::log_poisson(r.size(), trg_.size() - trg_cov)); for (unsigned i = 0; i < r.size(); ++i) { prob_t p; for (unsigned j = trg_cov - 1; j < trg_.size(); ++j) diff --git a/gi/pf/pyp_lm.cc b/gi/pf/pyp_lm.cc new file mode 100644 index 00000000..91029688 --- /dev/null +++ b/gi/pf/pyp_lm.cc @@ -0,0 +1,209 @@ +#include <iostream> +#include <tr1/memory> +#include <queue> + +#include <boost/functional.hpp> +#include <boost/program_options.hpp> +#include <boost/program_options/variables_map.hpp> + +#include "corpus_tools.h" +#include "m.h" +#include "tdict.h" +#include "sampler.h" +#include "ccrp.h" +#include "tied_resampler.h" + +// A not very memory-efficient implementation of an N-gram LM based on PYPs +// as described in Y.-W. Teh. (2006) A Hierarchical Bayesian Language Model +// based on Pitman-Yor Processes. In Proc. ACL. 
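Since the back-off recursion below is easy to lose in the template code, here is a minimal usage sketch of how an order-3 model is driven; it mirrors the training loop in main() further down, and assumes a vector<WordID> sentence, an unsigned vocab_size, and an MT19937 rng are already in scope.

// Each PYPLM<N> conditions on the last N-1 words of the supplied context and
// backs off to PYPLM<N-1>, bottoming out in the uniform PYPLM<0>.
PYPLM<3> lm(vocab_size, 1.0, 1.0, 1.0, 1.0);   // discount ~ Beta(1,1), strength ~ Gamma(1,1)
vector<WordID> ctx(2, TD::Convert("<s>"));     // kORDER-1 sentence-initial symbols
for (unsigned i = 0; i < sentence.size(); ++i) {
  lm.increment(sentence[i], ctx, &rng);        // seats the word, backing off recursively
  ctx.push_back(sentence[i]);                  // the model only reads the last two entries
}
double p = lm.prob(sentence[0], ctx);          // predictive probability under the current seating

Querying a context that was never observed simply returns the backed-off estimate, so the same prob() call works unchanged on held-out data, as in the test loop at the end of this file.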
+ +// I use templates to handle the recursive formalation of the prior, so +// the order of the model has to be specified here, at compile time: +#define kORDER 3 + +using namespace std; +using namespace tr1; +namespace po = boost::program_options; + +shared_ptr<MT19937> prng; + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { + po::options_description opts("Configuration options"); + opts.add_options() + ("samples,n",po::value<unsigned>()->default_value(300),"Number of samples") + ("train,i",po::value<string>(),"Training data file") + ("test,T",po::value<string>(),"Test data file") + ("discount_prior_a,a",po::value<double>()->default_value(1.0), "discount ~ Beta(a,b): a=this") + ("discount_prior_b,b",po::value<double>()->default_value(1.0), "discount ~ Beta(a,b): b=this") + ("strength_prior_s,s",po::value<double>()->default_value(1.0), "strength ~ Gamma(s,r): s=this") + ("strength_prior_r,r",po::value<double>()->default_value(1.0), "strength ~ Gamma(s,r): r=this") + ("random_seed,S",po::value<uint32_t>(), "Random seed"); + po::options_description clo("Command line options"); + clo.add_options() + ("config", po::value<string>(), "Configuration file") + ("help", "Print this help message and exit"); + po::options_description dconfig_options, dcmdline_options; + dconfig_options.add(opts); + dcmdline_options.add(opts).add(clo); + + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + if (conf->count("config")) { + ifstream config((*conf)["config"].as<string>().c_str()); + po::store(po::parse_config_file(config, dconfig_options), *conf); + } + po::notify(*conf); + + if (conf->count("help") || (conf->count("train") == 0)) { + cerr << dcmdline_options << endl; + exit(1); + } +} + +template <unsigned N> struct PYPLM; + +// uniform base distribution (0-gram model) +template<> struct PYPLM<0> { + PYPLM(unsigned vs, double, double, double, double) : p0(1.0 / vs), draws() {} + void increment(WordID, const vector<WordID>&, MT19937*) { ++draws; } + void decrement(WordID, const vector<WordID>&, MT19937*) { --draws; assert(draws >= 0); } + double prob(WordID, const vector<WordID>&) const { return p0; } + void resample_hyperparameters(MT19937*) {} + double log_likelihood() const { return draws * log(p0); } + const double p0; + int draws; +}; + +// represents an N-gram LM +template <unsigned N> struct PYPLM { + PYPLM(unsigned vs, double da, double db, double ss, double sr) : + backoff(vs, da, db, ss, sr), + tr(da, db, ss, sr, 0.8, 1.0), + lookup(N-1) {} + void increment(WordID w, const vector<WordID>& context, MT19937* rng) { + const double bo = backoff.prob(w, context); + for (unsigned i = 0; i < N-1; ++i) + lookup[i] = context[context.size() - 1 - i]; + typename unordered_map<vector<WordID>, CCRP<WordID>, boost::hash<vector<WordID> > >::iterator it = p.find(lookup); + if (it == p.end()) { + it = p.insert(make_pair(lookup, CCRP<WordID>(0.5,1))).first; + tr.Add(&it->second); // add to resampler + } + if (it->second.increment(w, bo, rng)) + backoff.increment(w, context, rng); + } + void decrement(WordID w, const vector<WordID>& context, MT19937* rng) { + for (unsigned i = 0; i < N-1; ++i) + lookup[i] = context[context.size() - 1 - i]; + typename unordered_map<vector<WordID>, CCRP<WordID>, boost::hash<vector<WordID> > >::iterator it = p.find(lookup); + assert(it != p.end()); + if (it->second.decrement(w, rng)) + backoff.decrement(w, context, rng); + } + double prob(WordID w, const vector<WordID>& context) const { + const double bo = backoff.prob(w, context); + for 
(unsigned i = 0; i < N-1; ++i) + lookup[i] = context[context.size() - 1 - i]; + typename unordered_map<vector<WordID>, CCRP<WordID>, boost::hash<vector<WordID> > >::const_iterator it = p.find(lookup); + if (it == p.end()) return bo; + return it->second.prob(w, bo); + } + + double log_likelihood() const { + double llh = backoff.log_likelihood(); + typename unordered_map<vector<WordID>, CCRP<WordID>, boost::hash<vector<WordID> > >::const_iterator it; + for (it = p.begin(); it != p.end(); ++it) + llh += it->second.log_crp_prob(); + llh += tr.LogLikelihood(); + return llh; + } + + void resample_hyperparameters(MT19937* rng) { + tr.ResampleHyperparameters(rng); + backoff.resample_hyperparameters(rng); + } + + PYPLM<N-1> backoff; + TiedResampler<CCRP<WordID> > tr; + double discount_a, discount_b, strength_s, strength_r; + double d, strength; + mutable vector<WordID> lookup; // thread-local + unordered_map<vector<WordID>, CCRP<WordID>, boost::hash<vector<WordID> > > p; +}; + +int main(int argc, char** argv) { + po::variables_map conf; + + InitCommandLine(argc, argv, &conf); + const unsigned samples = conf["samples"].as<unsigned>(); + if (conf.count("random_seed")) + prng.reset(new MT19937(conf["random_seed"].as<uint32_t>())); + else + prng.reset(new MT19937); + MT19937& rng = *prng; + vector<vector<WordID> > corpuse; + set<WordID> vocabe; + const WordID kEOS = TD::Convert("</s>"); + cerr << "Reading corpus...\n"; + CorpusTools::ReadFromFile(conf["train"].as<string>(), &corpuse, &vocabe); + cerr << "E-corpus size: " << corpuse.size() << " sentences\t (" << vocabe.size() << " word types)\n"; + vector<vector<WordID> > test; + if (conf.count("test")) + CorpusTools::ReadFromFile(conf["test"].as<string>(), &test); + else + test = corpuse; + PYPLM<kORDER> lm(vocabe.size(), + conf["discount_prior_a"].as<double>(), + conf["discount_prior_b"].as<double>(), + conf["strength_prior_s"].as<double>(), + conf["strength_prior_r"].as<double>()); + vector<WordID> ctx(kORDER - 1, TD::Convert("<s>")); + for (int SS=0; SS < samples; ++SS) { + for (int ci = 0; ci < corpuse.size(); ++ci) { + ctx.resize(kORDER - 1); + const vector<WordID>& s = corpuse[ci]; + for (int i = 0; i <= s.size(); ++i) { + WordID w = (i < s.size() ? s[i] : kEOS); + if (SS > 0) lm.decrement(w, ctx, &rng); + lm.increment(w, ctx, &rng); + ctx.push_back(w); + } + if (SS > 0) lm.decrement(kEOS, ctx, &rng); + lm.increment(kEOS, ctx, &rng); + } + if (SS % 10 == 9) { + cerr << " [LLH=" << lm.log_likelihood() << "]" << endl; + if (SS % 20 == 19) lm.resample_hyperparameters(&rng); + } else { cerr << '.' << flush; } + } + double llh = 0; + unsigned cnt = 0; + unsigned oovs = 0; + for (int ci = 0; ci < test.size(); ++ci) { + ctx.resize(kORDER - 1); + const vector<WordID>& s = test[ci]; + for (int i = 0; i <= s.size(); ++i) { + WordID w = (i < s.size() ? 
s[i] : kEOS); + double lp = log(lm.prob(w, ctx)) / log(2); + if (i < s.size() && vocabe.count(w) == 0) { + cerr << "**OOV "; + ++oovs; + lp = 0; + } + cerr << "p(" << TD::Convert(w) << " |"; + for (int j = ctx.size() + 1 - kORDER; j < ctx.size(); ++j) + cerr << ' ' << TD::Convert(ctx[j]); + cerr << ") = " << lp << endl; + ctx.push_back(w); + llh -= lp; + cnt++; + } + } + cerr << " Log_10 prob: " << (-llh * log(2) / log(10)) << endl; + cerr << " Count: " << cnt << endl; + cerr << " OOVs: " << oovs << endl; + cerr << "Cross-entropy: " << (llh / cnt) << endl; + cerr << " Perplexity: " << pow(2, llh / cnt) << endl; + return 0; +} + + diff --git a/gi/pf/pyp_tm.cc b/gi/pf/pyp_tm.cc new file mode 100644 index 00000000..e21f0267 --- /dev/null +++ b/gi/pf/pyp_tm.cc @@ -0,0 +1,131 @@ +#include "pyp_tm.h" + +#include <tr1/unordered_map> +#include <iostream> +#include <queue> + +#include "tdict.h" +#include "ccrp.h" +#include "pyp_word_model.h" +#include "tied_resampler.h" + +using namespace std; +using namespace std::tr1; + +struct FreqBinner { + FreqBinner(const std::string& fname) { fd_.Load(fname); } + unsigned NumberOfBins() const { return fd_.Max() + 1; } + unsigned Bin(const WordID& w) const { return fd_.LookUp(w); } + FreqDict<unsigned> fd_; +}; + +template <typename Base, class Binner = FreqBinner> +struct ConditionalPYPWordModel { + ConditionalPYPWordModel(Base* b, const Binner* bnr = NULL) : + base(*b), + binner(bnr), + btr(binner ? binner->NumberOfBins() + 1u : 2u) {} + + void Summary() const { + cerr << "Number of conditioning contexts: " << r.size() << endl; + for (RuleModelHash::const_iterator it = r.begin(); it != r.end(); ++it) { + cerr << TD::Convert(it->first) << " \tPYP(d=" << it->second.discount() << ",s=" << it->second.strength() << ") --------------------------" << endl; + for (CCRP<vector<WordID> >::const_iterator i2 = it->second.begin(); i2 != it->second.end(); ++i2) + cerr << " " << i2->second.total_dish_count_ << '\t' << TD::GetString(i2->first) << endl; + } + } + + void ResampleHyperparameters(MT19937* rng) { + btr.ResampleHyperparameters(rng); + } + + prob_t Prob(const WordID src, const vector<WordID>& trglets) const { + RuleModelHash::const_iterator it = r.find(src); + if (it == r.end()) { + return base(trglets); + } else { + return it->second.prob(trglets, base(trglets)); + } + } + + void Increment(const WordID src, const vector<WordID>& trglets, MT19937* rng) { + RuleModelHash::iterator it = r.find(src); + if (it == r.end()) { + it = r.insert(make_pair(src, CCRP<vector<WordID> >(0.5,1.0))).first; + static const WordID kNULL = TD::Convert("NULL"); + unsigned bin = (src == kNULL ? 
0 : 1); + if (binner && bin) { bin = binner->Bin(src) + 1; } + btr.Add(bin, &it->second); + } + if (it->second.increment(trglets, base(trglets), rng)) + base.Increment(trglets, rng); + } + + void Decrement(const WordID src, const vector<WordID>& trglets, MT19937* rng) { + RuleModelHash::iterator it = r.find(src); + assert(it != r.end()); + if (it->second.decrement(trglets, rng)) { + base.Decrement(trglets, rng); + } + } + + prob_t Likelihood() const { + prob_t p = prob_t::One(); + for (RuleModelHash::const_iterator it = r.begin(); it != r.end(); ++it) { + prob_t q; q.logeq(it->second.log_crp_prob()); + p *= q; + } + return p; + } + + unsigned UniqueConditioningContexts() const { + return r.size(); + } + + // TODO tie PYP hyperparameters based on source word frequency bins + Base& base; + const Binner* binner; + BinTiedResampler<CCRP<vector<WordID> > > btr; + typedef unordered_map<WordID, CCRP<vector<WordID> > > RuleModelHash; + RuleModelHash r; +}; + +PYPLexicalTranslation::PYPLexicalTranslation(const vector<vector<WordID> >& lets, + const unsigned num_letters) : + letters(lets), + up0(new PYPWordModel(num_letters)), + tmodel(new ConditionalPYPWordModel<PYPWordModel>(up0, new FreqBinner("10k.freq"))), + kX(-TD::Convert("X")) {} + +void PYPLexicalTranslation::Summary() const { + tmodel->Summary(); + up0->Summary(); +} + +prob_t PYPLexicalTranslation::Likelihood() const { + prob_t p = up0->Likelihood(); + p *= tmodel->Likelihood(); + return p; +} + +void PYPLexicalTranslation::ResampleHyperparameters(MT19937* rng) { + tmodel->ResampleHyperparameters(rng); + up0->ResampleHyperparameters(rng); +} + +unsigned PYPLexicalTranslation::UniqueConditioningContexts() const { + return tmodel->UniqueConditioningContexts(); +} + +prob_t PYPLexicalTranslation::Prob(WordID src, WordID trg) const { + return tmodel->Prob(src, letters[trg]); +} + +void PYPLexicalTranslation::Increment(WordID src, WordID trg, MT19937* rng) { + tmodel->Increment(src, letters[trg], rng); +} + +void PYPLexicalTranslation::Decrement(WordID src, WordID trg, MT19937* rng) { + tmodel->Decrement(src, letters[trg], rng); +} + diff --git a/gi/pf/pyp_tm.h b/gi/pf/pyp_tm.h new file mode 100644 index 00000000..63e7c96d --- /dev/null +++ b/gi/pf/pyp_tm.h @@ -0,0 +1,35 @@ +#ifndef PYP_LEX_TRANS +#define PYP_LEX_TRANS + +#include <vector> +#include "wordid.h" +#include "prob.h" +#include "sampler.h" +#include "freqdict.h" + +struct FreqBinner; +struct PYPWordModel; +template <typename T, class B> struct ConditionalPYPWordModel; + +struct PYPLexicalTranslation { + explicit PYPLexicalTranslation(const std::vector<std::vector<WordID> >& lets, + const unsigned num_letters); + + prob_t Likelihood() const; + + void ResampleHyperparameters(MT19937* rng); + prob_t Prob(WordID src, WordID trg) const; // return p(trg | src) + void Summary() const; + void Increment(WordID src, WordID trg, MT19937* rng); + void Decrement(WordID src, WordID trg, MT19937* rng); + unsigned UniqueConditioningContexts() const; + + private: + const std::vector<std::vector<WordID> >& letters; // spelling dictionary + PYPWordModel* up0; // base distribuction (model English word) + ConditionalPYPWordModel<PYPWordModel, FreqBinner>* tmodel; // translation distributions + // (model English word | French word) + const WordID kX; +}; + +#endif diff --git a/gi/pf/pyp_word_model.cc b/gi/pf/pyp_word_model.cc new file mode 100644 index 00000000..12df4abf --- /dev/null +++ b/gi/pf/pyp_word_model.cc @@ -0,0 +1,20 @@ +#include "pyp_word_model.h" + +#include <iostream> + +using namespace 
std; + +void PYPWordModel::ResampleHyperparameters(MT19937* rng) { + r.resample_hyperparameters(rng); + cerr << " PYPWordModel(d=" << r.discount() << ",s=" << r.strength() << ")\n"; +} + +void PYPWordModel::Summary() const { + cerr << "PYPWordModel: generations=" << r.num_customers() + << " PYP(d=" << r.discount() << ",s=" << r.strength() << ')' << endl; + for (CCRP<vector<WordID> >::const_iterator it = r.begin(); it != r.end(); ++it) + cerr << " " << it->second.total_dish_count_ + << " (on " << it->second.table_counts_.size() << " tables) " + << TD::GetString(it->first) << endl; +} + diff --git a/gi/pf/pyp_word_model.h b/gi/pf/pyp_word_model.h new file mode 100644 index 00000000..ff366865 --- /dev/null +++ b/gi/pf/pyp_word_model.h @@ -0,0 +1,58 @@ +#ifndef _PYP_WORD_MODEL_H_ +#define _PYP_WORD_MODEL_H_ + +#include <iostream> +#include <cmath> +#include <vector> +#include "prob.h" +#include "ccrp.h" +#include "m.h" +#include "tdict.h" +#include "os_phrase.h" + +// PYP(d,s,poisson-uniform) represented as a CRP +struct PYPWordModel { + explicit PYPWordModel(const unsigned vocab_e_size, const double mean_len = 5) : + base(prob_t::One()), r(1,1,1,1,0.66,50.0), u0(-std::log(vocab_e_size)), mean_length(mean_len) {} + + void ResampleHyperparameters(MT19937* rng); + + inline prob_t operator()(const std::vector<WordID>& s) const { + return r.prob(s, p0(s)); + } + + inline void Increment(const std::vector<WordID>& s, MT19937* rng) { + if (r.increment(s, p0(s), rng)) + base *= p0(s); + } + + inline void Decrement(const std::vector<WordID>& s, MT19937 *rng) { + if (r.decrement(s, rng)) + base /= p0(s); + } + + inline prob_t Likelihood() const { + prob_t p; p.logeq(r.log_crp_prob()); + p *= base; + return p; + } + + void Summary() const; + + private: + inline double logp0(const std::vector<WordID>& s) const { + return Md::log_poisson(s.size(), mean_length) + s.size() * u0; + } + + inline prob_t p0(const std::vector<WordID>& s) const { + prob_t p; p.logeq(logp0(s)); + return p; + } + + prob_t base; // keeps track of the draws from the base distribution + CCRP<std::vector<WordID> > r; + const double u0; // uniform log prob of generating a letter + const double mean_length; // mean length of a word in the base distribution +}; + +#endif diff --git a/gi/pf/quasi_model2.h b/gi/pf/quasi_model2.h new file mode 100644 index 00000000..588c8f84 --- /dev/null +++ b/gi/pf/quasi_model2.h @@ -0,0 +1,166 @@ +#ifndef _QUASI_MODEL2_H_ +#define _QUASI_MODEL2_H_ + +#include <vector> +#include <cmath> +#include <tr1/unordered_map> +#include "boost/functional.hpp" +#include "prob.h" +#include "array2d.h" +#include "slice_sampler.h" +#include "m.h" + +struct AlignmentObservation { + AlignmentObservation() : src_len(), trg_len(), j(), a_j() {} + AlignmentObservation(unsigned sl, unsigned tl, unsigned tw, unsigned sw) : + src_len(sl), trg_len(tl), j(tw), a_j(sw) {} + unsigned short src_len; + unsigned short trg_len; + unsigned short j; + unsigned short a_j; +}; + +inline size_t hash_value(const AlignmentObservation& o) { + return reinterpret_cast<const size_t&>(o); +} + +inline bool operator==(const AlignmentObservation& a, const AlignmentObservation& b) { + return hash_value(a) == hash_value(b); +} + +struct QuasiModel2 { + explicit QuasiModel2(double alpha, double pnull = 0.1) : + alpha_(alpha), + pnull_(pnull), + pnotnull_(1 - pnull) {} + + // a_j = 0 => NULL; src_len does *not* include null + prob_t Prob(unsigned a_j, unsigned j, unsigned src_len, unsigned trg_len) const { + if (!a_j) return pnull_; + return pnotnull_ * + 
prob_t(UnnormalizedProb(a_j, j, src_len, trg_len, alpha_) / GetOrComputeZ(j, src_len, trg_len)); + } + + void Increment(unsigned a_j, unsigned j, unsigned src_len, unsigned trg_len) { + assert(a_j <= src_len); + assert(j < trg_len); + ++obs_[AlignmentObservation(src_len, trg_len, j, a_j)]; + } + + void Decrement(unsigned a_j, unsigned j, unsigned src_len, unsigned trg_len) { + const AlignmentObservation ao(src_len, trg_len, j, a_j); + int &cc = obs_[ao]; + assert(cc > 0); + --cc; + if (!cc) obs_.erase(ao); + } + + struct PNullResampler { + PNullResampler(const QuasiModel2& m) : m_(m) {} + const QuasiModel2& m_; + double operator()(const double& proposed_pnull) const { + return log(m_.Likelihood(m_.alpha_, proposed_pnull)); + } + }; + + struct AlphaResampler { + AlphaResampler(const QuasiModel2& m) : m_(m) {} + const QuasiModel2& m_; + double operator()(const double& proposed_alpha) const { + return log(m_.Likelihood(proposed_alpha, m_.pnull_.as_float())); + } + }; + + void ResampleHyperparameters(MT19937* rng, const unsigned nloop = 5, const unsigned niterations = 10) { + const PNullResampler dr(*this); + const AlphaResampler ar(*this); + for (unsigned i = 0; i < nloop; ++i) { + double pnull = slice_sampler1d(dr, pnull_.as_float(), *rng, 0.00000001, + 1.0, 0.0, niterations, 100*niterations); + pnull_ = prob_t(pnull); + alpha_ = slice_sampler1d(ar, alpha_, *rng, 0.00000001, + std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations); + } + std::cerr << "QuasiModel2(alpha=" << alpha_ << ",p_null=" + << pnull_.as_float() << ") = " << Likelihood() << std::endl; + zcache_.clear(); + } + + prob_t Likelihood() const { + return Likelihood(alpha_, pnull_.as_float()); + } + + prob_t Likelihood(double alpha, double ppnull) const { + const prob_t pnull(ppnull); + const prob_t pnotnull(1 - ppnull); + + prob_t p; + p.logeq(Md::log_gamma_density(alpha, 0.1, 25)); // TODO configure + assert(!p.is_0()); + prob_t prob_of_ppnull; prob_of_ppnull.logeq(Md::log_beta_density(ppnull, 2, 10)); + assert(!prob_of_ppnull.is_0()); + p *= prob_of_ppnull; + for (ObsCount::const_iterator it = obs_.begin(); it != obs_.end(); ++it) { + const AlignmentObservation& ao = it->first; + if (ao.a_j) { + prob_t u = XUnnormalizedProb(ao.a_j, ao.j, ao.src_len, ao.trg_len, alpha); + prob_t z = XComputeZ(ao.j, ao.src_len, ao.trg_len, alpha); + prob_t pa(u / z); + pa *= pnotnull; + pa.poweq(it->second); + p *= pa; + } else { + p *= pnull.pow(it->second); + } + } + return p; + } + + private: + static prob_t XUnnormalizedProb(unsigned a_j, unsigned j, unsigned src_len, unsigned trg_len, double alpha) { + prob_t p; + p.logeq(-fabs(double(a_j - 1) / src_len - double(j) / trg_len) * alpha); + return p; + } + + static prob_t XComputeZ(unsigned j, unsigned src_len, unsigned trg_len, double alpha) { + prob_t z = prob_t::Zero(); + for (int a_j = 1; a_j <= src_len; ++a_j) + z += XUnnormalizedProb(a_j, j, src_len, trg_len, alpha); + return z; + } + + static double UnnormalizedProb(unsigned a_j, unsigned j, unsigned src_len, unsigned trg_len, double alpha) { + return exp(-fabs(double(a_j - 1) / src_len - double(j) / trg_len) * alpha); + } + + static double ComputeZ(unsigned j, unsigned src_len, unsigned trg_len, double alpha) { + double z = 0; + for (int a_j = 1; a_j <= src_len; ++a_j) + z += UnnormalizedProb(a_j, j, src_len, trg_len, alpha); + return z; + } + + const double& GetOrComputeZ(unsigned j, unsigned src_len, unsigned trg_len) const { + if (src_len >= zcache_.size()) + zcache_.resize(src_len + 1); + if (trg_len >= 
zcache_[src_len].size()) + zcache_[src_len].resize(trg_len + 1); + std::vector<double>& zv = zcache_[src_len][trg_len]; + if (zv.size() == 0) + zv.resize(trg_len); + double& z = zv[j]; + if (!z) + z = ComputeZ(j, src_len, trg_len, alpha_); + return z; + } + + double alpha_; + prob_t pnull_; + prob_t pnotnull_; + mutable std::vector<std::vector<std::vector<double> > > zcache_; + typedef std::tr1::unordered_map<AlignmentObservation, int, boost::hash<AlignmentObservation> > ObsCount; + ObsCount obs_; +}; + +#endif diff --git a/gi/pf/reachability.cc b/gi/pf/reachability.cc index 73dd8d39..7d0d04ac 100644 --- a/gi/pf/reachability.cc +++ b/gi/pf/reachability.cc @@ -30,15 +30,17 @@ void Reachability::ComputeReachability(int srclen, int trglen, int src_max_phras } } a[0][0].clear(); - //cerr << "Final cell contains " << a[srclen][trglen].size() << " back pointers\n"; - if (a[srclen][trglen].size() == 0) { - cerr << "Sentence with length (" << srclen << ',' << trglen << ") violates reachability constraints\n"; + //cerr << srclen << "," << trglen << ": Final cell contains " << a[srclen][trglen].size() << " back pointers\n"; + if (a[srclen][trglen].empty()) { + cerr << "Sequence pair with lengths (" << srclen << ',' << trglen << ") violates reachability constraints\n"; + nodes = 0; return; } typedef boost::multi_array<bool, 2> rarray_type; rarray_type r(boost::extents[srclen + 1][trglen + 1]); r[srclen][trglen] = true; + nodes = 0; for (int i = srclen; i >= 0; --i) { for (int j = trglen; j >= 0; --j) { vector<SState>& prevs = a[i][j]; @@ -47,6 +49,7 @@ void Reachability::ComputeReachability(int srclen, int trglen, int src_max_phras r[prevs[k].prev_src_covered][prevs[k].prev_trg_covered] = true; int src_delta = i - prevs[k].prev_src_covered; edges[prevs[k].prev_src_covered][prevs[k].prev_trg_covered][src_delta][j - prevs[k].prev_trg_covered] = true; + valid_deltas[prevs[k].prev_src_covered][prevs[k].prev_trg_covered].push_back(make_pair<short,short>(src_delta,j - prevs[k].prev_trg_covered)); short &msd = max_src_delta[prevs[k].prev_src_covered][prevs[k].prev_trg_covered]; if (src_delta > msd) msd = src_delta; } @@ -56,9 +59,16 @@ void Reachability::ComputeReachability(int srclen, int trglen, int src_max_phras assert(!edges[0][0][0][1]); assert(!edges[0][0][0][0]); assert(max_src_delta[0][0] > 0); - //cerr << "First cell contains " << b[0][0].size() << " forward pointers\n"; - //for (int i = 0; i < b[0][0].size(); ++i) { - // cerr << " -> (" << b[0][0][i].next_src_covered << "," << b[0][0][i].next_trg_covered << ")\n"; - //} + nodes = 0; + for (int i = 0; i < srclen; ++i) { + for (int j = 0; j < trglen; ++j) { + if (valid_deltas[i][j].size() > 0) { + node_addresses[i][j] = nodes++; + } else { + node_addresses[i][j] = -1; + } + } + } + cerr << "Sequence pair with lengths (" << srclen << ',' << trglen << ") has " << valid_deltas[0][0].size() << " out edges in its root node, " << nodes << " nodes in total, and outside estimate matrix will require " << sizeof(float)*nodes << " bytes\n"; } diff --git a/gi/pf/reachability.h b/gi/pf/reachability.h index 98450ec1..1e22c76a 100644 --- a/gi/pf/reachability.h +++ b/gi/pf/reachability.h @@ -12,12 +12,18 @@ // currently forbids 0 -> n and n -> 0 alignments struct Reachability { - boost::multi_array<bool, 4> edges; // edges[src_covered][trg_covered][x][trg_delta] is this edge worth exploring? + unsigned nodes; + boost::multi_array<bool, 4> edges; // edges[src_covered][trg_covered][src_delta][trg_delta] is this edge worth exploring? 
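// Taken together, nodes (above) and the node_addresses / valid_deltas members
// below give every usable chart cell a dense integer id: node_addresses maps
// (src_covered, trg_covered) to that id (or -1 for cells with no outgoing
// transitions), and valid_deltas lists the (src_delta, trg_delta) jumps leaving
// the cell, so a per-node outside-estimate table can be stored as a flat array
// of `nodes` floats (the size reported by ComputeReachability's cerr message above).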
boost::multi_array<short, 2> max_src_delta; // msd[src_covered][trg_covered] -- the largest src delta that's valid + boost::multi_array<short, 2> node_addresses; // na[src_covered][trg_covered] -- the index of the node in a one-dimensional array (of size "nodes") + boost::multi_array<std::vector<std::pair<short,short> >, 2> valid_deltas; // valid_deltas[src_covered][trg_covered] list of valid transitions leaving a particular node Reachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len) : + nodes(), edges(boost::extents[srclen][trglen][src_max_phrase_len+1][trg_max_phrase_len+1]), - max_src_delta(boost::extents[srclen][trglen]) { + max_src_delta(boost::extents[srclen][trglen]), + node_addresses(boost::extents[srclen][trglen]), + valid_deltas(boost::extents[srclen][trglen]) { ComputeReachability(srclen, trglen, src_max_phrase_len, trg_max_phrase_len); } diff --git a/gi/pf/tied_resampler.h b/gi/pf/tied_resampler.h new file mode 100644 index 00000000..6f45fbce --- /dev/null +++ b/gi/pf/tied_resampler.h @@ -0,0 +1,124 @@ +#ifndef _TIED_RESAMPLER_H_ +#define _TIED_RESAMPLER_H_ + +#include <set> +#include <vector> +#include "sampler.h" +#include "slice_sampler.h" +#include "m.h" + +template <class CRP> +struct TiedResampler { + explicit TiedResampler(double da, double db, double ss, double sr, double d=0.5, double s=1.0) : + d_alpha(da), + d_beta(db), + s_shape(ss), + s_rate(sr), + discount(d), + strength(s) {} + + void Add(CRP* crp) { + crps.insert(crp); + crp->set_discount(discount); + crp->set_strength(strength); + assert(!crp->has_discount_prior()); + assert(!crp->has_strength_prior()); + } + + void Remove(CRP* crp) { + crps.erase(crp); + } + + size_t size() const { + return crps.size(); + } + + double LogLikelihood(double d, double s) const { + if (s <= -d) return -std::numeric_limits<double>::infinity(); + double llh = Md::log_beta_density(d, d_alpha, d_beta) + + Md::log_gamma_density(d + s, s_shape, s_rate); + for (typename std::set<CRP*>::iterator it = crps.begin(); it != crps.end(); ++it) + llh += (*it)->log_crp_prob(d, s); + return llh; + } + + double LogLikelihood() const { + return LogLikelihood(discount, strength); + } + + struct DiscountResampler { + DiscountResampler(const TiedResampler& m) : m_(m) {} + const TiedResampler& m_; + double operator()(const double& proposed_discount) const { + return m_.LogLikelihood(proposed_discount, m_.strength); + } + }; + + struct AlphaResampler { + AlphaResampler(const TiedResampler& m) : m_(m) {} + const TiedResampler& m_; + double operator()(const double& proposed_strength) const { + return m_.LogLikelihood(m_.discount, proposed_strength); + } + }; + + void ResampleHyperparameters(MT19937* rng, const unsigned nloop = 5, const unsigned niterations = 10) { + if (size() == 0) { std::cerr << "EMPTY - not resampling\n"; return; } + const DiscountResampler dr(*this); + const AlphaResampler ar(*this); + for (int iter = 0; iter < nloop; ++iter) { + strength = slice_sampler1d(ar, strength, *rng, -discount + std::numeric_limits<double>::min(), + std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations); + double min_discount = std::numeric_limits<double>::min(); + if (strength < 0.0) min_discount -= strength; + discount = slice_sampler1d(dr, discount, *rng, min_discount, + 1.0, 0.0, niterations, 100*niterations); + } + strength = slice_sampler1d(ar, strength, *rng, -discount + std::numeric_limits<double>::min(), + std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations); + std::cerr << 
"TiedCRPs(d=" << discount << ",s=" + << strength << ") = " << LogLikelihood(discount, strength) << std::endl; + for (typename std::set<CRP*>::iterator it = crps.begin(); it != crps.end(); ++it) { + (*it)->set_discount(discount); + (*it)->set_strength(strength); + } + } + private: + std::set<CRP*> crps; + const double d_alpha, d_beta, s_shape, s_rate; + double discount, strength; +}; + +// split according to some criterion +template <class CRP> +struct BinTiedResampler { + explicit BinTiedResampler(unsigned nbins) : + resamplers(nbins, TiedResampler<CRP>(1,1,1,1)) {} + + void Add(unsigned bin, CRP* crp) { + resamplers[bin].Add(crp); + } + + void Remove(unsigned bin, CRP* crp) { + resamplers[bin].Remove(crp); + } + + void ResampleHyperparameters(MT19937* rng) { + for (unsigned i = 0; i < resamplers.size(); ++i) { + std::cerr << "BIN " << i << " (" << resamplers[i].size() << " CRPs): " << std::flush; + resamplers[i].ResampleHyperparameters(rng); + } + } + + double LogLikelihood() const { + double llh = 0; + for (unsigned i = 0; i < resamplers.size(); ++i) + llh += resamplers[i].LogLikelihood(); + return llh; + } + + private: + std::vector<TiedResampler<CRP> > resamplers; +}; + +#endif diff --git a/gi/pf/transliterations.cc b/gi/pf/transliterations.cc new file mode 100644 index 00000000..2200715e --- /dev/null +++ b/gi/pf/transliterations.cc @@ -0,0 +1,334 @@ +#include "transliterations.h" + +#include <iostream> +#include <vector> + +#include "boost/shared_ptr.hpp" + +#include "backward.h" +#include "filelib.h" +#include "tdict.h" +#include "trule.h" +#include "filelib.h" +#include "ccrp_nt.h" +#include "m.h" +#include "reachability.h" + +using namespace std; +using namespace std::tr1; + +struct TruncatedConditionalLengthModel { + TruncatedConditionalLengthModel(unsigned max_src_size, unsigned max_trg_size, double expected_src_to_trg_ratio) : + plens(max_src_size+1, vector<prob_t>(max_trg_size+1, 0.0)) { + for (unsigned i = 1; i <= max_src_size; ++i) { + prob_t z = prob_t::Zero(); + for (unsigned j = 1; j <= max_trg_size; ++j) + z += (plens[i][j] = prob_t(0.01 + exp(Md::log_poisson(j, i * expected_src_to_trg_ratio)))); + for (unsigned j = 1; j <= max_trg_size; ++j) + plens[i][j] /= z; + //for (unsigned j = 1; j <= max_trg_size; ++j) + // cerr << "P(trg_len=" << j << " | src_len=" << i << ") = " << plens[i][j] << endl; + } + } + + // return p(tlen | slen) for *chunks* not full words + inline const prob_t& operator()(int slen, int tlen) const { + return plens[slen][tlen]; + } + + vector<vector<prob_t> > plens; +}; + +struct CondBaseDist { + CondBaseDist(unsigned max_src_size, unsigned max_trg_size, double expected_src_to_trg_ratio) : + tclm(max_src_size, max_trg_size, expected_src_to_trg_ratio) {} + + prob_t operator()(const vector<WordID>& src, unsigned sf, unsigned st, + const vector<WordID>& trg, unsigned tf, unsigned tt) const { + prob_t p = tclm(st - sf, tt - tf); // target len | source length ~ TCLM(source len) + assert(!"not impl"); + return p; + } + inline prob_t operator()(const vector<WordID>& src, const vector<WordID>& trg) const { + return (*this)(src, 0, src.size(), trg, 0, trg.size()); + } + TruncatedConditionalLengthModel tclm; +}; + +// represents transliteration phrase probabilities, e.g. +// p( a l - | A l ) , p( o | A w ) , ... 
+struct TransliterationChunkConditionalModel { + explicit TransliterationChunkConditionalModel(const CondBaseDist& pp0) : + d(0.0), + strength(1.0), + rp0(pp0) { + } + + void Summary() const { + std::cerr << "Number of conditioning contexts: " << r.size() << std::endl; + for (RuleModelHash::const_iterator it = r.begin(); it != r.end(); ++it) { + std::cerr << TD::GetString(it->first) << " \t(\\alpha = " << it->second.alpha() << ") --------------------------" << std::endl; + for (CCRP_NoTable<TRule>::const_iterator i2 = it->second.begin(); i2 != it->second.end(); ++i2) + std::cerr << " " << i2->second << '\t' << i2->first << std::endl; + } + } + + int DecrementRule(const TRule& rule) { + RuleModelHash::iterator it = r.find(rule.f_); + assert(it != r.end()); + int count = it->second.decrement(rule); + if (count) { + if (it->second.num_customers() == 0) r.erase(it); + } + return count; + } + + int IncrementRule(const TRule& rule) { + RuleModelHash::iterator it = r.find(rule.f_); + if (it == r.end()) { + it = r.insert(make_pair(rule.f_, CCRP_NoTable<TRule>(strength))).first; + } + int count = it->second.increment(rule); + return count; + } + + void IncrementRules(const std::vector<TRulePtr>& rules) { + for (int i = 0; i < rules.size(); ++i) + IncrementRule(*rules[i]); + } + + void DecrementRules(const std::vector<TRulePtr>& rules) { + for (int i = 0; i < rules.size(); ++i) + DecrementRule(*rules[i]); + } + + prob_t RuleProbability(const TRule& rule) const { + prob_t p; + RuleModelHash::const_iterator it = r.find(rule.f_); + if (it == r.end()) { + p = rp0(rule.f_, rule.e_); + } else { + p = it->second.prob(rule, rp0(rule.f_, rule.e_)); + } + return p; + } + + double LogLikelihood(const double& dd, const double& aa) const { + if (aa <= -dd) return -std::numeric_limits<double>::infinity(); + //double llh = Md::log_beta_density(dd, 10, 3) + Md::log_gamma_density(aa, 1, 1); + double llh = //Md::log_beta_density(dd, 1, 1) + + Md::log_gamma_density(dd + aa, 1, 1); + typename std::tr1::unordered_map<std::vector<WordID>, CCRP_NoTable<TRule>, boost::hash<std::vector<WordID> > >::const_iterator it; + for (it = r.begin(); it != r.end(); ++it) + llh += it->second.log_crp_prob(aa); + return llh; + } + + struct AlphaResampler { + AlphaResampler(const TransliterationChunkConditionalModel& m) : m_(m) {} + const TransliterationChunkConditionalModel& m_; + double operator()(const double& proposed_strength) const { + return m_.LogLikelihood(m_.d, proposed_strength); + } + }; + + void ResampleHyperparameters(MT19937* rng) { + typename std::tr1::unordered_map<std::vector<WordID>, CCRP_NoTable<TRule>, boost::hash<std::vector<WordID> > >::iterator it; + //const unsigned nloop = 5; + const unsigned niterations = 10; + //DiscountResampler dr(*this); + AlphaResampler ar(*this); +#if 0 + for (int iter = 0; iter < nloop; ++iter) { + strength = slice_sampler1d(ar, strength, *rng, -d + std::numeric_limits<double>::min(), + std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations); + double min_discount = std::numeric_limits<double>::min(); + if (strength < 0.0) min_discount -= strength; + d = slice_sampler1d(dr, d, *rng, min_discount, + 1.0, 0.0, niterations, 100*niterations); + } +#endif + strength = slice_sampler1d(ar, strength, *rng, -d, + std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations); + std::cerr << "CTMModel(alpha=" << strength << ") = " << LogLikelihood(d, strength) << std::endl; + for (it = r.begin(); it != r.end(); ++it) { +#if 0 + it->second.set_discount(d); +#endif + 
it->second.set_alpha(strength); + } + } + + prob_t Likelihood() const { + prob_t p; p.logeq(LogLikelihood(d, strength)); + return p; + } + + const CondBaseDist& rp0; + typedef std::tr1::unordered_map<std::vector<WordID>, + CCRP_NoTable<TRule>, + boost::hash<std::vector<WordID> > > RuleModelHash; + RuleModelHash r; + double d, strength; +}; + +struct GraphStructure { + GraphStructure() : r() {} + // leak memory - these are basically static + const Reachability* r; + bool IsReachable() const { return r->nodes > 0; } +}; + +struct ProbabilityEstimates { + ProbabilityEstimates() : gs(), backward() {} + explicit ProbabilityEstimates(const GraphStructure& g) : + gs(&g), backward() { + if (g.r->nodes > 0) + backward = new float[g.r->nodes]; + } + // leak memory, these are static + + // returns an estimate of the marginal probability + double MarginalEstimate() const { + if (!backward) return 0; + return backward[0]; + } + + // returns an backward estimate + double Backward(int src_covered, int trg_covered) const { + if (!backward) return 0; + int ind = gs->r->node_addresses[src_covered][trg_covered]; + if (ind < 0) return 0; + return backward[ind]; + } + + prob_t estp; + float* backward; + private: + const GraphStructure* gs; +}; + +struct TransliterationsImpl { + TransliterationsImpl(int max_src, int max_trg, double sr, const BackwardEstimator& b) : + cp0(max_src, max_trg, sr), + tccm(cp0), + be(b), + kMAX_SRC_CHUNK(max_src), + kMAX_TRG_CHUNK(max_trg), + kS2T_RATIO(sr), + tot_pairs(), tot_mem() { + } + const CondBaseDist cp0; + TransliterationChunkConditionalModel tccm; + const BackwardEstimator& be; + + void Initialize(WordID src, const vector<WordID>& src_lets, WordID trg, const vector<WordID>& trg_lets) { + const size_t src_len = src_lets.size(); + const size_t trg_len = trg_lets.size(); + + // init graph structure + if (src_len >= graphs.size()) graphs.resize(src_len + 1); + if (trg_len >= graphs[src_len].size()) graphs[src_len].resize(trg_len + 1); + GraphStructure& gs = graphs[src_len][trg_len]; + if (!gs.r) { + double rat = exp(fabs(log(trg_len / (src_len * kS2T_RATIO)))); + if (rat > 1.5 || (rat > 2.4 && src_len < 6)) { + cerr << " ** Forbidding transliterations of size " << src_len << "," << trg_len << ": " << rat << endl; + gs.r = new Reachability(src_len, trg_len, 0, 0); + } else { + gs.r = new Reachability(src_len, trg_len, kMAX_SRC_CHUNK, kMAX_TRG_CHUNK); + } + } + + const Reachability& r = *gs.r; + + // init backward estimates + if (src >= ests.size()) ests.resize(src + 1); + unordered_map<WordID, ProbabilityEstimates>::iterator it = ests[src].find(trg); + if (it != ests[src].end()) return; // already initialized + + it = ests[src].insert(make_pair(trg, ProbabilityEstimates(gs))).first; + ProbabilityEstimates& est = it->second; + if (!gs.r->nodes) return; // not derivable subject to length constraints + + be.InitializeGrid(src_lets, trg_lets, r, kS2T_RATIO, est.backward); + cerr << TD::GetString(src_lets) << " ||| " << TD::GetString(trg_lets) << " ||| " << (est.backward[0] / trg_lets.size()) << endl; + tot_pairs++; + tot_mem += sizeof(float) * gs.r->nodes; + } + + void Forbid(WordID src, const vector<WordID>& src_lets, WordID trg, const vector<WordID>& trg_lets) { + const size_t src_len = src_lets.size(); + const size_t trg_len = trg_lets.size(); + // TODO + } + + prob_t EstimateProbability(WordID s, const vector<WordID>& src, WordID t, const vector<WordID>& trg) const { + assert(src.size() < graphs.size()); + const vector<GraphStructure>& tv = graphs[src.size()]; + 
assert(trg.size() < tv.size()); + const GraphStructure& gs = tv[trg.size()]; + if (gs.r->nodes == 0) + return prob_t::Zero(); + const unordered_map<WordID, ProbabilityEstimates>::const_iterator it = ests[s].find(t); + assert(it != ests[s].end()); + return it->second.estp; + } + + void GraphSummary() const { + double to = 0; + double tn = 0; + double tt = 0; + for (int i = 0; i < graphs.size(); ++i) { + const vector<GraphStructure>& vt = graphs[i]; + for (int j = 0; j < vt.size(); ++j) { + const GraphStructure& gs = vt[j]; + if (!gs.r) continue; + tt++; + for (int k = 0; k < i; ++k) { + for (int l = 0; l < j; ++l) { + size_t c = gs.r->valid_deltas[k][l].size(); + if (c) { + tn += 1; + to += c; + } + } + } + } + } + cerr << " Average nodes = " << (tn / tt) << endl; + cerr << "Average out-degree = " << (to / tn) << endl; + cerr << " Unique structures = " << tt << endl; + cerr << " Unique pairs = " << tot_pairs << endl; + cerr << " BEs size = " << (tot_mem / (1024.0*1024.0)) << " MB" << endl; + } + + const int kMAX_SRC_CHUNK; + const int kMAX_TRG_CHUNK; + const double kS2T_RATIO; + unsigned tot_pairs; + size_t tot_mem; + vector<vector<GraphStructure> > graphs; // graphs[src_len][trg_len] + vector<unordered_map<WordID, ProbabilityEstimates> > ests; // ests[src][trg] +}; + +Transliterations::Transliterations(int max_src, int max_trg, double sr, const BackwardEstimator& be) : + pimpl_(new TransliterationsImpl(max_src, max_trg, sr, be)) {} +Transliterations::~Transliterations() { delete pimpl_; } + +void Transliterations::Initialize(WordID src, const vector<WordID>& src_lets, WordID trg, const vector<WordID>& trg_lets) { + pimpl_->Initialize(src, src_lets, trg, trg_lets); +} + +prob_t Transliterations::EstimateProbability(WordID s, const vector<WordID>& src, WordID t, const vector<WordID>& trg) const { + return pimpl_->EstimateProbability(s, src,t, trg); +} + +void Transliterations::Forbid(WordID src, const vector<WordID>& src_lets, WordID trg, const vector<WordID>& trg_lets) { + pimpl_->Forbid(src, src_lets, trg, trg_lets); +} + +void Transliterations::GraphSummary() const { + pimpl_->GraphSummary(); +} + diff --git a/gi/pf/transliterations.h b/gi/pf/transliterations.h new file mode 100644 index 00000000..49d14684 --- /dev/null +++ b/gi/pf/transliterations.h @@ -0,0 +1,24 @@ +#ifndef _TRANSLITERATIONS_H_ +#define _TRANSLITERATIONS_H_ + +#include <vector> +#include "wordid.h" +#include "prob.h" + +struct BackwardEstimator; +struct TransliterationsImpl; +struct Transliterations { + // max_src and max_trg indicate how big the transliteration phrases can be + // see reachability.h for information about filter_ratio + explicit Transliterations(int max_src, int max_trg, double s2t_rat, const BackwardEstimator& be); + ~Transliterations(); + void Initialize(WordID src, const std::vector<WordID>& src_lets, WordID trg, const std::vector<WordID>& trg_lets); + void Forbid(WordID src, const std::vector<WordID>& src_lets, WordID trg, const std::vector<WordID>& trg_lets); + void GraphSummary() const; + prob_t EstimateProbability(WordID s, const std::vector<WordID>& src, WordID t, const std::vector<WordID>& trg) const; + private: + TransliterationsImpl* pimpl_; +}; + +#endif + diff --git a/gi/pf/unigrams.cc b/gi/pf/unigrams.cc new file mode 100644 index 00000000..40829775 --- /dev/null +++ b/gi/pf/unigrams.cc @@ -0,0 +1,80 @@ +#include "unigrams.h" + +#include <string> +#include <cmath> + +#include "stringlib.h" +#include "filelib.h" + +using namespace std; + +void UnigramModel::LoadUnigrams(const string& fname) 
{ + cerr << "Loading unigram probabilities from " << fname << " ..." << endl; + ReadFile rf(fname); + string line; + istream& in = *rf.stream(); + assert(in); + getline(in, line); + assert(line.empty()); + getline(in, line); + assert(line == "\\data\\"); + getline(in, line); + size_t pos = line.find("ngram 1="); + assert(pos == 0); + assert(line.size() > 8); + const size_t num_unigrams = atoi(&line[8]); + getline(in, line); + assert(line.empty()); + getline(in, line); + assert(line == "\\1-grams:"); + for (size_t i = 0; i < num_unigrams; ++i) { + getline(in, line); + assert(line.size() > 0); + pos = line.find('\t'); + assert(pos > 0); + assert(pos + 1 < line.size()); + const WordID w = TD::Convert(line.substr(pos + 1)); + line[pos] = 0; + float p = atof(&line[0]); + if (w < probs_.size()) probs_[w].logeq(p * log(10)); else cerr << "WARNING: don't know about '" << TD::Convert(w) << "'\n"; + } +} + +void UnigramWordModel::LoadUnigrams(const string& fname) { + cerr << "Loading unigram probabilities from " << fname << " ..." << endl; + ReadFile rf(fname); + string line; + istream& in = *rf.stream(); + assert(in); + getline(in, line); + assert(line.empty()); + getline(in, line); + assert(line == "\\data\\"); + getline(in, line); + size_t pos = line.find("ngram 1="); + assert(pos == 0); + assert(line.size() > 8); + const size_t num_unigrams = atoi(&line[8]); + getline(in, line); + assert(line.empty()); + getline(in, line); + assert(line == "\\1-grams:"); + for (size_t i = 0; i < num_unigrams; ++i) { + getline(in, line); + assert(line.size() > 0); + pos = line.find('\t'); + assert(pos > 0); + assert(pos + 1 < line.size()); + size_t cur = pos + 1; + vector<WordID> w; + while (cur < line.size()) { + const size_t len = UTF8Len(line[cur]); + w.push_back(TD::Convert(line.substr(cur, len))); + cur += len; + } + line[pos] = 0; + float p = atof(&line[0]); + probs_[w].logeq(p * log(10.0)); + } +} + diff --git a/gi/pf/unigrams.h b/gi/pf/unigrams.h new file mode 100644 index 00000000..1660d1ed --- /dev/null +++ b/gi/pf/unigrams.h @@ -0,0 +1,69 @@ +#ifndef _UNIGRAMS_H_ +#define _UNIGRAMS_H_ + +#include <vector> +#include <string> +#include <tr1/unordered_map> +#include <boost/functional.hpp> + +#include "wordid.h" +#include "prob.h" +#include "tdict.h" + +struct UnigramModel { + explicit UnigramModel(const std::string& fname, unsigned vocab_size) : + use_uniform_(fname.size() == 0), + uniform_(1.0 / vocab_size), + probs_() { + if (fname.size() > 0) { + probs_.resize(TD::NumWords() + 1); + LoadUnigrams(fname); + } + } + + const prob_t& operator()(const WordID& w) const { + assert(w); + if (use_uniform_) return uniform_; + return probs_[w]; + } + + private: + void LoadUnigrams(const std::string& fname); + + const bool use_uniform_; + const prob_t uniform_; + std::vector<prob_t> probs_; +}; + + +// reads an ARPA unigram file and converts words like 'cat' into a string 'c a t' +struct UnigramWordModel { + explicit UnigramWordModel(const std::string& fname) : + use_uniform_(false), + uniform_(1.0), + probs_() { + LoadUnigrams(fname); + } + + explicit UnigramWordModel(const unsigned vocab_size) : + use_uniform_(true), + uniform_(1.0 / vocab_size), + probs_() {} + + const prob_t& operator()(const std::vector<WordID>& s) const { + if (use_uniform_) return uniform_; + const VectorProbHash::const_iterator it = probs_.find(s); + assert(it != probs_.end()); + return it->second; + } + + private: + void LoadUnigrams(const std::string& fname); + + const bool use_uniform_; + const prob_t uniform_; + typedef 
std::tr1::unordered_map<std::vector<WordID>, prob_t, boost::hash<std::vector<WordID> > > VectorProbHash;
+  VectorProbHash probs_;
+};
+
+#endif
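
A minimal usage sketch for the new TiedResampler<CRP> template (not part of the commit): it assumes utils/ccrp.h provides a CCRP<Dish> with the set_discount/set_strength/has_*_prior/log_crp_prob interface that TiedResampler calls above, and the (1,1,1,1) hyperprior arguments simply mirror the Beta/Gamma priors used elsewhere in this directory.

#include "ccrp.h"
#include "sampler.h"
#include "tied_resampler.h"

void ExampleTiedResampling(MT19937* rng) {
  TiedResampler<CCRP<int> > tied(1.0, 1.0, 1.0, 1.0);  // Beta(1,1) on discount, Gamma(1,1) on discount+strength
  CCRP<int> crp_a(0.5, 1.0), crp_b(0.5, 1.0);          // (discount, strength), no per-CRP priors attached
  tied.Add(&crp_a);                                    // both CRPs now share a single (d, s) pair
  tied.Add(&crp_b);
  // ... increment/decrement customers in crp_a and crp_b during Gibbs sampling ...
  tied.ResampleHyperparameters(rng);                   // slice-samples (d, s) and writes it back into both CRPs
}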
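
A hypothetical driver sketch for the Transliterations interface declared above (not part of the commit): the chunk sizes (4, 4) and the source-to-target length ratio 1.0 are illustrative values, and the BackwardEstimator is taken as a parameter because its constructor lives in backward.h and is not shown here.

#include <vector>
#include "transliterations.h"

prob_t ExampleTransliteration(const BackwardEstimator& be,
                              WordID src, const std::vector<WordID>& src_lets,
                              WordID trg, const std::vector<WordID>& trg_lets) {
  Transliterations tl(4, 4, 1.0, be);           // max chunk sizes, expected src:trg length ratio
  tl.Initialize(src, src_lets, trg, trg_lets);  // builds the reachability graph and backward estimates
  tl.GraphSummary();                            // prints graph statistics to stderr
  return tl.EstimateProbability(src, src_lets, trg, trg_lets);
}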
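
A lookup sketch for UnigramWordModel (not part of the commit): "letters.arpa" is a placeholder filename for an ARPA unigram file over letter sequences, in the exact format LoadUnigrams above expects.

#include <iostream>
#include <vector>
#include "tdict.h"
#include "unigrams.h"

void ExampleUnigramLookup() {
  UnigramWordModel letters("letters.arpa");  // keyed on letter-ID sequences such as "c a t"
  std::vector<WordID> cat;
  cat.push_back(TD::Convert("c"));
  cat.push_back(TD::Convert("a"));
  cat.push_back(TD::Convert("t"));
  const prob_t& p = letters(cat);            // asserts the sequence appeared in the ARPA file
  std::cerr << "p(c a t) = " << p << std::endl;
}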