diff options
author | Chris Dyer <cdyer@cs.cmu.edu> | 2011-10-12 14:57:15 +0100 |
---|---|---|
committer | Chris Dyer <cdyer@cs.cmu.edu> | 2011-10-12 14:57:15 +0100 |
commit | ee84ab027c0be54800cac0c9bff62dd097354f6d (patch) | |
tree | 30344f4307d3efbafb3276807a6879c4eeeab173 /gi/pf | |
parent | a32cd0131c6325e364c82e5f6bbefc03b61e437f (diff) |
model lenght properly, clean up
Diffstat (limited to 'gi/pf')
-rw-r--r-- | gi/pf/Makefile.am | 2 | ||||
-rw-r--r-- | gi/pf/corpus.cc | 57 | ||||
-rw-r--r-- | gi/pf/corpus.h | 19 | ||||
-rw-r--r-- | gi/pf/dpnaive.cc | 95 | ||||
-rw-r--r-- | gi/pf/monotonic_pseg.h | 88 | ||||
-rw-r--r-- | gi/pf/pfnaive.cc | 116 |
6 files changed, 202 insertions, 175 deletions
diff --git a/gi/pf/Makefile.am b/gi/pf/Makefile.am index c9764ad5..42758939 100644 --- a/gi/pf/Makefile.am +++ b/gi/pf/Makefile.am @@ -1,7 +1,7 @@ bin_PROGRAMS = cbgi brat dpnaive pfbrat pfdist itg pfnaive noinst_LIBRARIES = libpf.a -libpf_a_SOURCES = base_measures.cc reachability.cc cfg_wfst_composer.cc +libpf_a_SOURCES = base_measures.cc reachability.cc cfg_wfst_composer.cc corpus.cc itg_SOURCES = itg.cc diff --git a/gi/pf/corpus.cc b/gi/pf/corpus.cc new file mode 100644 index 00000000..a408e7cf --- /dev/null +++ b/gi/pf/corpus.cc @@ -0,0 +1,57 @@ +#include "corpus.h" + +#include <set> +#include <vector> +#include <string> + +#include "tdict.h" +#include "filelib.h" + +using namespace std; + +namespace corpus { + +void ReadParallelCorpus(const string& filename, + vector<vector<WordID> >* f, + vector<vector<WordID> >* e, + set<WordID>* vocab_f, + set<WordID>* vocab_e) { + f->clear(); + e->clear(); + vocab_f->clear(); + vocab_e->clear(); + ReadFile rf(filename); + istream* in = rf.stream(); + assert(*in); + string line; + const WordID kDIV = TD::Convert("|||"); + vector<WordID> tmp; + while(*in) { + getline(*in, line); + if (line.empty() && !*in) break; + e->push_back(vector<int>()); + f->push_back(vector<int>()); + vector<int>& le = e->back(); + vector<int>& lf = f->back(); + tmp.clear(); + TD::ConvertSentence(line, &tmp); + bool isf = true; + for (unsigned i = 0; i < tmp.size(); ++i) { + const int cur = tmp[i]; + if (isf) { + if (kDIV == cur) { isf = false; } else { + lf.push_back(cur); + vocab_f->insert(cur); + } + } else { + assert(cur != kDIV); + le.push_back(cur); + vocab_e->insert(cur); + } + } + assert(isf == false); + } +} + +} + diff --git a/gi/pf/corpus.h b/gi/pf/corpus.h new file mode 100644 index 00000000..e7febdb7 --- /dev/null +++ b/gi/pf/corpus.h @@ -0,0 +1,19 @@ +#ifndef _CORPUS_H_ +#define _CORPUS_H_ + +#include <string> +#include <vector> +#include <set> +#include "wordid.h" + +namespace corpus { + +void ReadParallelCorpus(const std::string& filename, + std::vector<std::vector<WordID> >* f, + std::vector<std::vector<WordID> >* e, + std::set<WordID>* vocab_f, + std::set<WordID>* vocab_e); + +} + +#endif diff --git a/gi/pf/dpnaive.cc b/gi/pf/dpnaive.cc index 608f73d5..c926487b 100644 --- a/gi/pf/dpnaive.cc +++ b/gi/pf/dpnaive.cc @@ -7,12 +7,14 @@ #include <boost/program_options/variables_map.hpp> #include "base_measures.h" +#include "monotonic_pseg.h" #include "trule.h" #include "tdict.h" #include "filelib.h" #include "dict.h" #include "sampler.h" #include "ccrp_nt.h" +#include "corpus.h" using namespace std; using namespace std::tr1; @@ -52,57 +54,12 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) { } } -void ReadParallelCorpus(const string& filename, - vector<vector<WordID> >* f, - vector<vector<int> >* e, - set<int>* vocab_e, - set<int>* vocab_f) { - f->clear(); - e->clear(); - vocab_f->clear(); - vocab_e->clear(); - istream* in; - if (filename == "-") - in = &cin; - else - in = new ifstream(filename.c_str()); - assert(*in); - string line; - const WordID kDIV = TD::Convert("|||"); - vector<WordID> tmp; - while(*in) { - getline(*in, line); - if (line.empty() && !*in) break; - e->push_back(vector<int>()); - f->push_back(vector<int>()); - vector<int>& le = e->back(); - vector<int>& lf = f->back(); - tmp.clear(); - TD::ConvertSentence(line, &tmp); - bool isf = true; - for (unsigned i = 0; i < tmp.size(); ++i) { - const int cur = tmp[i]; - if (isf) { - if (kDIV == cur) { isf = false; } else { - lf.push_back(cur); - vocab_f->insert(cur); - } - } else { - assert(cur != kDIV); - le.push_back(cur); - vocab_e->insert(cur); - } - } - assert(isf == false); - } - if (in != &cin) delete in; -} - shared_ptr<MT19937> prng; template <typename Base> struct ModelAndData { - explicit ModelAndData(const Base& b, const vector<vector<int> >& ce, const vector<vector<int> >& cf, const set<int>& ve, const set<int>& vf) : + explicit ModelAndData(MonotonicParallelSegementationModel& m, const Base& b, const vector<vector<int> >& ce, const vector<vector<int> >& cf, const set<int>& ve, const set<int>& vf) : + model(m), rng(&*prng), p0(b), baseprob(prob_t::One()), @@ -110,14 +67,12 @@ struct ModelAndData { corpusf(cf), vocabe(ve), vocabf(vf), - rules(1,1), mh_samples(), mh_rejects(), kX(-TD::Convert("X")), derivations(corpuse.size()) {} void ResampleHyperparameters() { - rules.resample_hyperparameters(&*prng); } void InstantiateRule(const pair<short,short>& from, @@ -139,12 +94,10 @@ struct ModelAndData { TRule x; for (int i = 1; i < d.size(); ++i) { InstantiateRule(d[i], d[i-1], sentf, sente, &x); - //cerr << "REMOVE: " << x.AsString() << endl; - if (rules.decrement(x)) { - baseprob /= p0(x); - //cerr << " (REMOVED ONLY INSTANCE)\n"; - } + model.DecrementRule(x); + model.DecrementContinue(); } + model.DecrementStop(); } void PrintDerivation(const vector<pair<short,short> >& d, const vector<int>& sentf, const vector<int>& sente) { @@ -161,39 +114,38 @@ struct ModelAndData { TRule x; for (int i = 1; i < d.size(); ++i) { InstantiateRule(d[i], d[i-1], sentf, sente, &x); - if (rules.increment(x)) { - baseprob *= p0(x); - } + model.IncrementRule(x); + model.IncrementContinue(); } + model.IncrementStop(); } prob_t Likelihood() const { - prob_t p; - p.logeq(rules.log_crp_prob()); - return p * baseprob; + return model.Likelihood(); } prob_t DerivationProposalProbability(const vector<pair<short,short> >& d, const vector<int>& sentf, const vector<int>& sente) const { - prob_t p = prob_t::One(); + prob_t p = model.StopProbability(); if (d.size() < 2) return p; TRule x; + const prob_t p_cont = model.ContinueProbability(); for (int i = 1; i < d.size(); ++i) { InstantiateRule(d[i], d[i-1], sentf, sente, &x); - prob_t rp; rp.logeq(rules.logprob(x, log(p0(x)))); - p *= rp; + p *= p_cont; + p *= model.RuleProbability(x); } return p; } void Sample(); + MonotonicParallelSegementationModel& model; MT19937* rng; const Base& p0; prob_t baseprob; // cached value of generating the table table labels from p0 // this can't be used if we go to a hierarchical prior! const vector<vector<int> >& corpuse, corpusf; const set<int>& vocabe, vocabf; - CCRP_NoTable<TRule> rules; unsigned mh_samples, mh_rejects; const int kX; vector<vector<pair<short, short> > > derivations; @@ -201,8 +153,8 @@ struct ModelAndData { template <typename Base> void ModelAndData<Base>::Sample() { - unsigned MAXK = 4; - unsigned MAXL = 4; + unsigned MAXK = kMAX_SRC_PHRASE; + unsigned MAXL = kMAX_TRG_PHRASE; TRule x; x.lhs_ = -TD::Convert("X"); for (int samples = 0; samples < 1000; ++samples) { @@ -228,6 +180,8 @@ void ModelAndData<Base>::Sample() { boost::multi_array<prob_t, 2> a(boost::extents[sentf.size() + 1][sente.size() + 1]); boost::multi_array<prob_t, 4> trans(boost::extents[sentf.size() + 1][sente.size() + 1][MAXK][MAXL]); a[0][0] = prob_t::One(); + const prob_t q_stop = model.StopProbability(); + const prob_t q_cont = model.ContinueProbability(); for (int i = 0; i < sentf.size(); ++i) { for (int j = 0; j < sente.size(); ++j) { const prob_t src_a = a[i][j]; @@ -239,7 +193,9 @@ void ModelAndData<Base>::Sample() { for (int l = 1; l <= MAXL; ++l) { if (j + l > sente.size()) break; x.e_.push_back(sente[j + l - 1]); - trans[i][j][k - 1][l - 1].logeq(rules.logprob(x, log(p0(x)))); + const bool stop_now = ((j + l) == sente.size()) && ((i + k) == sentf.size()); + const prob_t& cp = stop_now ? q_stop : q_cont; + trans[i][j][k - 1][l - 1] = model.RuleProbability(x) * cp; a[i + k][j + l] += src_a * trans[i][j][k - 1][l - 1]; } } @@ -319,7 +275,7 @@ int main(int argc, char** argv) { vector<vector<int> > corpuse, corpusf; set<int> vocabe, vocabf; - ReadParallelCorpus(conf["input"].as<string>(), &corpusf, &corpuse, &vocabf, &vocabe); + corpus::ReadParallelCorpus(conf["input"].as<string>(), &corpusf, &corpuse, &vocabf, &vocabe); cerr << "f-Corpus size: " << corpusf.size() << " sentences\n"; cerr << "f-Vocabulary size: " << vocabf.size() << " types\n"; cerr << "f-Corpus size: " << corpuse.size() << " sentences\n"; @@ -328,8 +284,9 @@ int main(int argc, char** argv) { Model1 m1(conf["model1"].as<string>()); PhraseJointBase lp0(m1, conf["model1_interpolation_weight"].as<double>(), vocabe.size(), vocabf.size()); + MonotonicParallelSegementationModel m(lp0); - ModelAndData<PhraseJointBase> posterior(lp0, corpuse, corpusf, vocabe, vocabf); + ModelAndData<PhraseJointBase> posterior(m, lp0, corpuse, corpusf, vocabe, vocabf); posterior.Sample(); return 0; diff --git a/gi/pf/monotonic_pseg.h b/gi/pf/monotonic_pseg.h new file mode 100644 index 00000000..7e6af3fc --- /dev/null +++ b/gi/pf/monotonic_pseg.h @@ -0,0 +1,88 @@ +#ifndef _MONOTONIC_PSEG_H_ +#define _MONOTONIC_PSEG_H_ + +#include <vector> + +#include "prob.h" +#include "ccrp_nt.h" +#include "trule.h" +#include "base_measures.h" + +struct MonotonicParallelSegementationModel { + explicit MonotonicParallelSegementationModel(PhraseJointBase& rcp0) : + rp0(rcp0), base(prob_t::One()), rules(1,1), stop(1.0) {} + + void DecrementRule(const TRule& rule) { + if (rules.decrement(rule)) + base /= rp0(rule); + } + + void IncrementRule(const TRule& rule) { + if (rules.increment(rule)) + base *= rp0(rule); + } + + void IncrementRulesAndStops(const std::vector<TRulePtr>& rules) { + for (int i = 0; i < rules.size(); ++i) + IncrementRule(*rules[i]); + if (rules.size()) IncrementContinue(rules.size() - 1); + IncrementStop(); + } + + void DecrementRulesAndStops(const std::vector<TRulePtr>& rules) { + for (int i = 0; i < rules.size(); ++i) + DecrementRule(*rules[i]); + if (rules.size()) { + DecrementContinue(rules.size() - 1); + DecrementStop(); + } + } + + prob_t RuleProbability(const TRule& rule) const { + prob_t p; p.logeq(rules.logprob(rule, log(rp0(rule)))); + return p; + } + + prob_t Likelihood() const { + prob_t p = base; + prob_t q; q.logeq(rules.log_crp_prob()); + p *= q; + q.logeq(stop.log_crp_prob()); + p *= q; + return p; + } + + void IncrementStop() { + stop.increment(true); + } + + void IncrementContinue(int n = 1) { + for (int i = 0; i < n; ++i) + stop.increment(false); + } + + void DecrementStop() { + stop.decrement(true); + } + + void DecrementContinue(int n = 1) { + for (int i = 0; i < n; ++i) + stop.decrement(false); + } + + prob_t StopProbability() const { + return prob_t(stop.prob(true, 0.5)); + } + + prob_t ContinueProbability() const { + return prob_t(stop.prob(false, 0.5)); + } + + const PhraseJointBase& rp0; + prob_t base; + CCRP_NoTable<TRule> rules; + CCRP_NoTable<bool> stop; +}; + +#endif + diff --git a/gi/pf/pfnaive.cc b/gi/pf/pfnaive.cc index c30e7c4f..33dc08c3 100644 --- a/gi/pf/pfnaive.cc +++ b/gi/pf/pfnaive.cc @@ -7,6 +7,7 @@ #include <boost/program_options/variables_map.hpp> #include "base_measures.h" +#include "monotonic_pseg.h" #include "reachability.h" #include "viterbi.h" #include "hg.h" @@ -17,6 +18,7 @@ #include "sampler.h" #include "ccrp_nt.h" #include "ccrp_onetable.h" +#include "corpus.h" using namespace std; using namespace tr1; @@ -58,101 +60,6 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) { } } -void ReadParallelCorpus(const string& filename, - vector<vector<WordID> >* f, - vector<vector<WordID> >* e, - set<WordID>* vocab_f, - set<WordID>* vocab_e) { - f->clear(); - e->clear(); - vocab_f->clear(); - vocab_e->clear(); - istream* in; - if (filename == "-") - in = &cin; - else - in = new ifstream(filename.c_str()); - assert(*in); - string line; - const WordID kDIV = TD::Convert("|||"); - vector<WordID> tmp; - while(*in) { - getline(*in, line); - if (line.empty() && !*in) break; - e->push_back(vector<int>()); - f->push_back(vector<int>()); - vector<int>& le = e->back(); - vector<int>& lf = f->back(); - tmp.clear(); - TD::ConvertSentence(line, &tmp); - bool isf = true; - for (unsigned i = 0; i < tmp.size(); ++i) { - const int cur = tmp[i]; - if (isf) { - if (kDIV == cur) { isf = false; } else { - lf.push_back(cur); - vocab_f->insert(cur); - } - } else { - assert(cur != kDIV); - le.push_back(cur); - vocab_e->insert(cur); - } - } - assert(isf == false); - } - if (in != &cin) delete in; -} - -struct MyJointModel { - MyJointModel(PhraseJointBase& rcp0) : - rp0(rcp0), base(prob_t::One()), rules(1,1) {} - - void DecrementRule(const TRule& rule) { - if (rules.decrement(rule)) - base /= rp0(rule); - } - - void IncrementRule(const TRule& rule) { - if (rules.increment(rule)) - base *= rp0(rule); - } - - void IncrementRules(const vector<TRulePtr>& rules) { - for (int i = 0; i < rules.size(); ++i) - IncrementRule(*rules[i]); - } - - void DecrementRules(const vector<TRulePtr>& rules) { - for (int i = 0; i < rules.size(); ++i) - DecrementRule(*rules[i]); - } - - prob_t RuleProbability(const TRule& rule) const { - prob_t p; p.logeq(rules.logprob(rule, log(rp0(rule)))); - return p; - } - - prob_t Likelihood() const { - prob_t p = base; - prob_t q; q.logeq(rules.log_crp_prob()); - p *= q; - for (unsigned l = 1; l < src_jumps.size(); ++l) { - if (src_jumps[l].num_customers() > 0) { - prob_t q; - q.logeq(src_jumps[l].log_crp_prob()); - p *= q; - } - } - return p; - } - - const PhraseJointBase& rp0; - prob_t base; - CCRP_NoTable<TRule> rules; - vector<CCRP_NoTable<int> > src_jumps; -}; - struct BackwardEstimateSym { BackwardEstimateSym(const Model1& m1, const Model1& invm1, const vector<WordID>& src, const vector<WordID>& trg) : @@ -264,7 +171,7 @@ int main(int argc, char** argv) { vector<vector<WordID> > corpuse, corpusf; set<WordID> vocabe, vocabf; cerr << "Reading corpus...\n"; - ReadParallelCorpus(conf["input"].as<string>(), &corpusf, &corpuse, &vocabf, &vocabe); + corpus::ReadParallelCorpus(conf["input"].as<string>(), &corpusf, &corpuse, &vocabf, &vocabe); cerr << "F-corpus size: " << corpusf.size() << " sentences\t (" << vocabf.size() << " word types)\n"; cerr << "E-corpus size: " << corpuse.size() << " sentences\t (" << vocabe.size() << " word types)\n"; assert(corpusf.size() == corpuse.size()); @@ -273,13 +180,8 @@ int main(int argc, char** argv) { Model1 m1(conf["model1"].as<string>()); Model1 invm1(conf["inverse_model1"].as<string>()); -#if 0 - PhraseConditionalBase lp0(m1, conf["model1_interpolation_weight"].as<double>(), vocabe.size()); - MyConditionalModel m(lp0); -#else PhraseJointBase lp0(m1, conf["model1_interpolation_weight"].as<double>(), vocabe.size(), vocabf.size()); - MyJointModel m(lp0); -#endif + MonotonicParallelSegementationModel m(lp0); cerr << "Initializing reachability limits...\n"; vector<Particle> ps(corpusf.size()); @@ -296,7 +198,10 @@ int main(int argc, char** argv) { for (int ci = 0; ci < corpusf.size(); ++ci) { vector<int>& src = corpusf[ci]; vector<int>& trg = corpuse[ci]; - m.DecrementRules(ps[ci].rules); + m.DecrementRulesAndStops(ps[ci].rules); + const prob_t q_stop = m.StopProbability(); + const prob_t q_cont = m.ContinueProbability(); + cerr << "P(stop)=" << q_stop << "\tP(continue)=" <<q_cont << endl; BackwardEstimateSym be(m1, invm1, src, trg); const Reachability& r = reaches[ci]; @@ -336,7 +241,8 @@ int main(int argc, char** argv) { x.f_.push_back(src[i + j]); np.src_cov += x.f_.size(); np.trg_cov += x.e_.size(); - prob_t rp = m.RuleProbability(x); + const bool stop_now = (np.src_cov == src_len && np.trg_cov == trg_len); + prob_t rp = m.RuleProbability(x) * (stop_now ? q_stop : q_cont); np.gamma_last = rp; const prob_t u = pow(np.gamma_last * pow(be(np.src_cov, np.trg_cov), 1.2), 0.1); //cerr << "**rule=" << x << endl; @@ -363,7 +269,7 @@ int main(int argc, char** argv) { pfss.add(lps[i].weight); const int sampled = rng.SelectSample(pfss); ps[ci] = lps[sampled]; - m.IncrementRules(lps[sampled].rules); + m.IncrementRulesAndStops(lps[sampled].rules); for (int i = 0; i < lps[sampled].rules.size(); ++i) { cerr << "S:\t" << lps[sampled].rules[i]->AsString() << "\n"; } cerr << "tmp-LLH: " << log(m.Likelihood()) << endl; } |