From ee84ab027c0be54800cac0c9bff62dd097354f6d Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Wed, 12 Oct 2011 14:57:15 +0100 Subject: model lenght properly, clean up --- gi/pf/dpnaive.cc | 95 ++++++++++++++++---------------------------------------- 1 file changed, 26 insertions(+), 69 deletions(-) (limited to 'gi/pf/dpnaive.cc') diff --git a/gi/pf/dpnaive.cc b/gi/pf/dpnaive.cc index 608f73d5..c926487b 100644 --- a/gi/pf/dpnaive.cc +++ b/gi/pf/dpnaive.cc @@ -7,12 +7,14 @@ #include #include "base_measures.h" +#include "monotonic_pseg.h" #include "trule.h" #include "tdict.h" #include "filelib.h" #include "dict.h" #include "sampler.h" #include "ccrp_nt.h" +#include "corpus.h" using namespace std; using namespace std::tr1; @@ -52,57 +54,12 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) { } } -void ReadParallelCorpus(const string& filename, - vector >* f, - vector >* e, - set* vocab_e, - set* vocab_f) { - f->clear(); - e->clear(); - vocab_f->clear(); - vocab_e->clear(); - istream* in; - if (filename == "-") - in = &cin; - else - in = new ifstream(filename.c_str()); - assert(*in); - string line; - const WordID kDIV = TD::Convert("|||"); - vector tmp; - while(*in) { - getline(*in, line); - if (line.empty() && !*in) break; - e->push_back(vector()); - f->push_back(vector()); - vector& le = e->back(); - vector& lf = f->back(); - tmp.clear(); - TD::ConvertSentence(line, &tmp); - bool isf = true; - for (unsigned i = 0; i < tmp.size(); ++i) { - const int cur = tmp[i]; - if (isf) { - if (kDIV == cur) { isf = false; } else { - lf.push_back(cur); - vocab_f->insert(cur); - } - } else { - assert(cur != kDIV); - le.push_back(cur); - vocab_e->insert(cur); - } - } - assert(isf == false); - } - if (in != &cin) delete in; -} - shared_ptr prng; template struct ModelAndData { - explicit ModelAndData(const Base& b, const vector >& ce, const vector >& cf, const set& ve, const set& vf) : + explicit ModelAndData(MonotonicParallelSegementationModel& m, const Base& b, const vector >& ce, const vector >& cf, const set& ve, const set& vf) : + model(m), rng(&*prng), p0(b), baseprob(prob_t::One()), @@ -110,14 +67,12 @@ struct ModelAndData { corpusf(cf), vocabe(ve), vocabf(vf), - rules(1,1), mh_samples(), mh_rejects(), kX(-TD::Convert("X")), derivations(corpuse.size()) {} void ResampleHyperparameters() { - rules.resample_hyperparameters(&*prng); } void InstantiateRule(const pair& from, @@ -139,12 +94,10 @@ struct ModelAndData { TRule x; for (int i = 1; i < d.size(); ++i) { InstantiateRule(d[i], d[i-1], sentf, sente, &x); - //cerr << "REMOVE: " << x.AsString() << endl; - if (rules.decrement(x)) { - baseprob /= p0(x); - //cerr << " (REMOVED ONLY INSTANCE)\n"; - } + model.DecrementRule(x); + model.DecrementContinue(); } + model.DecrementStop(); } void PrintDerivation(const vector >& d, const vector& sentf, const vector& sente) { @@ -161,39 +114,38 @@ struct ModelAndData { TRule x; for (int i = 1; i < d.size(); ++i) { InstantiateRule(d[i], d[i-1], sentf, sente, &x); - if (rules.increment(x)) { - baseprob *= p0(x); - } + model.IncrementRule(x); + model.IncrementContinue(); } + model.IncrementStop(); } prob_t Likelihood() const { - prob_t p; - p.logeq(rules.log_crp_prob()); - return p * baseprob; + return model.Likelihood(); } prob_t DerivationProposalProbability(const vector >& d, const vector& sentf, const vector& sente) const { - prob_t p = prob_t::One(); + prob_t p = model.StopProbability(); if (d.size() < 2) return p; TRule x; + const prob_t p_cont = model.ContinueProbability(); for (int i = 1; i < d.size(); ++i) { InstantiateRule(d[i], d[i-1], sentf, sente, &x); - prob_t rp; rp.logeq(rules.logprob(x, log(p0(x)))); - p *= rp; + p *= p_cont; + p *= model.RuleProbability(x); } return p; } void Sample(); + MonotonicParallelSegementationModel& model; MT19937* rng; const Base& p0; prob_t baseprob; // cached value of generating the table table labels from p0 // this can't be used if we go to a hierarchical prior! const vector >& corpuse, corpusf; const set& vocabe, vocabf; - CCRP_NoTable rules; unsigned mh_samples, mh_rejects; const int kX; vector > > derivations; @@ -201,8 +153,8 @@ struct ModelAndData { template void ModelAndData::Sample() { - unsigned MAXK = 4; - unsigned MAXL = 4; + unsigned MAXK = kMAX_SRC_PHRASE; + unsigned MAXL = kMAX_TRG_PHRASE; TRule x; x.lhs_ = -TD::Convert("X"); for (int samples = 0; samples < 1000; ++samples) { @@ -228,6 +180,8 @@ void ModelAndData::Sample() { boost::multi_array a(boost::extents[sentf.size() + 1][sente.size() + 1]); boost::multi_array trans(boost::extents[sentf.size() + 1][sente.size() + 1][MAXK][MAXL]); a[0][0] = prob_t::One(); + const prob_t q_stop = model.StopProbability(); + const prob_t q_cont = model.ContinueProbability(); for (int i = 0; i < sentf.size(); ++i) { for (int j = 0; j < sente.size(); ++j) { const prob_t src_a = a[i][j]; @@ -239,7 +193,9 @@ void ModelAndData::Sample() { for (int l = 1; l <= MAXL; ++l) { if (j + l > sente.size()) break; x.e_.push_back(sente[j + l - 1]); - trans[i][j][k - 1][l - 1].logeq(rules.logprob(x, log(p0(x)))); + const bool stop_now = ((j + l) == sente.size()) && ((i + k) == sentf.size()); + const prob_t& cp = stop_now ? q_stop : q_cont; + trans[i][j][k - 1][l - 1] = model.RuleProbability(x) * cp; a[i + k][j + l] += src_a * trans[i][j][k - 1][l - 1]; } } @@ -319,7 +275,7 @@ int main(int argc, char** argv) { vector > corpuse, corpusf; set vocabe, vocabf; - ReadParallelCorpus(conf["input"].as(), &corpusf, &corpuse, &vocabf, &vocabe); + corpus::ReadParallelCorpus(conf["input"].as(), &corpusf, &corpuse, &vocabf, &vocabe); cerr << "f-Corpus size: " << corpusf.size() << " sentences\n"; cerr << "f-Vocabulary size: " << vocabf.size() << " types\n"; cerr << "f-Corpus size: " << corpuse.size() << " sentences\n"; @@ -328,8 +284,9 @@ int main(int argc, char** argv) { Model1 m1(conf["model1"].as()); PhraseJointBase lp0(m1, conf["model1_interpolation_weight"].as(), vocabe.size(), vocabf.size()); + MonotonicParallelSegementationModel m(lp0); - ModelAndData posterior(lp0, corpuse, corpusf, vocabe, vocabf); + ModelAndData posterior(m, lp0, corpuse, corpusf, vocabe, vocabf); posterior.Sample(); return 0; -- cgit v1.2.3