Diffstat (limited to 'gi')
-rw-r--r--  gi/pf/Makefile.am       |   2
-rw-r--r--  gi/pf/corpus.cc         |  57
-rw-r--r--  gi/pf/corpus.h          |  19
-rw-r--r--  gi/pf/dpnaive.cc        |  95
-rw-r--r--  gi/pf/monotonic_pseg.h  |  88
-rw-r--r--  gi/pf/pfnaive.cc        | 116
6 files changed, 202 insertions, 175 deletions
diff --git a/gi/pf/Makefile.am b/gi/pf/Makefile.am
index c9764ad5..42758939 100644
--- a/gi/pf/Makefile.am
+++ b/gi/pf/Makefile.am
@@ -1,7 +1,7 @@
 bin_PROGRAMS = cbgi brat dpnaive pfbrat pfdist itg pfnaive
 noinst_LIBRARIES = libpf.a
-libpf_a_SOURCES = base_measures.cc reachability.cc cfg_wfst_composer.cc
+libpf_a_SOURCES = base_measures.cc reachability.cc cfg_wfst_composer.cc corpus.cc
 itg_SOURCES = itg.cc
diff --git a/gi/pf/corpus.cc b/gi/pf/corpus.cc
new file mode 100644
index 00000000..a408e7cf
--- /dev/null
+++ b/gi/pf/corpus.cc
@@ -0,0 +1,57 @@
+#include "corpus.h"
+
+#include <set>
+#include <vector>
+#include <string>
+
+#include "tdict.h"
+#include "filelib.h"
+
+using namespace std;
+
+namespace corpus {
+
+void ReadParallelCorpus(const string& filename,
+                vector<vector<WordID> >* f,
+                vector<vector<WordID> >* e,
+                set<WordID>* vocab_f,
+                set<WordID>* vocab_e) {
+  f->clear();
+  e->clear();
+  vocab_f->clear();
+  vocab_e->clear();
+  ReadFile rf(filename);
+  istream* in = rf.stream();
+  assert(*in);
+  string line;
+  const WordID kDIV = TD::Convert("|||");
+  vector<WordID> tmp;
+  while(*in) {
+    getline(*in, line);
+    if (line.empty() && !*in) break;
+    e->push_back(vector<int>());
+    f->push_back(vector<int>());
+    vector<int>& le = e->back();
+    vector<int>& lf = f->back();
+    tmp.clear();
+    TD::ConvertSentence(line, &tmp);
+    bool isf = true;
+    for (unsigned i = 0; i < tmp.size(); ++i) {
+      const int cur = tmp[i];
+      if (isf) {
+        if (kDIV == cur) { isf = false; } else {
+          lf.push_back(cur);
+          vocab_f->insert(cur);
+        }
+      } else {
+        assert(cur != kDIV);
+        le.push_back(cur);
+        vocab_e->insert(cur);
+      }
+    }
+    assert(isf == false);
+  }
+}
+
+}
+
diff --git a/gi/pf/corpus.h b/gi/pf/corpus.h
new file mode 100644
index 00000000..e7febdb7
--- /dev/null
+++ b/gi/pf/corpus.h
@@ -0,0 +1,19 @@
+#ifndef _CORPUS_H_
+#define _CORPUS_H_
+
+#include <string>
+#include <vector>
+#include <set>
+#include "wordid.h"
+
+namespace corpus {
+
+void ReadParallelCorpus(const std::string& filename,
+                std::vector<std::vector<WordID> >* f,
+                std::vector<std::vector<WordID> >* e,
+                std::set<WordID>* vocab_f,
+                std::set<WordID>* vocab_e);
+
+}
+
+#endif
diff --git a/gi/pf/dpnaive.cc b/gi/pf/dpnaive.cc
index 608f73d5..c926487b 100644
--- a/gi/pf/dpnaive.cc
+++ b/gi/pf/dpnaive.cc
@@ -7,12 +7,14 @@
 #include <boost/program_options/variables_map.hpp>
 
 #include "base_measures.h"
+#include "monotonic_pseg.h"
 #include "trule.h"
 #include "tdict.h"
 #include "filelib.h"
 #include "dict.h"
 #include "sampler.h"
 #include "ccrp_nt.h"
+#include "corpus.h"
 
 using namespace std;
 using namespace std::tr1;
@@ -52,57 +54,12 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
   }
 }
 
-void ReadParallelCorpus(const string& filename,
-                vector<vector<WordID> >* f,
-                vector<vector<int> >* e,
-                set<int>* vocab_e,
-                set<int>* vocab_f) {
-  f->clear();
-  e->clear();
-  vocab_f->clear();
-  vocab_e->clear();
-  istream* in;
-  if (filename == "-")
-    in = &cin;
-  else
-    in = new ifstream(filename.c_str());
-  assert(*in);
-  string line;
-  const WordID kDIV = TD::Convert("|||");
-  vector<WordID> tmp;
-  while(*in) {
-    getline(*in, line);
-    if (line.empty() && !*in) break;
-    e->push_back(vector<int>());
-    f->push_back(vector<int>());
-    vector<int>& le = e->back();
-    vector<int>& lf = f->back();
-    tmp.clear();
-    TD::ConvertSentence(line, &tmp);
-    bool isf = true;
-    for (unsigned i = 0; i < tmp.size(); ++i) {
-      const int cur = tmp[i];
-      if (isf) {
-        if (kDIV == cur) { isf = false; } else {
-          lf.push_back(cur);
-          vocab_f->insert(cur);
-        }
-      } else {
-        assert(cur != kDIV);
-        le.push_back(cur);
-        vocab_e->insert(cur);
-      }
-    }
-    assert(isf == false);
-  }
-  if (in != &cin) delete in;
-}
-
 shared_ptr<MT19937> prng;
 
 template <typename Base>
 struct ModelAndData {
-  explicit ModelAndData(const Base& b, const vector<vector<int> >& ce, const vector<vector<int> >& cf, const set<int>& ve, const set<int>& vf) :
+  explicit ModelAndData(MonotonicParallelSegementationModel& m, const Base& b, const vector<vector<int> >& ce, const vector<vector<int> >& cf, const set<int>& ve, const set<int>& vf) :
+     model(m),
      rng(&*prng),
      p0(b),
      baseprob(prob_t::One()),
@@ -110,14 +67,12 @@ struct ModelAndData {
      corpusf(cf),
      vocabe(ve),
      vocabf(vf),
-     rules(1,1),
      mh_samples(),
      mh_rejects(),
      kX(-TD::Convert("X")),
      derivations(corpuse.size()) {}
 
   void ResampleHyperparameters() {
-    rules.resample_hyperparameters(&*prng);
   }
 
   void InstantiateRule(const pair<short,short>& from,
@@ -139,12 +94,10 @@ struct ModelAndData {
     TRule x;
     for (int i = 1; i < d.size(); ++i) {
       InstantiateRule(d[i], d[i-1], sentf, sente, &x);
-      //cerr << "REMOVE: " << x.AsString() << endl;
-      if (rules.decrement(x)) {
-        baseprob /= p0(x);
-        //cerr << "  (REMOVED ONLY INSTANCE)\n";
-      }
+      model.DecrementRule(x);
+      model.DecrementContinue();
     }
+    model.DecrementStop();
   }
 
   void PrintDerivation(const vector<pair<short,short> >& d, const vector<int>& sentf, const vector<int>& sente) {
@@ -161,39 +114,38 @@ struct ModelAndData {
     TRule x;
     for (int i = 1; i < d.size(); ++i) {
       InstantiateRule(d[i], d[i-1], sentf, sente, &x);
-      if (rules.increment(x)) {
-        baseprob *= p0(x);
-      }
+      model.IncrementRule(x);
+      model.IncrementContinue();
     }
+    model.IncrementStop();
   }
 
   prob_t Likelihood() const {
-    prob_t p;
-    p.logeq(rules.log_crp_prob());
-    return p * baseprob;
+    return model.Likelihood();
   }
 
   prob_t DerivationProposalProbability(const vector<pair<short,short> >& d, const vector<int>& sentf, const vector<int>& sente) const {
-    prob_t p = prob_t::One();
+    prob_t p = model.StopProbability();
     if (d.size() < 2) return p;
     TRule x;
+    const prob_t p_cont = model.ContinueProbability();
    for (int i = 1; i < d.size(); ++i) {
       InstantiateRule(d[i], d[i-1], sentf, sente, &x);
-      prob_t rp; rp.logeq(rules.logprob(x, log(p0(x))));
-      p *= rp;
+      p *= p_cont;
+      p *= model.RuleProbability(x);
     }
     return p;
   }
 
   void Sample();
 
+  MonotonicParallelSegementationModel& model;
   MT19937* rng;
   const Base& p0;
   prob_t baseprob; // cached value of generating the table table labels from p0
                    // this can't be used if we go to a hierarchical prior!
   const vector<vector<int> >& corpuse, corpusf;
   const set<int>& vocabe, vocabf;
-  CCRP_NoTable<TRule> rules;
   unsigned mh_samples, mh_rejects;
   const int kX;
   vector<vector<pair<short, short> > > derivations;
@@ -201,8 +153,8 @@ struct ModelAndData {
 
 template <typename Base>
 void ModelAndData<Base>::Sample() {
-  unsigned MAXK = 4;
-  unsigned MAXL = 4;
+  unsigned MAXK = kMAX_SRC_PHRASE;
+  unsigned MAXL = kMAX_TRG_PHRASE;
   TRule x;
   x.lhs_ = -TD::Convert("X");
   for (int samples = 0; samples < 1000; ++samples) {
@@ -228,6 +180,8 @@ void ModelAndData<Base>::Sample() {
       boost::multi_array<prob_t, 2> a(boost::extents[sentf.size() + 1][sente.size() + 1]);
       boost::multi_array<prob_t, 4> trans(boost::extents[sentf.size() + 1][sente.size() + 1][MAXK][MAXL]);
       a[0][0] = prob_t::One();
+      const prob_t q_stop = model.StopProbability();
+      const prob_t q_cont = model.ContinueProbability();
       for (int i = 0; i < sentf.size(); ++i) {
         for (int j = 0; j < sente.size(); ++j) {
           const prob_t src_a = a[i][j];
@@ -239,7 +193,9 @@ void ModelAndData<Base>::Sample() {
             for (int l = 1; l <= MAXL; ++l) {
               if (j + l > sente.size()) break;
               x.e_.push_back(sente[j + l - 1]);
-              trans[i][j][k - 1][l - 1].logeq(rules.logprob(x, log(p0(x))));
+              const bool stop_now = ((j + l) == sente.size()) && ((i + k) == sentf.size());
+              const prob_t& cp = stop_now ? q_stop : q_cont;
+              trans[i][j][k - 1][l - 1] = model.RuleProbability(x) * cp;
               a[i + k][j + l] += src_a * trans[i][j][k - 1][l - 1];
             }
           }
@@ -319,7 +275,7 @@ int main(int argc, char** argv) {
 
   vector<vector<int> > corpuse, corpusf;
   set<int> vocabe, vocabf;
-  ReadParallelCorpus(conf["input"].as<string>(), &corpusf, &corpuse, &vocabf, &vocabe);
+  corpus::ReadParallelCorpus(conf["input"].as<string>(), &corpusf, &corpuse, &vocabf, &vocabe);
   cerr << "f-Corpus size: " << corpusf.size() << " sentences\n";
   cerr << "f-Vocabulary size: " << vocabf.size() << " types\n";
   cerr << "f-Corpus size: " << corpuse.size() << " sentences\n";
@@ -328,8 +284,9 @@ int main(int argc, char** argv) {
   Model1 m1(conf["model1"].as<string>());
 
   PhraseJointBase lp0(m1, conf["model1_interpolation_weight"].as<double>(), vocabe.size(), vocabf.size());
+  MonotonicParallelSegementationModel m(lp0);
 
-  ModelAndData<PhraseJointBase> posterior(lp0, corpuse, corpusf, vocabe, vocabf);
+  ModelAndData<PhraseJointBase> posterior(m, lp0, corpuse, corpusf, vocabe, vocabf);
   posterior.Sample();
 
   return 0;
diff --git a/gi/pf/monotonic_pseg.h b/gi/pf/monotonic_pseg.h
new file mode 100644
index 00000000..7e6af3fc
--- /dev/null
+++ b/gi/pf/monotonic_pseg.h
@@ -0,0 +1,88 @@
+#ifndef _MONOTONIC_PSEG_H_
+#define _MONOTONIC_PSEG_H_
+
+#include <vector>
+
+#include "prob.h"
+#include "ccrp_nt.h"
+#include "trule.h"
+#include "base_measures.h"
+
+struct MonotonicParallelSegementationModel {
+  explicit MonotonicParallelSegementationModel(PhraseJointBase& rcp0) :
+    rp0(rcp0), base(prob_t::One()), rules(1,1), stop(1.0) {}
+
+  void DecrementRule(const TRule& rule) {
+    if (rules.decrement(rule))
+      base /= rp0(rule);
+  }
+
+  void IncrementRule(const TRule& rule) {
+    if (rules.increment(rule))
+      base *= rp0(rule);
+  }
+
+  void IncrementRulesAndStops(const std::vector<TRulePtr>& rules) {
+    for (int i = 0; i < rules.size(); ++i)
+      IncrementRule(*rules[i]);
+    if (rules.size()) IncrementContinue(rules.size() - 1);
+    IncrementStop();
+  }
+
+  void DecrementRulesAndStops(const std::vector<TRulePtr>& rules) {
+    for (int i = 0; i < rules.size(); ++i)
+      DecrementRule(*rules[i]);
+    if (rules.size()) {
+      DecrementContinue(rules.size() - 1);
+      DecrementStop();
+    }
+  }
+
+  prob_t RuleProbability(const TRule& rule) const {
+    prob_t p; p.logeq(rules.logprob(rule, log(rp0(rule))));
+    return p;
+  }
+
+  prob_t Likelihood() const {
+    prob_t p = base;
+    prob_t q; q.logeq(rules.log_crp_prob());
+    p *= q;
+    q.logeq(stop.log_crp_prob());
+    p *= q;
+    return p;
+  }
+
+  void IncrementStop() {
+    stop.increment(true);
+  }
+
+  void IncrementContinue(int n = 1) {
+    for (int i = 0; i < n; ++i)
+      stop.increment(false);
+  }
+
+  void DecrementStop() {
+    stop.decrement(true);
+  }
+
+  void DecrementContinue(int n = 1) {
+    for (int i = 0; i < n; ++i)
+      stop.decrement(false);
+  }
+
+  prob_t StopProbability() const {
+    return prob_t(stop.prob(true, 0.5));
+  }
+
+  prob_t ContinueProbability() const {
+    return prob_t(stop.prob(false, 0.5));
+  }
+
+  const PhraseJointBase& rp0;
+  prob_t base;
+  CCRP_NoTable<TRule> rules;
+  CCRP_NoTable<bool> stop;
+};
+
+#endif
+
diff --git a/gi/pf/pfnaive.cc b/gi/pf/pfnaive.cc
index c30e7c4f..33dc08c3 100644
--- a/gi/pf/pfnaive.cc
+++ b/gi/pf/pfnaive.cc
@@ -7,6 +7,7 @@
 #include <boost/program_options/variables_map.hpp>
 
 #include "base_measures.h"
+#include "monotonic_pseg.h"
 #include "reachability.h"
 #include "viterbi.h"
 #include "hg.h"
@@ -17,6 +18,7 @@
 #include "sampler.h"
 #include "ccrp_nt.h"
 #include "ccrp_onetable.h"
+#include "corpus.h"
 
 using namespace std;
 using namespace tr1;
@@ -58,101 +60,6 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
   }
 }
 
-void ReadParallelCorpus(const string& filename,
-                vector<vector<WordID> >* f,
-                vector<vector<WordID> >* e,
-                set<WordID>* vocab_f,
-                set<WordID>* vocab_e) {
-  f->clear();
-  e->clear();
-  vocab_f->clear();
-  vocab_e->clear();
-  istream* in;
-  if (filename == "-")
-    in = &cin;
-  else
-    in = new ifstream(filename.c_str());
-  assert(*in);
-  string line;
-  const WordID kDIV = TD::Convert("|||");
-  vector<WordID> tmp;
-  while(*in) {
-    getline(*in, line);
-    if (line.empty() && !*in) break;
-    e->push_back(vector<int>());
-    f->push_back(vector<int>());
-    vector<int>& le = e->back();
-    vector<int>& lf = f->back();
-    tmp.clear();
-    TD::ConvertSentence(line, &tmp);
-    bool isf = true;
-    for (unsigned i = 0; i < tmp.size(); ++i) {
-      const int cur = tmp[i];
-      if (isf) {
-        if (kDIV == cur) { isf = false; } else {
-          lf.push_back(cur);
-          vocab_f->insert(cur);
-        }
-      } else {
-        assert(cur != kDIV);
-        le.push_back(cur);
-        vocab_e->insert(cur);
-      }
-    }
-    assert(isf == false);
-  }
-  if (in != &cin) delete in;
-}
-
-struct MyJointModel {
-  MyJointModel(PhraseJointBase& rcp0) :
-    rp0(rcp0), base(prob_t::One()), rules(1,1) {}
-
-  void DecrementRule(const TRule& rule) {
-    if (rules.decrement(rule))
-      base /= rp0(rule);
-  }
-
-  void IncrementRule(const TRule& rule) {
-    if (rules.increment(rule))
-      base *= rp0(rule);
-  }
-
-  void IncrementRules(const vector<TRulePtr>& rules) {
-    for (int i = 0; i < rules.size(); ++i)
-      IncrementRule(*rules[i]);
-  }
-
-  void DecrementRules(const vector<TRulePtr>& rules) {
-    for (int i = 0; i < rules.size(); ++i)
-      DecrementRule(*rules[i]);
-  }
-
-  prob_t RuleProbability(const TRule& rule) const {
-    prob_t p; p.logeq(rules.logprob(rule, log(rp0(rule))));
-    return p;
-  }
-
-  prob_t Likelihood() const {
-    prob_t p = base;
-    prob_t q; q.logeq(rules.log_crp_prob());
-    p *= q;
-    for (unsigned l = 1; l < src_jumps.size(); ++l) {
-      if (src_jumps[l].num_customers() > 0) {
-        prob_t q;
-        q.logeq(src_jumps[l].log_crp_prob());
-        p *= q;
-      }
-    }
-    return p;
-  }
-
-  const PhraseJointBase& rp0;
-  prob_t base;
-  CCRP_NoTable<TRule> rules;
-  vector<CCRP_NoTable<int> > src_jumps;
-};
-
 struct BackwardEstimateSym {
   BackwardEstimateSym(const Model1& m1,
                       const Model1& invm1, const vector<WordID>& src, const vector<WordID>& trg) :
@@ -264,7 +171,7 @@ int main(int argc, char** argv) {
   vector<vector<WordID> > corpuse, corpusf;
   set<WordID> vocabe, vocabf;
   cerr << "Reading corpus...\n";
-  ReadParallelCorpus(conf["input"].as<string>(), &corpusf, &corpuse, &vocabf, &vocabe);
+  corpus::ReadParallelCorpus(conf["input"].as<string>(), &corpusf, &corpuse, &vocabf, &vocabe);
   cerr << "F-corpus size: " << corpusf.size() << " sentences\t (" << vocabf.size() << " word types)\n";
   cerr << "E-corpus size: " << corpuse.size() << " sentences\t (" << vocabe.size() << " word types)\n";
   assert(corpusf.size() == corpuse.size());
@@ -273,13 +180,8 @@ int main(int argc, char** argv) {
   Model1 m1(conf["model1"].as<string>());
   Model1 invm1(conf["inverse_model1"].as<string>());
 
-#if 0
-  PhraseConditionalBase lp0(m1, conf["model1_interpolation_weight"].as<double>(), vocabe.size());
-  MyConditionalModel m(lp0);
-#else
   PhraseJointBase lp0(m1, conf["model1_interpolation_weight"].as<double>(), vocabe.size(), vocabf.size());
-  MyJointModel m(lp0);
-#endif
+  MonotonicParallelSegementationModel m(lp0);
 
   cerr << "Initializing reachability limits...\n";
   vector<Particle> ps(corpusf.size());
@@ -296,7 +198,10 @@ int main(int argc, char** argv) {
     for (int ci = 0; ci < corpusf.size(); ++ci) {
       vector<int>& src = corpusf[ci];
       vector<int>& trg = corpuse[ci];
-      m.DecrementRules(ps[ci].rules);
+      m.DecrementRulesAndStops(ps[ci].rules);
+      const prob_t q_stop = m.StopProbability();
+      const prob_t q_cont = m.ContinueProbability();
+      cerr << "P(stop)=" << q_stop << "\tP(continue)=" <<q_cont << endl;
 
       BackwardEstimateSym be(m1, invm1, src, trg);
       const Reachability& r = reaches[ci];
@@ -336,7 +241,8 @@ int main(int argc, char** argv) {
                   x.f_.push_back(src[i + j]);
                 np.src_cov += x.f_.size();
                 np.trg_cov += x.e_.size();
-                prob_t rp = m.RuleProbability(x);
+                const bool stop_now = (np.src_cov == src_len && np.trg_cov == trg_len);
+                prob_t rp = m.RuleProbability(x) * (stop_now ? q_stop : q_cont);
                 np.gamma_last = rp;
                 const prob_t u = pow(np.gamma_last * pow(be(np.src_cov, np.trg_cov), 1.2), 0.1);
                 //cerr << "**rule=" << x << endl;
@@ -363,7 +269,7 @@ int main(int argc, char** argv) {
         pfss.add(lps[i].weight);
       const int sampled = rng.SelectSample(pfss);
       ps[ci] = lps[sampled];
-      m.IncrementRules(lps[sampled].rules);
+      m.IncrementRulesAndStops(lps[sampled].rules);
      for (int i = 0; i < lps[sampled].rules.size(); ++i) { cerr << "S:\t" << lps[sampled].rules[i]->AsString() << "\n"; }
       cerr << "tmp-LLH: " << log(m.Likelihood()) << endl;
     }
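The two pieces factored out above are meant to be shared by the gi/pf samplers: corpus::ReadParallelCorpus() loads a "src ||| trg" file into parallel sentence vectors plus vocabularies, and MonotonicParallelSegementationModel bundles the rule CRP with a stop/continue CRP, so a derivation with n phrase pairs carries an extra factor of ContinueProbability()^(n-1) * StopProbability() on top of its rule probabilities. The sketch below is not part of the commit; it is a minimal, hypothetical caller showing how the two APIs fit together, mirroring the updated main() in dpnaive.cc. It assumes the gi/pf headers are on the include path; the corpus file name "corpus.f-e", the Model 1 table name "m1.ttable", and the interpolation weight 0.95 are made up for illustration.

// Hypothetical caller (not in the commit): wire the new corpus loader to the
// new segmentation model, following the calls made in dpnaive.cc/pfnaive.cc.
#include <iostream>
#include <set>
#include <vector>

#include "corpus.h"           // corpus::ReadParallelCorpus
#include "monotonic_pseg.h"   // MonotonicParallelSegementationModel
#include "base_measures.h"    // Model1, PhraseJointBase
#include "tdict.h"            // TD::Convert
#include "trule.h"            // TRule

int main() {
  std::vector<std::vector<WordID> > corpusf, corpuse;  // source / target sentences
  std::set<WordID> vocabf, vocabe;                     // source / target vocabularies
  corpus::ReadParallelCorpus("corpus.f-e", &corpusf, &corpuse, &vocabf, &vocabe);

  Model1 m1("m1.ttable");  // hypothetical Model 1 parameter file
  PhraseJointBase lp0(m1, 0.95, vocabe.size(), vocabf.size());
  MonotonicParallelSegementationModel model(lp0);

  // Register a one-rule derivation, built the same way dpnaive.cc builds rules:
  // one rule, zero continue decisions, one stop decision.
  TRule x;
  x.lhs_ = -TD::Convert("X");
  x.f_.push_back(TD::Convert("maison"));
  x.e_.push_back(TD::Convert("house"));
  model.IncrementRule(x);
  model.IncrementStop();

  std::cerr << "P(rule)     = " << model.RuleProbability(x) << std::endl;
  std::cerr << "P(stop)     = " << model.StopProbability() << std::endl;
  std::cerr << "P(continue) = " << model.ContinueProbability() << std::endl;
  std::cerr << "LLH         = " << log(model.Likelihood()) << std::endl;
  return 0;
}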
