| field | value | date |
|---|---|---|
| author | Patrick Simianer <p@simianer.de> | 2011-10-20 02:31:25 +0200 |
| committer | Patrick Simianer <p@simianer.de> | 2011-10-20 02:31:25 +0200 |
| commit | a5a92ebe23c5819ed104313426012011e32539da | |
| tree | 3416818c758d5ece4e71fe522c571e75ea04f100 /gi | |
| parent | b88332caac2cbe737c99b8098813f868ca876d8b | |
| parent | 78baccbb4231bb84a456702d4f574f8e601a8182 | |
finalized merge
Diffstat (limited to 'gi')
| mode | file | lines added |
|---|---|---|
| -rw-r--r-- | gi/markov_al/Makefile.am | 6 |
| -rw-r--r-- | gi/markov_al/README | 2 |
| -rw-r--r-- | gi/markov_al/ml.cc | 470 |
| -rw-r--r-- | gi/pf/Makefile.am | 21 |
| -rw-r--r-- | gi/pf/README | 2 |
| -rw-r--r-- | gi/pf/base_measures.cc | 112 |
| -rw-r--r-- | gi/pf/base_measures.h | 116 |
| -rw-r--r-- | gi/pf/brat.cc | 543 |
| -rw-r--r-- | gi/pf/cbgi.cc | 330 |
| -rw-r--r-- | gi/pf/cfg_wfst_composer.cc | 730 |
| -rw-r--r-- | gi/pf/cfg_wfst_composer.h | 46 |
| -rw-r--r-- | gi/pf/corpus.cc | 57 |
| -rw-r--r-- | gi/pf/corpus.h | 19 |
| -rw-r--r-- | gi/pf/dpnaive.cc | 294 |
| -rw-r--r-- | gi/pf/itg.cc | 213 |
| -rw-r--r-- | gi/pf/monotonic_pseg.h | 88 |
| -rw-r--r-- | gi/pf/pfbrat.cc | 543 |
| -rw-r--r-- | gi/pf/pfdist.cc | 610 |
| -rw-r--r-- | gi/pf/pfdist.new.cc | 620 |
| -rw-r--r-- | gi/pf/pfnaive.cc | 280 |
| -rw-r--r-- | gi/pf/reachability.cc | 64 |
| -rw-r--r-- | gi/pf/reachability.h | 28 |
| -rw-r--r-- | gi/pf/tpf.cc | 99 |
23 files changed, 5293 insertions(+), 0 deletions(-)
| diff --git a/gi/markov_al/Makefile.am b/gi/markov_al/Makefile.am new file mode 100644 index 00000000..fe3e3349 --- /dev/null +++ b/gi/markov_al/Makefile.am @@ -0,0 +1,6 @@ +bin_PROGRAMS = ml + +ml_SOURCES = ml.cc + +AM_CPPFLAGS = -W -Wall -Wno-sign-compare -funroll-loops -I$(top_srcdir)/utils $(GTEST_CPPFLAGS) -I$(top_srcdir)/decoder +AM_LDFLAGS = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/utils/libutils.a -lz diff --git a/gi/markov_al/README b/gi/markov_al/README new file mode 100644 index 00000000..9c10f7cd --- /dev/null +++ b/gi/markov_al/README @@ -0,0 +1,2 @@ +Experimental translation models with Markovian dependencies. + diff --git a/gi/markov_al/ml.cc b/gi/markov_al/ml.cc new file mode 100644 index 00000000..1e71edd6 --- /dev/null +++ b/gi/markov_al/ml.cc @@ -0,0 +1,470 @@ +#include <iostream> +#include <tr1/unordered_map> + +#include <boost/shared_ptr.hpp> +#include <boost/functional.hpp> +#include <boost/program_options.hpp> +#include <boost/program_options/variables_map.hpp> + +#include "tdict.h" +#include "filelib.h" +#include "sampler.h" +#include "ccrp_onetable.h" +#include "array2d.h" + +using namespace std; +using namespace std::tr1; +namespace po = boost::program_options; + +void PrintTopCustomers(const CCRP_OneTable<WordID>& crp) { +  for (CCRP_OneTable<WordID>::const_iterator it = crp.begin(); it != crp.end(); ++it) { +    cerr << "  " << TD::Convert(it->first) << " = " << it->second << endl; +  } +} + +void PrintAlignment(const vector<WordID>& src, const vector<WordID>& trg, const vector<unsigned char>& a) { +  cerr << TD::GetString(src) << endl << TD::GetString(trg) << endl; +  Array2D<bool> al(src.size(), trg.size()); +  for (int i = 0; i < a.size(); ++i) +    if (a[i] != 255) al(a[i], i) = true; +  cerr << al << endl; +} + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { +  po::options_description opts("Configuration options"); +  opts.add_options() +        ("samples,s",po::value<unsigned>()->default_value(1000),"Number of samples") +        ("input,i",po::value<string>(),"Read parallel data from") +        ("random_seed,S",po::value<uint32_t>(), "Random seed"); +  po::options_description clo("Command line options"); +  clo.add_options() +        ("config", po::value<string>(), "Configuration file") +        ("help,h", "Print this help message and exit"); +  po::options_description dconfig_options, dcmdline_options; +  dconfig_options.add(opts); +  dcmdline_options.add(opts).add(clo); +   +  po::store(parse_command_line(argc, argv, dcmdline_options), *conf); +  if (conf->count("config")) { +    ifstream config((*conf)["config"].as<string>().c_str()); +    po::store(po::parse_config_file(config, dconfig_options), *conf); +  } +  po::notify(*conf); + +  if (conf->count("help") || (conf->count("input") == 0)) { +    cerr << dcmdline_options << endl; +    exit(1); +  } +} + +struct Unigram; +struct Bigram { +  Bigram() : trg(), cond() {} +  Bigram(WordID prev, WordID cur, WordID t) : trg(t) { cond.first = prev; cond.second = cur; } +  const pair<WordID,WordID>& ConditioningPair() const { +    return cond; +  } +  WordID& prev_src() { return cond.first; } +  WordID& cur_src() { return cond.second; } +  const WordID& prev_src() const { return cond.first; } +  const WordID& cur_src() const { return cond.second; } +  WordID trg; + private: +  pair<WordID, WordID> cond; +}; + +struct Unigram { +  Unigram() : cur_src(), trg() {} +  Unigram(WordID s, WordID t) : cur_src(s), trg(t) {} +  WordID cur_src; +  WordID trg; +}; + +ostream& 
operator<<(ostream& os, const Bigram& b) { +  os << "( " << TD::Convert(b.trg) << " | " << TD::Convert(b.prev_src()) << " , " << TD::Convert(b.cur_src()) << " )"; +  return os; +} + +ostream& operator<<(ostream& os, const Unigram& u) { +  os << "( " << TD::Convert(u.trg) << " | " << TD::Convert(u.cur_src) << " )"; +  return os; +} + +bool operator==(const Bigram& a, const Bigram& b) { +  return a.trg == b.trg && a.cur_src() == b.cur_src() && a.prev_src() == b.prev_src(); +} + +bool operator==(const Unigram& a, const Unigram& b) { +  return a.trg == b.trg && a.cur_src == b.cur_src; +} + +size_t hash_value(const Bigram& b) { +  size_t h = boost::hash_value(b.prev_src()); +  boost::hash_combine(h, boost::hash_value(b.cur_src())); +  boost::hash_combine(h, boost::hash_value(b.trg)); +  return h; +} + +size_t hash_value(const Unigram& u) { +  size_t h = boost::hash_value(u.cur_src); +  boost::hash_combine(h, boost::hash_value(u.trg)); +  return h; +} + +void ReadParallelCorpus(const string& filename, +                vector<vector<WordID> >* f, +                vector<vector<WordID> >* e, +                set<WordID>* vocab_f, +                set<WordID>* vocab_e) { +  f->clear(); +  e->clear(); +  vocab_f->clear(); +  vocab_e->clear(); +  istream* in; +  if (filename == "-") +    in = &cin; +  else +    in = new ifstream(filename.c_str()); +  assert(*in); +  string line; +  const WordID kDIV = TD::Convert("|||"); +  vector<WordID> tmp; +  while(*in) { +    getline(*in, line); +    if (line.empty() && !*in) break; +    e->push_back(vector<int>()); +    f->push_back(vector<int>()); +    vector<int>& le = e->back(); +    vector<int>& lf = f->back(); +    tmp.clear(); +    TD::ConvertSentence(line, &tmp); +    bool isf = true; +    for (unsigned i = 0; i < tmp.size(); ++i) { +      const int cur = tmp[i]; +      if (isf) { +        if (kDIV == cur) { isf = false; } else { +          lf.push_back(cur); +          vocab_f->insert(cur); +        } +      } else { +        assert(cur != kDIV); +        le.push_back(cur); +        vocab_e->insert(cur); +      } +    } +    assert(isf == false); +  } +  if (in != &cin) delete in; +} + +struct UnigramModel { +  UnigramModel(size_t src_voc_size, size_t trg_voc_size) : +    unigrams(TD::NumWords() + 1, CCRP_OneTable<WordID>(1,1,1,1)), +    p0(1.0 / trg_voc_size) {} + +  void increment(const Bigram& b) { +    unigrams[b.cur_src()].increment(b.trg); +  } + +  void decrement(const Bigram& b) { +    unigrams[b.cur_src()].decrement(b.trg); +  } + +  double prob(const Bigram& b) const { +    const double q0 = unigrams[b.cur_src()].prob(b.trg, p0); +    return q0; +  } + +  double LogLikelihood() const { +    double llh = 0; +    for (unsigned i = 0; i < unigrams.size(); ++i) { +      const CCRP_OneTable<WordID>& crp = unigrams[i]; +      if (crp.num_customers() > 0) { +        llh += crp.log_crp_prob(); +        llh += crp.num_tables() * log(p0); +      } +    } +    return llh; +  } + +  void ResampleHyperparameters(MT19937* rng) { +    for (unsigned i = 0; i < unigrams.size(); ++i) +      unigrams[i].resample_hyperparameters(rng); +  } + +  vector<CCRP_OneTable<WordID> > unigrams;  // unigrams[src].prob(trg, p0) = p(trg|src) + +  const double p0; +}; + +struct BigramModel { +  BigramModel(size_t src_voc_size, size_t trg_voc_size) : +    unigrams(TD::NumWords() + 1, CCRP_OneTable<WordID>(1,1,1,1)), +    p0(1.0 / trg_voc_size) {} + +  void increment(const Bigram& b) { +    BigramMap::iterator it = bigrams.find(b.ConditioningPair()); +    if (it == 
bigrams.end()) { +      it = bigrams.insert(make_pair(b.ConditioningPair(), CCRP_OneTable<WordID>(1,1,1,1))).first; +    } +    if (it->second.increment(b.trg)) +      unigrams[b.cur_src()].increment(b.trg); +  } + +  void decrement(const Bigram& b) { +    BigramMap::iterator it = bigrams.find(b.ConditioningPair()); +    assert(it != bigrams.end()); +    if (it->second.decrement(b.trg)) { +      unigrams[b.cur_src()].decrement(b.trg); +      if (it->second.num_customers() == 0) +        bigrams.erase(it); +    } +  } + +  double prob(const Bigram& b) const { +    const double q0 = unigrams[b.cur_src()].prob(b.trg, p0); +    const BigramMap::const_iterator it = bigrams.find(b.ConditioningPair()); +    if (it == bigrams.end()) return q0; +    return it->second.prob(b.trg, q0); +  } + +  double LogLikelihood() const { +    double llh = 0; +    for (unsigned i = 0; i < unigrams.size(); ++i) { +      const CCRP_OneTable<WordID>& crp = unigrams[i]; +      if (crp.num_customers() > 0) { +        llh += crp.log_crp_prob(); +        llh += crp.num_tables() * log(p0); +      } +    } +    for (BigramMap::const_iterator it = bigrams.begin(); it != bigrams.end(); ++it) { +      const CCRP_OneTable<WordID>& crp = it->second; +      const WordID cur_src = it->first.second; +      llh += crp.log_crp_prob(); +      for (CCRP_OneTable<WordID>::const_iterator bit = crp.begin(); bit != crp.end(); ++bit) { +        llh += log(unigrams[cur_src].prob(bit->second, p0)); +      } +    } +    return llh; +  } + +  void ResampleHyperparameters(MT19937* rng) { +    for (unsigned i = 0; i < unigrams.size(); ++i) +      unigrams[i].resample_hyperparameters(rng); +    for (BigramMap::iterator it = bigrams.begin(); it != bigrams.end(); ++it) +      it->second.resample_hyperparameters(rng); +  } + +  typedef unordered_map<pair<WordID,WordID>, CCRP_OneTable<WordID>, boost::hash<pair<WordID,WordID> > > BigramMap; +  BigramMap bigrams;  // bigrams[(src-1,src)].prob(trg, q0) = p(trg|src,src-1) +  vector<CCRP_OneTable<WordID> > unigrams;  // unigrams[src].prob(trg, p0) = p(trg|src) + +  const double p0; +}; + +struct BigramAlignmentModel { +  BigramAlignmentModel(size_t src_voc_size, size_t trg_voc_size) : bigrams(TD::NumWords() + 1, CCRP_OneTable<WordID>(1,1,1,1)), p0(1.0 / src_voc_size) {} +  void increment(WordID prev, WordID next) { +    bigrams[prev].increment(next);  // hierarchy? +  } +  void decrement(WordID prev, WordID next) { +    bigrams[prev].decrement(next);  // hierarchy? 
+  } +  double prob(WordID prev, WordID next) { +    return bigrams[prev].prob(next, p0); +  } +  double LogLikelihood() const { +    double llh = 0; +    for (unsigned i = 0; i < bigrams.size(); ++i) { +      const CCRP_OneTable<WordID>& crp = bigrams[i]; +      if (crp.num_customers() > 0) { +        llh += crp.log_crp_prob(); +        llh += crp.num_tables() * log(p0); +      } +    } +    return llh; +  } + +  vector<CCRP_OneTable<WordID> > bigrams;  // bigrams[prev].prob(next, p0) = p(next|prev) +  const double p0; +}; + +struct Alignment { +  vector<unsigned char> a; +}; + +int main(int argc, char** argv) { +  po::variables_map conf; +  InitCommandLine(argc, argv, &conf); +  const unsigned samples = conf["samples"].as<unsigned>(); + +  boost::shared_ptr<MT19937> prng; +  if (conf.count("random_seed")) +    prng.reset(new MT19937(conf["random_seed"].as<uint32_t>())); +  else +    prng.reset(new MT19937); +  MT19937& rng = *prng; + +  vector<vector<WordID> > corpuse, corpusf; +  set<WordID> vocabe, vocabf; +  cerr << "Reading corpus...\n"; +  ReadParallelCorpus(conf["input"].as<string>(), &corpusf, &corpuse, &vocabf, &vocabe); +  cerr << "F-corpus size: " << corpusf.size() << " sentences\t (" << vocabf.size() << " word types)\n"; +  cerr << "E-corpus size: " << corpuse.size() << " sentences\t (" << vocabe.size() << " word types)\n"; +  assert(corpusf.size() == corpuse.size()); +  const size_t corpus_len = corpusf.size(); +  const WordID kNULL = TD::Convert("<eps>"); +  const WordID kBOS = TD::Convert("<s>"); +  const WordID kEOS = TD::Convert("</s>"); +  Bigram TT(kBOS, TD::Convert("我"), TD::Convert("i")); +  Bigram TT2(kBOS, TD::Convert("要"), TD::Convert("i")); + +  UnigramModel model(vocabf.size(), vocabe.size()); +  vector<Alignment> alignments(corpus_len); +  for (unsigned ci = 0; ci < corpus_len; ++ci) { +    const vector<WordID>& src = corpusf[ci]; +    const vector<WordID>& trg = corpuse[ci]; +    vector<unsigned char>& alg = alignments[ci].a; +    alg.resize(trg.size()); +    int lenp1 = src.size() + 1; +    WordID prev_src = kBOS; +    for (int j = 0; j < trg.size(); ++j) { +      int samp = lenp1 * rng.next(); +      --samp; +      if (samp < 0) samp = 255; +      alg[j] = samp; +      WordID cur_src = (samp == 255 ? kNULL : src[alg[j]]); +      Bigram b(prev_src, cur_src, trg[j]); +      model.increment(b); +      prev_src = cur_src; +    } +    Bigram b(prev_src, kEOS, kEOS); +    model.increment(b); +  } +  cerr << "Initial LLH: " << model.LogLikelihood() << endl; + +  SampleSet<double> ss; +  for (unsigned si = 0; si < 50; ++si) { +    for (unsigned ci = 0; ci < corpus_len; ++ci) { +      const vector<WordID>& src = corpusf[ci]; +      const vector<WordID>& trg = corpuse[ci]; +      vector<unsigned char>& alg = alignments[ci].a; +      WordID prev_src = kBOS; +      for (unsigned j = 0; j < trg.size(); ++j) { +        unsigned char& a_j = alg[j]; +        WordID cur_e_a_j = (a_j == 255 ? kNULL : src[a_j]); +        Bigram b(prev_src, cur_e_a_j, trg[j]); +        //cerr << "DEC: " << b << "\t" << nextb << endl; +        model.decrement(b); +        ss.clear(); +        for (unsigned i = 0; i <= src.size(); ++i) { +          const WordID cur_src = (i ? src[i-1] : kNULL); +          b.cur_src() = cur_src; +          ss.add(model.prob(b)); +        } +        int sampled_a_j = rng.SelectSample(ss); +        a_j = (sampled_a_j ? sampled_a_j - 1 : 255); +        cur_e_a_j = (a_j == 255 ? 
kNULL : src[a_j]); +        b.cur_src() = cur_e_a_j; +        //cerr << "INC: " << b << "\t" << nextb << endl; +        model.increment(b); +        prev_src = cur_e_a_j; +      } +    } +    cerr << '.' << flush; +    if (si % 10 == 9) { +      cerr << "[LLH prev=" << model.LogLikelihood(); +      //model.ResampleHyperparameters(&rng); +      cerr << " new=" << model.LogLikelihood() << "]\n"; +      //pair<WordID,WordID> xx = make_pair(kBOS, TD::Convert("我")); +      //PrintTopCustomers(model.bigrams.find(xx)->second); +      cerr << "p(" << TT << ") = " << model.prob(TT) << endl; +      cerr << "p(" << TT2 << ") = " << model.prob(TT2) << endl; +      PrintAlignment(corpusf[0], corpuse[0], alignments[0].a); +    } +  } +  { +  // MODEL 2 +  BigramModel model(vocabf.size(), vocabe.size()); +  BigramAlignmentModel amodel(vocabf.size(), vocabe.size()); +  for (unsigned ci = 0; ci < corpus_len; ++ci) { +    const vector<WordID>& src = corpusf[ci]; +    const vector<WordID>& trg = corpuse[ci]; +    vector<unsigned char>& alg = alignments[ci].a; +    WordID prev_src = kBOS; +    for (int j = 0; j < trg.size(); ++j) { +      WordID cur_src = (alg[j] == 255 ? kNULL : src[alg[j]]); +      Bigram b(prev_src, cur_src, trg[j]); +      model.increment(b); +      amodel.increment(prev_src, cur_src); +      prev_src = cur_src; +    } +    amodel.increment(prev_src, kEOS); +    Bigram b(prev_src, kEOS, kEOS); +    model.increment(b); +  } +  cerr << "Initial LLH: " << model.LogLikelihood() << " " << amodel.LogLikelihood() << endl; + +  SampleSet<double> ss; +  for (unsigned si = 0; si < samples; ++si) { +    for (unsigned ci = 0; ci < corpus_len; ++ci) { +      const vector<WordID>& src = corpusf[ci]; +      const vector<WordID>& trg = corpuse[ci]; +      vector<unsigned char>& alg = alignments[ci].a; +      WordID prev_src = kBOS; +      for (unsigned j = 0; j < trg.size(); ++j) { +        unsigned char& a_j = alg[j]; +        WordID cur_e_a_j = (a_j == 255 ? kNULL : src[a_j]); +        Bigram b(prev_src, cur_e_a_j, trg[j]); +        WordID next_src = kEOS; +        WordID next_trg = kEOS; +        if (j < (trg.size() - 1)) { +          next_src = (alg[j+1] == 255 ? kNULL : src[alg[j + 1]]); +          next_trg = trg[j + 1]; +        } +        Bigram nextb(cur_e_a_j, next_src, next_trg); +        //cerr << "DEC: " << b << "\t" << nextb << endl; +        model.decrement(b); +        model.decrement(nextb); +        amodel.decrement(prev_src, cur_e_a_j); +        amodel.decrement(cur_e_a_j, next_src); +        ss.clear(); +        for (unsigned i = 0; i <= src.size(); ++i) { +          const WordID cur_src = (i ? src[i-1] : kNULL); +          b.cur_src() = cur_src; +          ss.add(model.prob(b) * model.prob(nextb) * amodel.prob(prev_src, cur_src) * amodel.prob(cur_src, next_src)); +          //cerr << log(ss[ss.size() - 1]) << "\t" << b << endl; +        } +        int sampled_a_j = rng.SelectSample(ss); +        a_j = (sampled_a_j ? sampled_a_j - 1 : 255); +        cur_e_a_j = (a_j == 255 ? kNULL : src[a_j]); +        b.cur_src() = cur_e_a_j; +        nextb.prev_src() = cur_e_a_j; +        //cerr << "INC: " << b << "\t" << nextb << endl; +        //exit(1); +        model.increment(b); +        model.increment(nextb); +        amodel.increment(prev_src, cur_e_a_j); +        amodel.increment(cur_e_a_j, next_src); +        prev_src = cur_e_a_j; +      } +    } +    cerr << '.' 
<< flush; +    if (si % 10 == 9) { +      cerr << "[LLH prev=" << (model.LogLikelihood() + amodel.LogLikelihood()); +      //model.ResampleHyperparameters(&rng); +      cerr << " new=" << model.LogLikelihood() << "]\n"; +      pair<WordID,WordID> xx = make_pair(kBOS, TD::Convert("我")); +      cerr << "p(" << TT << ") = " << model.prob(TT) << endl; +      cerr << "p(" << TT2 << ") = " << model.prob(TT2) << endl; +      pair<WordID,WordID> xx2 = make_pair(kBOS, TD::Convert("要")); +      PrintTopCustomers(model.bigrams.find(xx)->second); +      //PrintTopCustomers(amodel.bigrams[TD::Convert("<s>")]); +      //PrintTopCustomers(model.unigrams[TD::Convert("<eps>")]); +      PrintAlignment(corpusf[0], corpuse[0], alignments[0].a); +    } +  } +  } +  return 0; +} + diff --git a/gi/pf/Makefile.am b/gi/pf/Makefile.am new file mode 100644 index 00000000..42758939 --- /dev/null +++ b/gi/pf/Makefile.am @@ -0,0 +1,21 @@ +bin_PROGRAMS = cbgi brat dpnaive pfbrat pfdist itg pfnaive + +noinst_LIBRARIES = libpf.a +libpf_a_SOURCES = base_measures.cc reachability.cc cfg_wfst_composer.cc corpus.cc + +itg_SOURCES = itg.cc + +dpnaive_SOURCES = dpnaive.cc + +pfdist_SOURCES = pfdist.cc + +pfnaive_SOURCES = pfnaive.cc + +cbgi_SOURCES = cbgi.cc + +brat_SOURCES = brat.cc + +pfbrat_SOURCES = pfbrat.cc + +AM_CPPFLAGS = -W -Wall -Wno-sign-compare -funroll-loops -I$(top_srcdir)/utils $(GTEST_CPPFLAGS) -I$(top_srcdir)/decoder +AM_LDFLAGS = libpf.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/utils/libutils.a -lz diff --git a/gi/pf/README b/gi/pf/README new file mode 100644 index 00000000..62e47541 --- /dev/null +++ b/gi/pf/README @@ -0,0 +1,2 @@ +Experimental Bayesian alignment tools. Nothing to see here. + diff --git a/gi/pf/base_measures.cc b/gi/pf/base_measures.cc new file mode 100644 index 00000000..f8ddfd32 --- /dev/null +++ b/gi/pf/base_measures.cc @@ -0,0 +1,112 @@ +#include "base_measures.h" + +#include <iostream> + +#include "filelib.h" + +using namespace std; + +void Model1::LoadModel1(const string& fname) { +  cerr << "Loading Model 1 parameters from " << fname << " ..." << endl; +  ReadFile rf(fname); +  istream& in = *rf.stream(); +  string line; +  unsigned lc = 0; +  while(getline(in, line)) { +    ++lc; +    int cur = 0; +    int start = 0; +    while(cur < line.size() && line[cur] != ' ') { ++cur; } +    assert(cur != line.size()); +    line[cur] = 0; +    const WordID src = TD::Convert(&line[0]); +    ++cur; +    start = cur; +    while(cur < line.size() && line[cur] != ' ') { ++cur; } +    assert(cur != line.size()); +    line[cur] = 0; +    WordID trg = TD::Convert(&line[start]); +    const double logprob = strtod(&line[cur + 1], NULL); +    if (src >= ttable.size()) ttable.resize(src + 1); +    ttable[src][trg].logeq(logprob); +  } +  cerr << "  read " << lc << " parameters.\n"; +} + +prob_t PhraseConditionalBase::p0(const vector<WordID>& vsrc, +                                 const vector<WordID>& vtrg, +                                 int start_src, int start_trg) const { +  const int flen = vsrc.size() - start_src; +  const int elen = vtrg.size() - start_trg; +  prob_t uniform_src_alignment; uniform_src_alignment.logeq(-log(flen + 1)); +  prob_t p; +  p.logeq(log_poisson(elen, flen + 0.01));       // elen | flen          ~Pois(flen + 0.01) +  for (int i = 0; i < elen; ++i) {               // for each position i in e-RHS +    const WordID trg = vtrg[i + start_trg]; +    prob_t tp = prob_t::Zero(); +    for (int j = -1; j < flen; ++j) { +      const WordID src = j < 0 ? 
0 : vsrc[j + start_src]; +      tp += kM1MIXTURE * model1(src, trg); +      tp += kUNIFORM_MIXTURE * kUNIFORM_TARGET; +    } +    tp *= uniform_src_alignment;                 //     draw a_i         ~uniform +    p *= tp;                                     //     draw e_i         ~Model1(f_a_i) / uniform +  } +  if (p.is_0()) { +    cerr << "Zero! " << vsrc << "\nTRG=" << vtrg << endl; +    abort(); +  } +  return p; +} + +prob_t PhraseJointBase::p0(const vector<WordID>& vsrc, +                           const vector<WordID>& vtrg, +                           int start_src, int start_trg) const { +  const int flen = vsrc.size() - start_src; +  const int elen = vtrg.size() - start_trg; +  prob_t uniform_src_alignment; uniform_src_alignment.logeq(-log(flen + 1)); +  prob_t p; +  p.logeq(log_poisson(flen, 1.0));               // flen                 ~Pois(1) +                                                 // elen | flen          ~Pois(flen + 0.01) +  prob_t ptrglen; ptrglen.logeq(log_poisson(elen, flen + 0.01)); +  p *= ptrglen; +  p *= kUNIFORM_SOURCE.pow(flen);                // each f in F ~Uniform +  for (int i = 0; i < elen; ++i) {               // for each position i in E +    const WordID trg = vtrg[i + start_trg]; +    prob_t tp = prob_t::Zero(); +    for (int j = -1; j < flen; ++j) { +      const WordID src = j < 0 ? 0 : vsrc[j + start_src]; +      tp += kM1MIXTURE * model1(src, trg); +      tp += kUNIFORM_MIXTURE * kUNIFORM_TARGET; +    } +    tp *= uniform_src_alignment;                 //     draw a_i         ~uniform +    p *= tp;                                     //     draw e_i         ~Model1(f_a_i) / uniform +  } +  if (p.is_0()) { +    cerr << "Zero! " << vsrc << "\nTRG=" << vtrg << endl; +    abort(); +  } +  return p; +} + +JumpBase::JumpBase() : p(200) { +  for (unsigned src_len = 1; src_len < 200; ++src_len) { +    map<int, prob_t>& cpd = p[src_len]; +    int min_jump = 1 - src_len; +    int max_jump = src_len; +    prob_t z; +    for (int j = min_jump; j <= max_jump; ++j) { +      prob_t& cp = cpd[j]; +      if (j < 0) +        cp.logeq(log_poisson(1.5-j, 1)); +      else if (j > 0) +        cp.logeq(log_poisson(j, 1)); +      cp.poweq(0.2); +      z += cp; +    } +    for (int j = min_jump; j <= max_jump; ++j) { +      cpd[j] /= z; +    } +  } +} + diff --git a/gi/pf/base_measures.h b/gi/pf/base_measures.h new file mode 100644 index 00000000..df17aa62 --- /dev/null +++ b/gi/pf/base_measures.h @@ -0,0 +1,116 @@ +#ifndef _BASE_MEASURES_H_ +#define _BASE_MEASURES_H_ + +#include <vector> +#include <map> +#include <string> +#include <cmath> +#include <iostream> + +#include "trule.h" +#include "prob.h" +#include "tdict.h" + +inline double log_poisson(unsigned x, const double& lambda) { +  assert(lambda > 0.0); +  return log(lambda) * x - lgamma(x + 1) - lambda; +} + +inline std::ostream& operator<<(std::ostream& os, const std::vector<WordID>& p) { +  os << '['; +  for (int i = 0; i < p.size(); ++i) +    os << (i==0 ? 
"" : " ") << TD::Convert(p[i]); +  return os << ']'; +} + +struct Model1 { +  explicit Model1(const std::string& fname) : +      kNULL(TD::Convert("<eps>")), +      kZERO() { +    LoadModel1(fname); +  } + +  void LoadModel1(const std::string& fname); + +  // returns prob 0 if src or trg is not found +  const prob_t& operator()(WordID src, WordID trg) const { +    if (src == 0) src = kNULL; +    if (src < ttable.size()) { +      const std::map<WordID, prob_t>& cpd = ttable[src]; +      const std::map<WordID, prob_t>::const_iterator it = cpd.find(trg); +      if (it != cpd.end()) +        return it->second; +    } +    return kZERO; +  } + +  const WordID kNULL; +  const prob_t kZERO; +  std::vector<std::map<WordID, prob_t> > ttable; +}; + +struct PhraseConditionalBase { +  explicit PhraseConditionalBase(const Model1& m1, const double m1mixture, const unsigned vocab_e_size) : +      model1(m1), +      kM1MIXTURE(m1mixture), +      kUNIFORM_MIXTURE(1.0 - m1mixture), +      kUNIFORM_TARGET(1.0 / vocab_e_size) { +    assert(m1mixture >= 0.0 && m1mixture <= 1.0); +    assert(vocab_e_size > 0); +  } + +  // return p0 of rule.e_ | rule.f_ +  prob_t operator()(const TRule& rule) const { +    return p0(rule.f_, rule.e_, 0, 0); +  } + +  prob_t p0(const std::vector<WordID>& vsrc, const std::vector<WordID>& vtrg, int start_src, int start_trg) const; + +  const Model1& model1; +  const prob_t kM1MIXTURE;  // Model 1 mixture component +  const prob_t kUNIFORM_MIXTURE; // uniform mixture component +  const prob_t kUNIFORM_TARGET; +}; + +struct PhraseJointBase { +  explicit PhraseJointBase(const Model1& m1, const double m1mixture, const unsigned vocab_e_size, const unsigned vocab_f_size) : +      model1(m1), +      kM1MIXTURE(m1mixture), +      kUNIFORM_MIXTURE(1.0 - m1mixture), +      kUNIFORM_SOURCE(1.0 / vocab_f_size), +      kUNIFORM_TARGET(1.0 / vocab_e_size) { +    assert(m1mixture >= 0.0 && m1mixture <= 1.0); +    assert(vocab_e_size > 0); +  } + +  // return p0 of rule.e_ | rule.f_ +  prob_t operator()(const TRule& rule) const { +    return p0(rule.f_, rule.e_, 0, 0); +  } + +  prob_t p0(const std::vector<WordID>& vsrc, const std::vector<WordID>& vtrg, int start_src, int start_trg) const; + +  const Model1& model1; +  const prob_t kM1MIXTURE;  // Model 1 mixture component +  const prob_t kUNIFORM_MIXTURE; // uniform mixture component +  const prob_t kUNIFORM_SOURCE; +  const prob_t kUNIFORM_TARGET; +}; + +// base distribution for jump size multinomials +// basically p(0) = 0 and then, p(1) is max, and then +// you drop as you move to the max jump distance +struct JumpBase { +  JumpBase(); + +  const prob_t& operator()(int jump, unsigned src_len) const { +    assert(jump != 0); +    const std::map<int, prob_t>::const_iterator it = p[src_len].find(jump); +    assert(it != p[src_len].end()); +    return it->second; +  } +  std::vector<std::map<int, prob_t> > p; +}; + + +#endif diff --git a/gi/pf/brat.cc b/gi/pf/brat.cc new file mode 100644 index 00000000..7b60ef23 --- /dev/null +++ b/gi/pf/brat.cc @@ -0,0 +1,543 @@ +#include <iostream> +#include <tr1/memory> +#include <queue> + +#include <boost/functional.hpp> +#include <boost/multi_array.hpp> +#include <boost/program_options.hpp> +#include <boost/program_options/variables_map.hpp> + +#include "viterbi.h" +#include "hg.h" +#include "trule.h" +#include "tdict.h" +#include "filelib.h" +#include "dict.h" +#include "sampler.h" +#include "ccrp_nt.h" +#include "cfg_wfst_composer.h" + +using namespace std; +using namespace tr1; +namespace po = 
boost::program_options; + +static unsigned kMAX_SRC_PHRASE; +static unsigned kMAX_TRG_PHRASE; +struct FSTState; + +double log_poisson(unsigned x, const double& lambda) { +  assert(lambda > 0.0); +  return log(lambda) * x - lgamma(x + 1) - lambda; +} + +struct ConditionalBase { +  explicit ConditionalBase(const double m1mixture, const unsigned vocab_e_size, const string& model1fname) : +      kM1MIXTURE(m1mixture), +      kUNIFORM_MIXTURE(1.0 - m1mixture), +      kUNIFORM_TARGET(1.0 / vocab_e_size), +      kNULL(TD::Convert("<eps>")) { +    assert(m1mixture >= 0.0 && m1mixture <= 1.0); +    assert(vocab_e_size > 0); +    LoadModel1(model1fname); +  } + +  void LoadModel1(const string& fname) { +    cerr << "Loading Model 1 parameters from " << fname << " ..." << endl; +    ReadFile rf(fname); +    istream& in = *rf.stream(); +    string line; +    unsigned lc = 0; +    while(getline(in, line)) { +      ++lc; +      int cur = 0; +      int start = 0; +      while(cur < line.size() && line[cur] != ' ') { ++cur; } +      assert(cur != line.size()); +      line[cur] = 0; +      const WordID src = TD::Convert(&line[0]); +      ++cur; +      start = cur; +      while(cur < line.size() && line[cur] != ' ') { ++cur; } +      assert(cur != line.size()); +      line[cur] = 0; +      WordID trg = TD::Convert(&line[start]); +      const double logprob = strtod(&line[cur + 1], NULL); +      if (src >= ttable.size()) ttable.resize(src + 1); +      ttable[src][trg].logeq(logprob); +    } +    cerr << "  read " << lc << " parameters.\n"; +  } + +  // return logp0 of rule.e_ | rule.f_ +  prob_t operator()(const TRule& rule) const { +    const int flen = rule.f_.size(); +    const int elen = rule.e_.size(); +    prob_t uniform_src_alignment; uniform_src_alignment.logeq(-log(flen + 1)); +    prob_t p; +    p.logeq(log_poisson(elen, flen + 0.01));       // elen | flen          ~Pois(flen + 0.01) +    for (int i = 0; i < elen; ++i) {               // for each position i in e-RHS +      const WordID trg = rule.e_[i]; +      prob_t tp = prob_t::Zero(); +      for (int j = -1; j < flen; ++j) { +        const WordID src = j < 0 ? 
kNULL : rule.f_[j]; +        const map<WordID, prob_t>::const_iterator it = ttable[src].find(trg); +        if (it != ttable[src].end()) { +          tp += kM1MIXTURE * it->second; +        } +        tp += kUNIFORM_MIXTURE * kUNIFORM_TARGET; +      } +      tp *= uniform_src_alignment;                 //     draw a_i         ~uniform +      p *= tp;                                     //     draw e_i         ~Model1(f_a_i) / uniform +    } +    return p; +  } + +  const prob_t kM1MIXTURE;  // Model 1 mixture component +  const prob_t kUNIFORM_MIXTURE; // uniform mixture component +  const prob_t kUNIFORM_TARGET; +  const WordID kNULL; +  vector<map<WordID, prob_t> > ttable; +}; + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { +  po::options_description opts("Configuration options"); +  opts.add_options() +        ("samples,s",po::value<unsigned>()->default_value(1000),"Number of samples") +        ("input,i",po::value<string>(),"Read parallel data from") +        ("max_src_phrase",po::value<unsigned>()->default_value(3),"Maximum length of source language phrases") +        ("max_trg_phrase",po::value<unsigned>()->default_value(3),"Maximum length of target language phrases") +        ("model1,m",po::value<string>(),"Model 1 parameters (used in base distribution)") +        ("model1_interpolation_weight",po::value<double>()->default_value(0.95),"Mixing proportion of model 1 with uniform target distribution") +        ("random_seed,S",po::value<uint32_t>(), "Random seed"); +  po::options_description clo("Command line options"); +  clo.add_options() +        ("config", po::value<string>(), "Configuration file") +        ("help,h", "Print this help message and exit"); +  po::options_description dconfig_options, dcmdline_options; +  dconfig_options.add(opts); +  dcmdline_options.add(opts).add(clo); +   +  po::store(parse_command_line(argc, argv, dcmdline_options), *conf); +  if (conf->count("config")) { +    ifstream config((*conf)["config"].as<string>().c_str()); +    po::store(po::parse_config_file(config, dconfig_options), *conf); +  } +  po::notify(*conf); + +  if (conf->count("help") || (conf->count("input") == 0)) { +    cerr << dcmdline_options << endl; +    exit(1); +  } +} + +void ReadParallelCorpus(const string& filename, +                vector<vector<WordID> >* f, +                vector<vector<int> >* e, +                set<int>* vocab_f, +                set<int>* vocab_e) { +  f->clear(); +  e->clear(); +  vocab_f->clear(); +  vocab_e->clear(); +  istream* in; +  if (filename == "-") +    in = &cin; +  else +    in = new ifstream(filename.c_str()); +  assert(*in); +  string line; +  const WordID kDIV = TD::Convert("|||"); +  vector<WordID> tmp; +  while(*in) { +    getline(*in, line); +    if (line.empty() && !*in) break; +    e->push_back(vector<int>()); +    f->push_back(vector<int>()); +    vector<int>& le = e->back(); +    vector<int>& lf = f->back(); +    tmp.clear(); +    TD::ConvertSentence(line, &tmp); +    bool isf = true; +    for (unsigned i = 0; i < tmp.size(); ++i) { +      const int cur = tmp[i]; +      if (isf) { +        if (kDIV == cur) { isf = false; } else { +          lf.push_back(cur); +          vocab_f->insert(cur); +        } +      } else { +        assert(cur != kDIV); +        le.push_back(cur); +        vocab_e->insert(cur); +      } +    } +    assert(isf == false); +  } +  if (in != &cin) delete in; +} + +struct UniphraseLM { +  UniphraseLM(const vector<vector<int> >& corpus, +              const set<int>& vocab, +           
   const po::variables_map& conf) : +    phrases_(1,1), +    gen_(1,1), +    corpus_(corpus), +    uniform_word_(1.0 / vocab.size()), +    gen_p0_(0.5), +    p_end_(0.5), +    use_poisson_(conf.count("poisson_length") > 0) {} + +  void ResampleHyperparameters(MT19937* rng) { +    phrases_.resample_hyperparameters(rng); +    gen_.resample_hyperparameters(rng); +    cerr << " " << phrases_.concentration(); +  } + +  CCRP_NoTable<vector<int> > phrases_; +  CCRP_NoTable<bool> gen_; +  vector<vector<bool> > z_;   // z_[i] is there a phrase boundary after the ith word +  const vector<vector<int> >& corpus_; +  const double uniform_word_; +  const double gen_p0_; +  const double p_end_; // in base length distribution, p of the end of a phrase +  const bool use_poisson_; +}; + +struct Reachability { +  boost::multi_array<bool, 4> edges;  // edges[src_covered][trg_covered][x][trg_delta] is this edge worth exploring? +  boost::multi_array<short, 2> max_src_delta; // msd[src_covered][trg_covered] -- the largest src delta that's valid + +  Reachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len) : +      edges(boost::extents[srclen][trglen][src_max_phrase_len+1][trg_max_phrase_len+1]), +      max_src_delta(boost::extents[srclen][trglen]) { +    ComputeReachability(srclen, trglen, src_max_phrase_len, trg_max_phrase_len); +  } + + private: +  struct SState { +    SState() : prev_src_covered(), prev_trg_covered() {} +    SState(int i, int j) : prev_src_covered(i), prev_trg_covered(j) {} +    int prev_src_covered; +    int prev_trg_covered; +  }; + +  struct NState { +    NState() : next_src_covered(), next_trg_covered() {} +    NState(int i, int j) : next_src_covered(i), next_trg_covered(j) {} +    int next_src_covered; +    int next_trg_covered; +  }; + +  void ComputeReachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len) { +    typedef boost::multi_array<vector<SState>, 2> array_type; +    array_type a(boost::extents[srclen + 1][trglen + 1]); +    a[0][0].push_back(SState()); +    for (int i = 0; i < srclen; ++i) { +      for (int j = 0; j < trglen; ++j) { +        if (a[i][j].size() == 0) continue; +        const SState prev(i,j); +        for (int k = 1; k <= src_max_phrase_len; ++k) { +          if ((i + k) > srclen) continue; +          for (int l = 1; l <= trg_max_phrase_len; ++l) { +            if ((j + l) > trglen) continue; +            a[i + k][j + l].push_back(prev); +          } +        } +      } +    } +    a[0][0].clear(); +    cerr << "Final cell contains " << a[srclen][trglen].size() << " back pointers\n"; +    assert(a[srclen][trglen].size() > 0); + +    typedef boost::multi_array<bool, 2> rarray_type; +    rarray_type r(boost::extents[srclen + 1][trglen + 1]); +//    typedef boost::multi_array<vector<NState>, 2> narray_type; +//    narray_type b(boost::extents[srclen + 1][trglen + 1]); +    r[srclen][trglen] = true; +    for (int i = srclen; i >= 0; --i) { +      for (int j = trglen; j >= 0; --j) { +        vector<SState>& prevs = a[i][j]; +        if (!r[i][j]) { prevs.clear(); } +//        const NState nstate(i,j); +        for (int k = 0; k < prevs.size(); ++k) { +          r[prevs[k].prev_src_covered][prevs[k].prev_trg_covered] = true; +          int src_delta = i - prevs[k].prev_src_covered; +          edges[prevs[k].prev_src_covered][prevs[k].prev_trg_covered][src_delta][j - prevs[k].prev_trg_covered] = true; +          short &msd = max_src_delta[prevs[k].prev_src_covered][prevs[k].prev_trg_covered]; +          if 
(src_delta > msd) msd = src_delta; +//          b[prevs[k].prev_src_covered][prevs[k].prev_trg_covered].push_back(nstate); +        } +      } +    } +    assert(!edges[0][0][1][0]); +    assert(!edges[0][0][0][1]); +    assert(!edges[0][0][0][0]); +    cerr << "  MAX SRC DELTA[0][0] = " << max_src_delta[0][0] << endl; +    assert(max_src_delta[0][0] > 0); +    //cerr << "First cell contains " << b[0][0].size() << " forward pointers\n"; +    //for (int i = 0; i < b[0][0].size(); ++i) { +    //  cerr << "  -> (" << b[0][0][i].next_src_covered << "," << b[0][0][i].next_trg_covered << ")\n"; +    //} +  } +}; + +ostream& operator<<(ostream& os, const FSTState& q); +struct FSTState { +  explicit FSTState(int src_size) : +      trg_covered_(), +      src_covered_(), +      src_coverage_(src_size) {} + +  FSTState(short trg_covered, short src_covered, const vector<bool>& src_coverage, const vector<short>& src_prefix) : +      trg_covered_(trg_covered), +      src_covered_(src_covered), +      src_coverage_(src_coverage), +      src_prefix_(src_prefix) { +    if (src_coverage_.size() == src_covered) { +      assert(src_prefix.size() == 0); +    } +  } + +  // if we extend by the word at src_position, what are +  // the next states that are reachable and lie on a valid +  // path to the final state? +  vector<FSTState> Extensions(int src_position, int src_len, int trg_len, const Reachability& r) const { +    assert(src_position < src_coverage_.size()); +    if (src_coverage_[src_position]) { +      cerr << "Trying to extend " << *this << " with position " << src_position << endl; +      abort(); +    } +    vector<bool> ncvg = src_coverage_; +    ncvg[src_position] = true; + +    vector<FSTState> res; +    const int trg_remaining = trg_len - trg_covered_; +    if (trg_remaining <= 0) { +      cerr << "Target appears to have been covered: " << *this << " (trg_len=" << trg_len << ",trg_covered=" << trg_covered_ << ")" << endl; +      abort(); +    } +    const int src_remaining = src_len - src_covered_; +    if (src_remaining <= 0) { +      cerr << "Source appears to have been covered: " << *this << endl; +      abort(); +    } + +    for (int tc = 1; tc <= kMAX_TRG_PHRASE; ++tc) { +      if (r.edges[src_covered_][trg_covered_][src_prefix_.size() + 1][tc]) { +        int nc = src_prefix_.size() + 1 + src_covered_; +        res.push_back(FSTState(trg_covered_ + tc, nc, ncvg, vector<short>())); +      } +    } + +    if ((src_prefix_.size() + 1) < r.max_src_delta[src_covered_][trg_covered_]) { +      vector<short> nsp = src_prefix_; +      nsp.push_back(src_position); +      res.push_back(FSTState(trg_covered_, src_covered_, ncvg, nsp)); +    } + +    if (res.size() == 0) { +      cerr << *this << " can't be extended!\n"; +      abort(); +    } +    return res; +  } + +  short trg_covered_, src_covered_; +  vector<bool> src_coverage_; +  vector<short> src_prefix_; +}; +bool operator<(const FSTState& q, const FSTState& r) { +  if (q.trg_covered_ != r.trg_covered_) return q.trg_covered_ < r.trg_covered_; +  if (q.src_covered_!= r.src_covered_) return q.src_covered_ < r.src_covered_; +  if (q.src_coverage_ != r.src_coverage_) return q.src_coverage_ < r.src_coverage_; +  return q.src_prefix_ < r.src_prefix_; +} + +ostream& operator<<(ostream& os, const FSTState& q) { +  os << "[" << q.trg_covered_ << " : "; +  for (int i = 0; i < q.src_coverage_.size(); ++i) +    os << q.src_coverage_[i]; +  os << " : <"; +  for (int i = 0; i < q.src_prefix_.size(); ++i) { +    if (i != 0) os << ' '; +    os << 
q.src_prefix_[i]; +  } +  return os << ">]"; +} + +struct MyModel { +  MyModel(ConditionalBase& rcp0) : rp0(rcp0) {} +  typedef unordered_map<vector<WordID>, CCRP_NoTable<TRule>, boost::hash<vector<WordID> > > SrcToRuleCRPMap; + +  void DecrementRule(const TRule& rule) { +    SrcToRuleCRPMap::iterator it = rules.find(rule.f_); +    assert(it != rules.end()); +    it->second.decrement(rule); +    if (it->second.num_customers() == 0) rules.erase(it); +  } + +  void IncrementRule(const TRule& rule) { +    SrcToRuleCRPMap::iterator it = rules.find(rule.f_); +    if (it == rules.end()) { +      CCRP_NoTable<TRule> crp(1,1); +      it = rules.insert(make_pair(rule.f_, crp)).first; +    } +    it->second.increment(rule); +  } + +  // conditioned on rule.f_ +  prob_t RuleConditionalProbability(const TRule& rule) const { +    const prob_t base = rp0(rule); +    SrcToRuleCRPMap::const_iterator it = rules.find(rule.f_); +    if (it == rules.end()) { +      return base; +    } else { +      const double lp = it->second.logprob(rule, log(base)); +      prob_t q; q.logeq(lp); +      return q; +    } +  } + +  const ConditionalBase& rp0; +  SrcToRuleCRPMap rules; +}; + +struct MyFST : public WFST { +  MyFST(const vector<WordID>& ssrc, const vector<WordID>& strg, MyModel* m) : +      src(ssrc), trg(strg), +      r(src.size(),trg.size(),kMAX_SRC_PHRASE, kMAX_TRG_PHRASE), +      model(m) { +    FSTState in(src.size()); +    cerr << " INIT: " << in << endl; +    init = GetNode(in); +    for (int i = 0; i < in.src_coverage_.size(); ++i) in.src_coverage_[i] = true; +    in.src_covered_ = src.size(); +    in.trg_covered_ = trg.size(); +    cerr << "FINAL: " << in << endl; +    final = GetNode(in); +  } +  virtual const WFSTNode* Final() const; +  virtual const WFSTNode* Initial() const; + +  const WFSTNode* GetNode(const FSTState& q); +  map<FSTState, boost::shared_ptr<WFSTNode> > m; +  const vector<WordID>& src; +  const vector<WordID>& trg; +  Reachability r; +  const WFSTNode* init; +  const WFSTNode* final; +  MyModel* model; +}; + +struct MyNode : public WFSTNode { +  MyNode(const FSTState& q, MyFST* fst) : state(q), container(fst) {} +  virtual vector<pair<const WFSTNode*, TRulePtr> > ExtendInput(unsigned srcindex) const; +  const FSTState state; +  mutable MyFST* container; +}; + +vector<pair<const WFSTNode*, TRulePtr> > MyNode::ExtendInput(unsigned srcindex) const { +  cerr << "EXTEND " << state << " with " << srcindex << endl; +  vector<FSTState> ext = state.Extensions(srcindex, container->src.size(), container->trg.size(), container->r); +  vector<pair<const WFSTNode*,TRulePtr> > res(ext.size()); +  for (unsigned i = 0; i < ext.size(); ++i) { +    res[i].first = container->GetNode(ext[i]); +    if (ext[i].src_prefix_.size() == 0) { +      const unsigned trg_from = state.trg_covered_; +      const unsigned trg_to = ext[i].trg_covered_; +      const unsigned prev_prfx_size = state.src_prefix_.size(); +      res[i].second.reset(new TRule); +      res[i].second->lhs_ = -TD::Convert("X"); +      vector<WordID>& src = res[i].second->f_; +      vector<WordID>& trg = res[i].second->e_; +      src.resize(prev_prfx_size + 1); +      for (unsigned j = 0; j < prev_prfx_size; ++j) +        src[j] = container->src[state.src_prefix_[j]]; +      src[prev_prfx_size] = container->src[srcindex]; +      for (unsigned j = trg_from; j < trg_to; ++j) +        trg.push_back(container->trg[j]); +      res[i].second->scores_.set_value(FD::Convert("Proposal"), log(container->model->RuleConditionalProbability(*res[i].second))); 
+    } +  } +  return res; +} + +const WFSTNode* MyFST::GetNode(const FSTState& q) { +  boost::shared_ptr<WFSTNode>& res = m[q]; +  if (!res) { +    res.reset(new MyNode(q, this)); +  } +  return &*res; +} + +const WFSTNode* MyFST::Final() const { +  return final; +} + +const WFSTNode* MyFST::Initial() const { +  return init; +} + +int main(int argc, char** argv) { +  po::variables_map conf; +  InitCommandLine(argc, argv, &conf); +  kMAX_TRG_PHRASE = conf["max_trg_phrase"].as<unsigned>(); +  kMAX_SRC_PHRASE = conf["max_src_phrase"].as<unsigned>(); + +  if (!conf.count("model1")) { +    cerr << argv[0] << "Please use --model1 to specify model 1 parameters\n"; +    return 1; +  } +  shared_ptr<MT19937> prng; +  if (conf.count("random_seed")) +    prng.reset(new MT19937(conf["random_seed"].as<uint32_t>())); +  else +    prng.reset(new MT19937); +  MT19937& rng = *prng; + +  vector<vector<int> > corpuse, corpusf; +  set<int> vocabe, vocabf; +  ReadParallelCorpus(conf["input"].as<string>(), &corpusf, &corpuse, &vocabf, &vocabe); +  cerr << "f-Corpus size: " << corpusf.size() << " sentences\n"; +  cerr << "f-Vocabulary size: " << vocabf.size() << " types\n"; +  cerr << "f-Corpus size: " << corpuse.size() << " sentences\n"; +  cerr << "f-Vocabulary size: " << vocabe.size() << " types\n"; +  assert(corpusf.size() == corpuse.size()); + +  ConditionalBase lp0(conf["model1_interpolation_weight"].as<double>(), +                      vocabe.size(), +                      conf["model1"].as<string>()); +  MyModel m(lp0); + +  TRule x("[X] ||| kAnwntR myN ||| at the convent ||| 0"); +  m.IncrementRule(x); +  TRule y("[X] ||| nY dyN ||| gave ||| 0"); +  m.IncrementRule(y); + + +  MyFST fst(corpusf[0], corpuse[0], &m); +  ifstream in("./kimura.g"); +  assert(in); +  CFG_WFSTComposer comp(fst); +  Hypergraph hg; +  bool succeed = comp.Compose(&in, &hg); +  hg.PrintGraphviz(); +  if (succeed) { cerr << "SUCCESS.\n"; } else { cerr << "FAILURE REPORTED.\n"; } + +#if 0 +  ifstream in2("./amnabooks.g"); +  assert(in2); +  MyFST fst2(corpusf[1], corpuse[1], &m); +  CFG_WFSTComposer comp2(fst2); +  Hypergraph hg2; +  bool succeed2 = comp2.Compose(&in2, &hg2); +  if (succeed2) { cerr << "SUCCESS.\n"; } else { cerr << "FAILURE REPORTED.\n"; } +#endif + +  SparseVector<double> w; w.set_value(FD::Convert("Proposal"), 1.0); +  hg.Reweight(w); +  cerr << ViterbiFTree(hg) << endl; +  return 0; +} + diff --git a/gi/pf/cbgi.cc b/gi/pf/cbgi.cc new file mode 100644 index 00000000..97f1ba34 --- /dev/null +++ b/gi/pf/cbgi.cc @@ -0,0 +1,330 @@ +#include <queue> +#include <sstream> +#include <iostream> + +#include <boost/unordered_map.hpp> +#include <boost/functional/hash.hpp> + +#include "sampler.h" +#include "filelib.h" +#include "hg_io.h" +#include "hg.h" +#include "ccrp_nt.h" +#include "trule.h" +#include "inside_outside.h" + +using namespace std; +using namespace std::tr1; + +double log_poisson(unsigned x, const double& lambda) { +  assert(lambda > 0.0); +  return log(lambda) * x - lgamma(x + 1) - lambda; +} + +double log_decay(unsigned x, const double& b) { +  assert(b > 1.0); +  assert(x > 0); +  return log(b - 1) - x * log(b); +} + +struct SimpleBase { +  SimpleBase(unsigned esize, unsigned fsize, unsigned ntsize = 144) : +    uniform_e(-log(esize)), +    uniform_f(-log(fsize)), +    uniform_nt(-log(ntsize)) { +  } + +  // binomial coefficient +  static double choose(unsigned n, unsigned k) { +    return exp(lgamma(n + 1) - lgamma(k + 1) - lgamma(n - k + 1)); +  } + +  // count the number of patterns of terminals and NTs 
in the rule, given elen and flen +  static double log_number_of_patterns(const unsigned flen, const unsigned elen) { +    static vector<vector<double> > counts; +    if (elen >= counts.size()) counts.resize(elen + 1); +    if (flen >= counts[elen].size()) counts[elen].resize(flen + 1); +    double& count = counts[elen][flen]; +    if (count) return log(count); +    const unsigned max_arity = min(elen, flen); +    for (unsigned a = 0; a <= max_arity; ++a) +      count += choose(elen, a) * choose(flen, a); +    return log(count); +  } + +  // return logp0 of rule | LHS +  double operator()(const TRule& rule) const { +    const unsigned flen = rule.f_.size(); +    const unsigned elen = rule.e_.size(); +#if 0 +    double p = 0; +    p += log_poisson(flen, 0.5);                   // flen                 ~Pois(0.5) +    p += log_poisson(elen, flen);                  // elen | flen          ~Pois(flen) +    p -= log_number_of_patterns(flen, elen);       // pattern | flen,elen  ~Uniform +    for (unsigned i = 0; i < flen; ++i) {          // for each position in f-RHS +      if (rule.f_[i] <= 0)                         //   according to pattern +        p += uniform_nt;                           //     draw NT          ~Uniform +      else +        p += uniform_f;                            //     draw f terminal  ~Uniform +    } +    p -= lgamma(rule.Arity() + 1);                 // draw permutation     ~Uniform  +    for (unsigned i = 0; i < elen; ++i) {          // for each position in e-RHS +      if (rule.e_[i] > 0)                          //   according to pattern +        p += uniform_e;                            //     draw e|f term    ~Uniform +        // TODO this should prob be model 1 +    } +#else +    double p = 0; +    bool is_abstract = rule.f_[0] <= 0; +    p += log(0.5); +    if (is_abstract) { +      if (flen == 2) p += log(0.99); else p += log(0.01); +    } else { +      p += log_decay(flen, 3); +    } + +    for (unsigned i = 0; i < flen; ++i) {          // for each position in f-RHS +      if (rule.f_[i] <= 0)                         //   according to pattern +        p += uniform_nt;                           //     draw NT          ~Uniform +      else +        p += uniform_f;                            //     draw f terminal  ~Uniform +    } +#endif +    return p; +  } +  const double uniform_e; +  const double uniform_f; +  const double uniform_nt; +  vector<double> arities; +}; + +MT19937* rng = NULL; + +template <typename Base> +struct MHSamplerEdgeProb { +  MHSamplerEdgeProb(const Hypergraph& hg, +                  const map<int, CCRP_NoTable<TRule> >& rdp, +                  const Base& logp0, +                  const bool exclude_multiword_terminals) : edge_probs(hg.edges_.size()) { +    for (int i = 0; i < edge_probs.size(); ++i) { +      const TRule& rule = *hg.edges_[i].rule_; +      const map<int, CCRP_NoTable<TRule> >::const_iterator it = rdp.find(rule.lhs_); +      assert(it != rdp.end()); +      const CCRP_NoTable<TRule>& crp = it->second; +      edge_probs[i].logeq(crp.logprob(rule, logp0(rule))); +      if (exclude_multiword_terminals && rule.f_[0] > 0 && rule.f_.size() > 1) +        edge_probs[i] = prob_t::Zero(); +    } +  } +  inline prob_t operator()(const Hypergraph::Edge& e) const { +    return edge_probs[e.id_]; +  } +  prob_t DerivationProb(const vector<int>& d) const { +    prob_t p = prob_t::One(); +    for (unsigned i = 0; i < d.size(); ++i) +      p *= edge_probs[d[i]]; +    return p; +  } +  vector<prob_t> edge_probs; +}; + +template <typename 
Base> +struct ModelAndData { +  ModelAndData() : +     base_lh(prob_t::One()), +     logp0(10000, 10000), +     mh_samples(), +     mh_rejects() {} + +  void SampleCorpus(const string& hgpath, int i); +  void ResampleHyperparameters() { +    for (map<int, CCRP_NoTable<TRule> >::iterator it = rules.begin(); it != rules.end(); ++it) +      it->second.resample_hyperparameters(rng); +  } + +  CCRP_NoTable<TRule>& RuleCRP(int lhs) { +    map<int, CCRP_NoTable<TRule> >::iterator it = rules.find(lhs); +    if (it == rules.end()) { +      rules.insert(make_pair(lhs, CCRP_NoTable<TRule>(1,1))); +      it = rules.find(lhs); +    } +    return it->second; +  } + +  void IncrementRule(const TRule& rule) { +    CCRP_NoTable<TRule>& crp = RuleCRP(rule.lhs_); +    if (crp.increment(rule)) { +      prob_t p; p.logeq(logp0(rule)); +      base_lh *= p; +    } +  } + +  void DecrementRule(const TRule& rule) { +    CCRP_NoTable<TRule>& crp = RuleCRP(rule.lhs_); +    if (crp.decrement(rule)) { +      prob_t p; p.logeq(logp0(rule)); +      base_lh /= p; +    } +  } + +  void DecrementDerivation(const Hypergraph& hg, const vector<int>& d) { +    for (unsigned i = 0; i < d.size(); ++i) { +      const TRule& rule = *hg.edges_[d[i]].rule_; +      DecrementRule(rule); +    } +  } + +  void IncrementDerivation(const Hypergraph& hg, const vector<int>& d) { +    for (unsigned i = 0; i < d.size(); ++i) { +      const TRule& rule = *hg.edges_[d[i]].rule_; +      IncrementRule(rule); +    } +  } + +  prob_t Likelihood() const { +    prob_t p = prob_t::One(); +    for (map<int, CCRP_NoTable<TRule> >::const_iterator it = rules.begin(); it != rules.end(); ++it) { +      prob_t q; q.logeq(it->second.log_crp_prob()); +      p *= q; +    } +    p *= base_lh; +    return p; +  } + +  void ResampleDerivation(const Hypergraph& hg, vector<int>* sampled_derivation); + +  map<int, CCRP_NoTable<TRule> > rules;  // [lhs] -> distribution over RHSs +  prob_t base_lh; +  SimpleBase logp0; +  vector<vector<int> > samples;   // sampled derivations +  unsigned int mh_samples; +  unsigned int mh_rejects; +}; + +template <typename Base> +void ModelAndData<Base>::SampleCorpus(const string& hgpath, int n) { +  vector<Hypergraph> hgs(n); hgs.clear(); +  boost::unordered_map<TRule, unsigned> acc; +  map<int, unsigned> tot; +  for (int i = 0; i < n; ++i) { +    ostringstream os; +    os << hgpath << '/' << i << ".json.gz"; +    if (!FileExists(os.str())) continue; +    hgs.push_back(Hypergraph()); +    ReadFile rf(os.str()); +    HypergraphIO::ReadFromJSON(rf.stream(), &hgs.back()); +  } +  cerr << "Read " << hgs.size() << " alignment hypergraphs.\n"; +  samples.resize(hgs.size()); +  const unsigned SAMPLES = 2000; +  const unsigned burnin = 3 * SAMPLES / 4; +  const unsigned every = 20; +  for (unsigned s = 0; s < SAMPLES; ++s) { +    if (s % 10 == 0) { +      if (s > 0) { cerr << endl; ResampleHyperparameters(); } +      cerr << "[" << s << " LLH=" << log(Likelihood()) << " REJECTS=" << ((double)mh_rejects / mh_samples) << " LHS's=" << rules.size() << " base=" << log(base_lh) << "] "; +    } +    cerr << '.'; +    for (unsigned i = 0; i < hgs.size(); ++i) { +      ResampleDerivation(hgs[i], &samples[i]); +      if (s > burnin && s % every == 0) { +        for (unsigned j = 0; j < samples[i].size(); ++j) { +          const TRule& rule = *hgs[i].edges_[samples[i][j]].rule_; +          ++acc[rule]; +          ++tot[rule.lhs_]; +        } +      } +    } +  } +  cerr << endl; +  for (boost::unordered_map<TRule,unsigned>::iterator it = acc.begin(); it 
!= acc.end(); ++it) { +    cout << it->first << " MyProb=" << log(it->second)-log(tot[it->first.lhs_]) << endl; +  } +} + +template <typename Base> +void ModelAndData<Base>::ResampleDerivation(const Hypergraph& hg, vector<int>* sampled_deriv) { +  vector<int> cur; +  cur.swap(*sampled_deriv); + +  const prob_t p_cur = Likelihood(); +  DecrementDerivation(hg, cur); +  if (cur.empty()) { +    // first iteration, create restaurants +    for (int i = 0; i < hg.edges_.size(); ++i) +      RuleCRP(hg.edges_[i].rule_->lhs_); +  } +  MHSamplerEdgeProb<SimpleBase> wf(hg, rules, logp0, cur.empty()); +//  MHSamplerEdgeProb<SimpleBase> wf(hg, rules, logp0, false); +  const prob_t q_cur = wf.DerivationProb(cur); +  vector<prob_t> node_probs; +  Inside<prob_t, MHSamplerEdgeProb<SimpleBase> >(hg, &node_probs, wf); +  queue<unsigned> q; +  q.push(hg.nodes_.size() - 3); +  while(!q.empty()) { +    unsigned cur_node_id = q.front(); +//    cerr << "NODE=" << cur_node_id << endl; +    q.pop(); +    const Hypergraph::Node& node = hg.nodes_[cur_node_id]; +    const unsigned num_in_edges = node.in_edges_.size(); +    unsigned sampled_edge = 0; +    if (num_in_edges == 1) { +      sampled_edge = node.in_edges_[0]; +    } else { +      prob_t z; +      assert(num_in_edges > 1); +      SampleSet<prob_t> ss; +      for (unsigned j = 0; j < num_in_edges; ++j) { +        const Hypergraph::Edge& edge = hg.edges_[node.in_edges_[j]]; +        prob_t p = wf.edge_probs[edge.id_];             // edge proposal prob +        for (unsigned k = 0; k < edge.tail_nodes_.size(); ++k) +          p *= node_probs[edge.tail_nodes_[k]]; +        ss.add(p); +//        cerr << log(ss[j]) << " ||| " << edge.rule_->AsString() << endl; +        z += p; +      } +//      for (unsigned j = 0; j < num_in_edges; ++j) { +//        const Hypergraph::Edge& edge = hg.edges_[node.in_edges_[j]]; +//        cerr << exp(log(ss[j] / z)) << " ||| " << edge.rule_->AsString() << endl; +//      } +//      cerr << " --- \n"; +      sampled_edge = node.in_edges_[rng->SelectSample(ss)]; +    } +    sampled_deriv->push_back(sampled_edge); +    const Hypergraph::Edge& edge = hg.edges_[sampled_edge]; +    for (unsigned j = 0; j < edge.tail_nodes_.size(); ++j) { +      q.push(edge.tail_nodes_[j]); +    } +  } +  IncrementDerivation(hg, *sampled_deriv); + +//  cerr << "sampled derivation contains " << sampled_deriv->size() << " edges\n"; +//  cerr << "DERIV:\n"; +//  for (int i = 0; i < sampled_deriv->size(); ++i) { +//    cerr << "  " << hg.edges_[(*sampled_deriv)[i]].rule_->AsString() << endl; +//  } + +  if (cur.empty()) return;  // accept first sample + +  ++mh_samples; +  // only need to do MH if proposal is different to current state +  if (cur != *sampled_deriv) { +    const prob_t q_prop = wf.DerivationProb(*sampled_deriv); +    const prob_t p_prop = Likelihood(); +    if (!rng->AcceptMetropolisHastings(p_prop, p_cur, q_prop, q_cur)) { +      ++mh_rejects; +      DecrementDerivation(hg, *sampled_deriv); +      IncrementDerivation(hg, cur); +      swap(cur, *sampled_deriv); +    } +  } +} + +int main(int argc, char** argv) { +  rng = new MT19937; +  ModelAndData<SimpleBase> m; +  m.SampleCorpus("./hgs", 50); +  // m.SampleCorpus("./btec/hgs", 5000); +  return 0; +} + diff --git a/gi/pf/cfg_wfst_composer.cc b/gi/pf/cfg_wfst_composer.cc new file mode 100644 index 00000000..a31b5be8 --- /dev/null +++ b/gi/pf/cfg_wfst_composer.cc @@ -0,0 +1,730 @@ +#include "cfg_wfst_composer.h" + +#include <iostream> +#include <fstream> +#include <map> +#include <queue> 
+#include <tr1/unordered_set> + +#include <boost/shared_ptr.hpp> +#include <boost/program_options.hpp> +#include <boost/program_options/variables_map.hpp> +#include "fast_lexical_cast.hpp" + +#include "phrasetable_fst.h" +#include "sparse_vector.h" +#include "tdict.h" +#include "hg.h" + +using boost::shared_ptr; +namespace po = boost::program_options; +using namespace std; +using namespace std::tr1; + +WFSTNode::~WFSTNode() {} +WFST::~WFST() {} + +// Define the following macro if you want to see lots of debugging output +// when you run the chart parser +#undef DEBUG_CHART_PARSER + +// A few constants used by the chart parser /////////////// +static const int kMAX_NODES = 2000000; +static const string kPHRASE_STRING = "X"; +static bool constants_need_init = true; +static WordID kUNIQUE_START; +static WordID kPHRASE; +static TRulePtr kX1X2; +static TRulePtr kX1; +static WordID kEPS; +static TRulePtr kEPSRule; + +static void InitializeConstants() { +  if (constants_need_init) { +    kPHRASE = TD::Convert(kPHRASE_STRING) * -1; +    kUNIQUE_START = TD::Convert("S") * -1; +    kX1X2.reset(new TRule("[X] ||| [X,1] [X,2] ||| [X,1] [X,2]")); +    kX1.reset(new TRule("[X] ||| [X,1] ||| [X,1]")); +    kEPSRule.reset(new TRule("[X] ||| <eps> ||| <eps>")); +    kEPS = TD::Convert("<eps>"); +    constants_need_init = false; +  } +} +//////////////////////////////////////////////////////////// + +class EGrammarNode { +  friend bool CFG_WFSTComposer::Compose(const Hypergraph& src_forest, Hypergraph* trg_forest); +  friend void AddGrammarRule(const string& r, map<WordID, EGrammarNode>* g); + public: +#ifdef DEBUG_CHART_PARSER +  string hint; +#endif +  EGrammarNode() : is_some_rule_complete(false), is_root(false) {} +  const map<WordID, EGrammarNode>& GetTerminals() const { return tptr; } +  const map<WordID, EGrammarNode>& GetNonTerminals() const { return ntptr; } +  bool HasNonTerminals() const { return (!ntptr.empty()); } +  bool HasTerminals() const { return (!tptr.empty()); } +  bool RuleCompletes() const { +    return (is_some_rule_complete || (ntptr.empty() && tptr.empty())); +  } +  bool GrammarContinues() const { +    return !(ntptr.empty() && tptr.empty()); +  } +  bool IsRoot() const { +    return is_root; +  } +  // these are the features associated with the rule from the start +  // node up to this point.  If you use these features, you must +  // not Extend() this rule. 
+  const SparseVector<double>& GetCFGProductionFeatures() const { +    return input_features; +  } + +  const EGrammarNode* Extend(const WordID& t) const { +    if (t < 0) { +      map<WordID, EGrammarNode>::const_iterator it = ntptr.find(t); +      if (it == ntptr.end()) return NULL; +      return &it->second; +    } else { +      map<WordID, EGrammarNode>::const_iterator it = tptr.find(t); +      if (it == tptr.end()) return NULL; +      return &it->second; +    } +  } + + private: +  map<WordID, EGrammarNode> tptr; +  map<WordID, EGrammarNode> ntptr; +  SparseVector<double> input_features; +  bool is_some_rule_complete; +  bool is_root; +}; +typedef map<WordID, EGrammarNode> EGrammar;    // indexed by the rule LHS + +// edges are immutable once created +struct Edge { +#ifdef DEBUG_CHART_PARSER +  static int id_count; +  const int id; +#endif +  const WordID cat;                   // lhs side of rule proved/being proved +  const EGrammarNode* const dot;      // dot position +  const WFSTNode* const q;             // start of span +  const WFSTNode* const r;             // end of span +  const Edge* const active_parent;    // back pointer, NULL for PREDICT items +  const Edge* const passive_parent;   // back pointer, NULL for SCAN and PREDICT items +  TRulePtr tps;   // translations +  shared_ptr<SparseVector<double> > features; // features from CFG rule + +  bool IsPassive() const { +    // when a rule is completed, this value will be set +    return static_cast<bool>(features); +  } +  bool IsActive() const { return !IsPassive(); } +  bool IsInitial() const { +    return !(active_parent || passive_parent); +  } +  bool IsCreatedByScan() const { +    return active_parent && !passive_parent && !dot->IsRoot(); +  } +  bool IsCreatedByPredict() const { +    return dot->IsRoot(); +  } +  bool IsCreatedByComplete() const { +    return active_parent && passive_parent; +  } + +  // constructor for PREDICT +  Edge(WordID c, const EGrammarNode* d, const WFSTNode* q_and_r) : +#ifdef DEBUG_CHART_PARSER +    id(++id_count), +#endif +    cat(c), dot(d), q(q_and_r), r(q_and_r), active_parent(NULL), passive_parent(NULL), tps() {} +  Edge(WordID c, const EGrammarNode* d, const WFSTNode* q_and_r, const Edge* act_parent) : +#ifdef DEBUG_CHART_PARSER +    id(++id_count), +#endif +    cat(c), dot(d), q(q_and_r), r(q_and_r), active_parent(act_parent), passive_parent(NULL), tps() {} + +  // constructors for SCAN +  Edge(WordID c, const EGrammarNode* d, const WFSTNode* i, const WFSTNode* j, +       const Edge* act_par, const TRulePtr& translations) : +#ifdef DEBUG_CHART_PARSER +    id(++id_count), +#endif +    cat(c), dot(d), q(i), r(j), active_parent(act_par), passive_parent(NULL), tps(translations) {} + +  Edge(WordID c, const EGrammarNode* d, const WFSTNode* i, const WFSTNode* j, +       const Edge* act_par, const TRulePtr& translations, +       const SparseVector<double>& feats) : +#ifdef DEBUG_CHART_PARSER +    id(++id_count), +#endif +    cat(c), dot(d), q(i), r(j), active_parent(act_par), passive_parent(NULL), tps(translations), +    features(new SparseVector<double>(feats)) {} + +  // constructors for COMPLETE +  Edge(WordID c, const EGrammarNode* d, const WFSTNode* i, const WFSTNode* j, +       const Edge* act_par, const Edge *pas_par) : +#ifdef DEBUG_CHART_PARSER +    id(++id_count), +#endif +    cat(c), dot(d), q(i), r(j), active_parent(act_par), passive_parent(pas_par), tps() { +      assert(pas_par->IsPassive()); +      assert(act_par->IsActive()); +    } + +  Edge(WordID c, const EGrammarNode* d, 
const WFSTNode* i, const WFSTNode* j, +       const Edge* act_par, const Edge *pas_par, const SparseVector<double>& feats) : +#ifdef DEBUG_CHART_PARSER +    id(++id_count), +#endif +    cat(c), dot(d), q(i), r(j), active_parent(act_par), passive_parent(pas_par), tps(), +    features(new SparseVector<double>(feats)) { +      assert(pas_par->IsPassive()); +      assert(act_par->IsActive()); +    } + +  // constructor for COMPLETE query +  Edge(const WFSTNode* _r) : +#ifdef DEBUG_CHART_PARSER +    id(0), +#endif +    cat(0), dot(NULL), q(NULL), +    r(_r), active_parent(NULL), passive_parent(NULL), tps() {} +  // constructor for MERGE quere +  Edge(const WFSTNode* _q, int) : +#ifdef DEBUG_CHART_PARSER +    id(0), +#endif +    cat(0), dot(NULL), q(_q), +    r(NULL), active_parent(NULL), passive_parent(NULL), tps() {} +}; +#ifdef DEBUG_CHART_PARSER +int Edge::id_count = 0; +#endif + +ostream& operator<<(ostream& os, const Edge& e) { +  string type = "PREDICT"; +  if (e.IsCreatedByScan()) +    type = "SCAN"; +  else if (e.IsCreatedByComplete()) +    type = "COMPLETE"; +  os << "[" +#ifdef DEBUG_CHART_PARSER +     << '(' << e.id << ") " +#else +     << '(' << &e << ") " +#endif +     << "q=" << e.q << ", r=" << e.r +     << ", cat="<< TD::Convert(e.cat*-1) << ", dot=" +     << e.dot +#ifdef DEBUG_CHART_PARSER +     << e.dot->hint +#endif +     << (e.IsActive() ? ", Active" : ", Passive") +     << ", " << type; +#ifdef DEBUG_CHART_PARSER +  if (e.active_parent) { os << ", act.parent=(" << e.active_parent->id << ')'; } +  if (e.passive_parent) { os << ", psv.parent=(" << e.passive_parent->id << ')'; } +#endif +  if (e.tps) { os << ", tps=" << e.tps->AsString(); } +  return os << ']'; +} + +struct Traversal { +  const Edge* const edge;      // result from the active / passive combination +  const Edge* const active; +  const Edge* const passive; +  Traversal(const Edge* me, const Edge* a, const Edge* p) : edge(me), active(a), passive(p) {} +}; + +struct UniqueTraversalHash { +  size_t operator()(const Traversal* t) const { +    size_t x = 5381; +    x = ((x << 5) + x) ^ reinterpret_cast<size_t>(t->active); +    x = ((x << 5) + x) ^ reinterpret_cast<size_t>(t->passive); +    x = ((x << 5) + x) ^ t->edge->IsActive(); +    return x; +  } +}; + +struct UniqueTraversalEquals { +  size_t operator()(const Traversal* a, const Traversal* b) const { +    return (a->passive == b->passive && a->active == b->active && a->edge->IsActive() == b->edge->IsActive()); +  } +}; + +struct UniqueEdgeHash { +  size_t operator()(const Edge* e) const { +    size_t x = 5381; +    if (e->IsActive()) { +      x = ((x << 5) + x) ^ reinterpret_cast<size_t>(e->dot); +      x = ((x << 5) + x) ^ reinterpret_cast<size_t>(e->q); +      x = ((x << 5) + x) ^ reinterpret_cast<size_t>(e->r); +      x = ((x << 5) + x) ^ static_cast<size_t>(e->cat); +      x += 13; +    } else {  // with passive edges, we don't care about the dot +      x = ((x << 5) + x) ^ reinterpret_cast<size_t>(e->q); +      x = ((x << 5) + x) ^ reinterpret_cast<size_t>(e->r); +      x = ((x << 5) + x) ^ static_cast<size_t>(e->cat); +    } +    return x; +  } +}; + +struct UniqueEdgeEquals { +  bool operator()(const Edge* a, const Edge* b) const { +    if (a->IsActive() != b->IsActive()) return false; +    if (a->IsActive()) { +      return (a->cat == b->cat) && (a->dot == b->dot) && (a->q == b->q) && (a->r == b->r); +    } else { +      return (a->cat == b->cat) && (a->q == b->q) && (a->r == b->r); +    } +  } +}; + +struct REdgeHash { +  size_t operator()(const Edge* 
e) const { +    size_t x = 5381; +    x = ((x << 5) + x) ^ reinterpret_cast<size_t>(e->r); +    return x; +  } +}; + +struct REdgeEquals { +  bool operator()(const Edge* a, const Edge* b) const { +    return (a->r == b->r); +  } +}; + +struct QEdgeHash { +  size_t operator()(const Edge* e) const { +    size_t x = 5381; +    x = ((x << 5) + x) ^ reinterpret_cast<size_t>(e->q); +    return x; +  } +}; + +struct QEdgeEquals { +  bool operator()(const Edge* a, const Edge* b) const { +    return (a->q == b->q); +  } +}; + +struct EdgeQueue { +  queue<const Edge*> q; +  EdgeQueue() {} +  void clear() { while(!q.empty()) q.pop(); } +  bool HasWork() const { return !q.empty(); } +  const Edge* Next() { const Edge* res = q.front(); q.pop(); return res; } +  void AddEdge(const Edge* s) { q.push(s); } +}; + +class CFG_WFSTComposerImpl { + public: +  CFG_WFSTComposerImpl(WordID start_cat, +                       const WFSTNode* q_0, +                       const WFSTNode* q_final) : start_cat_(start_cat), q_0_(q_0), q_final_(q_final) {} + +  // returns false if the intersection is empty +  bool Compose(const EGrammar& g, Hypergraph* forest) { +    goal_node = NULL; +    EGrammar::const_iterator sit = g.find(start_cat_); +    forest->ReserveNodes(kMAX_NODES); +    assert(sit != g.end()); +    Edge* init = new Edge(start_cat_, &sit->second, q_0_); +    assert(IncorporateNewEdge(init)); +    while (exp_agenda.HasWork() || agenda.HasWork()) { +      while(exp_agenda.HasWork()) { +        const Edge* edge = exp_agenda.Next(); +        FinishEdge(edge, forest); +      } +      if (agenda.HasWork()) { +        const Edge* edge = agenda.Next(); +#ifdef DEBUG_CHART_PARSER +        cerr << "processing (" << edge->id << ')' << endl; +#endif +        if (edge->IsActive()) { +          if (edge->dot->HasTerminals()) +            DoScan(edge); +          if (edge->dot->HasNonTerminals()) { +            DoMergeWithPassives(edge); +            DoPredict(edge, g); +          } +        } else { +          DoComplete(edge); +        } +      } +    } +    if (goal_node) { +      forest->PruneUnreachable(goal_node->id_); +      forest->EpsilonRemove(kEPS); +    } +    FreeAll(); +    return goal_node; +  } + +  void FreeAll() { +    for (int i = 0; i < free_list_.size(); ++i) +      delete free_list_[i]; +    free_list_.clear(); +    for (int i = 0; i < traversal_free_list_.size(); ++i) +      delete traversal_free_list_[i]; +    traversal_free_list_.clear(); +    all_traversals.clear(); +    exp_agenda.clear(); +    agenda.clear(); +    tps2node.clear(); +    edge2node.clear(); +    all_edges.clear(); +    passive_edges.clear(); +    active_edges.clear(); +  } + +  ~CFG_WFSTComposerImpl() { +    FreeAll(); +  } + +  // returns the total number of edges created during composition +  int EdgesCreated() const { +    return free_list_.size(); +  } + + private: +  void DoScan(const Edge* edge) { +    // here, we assume that the FST will potentially have many more outgoing +    // edges than the grammar, which will be just a couple.  If you want to +    // efficiently handle the case where both are relatively large, this code +    // will need to change how the intersection is done.  The best general +    // solution would probably be the Baeza-Yates double binary search. 
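+    //
+    // As a hedged illustration only (not used by this parser), a Baeza-Yates style
+    // double binary search over two sorted symbol lists might look like the sketch
+    // below; it assumes both sides are available as sorted vector<WordID>, which is
+    // not how the grammar trie or the WFST arcs are actually stored here:
+    //
+    //   static void IntersectSorted(const std::vector<WordID>& a,
+    //                               const std::vector<WordID>& b,
+    //                               std::vector<WordID>* out) {
+    //     if (a.empty() || b.empty()) return;
+    //     const size_t mid = a.size() / 2;                    // pivot on the median of a
+    //     const WordID pivot = a[mid];
+    //     std::vector<WordID>::const_iterator split =
+    //         std::lower_bound(b.begin(), b.end(), pivot);    // binary search in b
+    //     IntersectSorted(std::vector<WordID>(a.begin(), a.begin() + mid),
+    //                     std::vector<WordID>(b.begin(), split), out);
+    //     if (split != b.end() && *split == pivot) {
+    //       out->push_back(pivot);                            // common symbol found
+    //       ++split;
+    //     }
+    //     IntersectSorted(std::vector<WordID>(a.begin() + mid + 1, a.end()),
+    //                     std::vector<WordID>(split, b.end()), out);
+    //   }
+    //
+    // The cost is roughly O(m log n) when one side is much smaller than the other,
+    // and degrades gracefully toward linear-merge behaviour when the sizes are
+    // comparable.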
+ +    const EGrammarNode* dot = edge->dot; +    const WFSTNode* r = edge->r; +    const map<WordID, EGrammarNode>& terms = dot->GetTerminals(); +    for (map<WordID, EGrammarNode>::const_iterator git = terms.begin(); +         git != terms.end(); ++git) { + +      if (!(TD::Convert(git->first)[0] >= '0' && TD::Convert(git->first)[0] <= '9')) { +        std::cerr << "TERMINAL SYMBOL: " << TD::Convert(git->first) << endl; +        abort(); +      } +      std::vector<std::pair<const WFSTNode*, TRulePtr> > extensions = r->ExtendInput(atoi(TD::Convert(git->first))); +      for (unsigned nsi = 0; nsi < extensions.size(); ++nsi) { +        const WFSTNode* next_r = extensions[nsi].first; +        const EGrammarNode* next_dot = &git->second; +        const bool grammar_continues = next_dot->GrammarContinues(); +        const bool rule_completes    = next_dot->RuleCompletes(); +        if (extensions[nsi].second) +          cerr << "!!! " << extensions[nsi].second->AsString() << endl; +        // cerr << "  rule completes: " << rule_completes << " after consuming " << TD::Convert(git->first) << endl; +        assert(grammar_continues || rule_completes); +        const SparseVector<double>& input_features = next_dot->GetCFGProductionFeatures(); +        if (rule_completes) +          IncorporateNewEdge(new Edge(edge->cat, next_dot, edge->q, next_r, edge, extensions[nsi].second, input_features)); +        if (grammar_continues) +          IncorporateNewEdge(new Edge(edge->cat, next_dot, edge->q, next_r, edge, extensions[nsi].second)); +      } +    } +  } + +  void DoPredict(const Edge* edge, const EGrammar& g) { +    const EGrammarNode* dot = edge->dot; +    const map<WordID, EGrammarNode>& non_terms = dot->GetNonTerminals(); +    for (map<WordID, EGrammarNode>::const_iterator git = non_terms.begin(); +         git != non_terms.end(); ++git) { +      const WordID nt_to_predict = git->first; +      //cerr << edge->id << " -- " << TD::Convert(nt_to_predict*-1) << endl; +      EGrammar::const_iterator egi = g.find(nt_to_predict); +      if (egi == g.end()) { +        cerr << "[ERROR] Can't find any grammar rules with a LHS of type " +             << TD::Convert(-1*nt_to_predict) << '!' 
<< endl; +        continue; +      } +      assert(edge->IsActive()); +      const EGrammarNode* new_dot = &egi->second; +      Edge* new_edge = new Edge(nt_to_predict, new_dot, edge->r, edge); +      IncorporateNewEdge(new_edge); +    } +  } + +  void DoComplete(const Edge* passive) { +#ifdef DEBUG_CHART_PARSER +    cerr << "  complete: " << *passive << endl; +#endif +    const WordID completed_nt = passive->cat; +    const WFSTNode* q = passive->q; +    const WFSTNode* next_r = passive->r; +    const Edge query(q); +    const pair<unordered_multiset<const Edge*, REdgeHash, REdgeEquals>::iterator, +         unordered_multiset<const Edge*, REdgeHash, REdgeEquals>::iterator > p = +      active_edges.equal_range(&query); +    for (unordered_multiset<const Edge*, REdgeHash, REdgeEquals>::iterator it = p.first; +         it != p.second; ++it) { +      const Edge* active = *it; +#ifdef DEBUG_CHART_PARSER +      cerr << "    pos: " << *active << endl; +#endif +      const EGrammarNode* next_dot = active->dot->Extend(completed_nt); +      if (!next_dot) continue; +      const SparseVector<double>& input_features = next_dot->GetCFGProductionFeatures(); +      // add up to 2 rules +      if (next_dot->RuleCompletes()) +        IncorporateNewEdge(new Edge(active->cat, next_dot, active->q, next_r, active, passive, input_features)); +      if (next_dot->GrammarContinues()) +        IncorporateNewEdge(new Edge(active->cat, next_dot, active->q, next_r, active, passive)); +    } +  } + +  void DoMergeWithPassives(const Edge* active) { +    // edge is active, has non-terminals, we need to find the passives that can extend it +    assert(active->IsActive()); +    assert(active->dot->HasNonTerminals()); +#ifdef DEBUG_CHART_PARSER +    cerr << "  merge active with passives: ACT=" << *active << endl; +#endif +    const Edge query(active->r, 1); +    const pair<unordered_multiset<const Edge*, QEdgeHash, QEdgeEquals>::iterator, +         unordered_multiset<const Edge*, QEdgeHash, QEdgeEquals>::iterator > p = +      passive_edges.equal_range(&query); +    for (unordered_multiset<const Edge*, QEdgeHash, QEdgeEquals>::iterator it = p.first; +         it != p.second; ++it) { +      const Edge* passive = *it; +      const EGrammarNode* next_dot = active->dot->Extend(passive->cat); +      if (!next_dot) continue; +      const WFSTNode* next_r = passive->r; +      const SparseVector<double>& input_features = next_dot->GetCFGProductionFeatures(); +      if (next_dot->RuleCompletes()) +        IncorporateNewEdge(new Edge(active->cat, next_dot, active->q, next_r, active, passive, input_features)); +      if (next_dot->GrammarContinues()) +        IncorporateNewEdge(new Edge(active->cat, next_dot, active->q, next_r, active, passive)); +    } +  } + +  // take ownership of edge memory, add to various indexes, etc +  // returns true if this edge is new +  bool IncorporateNewEdge(Edge* edge) { +    free_list_.push_back(edge); +    if (edge->passive_parent && edge->active_parent) { +      Traversal* t = new Traversal(edge, edge->active_parent, edge->passive_parent); +      traversal_free_list_.push_back(t); +      if (all_traversals.find(t) != all_traversals.end()) { +        return false; +      } else { +        all_traversals.insert(t); +      } +    } +    exp_agenda.AddEdge(edge); +    return true; +  } + +  bool FinishEdge(const Edge* edge, Hypergraph* hg) { +    bool is_new = false; +    if (all_edges.find(edge) == all_edges.end()) { +#ifdef DEBUG_CHART_PARSER +      cerr << *edge << " is NEW\n"; +#endif +      
all_edges.insert(edge); +      is_new = true; +      if (edge->IsPassive()) passive_edges.insert(edge); +      if (edge->IsActive()) active_edges.insert(edge); +      agenda.AddEdge(edge); +    } else { +#ifdef DEBUG_CHART_PARSER +      cerr << *edge << " is NOT NEW.\n"; +#endif +    } +    AddEdgeToTranslationForest(edge, hg); +    return is_new; +  } + +  // build the translation forest +  void AddEdgeToTranslationForest(const Edge* edge, Hypergraph* hg) { +    assert(hg->nodes_.size() < kMAX_NODES); +    Hypergraph::Node* tps = NULL; +    // first add any target language rules +    if (edge->tps) { +      Hypergraph::Node*& node = tps2node[(size_t)edge->tps.get()]; +      if (!node) { +        // cerr << "Creating phrases for " << edge->tps << endl; +        const TRulePtr& rule = edge->tps; +        node = hg->AddNode(kPHRASE); +        Hypergraph::Edge* hg_edge = hg->AddEdge(rule, Hypergraph::TailNodeVector()); +        hg_edge->feature_values_ += rule->GetFeatureValues(); +        hg->ConnectEdgeToHeadNode(hg_edge, node); +      } +      tps = node; +    } +    Hypergraph::Node*& head_node = edge2node[edge]; +    if (!head_node) +      head_node = hg->AddNode(kPHRASE); +    if (edge->cat == start_cat_ && edge->q == q_0_ && edge->r == q_final_ && edge->IsPassive()) { +      assert(goal_node == NULL || goal_node == head_node); +      goal_node = head_node; +    } +    Hypergraph::TailNodeVector tail; +    SparseVector<double> extra; +    if (edge->IsCreatedByPredict()) { +      // extra.set_value(FD::Convert("predict"), 1); +    } else if (edge->IsCreatedByScan()) { +      tail.push_back(edge2node[edge->active_parent]->id_); +      if (tps) { +        tail.push_back(tps->id_); +      } +      //extra.set_value(FD::Convert("scan"), 1); +    } else if (edge->IsCreatedByComplete()) { +      tail.push_back(edge2node[edge->active_parent]->id_); +      tail.push_back(edge2node[edge->passive_parent]->id_); +      //extra.set_value(FD::Convert("complete"), 1); +    } else { +      assert(!"unexpected edge type!"); +    } +    //cerr << head_node->id_ << "<--" << *edge << endl; + +#ifdef DEBUG_CHART_PARSER +      for (int i = 0; i < tail.size(); ++i) +        if (tail[i] == head_node->id_) { +          cerr << "ERROR: " << *edge << "\n   i=" << i << endl; +          if (i == 1) { cerr << "\tP: " << *edge->passive_parent << endl; } +          if (i == 0) { cerr << "\tA: " << *edge->active_parent << endl; } +          assert(!"self-loop found!"); +        } +#endif +    Hypergraph::Edge* hg_edge = NULL; +    if (tail.size() == 0) { +      hg_edge = hg->AddEdge(kEPSRule, tail); +    } else if (tail.size() == 1) { +      hg_edge = hg->AddEdge(kX1, tail); +    } else if (tail.size() == 2) { +      hg_edge = hg->AddEdge(kX1X2, tail); +    } +    if (edge->features) +      hg_edge->feature_values_ += *edge->features; +    hg_edge->feature_values_ += extra; +    hg->ConnectEdgeToHeadNode(hg_edge, head_node); +  } + +  Hypergraph::Node* goal_node; +  EdgeQueue exp_agenda; +  EdgeQueue agenda; +  unordered_map<size_t, Hypergraph::Node*> tps2node; +  unordered_map<const Edge*, Hypergraph::Node*, UniqueEdgeHash, UniqueEdgeEquals> edge2node; +  unordered_set<const Traversal*, UniqueTraversalHash, UniqueTraversalEquals> all_traversals; +  unordered_set<const Edge*, UniqueEdgeHash, UniqueEdgeEquals> all_edges; +  unordered_multiset<const Edge*, QEdgeHash, QEdgeEquals> passive_edges; +  unordered_multiset<const Edge*, REdgeHash, REdgeEquals> active_edges; +  vector<Edge*> free_list_; +  vector<Traversal*> 
traversal_free_list_; +  const WordID start_cat_; +  const WFSTNode* const q_0_; +  const WFSTNode* const q_final_; +}; + +#ifdef DEBUG_CHART_PARSER +static string TrimRule(const string& r) { +  size_t start = r.find(" |||") + 5; +  size_t end = r.rfind(" |||"); +  return r.substr(start, end - start); +} +#endif + +void AddGrammarRule(const string& r, EGrammar* g) { +  const size_t pos = r.find(" ||| "); +  if (pos == string::npos || r[0] != '[') { +    cerr << "Bad rule: " << r << endl; +    return; +  } +  const size_t rpos = r.rfind(" ||| "); +  string feats; +  string rs = r; +  if (rpos != pos) { +    feats = r.substr(rpos + 5); +    rs = r.substr(0, rpos); +  } +  string rhs = rs.substr(pos + 5); +  string trule = rs + " ||| " + rhs + " ||| " + feats; +  TRule tr(trule); +  cerr << "X: " << tr.e_[0] << endl; +#ifdef DEBUG_CHART_PARSER +  string hint_last_rule; +#endif +  EGrammarNode* cur = &(*g)[tr.GetLHS()]; +  cur->is_root = true; +  for (int i = 0; i < tr.FLength(); ++i) { +    WordID sym = tr.f()[i]; +#ifdef DEBUG_CHART_PARSER +    hint_last_rule = TD::Convert(sym < 0 ? -sym : sym); +    cur->hint += " <@@> (*" + hint_last_rule + ") " + TrimRule(tr.AsString()); +#endif +    if (sym < 0) +      cur = &cur->ntptr[sym]; +    else +      cur = &cur->tptr[sym]; +  } +#ifdef DEBUG_CHART_PARSER +  cur->hint += " <@@> (" + hint_last_rule + "*) " + TrimRule(tr.AsString()); +#endif +  cur->is_some_rule_complete = true; +  cur->input_features = tr.GetFeatureValues(); +} + +CFG_WFSTComposer::~CFG_WFSTComposer() { +  delete pimpl_; +} + +CFG_WFSTComposer::CFG_WFSTComposer(const WFST& wfst) { +  InitializeConstants(); +  pimpl_ = new CFG_WFSTComposerImpl(kUNIQUE_START, wfst.Initial(), wfst.Final()); +} + +bool CFG_WFSTComposer::Compose(const Hypergraph& src_forest, Hypergraph* trg_forest) { +  // first, convert the src forest into an EGrammar +  EGrammar g; +  const int nedges = src_forest.edges_.size(); +  const int nnodes = src_forest.nodes_.size(); +  vector<int> cats(nnodes); +  bool assign_cats = false; +  for (int i = 0; i < nnodes; ++i) +    if (assign_cats) { +      cats[i] = TD::Convert("CAT_" + boost::lexical_cast<string>(i)) * -1; +    } else { +      cats[i] = src_forest.nodes_[i].cat_; +    } +  // construct the grammar +  for (int i = 0; i < nedges; ++i) { +    const Hypergraph::Edge& edge = src_forest.edges_[i]; +    const vector<WordID>& src = edge.rule_->f(); +    EGrammarNode* cur = &g[cats[edge.head_node_]]; +    cur->is_root = true; +    int ntc = 0; +    for (int j = 0; j < src.size(); ++j) { +      WordID sym = src[j]; +      if (sym <= 0) { +        sym = cats[edge.tail_nodes_[ntc]]; +        ++ntc; +        cur = &cur->ntptr[sym]; +      } else { +        cur = &cur->tptr[sym]; +      } +    } +    cur->is_some_rule_complete = true; +    cur->input_features = edge.feature_values_; +  } +  EGrammarNode& goal_rule = g[kUNIQUE_START]; +  assert((goal_rule.ntptr.size() == 1 && goal_rule.tptr.size() == 0) || +         (goal_rule.ntptr.size() == 0 && goal_rule.tptr.size() == 1)); + +  return pimpl_->Compose(g, trg_forest); +} + +bool CFG_WFSTComposer::Compose(istream* in, Hypergraph* trg_forest) { +  EGrammar g; +  while(*in) { +    string line; +    getline(*in, line); +    if (line.empty()) continue; +    AddGrammarRule(line, &g); +  } + +  return pimpl_->Compose(g, trg_forest); +} diff --git a/gi/pf/cfg_wfst_composer.h b/gi/pf/cfg_wfst_composer.h new file mode 100644 index 00000000..cf47f459 --- /dev/null +++ b/gi/pf/cfg_wfst_composer.h @@ -0,0 +1,46 @@ +#ifndef 
_CFG_WFST_COMPOSER_H_ +#define _CFG_WFST_COMPOSER_H_ + +#include <iostream> +#include <vector> +#include <utility> + +#include "trule.h" +#include "wordid.h" + +class CFG_WFSTComposerImpl; +class Hypergraph; + +struct WFSTNode { +  virtual ~WFSTNode(); +  // returns the next states reachable by consuming srcindex (which identifies a word) +  // paired with the output string generated by taking that transition. +  virtual std::vector<std::pair<const WFSTNode*,TRulePtr> > ExtendInput(unsigned srcindex) const = 0; +}; + +struct WFST { +  virtual ~WFST(); +  virtual const WFSTNode* Final() const = 0; +  virtual const WFSTNode* Initial() const = 0; +}; + +class CFG_WFSTComposer { + public: +  ~CFG_WFSTComposer(); +  explicit CFG_WFSTComposer(const WFST& wfst); +  bool Compose(const Hypergraph& in_forest, Hypergraph* trg_forest); + +  // reads the grammar from a file. There must be a single top-level +  // S -> X rule.  Anything else is possible. Format is: +  // [S] ||| [SS,1] +  // [SS] ||| [NP,1] [VP,2] ||| Feature1=0.2 Feature2=-2.3 +  // [SS] ||| [VP,1] [NP,2] ||| Feature1=0.8 +  // [NP] ||| [DET,1] [N,2] ||| Feature3=2 +  // ... +  bool Compose(std::istream* grammar_file, Hypergraph* trg_forest); + + private: +  CFG_WFSTComposerImpl* pimpl_; +}; + +#endif diff --git a/gi/pf/corpus.cc b/gi/pf/corpus.cc new file mode 100644 index 00000000..a408e7cf --- /dev/null +++ b/gi/pf/corpus.cc @@ -0,0 +1,57 @@ +#include "corpus.h" + +#include <set> +#include <vector> +#include <string> + +#include "tdict.h" +#include "filelib.h" + +using namespace std; + +namespace corpus { + +void ReadParallelCorpus(const string& filename, +                vector<vector<WordID> >* f, +                vector<vector<WordID> >* e, +                set<WordID>* vocab_f, +                set<WordID>* vocab_e) { +  f->clear(); +  e->clear(); +  vocab_f->clear(); +  vocab_e->clear(); +  ReadFile rf(filename); +  istream* in = rf.stream(); +  assert(*in); +  string line; +  const WordID kDIV = TD::Convert("|||"); +  vector<WordID> tmp; +  while(*in) { +    getline(*in, line); +    if (line.empty() && !*in) break; +    e->push_back(vector<int>()); +    f->push_back(vector<int>()); +    vector<int>& le = e->back(); +    vector<int>& lf = f->back(); +    tmp.clear(); +    TD::ConvertSentence(line, &tmp); +    bool isf = true; +    for (unsigned i = 0; i < tmp.size(); ++i) { +      const int cur = tmp[i]; +      if (isf) { +        if (kDIV == cur) { isf = false; } else { +          lf.push_back(cur); +          vocab_f->insert(cur); +        } +      } else { +        assert(cur != kDIV); +        le.push_back(cur); +        vocab_e->insert(cur); +      } +    } +    assert(isf == false); +  } +} + +} + diff --git a/gi/pf/corpus.h b/gi/pf/corpus.h new file mode 100644 index 00000000..e7febdb7 --- /dev/null +++ b/gi/pf/corpus.h @@ -0,0 +1,19 @@ +#ifndef _CORPUS_H_ +#define _CORPUS_H_ + +#include <string> +#include <vector> +#include <set> +#include "wordid.h" + +namespace corpus { + +void ReadParallelCorpus(const std::string& filename, +                std::vector<std::vector<WordID> >* f, +                std::vector<std::vector<WordID> >* e, +                std::set<WordID>* vocab_f, +                std::set<WordID>* vocab_e); + +} + +#endif diff --git a/gi/pf/dpnaive.cc b/gi/pf/dpnaive.cc new file mode 100644 index 00000000..c926487b --- /dev/null +++ b/gi/pf/dpnaive.cc @@ -0,0 +1,294 @@ +#include <iostream> +#include <tr1/memory> +#include <queue> + +#include <boost/multi_array.hpp> +#include <boost/program_options.hpp> 
+#include <boost/program_options/variables_map.hpp> + +#include "base_measures.h" +#include "monotonic_pseg.h" +#include "trule.h" +#include "tdict.h" +#include "filelib.h" +#include "dict.h" +#include "sampler.h" +#include "ccrp_nt.h" +#include "corpus.h" + +using namespace std; +using namespace std::tr1; +namespace po = boost::program_options; + +static unsigned kMAX_SRC_PHRASE; +static unsigned kMAX_TRG_PHRASE; + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { +  po::options_description opts("Configuration options"); +  opts.add_options() +        ("samples,s",po::value<unsigned>()->default_value(1000),"Number of samples") +        ("input,i",po::value<string>(),"Read parallel data from") +        ("max_src_phrase",po::value<unsigned>()->default_value(4),"Maximum length of source language phrases") +        ("max_trg_phrase",po::value<unsigned>()->default_value(4),"Maximum length of target language phrases") +        ("model1,m",po::value<string>(),"Model 1 parameters (used in base distribution)") +        ("model1_interpolation_weight",po::value<double>()->default_value(0.95),"Mixing proportion of model 1 with uniform target distribution") +        ("random_seed,S",po::value<uint32_t>(), "Random seed"); +  po::options_description clo("Command line options"); +  clo.add_options() +        ("config", po::value<string>(), "Configuration file") +        ("help,h", "Print this help message and exit"); +  po::options_description dconfig_options, dcmdline_options; +  dconfig_options.add(opts); +  dcmdline_options.add(opts).add(clo); +   +  po::store(parse_command_line(argc, argv, dcmdline_options), *conf); +  if (conf->count("config")) { +    ifstream config((*conf)["config"].as<string>().c_str()); +    po::store(po::parse_config_file(config, dconfig_options), *conf); +  } +  po::notify(*conf); + +  if (conf->count("help") || (conf->count("input") == 0)) { +    cerr << dcmdline_options << endl; +    exit(1); +  } +} + +shared_ptr<MT19937> prng; + +template <typename Base> +struct ModelAndData { +  explicit ModelAndData(MonotonicParallelSegementationModel& m, const Base& b, const vector<vector<int> >& ce, const vector<vector<int> >& cf, const set<int>& ve, const set<int>& vf) : +     model(m), +     rng(&*prng), +     p0(b), +     baseprob(prob_t::One()), +     corpuse(ce), +     corpusf(cf), +     vocabe(ve), +     vocabf(vf), +     mh_samples(), +     mh_rejects(), +     kX(-TD::Convert("X")), +     derivations(corpuse.size()) {} + +  void ResampleHyperparameters() { +  } + +  void InstantiateRule(const pair<short,short>& from, +                       const pair<short,short>& to, +                       const vector<int>& sentf, +                       const vector<int>& sente, +                       TRule* rule) const { +    rule->f_.clear(); +    rule->e_.clear(); +    rule->lhs_ = kX; +    for (short i = from.first; i < to.first; ++i) +      rule->f_.push_back(sentf[i]); +    for (short i = from.second; i < to.second; ++i) +      rule->e_.push_back(sente[i]); +  } + +  void DecrementDerivation(const vector<pair<short,short> >& d, const vector<int>& sentf, const vector<int>& sente) { +    if (d.size() < 2) return; +    TRule x; +    for (int i = 1; i < d.size(); ++i) { +      InstantiateRule(d[i], d[i-1], sentf, sente, &x); +      model.DecrementRule(x); +      model.DecrementContinue(); +    } +    model.DecrementStop(); +  } + +  void PrintDerivation(const vector<pair<short,short> >& d, const vector<int>& sentf, const vector<int>& sente) { +    if (d.size() < 2) 
return; +    TRule x; +    for (int i = 1; i < d.size(); ++i) { +      InstantiateRule(d[i], d[i-1], sentf, sente, &x); +      cerr << i << '/' << (d.size() - 1) << ": " << x << endl; +    } +  } + +  void IncrementDerivation(const vector<pair<short,short> >& d, const vector<int>& sentf, const vector<int>& sente) { +    if (d.size() < 2) return; +    TRule x; +    for (int i = 1; i < d.size(); ++i) { +      InstantiateRule(d[i], d[i-1], sentf, sente, &x); +      model.IncrementRule(x); +      model.IncrementContinue(); +    } +    model.IncrementStop(); +  } + +  prob_t Likelihood() const { +    return model.Likelihood(); +  } + +  prob_t DerivationProposalProbability(const vector<pair<short,short> >& d, const vector<int>& sentf, const vector<int>& sente) const { +    prob_t p = model.StopProbability(); +    if (d.size() < 2) return p; +    TRule x; +    const prob_t p_cont = model.ContinueProbability(); +    for (int i = 1; i < d.size(); ++i) { +      InstantiateRule(d[i], d[i-1], sentf, sente, &x); +      p *= p_cont; +      p *= model.RuleProbability(x); +    } +    return p; +  } + +  void Sample(); + +  MonotonicParallelSegementationModel& model; +  MT19937* rng; +  const Base& p0; +  prob_t baseprob; // cached value of generating the table table labels from p0 +                   // this can't be used if we go to a hierarchical prior! +  const vector<vector<int> >& corpuse, corpusf; +  const set<int>& vocabe, vocabf; +  unsigned mh_samples, mh_rejects; +  const int kX; +  vector<vector<pair<short, short> > > derivations; +}; + +template <typename Base> +void ModelAndData<Base>::Sample() { +  unsigned MAXK = kMAX_SRC_PHRASE; +  unsigned MAXL = kMAX_TRG_PHRASE; +  TRule x; +  x.lhs_ = -TD::Convert("X"); +  for (int samples = 0; samples < 1000; ++samples) { +    if (samples % 1 == 0 && samples > 0) { +      //ResampleHyperparameters(); +      cerr << " [" << samples << " LLH=" << log(Likelihood()) << " MH=" << ((double)mh_rejects / mh_samples) << "]\n"; +      for (int i = 0; i < 10; ++i) { +        cerr << "SENTENCE: " << TD::GetString(corpusf[i]) << " ||| " << TD::GetString(corpuse[i]) << endl; +        PrintDerivation(derivations[i], corpusf[i], corpuse[i]); +      } +    } +    cerr << '.' 
<< flush; +    for (int s = 0; s < corpuse.size(); ++s) { +      const vector<int>& sentf = corpusf[s]; +      const vector<int>& sente = corpuse[s]; +//      cerr << "  CUSTOMERS: " << rules.num_customers() << endl; +//      cerr << "SENTENCE: " << TD::GetString(sentf) << " ||| " << TD::GetString(sente) << endl; + +      vector<pair<short, short> >& deriv = derivations[s]; +      const prob_t p_cur = Likelihood(); +      DecrementDerivation(deriv, sentf, sente); + +      boost::multi_array<prob_t, 2> a(boost::extents[sentf.size() + 1][sente.size() + 1]); +      boost::multi_array<prob_t, 4> trans(boost::extents[sentf.size() + 1][sente.size() + 1][MAXK][MAXL]); +      a[0][0] = prob_t::One(); +      const prob_t q_stop = model.StopProbability(); +      const prob_t q_cont = model.ContinueProbability(); +      for (int i = 0; i < sentf.size(); ++i) { +        for (int j = 0; j < sente.size(); ++j) { +          const prob_t src_a = a[i][j]; +          x.f_.clear(); +          for (int k = 1; k <= MAXK; ++k) { +            if (i + k > sentf.size()) break; +            x.f_.push_back(sentf[i + k - 1]); +            x.e_.clear(); +            for (int l = 1; l <= MAXL; ++l) { +              if (j + l > sente.size()) break; +              x.e_.push_back(sente[j + l - 1]); +              const bool stop_now = ((j + l) == sente.size()) && ((i + k) == sentf.size()); +              const prob_t& cp = stop_now ? q_stop : q_cont; +              trans[i][j][k - 1][l - 1] = model.RuleProbability(x) * cp; +              a[i + k][j + l] += src_a * trans[i][j][k - 1][l - 1]; +            } +          } +        } +      } +//      cerr << "Inside: " << log(a[sentf.size()][sente.size()]) << endl; +      const prob_t q_cur = DerivationProposalProbability(deriv, sentf, sente); + +      vector<pair<short,short> > newderiv; +      int cur_i = sentf.size(); +      int cur_j = sente.size(); +      while(cur_i > 0 && cur_j > 0) { +        newderiv.push_back(pair<short,short>(cur_i, cur_j)); +//        cerr << "NODE: (" << cur_i << "," << cur_j << ")\n"; +        SampleSet<prob_t> ss; +        vector<pair<short,short> > nexts; +        for (int k = 1; k <= MAXK; ++k) { +          const int hyp_i = cur_i - k; +          if (hyp_i < 0) break; +          for (int l = 1; l <= MAXL; ++l) { +            const int hyp_j = cur_j - l; +            if (hyp_j < 0) break; +            const prob_t& inside = a[hyp_i][hyp_j]; +            if (inside == prob_t::Zero()) continue; +            const prob_t& transp = trans[hyp_i][hyp_j][k - 1][l - 1]; +            if (transp == prob_t::Zero()) continue; +            const prob_t p = inside * transp; +            ss.add(p); +            nexts.push_back(pair<short,short>(hyp_i, hyp_j)); +//            cerr << "    (" << hyp_i << "," << hyp_j << ")  <--- " << log(p) << endl; +          } +        } +//        cerr << "  sample set has " << nexts.size() << " elements.\n"; +        const int selected = rng->SelectSample(ss); +        cur_i = nexts[selected].first; +        cur_j = nexts[selected].second; +      } +      newderiv.push_back(pair<short,short>(0,0)); +      const prob_t q_new = DerivationProposalProbability(newderiv, sentf, sente); +      IncrementDerivation(newderiv, sentf, sente); +//      cerr << "SANITY: " << q_new << "  " <<log(DerivationProposalProbability(newderiv, sentf, sente)) << endl; +      if (deriv.empty()) { deriv = newderiv; continue; } +      ++mh_samples; + +      if (deriv != newderiv) { +        const prob_t p_new = Likelihood(); +//        cerr << 
"p_cur=" << log(p_cur) << "\t p_new=" << log(p_new) << endl; +//        cerr << "q_cur=" << log(q_cur) << "\t q_new=" << log(q_new) << endl; +        if (!rng->AcceptMetropolisHastings(p_new, p_cur, q_new, q_cur)) { +          ++mh_rejects; +          DecrementDerivation(newderiv, sentf, sente); +          IncrementDerivation(deriv, sentf, sente); +        } else { +//          cerr << "  ACCEPT\n"; +          deriv = newderiv; +        } +      } +    } +  } +} + +int main(int argc, char** argv) { +  po::variables_map conf; +  InitCommandLine(argc, argv, &conf); +  kMAX_TRG_PHRASE = conf["max_trg_phrase"].as<unsigned>(); +  kMAX_SRC_PHRASE = conf["max_src_phrase"].as<unsigned>(); + +  if (!conf.count("model1")) { +    cerr << argv[0] << "Please use --model1 to specify model 1 parameters\n"; +    return 1; +  } +  if (conf.count("random_seed")) +    prng.reset(new MT19937(conf["random_seed"].as<uint32_t>())); +  else +    prng.reset(new MT19937); +//  MT19937& rng = *prng; + +  vector<vector<int> > corpuse, corpusf; +  set<int> vocabe, vocabf; +  corpus::ReadParallelCorpus(conf["input"].as<string>(), &corpusf, &corpuse, &vocabf, &vocabe); +  cerr << "f-Corpus size: " << corpusf.size() << " sentences\n"; +  cerr << "f-Vocabulary size: " << vocabf.size() << " types\n"; +  cerr << "f-Corpus size: " << corpuse.size() << " sentences\n"; +  cerr << "f-Vocabulary size: " << vocabe.size() << " types\n"; +  assert(corpusf.size() == corpuse.size()); + +  Model1 m1(conf["model1"].as<string>()); +  PhraseJointBase lp0(m1, conf["model1_interpolation_weight"].as<double>(), vocabe.size(), vocabf.size()); +  MonotonicParallelSegementationModel m(lp0); + +  ModelAndData<PhraseJointBase> posterior(m, lp0, corpuse, corpusf, vocabe, vocabf); +  posterior.Sample(); + +  return 0; +} + diff --git a/gi/pf/itg.cc b/gi/pf/itg.cc new file mode 100644 index 00000000..ac3c16a3 --- /dev/null +++ b/gi/pf/itg.cc @@ -0,0 +1,213 @@ +#include <iostream> +#include <tr1/memory> +#include <queue> + +#include <boost/functional.hpp> +#include <boost/program_options.hpp> +#include <boost/program_options/variables_map.hpp> + +#include "viterbi.h" +#include "hg.h" +#include "trule.h" +#include "tdict.h" +#include "filelib.h" +#include "dict.h" +#include "sampler.h" +#include "ccrp_nt.h" +#include "ccrp_onetable.h" + +using namespace std; +using namespace tr1; +namespace po = boost::program_options; + +ostream& operator<<(ostream& os, const vector<WordID>& p) { +  os << '['; +  for (int i = 0; i < p.size(); ++i) +    os << (i==0 ? "" : " ") << TD::Convert(p[i]); +  return os << ']'; +} + +double log_poisson(unsigned x, const double& lambda) { +  assert(lambda > 0.0); +  return log(lambda) * x - lgamma(x + 1) - lambda; +} + +struct Model1 { +  explicit Model1(const string& fname) : +      kNULL(TD::Convert("<eps>")), +      kZERO() { +    LoadModel1(fname); +  } + +  void LoadModel1(const string& fname) { +    cerr << "Loading Model 1 parameters from " << fname << " ..." 
<< endl; +    ReadFile rf(fname); +    istream& in = *rf.stream(); +    string line; +    unsigned lc = 0; +    while(getline(in, line)) { +      ++lc; +      int cur = 0; +      int start = 0; +      while(cur < line.size() && line[cur] != ' ') { ++cur; } +      assert(cur != line.size()); +      line[cur] = 0; +      const WordID src = TD::Convert(&line[0]); +      ++cur; +      start = cur; +      while(cur < line.size() && line[cur] != ' ') { ++cur; } +      assert(cur != line.size()); +      line[cur] = 0; +      WordID trg = TD::Convert(&line[start]); +      const double logprob = strtod(&line[cur + 1], NULL); +      if (src >= ttable.size()) ttable.resize(src + 1); +      ttable[src][trg].logeq(logprob); +    } +    cerr << "  read " << lc << " parameters.\n"; +  } + +  // returns prob 0 if src or trg is not found! +  const prob_t& operator()(WordID src, WordID trg) const { +    if (src == 0) src = kNULL; +    if (src < ttable.size()) { +      const map<WordID, prob_t>& cpd = ttable[src]; +      const map<WordID, prob_t>::const_iterator it = cpd.find(trg); +      if (it != cpd.end()) +        return it->second; +    } +    return kZERO; +  } + +  const WordID kNULL; +  const prob_t kZERO; +  vector<map<WordID, prob_t> > ttable; +}; + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { +  po::options_description opts("Configuration options"); +  opts.add_options() +        ("samples,s",po::value<unsigned>()->default_value(1000),"Number of samples") +        ("particles,p",po::value<unsigned>()->default_value(25),"Number of particles") +        ("input,i",po::value<string>(),"Read parallel data from") +        ("max_src_phrase",po::value<unsigned>()->default_value(7),"Maximum length of source language phrases") +        ("max_trg_phrase",po::value<unsigned>()->default_value(7),"Maximum length of target language phrases") +        ("model1,m",po::value<string>(),"Model 1 parameters (used in base distribution)") +        ("inverse_model1,M",po::value<string>(),"Inverse Model 1 parameters (used in backward estimate)") +        ("model1_interpolation_weight",po::value<double>()->default_value(0.95),"Mixing proportion of model 1 with uniform target distribution") +        ("random_seed,S",po::value<uint32_t>(), "Random seed"); +  po::options_description clo("Command line options"); +  clo.add_options() +        ("config", po::value<string>(), "Configuration file") +        ("help,h", "Print this help message and exit"); +  po::options_description dconfig_options, dcmdline_options; +  dconfig_options.add(opts); +  dcmdline_options.add(opts).add(clo); +   +  po::store(parse_command_line(argc, argv, dcmdline_options), *conf); +  if (conf->count("config")) { +    ifstream config((*conf)["config"].as<string>().c_str()); +    po::store(po::parse_config_file(config, dconfig_options), *conf); +  } +  po::notify(*conf); + +  if (conf->count("help") || (conf->count("input") == 0)) { +    cerr << dcmdline_options << endl; +    exit(1); +  } +} + +void ReadParallelCorpus(const string& filename, +                vector<vector<WordID> >* f, +                vector<vector<WordID> >* e, +                set<WordID>* vocab_f, +                set<WordID>* vocab_e) { +  f->clear(); +  e->clear(); +  vocab_f->clear(); +  vocab_e->clear(); +  istream* in; +  if (filename == "-") +    in = &cin; +  else +    in = new ifstream(filename.c_str()); +  assert(*in); +  string line; +  const WordID kDIV = TD::Convert("|||"); +  vector<WordID> tmp; +  while(*in) { +    getline(*in, line); +    if 
(line.empty() && !*in) break; +    e->push_back(vector<int>()); +    f->push_back(vector<int>()); +    vector<int>& le = e->back(); +    vector<int>& lf = f->back(); +    tmp.clear(); +    TD::ConvertSentence(line, &tmp); +    bool isf = true; +    for (unsigned i = 0; i < tmp.size(); ++i) { +      const int cur = tmp[i]; +      if (isf) { +        if (kDIV == cur) { isf = false; } else { +          lf.push_back(cur); +          vocab_f->insert(cur); +        } +      } else { +        assert(cur != kDIV); +        le.push_back(cur); +        vocab_e->insert(cur); +      } +    } +    assert(isf == false); +  } +  if (in != &cin) delete in; +} + +int main(int argc, char** argv) { +  po::variables_map conf; +  InitCommandLine(argc, argv, &conf); +  const size_t kMAX_TRG_PHRASE = conf["max_trg_phrase"].as<unsigned>(); +  const size_t kMAX_SRC_PHRASE = conf["max_src_phrase"].as<unsigned>(); +  const unsigned particles = conf["particles"].as<unsigned>(); +  const unsigned samples = conf["samples"].as<unsigned>(); + +  if (!conf.count("model1")) { +    cerr << argv[0] << "Please use --model1 to specify model 1 parameters\n"; +    return 1; +  } +  shared_ptr<MT19937> prng; +  if (conf.count("random_seed")) +    prng.reset(new MT19937(conf["random_seed"].as<uint32_t>())); +  else +    prng.reset(new MT19937); +  MT19937& rng = *prng; + +  vector<vector<WordID> > corpuse, corpusf; +  set<WordID> vocabe, vocabf; +  cerr << "Reading corpus...\n"; +  ReadParallelCorpus(conf["input"].as<string>(), &corpusf, &corpuse, &vocabf, &vocabe); +  cerr << "F-corpus size: " << corpusf.size() << " sentences\t (" << vocabf.size() << " word types)\n"; +  cerr << "E-corpus size: " << corpuse.size() << " sentences\t (" << vocabe.size() << " word types)\n"; +  assert(corpusf.size() == corpuse.size()); + +  const int kLHS = -TD::Convert("X"); +  Model1 m1(conf["model1"].as<string>()); +  Model1 invm1(conf["inverse_model1"].as<string>()); +  for (int si = 0; si < conf["samples"].as<unsigned>(); ++si) { +    cerr << '.' << flush; +    for (int ci = 0; ci < corpusf.size(); ++ci) { +      const vector<WordID>& src = corpusf[ci]; +      const vector<WordID>& trg = corpuse[ci]; +      for (int i = 0; i < src.size(); ++i) { +        for (int j = 0; j < trg.size(); ++j) { +          const int eff_max_src = min(src.size() - i, kMAX_SRC_PHRASE); +          for (int k = 0; k < eff_max_src; ++k) { +            const int eff_max_trg = (k == 0 ? 
1 : min(trg.size() - j, kMAX_TRG_PHRASE)); +            for (int l = 0; l < eff_max_trg; ++l) { +            } +          } +        } +      } +    } +  } +} + diff --git a/gi/pf/monotonic_pseg.h b/gi/pf/monotonic_pseg.h new file mode 100644 index 00000000..7e6af3fc --- /dev/null +++ b/gi/pf/monotonic_pseg.h @@ -0,0 +1,88 @@ +#ifndef _MONOTONIC_PSEG_H_ +#define _MONOTONIC_PSEG_H_ + +#include <vector> + +#include "prob.h" +#include "ccrp_nt.h" +#include "trule.h" +#include "base_measures.h" + +struct MonotonicParallelSegementationModel { +  explicit MonotonicParallelSegementationModel(PhraseJointBase& rcp0) : +    rp0(rcp0), base(prob_t::One()), rules(1,1), stop(1.0) {} + +  void DecrementRule(const TRule& rule) { +    if (rules.decrement(rule)) +      base /= rp0(rule); +  } + +  void IncrementRule(const TRule& rule) { +    if (rules.increment(rule)) +      base *= rp0(rule); +  } + +  void IncrementRulesAndStops(const std::vector<TRulePtr>& rules) { +    for (int i = 0; i < rules.size(); ++i) +      IncrementRule(*rules[i]); +    if (rules.size()) IncrementContinue(rules.size() - 1); +    IncrementStop(); +  } + +  void DecrementRulesAndStops(const std::vector<TRulePtr>& rules) { +    for (int i = 0; i < rules.size(); ++i) +      DecrementRule(*rules[i]); +    if (rules.size()) { +      DecrementContinue(rules.size() - 1); +      DecrementStop(); +    } +  } + +  prob_t RuleProbability(const TRule& rule) const { +    prob_t p; p.logeq(rules.logprob(rule, log(rp0(rule)))); +    return p; +  } + +  prob_t Likelihood() const { +    prob_t p = base; +    prob_t q; q.logeq(rules.log_crp_prob()); +    p *= q; +    q.logeq(stop.log_crp_prob()); +    p *= q; +    return p; +  } + +  void IncrementStop() { +    stop.increment(true); +  } + +  void IncrementContinue(int n = 1) { +    for (int i = 0; i < n; ++i) +      stop.increment(false); +  } + +  void DecrementStop() { +    stop.decrement(true); +  } + +  void DecrementContinue(int n = 1) { +    for (int i = 0; i < n; ++i) +      stop.decrement(false); +  } + +  prob_t StopProbability() const { +    return prob_t(stop.prob(true, 0.5)); +  } + +  prob_t ContinueProbability() const { +    return prob_t(stop.prob(false, 0.5)); +  } + +  const PhraseJointBase& rp0; +  prob_t base; +  CCRP_NoTable<TRule> rules; +  CCRP_NoTable<bool> stop; +}; + +#endif + diff --git a/gi/pf/pfbrat.cc b/gi/pf/pfbrat.cc new file mode 100644 index 00000000..7b60ef23 --- /dev/null +++ b/gi/pf/pfbrat.cc @@ -0,0 +1,543 @@ +#include <iostream> +#include <tr1/memory> +#include <queue> + +#include <boost/functional.hpp> +#include <boost/multi_array.hpp> +#include <boost/program_options.hpp> +#include <boost/program_options/variables_map.hpp> + +#include "viterbi.h" +#include "hg.h" +#include "trule.h" +#include "tdict.h" +#include "filelib.h" +#include "dict.h" +#include "sampler.h" +#include "ccrp_nt.h" +#include "cfg_wfst_composer.h" + +using namespace std; +using namespace tr1; +namespace po = boost::program_options; + +static unsigned kMAX_SRC_PHRASE; +static unsigned kMAX_TRG_PHRASE; +struct FSTState; + +double log_poisson(unsigned x, const double& lambda) { +  assert(lambda > 0.0); +  return log(lambda) * x - lgamma(x + 1) - lambda; +} + +struct ConditionalBase { +  explicit ConditionalBase(const double m1mixture, const unsigned vocab_e_size, const string& model1fname) : +      kM1MIXTURE(m1mixture), +      kUNIFORM_MIXTURE(1.0 - m1mixture), +      kUNIFORM_TARGET(1.0 / vocab_e_size), +      kNULL(TD::Convert("<eps>")) { +    assert(m1mixture >= 0.0 && m1mixture <= 
1.0); +    assert(vocab_e_size > 0); +    LoadModel1(model1fname); +  } + +  void LoadModel1(const string& fname) { +    cerr << "Loading Model 1 parameters from " << fname << " ..." << endl; +    ReadFile rf(fname); +    istream& in = *rf.stream(); +    string line; +    unsigned lc = 0; +    while(getline(in, line)) { +      ++lc; +      int cur = 0; +      int start = 0; +      while(cur < line.size() && line[cur] != ' ') { ++cur; } +      assert(cur != line.size()); +      line[cur] = 0; +      const WordID src = TD::Convert(&line[0]); +      ++cur; +      start = cur; +      while(cur < line.size() && line[cur] != ' ') { ++cur; } +      assert(cur != line.size()); +      line[cur] = 0; +      WordID trg = TD::Convert(&line[start]); +      const double logprob = strtod(&line[cur + 1], NULL); +      if (src >= ttable.size()) ttable.resize(src + 1); +      ttable[src][trg].logeq(logprob); +    } +    cerr << "  read " << lc << " parameters.\n"; +  } + +  // return logp0 of rule.e_ | rule.f_ +  prob_t operator()(const TRule& rule) const { +    const int flen = rule.f_.size(); +    const int elen = rule.e_.size(); +    prob_t uniform_src_alignment; uniform_src_alignment.logeq(-log(flen + 1)); +    prob_t p; +    p.logeq(log_poisson(elen, flen + 0.01));       // elen | flen          ~Pois(flen + 0.01) +    for (int i = 0; i < elen; ++i) {               // for each position i in e-RHS +      const WordID trg = rule.e_[i]; +      prob_t tp = prob_t::Zero(); +      for (int j = -1; j < flen; ++j) { +        const WordID src = j < 0 ? kNULL : rule.f_[j]; +        const map<WordID, prob_t>::const_iterator it = ttable[src].find(trg); +        if (it != ttable[src].end()) { +          tp += kM1MIXTURE * it->second; +        } +        tp += kUNIFORM_MIXTURE * kUNIFORM_TARGET; +      } +      tp *= uniform_src_alignment;                 //     draw a_i         ~uniform +      p *= tp;                                     //     draw e_i         ~Model1(f_a_i) / uniform +    } +    return p; +  } + +  const prob_t kM1MIXTURE;  // Model 1 mixture component +  const prob_t kUNIFORM_MIXTURE; // uniform mixture component +  const prob_t kUNIFORM_TARGET; +  const WordID kNULL; +  vector<map<WordID, prob_t> > ttable; +}; + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { +  po::options_description opts("Configuration options"); +  opts.add_options() +        ("samples,s",po::value<unsigned>()->default_value(1000),"Number of samples") +        ("input,i",po::value<string>(),"Read parallel data from") +        ("max_src_phrase",po::value<unsigned>()->default_value(3),"Maximum length of source language phrases") +        ("max_trg_phrase",po::value<unsigned>()->default_value(3),"Maximum length of target language phrases") +        ("model1,m",po::value<string>(),"Model 1 parameters (used in base distribution)") +        ("model1_interpolation_weight",po::value<double>()->default_value(0.95),"Mixing proportion of model 1 with uniform target distribution") +        ("random_seed,S",po::value<uint32_t>(), "Random seed"); +  po::options_description clo("Command line options"); +  clo.add_options() +        ("config", po::value<string>(), "Configuration file") +        ("help,h", "Print this help message and exit"); +  po::options_description dconfig_options, dcmdline_options; +  dconfig_options.add(opts); +  dcmdline_options.add(opts).add(clo); +   +  po::store(parse_command_line(argc, argv, dcmdline_options), *conf); +  if (conf->count("config")) { +    ifstream 
config((*conf)["config"].as<string>().c_str()); +    po::store(po::parse_config_file(config, dconfig_options), *conf); +  } +  po::notify(*conf); + +  if (conf->count("help") || (conf->count("input") == 0)) { +    cerr << dcmdline_options << endl; +    exit(1); +  } +} + +void ReadParallelCorpus(const string& filename, +                vector<vector<WordID> >* f, +                vector<vector<int> >* e, +                set<int>* vocab_f, +                set<int>* vocab_e) { +  f->clear(); +  e->clear(); +  vocab_f->clear(); +  vocab_e->clear(); +  istream* in; +  if (filename == "-") +    in = &cin; +  else +    in = new ifstream(filename.c_str()); +  assert(*in); +  string line; +  const WordID kDIV = TD::Convert("|||"); +  vector<WordID> tmp; +  while(*in) { +    getline(*in, line); +    if (line.empty() && !*in) break; +    e->push_back(vector<int>()); +    f->push_back(vector<int>()); +    vector<int>& le = e->back(); +    vector<int>& lf = f->back(); +    tmp.clear(); +    TD::ConvertSentence(line, &tmp); +    bool isf = true; +    for (unsigned i = 0; i < tmp.size(); ++i) { +      const int cur = tmp[i]; +      if (isf) { +        if (kDIV == cur) { isf = false; } else { +          lf.push_back(cur); +          vocab_f->insert(cur); +        } +      } else { +        assert(cur != kDIV); +        le.push_back(cur); +        vocab_e->insert(cur); +      } +    } +    assert(isf == false); +  } +  if (in != &cin) delete in; +} + +struct UniphraseLM { +  UniphraseLM(const vector<vector<int> >& corpus, +              const set<int>& vocab, +              const po::variables_map& conf) : +    phrases_(1,1), +    gen_(1,1), +    corpus_(corpus), +    uniform_word_(1.0 / vocab.size()), +    gen_p0_(0.5), +    p_end_(0.5), +    use_poisson_(conf.count("poisson_length") > 0) {} + +  void ResampleHyperparameters(MT19937* rng) { +    phrases_.resample_hyperparameters(rng); +    gen_.resample_hyperparameters(rng); +    cerr << " " << phrases_.concentration(); +  } + +  CCRP_NoTable<vector<int> > phrases_; +  CCRP_NoTable<bool> gen_; +  vector<vector<bool> > z_;   // z_[i] is there a phrase boundary after the ith word +  const vector<vector<int> >& corpus_; +  const double uniform_word_; +  const double gen_p0_; +  const double p_end_; // in base length distribution, p of the end of a phrase +  const bool use_poisson_; +}; + +struct Reachability { +  boost::multi_array<bool, 4> edges;  // edges[src_covered][trg_covered][x][trg_delta] is this edge worth exploring? 
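+  // Worked example (illustration only, derived from ComputeReachability below): the
+  // third index is the source-side span length and the fourth the target-side span
+  // length. With srclen=2, trglen=2 and maximum phrase lengths of 1 on both sides,
+  // the only reachable derivation consumes one source and one target word per step,
+  // so construction would leave
+  //   edges[0][0][1][1] == true    // cover (0,0) -> (1,1) with a 1x1 phrase pair
+  //   edges[1][1][1][1] == true    // cover (1,1) -> (2,2)
+  //   max_src_delta[0][0] == 1
+  // while edges[0][0][1][0] and edges[0][0][0][1] stay false, since every phrase
+  // pair must consume at least one word on each side (see the asserts in
+  // ComputeReachability).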
+  boost::multi_array<short, 2> max_src_delta; // msd[src_covered][trg_covered] -- the largest src delta that's valid + +  Reachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len) : +      edges(boost::extents[srclen][trglen][src_max_phrase_len+1][trg_max_phrase_len+1]), +      max_src_delta(boost::extents[srclen][trglen]) { +    ComputeReachability(srclen, trglen, src_max_phrase_len, trg_max_phrase_len); +  } + + private: +  struct SState { +    SState() : prev_src_covered(), prev_trg_covered() {} +    SState(int i, int j) : prev_src_covered(i), prev_trg_covered(j) {} +    int prev_src_covered; +    int prev_trg_covered; +  }; + +  struct NState { +    NState() : next_src_covered(), next_trg_covered() {} +    NState(int i, int j) : next_src_covered(i), next_trg_covered(j) {} +    int next_src_covered; +    int next_trg_covered; +  }; + +  void ComputeReachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len) { +    typedef boost::multi_array<vector<SState>, 2> array_type; +    array_type a(boost::extents[srclen + 1][trglen + 1]); +    a[0][0].push_back(SState()); +    for (int i = 0; i < srclen; ++i) { +      for (int j = 0; j < trglen; ++j) { +        if (a[i][j].size() == 0) continue; +        const SState prev(i,j); +        for (int k = 1; k <= src_max_phrase_len; ++k) { +          if ((i + k) > srclen) continue; +          for (int l = 1; l <= trg_max_phrase_len; ++l) { +            if ((j + l) > trglen) continue; +            a[i + k][j + l].push_back(prev); +          } +        } +      } +    } +    a[0][0].clear(); +    cerr << "Final cell contains " << a[srclen][trglen].size() << " back pointers\n"; +    assert(a[srclen][trglen].size() > 0); + +    typedef boost::multi_array<bool, 2> rarray_type; +    rarray_type r(boost::extents[srclen + 1][trglen + 1]); +//    typedef boost::multi_array<vector<NState>, 2> narray_type; +//    narray_type b(boost::extents[srclen + 1][trglen + 1]); +    r[srclen][trglen] = true; +    for (int i = srclen; i >= 0; --i) { +      for (int j = trglen; j >= 0; --j) { +        vector<SState>& prevs = a[i][j]; +        if (!r[i][j]) { prevs.clear(); } +//        const NState nstate(i,j); +        for (int k = 0; k < prevs.size(); ++k) { +          r[prevs[k].prev_src_covered][prevs[k].prev_trg_covered] = true; +          int src_delta = i - prevs[k].prev_src_covered; +          edges[prevs[k].prev_src_covered][prevs[k].prev_trg_covered][src_delta][j - prevs[k].prev_trg_covered] = true; +          short &msd = max_src_delta[prevs[k].prev_src_covered][prevs[k].prev_trg_covered]; +          if (src_delta > msd) msd = src_delta; +//          b[prevs[k].prev_src_covered][prevs[k].prev_trg_covered].push_back(nstate); +        } +      } +    } +    assert(!edges[0][0][1][0]); +    assert(!edges[0][0][0][1]); +    assert(!edges[0][0][0][0]); +    cerr << "  MAX SRC DELTA[0][0] = " << max_src_delta[0][0] << endl; +    assert(max_src_delta[0][0] > 0); +    //cerr << "First cell contains " << b[0][0].size() << " forward pointers\n"; +    //for (int i = 0; i < b[0][0].size(); ++i) { +    //  cerr << "  -> (" << b[0][0][i].next_src_covered << "," << b[0][0][i].next_trg_covered << ")\n"; +    //} +  } +}; + +ostream& operator<<(ostream& os, const FSTState& q); +struct FSTState { +  explicit FSTState(int src_size) : +      trg_covered_(), +      src_covered_(), +      src_coverage_(src_size) {} + +  FSTState(short trg_covered, short src_covered, const vector<bool>& src_coverage, const vector<short>& src_prefix) : 
+      trg_covered_(trg_covered), +      src_covered_(src_covered), +      src_coverage_(src_coverage), +      src_prefix_(src_prefix) { +    if (src_coverage_.size() == src_covered) { +      assert(src_prefix.size() == 0); +    } +  } + +  // if we extend by the word at src_position, what are +  // the next states that are reachable and lie on a valid +  // path to the final state? +  vector<FSTState> Extensions(int src_position, int src_len, int trg_len, const Reachability& r) const { +    assert(src_position < src_coverage_.size()); +    if (src_coverage_[src_position]) { +      cerr << "Trying to extend " << *this << " with position " << src_position << endl; +      abort(); +    } +    vector<bool> ncvg = src_coverage_; +    ncvg[src_position] = true; + +    vector<FSTState> res; +    const int trg_remaining = trg_len - trg_covered_; +    if (trg_remaining <= 0) { +      cerr << "Target appears to have been covered: " << *this << " (trg_len=" << trg_len << ",trg_covered=" << trg_covered_ << ")" << endl; +      abort(); +    } +    const int src_remaining = src_len - src_covered_; +    if (src_remaining <= 0) { +      cerr << "Source appears to have been covered: " << *this << endl; +      abort(); +    } + +    for (int tc = 1; tc <= kMAX_TRG_PHRASE; ++tc) { +      if (r.edges[src_covered_][trg_covered_][src_prefix_.size() + 1][tc]) { +        int nc = src_prefix_.size() + 1 + src_covered_; +        res.push_back(FSTState(trg_covered_ + tc, nc, ncvg, vector<short>())); +      } +    } + +    if ((src_prefix_.size() + 1) < r.max_src_delta[src_covered_][trg_covered_]) { +      vector<short> nsp = src_prefix_; +      nsp.push_back(src_position); +      res.push_back(FSTState(trg_covered_, src_covered_, ncvg, nsp)); +    } + +    if (res.size() == 0) { +      cerr << *this << " can't be extended!\n"; +      abort(); +    } +    return res; +  } + +  short trg_covered_, src_covered_; +  vector<bool> src_coverage_; +  vector<short> src_prefix_; +}; +bool operator<(const FSTState& q, const FSTState& r) { +  if (q.trg_covered_ != r.trg_covered_) return q.trg_covered_ < r.trg_covered_; +  if (q.src_covered_!= r.src_covered_) return q.src_covered_ < r.src_covered_; +  if (q.src_coverage_ != r.src_coverage_) return q.src_coverage_ < r.src_coverage_; +  return q.src_prefix_ < r.src_prefix_; +} + +ostream& operator<<(ostream& os, const FSTState& q) { +  os << "[" << q.trg_covered_ << " : "; +  for (int i = 0; i < q.src_coverage_.size(); ++i) +    os << q.src_coverage_[i]; +  os << " : <"; +  for (int i = 0; i < q.src_prefix_.size(); ++i) { +    if (i != 0) os << ' '; +    os << q.src_prefix_[i]; +  } +  return os << ">]"; +} + +struct MyModel { +  MyModel(ConditionalBase& rcp0) : rp0(rcp0) {} +  typedef unordered_map<vector<WordID>, CCRP_NoTable<TRule>, boost::hash<vector<WordID> > > SrcToRuleCRPMap; + +  void DecrementRule(const TRule& rule) { +    SrcToRuleCRPMap::iterator it = rules.find(rule.f_); +    assert(it != rules.end()); +    it->second.decrement(rule); +    if (it->second.num_customers() == 0) rules.erase(it); +  } + +  void IncrementRule(const TRule& rule) { +    SrcToRuleCRPMap::iterator it = rules.find(rule.f_); +    if (it == rules.end()) { +      CCRP_NoTable<TRule> crp(1,1); +      it = rules.insert(make_pair(rule.f_, crp)).first; +    } +    it->second.increment(rule); +  } + +  // conditioned on rule.f_ +  prob_t RuleConditionalProbability(const TRule& rule) const { +    const prob_t base = rp0(rule); +    SrcToRuleCRPMap::const_iterator it = rules.find(rule.f_); +    if (it == 
rules.end()) { +      return base; +    } else { +      const double lp = it->second.logprob(rule, log(base)); +      prob_t q; q.logeq(lp); +      return q; +    } +  } + +  const ConditionalBase& rp0; +  SrcToRuleCRPMap rules; +}; + +struct MyFST : public WFST { +  MyFST(const vector<WordID>& ssrc, const vector<WordID>& strg, MyModel* m) : +      src(ssrc), trg(strg), +      r(src.size(),trg.size(),kMAX_SRC_PHRASE, kMAX_TRG_PHRASE), +      model(m) { +    FSTState in(src.size()); +    cerr << " INIT: " << in << endl; +    init = GetNode(in); +    for (int i = 0; i < in.src_coverage_.size(); ++i) in.src_coverage_[i] = true; +    in.src_covered_ = src.size(); +    in.trg_covered_ = trg.size(); +    cerr << "FINAL: " << in << endl; +    final = GetNode(in); +  } +  virtual const WFSTNode* Final() const; +  virtual const WFSTNode* Initial() const; + +  const WFSTNode* GetNode(const FSTState& q); +  map<FSTState, boost::shared_ptr<WFSTNode> > m; +  const vector<WordID>& src; +  const vector<WordID>& trg; +  Reachability r; +  const WFSTNode* init; +  const WFSTNode* final; +  MyModel* model; +}; + +struct MyNode : public WFSTNode { +  MyNode(const FSTState& q, MyFST* fst) : state(q), container(fst) {} +  virtual vector<pair<const WFSTNode*, TRulePtr> > ExtendInput(unsigned srcindex) const; +  const FSTState state; +  mutable MyFST* container; +}; + +vector<pair<const WFSTNode*, TRulePtr> > MyNode::ExtendInput(unsigned srcindex) const { +  cerr << "EXTEND " << state << " with " << srcindex << endl; +  vector<FSTState> ext = state.Extensions(srcindex, container->src.size(), container->trg.size(), container->r); +  vector<pair<const WFSTNode*,TRulePtr> > res(ext.size()); +  for (unsigned i = 0; i < ext.size(); ++i) { +    res[i].first = container->GetNode(ext[i]); +    if (ext[i].src_prefix_.size() == 0) { +      const unsigned trg_from = state.trg_covered_; +      const unsigned trg_to = ext[i].trg_covered_; +      const unsigned prev_prfx_size = state.src_prefix_.size(); +      res[i].second.reset(new TRule); +      res[i].second->lhs_ = -TD::Convert("X"); +      vector<WordID>& src = res[i].second->f_; +      vector<WordID>& trg = res[i].second->e_; +      src.resize(prev_prfx_size + 1); +      for (unsigned j = 0; j < prev_prfx_size; ++j) +        src[j] = container->src[state.src_prefix_[j]]; +      src[prev_prfx_size] = container->src[srcindex]; +      for (unsigned j = trg_from; j < trg_to; ++j) +        trg.push_back(container->trg[j]); +      res[i].second->scores_.set_value(FD::Convert("Proposal"), log(container->model->RuleConditionalProbability(*res[i].second))); +    } +  } +  return res; +} + +const WFSTNode* MyFST::GetNode(const FSTState& q) { +  boost::shared_ptr<WFSTNode>& res = m[q]; +  if (!res) { +    res.reset(new MyNode(q, this)); +  } +  return &*res; +} + +const WFSTNode* MyFST::Final() const { +  return final; +} + +const WFSTNode* MyFST::Initial() const { +  return init; +} + +int main(int argc, char** argv) { +  po::variables_map conf; +  InitCommandLine(argc, argv, &conf); +  kMAX_TRG_PHRASE = conf["max_trg_phrase"].as<unsigned>(); +  kMAX_SRC_PHRASE = conf["max_src_phrase"].as<unsigned>(); + +  if (!conf.count("model1")) { +    cerr << argv[0] << "Please use --model1 to specify model 1 parameters\n"; +    return 1; +  } +  shared_ptr<MT19937> prng; +  if (conf.count("random_seed")) +    prng.reset(new MT19937(conf["random_seed"].as<uint32_t>())); +  else +    prng.reset(new MT19937); +  MT19937& rng = *prng; + +  vector<vector<int> > corpuse, corpusf; +  set<int> 
vocabe, vocabf; +  ReadParallelCorpus(conf["input"].as<string>(), &corpusf, &corpuse, &vocabf, &vocabe); +  cerr << "f-Corpus size: " << corpusf.size() << " sentences\n"; +  cerr << "f-Vocabulary size: " << vocabf.size() << " types\n"; +  cerr << "f-Corpus size: " << corpuse.size() << " sentences\n"; +  cerr << "f-Vocabulary size: " << vocabe.size() << " types\n"; +  assert(corpusf.size() == corpuse.size()); + +  ConditionalBase lp0(conf["model1_interpolation_weight"].as<double>(), +                      vocabe.size(), +                      conf["model1"].as<string>()); +  MyModel m(lp0); + +  TRule x("[X] ||| kAnwntR myN ||| at the convent ||| 0"); +  m.IncrementRule(x); +  TRule y("[X] ||| nY dyN ||| gave ||| 0"); +  m.IncrementRule(y); + + +  MyFST fst(corpusf[0], corpuse[0], &m); +  ifstream in("./kimura.g"); +  assert(in); +  CFG_WFSTComposer comp(fst); +  Hypergraph hg; +  bool succeed = comp.Compose(&in, &hg); +  hg.PrintGraphviz(); +  if (succeed) { cerr << "SUCCESS.\n"; } else { cerr << "FAILURE REPORTED.\n"; } + +#if 0 +  ifstream in2("./amnabooks.g"); +  assert(in2); +  MyFST fst2(corpusf[1], corpuse[1], &m); +  CFG_WFSTComposer comp2(fst2); +  Hypergraph hg2; +  bool succeed2 = comp2.Compose(&in2, &hg2); +  if (succeed2) { cerr << "SUCCESS.\n"; } else { cerr << "FAILURE REPORTED.\n"; } +#endif + +  SparseVector<double> w; w.set_value(FD::Convert("Proposal"), 1.0); +  hg.Reweight(w); +  cerr << ViterbiFTree(hg) << endl; +  return 0; +} + diff --git a/gi/pf/pfdist.cc b/gi/pf/pfdist.cc new file mode 100644 index 00000000..81abd61b --- /dev/null +++ b/gi/pf/pfdist.cc @@ -0,0 +1,610 @@ +#include <iostream> +#include <tr1/memory> +#include <queue> + +#include <boost/functional.hpp> +#include <boost/program_options.hpp> +#include <boost/program_options/variables_map.hpp> + +#include "base_measures.h" +#include "reachability.h" +#include "viterbi.h" +#include "hg.h" +#include "trule.h" +#include "tdict.h" +#include "filelib.h" +#include "dict.h" +#include "sampler.h" +#include "ccrp_nt.h" +#include "ccrp_onetable.h" + +using namespace std; +using namespace tr1; +namespace po = boost::program_options; + +shared_ptr<MT19937> prng; + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { +  po::options_description opts("Configuration options"); +  opts.add_options() +        ("samples,s",po::value<unsigned>()->default_value(1000),"Number of samples") +        ("particles,p",po::value<unsigned>()->default_value(30),"Number of particles") +        ("filter_frequency,f",po::value<unsigned>()->default_value(5),"Number of time steps between filterings") +        ("input,i",po::value<string>(),"Read parallel data from") +        ("max_src_phrase",po::value<unsigned>()->default_value(5),"Maximum length of source language phrases") +        ("max_trg_phrase",po::value<unsigned>()->default_value(5),"Maximum length of target language phrases") +        ("model1,m",po::value<string>(),"Model 1 parameters (used in base distribution)") +        ("inverse_model1,M",po::value<string>(),"Inverse Model 1 parameters (used in backward estimate)") +        ("model1_interpolation_weight",po::value<double>()->default_value(0.95),"Mixing proportion of model 1 with uniform target distribution") +        ("random_seed,S",po::value<uint32_t>(), "Random seed"); +  po::options_description clo("Command line options"); +  clo.add_options() +        ("config", po::value<string>(), "Configuration file") +        ("help,h", "Print this help message and exit"); +  po::options_description dconfig_options, 
dcmdline_options; +  dconfig_options.add(opts); +  dcmdline_options.add(opts).add(clo); +   +  po::store(parse_command_line(argc, argv, dcmdline_options), *conf); +  if (conf->count("config")) { +    ifstream config((*conf)["config"].as<string>().c_str()); +    po::store(po::parse_config_file(config, dconfig_options), *conf); +  } +  po::notify(*conf); + +  if (conf->count("help") || (conf->count("input") == 0)) { +    cerr << dcmdline_options << endl; +    exit(1); +  } +} + +void ReadParallelCorpus(const string& filename, +                vector<vector<WordID> >* f, +                vector<vector<WordID> >* e, +                set<WordID>* vocab_f, +                set<WordID>* vocab_e) { +  f->clear(); +  e->clear(); +  vocab_f->clear(); +  vocab_e->clear(); +  istream* in; +  if (filename == "-") +    in = &cin; +  else +    in = new ifstream(filename.c_str()); +  assert(*in); +  string line; +  const WordID kDIV = TD::Convert("|||"); +  vector<WordID> tmp; +  while(*in) { +    getline(*in, line); +    if (line.empty() && !*in) break; +    e->push_back(vector<int>()); +    f->push_back(vector<int>()); +    vector<int>& le = e->back(); +    vector<int>& lf = f->back(); +    tmp.clear(); +    TD::ConvertSentence(line, &tmp); +    bool isf = true; +    for (unsigned i = 0; i < tmp.size(); ++i) { +      const int cur = tmp[i]; +      if (isf) { +        if (kDIV == cur) { isf = false; } else { +          lf.push_back(cur); +          vocab_f->insert(cur); +        } +      } else { +        assert(cur != kDIV); +        le.push_back(cur); +        vocab_e->insert(cur); +      } +    } +    assert(isf == false); +  } +  if (in != &cin) delete in; +} + +#if 0 +struct MyConditionalModel { +  MyConditionalModel(PhraseConditionalBase& rcp0) : rp0(&rcp0), base(prob_t::One()), src_phrases(1,1), src_jumps(200, CCRP_NoTable<int>(1,1)) {} + +  prob_t srcp0(const vector<WordID>& src) const { +    prob_t p(1.0 / 3000.0); +    p.poweq(src.size()); +    prob_t lenp; lenp.logeq(log_poisson(src.size(), 1.0)); +    p *= lenp; +    return p; +  } + +  void DecrementRule(const TRule& rule) { +    const RuleCRPMap::iterator it = rules.find(rule.f_); +    assert(it != rules.end()); +    if (it->second.decrement(rule)) { +      base /= (*rp0)(rule); +      if (it->second.num_customers() == 0) +        rules.erase(it); +    } +    if (src_phrases.decrement(rule.f_)) +      base /= srcp0(rule.f_); +  } + +  void IncrementRule(const TRule& rule) { +    RuleCRPMap::iterator it = rules.find(rule.f_); +    if (it == rules.end()) +      it = rules.insert(make_pair(rule.f_, CCRP_NoTable<TRule>(1,1))).first; +    if (it->second.increment(rule)) { +      base *= (*rp0)(rule); +    } +    if (src_phrases.increment(rule.f_)) +      base *= srcp0(rule.f_); +  } + +  void IncrementRules(const vector<TRulePtr>& rules) { +    for (int i = 0; i < rules.size(); ++i) +      IncrementRule(*rules[i]); +  } + +  void DecrementRules(const vector<TRulePtr>& rules) { +    for (int i = 0; i < rules.size(); ++i) +      DecrementRule(*rules[i]); +  } + +  void IncrementJump(int dist, unsigned src_len) { +    assert(src_len > 0); +    if (src_jumps[src_len].increment(dist)) +      base *= jp0(dist, src_len); +  } + +  void DecrementJump(int dist, unsigned src_len) { +    assert(src_len > 0); +    if (src_jumps[src_len].decrement(dist)) +      base /= jp0(dist, src_len); +  } + +  void IncrementJumps(const vector<int>& js, unsigned src_len) { +    for (unsigned i = 0; i < js.size(); ++i) +      IncrementJump(js[i], src_len); +  } + +  void 
DecrementJumps(const vector<int>& js, unsigned src_len) { +    for (unsigned i = 0; i < js.size(); ++i) +      DecrementJump(js[i], src_len); +  } + +  // p(jump = dist | src_len , z) +  prob_t JumpProbability(int dist, unsigned src_len) { +    const prob_t p0 = jp0(dist, src_len); +    const double lp = src_jumps[src_len].logprob(dist, log(p0)); +    prob_t q; q.logeq(lp); +    return q; +  } + +  // p(rule.f_ | z) * p(rule.e_ | rule.f_ , z) +  prob_t RuleProbability(const TRule& rule) const { +    const prob_t p0 = (*rp0)(rule); +    prob_t srcp; srcp.logeq(src_phrases.logprob(rule.f_, log(srcp0(rule.f_)))); +    const RuleCRPMap::const_iterator it = rules.find(rule.f_); +    if (it == rules.end()) return srcp * p0; +    const double lp = it->second.logprob(rule, log(p0)); +    prob_t q; q.logeq(lp); +    return q * srcp; +  } + +  prob_t Likelihood() const { +    prob_t p = base; +    for (RuleCRPMap::const_iterator it = rules.begin(); +         it != rules.end(); ++it) { +      prob_t cl; cl.logeq(it->second.log_crp_prob()); +      p *= cl; +    } +    for (unsigned l = 1; l < src_jumps.size(); ++l) { +      if (src_jumps[l].num_customers() > 0) { +        prob_t q; +        q.logeq(src_jumps[l].log_crp_prob()); +        p *= q; +      } +    } +    return p; +  } + +  JumpBase jp0; +  const PhraseConditionalBase* rp0; +  prob_t base; +  typedef unordered_map<vector<WordID>, CCRP_NoTable<TRule>, boost::hash<vector<WordID> > > RuleCRPMap; +  RuleCRPMap rules; +  CCRP_NoTable<vector<WordID> > src_phrases; +  vector<CCRP_NoTable<int> > src_jumps; +}; + +#endif + +struct MyJointModel { +  MyJointModel(PhraseJointBase& rcp0) : +    rp0(rcp0), base(prob_t::One()), rules(1,1), src_jumps(200, CCRP_NoTable<int>(1,1)) {} + +  void DecrementRule(const TRule& rule) { +    if (rules.decrement(rule)) +      base /= rp0(rule); +  } + +  void IncrementRule(const TRule& rule) { +    if (rules.increment(rule)) +      base *= rp0(rule); +  } + +  void IncrementRules(const vector<TRulePtr>& rules) { +    for (int i = 0; i < rules.size(); ++i) +      IncrementRule(*rules[i]); +  } + +  void DecrementRules(const vector<TRulePtr>& rules) { +    for (int i = 0; i < rules.size(); ++i) +      DecrementRule(*rules[i]); +  } + +  void IncrementJump(int dist, unsigned src_len) { +    assert(src_len > 0); +    if (src_jumps[src_len].increment(dist)) +      base *= jp0(dist, src_len); +  } + +  void DecrementJump(int dist, unsigned src_len) { +    assert(src_len > 0); +    if (src_jumps[src_len].decrement(dist)) +      base /= jp0(dist, src_len); +  } + +  void IncrementJumps(const vector<int>& js, unsigned src_len) { +    for (unsigned i = 0; i < js.size(); ++i) +      IncrementJump(js[i], src_len); +  } + +  void DecrementJumps(const vector<int>& js, unsigned src_len) { +    for (unsigned i = 0; i < js.size(); ++i) +      DecrementJump(js[i], src_len); +  } + +  // p(jump = dist | src_len , z) +  prob_t JumpProbability(int dist, unsigned src_len) { +    const prob_t p0 = jp0(dist, src_len); +    const double lp = src_jumps[src_len].logprob(dist, log(p0)); +    prob_t q; q.logeq(lp); +    return q; +  } + +  // p(rule.f_ | z) * p(rule.e_ | rule.f_ , z) +  prob_t RuleProbability(const TRule& rule) const { +    prob_t p; p.logeq(rules.logprob(rule, log(rp0(rule)))); +    return p; +  } + +  prob_t Likelihood() const { +    prob_t p = base; +    prob_t q; q.logeq(rules.log_crp_prob()); +    p *= q; +    for (unsigned l = 1; l < src_jumps.size(); ++l) { +      if (src_jumps[l].num_customers() > 0) { +        prob_t q; 
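+        // multiply in the seating-arrangement probability of the jump CRP for this source
+        // length (log_crp_prob() is taken here to return that log probability); combined with
+        // the rule CRP and the cached base term above, this yields the joint likelihood
+        // reported as LLH in main().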
+        q.logeq(src_jumps[l].log_crp_prob()); +        p *= q; +      } +    } +    return p; +  } + +  JumpBase jp0; +  const PhraseJointBase& rp0; +  prob_t base; +  CCRP_NoTable<TRule> rules; +  vector<CCRP_NoTable<int> > src_jumps; +}; + +struct BackwardEstimate { +  BackwardEstimate(const Model1& m1, const vector<WordID>& src, const vector<WordID>& trg) : +      model1_(m1), src_(src), trg_(trg) { +  } +  const prob_t& operator()(const vector<bool>& src_cov, unsigned trg_cov) const { +    assert(src_.size() == src_cov.size()); +    assert(trg_cov <= trg_.size()); +    prob_t& e = cache_[src_cov][trg_cov]; +    if (e.is_0()) { +      if (trg_cov == trg_.size()) { e = prob_t::One(); return e; } +      vector<WordID> r(src_.size() + 1); r.clear(); +      r.push_back(0);  // NULL word +      for (int i = 0; i < src_cov.size(); ++i) +        if (!src_cov[i]) r.push_back(src_[i]); +      const prob_t uniform_alignment(1.0 / r.size()); +      e.logeq(log_poisson(trg_.size() - trg_cov, r.size() - 1)); // p(trg len remaining | src len remaining) +      for (unsigned j = trg_cov; j < trg_.size(); ++j) { +        prob_t p; +        for (unsigned i = 0; i < r.size(); ++i) +          p += model1_(r[i], trg_[j]); +        if (p.is_0()) { +          cerr << "ERROR: p(" << TD::Convert(trg_[j]) << " | " << TD::GetString(r) << ") = 0!\n"; +          abort(); +        } +        p *= uniform_alignment; +        e *= p; +      } +    } +    return e; +  } +  const Model1& model1_; +  const vector<WordID>& src_; +  const vector<WordID>& trg_; +  mutable unordered_map<vector<bool>, map<unsigned, prob_t>, boost::hash<vector<bool> > > cache_; +}; + +struct BackwardEstimateSym { +  BackwardEstimateSym(const Model1& m1, +                      const Model1& invm1, const vector<WordID>& src, const vector<WordID>& trg) : +      model1_(m1), invmodel1_(invm1), src_(src), trg_(trg) { +  } +  const prob_t& operator()(const vector<bool>& src_cov, unsigned trg_cov) const { +    assert(src_.size() == src_cov.size()); +    assert(trg_cov <= trg_.size()); +    prob_t& e = cache_[src_cov][trg_cov]; +    if (e.is_0()) { +      if (trg_cov == trg_.size()) { e = prob_t::One(); return e; } +      vector<WordID> r(src_.size() + 1); r.clear(); +      for (int i = 0; i < src_cov.size(); ++i) +        if (!src_cov[i]) r.push_back(src_[i]); +      r.push_back(0);  // NULL word +      const prob_t uniform_alignment(1.0 / r.size()); +      e.logeq(log_poisson(trg_.size() - trg_cov, r.size() - 1)); // p(trg len remaining | src len remaining) +      for (unsigned j = trg_cov; j < trg_.size(); ++j) { +        prob_t p; +        for (unsigned i = 0; i < r.size(); ++i) +          p += model1_(r[i], trg_[j]); +        if (p.is_0()) { +          cerr << "ERROR: p(" << TD::Convert(trg_[j]) << " | " << TD::GetString(r) << ") = 0!\n"; +          abort(); +        } +        p *= uniform_alignment; +        e *= p; +      } +      r.pop_back(); +      const prob_t inv_uniform(1.0 / (trg_.size() - trg_cov + 1.0)); +      prob_t inv; +      inv.logeq(log_poisson(r.size(), trg_.size() - trg_cov)); +      for (unsigned i = 0; i < r.size(); ++i) { +        prob_t p; +        for (unsigned j = trg_cov - 1; j < trg_.size(); ++j) +          p += invmodel1_(j < trg_cov ? 
0 : trg_[j], r[i]); +        if (p.is_0()) { +          cerr << "ERROR: p_inv(" << TD::Convert(r[i]) << " | " << TD::GetString(trg_) << ") = 0!\n"; +          abort(); +        } +        p *= inv_uniform; +        inv *= p; +      } +      prob_t x = pow(e * inv, 0.5); +      e = x; +      //cerr << "Forward: " << log(e) << "\tBackward: " << log(inv) << "\t prop: " << log(x) << endl; +    } +    return e; +  } +  const Model1& model1_; +  const Model1& invmodel1_; +  const vector<WordID>& src_; +  const vector<WordID>& trg_; +  mutable unordered_map<vector<bool>, map<unsigned, prob_t>, boost::hash<vector<bool> > > cache_; +}; + +struct Particle { +  Particle() : weight(prob_t::One()), src_cov(), trg_cov(), prev_pos(-1) {} +  prob_t weight; +  prob_t gamma_last; +  vector<int> src_jumps; +  vector<TRulePtr> rules; +  vector<bool> src_cv; +  int src_cov; +  int trg_cov; +  int prev_pos; +}; + +ostream& operator<<(ostream& o, const vector<bool>& v) { +  for (int i = 0; i < v.size(); ++i) +    o << (v[i] ? '1' : '0'); +  return o; +} +ostream& operator<<(ostream& o, const Particle& p) { +  o << "[cv=" << p.src_cv << "  src_cov=" << p.src_cov << " trg_cov=" << p.trg_cov << " last_pos=" << p.prev_pos << " num_rules=" << p.rules.size() << "  w=" << log(p.weight) << ']'; +  return o; +} + +void FilterCrapParticlesAndReweight(vector<Particle>* pps) { +  vector<Particle>& ps = *pps; +  SampleSet<prob_t> ss; +  for (int i = 0; i < ps.size(); ++i) +    ss.add(ps[i].weight); +  vector<Particle> nps; nps.reserve(ps.size()); +  const prob_t uniform_weight(1.0 / ps.size()); +  for (int i = 0; i < ps.size(); ++i) { +    nps.push_back(ps[prng->SelectSample(ss)]); +    nps[i].weight = uniform_weight; +  } +  nps.swap(ps); +} + +int main(int argc, char** argv) { +  po::variables_map conf; +  InitCommandLine(argc, argv, &conf); +  const unsigned kMAX_TRG_PHRASE = conf["max_trg_phrase"].as<unsigned>(); +  const unsigned kMAX_SRC_PHRASE = conf["max_src_phrase"].as<unsigned>(); +  const unsigned particles = conf["particles"].as<unsigned>(); +  const unsigned samples = conf["samples"].as<unsigned>(); +  const unsigned rejuv_freq = conf["filter_frequency"].as<unsigned>(); + +  if (!conf.count("model1")) { +    cerr << argv[0] << "Please use --model1 to specify model 1 parameters\n"; +    return 1; +  } +  if (conf.count("random_seed")) +    prng.reset(new MT19937(conf["random_seed"].as<uint32_t>())); +  else +    prng.reset(new MT19937); +  MT19937& rng = *prng; + +  vector<vector<WordID> > corpuse, corpusf; +  set<WordID> vocabe, vocabf; +  cerr << "Reading corpus...\n"; +  ReadParallelCorpus(conf["input"].as<string>(), &corpusf, &corpuse, &vocabf, &vocabe); +  cerr << "F-corpus size: " << corpusf.size() << " sentences\t (" << vocabf.size() << " word types)\n"; +  cerr << "E-corpus size: " << corpuse.size() << " sentences\t (" << vocabe.size() << " word types)\n"; +  assert(corpusf.size() == corpuse.size()); + +  const int kLHS = -TD::Convert("X"); +  Model1 m1(conf["model1"].as<string>()); +  Model1 invm1(conf["inverse_model1"].as<string>()); + +#if 0 +  PhraseConditionalBase lp0(m1, conf["model1_interpolation_weight"].as<double>(), vocabe.size()); +  MyConditionalModel m(lp0); +#else +  PhraseJointBase lp0(m1, conf["model1_interpolation_weight"].as<double>(), vocabe.size(), vocabf.size()); +  MyJointModel m(lp0); +#endif + +  cerr << "Initializing reachability limits...\n"; +  vector<Particle> ps(corpusf.size()); +  vector<Reachability> reaches; reaches.reserve(corpusf.size()); +  for (int ci = 0; ci < 
corpusf.size(); ++ci) +    reaches.push_back(Reachability(corpusf[ci].size(), +                                   corpuse[ci].size(), +                                   kMAX_SRC_PHRASE, +                                   kMAX_TRG_PHRASE)); +  cerr << "Sampling...\n";  +  vector<Particle> tmp_p(10000);  // work space +  SampleSet<prob_t> pfss; +  for (int SS=0; SS < samples; ++SS) { +    for (int ci = 0; ci < corpusf.size(); ++ci) { +      vector<int>& src = corpusf[ci]; +      vector<int>& trg = corpuse[ci]; +      m.DecrementRules(ps[ci].rules); +      m.DecrementJumps(ps[ci].src_jumps, src.size()); + +      //BackwardEstimate be(m1, src, trg); +      BackwardEstimateSym be(m1, invm1, src, trg); +      const Reachability& r = reaches[ci]; +      vector<Particle> lps(particles); + +      for (int pi = 0; pi < particles; ++pi) { +        Particle& p = lps[pi]; +        p.src_cv.resize(src.size(), false); +      } + +      bool all_complete = false; +      while(!all_complete) { +        SampleSet<prob_t> ss; + +        // all particles have now been extended a bit, we will reweight them now +        if (lps[0].trg_cov > 0) +          FilterCrapParticlesAndReweight(&lps); + +        // loop over all particles and extend them +        bool done_nothing = true; +        for (int pi = 0; pi < particles; ++pi) { +          Particle& p = lps[pi]; +          int tic = 0; +          while(p.trg_cov < trg.size() && tic < rejuv_freq) { +            ++tic; +            done_nothing = false; +            ss.clear(); +            TRule x; x.lhs_ = kLHS; +            prob_t z; +            int first_uncovered = src.size(); +            int last_uncovered = -1; +            for (int i = 0; i < src.size(); ++i) { +              const bool is_uncovered = !p.src_cv[i]; +              if (i < first_uncovered && is_uncovered) first_uncovered = i; +              if (is_uncovered && i > last_uncovered) last_uncovered = i; +            } +            assert(last_uncovered > -1); +            assert(first_uncovered < src.size()); + +            for (int trg_len = 1; trg_len <= kMAX_TRG_PHRASE; ++trg_len) { +              x.e_.push_back(trg[trg_len - 1 + p.trg_cov]); +              for (int src_len = 1; src_len <= kMAX_SRC_PHRASE; ++src_len) { +                if (!r.edges[p.src_cov][p.trg_cov][src_len][trg_len]) continue; + +                const int last_possible_start = last_uncovered - src_len + 1; +                assert(last_possible_start >= 0); +                //cerr << src_len << "," << trg_len << " is allowed. 
E=" << TD::GetString(x.e_) << endl; +                //cerr << "  first_uncovered=" << first_uncovered << "  last_possible_start=" << last_possible_start << endl; +                for (int i = first_uncovered; i <= last_possible_start; ++i) { +                  if (p.src_cv[i]) continue; +                  assert(ss.size() < tmp_p.size());  // if fails increase tmp_p size +                  Particle& np = tmp_p[ss.size()]; +                  np = p; +                  x.f_.clear(); +                  int gap_add = 0; +                  bool bad = false; +                  prob_t jp = prob_t::One(); +                  int prev_pos = p.prev_pos; +                  for (int j = 0; j < src_len; ++j) { +                    if ((j + i + gap_add) == src.size()) { bad = true; break; } +                    while ((i+j+gap_add) < src.size() && p.src_cv[i + j + gap_add]) { ++gap_add; } +                    if ((j + i + gap_add) == src.size()) { bad = true; break; } +                    np.src_cv[i + j + gap_add] = true; +                    x.f_.push_back(src[i + j + gap_add]); +                    jp *= m.JumpProbability(i + j + gap_add - prev_pos, src.size()); +                    int jump = i + j + gap_add - prev_pos; +                    assert(jump != 0); +                    np.src_jumps.push_back(jump); +                    prev_pos = i + j + gap_add; +                  } +                  if (bad) continue; +                  np.prev_pos = prev_pos; +                  np.src_cov += x.f_.size(); +                  np.trg_cov += x.e_.size(); +                  if (x.f_.size() != src_len) continue; +                  prob_t rp = m.RuleProbability(x); +                  np.gamma_last = rp * jp; +                  const prob_t u = pow(np.gamma_last * be(np.src_cv, np.trg_cov), 0.2); +                  //cerr << "**rule=" << x << endl; +                  //cerr << "  u=" << log(u) << "  rule=" << rp << " jump=" << jp << endl; +                  ss.add(u); +                  np.rules.push_back(TRulePtr(new TRule(x))); +                  z += u; + +                  const bool completed = (p.trg_cov == trg.size()); +                  if (completed) { +                    int last_jump = src.size() - p.prev_pos; +                    assert(last_jump > 0); +                    p.src_jumps.push_back(last_jump); +                    p.weight *= m.JumpProbability(last_jump, src.size()); +                  } +                } +              } +            } +            cerr << "number of edges to consider: " << ss.size() << endl; +            const int sampled = rng.SelectSample(ss); +            prob_t q_n = ss[sampled] / z; +            p = tmp_p[sampled]; +            //m.IncrementRule(*p.rules.back()); +            p.weight *= p.gamma_last / q_n; +            cerr << "[w=" << log(p.weight) << "]\tsampled rule: " << p.rules.back()->AsString() << endl; +            cerr << p << endl; +          } +        } // loop over particles (pi = 0 .. 
particles) +        if (done_nothing) all_complete = true; +      } +      pfss.clear(); +      for (int i = 0; i < lps.size(); ++i) +        pfss.add(lps[i].weight); +      const int sampled = rng.SelectSample(pfss); +      ps[ci] = lps[sampled]; +      m.IncrementRules(lps[sampled].rules); +      m.IncrementJumps(lps[sampled].src_jumps, src.size()); +      for (int i = 0; i < lps[sampled].rules.size(); ++i) { cerr << "S:\t" << lps[sampled].rules[i]->AsString() << "\n"; } +      cerr << "tmp-LLH: " << log(m.Likelihood()) << endl; +    } +    cerr << "LLH: " << log(m.Likelihood()) << endl; +    for (int sni = 0; sni < 5; ++sni) { +      for (int i = 0; i < ps[sni].rules.size(); ++i) { cerr << "\t" << ps[sni].rules[i]->AsString() << endl; } +    } +  } +  return 0; +} + diff --git a/gi/pf/pfdist.new.cc b/gi/pf/pfdist.new.cc new file mode 100644 index 00000000..3169eb75 --- /dev/null +++ b/gi/pf/pfdist.new.cc @@ -0,0 +1,620 @@ +#include <iostream> +#include <tr1/memory> +#include <queue> + +#include <boost/functional.hpp> +#include <boost/program_options.hpp> +#include <boost/program_options/variables_map.hpp> + +#include "base_measures.h" +#include "reachability.h" +#include "viterbi.h" +#include "hg.h" +#include "trule.h" +#include "tdict.h" +#include "filelib.h" +#include "dict.h" +#include "sampler.h" +#include "ccrp_nt.h" +#include "ccrp_onetable.h" + +using namespace std; +using namespace tr1; +namespace po = boost::program_options; + +shared_ptr<MT19937> prng; + +size_t hash_value(const TRule& r) { +  size_t h = boost::hash_value(r.e_); +  boost::hash_combine(h, -r.lhs_); +  boost::hash_combine(h, boost::hash_value(r.f_)); +  return h; +} + +bool operator==(const TRule& a, const TRule& b) { +  return (a.lhs_ == b.lhs_ && a.e_ == b.e_ && a.f_ == b.f_); +} + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { +  po::options_description opts("Configuration options"); +  opts.add_options() +        ("samples,s",po::value<unsigned>()->default_value(1000),"Number of samples") +        ("particles,p",po::value<unsigned>()->default_value(25),"Number of particles") +        ("input,i",po::value<string>(),"Read parallel data from") +        ("max_src_phrase",po::value<unsigned>()->default_value(5),"Maximum length of source language phrases") +        ("max_trg_phrase",po::value<unsigned>()->default_value(5),"Maximum length of target language phrases") +        ("model1,m",po::value<string>(),"Model 1 parameters (used in base distribution)") +        ("inverse_model1,M",po::value<string>(),"Inverse Model 1 parameters (used in backward estimate)") +        ("model1_interpolation_weight",po::value<double>()->default_value(0.95),"Mixing proportion of model 1 with uniform target distribution") +        ("random_seed,S",po::value<uint32_t>(), "Random seed"); +  po::options_description clo("Command line options"); +  clo.add_options() +        ("config", po::value<string>(), "Configuration file") +        ("help,h", "Print this help message and exit"); +  po::options_description dconfig_options, dcmdline_options; +  dconfig_options.add(opts); +  dcmdline_options.add(opts).add(clo); +   +  po::store(parse_command_line(argc, argv, dcmdline_options), *conf); +  if (conf->count("config")) { +    ifstream config((*conf)["config"].as<string>().c_str()); +    po::store(po::parse_config_file(config, dconfig_options), *conf); +  } +  po::notify(*conf); + +  if (conf->count("help") || (conf->count("input") == 0)) { +    cerr << dcmdline_options << endl; +    exit(1); +  } +} + +void 
ReadParallelCorpus(const string& filename, +                vector<vector<WordID> >* f, +                vector<vector<WordID> >* e, +                set<WordID>* vocab_f, +                set<WordID>* vocab_e) { +  f->clear(); +  e->clear(); +  vocab_f->clear(); +  vocab_e->clear(); +  istream* in; +  if (filename == "-") +    in = &cin; +  else +    in = new ifstream(filename.c_str()); +  assert(*in); +  string line; +  const WordID kDIV = TD::Convert("|||"); +  vector<WordID> tmp; +  while(*in) { +    getline(*in, line); +    if (line.empty() && !*in) break; +    e->push_back(vector<int>()); +    f->push_back(vector<int>()); +    vector<int>& le = e->back(); +    vector<int>& lf = f->back(); +    tmp.clear(); +    TD::ConvertSentence(line, &tmp); +    bool isf = true; +    for (unsigned i = 0; i < tmp.size(); ++i) { +      const int cur = tmp[i]; +      if (isf) { +        if (kDIV == cur) { isf = false; } else { +          lf.push_back(cur); +          vocab_f->insert(cur); +        } +      } else { +        assert(cur != kDIV); +        le.push_back(cur); +        vocab_e->insert(cur); +      } +    } +    assert(isf == false); +  } +  if (in != &cin) delete in; +} + +#if 0 +struct MyConditionalModel { +  MyConditionalModel(PhraseConditionalBase& rcp0) : rp0(&rcp0), base(prob_t::One()), src_phrases(1,1), src_jumps(200, CCRP_NoTable<int>(1,1)) {} + +  prob_t srcp0(const vector<WordID>& src) const { +    prob_t p(1.0 / 3000.0); +    p.poweq(src.size()); +    prob_t lenp; lenp.logeq(log_poisson(src.size(), 1.0)); +    p *= lenp; +    return p; +  } + +  void DecrementRule(const TRule& rule) { +    const RuleCRPMap::iterator it = rules.find(rule.f_); +    assert(it != rules.end()); +    if (it->second.decrement(rule)) { +      base /= (*rp0)(rule); +      if (it->second.num_customers() == 0) +        rules.erase(it); +    } +    if (src_phrases.decrement(rule.f_)) +      base /= srcp0(rule.f_); +  } + +  void IncrementRule(const TRule& rule) { +    RuleCRPMap::iterator it = rules.find(rule.f_); +    if (it == rules.end()) +      it = rules.insert(make_pair(rule.f_, CCRP_NoTable<TRule>(1,1))).first; +    if (it->second.increment(rule)) { +      base *= (*rp0)(rule); +    } +    if (src_phrases.increment(rule.f_)) +      base *= srcp0(rule.f_); +  } + +  void IncrementRules(const vector<TRulePtr>& rules) { +    for (int i = 0; i < rules.size(); ++i) +      IncrementRule(*rules[i]); +  } + +  void DecrementRules(const vector<TRulePtr>& rules) { +    for (int i = 0; i < rules.size(); ++i) +      DecrementRule(*rules[i]); +  } + +  void IncrementJump(int dist, unsigned src_len) { +    assert(src_len > 0); +    if (src_jumps[src_len].increment(dist)) +      base *= jp0(dist, src_len); +  } + +  void DecrementJump(int dist, unsigned src_len) { +    assert(src_len > 0); +    if (src_jumps[src_len].decrement(dist)) +      base /= jp0(dist, src_len); +  } + +  void IncrementJumps(const vector<int>& js, unsigned src_len) { +    for (unsigned i = 0; i < js.size(); ++i) +      IncrementJump(js[i], src_len); +  } + +  void DecrementJumps(const vector<int>& js, unsigned src_len) { +    for (unsigned i = 0; i < js.size(); ++i) +      DecrementJump(js[i], src_len); +  } + +  // p(jump = dist | src_len , z) +  prob_t JumpProbability(int dist, unsigned src_len) { +    const prob_t p0 = jp0(dist, src_len); +    const double lp = src_jumps[src_len].logprob(dist, log(p0)); +    prob_t q; q.logeq(lp); +    return q; +  } + +  // p(rule.f_ | z) * p(rule.e_ | rule.f_ , z) +  prob_t RuleProbability(const TRule& 
rule) const { +    const prob_t p0 = (*rp0)(rule); +    prob_t srcp; srcp.logeq(src_phrases.logprob(rule.f_, log(srcp0(rule.f_)))); +    const RuleCRPMap::const_iterator it = rules.find(rule.f_); +    if (it == rules.end()) return srcp * p0; +    const double lp = it->second.logprob(rule, log(p0)); +    prob_t q; q.logeq(lp); +    return q * srcp; +  } + +  prob_t Likelihood() const { +    prob_t p = base; +    for (RuleCRPMap::const_iterator it = rules.begin(); +         it != rules.end(); ++it) { +      prob_t cl; cl.logeq(it->second.log_crp_prob()); +      p *= cl; +    } +    for (unsigned l = 1; l < src_jumps.size(); ++l) { +      if (src_jumps[l].num_customers() > 0) { +        prob_t q; +        q.logeq(src_jumps[l].log_crp_prob()); +        p *= q; +      } +    } +    return p; +  } + +  JumpBase jp0; +  const PhraseConditionalBase* rp0; +  prob_t base; +  typedef unordered_map<vector<WordID>, CCRP_NoTable<TRule>, boost::hash<vector<WordID> > > RuleCRPMap; +  RuleCRPMap rules; +  CCRP_NoTable<vector<WordID> > src_phrases; +  vector<CCRP_NoTable<int> > src_jumps; +}; + +#endif + +struct MyJointModel { +  MyJointModel(PhraseJointBase& rcp0) : +    rp0(rcp0), base(prob_t::One()), rules(1,1), src_jumps(200, CCRP_NoTable<int>(1,1)) {} + +  void DecrementRule(const TRule& rule) { +    if (rules.decrement(rule)) +      base /= rp0(rule); +  } + +  void IncrementRule(const TRule& rule) { +    if (rules.increment(rule)) +      base *= rp0(rule); +  } + +  void IncrementRules(const vector<TRulePtr>& rules) { +    for (int i = 0; i < rules.size(); ++i) +      IncrementRule(*rules[i]); +  } + +  void DecrementRules(const vector<TRulePtr>& rules) { +    for (int i = 0; i < rules.size(); ++i) +      DecrementRule(*rules[i]); +  } + +  void IncrementJump(int dist, unsigned src_len) { +    assert(src_len > 0); +    if (src_jumps[src_len].increment(dist)) +      base *= jp0(dist, src_len); +  } + +  void DecrementJump(int dist, unsigned src_len) { +    assert(src_len > 0); +    if (src_jumps[src_len].decrement(dist)) +      base /= jp0(dist, src_len); +  } + +  void IncrementJumps(const vector<int>& js, unsigned src_len) { +    for (unsigned i = 0; i < js.size(); ++i) +      IncrementJump(js[i], src_len); +  } + +  void DecrementJumps(const vector<int>& js, unsigned src_len) { +    for (unsigned i = 0; i < js.size(); ++i) +      DecrementJump(js[i], src_len); +  } + +  // p(jump = dist | src_len , z) +  prob_t JumpProbability(int dist, unsigned src_len) { +    const prob_t p0 = jp0(dist, src_len); +    const double lp = src_jumps[src_len].logprob(dist, log(p0)); +    prob_t q; q.logeq(lp); +    return q; +  } + +  // p(rule.f_ | z) * p(rule.e_ | rule.f_ , z) +  prob_t RuleProbability(const TRule& rule) const { +    prob_t p; p.logeq(rules.logprob(rule, log(rp0(rule)))); +    return p; +  } + +  prob_t Likelihood() const { +    prob_t p = base; +    prob_t q; q.logeq(rules.log_crp_prob()); +    p *= q; +    for (unsigned l = 1; l < src_jumps.size(); ++l) { +      if (src_jumps[l].num_customers() > 0) { +        prob_t q; +        q.logeq(src_jumps[l].log_crp_prob()); +        p *= q; +      } +    } +    return p; +  } + +  JumpBase jp0; +  const PhraseJointBase& rp0; +  prob_t base; +  CCRP_NoTable<TRule> rules; +  vector<CCRP_NoTable<int> > src_jumps; +}; + +struct BackwardEstimate { +  BackwardEstimate(const Model1& m1, const vector<WordID>& src, const vector<WordID>& trg) : +      model1_(m1), src_(src), trg_(trg) { +  } +  const prob_t& operator()(const vector<bool>& src_cov, unsigned trg_cov) 
const { +    assert(src_.size() == src_cov.size()); +    assert(trg_cov <= trg_.size()); +    prob_t& e = cache_[src_cov][trg_cov]; +    if (e.is_0()) { +      if (trg_cov == trg_.size()) { e = prob_t::One(); return e; } +      vector<WordID> r(src_.size() + 1); r.clear(); +      r.push_back(0);  // NULL word +      for (int i = 0; i < src_cov.size(); ++i) +        if (!src_cov[i]) r.push_back(src_[i]); +      const prob_t uniform_alignment(1.0 / r.size()); +      e.logeq(log_poisson(trg_.size() - trg_cov, r.size() - 1)); // p(trg len remaining | src len remaining) +      for (unsigned j = trg_cov; j < trg_.size(); ++j) { +        prob_t p; +        for (unsigned i = 0; i < r.size(); ++i) +          p += model1_(r[i], trg_[j]); +        if (p.is_0()) { +          cerr << "ERROR: p(" << TD::Convert(trg_[j]) << " | " << TD::GetString(r) << ") = 0!\n"; +          abort(); +        } +        p *= uniform_alignment; +        e *= p; +      } +    } +    return e; +  } +  const Model1& model1_; +  const vector<WordID>& src_; +  const vector<WordID>& trg_; +  mutable unordered_map<vector<bool>, map<unsigned, prob_t>, boost::hash<vector<bool> > > cache_; +}; + +struct BackwardEstimateSym { +  BackwardEstimateSym(const Model1& m1, +                      const Model1& invm1, const vector<WordID>& src, const vector<WordID>& trg) : +      model1_(m1), invmodel1_(invm1), src_(src), trg_(trg) { +  } +  const prob_t& operator()(const vector<bool>& src_cov, unsigned trg_cov) const { +    assert(src_.size() == src_cov.size()); +    assert(trg_cov <= trg_.size()); +    prob_t& e = cache_[src_cov][trg_cov]; +    if (e.is_0()) { +      if (trg_cov == trg_.size()) { e = prob_t::One(); return e; } +      vector<WordID> r(src_.size() + 1); r.clear(); +      for (int i = 0; i < src_cov.size(); ++i) +        if (!src_cov[i]) r.push_back(src_[i]); +      r.push_back(0);  // NULL word +      const prob_t uniform_alignment(1.0 / r.size()); +      e.logeq(log_poisson(trg_.size() - trg_cov, r.size() - 1)); // p(trg len remaining | src len remaining) +      for (unsigned j = trg_cov; j < trg_.size(); ++j) { +        prob_t p; +        for (unsigned i = 0; i < r.size(); ++i) +          p += model1_(r[i], trg_[j]); +        if (p.is_0()) { +          cerr << "ERROR: p(" << TD::Convert(trg_[j]) << " | " << TD::GetString(r) << ") = 0!\n"; +          abort(); +        } +        p *= uniform_alignment; +        e *= p; +      } +      r.pop_back(); +      const prob_t inv_uniform(1.0 / (trg_.size() - trg_cov + 1.0)); +      prob_t inv; +      inv.logeq(log_poisson(r.size(), trg_.size() - trg_cov)); +      for (unsigned i = 0; i < r.size(); ++i) { +        prob_t p; +        for (unsigned j = trg_cov - 1; j < trg_.size(); ++j) +          p += invmodel1_(j < trg_cov ? 
0 : trg_[j], r[i]); +        if (p.is_0()) { +          cerr << "ERROR: p_inv(" << TD::Convert(r[i]) << " | " << TD::GetString(trg_) << ") = 0!\n"; +          abort(); +        } +        p *= inv_uniform; +        inv *= p; +      } +      prob_t x = pow(e * inv, 0.5); +      e = x; +      //cerr << "Forward: " << log(e) << "\tBackward: " << log(inv) << "\t prop: " << log(x) << endl; +    } +    return e; +  } +  const Model1& model1_; +  const Model1& invmodel1_; +  const vector<WordID>& src_; +  const vector<WordID>& trg_; +  mutable unordered_map<vector<bool>, map<unsigned, prob_t>, boost::hash<vector<bool> > > cache_; +}; + +struct Particle { +  Particle() : weight(prob_t::One()), src_cov(), trg_cov(), prev_pos(-1) {} +  prob_t weight; +  prob_t gamma_last; +  vector<int> src_jumps; +  vector<TRulePtr> rules; +  vector<bool> src_cv; +  int src_cov; +  int trg_cov; +  int prev_pos; +}; + +ostream& operator<<(ostream& o, const vector<bool>& v) { +  for (int i = 0; i < v.size(); ++i) +    o << (v[i] ? '1' : '0'); +  return o; +} +ostream& operator<<(ostream& o, const Particle& p) { +  o << "[cv=" << p.src_cv << "  src_cov=" << p.src_cov << " trg_cov=" << p.trg_cov << " last_pos=" << p.prev_pos << " num_rules=" << p.rules.size() << "  w=" << log(p.weight) << ']'; +  return o; +} + +void FilterCrapParticlesAndReweight(vector<Particle>* pps) { +  vector<Particle>& ps = *pps; +  SampleSet<prob_t> ss; +  for (int i = 0; i < ps.size(); ++i) +    ss.add(ps[i].weight); +  vector<Particle> nps; nps.reserve(ps.size()); +  const prob_t uniform_weight(1.0 / ps.size()); +  for (int i = 0; i < ps.size(); ++i) { +    nps.push_back(ps[prng->SelectSample(ss)]); +    nps[i].weight = uniform_weight; +  } +  nps.swap(ps); +} + +int main(int argc, char** argv) { +  po::variables_map conf; +  InitCommandLine(argc, argv, &conf); +  const unsigned kMAX_TRG_PHRASE = conf["max_trg_phrase"].as<unsigned>(); +  const unsigned kMAX_SRC_PHRASE = conf["max_src_phrase"].as<unsigned>(); +  const unsigned particles = conf["particles"].as<unsigned>(); +  const unsigned samples = conf["samples"].as<unsigned>(); + +  if (!conf.count("model1")) { +    cerr << argv[0] << "Please use --model1 to specify model 1 parameters\n"; +    return 1; +  } +  if (conf.count("random_seed")) +    prng.reset(new MT19937(conf["random_seed"].as<uint32_t>())); +  else +    prng.reset(new MT19937); +  MT19937& rng = *prng; + +  vector<vector<WordID> > corpuse, corpusf; +  set<WordID> vocabe, vocabf; +  cerr << "Reading corpus...\n"; +  ReadParallelCorpus(conf["input"].as<string>(), &corpusf, &corpuse, &vocabf, &vocabe); +  cerr << "F-corpus size: " << corpusf.size() << " sentences\t (" << vocabf.size() << " word types)\n"; +  cerr << "E-corpus size: " << corpuse.size() << " sentences\t (" << vocabe.size() << " word types)\n"; +  assert(corpusf.size() == corpuse.size()); + +  const int kLHS = -TD::Convert("X"); +  Model1 m1(conf["model1"].as<string>()); +  Model1 invm1(conf["inverse_model1"].as<string>()); + +#if 0 +  PhraseConditionalBase lp0(m1, conf["model1_interpolation_weight"].as<double>(), vocabe.size()); +  MyConditionalModel m(lp0); +#else +  PhraseJointBase lp0(m1, conf["model1_interpolation_weight"].as<double>(), vocabe.size(), vocabf.size()); +  MyJointModel m(lp0); +#endif + +  cerr << "Initializing reachability limits...\n"; +  vector<Particle> ps(corpusf.size()); +  vector<Reachability> reaches; reaches.reserve(corpusf.size()); +  for (int ci = 0; ci < corpusf.size(); ++ci) +    reaches.push_back(Reachability(corpusf[ci].size(), +  
                                 corpuse[ci].size(), +                                   kMAX_SRC_PHRASE, +                                   kMAX_TRG_PHRASE)); +  cerr << "Sampling...\n";  +  vector<Particle> tmp_p(10000);  // work space +  SampleSet<prob_t> pfss; +  for (int SS=0; SS < samples; ++SS) { +    for (int ci = 0; ci < corpusf.size(); ++ci) { +      vector<int>& src = corpusf[ci]; +      vector<int>& trg = corpuse[ci]; +      m.DecrementRules(ps[ci].rules); +      m.DecrementJumps(ps[ci].src_jumps, src.size()); + +      //BackwardEstimate be(m1, src, trg); +      BackwardEstimateSym be(m1, invm1, src, trg); +      const Reachability& r = reaches[ci]; +      vector<Particle> lps(particles); + +      for (int pi = 0; pi < particles; ++pi) { +        Particle& p = lps[pi]; +        p.src_cv.resize(src.size(), false); +      } + +      bool all_complete = false; +      while(!all_complete) { +        SampleSet<prob_t> ss; + +        // all particles have now been extended a bit, we will reweight them now +        if (lps[0].trg_cov > 0) +          FilterCrapParticlesAndReweight(&lps); + +        // loop over all particles and extend them +        bool done_nothing = true; +        for (int pi = 0; pi < particles; ++pi) { +          Particle& p = lps[pi]; +          int tic = 0; +          const int rejuv_freq = 1; +          while(p.trg_cov < trg.size() && tic < rejuv_freq) { +            ++tic; +            done_nothing = false; +            ss.clear(); +            TRule x; x.lhs_ = kLHS; +            prob_t z; +            int first_uncovered = src.size(); +            int last_uncovered = -1; +            for (int i = 0; i < src.size(); ++i) { +              const bool is_uncovered = !p.src_cv[i]; +              if (i < first_uncovered && is_uncovered) first_uncovered = i; +              if (is_uncovered && i > last_uncovered) last_uncovered = i; +            } +            assert(last_uncovered > -1); +            assert(first_uncovered < src.size()); + +            for (int trg_len = 1; trg_len <= kMAX_TRG_PHRASE; ++trg_len) { +              x.e_.push_back(trg[trg_len - 1 + p.trg_cov]); +              for (int src_len = 1; src_len <= kMAX_SRC_PHRASE; ++src_len) { +                if (!r.edges[p.src_cov][p.trg_cov][src_len][trg_len]) continue; + +                const int last_possible_start = last_uncovered - src_len + 1; +                assert(last_possible_start >= 0); +                //cerr << src_len << "," << trg_len << " is allowed. 
E=" << TD::GetString(x.e_) << endl; +                //cerr << "  first_uncovered=" << first_uncovered << "  last_possible_start=" << last_possible_start << endl; +                for (int i = first_uncovered; i <= last_possible_start; ++i) { +                  if (p.src_cv[i]) continue; +                  assert(ss.size() < tmp_p.size());  // if fails increase tmp_p size +                  Particle& np = tmp_p[ss.size()]; +                  np = p; +                  x.f_.clear(); +                  int gap_add = 0; +                  bool bad = false; +                  prob_t jp = prob_t::One(); +                  int prev_pos = p.prev_pos; +                  for (int j = 0; j < src_len; ++j) { +                    if ((j + i + gap_add) == src.size()) { bad = true; break; } +                    while ((i+j+gap_add) < src.size() && p.src_cv[i + j + gap_add]) { ++gap_add; } +                    if ((j + i + gap_add) == src.size()) { bad = true; break; } +                    np.src_cv[i + j + gap_add] = true; +                    x.f_.push_back(src[i + j + gap_add]); +                    jp *= m.JumpProbability(i + j + gap_add - prev_pos, src.size()); +                    int jump = i + j + gap_add - prev_pos; +                    assert(jump != 0); +                    np.src_jumps.push_back(jump); +                    prev_pos = i + j + gap_add; +                  } +                  if (bad) continue; +                  np.prev_pos = prev_pos; +                  np.src_cov += x.f_.size(); +                  np.trg_cov += x.e_.size(); +                  if (x.f_.size() != src_len) continue; +                  prob_t rp = m.RuleProbability(x); +                  np.gamma_last = rp * jp; +                  const prob_t u = pow(np.gamma_last * be(np.src_cv, np.trg_cov), 0.2); +                  //cerr << "**rule=" << x << endl; +                  //cerr << "  u=" << log(u) << "  rule=" << rp << " jump=" << jp << endl; +                  ss.add(u); +                  np.rules.push_back(TRulePtr(new TRule(x))); +                  z += u; + +                  const bool completed = (p.trg_cov == trg.size()); +                  if (completed) { +                    int last_jump = src.size() - p.prev_pos; +                    assert(last_jump > 0); +                    p.src_jumps.push_back(last_jump); +                    p.weight *= m.JumpProbability(last_jump, src.size()); +                  } +                } +              } +            } +            cerr << "number of edges to consider: " << ss.size() << endl; +            const int sampled = rng.SelectSample(ss); +            prob_t q_n = ss[sampled] / z; +            p = tmp_p[sampled]; +            //m.IncrementRule(*p.rules.back()); +            p.weight *= p.gamma_last / q_n; +            cerr << "[w=" << log(p.weight) << "]\tsampled rule: " << p.rules.back()->AsString() << endl; +            cerr << p << endl; +          } +        } // loop over particles (pi = 0 .. 
particles) +        if (done_nothing) all_complete = true; +      } +      pfss.clear(); +      for (int i = 0; i < lps.size(); ++i) +        pfss.add(lps[i].weight); +      const int sampled = rng.SelectSample(pfss); +      ps[ci] = lps[sampled]; +      m.IncrementRules(lps[sampled].rules); +      m.IncrementJumps(lps[sampled].src_jumps, src.size()); +      for (int i = 0; i < lps[sampled].rules.size(); ++i) { cerr << "S:\t" << lps[sampled].rules[i]->AsString() << "\n"; } +      cerr << "tmp-LLH: " << log(m.Likelihood()) << endl; +    } +    cerr << "LLH: " << log(m.Likelihood()) << endl; +    for (int sni = 0; sni < 5; ++sni) { +      for (int i = 0; i < ps[sni].rules.size(); ++i) { cerr << "\t" << ps[sni].rules[i]->AsString() << endl; } +    } +  } +  return 0; +} + diff --git a/gi/pf/pfnaive.cc b/gi/pf/pfnaive.cc new file mode 100644 index 00000000..33dc08c3 --- /dev/null +++ b/gi/pf/pfnaive.cc @@ -0,0 +1,280 @@ +#include <iostream> +#include <tr1/memory> +#include <queue> + +#include <boost/functional.hpp> +#include <boost/program_options.hpp> +#include <boost/program_options/variables_map.hpp> + +#include "base_measures.h" +#include "monotonic_pseg.h" +#include "reachability.h" +#include "viterbi.h" +#include "hg.h" +#include "trule.h" +#include "tdict.h" +#include "filelib.h" +#include "dict.h" +#include "sampler.h" +#include "ccrp_nt.h" +#include "ccrp_onetable.h" +#include "corpus.h" + +using namespace std; +using namespace tr1; +namespace po = boost::program_options; + +shared_ptr<MT19937> prng; + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { +  po::options_description opts("Configuration options"); +  opts.add_options() +        ("samples,s",po::value<unsigned>()->default_value(1000),"Number of samples") +        ("particles,p",po::value<unsigned>()->default_value(30),"Number of particles") +        ("filter_frequency,f",po::value<unsigned>()->default_value(5),"Number of time steps between filterings") +        ("input,i",po::value<string>(),"Read parallel data from") +        ("max_src_phrase",po::value<unsigned>()->default_value(5),"Maximum length of source language phrases") +        ("max_trg_phrase",po::value<unsigned>()->default_value(5),"Maximum length of target language phrases") +        ("model1,m",po::value<string>(),"Model 1 parameters (used in base distribution)") +        ("inverse_model1,M",po::value<string>(),"Inverse Model 1 parameters (used in backward estimate)") +        ("model1_interpolation_weight",po::value<double>()->default_value(0.95),"Mixing proportion of model 1 with uniform target distribution") +        ("random_seed,S",po::value<uint32_t>(), "Random seed"); +  po::options_description clo("Command line options"); +  clo.add_options() +        ("config", po::value<string>(), "Configuration file") +        ("help,h", "Print this help message and exit"); +  po::options_description dconfig_options, dcmdline_options; +  dconfig_options.add(opts); +  dcmdline_options.add(opts).add(clo); +   +  po::store(parse_command_line(argc, argv, dcmdline_options), *conf); +  if (conf->count("config")) { +    ifstream config((*conf)["config"].as<string>().c_str()); +    po::store(po::parse_config_file(config, dconfig_options), *conf); +  } +  po::notify(*conf); + +  if (conf->count("help") || (conf->count("input") == 0)) { +    cerr << dcmdline_options << endl; +    exit(1); +  } +} + +struct BackwardEstimateSym { +  BackwardEstimateSym(const Model1& m1, +                      const Model1& invm1, const vector<WordID>& src, const 
vector<WordID>& trg) : +      model1_(m1), invmodel1_(invm1), src_(src), trg_(trg) { +  } +  const prob_t& operator()(unsigned src_cov, unsigned trg_cov) const { +    assert(src_cov <= src_.size()); +    assert(trg_cov <= trg_.size()); +    prob_t& e = cache_[src_cov][trg_cov]; +    if (e.is_0()) { +      if (trg_cov == trg_.size()) { e = prob_t::One(); return e; } +      vector<WordID> r(src_.size() + 1); r.clear(); +      for (int i = src_cov; i < src_.size(); ++i) +        r.push_back(src_[i]); +      r.push_back(0);  // NULL word +      const prob_t uniform_alignment(1.0 / r.size()); +      e.logeq(log_poisson(trg_.size() - trg_cov, r.size() - 1)); // p(trg len remaining | src len remaining) +      for (unsigned j = trg_cov; j < trg_.size(); ++j) { +        prob_t p; +        for (unsigned i = 0; i < r.size(); ++i) +          p += model1_(r[i], trg_[j]); +        if (p.is_0()) { +          cerr << "ERROR: p(" << TD::Convert(trg_[j]) << " | " << TD::GetString(r) << ") = 0!\n"; +          abort(); +        } +        p *= uniform_alignment; +        e *= p; +      } +      r.pop_back(); +      const prob_t inv_uniform(1.0 / (trg_.size() - trg_cov + 1.0)); +      prob_t inv; +      inv.logeq(log_poisson(r.size(), trg_.size() - trg_cov)); +      for (unsigned i = 0; i < r.size(); ++i) { +        prob_t p; +        for (unsigned j = trg_cov - 1; j < trg_.size(); ++j) +          p += invmodel1_(j < trg_cov ? 0 : trg_[j], r[i]); +        if (p.is_0()) { +          cerr << "ERROR: p_inv(" << TD::Convert(r[i]) << " | " << TD::GetString(trg_) << ") = 0!\n"; +          abort(); +        } +        p *= inv_uniform; +        inv *= p; +      } +      prob_t x = pow(e * inv, 0.5); +      e = x; +      //cerr << "Forward: " << log(e) << "\tBackward: " << log(inv) << "\t prop: " << log(x) << endl; +    } +    return e; +  } +  const Model1& model1_; +  const Model1& invmodel1_; +  const vector<WordID>& src_; +  const vector<WordID>& trg_; +  mutable unordered_map<unsigned, map<unsigned, prob_t> > cache_; +}; + +struct Particle { +  Particle() : weight(prob_t::One()), src_cov(), trg_cov() {} +  prob_t weight; +  prob_t gamma_last; +  vector<TRulePtr> rules; +  int src_cov; +  int trg_cov; +}; + +ostream& operator<<(ostream& o, const vector<bool>& v) { +  for (int i = 0; i < v.size(); ++i) +    o << (v[i] ? 
'1' : '0'); +  return o; +} +ostream& operator<<(ostream& o, const Particle& p) { +  o << "[src_cov=" << p.src_cov << " trg_cov=" << p.trg_cov << " num_rules=" << p.rules.size() << "  w=" << log(p.weight) << ']'; +  return o; +} + +void FilterCrapParticlesAndReweight(vector<Particle>* pps) { +  vector<Particle>& ps = *pps; +  SampleSet<prob_t> ss; +  for (int i = 0; i < ps.size(); ++i) +    ss.add(ps[i].weight); +  vector<Particle> nps; nps.reserve(ps.size()); +  const prob_t uniform_weight(1.0 / ps.size()); +  for (int i = 0; i < ps.size(); ++i) { +    nps.push_back(ps[prng->SelectSample(ss)]); +    nps[i].weight = uniform_weight; +  } +  nps.swap(ps); +} + +int main(int argc, char** argv) { +  po::variables_map conf; +  InitCommandLine(argc, argv, &conf); +  const unsigned kMAX_TRG_PHRASE = conf["max_trg_phrase"].as<unsigned>(); +  const unsigned kMAX_SRC_PHRASE = conf["max_src_phrase"].as<unsigned>(); +  const unsigned particles = conf["particles"].as<unsigned>(); +  const unsigned samples = conf["samples"].as<unsigned>(); +  const unsigned rejuv_freq = conf["filter_frequency"].as<unsigned>(); + +  if (!conf.count("model1")) { +    cerr << argv[0] << "Please use --model1 to specify model 1 parameters\n"; +    return 1; +  } +  if (conf.count("random_seed")) +    prng.reset(new MT19937(conf["random_seed"].as<uint32_t>())); +  else +    prng.reset(new MT19937); +  MT19937& rng = *prng; + +  vector<vector<WordID> > corpuse, corpusf; +  set<WordID> vocabe, vocabf; +  cerr << "Reading corpus...\n"; +  corpus::ReadParallelCorpus(conf["input"].as<string>(), &corpusf, &corpuse, &vocabf, &vocabe); +  cerr << "F-corpus size: " << corpusf.size() << " sentences\t (" << vocabf.size() << " word types)\n"; +  cerr << "E-corpus size: " << corpuse.size() << " sentences\t (" << vocabe.size() << " word types)\n"; +  assert(corpusf.size() == corpuse.size()); + +  const int kLHS = -TD::Convert("X"); +  Model1 m1(conf["model1"].as<string>()); +  Model1 invm1(conf["inverse_model1"].as<string>()); + +  PhraseJointBase lp0(m1, conf["model1_interpolation_weight"].as<double>(), vocabe.size(), vocabf.size()); +  MonotonicParallelSegementationModel m(lp0); + +  cerr << "Initializing reachability limits...\n"; +  vector<Particle> ps(corpusf.size()); +  vector<Reachability> reaches; reaches.reserve(corpusf.size()); +  for (int ci = 0; ci < corpusf.size(); ++ci) +    reaches.push_back(Reachability(corpusf[ci].size(), +                                   corpuse[ci].size(), +                                   kMAX_SRC_PHRASE, +                                   kMAX_TRG_PHRASE)); +  cerr << "Sampling...\n";  +  vector<Particle> tmp_p(10000);  // work space +  SampleSet<prob_t> pfss; +  for (int SS=0; SS < samples; ++SS) { +    for (int ci = 0; ci < corpusf.size(); ++ci) { +      vector<int>& src = corpusf[ci]; +      vector<int>& trg = corpuse[ci]; +      m.DecrementRulesAndStops(ps[ci].rules); +      const prob_t q_stop = m.StopProbability(); +      const prob_t q_cont = m.ContinueProbability(); +      cerr << "P(stop)=" << q_stop << "\tP(continue)=" <<q_cont << endl; + +      BackwardEstimateSym be(m1, invm1, src, trg); +      const Reachability& r = reaches[ci]; +      vector<Particle> lps(particles); + +      bool all_complete = false; +      while(!all_complete) { +        SampleSet<prob_t> ss; + +        // all particles have now been extended a bit, we will reweight them now +        if (lps[0].trg_cov > 0) +          FilterCrapParticlesAndReweight(&lps); + +        // loop over all particles and extend them +    
    bool done_nothing = true; +        for (int pi = 0; pi < particles; ++pi) { +          Particle& p = lps[pi]; +          int tic = 0; +          while(p.trg_cov < trg.size() && tic < rejuv_freq) { +            ++tic; +            done_nothing = false; +            ss.clear(); +            TRule x; x.lhs_ = kLHS; +            prob_t z; + +            for (int trg_len = 1; trg_len <= kMAX_TRG_PHRASE; ++trg_len) { +              x.e_.push_back(trg[trg_len - 1 + p.trg_cov]); +              for (int src_len = 1; src_len <= kMAX_SRC_PHRASE; ++src_len) { +                if (!r.edges[p.src_cov][p.trg_cov][src_len][trg_len]) continue; + +                int i = p.src_cov; +                assert(ss.size() < tmp_p.size());  // if fails increase tmp_p size +                Particle& np = tmp_p[ss.size()]; +                np = p; +                x.f_.clear(); +                for (int j = 0; j < src_len; ++j) +                  x.f_.push_back(src[i + j]); +                np.src_cov += x.f_.size(); +                np.trg_cov += x.e_.size(); +                const bool stop_now = (np.src_cov == src_len && np.trg_cov == trg_len); +                prob_t rp = m.RuleProbability(x) * (stop_now ? q_stop : q_cont); +                np.gamma_last = rp; +                const prob_t u = pow(np.gamma_last * pow(be(np.src_cov, np.trg_cov), 1.2), 0.1); +                //cerr << "**rule=" << x << endl; +                //cerr << "  u=" << log(u) << "  rule=" << rp << endl; +                ss.add(u); +                np.rules.push_back(TRulePtr(new TRule(x))); +                z += u; +              } +            } +            //cerr << "number of edges to consider: " << ss.size() << endl; +            const int sampled = rng.SelectSample(ss); +            prob_t q_n = ss[sampled] / z; +            p = tmp_p[sampled]; +            //m.IncrementRule(*p.rules.back()); +            p.weight *= p.gamma_last / q_n; +            //cerr << "[w=" << log(p.weight) << "]\tsampled rule: " << p.rules.back()->AsString() << endl; +            //cerr << p << endl; +          } +        } // loop over particles (pi = 0 .. 
particles) +        if (done_nothing) all_complete = true; +      } +      pfss.clear(); +      for (int i = 0; i < lps.size(); ++i) +        pfss.add(lps[i].weight); +      const int sampled = rng.SelectSample(pfss); +      ps[ci] = lps[sampled]; +      m.IncrementRulesAndStops(lps[sampled].rules); +      for (int i = 0; i < lps[sampled].rules.size(); ++i) { cerr << "S:\t" << lps[sampled].rules[i]->AsString() << "\n"; } +      cerr << "tmp-LLH: " << log(m.Likelihood()) << endl; +    } +    cerr << "LLH: " << log(m.Likelihood()) << endl; +  } +  return 0; +} + diff --git a/gi/pf/reachability.cc b/gi/pf/reachability.cc new file mode 100644 index 00000000..73dd8d39 --- /dev/null +++ b/gi/pf/reachability.cc @@ -0,0 +1,64 @@ +#include "reachability.h" + +#include <vector> +#include <iostream> + +using namespace std; + +struct SState { +  SState() : prev_src_covered(), prev_trg_covered() {} +  SState(int i, int j) : prev_src_covered(i), prev_trg_covered(j) {} +  int prev_src_covered; +  int prev_trg_covered; +}; + +void Reachability::ComputeReachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len) { +    typedef boost::multi_array<vector<SState>, 2> array_type; +    array_type a(boost::extents[srclen + 1][trglen + 1]); +    a[0][0].push_back(SState()); +    for (int i = 0; i < srclen; ++i) { +      for (int j = 0; j < trglen; ++j) { +        if (a[i][j].size() == 0) continue; +        const SState prev(i,j); +        for (int k = 1; k <= src_max_phrase_len; ++k) { +          if ((i + k) > srclen) continue; +          for (int l = 1; l <= trg_max_phrase_len; ++l) { +            if ((j + l) > trglen) continue; +            a[i + k][j + l].push_back(prev); +          } +        } +      } +    } +    a[0][0].clear(); +    //cerr << "Final cell contains " << a[srclen][trglen].size() << " back pointers\n"; +    if (a[srclen][trglen].size() == 0) { +      cerr << "Sentence with length (" << srclen << ',' << trglen << ") violates reachability constraints\n"; +      return; +    } + +    typedef boost::multi_array<bool, 2> rarray_type; +    rarray_type r(boost::extents[srclen + 1][trglen + 1]); +    r[srclen][trglen] = true; +    for (int i = srclen; i >= 0; --i) { +      for (int j = trglen; j >= 0; --j) { +        vector<SState>& prevs = a[i][j]; +        if (!r[i][j]) { prevs.clear(); } +        for (int k = 0; k < prevs.size(); ++k) { +          r[prevs[k].prev_src_covered][prevs[k].prev_trg_covered] = true; +          int src_delta = i - prevs[k].prev_src_covered; +          edges[prevs[k].prev_src_covered][prevs[k].prev_trg_covered][src_delta][j - prevs[k].prev_trg_covered] = true; +          short &msd = max_src_delta[prevs[k].prev_src_covered][prevs[k].prev_trg_covered]; +          if (src_delta > msd) msd = src_delta; +        } +      } +    } +    assert(!edges[0][0][1][0]); +    assert(!edges[0][0][0][1]); +    assert(!edges[0][0][0][0]); +    assert(max_src_delta[0][0] > 0); +    //cerr << "First cell contains " << b[0][0].size() << " forward pointers\n"; +    //for (int i = 0; i < b[0][0].size(); ++i) { +    //  cerr << "  -> (" << b[0][0][i].next_src_covered << "," << b[0][0][i].next_trg_covered << ")\n"; +    //} +  } + diff --git a/gi/pf/reachability.h b/gi/pf/reachability.h new file mode 100644 index 00000000..98450ec1 --- /dev/null +++ b/gi/pf/reachability.h @@ -0,0 +1,28 @@ +#ifndef _REACHABILITY_H_ +#define _REACHABILITY_H_ + +#include "boost/multi_array.hpp" + +// determines minimum and maximum lengths of outgoing edges from all +// coverage positions 
such that the alignment path respects src and
+// trg maximum phrase sizes
+//
+// runs in O(n^2 * src_max * trg_max) time but should be relatively fast
+//
+// currently forbids 0 -> n and n -> 0 alignments
+
+struct Reachability {
+  boost::multi_array<bool, 4> edges;  // edges[src_covered][trg_covered][src_delta][trg_delta] is this edge worth exploring?
+  boost::multi_array<short, 2> max_src_delta; // msd[src_covered][trg_covered] -- the largest src delta that's valid
+
+  Reachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len) :
+      edges(boost::extents[srclen][trglen][src_max_phrase_len+1][trg_max_phrase_len+1]),
+      max_src_delta(boost::extents[srclen][trglen]) {
+    ComputeReachability(srclen, trglen, src_max_phrase_len, trg_max_phrase_len);
+  }
+
+ private:
+  void ComputeReachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len);
+};
+
+#endif
diff --git a/gi/pf/tpf.cc b/gi/pf/tpf.cc
new file mode 100644
index 00000000..7348d21c
--- /dev/null
+++ b/gi/pf/tpf.cc
@@ -0,0 +1,99 @@
+#include <iostream>
+#include <tr1/memory>
+#include <queue>
+
+#include "sampler.h"
+
+using namespace std;
+using namespace tr1;
+
+shared_ptr<MT19937> prng;
+
+struct Particle {
+  Particle() : weight(prob_t::One()) {}
+  vector<int> states;
+  prob_t weight;
+  prob_t gamma_last;
+};
+
+ostream& operator<<(ostream& os, const Particle& p) {
+  os << "[";
+  for (int i = 0; i < p.states.size(); ++i) os << p.states[i] << ' ';
+  os << "| w=" << log(p.weight) << ']';
+  return os;
+}
+
+void Rejuvenate(vector<Particle>& pps) {
+  SampleSet<prob_t> ss;
+  vector<Particle> nps(pps.size());
+  for (int i = 0; i < pps.size(); ++i) {
+//    cerr << pps[i] << endl;
+    ss.add(pps[i].weight);
+  }
+//  cerr << "REJUVENATING...\n";
+  for (int i = 0; i < pps.size(); ++i) {
+    nps[i] = pps[prng->SelectSample(ss)];
+    nps[i].weight = prob_t(1.0 / pps.size());
+//    cerr << nps[i] << endl;
+  }
+  nps.swap(pps);
+//  exit(1);
+}
+
+int main(int argc, char** argv) {
+  const unsigned particles = 100;
+  prng.reset(new MT19937);
+  MT19937& rng = *prng;
+
+  // q(a) = 0.4
+  // q(b) = 0.6
+  // q(c) = 0
+  SampleSet<double> ssq;
+  ssq.add(0.4);
+  ssq.add(0.6);
+  ssq.add(0);
+  double qz = 1;
+
+  // p(a) = 0.2
+  // p(b) = 0.8
+  vector<double> p(3);
+  p[0] = 0.2;
+  p[1] = 0.8;
+  p[2] = 0;
+
+  vector<int> counts(3);
+  int tot = 0;
+
+  vector<Particle> pps(particles);
+  SampleSet<prob_t> ppss;
+  int LEN = 12;
+  int PP = 1;
+  while (pps[0].states.size() < LEN) {
+    for (int pi = 0; pi < particles; ++pi) {
+      Particle& prt = pps[pi];
+
+      bool redo = true;
+      const Particle savedp = prt;
+      while (redo) {
+        redo = false;
+        for (int i = 0; i < PP; ++i) {
+          int s = rng.SelectSample(ssq);
+          double gamma_last = p[s];
+          if (!gamma_last) { redo = true; break; }
+          double q = ssq[s] / qz;
+          prt.states.push_back(s);
+          prt.weight *= prob_t(gamma_last / q);
+        }
+        if (redo) { prt = savedp; continue; }
+      }
+    }
+    Rejuvenate(pps);
+  }
+  ppss.clear();
+  for (int i = 0; i < particles; ++i) { ppss.add(pps[i].weight); }
+  int sp = rng.SelectSample(ppss);
+  cerr << pps[sp] << endl;
+
+  return 0;
+}
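A note on the last file, as a reading of the code rather than anything stated in the commit: tpf.cc is a self-contained toy of the sequential importance resampling scheme that pfdist.cc and pfnaive.cc apply to real sentence pairs. At each step a particle draws a state s from the proposal distribution held in ssq, multiplies its weight by gamma_last / q = p[s] / (ssq[s] / qz), and Rejuvenate() periodically resamples the population in proportion to those weights, resetting them to uniform. With the values in the file, q(a)=0.4, q(b)=0.6 and p(a)=0.2, p(b)=0.8, drawing 'a' scales a particle's weight by 0.2/0.4 = 0.5 while drawing 'b' scales it by 0.8/0.6 ≈ 1.33, so after resampling the surviving length-12 state sequences are distributed approximately according to p even though they were proposed from q. The full samplers apply the same update, p.weight *= p.gamma_last / q_n, with the backward estimate folded into the proposal score u.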
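Similarly, the Reachability tables are intended to be built once per sentence pair and then probed during sampling, which is how pfnaive.cc uses them (r.edges[p.src_cov][p.trg_cov][src_len][trg_len]). Below is a minimal usage sketch, not part of the commit: it assumes it is compiled inside gi/pf, so that reachability.h, reachability.cc and Boost's multi_array are available, and the sentence lengths are made up purely for illustration.

#include <iostream>
#include "reachability.h"

int main() {
  // Hypothetical sentence pair: 6 source words, 5 target words, and phrase
  // length limits of 4 on both sides (cf. --max_src_phrase / --max_trg_phrase).
  const int srclen = 6, trglen = 5, max_src = 4, max_trg = 4;
  Reachability r(srclen, trglen, max_src, max_trg);

  // From the empty coverage (0,0), list the phrase-pair sizes that can still
  // be extended to a complete monotone segmentation of both sentences.
  for (int src_delta = 1; src_delta <= max_src; ++src_delta)
    for (int trg_delta = 1; trg_delta <= max_trg; ++trg_delta)
      if (r.edges[0][0][src_delta][trg_delta])
        std::cout << "reachable edge: " << src_delta << " x " << trg_delta << '\n';

  // Largest source-side phrase length that keeps the pair completable from (0,0).
  std::cout << "max_src_delta[0][0] = " << r.max_src_delta[0][0] << '\n';
  return 0;
}

Whether an edge is kept depends only on the two sentence lengths and the phrase length limits, which is why pfnaive.cc precomputes one Reachability object per sentence pair before sampling starts.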
