Diffstat (limited to 'gi/pf')
50 files changed, 0 insertions, 9460 deletions
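The files removed below are the experimental Bayesian alignment tools in gi/pf (per the deleted README). The central operation shared by align-lexonly-pyp.cc and align-tl.cc is a collapsed Gibbs step: for each target position j, the current link is Decrement()ed, every candidate source position (0 = NULL) is scored by the lexical translation probability times an alignment prior, and a new link is sampled from the normalized scores. What follows is a minimal, self-contained sketch of that step, not the project's code: std::mt19937 and std::discrete_distribution stand in for the repository's MT19937 and SampleSet, plain doubles stand in for prob_t, Prob() is a stub for the PYP translation model, and the exp(-alpha * distance-from-diagonal) form of AlignPrior() is only an assumption about the shape of QuasiModel2.

#include <cmath>
#include <iostream>
#include <random>
#include <vector>

// Assumed QuasiModel2-style alignment prior: mass decays with distance from
// the diagonal; alpha plays the role of the "align_alpha" option and p_null
// the role of "p_null". a_j == 0 means "align to NULL".
double AlignPrior(unsigned a_j, unsigned j, unsigned n, unsigned m,
                  double alpha, double p_null) {
  if (a_j == 0) return p_null;
  const double d = std::fabs(double(j + 1) / m - double(a_j) / n);
  return (1.0 - p_null) * std::exp(-alpha * d);  // unnormalized is fine here
}

// Stub for the (H)PYP lexical translation model p(e | f).
double Prob(int f, int e) { return f == e ? 0.5 : 0.1; }

// One sweep of the collapsed Gibbs step over a sentence pair. In the real
// sampler the old link is Decrement()ed before scoring and the new one
// Increment()ed afterwards, so the CRP counts exclude the variable being
// resampled; those calls are omitted in this sketch.
void ResampleAlignments(const std::vector<int>& src, const std::vector<int>& trg,
                        std::vector<unsigned>* a, std::mt19937* rng,
                        double alpha = 4.0, double p_null = 0.08) {
  std::vector<double> ss(src.size() + 1);
  for (unsigned j = 0; j < trg.size(); ++j) {
    for (unsigned i = 0; i <= src.size(); ++i)
      ss[i] = Prob(i ? src[i - 1] : 0, trg[j]) *
              AlignPrior(i, j, src.size(), trg.size(), alpha, p_null);
    std::discrete_distribution<unsigned> d(ss.begin(), ss.end());
    (*a)[j] = d(*rng);  // sample a_j from the normalized scores
  }
}

int main() {
  std::mt19937 rng(42);
  const std::vector<int> src = {1, 2, 3}, trg = {1, 3};
  std::vector<unsigned> a(trg.size(), 0);
  for (int s = 0; s < 100; ++s) ResampleAlignments(src, trg, &a, &rng);
  for (unsigned j = 0; j < a.size(); ++j)
    std::cout << int(a[j]) - 1 << ' ';  // -1 marks a NULL alignment
  std::cout << '\n';
  return 0;
}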
diff --git a/gi/pf/Makefile.am b/gi/pf/Makefile.am deleted file mode 100644 index 86f8e07b..00000000 --- a/gi/pf/Makefile.am +++ /dev/null @@ -1,44 +0,0 @@ -bin_PROGRAMS = cbgi brat dpnaive pfbrat pfdist itg pfnaive condnaive align-lexonly-pyp learn_cfg pyp_lm nuisance_test align-tl pf_test bayes_lattice_score - -noinst_LIBRARIES = libpf.a - -libpf_a_SOURCES = base_distributions.cc reachability.cc cfg_wfst_composer.cc corpus.cc unigrams.cc ngram_base.cc transliterations.cc backward.cc hpyp_tm.cc pyp_tm.cc - -bayes_lattice_score_SOURCES = bayes_lattice_score.cc -bayes_lattice_score_LDADD = libpf.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a -lz - -pf_test_SOURCES = pf_test.cc -pf_test_LDADD = libpf.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a -lz - -nuisance_test_SOURCES = nuisance_test.cc -nuisance_test_LDADD = libpf.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a -lz - -align_lexonly_pyp_SOURCES = align-lexonly-pyp.cc -align_lexonly_pyp_LDADD = libpf.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a -lz - -align_tl_SOURCES = align-tl.cc -align_tl_LDADD = libpf.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a -lz - -itg_SOURCES = itg.cc - -pyp_lm_SOURCES = pyp_lm.cc - -learn_cfg_SOURCES = learn_cfg.cc - -condnaive_SOURCES = condnaive.cc - -dpnaive_SOURCES = dpnaive.cc - -pfdist_SOURCES = pfdist.cc - -pfnaive_SOURCES = pfnaive.cc - -cbgi_SOURCES = cbgi.cc - -brat_SOURCES = brat.cc - -pfbrat_SOURCES = pfbrat.cc - -AM_CPPFLAGS = -W -Wall -Wno-sign-compare -funroll-loops -I$(top_srcdir)/utils $(GTEST_CPPFLAGS) -I$(top_srcdir)/decoder -I$(top_srcdir)/klm - -AM_LDFLAGS = libpf.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a $(top_srcdir)/utils/libutils.a -lz diff --git a/gi/pf/README b/gi/pf/README deleted file mode 100644 index 62e47541..00000000 --- a/gi/pf/README +++ /dev/null @@ -1,2 +0,0 @@ -Experimental Bayesian alignment tools. Nothing to see here. 
- diff --git a/gi/pf/align-lexonly-pyp.cc b/gi/pf/align-lexonly-pyp.cc deleted file mode 100644 index e7509f57..00000000 --- a/gi/pf/align-lexonly-pyp.cc +++ /dev/null @@ -1,243 +0,0 @@ -#include <iostream> -#include <queue> - -#include <boost/program_options.hpp> -#include <boost/program_options/variables_map.hpp> - -#include "tdict.h" -#include "stringlib.h" -#include "filelib.h" -#include "array2d.h" -#include "sampler.h" -#include "corpus.h" -#include "pyp_tm.h" -#include "hpyp_tm.h" -#include "quasi_model2.h" - -using namespace std; -namespace po = boost::program_options; - -void InitCommandLine(int argc, char** argv, po::variables_map* conf) { -  po::options_description opts("Configuration options"); -  opts.add_options() -        ("samples,s",po::value<unsigned>()->default_value(1000),"Number of samples") -        ("infer_alignment_hyperparameters,I", "Infer alpha and p_null, otherwise fixed values will be assumed") -        ("p_null,0", po::value<double>()->default_value(0.08), "probability of aligning to null") -        ("align_alpha,a", po::value<double>()->default_value(4.0), "how 'tight' is the bias toward the diagonal?") -        ("input,i",po::value<string>(),"Read parallel data from") -        ("random_seed,S",po::value<uint32_t>(), "Random seed"); -  po::options_description clo("Command line options"); -  clo.add_options() -        ("config", po::value<string>(), "Configuration file") -        ("help,h", "Print this help message and exit"); -  po::options_description dconfig_options, dcmdline_options; -  dconfig_options.add(opts); -  dcmdline_options.add(opts).add(clo); -   -  po::store(parse_command_line(argc, argv, dcmdline_options), *conf); -  if (conf->count("config")) { -    ifstream config((*conf)["config"].as<string>().c_str()); -    po::store(po::parse_config_file(config, dconfig_options), *conf); -  } -  po::notify(*conf); - -  if (conf->count("help") || (conf->count("input") == 0)) { -    cerr << dcmdline_options << endl; -    exit(1); -  } -} - -MT19937* prng; - -struct LexicalAlignment { -  unsigned char src_index; -  bool is_transliteration; -  vector<pair<short, short> > derivation; -}; - -struct AlignedSentencePair { -  vector<WordID> src; -  vector<WordID> trg; -  vector<LexicalAlignment> a; -  Array2D<short> posterior; -}; - -template <class LexicalTranslationModel> -struct Aligner { -  Aligner(const vector<vector<WordID> >& lets, -          int vocab_size, -          int num_letters, -          const po::variables_map& conf, -          vector<AlignedSentencePair>* c) : -      corpus(*c), -      paj_model(conf["align_alpha"].as<double>(), conf["p_null"].as<double>()), -      infer_paj(conf.count("infer_alignment_hyperparameters") > 0), -      model(lets, vocab_size, num_letters), -      kNULL(TD::Convert("NULL")) { -    assert(lets[kNULL].size() == 0); -  } - -  vector<AlignedSentencePair>& corpus; -  QuasiModel2 paj_model; -  const bool infer_paj; -  LexicalTranslationModel model; -  const WordID kNULL; - -  void ResampleHyperparameters() { -    model.ResampleHyperparameters(prng); -    if (infer_paj) paj_model.ResampleHyperparameters(prng); -  } - -  void InitializeRandom() { -    cerr << "Initializing with random alignments ...\n"; -    for (unsigned i = 0; i < corpus.size(); ++i) { -      AlignedSentencePair& asp = corpus[i]; -      asp.a.resize(asp.trg.size()); -      for (unsigned j = 0; j < asp.trg.size(); ++j) { -        unsigned char& a_j = asp.a[j].src_index; -        a_j = prng->next() * (1 + asp.src.size()); -        const WordID f_a_j
= (a_j ? asp.src[a_j - 1] : kNULL); -        model.Increment(f_a_j, asp.trg[j], &*prng); -        paj_model.Increment(a_j, j, asp.src.size(), asp.trg.size()); -      } -    } -    cerr << "Corpus initialized randomly." << endl; -    cerr << "LLH = " << Likelihood() << "    \t(Amodel=" << paj_model.Likelihood() -         << " TModel=" << model.Likelihood() << ") contexts=" << model.UniqueConditioningContexts() << endl; -  } - -  void ResampleCorpus() { -    for (unsigned i = 0; i < corpus.size(); ++i) { -      AlignedSentencePair& asp = corpus[i]; -      SampleSet<prob_t> ss; ss.resize(asp.src.size() + 1); -      for (unsigned j = 0; j < asp.trg.size(); ++j) { -        unsigned char& a_j = asp.a[j].src_index; -        const WordID e_j = asp.trg[j]; -        WordID f_a_j = (a_j ? asp.src[a_j - 1] : kNULL); -        model.Decrement(f_a_j, e_j, prng); -        paj_model.Decrement(a_j, j, asp.src.size(), asp.trg.size()); - -        for (unsigned prop_a_j = 0; prop_a_j <= asp.src.size(); ++prop_a_j) { -          const WordID prop_f = (prop_a_j ? asp.src[prop_a_j - 1] : kNULL); -          ss[prop_a_j] = model.Prob(prop_f, e_j); -          ss[prop_a_j] *= paj_model.Prob(prop_a_j, j, asp.src.size(), asp.trg.size()); -        } -        a_j = prng->SelectSample(ss); -        f_a_j = (a_j ? asp.src[a_j - 1] : kNULL); -        model.Increment(f_a_j, e_j, prng); -        paj_model.Increment(a_j, j, asp.src.size(), asp.trg.size()); -      } -    } -  } - -  prob_t Likelihood() const { -    return model.Likelihood() * paj_model.Likelihood(); -  } -}; - -void ExtractLetters(const set<WordID>& v, vector<vector<WordID> >* l, set<WordID>* letset = NULL) { -  for (set<WordID>::const_iterator it = v.begin(); it != v.end(); ++it) { -    vector<WordID>& letters = (*l)[*it]; -    if (letters.size()) continue;   // if e and f have the same word - -    const string& w = TD::Convert(*it); -     -    size_t cur = 0; -    while (cur < w.size()) { -      const size_t len = UTF8Len(w[cur]); -      letters.push_back(TD::Convert(w.substr(cur, len))); -      if (letset) letset->insert(letters.back()); -      cur += len; -    } -  } -} - -void Debug(const AlignedSentencePair& asp) { -  cerr << TD::GetString(asp.src) << endl << TD::GetString(asp.trg) << endl; -  Array2D<bool> a(asp.src.size(), asp.trg.size()); -  for (unsigned j = 0; j < asp.trg.size(); ++j) { -    assert(asp.a[j].src_index <= asp.src.size()); -    if (asp.a[j].src_index) a(asp.a[j].src_index - 1, j) = true; -  } -  cerr << a << endl; -} - -void AddSample(AlignedSentencePair* asp) { -  for (unsigned j = 0; j < asp->trg.size(); ++j) -    asp->posterior(asp->a[j].src_index, j)++; -} - -void WriteAlignments(const AlignedSentencePair& asp) { -  bool first = true; -  for (unsigned j = 0; j < asp.trg.size(); ++j) { -    int src_index = -1; -    int mc = -1; -    for (unsigned i = 0; i <= asp.src.size(); ++i) { -      if (asp.posterior(i, j) > mc) { -        mc = asp.posterior(i, j); -        src_index = i; -      } -    } - -    if (src_index) { -      if (first) first = false; else cout << ' '; -      cout << (src_index - 1) << '-' << j; -    } -  } -  cout << endl; -} - -int main(int argc, char** argv) { -  po::variables_map conf; -  InitCommandLine(argc, argv, &conf); - -  if (conf.count("random_seed")) -    prng = new MT19937(conf["random_seed"].as<uint32_t>()); -  else -    prng = new MT19937; - -  vector<vector<int> > corpuse, corpusf; -  set<int> vocabe, vocabf; -  corpus::ReadParallelCorpus(conf["input"].as<string>(), &corpusf, &corpuse, &vocabf, &vocabe); 
-  cerr << "f-Corpus size: " << corpusf.size() << " sentences\n"; -  cerr << "f-Vocabulary size: " << vocabf.size() << " types\n"; -  cerr << "e-Corpus size: " << corpuse.size() << " sentences\n"; -  cerr << "e-Vocabulary size: " << vocabe.size() << " types\n"; -  assert(corpusf.size() == corpuse.size()); - -  vector<AlignedSentencePair> corpus(corpuse.size()); -  for (unsigned i = 0; i < corpuse.size(); ++i) { -    corpus[i].src.swap(corpusf[i]); -    corpus[i].trg.swap(corpuse[i]); -    corpus[i].posterior.resize(corpus[i].src.size() + 1, corpus[i].trg.size()); -  } -  corpusf.clear(); corpuse.clear(); - -  vocabf.insert(TD::Convert("NULL")); -  vector<vector<WordID> > letters(TD::NumWords()); -  set<WordID> letset; -  ExtractLetters(vocabe, &letters, &letset); -  ExtractLetters(vocabf, &letters, NULL); -  letters[TD::Convert("NULL")].clear(); - -  //Aligner<PYPLexicalTranslation> aligner(letters, vocabe.size(), letset.size(), conf, &corpus); -  Aligner<HPYPLexicalTranslation> aligner(letters, vocabe.size(), letset.size(), conf, &corpus); -  aligner.InitializeRandom(); - -  const unsigned samples = conf["samples"].as<unsigned>(); -  for (int i = 0; i < samples; ++i) { -    for (int j = 65; j < 67; ++j) Debug(corpus[j]); -    if (i % 10 == 9) { -      aligner.ResampleHyperparameters(); -      cerr << "LLH = " << aligner.Likelihood() << "    \t(Amodel=" << aligner.paj_model.Likelihood() -           << " TModel=" << aligner.model.Likelihood() << ") contexts=" << aligner.model.UniqueConditioningContexts() << endl; -    } -    aligner.ResampleCorpus(); -    if (i > (samples / 5) && (i % 6 == 5)) for (int j = 0; j < corpus.size(); ++j) AddSample(&corpus[j]); -  } -  for (unsigned i = 0; i < corpus.size(); ++i) -    WriteAlignments(corpus[i]); -  aligner.model.Summary(); - -  return 0; -} diff --git a/gi/pf/align-tl.cc b/gi/pf/align-tl.cc deleted file mode 100644 index f6608f1d..00000000 --- a/gi/pf/align-tl.cc +++ /dev/null @@ -1,339 +0,0 @@ -#include <iostream> -#include <tr1/memory> -#include <queue> - -#include <boost/multi_array.hpp> -#include <boost/program_options.hpp> -#include <boost/program_options/variables_map.hpp> - -#include "backward.h" -#include "array2d.h" -#include "base_distributions.h" -#include "monotonic_pseg.h" -#include "conditional_pseg.h" -#include "trule.h" -#include "tdict.h" -#include "stringlib.h" -#include "filelib.h" -#include "dict.h" -#include "sampler.h" -#include "mfcr.h" -#include "corpus.h" -#include "ngram_base.h" -#include "transliterations.h" - -using namespace std; -using namespace tr1; -namespace po = boost::program_options; - -void InitCommandLine(int argc, char** argv, po::variables_map* conf) { -  po::options_description opts("Configuration options"); -  opts.add_options() -        ("samples,s",po::value<unsigned>()->default_value(1000),"Number of samples") -        ("input,i",po::value<string>(),"Read parallel data from") -        ("s2t", po::value<string>(), "character level source-to-target prior transliteration probabilities") -        ("t2s", po::value<string>(), "character level target-to-source prior transliteration probabilities") -        ("max_src_chunk", po::value<unsigned>()->default_value(4), "Maximum size of transliterated chunk in source") -        ("max_trg_chunk", po::value<unsigned>()->default_value(4), "Maximum size of transliterated chunk in target") -        ("expected_src_to_trg_ratio", po::value<double>()->default_value(1.0), "If a word is transliterated, what is the expected length ratio from source to target?") -        
("random_seed,S",po::value<uint32_t>(), "Random seed"); -  po::options_description clo("Command line options"); -  clo.add_options() -        ("config", po::value<string>(), "Configuration file") -        ("help,h", "Print this help message and exit"); -  po::options_description dconfig_options, dcmdline_options; -  dconfig_options.add(opts); -  dcmdline_options.add(opts).add(clo); -   -  po::store(parse_command_line(argc, argv, dcmdline_options), *conf); -  if (conf->count("config")) { -    ifstream config((*conf)["config"].as<string>().c_str()); -    po::store(po::parse_config_file(config, dconfig_options), *conf); -  } -  po::notify(*conf); - -  if (conf->count("help") || (conf->count("input") == 0)) { -    cerr << dcmdline_options << endl; -    exit(1); -  } -} - -boost::shared_ptr<MT19937> prng; - -struct LexicalAlignment { -  unsigned char src_index; -  bool is_transliteration; -  vector<pair<short, short> > derivation; -}; - -struct AlignedSentencePair { -  vector<WordID> src; -  vector<WordID> trg; -  vector<LexicalAlignment> a; -  Array2D<short> posterior; -}; - -struct HierarchicalWordBase { -  explicit HierarchicalWordBase(const unsigned vocab_e_size) : -      base(prob_t::One()), r(1,1,1,1,0.66,50.0), u0(-log(vocab_e_size)), l(1,prob_t::One()), v(1, prob_t::Zero()) {} - -  void ResampleHyperparameters(MT19937* rng) { -    r.resample_hyperparameters(rng); -  } - -  inline double logp0(const vector<WordID>& s) const { -    return Md::log_poisson(s.size(), 7.5) + s.size() * u0; -  } - -  // return p0 of rule.e_ -  prob_t operator()(const TRule& rule) const { -    v[0].logeq(logp0(rule.e_)); -    return r.prob(rule.e_, v.begin(), l.begin()); -  } - -  void Increment(const TRule& rule) { -    v[0].logeq(logp0(rule.e_)); -    if (r.increment(rule.e_, v.begin(), l.begin(), &*prng).count) { -      base *= v[0] * l[0]; -    } -  } - -  void Decrement(const TRule& rule) { -    if (r.decrement(rule.e_, &*prng).count) { -      base /= prob_t(exp(logp0(rule.e_))); -    } -  } - -  prob_t Likelihood() const { -    prob_t p; p.logeq(r.log_crp_prob()); -    p *= base; -    return p; -  } - -  void Summary() const { -    cerr << "NUMBER OF CUSTOMERS: " << r.num_customers() << "  (d=" << r.discount() << ",s=" << r.strength() << ')' << endl; -    for (MFCR<1,vector<WordID> >::const_iterator it = r.begin(); it != r.end(); ++it) -      cerr << "   " << it->second.total_dish_count_ << " (on " << it->second.table_counts_.size() << " tables) " << TD::GetString(it->first) << endl; -  } - -  prob_t base; -  MFCR<1,vector<WordID> > r; -  const double u0; -  const vector<prob_t> l; -  mutable vector<prob_t> v; -}; - -struct BasicLexicalAlignment { -  explicit BasicLexicalAlignment(const vector<vector<WordID> >& lets, -                                 const unsigned words_e, -                                 const unsigned letters_e, -                                 vector<AlignedSentencePair>* corp) : -      letters(lets), -      corpus(*corp), -      //up0(words_e), -      //up0("en.chars.1gram", letters_e), -      //up0("en.words.1gram"), -      up0(letters_e), -      //up0("en.chars.2gram"), -      tmodel(up0) { -  } - -  void InstantiateRule(const WordID src, -                       const WordID trg, -                       TRule* rule) const { -    static const WordID kX = TD::Convert("X") * -1; -    rule->lhs_ = kX; -    rule->e_ = letters[trg]; -    rule->f_ = letters[src]; -  } - -  void InitializeRandom() { -    const WordID kNULL = TD::Convert("NULL"); -    cerr << "Initializing with random 
alignments ...\n"; -    for (unsigned i = 0; i < corpus.size(); ++i) { -      AlignedSentencePair& asp = corpus[i]; -      asp.a.resize(asp.trg.size()); -      for (unsigned j = 0; j < asp.trg.size(); ++j) { -        const unsigned char a_j = prng->next() * (1 + asp.src.size()); -        const WordID f_a_j = (a_j ? asp.src[a_j - 1] : kNULL); -        TRule r; -        InstantiateRule(f_a_j, asp.trg[j], &r); -        asp.a[j].is_transliteration = false; -        asp.a[j].src_index = a_j; -        if (tmodel.IncrementRule(r, &*prng)) -          up0.Increment(r); -      } -    } -    cerr << "  LLH = " << Likelihood() << endl; -  } - -  prob_t Likelihood() const { -    prob_t p = tmodel.Likelihood(); -    p *= up0.Likelihood(); -    return p; -  } - -  void ResampleHyperparemeters() { -    tmodel.ResampleHyperparameters(&*prng); -    up0.ResampleHyperparameters(&*prng); -    cerr << "  (base d=" << up0.r.discount() << ",s=" << up0.r.strength() << ")\n"; -  } - -  void ResampleCorpus(); - -  const vector<vector<WordID> >& letters; // spelling dictionary -  vector<AlignedSentencePair>& corpus; -  //PhraseConditionalUninformativeBase up0; -  //PhraseConditionalUninformativeUnigramBase up0; -  //UnigramWordBase up0; -  //HierarchicalUnigramBase up0; -  HierarchicalWordBase up0; -  //CompletelyUniformBase up0; -  //FixedNgramBase up0; -  //ConditionalTranslationModel<PhraseConditionalUninformativeBase> tmodel; -  //ConditionalTranslationModel<PhraseConditionalUninformativeUnigramBase> tmodel; -  //ConditionalTranslationModel<UnigramWordBase> tmodel; -  //ConditionalTranslationModel<HierarchicalUnigramBase> tmodel; -  MConditionalTranslationModel<HierarchicalWordBase> tmodel; -  //ConditionalTranslationModel<FixedNgramBase> tmodel; -  //ConditionalTranslationModel<CompletelyUniformBase> tmodel; -}; - -void BasicLexicalAlignment::ResampleCorpus() { -  static const WordID kNULL = TD::Convert("NULL"); -  for (unsigned i = 0; i < corpus.size(); ++i) { -    AlignedSentencePair& asp = corpus[i]; -    SampleSet<prob_t> ss; ss.resize(asp.src.size() + 1); -    for (unsigned j = 0; j < asp.trg.size(); ++j) { -      TRule r; -      unsigned char& a_j = asp.a[j].src_index; -      WordID f_a_j = (a_j ? asp.src[a_j - 1] : kNULL); -      InstantiateRule(f_a_j, asp.trg[j], &r); -      if (tmodel.DecrementRule(r, &*prng)) -        up0.Decrement(r); - -      for (unsigned prop_a_j = 0; prop_a_j <= asp.src.size(); ++prop_a_j) { -        const WordID prop_f = (prop_a_j ? asp.src[prop_a_j - 1] : kNULL); -        InstantiateRule(prop_f, asp.trg[j], &r); -        ss[prop_a_j] = tmodel.RuleProbability(r); -      } -      a_j = prng->SelectSample(ss); -      f_a_j = (a_j ? 
asp.src[a_j - 1] : kNULL); -      InstantiateRule(f_a_j, asp.trg[j], &r); -      if (tmodel.IncrementRule(r, &*prng)) -        up0.Increment(r); -    } -  } -  cerr << "  LLH = " << Likelihood() << endl; -} - -void ExtractLetters(const set<WordID>& v, vector<vector<WordID> >* l, set<WordID>* letset = NULL) { -  for (set<WordID>::const_iterator it = v.begin(); it != v.end(); ++it) { -    vector<WordID>& letters = (*l)[*it]; -    if (letters.size()) continue;   // if e and f have the same word - -    const string& w = TD::Convert(*it); -     -    size_t cur = 0; -    while (cur < w.size()) { -      const size_t len = UTF8Len(w[cur]); -      letters.push_back(TD::Convert(w.substr(cur, len))); -      if (letset) letset->insert(letters.back()); -      cur += len; -    } -  } -} - -void Debug(const AlignedSentencePair& asp) { -  cerr << TD::GetString(asp.src) << endl << TD::GetString(asp.trg) << endl; -  Array2D<bool> a(asp.src.size(), asp.trg.size()); -  for (unsigned j = 0; j < asp.trg.size(); ++j) -    if (asp.a[j].src_index) a(asp.a[j].src_index - 1, j) = true; -  cerr << a << endl; -} - -void AddSample(AlignedSentencePair* asp) { -  for (unsigned j = 0; j < asp->trg.size(); ++j) -    asp->posterior(asp->a[j].src_index, j)++; -} - -void WriteAlignments(const AlignedSentencePair& asp) { -  bool first = true; -  for (unsigned j = 0; j < asp.trg.size(); ++j) { -    int src_index = -1; -    int mc = -1; -    for (unsigned i = 0; i <= asp.src.size(); ++i) { -      if (asp.posterior(i, j) > mc) { -        mc = asp.posterior(i, j); -        src_index = i; -      } -    } - -    if (src_index) { -      if (first) first = false; else cout << ' '; -      cout << (src_index - 1) << '-' << j; -    } -  } -  cout << endl; -} - -int main(int argc, char** argv) { -  po::variables_map conf; -  InitCommandLine(argc, argv, &conf); - -  if (conf.count("random_seed")) -    prng.reset(new MT19937(conf["random_seed"].as<uint32_t>())); -  else -    prng.reset(new MT19937); -//  MT19937& rng = *prng; - -  vector<vector<int> > corpuse, corpusf; -  set<int> vocabe, vocabf; -  corpus::ReadParallelCorpus(conf["input"].as<string>(), &corpusf, &corpuse, &vocabf, &vocabe); -  cerr << "f-Corpus size: " << corpusf.size() << " sentences\n"; -  cerr << "f-Vocabulary size: " << vocabf.size() << " types\n"; -  cerr << "e-Corpus size: " << corpuse.size() << " sentences\n"; -  cerr << "e-Vocabulary size: " << vocabe.size() << " types\n"; -  assert(corpusf.size() == corpuse.size()); - -  vector<AlignedSentencePair> corpus(corpuse.size()); -  for (unsigned i = 0; i < corpuse.size(); ++i) { -    corpus[i].src.swap(corpusf[i]); -    corpus[i].trg.swap(corpuse[i]); -    corpus[i].posterior.resize(corpus[i].src.size() + 1, corpus[i].trg.size()); -  } -  corpusf.clear(); corpuse.clear(); - -  vocabf.insert(TD::Convert("NULL")); -  vector<vector<WordID> > letters(TD::NumWords() + 1); -  set<WordID> letset; -  ExtractLetters(vocabe, &letters, &letset); -  ExtractLetters(vocabf, &letters, NULL); -  letters[TD::Convert("NULL")].clear(); - -  // TODO configure this -  const int max_src_chunk = conf["max_src_chunk"].as<unsigned>(); -  const int max_trg_chunk = conf["max_trg_chunk"].as<unsigned>(); -  const double s2t_rat = conf["expected_src_to_trg_ratio"].as<double>(); -  const BackwardEstimator be(conf["s2t"].as<string>(), conf["t2s"].as<string>()); -  Transliterations tl(max_src_chunk, max_trg_chunk, s2t_rat, be);  - -  cerr << "Initializing transliteration graph structures ...\n"; -  for (int i = 0; i < corpus.size(); ++i) { -    const
vector<int>& src = corpus[i].src; -    const vector<int>& trg = corpus[i].trg; -    for (int j = 0; j < src.size(); ++j) { -      const vector<int>& src_let = letters[src[j]]; -      for (int k = 0; k < trg.size(); ++k) { -        const vector<int>& trg_let = letters[trg[k]]; -        tl.Initialize(src[j], src_let, trg[k], trg_let); -        //if (src_let.size() < min_trans_src) -        //  tl.Forbid(src[j], src_let, trg[k], trg_let); -      } -    } -  } -  cerr << endl; -  tl.GraphSummary(); - -  return 0; -} diff --git a/gi/pf/backward.cc b/gi/pf/backward.cc deleted file mode 100644 index b92629fd..00000000 --- a/gi/pf/backward.cc +++ /dev/null @@ -1,89 +0,0 @@ -#include "backward.h" - -#include <queue> -#include <utility> - -#include "array2d.h" -#include "reachability.h" -#include "base_distributions.h" - -using namespace std; - -BackwardEstimator::BackwardEstimator(const string& s2t, -                    const string& t2s) : m1(new Model1(s2t)), m1inv(new Model1(t2s)) {} - -BackwardEstimator::~BackwardEstimator() { -  delete m1; m1 = NULL; -  delete m1inv; m1inv = NULL; -} - -float BackwardEstimator::ComputeBackwardProb(const std::vector<WordID>& src, -                                             const std::vector<WordID>& trg, -                                             unsigned src_covered, -                                             unsigned trg_covered, -                                             double s2t_ratio) const { -  if (src_covered == src.size() || trg_covered == trg.size()) { -    assert(src_covered == src.size()); -    assert(trg_covered == trg.size()); -    return 0; -  } -  static const WordID kNULL = TD::Convert("<eps>"); -  const prob_t uniform_alignment(1.0 / (src.size() - src_covered + 1)); -  // TODO factor in expected length ratio -  prob_t e; e.logeq(Md::log_poisson(trg.size() - trg_covered, (src.size() - src_covered) * s2t_ratio)); // p(trg len remaining | src len remaining) -  for (unsigned j = trg_covered; j < trg.size(); ++j) { -    prob_t p = (*m1)(kNULL, trg[j]) + prob_t(1e-12); -    for (unsigned i = src_covered; i < src.size(); ++i) -      p += (*m1)(src[i], trg[j]); -    if (p.is_0()) { -      cerr << "ERROR: p(" << TD::Convert(trg[j]) << " | " << TD::GetString(src) << ") = 0!\n"; -      assert(!"failed"); -    } -    p *= uniform_alignment; -    e *= p; -  } -  // TODO factor in expected length ratio -  const prob_t inv_uniform(1.0 / (trg.size() - trg_covered + 1.0)); -  prob_t inv; -  inv.logeq(Md::log_poisson(src.size() - src_covered, (trg.size() - trg_covered) / s2t_ratio)); -  for (unsigned i = src_covered; i < src.size(); ++i) { -    prob_t p = (*m1inv)(kNULL, src[i]) + prob_t(1e-12); -    for (unsigned j = trg_covered; j < trg.size(); ++j) -      p += (*m1inv)(trg[j], src[i]); -    if (p.is_0()) { -      cerr << "ERROR: p_inv(" << TD::Convert(src[i]) << " | " << TD::GetString(trg) << ") = 0!\n"; -      assert(!"failed"); -    } -    p *= inv_uniform; -    inv *= p; -  } -  return (log(e) + log(inv)) / 2; -} - -void BackwardEstimator::InitializeGrid(const vector<WordID>& src, -                      const vector<WordID>& trg, -                      const Reachability& r, -                      double s2t_ratio, -                      float* grid) const { -  queue<pair<int,int> > q; -  q.push(make_pair(0,0)); -  Array2D<bool> done(src.size()+1, trg.size()+1, false); -  //cerr << TD::GetString(src) << " ||| " << TD::GetString(trg) << endl; -  while(!q.empty()) { -    const pair<int,int> n = q.front(); -    q.pop(); -    if 
(done(n.first,n.second)) continue; -    done(n.first,n.second) = true; - -    float lp = ComputeBackwardProb(src, trg, n.first, n.second, s2t_ratio); -    if (n.first == 0 && n.second == 0) grid[0] = lp; -    //cerr << "  " << n.first << "," << n.second << "\t" << lp << endl; - -    if (n.first == src.size() || n.second == trg.size()) continue; -    const vector<pair<short,short> >& edges = r.valid_deltas[n.first][n.second]; -    for (int i = 0; i < edges.size(); ++i) -      q.push(make_pair(n.first + edges[i].first, n.second + edges[i].second)); -  } -  //static int cc = 0; ++cc; if (cc == 80) exit(1); -} - diff --git a/gi/pf/backward.h b/gi/pf/backward.h deleted file mode 100644 index e67eff0c..00000000 --- a/gi/pf/backward.h +++ /dev/null @@ -1,33 +0,0 @@ -#ifndef _BACKWARD_H_ -#define _BACKWARD_H_ - -#include <vector> -#include <string> -#include "wordid.h" - -struct Reachability; -struct Model1; - -struct BackwardEstimator { -  BackwardEstimator(const std::string& s2t, -                    const std::string& t2s); -  ~BackwardEstimator(); - -  void InitializeGrid(const std::vector<WordID>& src, -                      const std::vector<WordID>& trg, -                      const Reachability& r, -                      double src2trg_ratio, -                      float* grid) const; - - private: -  float ComputeBackwardProb(const std::vector<WordID>& src, -                            const std::vector<WordID>& trg, -                            unsigned src_covered, -                            unsigned trg_covered, -                            double src2trg_ratio) const; - -  Model1* m1; -  Model1* m1inv; -}; - -#endif diff --git a/gi/pf/base_distributions.cc b/gi/pf/base_distributions.cc deleted file mode 100644 index 57e0bbe1..00000000 --- a/gi/pf/base_distributions.cc +++ /dev/null @@ -1,241 +0,0 @@ -#include "base_distributions.h" - -#include <iostream> - -#include "filelib.h" - -using namespace std; - -TableLookupBase::TableLookupBase(const string& fname) { -  cerr << "TableLookupBase reading from " << fname << " ..." << endl; -  ReadFile rf(fname); -  istream& in = *rf.stream(); -  string line; -  unsigned lc = 0; -  const WordID kDIV = TD::Convert("|||"); -  vector<WordID> tmp; -  vector<int> le, lf; -  TRule x; -  x.lhs_ = -TD::Convert("X"); -  bool flag = false; -  while(getline(in, line)) { -    ++lc; -    if (lc % 1000000 == 0) { cerr << " [" << lc << ']' << endl; flag = false; } -    else if (lc % 25000 == 0) { cerr << '.' 
<< flush; flag = true; } -    tmp.clear(); -    TD::ConvertSentence(line, &tmp); -    x.f_.clear(); -    x.e_.clear(); -    size_t pos = 0; -    int cc = 0; -    while(pos < tmp.size()) { -      const WordID cur = tmp[pos++]; -      if (cur == kDIV) { -        ++cc; -      } else if (cc == 0) { -        x.f_.push_back(cur);     -      } else if (cc == 1) { -        x.e_.push_back(cur); -      } else if (cc == 2) { -        table[x].logeq(atof(TD::Convert(cur).c_str())); -        ++cc; -      } else { -        if (flag) cerr << endl; -        cerr << "Bad format in " << lc << ": " << line << endl; abort(); -      } -    } -    if (cc != 3) { -      if (flag) cerr << endl; -      cerr << "Bad format in " << lc << ": " << line << endl; abort(); -    } -  } -  if (flag) cerr << endl; -  cerr << " read " << lc << " entries\n"; -} - -prob_t PhraseConditionalUninformativeUnigramBase::p0(const vector<WordID>& vsrc, -                                                     const vector<WordID>& vtrg, -                                                     int start_src, int start_trg) const { -  const int flen = vsrc.size() - start_src; -  const int elen = vtrg.size() - start_trg; -  prob_t p; -  p.logeq(Md::log_poisson(elen, flen + 0.01));       // elen | flen          ~Pois(flen + 0.01) -  //p.logeq(log_poisson(elen, 1));       // elen | flen          ~Pois(flen + 0.01) -  for (int i = 0; i < elen; ++i) -    p *= u(vtrg[i + start_trg]);                        // draw e_i             ~Uniform -  return p; -} - -prob_t PhraseConditionalUninformativeBase::p0(const vector<WordID>& vsrc, -                                              const vector<WordID>& vtrg, -                                              int start_src, int start_trg) const { -  const int flen = vsrc.size() - start_src; -  const int elen = vtrg.size() - start_trg; -  prob_t p; -  //p.logeq(log_poisson(elen, flen + 0.01));       // elen | flen          ~Pois(flen + 0.01) -  p.logeq(Md::log_poisson(elen, 1));       // elen | flen          ~Pois(1) -  for (int i = 0; i < elen; ++i) -    p *= kUNIFORM_TARGET;                        // draw e_i             ~Uniform -  return p; -} - -void Model1::LoadModel1(const string& fname) { -  cerr << "Loading Model 1 parameters from " << fname << " ..."
<< endl; -  ReadFile rf(fname); -  istream& in = *rf.stream(); -  string line; -  unsigned lc = 0; -  while(getline(in, line)) { -    ++lc; -    int cur = 0; -    int start = 0; -    while(cur < line.size() && line[cur] != ' ') { ++cur; } -    assert(cur != line.size()); -    line[cur] = 0; -    const WordID src = TD::Convert(&line[0]); -    ++cur; -    start = cur; -    while(cur < line.size() && line[cur] != ' ') { ++cur; } -    assert(cur != line.size()); -    line[cur] = 0; -    WordID trg = TD::Convert(&line[start]); -    const double logprob = strtod(&line[cur + 1], NULL); -    if (src >= ttable.size()) ttable.resize(src + 1); -    ttable[src][trg].logeq(logprob); -  } -  cerr << "  read " << lc << " parameters.\n"; -} - -prob_t PhraseConditionalBase::p0(const vector<WordID>& vsrc, -                                 const vector<WordID>& vtrg, -                                 int start_src, int start_trg) const { -  const int flen = vsrc.size() - start_src; -  const int elen = vtrg.size() - start_trg; -  prob_t uniform_src_alignment; uniform_src_alignment.logeq(-log(flen + 1)); -  prob_t p; -  p.logeq(Md::log_poisson(elen, flen + 0.01));       // elen | flen          ~Pois(flen + 0.01) -  for (int i = 0; i < elen; ++i) {               // for each position i in e-RHS -    const WordID trg = vtrg[i + start_trg]; -    prob_t tp = prob_t::Zero(); -    for (int j = -1; j < flen; ++j) { -      const WordID src = j < 0 ? 0 : vsrc[j + start_src]; -      tp += kM1MIXTURE * model1(src, trg); -      tp += kUNIFORM_MIXTURE * kUNIFORM_TARGET; -    } -    tp *= uniform_src_alignment;                 //     draw a_i         ~uniform -    p *= tp;                                     //     draw e_i         ~Model1(f_a_i) / uniform -  } -  if (p.is_0()) { -    cerr << "Zero! " << vsrc << "\nTRG=" << vtrg << endl; -    abort(); -  } -  return p; -} - -prob_t PhraseJointBase::p0(const vector<WordID>& vsrc, -                           const vector<WordID>& vtrg, -                           int start_src, int start_trg) const { -  const int flen = vsrc.size() - start_src; -  const int elen = vtrg.size() - start_trg; -  prob_t uniform_src_alignment; uniform_src_alignment.logeq(-log(flen + 1)); -  prob_t p; -  p.logeq(Md::log_poisson(flen, 1.0));               // flen                 ~Pois(1) -                                                 // elen | flen          ~Pois(flen + 0.01) -  prob_t ptrglen; ptrglen.logeq(Md::log_poisson(elen, flen + 0.01)); -  p *= ptrglen; -  p *= kUNIFORM_SOURCE.pow(flen);                // each f in F ~Uniform -  for (int i = 0; i < elen; ++i) {               // for each position i in E -    const WordID trg = vtrg[i + start_trg]; -    prob_t tp = prob_t::Zero(); -    for (int j = -1; j < flen; ++j) { -      const WordID src = j < 0 ? 0 : vsrc[j + start_src]; -      tp += kM1MIXTURE * model1(src, trg); -      tp += kUNIFORM_MIXTURE * kUNIFORM_TARGET; -    } -    tp *= uniform_src_alignment;                 //     draw a_i         ~uniform -    p *= tp;                                     //     draw e_i         ~Model1(f_a_i) / uniform -  } -  if (p.is_0()) { -    cerr << "Zero! 
" << vsrc << "\nTRG=" << vtrg << endl; -    abort(); -  } -  return p; -} - -prob_t PhraseJointBase_BiDir::p0(const vector<WordID>& vsrc, -                                 const vector<WordID>& vtrg, -                                 int start_src, int start_trg) const { -  const int flen = vsrc.size() - start_src; -  const int elen = vtrg.size() - start_trg; -  prob_t uniform_src_alignment; uniform_src_alignment.logeq(-log(flen + 1)); -  prob_t uniform_trg_alignment; uniform_trg_alignment.logeq(-log(elen + 1)); - -  prob_t p1; -  p1.logeq(Md::log_poisson(flen, 1.0));               // flen                 ~Pois(1) -                                                 // elen | flen          ~Pois(flen + 0.01) -  prob_t ptrglen; ptrglen.logeq(Md::log_poisson(elen, flen + 0.01)); -  p1 *= ptrglen; -  p1 *= kUNIFORM_SOURCE.pow(flen);                // each f in F ~Uniform -  for (int i = 0; i < elen; ++i) {               // for each position i in E -    const WordID trg = vtrg[i + start_trg]; -    prob_t tp = prob_t::Zero(); -    for (int j = -1; j < flen; ++j) { -      const WordID src = j < 0 ? 0 : vsrc[j + start_src]; -      tp += kM1MIXTURE * model1(src, trg); -      tp += kUNIFORM_MIXTURE * kUNIFORM_TARGET; -    } -    tp *= uniform_src_alignment;                 //     draw a_i         ~uniform -    p1 *= tp;                                     //     draw e_i         ~Model1(f_a_i) / uniform -  } -  if (p1.is_0()) { -    cerr << "Zero! " << vsrc << "\nTRG=" << vtrg << endl; -    abort(); -  } - -  prob_t p2; -  p2.logeq(Md::log_poisson(elen, 1.0));               // elen                 ~Pois(1) -                                                 // flen | elen          ~Pois(flen + 0.01) -  prob_t psrclen; psrclen.logeq(Md::log_poisson(flen, elen + 0.01)); -  p2 *= psrclen; -  p2 *= kUNIFORM_TARGET.pow(elen);                // each f in F ~Uniform -  for (int i = 0; i < flen; ++i) {               // for each position i in E -    const WordID src = vsrc[i + start_src]; -    prob_t tp = prob_t::Zero(); -    for (int j = -1; j < elen; ++j) { -      const WordID trg = j < 0 ? 0 : vtrg[j + start_trg]; -      tp += kM1MIXTURE * invmodel1(trg, src); -      tp += kUNIFORM_MIXTURE * kUNIFORM_SOURCE; -    } -    tp *= uniform_trg_alignment;                 //     draw a_i         ~uniform -    p2 *= tp;                                     //     draw e_i         ~Model1(f_a_i) / uniform -  } -  if (p2.is_0()) { -    cerr << "Zero! 
" << vsrc << "\nTRG=" << vtrg << endl; -    abort(); -  } - -  static const prob_t kHALF(0.5); -  return (p1 + p2) * kHALF; -} - -JumpBase::JumpBase() : p(200) { -  for (unsigned src_len = 1; src_len < 200; ++src_len) { -    map<int, prob_t>& cpd = p[src_len]; -    int min_jump = 1 - src_len; -    int max_jump = src_len; -    prob_t z; -    for (int j = min_jump; j <= max_jump; ++j) { -      prob_t& cp = cpd[j]; -      if (j < 0) -        cp.logeq(Md::log_poisson(1.5-j, 1)); -      else if (j > 0) -        cp.logeq(Md::log_poisson(j, 1)); -      cp.poweq(0.2); -      z += cp; -    } -    for (int j = min_jump; j <= max_jump; ++j) { -      cpd[j] /= z; -    } -  } -} - diff --git a/gi/pf/base_distributions.h b/gi/pf/base_distributions.h deleted file mode 100644 index 41b513f8..00000000 --- a/gi/pf/base_distributions.h +++ /dev/null @@ -1,238 +0,0 @@ -#ifndef _BASE_MEASURES_H_ -#define _BASE_MEASURES_H_ - -#include <vector> -#include <map> -#include <string> -#include <cmath> -#include <iostream> -#include <cassert> - -#include "unigrams.h" -#include "trule.h" -#include "prob.h" -#include "tdict.h" -#include "sampler.h" -#include "m.h" -#include "os_phrase.h" - -struct Model1 { -  explicit Model1(const std::string& fname) : -      kNULL(TD::Convert("<eps>")), -      kZERO() { -    LoadModel1(fname); -  } - -  void LoadModel1(const std::string& fname); - -  // returns prob 0 if src or trg is not found -  const prob_t& operator()(WordID src, WordID trg) const { -    if (src == 0) src = kNULL; -    if (src < ttable.size()) { -      const std::map<WordID, prob_t>& cpd = ttable[src]; -      const std::map<WordID, prob_t>::const_iterator it = cpd.find(trg); -      if (it != cpd.end()) -        return it->second; -    } -    return kZERO; -  } - -  const WordID kNULL; -  const prob_t kZERO; -  std::vector<std::map<WordID, prob_t> > ttable; -}; - -struct PoissonUniformUninformativeBase { -  explicit PoissonUniformUninformativeBase(const unsigned ves) : kUNIFORM(1.0 / ves) {} -  prob_t operator()(const TRule& r) const { -    prob_t p; p.logeq(Md::log_poisson(r.e_.size(), 1.0)); -    prob_t q = kUNIFORM; q.poweq(r.e_.size()); -    p *= q; -    return p; -  } -  void Summary() const {} -  void ResampleHyperparameters(MT19937*) {} -  void Increment(const TRule&) {} -  void Decrement(const TRule&) {} -  prob_t Likelihood() const { return prob_t::One(); } -  const prob_t kUNIFORM; -}; - -struct CompletelyUniformBase { -  explicit CompletelyUniformBase(const unsigned ves) : kUNIFORM(1.0 / ves) {} -  prob_t operator()(const TRule&) const { -    return kUNIFORM; -  } -  void Summary() const {} -  void ResampleHyperparameters(MT19937*) {} -  void Increment(const TRule&) {} -  void Decrement(const TRule&) {} -  prob_t Likelihood() const { return prob_t::One(); } -  const prob_t kUNIFORM; -}; - -struct UnigramWordBase { -  explicit UnigramWordBase(const std::string& fname) : un(fname) {} -  prob_t operator()(const TRule& r) const { -    return un(r.e_); -  } -  const UnigramWordModel un; -}; - -struct RuleHasher { -  size_t operator()(const TRule& r) const { -    return hash_value(r); -  } -}; - -struct TableLookupBase { -  TableLookupBase(const std::string& fname); - -  prob_t operator()(const TRule& rule) const { -    const std::tr1::unordered_map<TRule,prob_t,RuleHasher>::const_iterator it = table.find(rule); -    if (it == table.end()) { -      std::cerr << rule << " not found\n"; -      abort(); -    } -    return it->second; -  } - -  void ResampleHyperparameters(MT19937*) {} -  void Increment(const 
TRule&) {} -  void Decrement(const TRule&) {} -  prob_t Likelihood() const { return prob_t::One(); } -  void Summary() const {} - -  std::tr1::unordered_map<TRule,prob_t,RuleHasher> table; -}; - -struct PhraseConditionalUninformativeBase { -  explicit PhraseConditionalUninformativeBase(const unsigned vocab_e_size) : -      kUNIFORM_TARGET(1.0 / vocab_e_size) { -    assert(vocab_e_size > 0); -  } - -  // return p0 of rule.e_ | rule.f_ -  prob_t operator()(const TRule& rule) const { -    return p0(rule.f_, rule.e_, 0, 0); -  } - -  prob_t p0(const std::vector<WordID>& vsrc, const std::vector<WordID>& vtrg, int start_src, int start_trg) const; - -  void Summary() const {} -  void ResampleHyperparameters(MT19937*) {} -  void Increment(const TRule&) {} -  void Decrement(const TRule&) {} -  prob_t Likelihood() const { return prob_t::One(); } -  const prob_t kUNIFORM_TARGET; -}; - -struct PhraseConditionalUninformativeUnigramBase { -  explicit PhraseConditionalUninformativeUnigramBase(const std::string& file, const unsigned vocab_e_size) : u(file, vocab_e_size) {} - -  // return p0 of rule.e_ | rule.f_ -  prob_t operator()(const TRule& rule) const { -    return p0(rule.f_, rule.e_, 0, 0); -  } - -  prob_t p0(const std::vector<WordID>& vsrc, const std::vector<WordID>& vtrg, int start_src, int start_trg) const; - -  const UnigramModel u; -}; - -struct PhraseConditionalBase { -  explicit PhraseConditionalBase(const Model1& m1, const double m1mixture, const unsigned vocab_e_size) : -      model1(m1), -      kM1MIXTURE(m1mixture), -      kUNIFORM_MIXTURE(1.0 - m1mixture), -      kUNIFORM_TARGET(1.0 / vocab_e_size) { -    assert(m1mixture >= 0.0 && m1mixture <= 1.0); -    assert(vocab_e_size > 0); -  } - -  // return p0 of rule.e_ | rule.f_ -  prob_t operator()(const TRule& rule) const { -    return p0(rule.f_, rule.e_, 0, 0); -  } - -  prob_t p0(const std::vector<WordID>& vsrc, const std::vector<WordID>& vtrg, int start_src, int start_trg) const; - -  const Model1& model1; -  const prob_t kM1MIXTURE;  // Model 1 mixture component -  const prob_t kUNIFORM_MIXTURE; // uniform mixture component -  const prob_t kUNIFORM_TARGET; -}; - -struct PhraseJointBase { -  explicit PhraseJointBase(const Model1& m1, const double m1mixture, const unsigned vocab_e_size, const unsigned vocab_f_size) : -      model1(m1), -      kM1MIXTURE(m1mixture), -      kUNIFORM_MIXTURE(1.0 - m1mixture), -      kUNIFORM_SOURCE(1.0 / vocab_f_size), -      kUNIFORM_TARGET(1.0 / vocab_e_size) { -    assert(m1mixture >= 0.0 && m1mixture <= 1.0); -    assert(vocab_e_size > 0); -  } - -  // return p0 of rule.e_ , rule.f_ -  prob_t operator()(const TRule& rule) const { -    return p0(rule.f_, rule.e_, 0, 0); -  } - -  prob_t p0(const std::vector<WordID>& vsrc, const std::vector<WordID>& vtrg, int start_src, int start_trg) const; - -  const Model1& model1; -  const prob_t kM1MIXTURE;  // Model 1 mixture component -  const prob_t kUNIFORM_MIXTURE; // uniform mixture component -  const prob_t kUNIFORM_SOURCE; -  const prob_t kUNIFORM_TARGET; -}; - -struct PhraseJointBase_BiDir { -  explicit PhraseJointBase_BiDir(const Model1& m1, -                                 const Model1& im1, -                                 const double m1mixture, -                                 const unsigned vocab_e_size, -                                 const unsigned vocab_f_size) : -      model1(m1), -      invmodel1(im1), -      kM1MIXTURE(m1mixture), -      kUNIFORM_MIXTURE(1.0 - m1mixture), -      kUNIFORM_SOURCE(1.0 / vocab_f_size), -      
kUNIFORM_TARGET(1.0 / vocab_e_size) { -    assert(m1mixture >= 0.0 && m1mixture <= 1.0); -    assert(vocab_e_size > 0); -  } - -  // return p0 of rule.e_ , rule.f_ -  prob_t operator()(const TRule& rule) const { -    return p0(rule.f_, rule.e_, 0, 0); -  } - -  prob_t p0(const std::vector<WordID>& vsrc, const std::vector<WordID>& vtrg, int start_src, int start_trg) const; - -  const Model1& model1; -  const Model1& invmodel1; -  const prob_t kM1MIXTURE;  // Model 1 mixture component -  const prob_t kUNIFORM_MIXTURE; // uniform mixture component -  const prob_t kUNIFORM_SOURCE; -  const prob_t kUNIFORM_TARGET; -}; - -// base distribution for jump size multinomials -// basically p(0) = 0, p(1) is maximal, and the probability -// decays as the jump grows toward the maximum jump distance -struct JumpBase { -  JumpBase(); - -  const prob_t& operator()(int jump, unsigned src_len) const { -    assert(jump != 0); -    const std::map<int, prob_t>::const_iterator it = p[src_len].find(jump); -    assert(it != p[src_len].end()); -    return it->second; -  } -  std::vector<std::map<int, prob_t> > p; -}; - - -#endif diff --git a/gi/pf/bayes_lattice_score.cc b/gi/pf/bayes_lattice_score.cc deleted file mode 100644 index 70cb8dc2..00000000 --- a/gi/pf/bayes_lattice_score.cc +++ /dev/null @@ -1,309 +0,0 @@ -#include <iostream> -#include <queue> - -#include <boost/functional.hpp> -#include <boost/program_options.hpp> -#include <boost/program_options/variables_map.hpp> - -#include "inside_outside.h" -#include "hg.h" -#include "hg_io.h" -#include "bottom_up_parser.h" -#include "fdict.h" -#include "grammar.h" -#include "m.h" -#include "trule.h" -#include "tdict.h" -#include "filelib.h" -#include "dict.h" -#include "sampler.h" -#include "ccrp.h" -#include "ccrp_onetable.h" - -using namespace std; -using namespace tr1; -namespace po = boost::program_options; - -boost::shared_ptr<MT19937> prng; - -void InitCommandLine(int argc, char** argv, po::variables_map* conf) { -  po::options_description opts("Configuration options"); -  opts.add_options() -        ("samples,s",po::value<unsigned>()->default_value(1000),"Number of samples") -        ("input,i",po::value<string>(),"Read parallel data from") -        ("random_seed,S",po::value<uint32_t>(), "Random seed"); -  po::options_description clo("Command line options"); -  clo.add_options() -        ("config", po::value<string>(), "Configuration file") -        ("help", "Print this help message and exit"); -  po::options_description dconfig_options, dcmdline_options; -  dconfig_options.add(opts); -  dcmdline_options.add(opts).add(clo); -   -  po::store(parse_command_line(argc, argv, dcmdline_options), *conf); -  if (conf->count("config")) { -    ifstream config((*conf)["config"].as<string>().c_str()); -    po::store(po::parse_config_file(config, dconfig_options), *conf); -  } -  po::notify(*conf); - -  if (conf->count("help") || (conf->count("input") == 0)) { -    cerr << dcmdline_options << endl; -    exit(1); -  } -} - -unsigned ReadCorpus(const string& filename, -                    vector<Lattice>* e, -                    set<WordID>* vocab_e) { -  e->clear(); -  vocab_e->clear(); -  ReadFile rf(filename); -  istream* in = rf.stream(); -  assert(*in); -  string line; -  unsigned toks = 0; -  while(*in) { -    getline(*in, line); -    if (line.empty() && !*in) break; -    e->push_back(Lattice()); -    Lattice& le = e->back(); -    LatticeTools::ConvertTextOrPLF(line, & le); -    for (unsigned i = 0; i < le.size(); ++i) -      for (unsigned j = 0; j < le[i].size(); ++j) -        
vocab_e->insert(le[i][j].label); -    toks += le.size(); -  } -  return toks; -} - -struct BaseModel { -  explicit BaseModel(unsigned tc) : -      unif(1.0 / tc), p(prob_t::One()) {} -  prob_t prob(const TRule& r) const { -    return unif; -  } -  void increment(const TRule& r, MT19937* rng) { -    p *= prob(r); -  } -  void decrement(const TRule& r, MT19937* rng) { -    p /= prob(r); -  } -  prob_t Likelihood() const { -    return p; -  } -  const prob_t unif; -  prob_t p; -}; - -struct UnigramModel { -  explicit UnigramModel(unsigned tc) : base(tc), crp(1,1,1,1), glue(1,1,1,1) {} -  BaseModel base; -  CCRP<TRule> crp; -  CCRP<TRule> glue; - -  prob_t Prob(const TRule& r) const { -    if (r.Arity() != 0) { -      return glue.prob(r, prob_t(0.5)); -    } -    return crp.prob(r, base.prob(r)); -  } - -  int Increment(const TRule& r, MT19937* rng) { -    if (r.Arity() != 0) { -      glue.increment(r, 0.5, rng); -      return 0; -    } else { -      if (crp.increment(r, base.prob(r), rng)) { -        base.increment(r, rng); -        return 1; -      } -      return 0; -    } -  } - -  int Decrement(const TRule& r, MT19937* rng) { -    if (r.Arity() != 0) { -      glue.decrement(r, rng); -      return 0; -    } else { -      if (crp.decrement(r, rng)) { -        base.decrement(r, rng); -        return -1; -      } -      return 0; -    } -  } - -  prob_t Likelihood() const { -    prob_t p; -    p.logeq(crp.log_crp_prob() + glue.log_crp_prob()); -    p *= base.Likelihood(); -    return p; -  } - -  void ResampleHyperparameters(MT19937* rng) { -    crp.resample_hyperparameters(rng); -    glue.resample_hyperparameters(rng); -    cerr << " d=" << crp.discount() << ", s=" << crp.strength() << "\t STOP d=" << glue.discount() << ", s=" << glue.strength() << endl; -  } -}; - -UnigramModel* plm; - -void SampleDerivation(const Hypergraph& hg, MT19937* rng, vector<unsigned>* sampled_deriv) { -  vector<prob_t> node_probs; -  Inside<prob_t, EdgeProb>(hg, &node_probs); -  queue<unsigned> q; -  q.push(hg.nodes_.size() - 2); -  while(!q.empty()) { -    unsigned cur_node_id = q.front(); -//    cerr << "NODE=" << cur_node_id << endl; -    q.pop(); -    const Hypergraph::Node& node = hg.nodes_[cur_node_id]; -    const unsigned num_in_edges = node.in_edges_.size(); -    unsigned sampled_edge = 0; -    if (num_in_edges == 1) { -      sampled_edge = node.in_edges_[0]; -    } else { -      //prob_t z; -      assert(num_in_edges > 1); -      SampleSet<prob_t> ss; -      for (unsigned j = 0; j < num_in_edges; ++j) { -        const Hypergraph::Edge& edge = hg.edges_[node.in_edges_[j]]; -        prob_t p = edge.edge_prob_; -        for (unsigned k = 0; k < edge.tail_nodes_.size(); ++k) -          p *= node_probs[edge.tail_nodes_[k]]; -        ss.add(p); -//        cerr << log(ss[j]) << " ||| " << edge.rule_->AsString() << endl; -        //z += p; -      } -//      for (unsigned j = 0; j < num_in_edges; ++j) { -//        const Hypergraph::Edge& edge = hg.edges_[node.in_edges_[j]]; -//        cerr << exp(log(ss[j] / z)) << " ||| " << edge.rule_->AsString() << endl; -//      } -//      cerr << " --- \n"; -      sampled_edge = node.in_edges_[rng->SelectSample(ss)]; -    } -    sampled_deriv->push_back(sampled_edge); -    const Hypergraph::Edge& edge = hg.edges_[sampled_edge]; -    for (unsigned j = 0; j < edge.tail_nodes_.size(); ++j) { -      q.push(edge.tail_nodes_[j]); -    } -  } -//  for (unsigned i = 0; i < sampled_deriv->size(); ++i) { -//    cerr << *hg.edges_[(*sampled_deriv)[i]].rule_ << endl; -//  } -} - -void 
IncrementDerivation(const Hypergraph& hg, const vector<unsigned>& d, UnigramModel* plm, MT19937* rng) { -  for (unsigned i = 0; i < d.size(); ++i) -    plm->Increment(*hg.edges_[d[i]].rule_, rng); -} - -void DecrementDerivation(const Hypergraph& hg, const vector<unsigned>& d, UnigramModel* plm, MT19937* rng) { -  for (unsigned i = 0; i < d.size(); ++i) -    plm->Decrement(*hg.edges_[d[i]].rule_, rng); -} - -prob_t TotalProb(const Hypergraph& hg) { -  return Inside<prob_t, EdgeProb>(hg); -} - -void IncrementLatticePath(const Hypergraph& hg, const vector<unsigned>& d, Lattice* pl) { -  Lattice& lat = *pl; -  for (int i = 0; i < d.size(); ++i) { -    const Hypergraph::Edge& edge = hg.edges_[d[i]]; -    if (edge.rule_->Arity() != 0) continue; -    WordID sym = edge.rule_->e_[0]; -    vector<LatticeArc>& las = lat[edge.i_]; -    int dist = edge.j_ - edge.i_; -    assert(dist > 0); -    for (int j = 0; j < las.size(); ++j) { -      if (las[j].dist2next == dist && -          las[j].label == sym) { -        las[j].cost += 1; -      } -    } -  } -} - -int main(int argc, char** argv) { -  po::variables_map conf; - -  InitCommandLine(argc, argv, &conf); -  vector<GrammarPtr> grammars(2); -  grammars[0].reset(new GlueGrammar("S","X")); -  const unsigned samples = conf["samples"].as<unsigned>(); - -  if (conf.count("random_seed")) -    prng.reset(new MT19937(conf["random_seed"].as<uint32_t>())); -  else -    prng.reset(new MT19937); -  MT19937& rng = *prng; -  vector<Lattice> corpuse; -  set<WordID> vocabe; -  cerr << "Reading corpus...\n"; -  const unsigned toks = ReadCorpus(conf["input"].as<string>(), &corpuse, &vocabe); -  cerr << "E-corpus size: " << corpuse.size() << " lattices\t (" << vocabe.size() << " word types)\n"; -  UnigramModel lm(vocabe.size()); -  vector<Hypergraph> hgs(corpuse.size()); -  vector<vector<unsigned> > derivs(corpuse.size()); -  for (int i = 0; i < corpuse.size(); ++i) { -    grammars[1].reset(new PassThroughGrammar(corpuse[i], "X")); -    ExhaustiveBottomUpParser parser("S", grammars); -    bool res = parser.Parse(corpuse[i], &hgs[i]);  // exhaustive parse -    assert(res); -  } - -  double csamples = 0; -  for (int SS=0; SS < samples; ++SS) { -    const bool is_last = ((samples - 1) == SS); -    prob_t dlh = prob_t::One(); -    bool record_sample = (SS > (samples * 1 / 3) && (SS % 5 == 3)); -    if (record_sample) csamples++; -    for (int ci = 0; ci < corpuse.size(); ++ci) { -      Lattice& lat = corpuse[ci]; -      Hypergraph& hg = hgs[ci]; -      vector<unsigned>& d = derivs[ci]; -      if (!is_last) DecrementDerivation(hg, d, &lm, &rng); -      for (unsigned i = 0; i < hg.edges_.size(); ++i) { -        TRule& r = *hg.edges_[i].rule_; -        if (r.Arity() != 0) -          hg.edges_[i].edge_prob_ = prob_t::One(); -        else -          hg.edges_[i].edge_prob_ = lm.Prob(r); -      } -      if (!is_last) { -        d.clear(); -        SampleDerivation(hg, &rng, &d); -        IncrementDerivation(hg, derivs[ci], &lm, &rng); -      } else { -        prob_t p = TotalProb(hg); -        dlh *= p; -        cerr << " p(sentence) = " << log(p) << "\t" << log(dlh) << endl; -      } -      if (record_sample) IncrementLatticePath(hg, derivs[ci], &lat); -    } -    double llh = log(lm.Likelihood()); -    cerr << "LLH=" << llh << "\tENTROPY=" << (-llh / log(2) / toks) << "\tPPL=" << pow(2, -llh / log(2) / toks) << endl; -    if (SS % 10 == 9) lm.ResampleHyperparameters(&rng); -    if (is_last) { -      double z = log(dlh); -      cerr << "TOTAL_PROB=" << z << "\tENTROPY=" << (-z / 
log(2) / toks) << "\tPPL=" << pow(2, -z / log(2) / toks) << endl; -    } -  } -  cerr << lm.crp << endl; -  cerr << lm.glue << endl; -  for (int i = 0; i < corpuse.size(); ++i) { -    for (int j = 0; j < corpuse[i].size(); ++j) -      for (int k = 0; k < corpuse[i][j].size(); ++k) { -        corpuse[i][j][k].cost /= csamples; -        corpuse[i][j][k].cost += 1e-3; -        corpuse[i][j][k].cost = log(corpuse[i][j][k].cost); -      } -    cout << HypergraphIO::AsPLF(corpuse[i]) << endl; -  } -  return 0; -} - diff --git a/gi/pf/brat.cc b/gi/pf/brat.cc deleted file mode 100644 index 832f22cf..00000000 --- a/gi/pf/brat.cc +++ /dev/null @@ -1,543 +0,0 @@ -#include <iostream> -#include <tr1/memory> -#include <queue> - -#include <boost/functional.hpp> -#include <boost/multi_array.hpp> -#include <boost/program_options.hpp> -#include <boost/program_options/variables_map.hpp> - -#include "viterbi.h" -#include "hg.h" -#include "trule.h" -#include "tdict.h" -#include "filelib.h" -#include "dict.h" -#include "sampler.h" -#include "ccrp_nt.h" -#include "cfg_wfst_composer.h" - -using namespace std; -using namespace tr1; -namespace po = boost::program_options; - -static unsigned kMAX_SRC_PHRASE; -static unsigned kMAX_TRG_PHRASE; -struct FSTState; - -double log_poisson(unsigned x, const double& lambda) { -  assert(lambda > 0.0); -  return log(lambda) * x - lgamma(x + 1) - lambda; -} - -struct ConditionalBase { -  explicit ConditionalBase(const double m1mixture, const unsigned vocab_e_size, const string& model1fname) : -      kM1MIXTURE(m1mixture), -      kUNIFORM_MIXTURE(1.0 - m1mixture), -      kUNIFORM_TARGET(1.0 / vocab_e_size), -      kNULL(TD::Convert("<eps>")) { -    assert(m1mixture >= 0.0 && m1mixture <= 1.0); -    assert(vocab_e_size > 0); -    LoadModel1(model1fname); -  } - -  void LoadModel1(const string& fname) { -    cerr << "Loading Model 1 parameters from " << fname << " ..." << endl; -    ReadFile rf(fname); -    istream& in = *rf.stream(); -    string line; -    unsigned lc = 0; -    while(getline(in, line)) { -      ++lc; -      int cur = 0; -      int start = 0; -      while(cur < line.size() && line[cur] != ' ') { ++cur; } -      assert(cur != line.size()); -      line[cur] = 0; -      const WordID src = TD::Convert(&line[0]); -      ++cur; -      start = cur; -      while(cur < line.size() && line[cur] != ' ') { ++cur; } -      assert(cur != line.size()); -      line[cur] = 0; -      WordID trg = TD::Convert(&line[start]); -      const double logprob = strtod(&line[cur + 1], NULL); -      if (src >= ttable.size()) ttable.resize(src + 1); -      ttable[src][trg].logeq(logprob); -    } -    cerr << "  read " << lc << " parameters.\n"; -  } - -  // return logp0 of rule.e_ | rule.f_ -  prob_t operator()(const TRule& rule) const { -    const int flen = rule.f_.size(); -    const int elen = rule.e_.size(); -    prob_t uniform_src_alignment; uniform_src_alignment.logeq(-log(flen + 1)); -    prob_t p; -    p.logeq(log_poisson(elen, flen + 0.01));       // elen | flen          ~Pois(flen + 0.01) -    for (int i = 0; i < elen; ++i) {               // for each position i in e-RHS -      const WordID trg = rule.e_[i]; -      prob_t tp = prob_t::Zero(); -      for (int j = -1; j < flen; ++j) { -        const WordID src = j < 0 ? 
kNULL : rule.f_[j]; -        const map<WordID, prob_t>::const_iterator it = ttable[src].find(trg); -        if (it != ttable[src].end()) { -          tp += kM1MIXTURE * it->second; -        } -        tp += kUNIFORM_MIXTURE * kUNIFORM_TARGET; -      } -      tp *= uniform_src_alignment;                 //     draw a_i         ~uniform -      p *= tp;                                     //     draw e_i         ~Model1(f_a_i) / uniform -    } -    return p; -  } - -  const prob_t kM1MIXTURE;  // Model 1 mixture component -  const prob_t kUNIFORM_MIXTURE; // uniform mixture component -  const prob_t kUNIFORM_TARGET; -  const WordID kNULL; -  vector<map<WordID, prob_t> > ttable; -}; - -void InitCommandLine(int argc, char** argv, po::variables_map* conf) { -  po::options_description opts("Configuration options"); -  opts.add_options() -        ("samples,s",po::value<unsigned>()->default_value(1000),"Number of samples") -        ("input,i",po::value<string>(),"Read parallel data from") -        ("max_src_phrase",po::value<unsigned>()->default_value(3),"Maximum length of source language phrases") -        ("max_trg_phrase",po::value<unsigned>()->default_value(3),"Maximum length of target language phrases") -        ("model1,m",po::value<string>(),"Model 1 parameters (used in base distribution)") -        ("model1_interpolation_weight",po::value<double>()->default_value(0.95),"Mixing proportion of model 1 with uniform target distribution") -        ("random_seed,S",po::value<uint32_t>(), "Random seed"); -  po::options_description clo("Command line options"); -  clo.add_options() -        ("config", po::value<string>(), "Configuration file") -        ("help,h", "Print this help message and exit"); -  po::options_description dconfig_options, dcmdline_options; -  dconfig_options.add(opts); -  dcmdline_options.add(opts).add(clo); -   -  po::store(parse_command_line(argc, argv, dcmdline_options), *conf); -  if (conf->count("config")) { -    ifstream config((*conf)["config"].as<string>().c_str()); -    po::store(po::parse_config_file(config, dconfig_options), *conf); -  } -  po::notify(*conf); - -  if (conf->count("help") || (conf->count("input") == 0)) { -    cerr << dcmdline_options << endl; -    exit(1); -  } -} - -void ReadParallelCorpus(const string& filename, -                vector<vector<WordID> >* f, -                vector<vector<int> >* e, -                set<int>* vocab_f, -                set<int>* vocab_e) { -  f->clear(); -  e->clear(); -  vocab_f->clear(); -  vocab_e->clear(); -  istream* in; -  if (filename == "-") -    in = &cin; -  else -    in = new ifstream(filename.c_str()); -  assert(*in); -  string line; -  const WordID kDIV = TD::Convert("|||"); -  vector<WordID> tmp; -  while(*in) { -    getline(*in, line); -    if (line.empty() && !*in) break; -    e->push_back(vector<int>()); -    f->push_back(vector<int>()); -    vector<int>& le = e->back(); -    vector<int>& lf = f->back(); -    tmp.clear(); -    TD::ConvertSentence(line, &tmp); -    bool isf = true; -    for (unsigned i = 0; i < tmp.size(); ++i) { -      const int cur = tmp[i]; -      if (isf) { -        if (kDIV == cur) { isf = false; } else { -          lf.push_back(cur); -          vocab_f->insert(cur); -        } -      } else { -        assert(cur != kDIV); -        le.push_back(cur); -        vocab_e->insert(cur); -      } -    } -    assert(isf == false); -  } -  if (in != &cin) delete in; -} - -struct UniphraseLM { -  UniphraseLM(const vector<vector<int> >& corpus, -              const set<int>& vocab, -           
   const po::variables_map& conf) :
-    phrases_(1,1),
-    gen_(1,1),
-    corpus_(corpus),
-    uniform_word_(1.0 / vocab.size()),
-    gen_p0_(0.5),
-    p_end_(0.5),
-    use_poisson_(conf.count("poisson_length") > 0) {}
-
-  void ResampleHyperparameters(MT19937* rng) {
-    phrases_.resample_hyperparameters(rng);
-    gen_.resample_hyperparameters(rng);
-    cerr << " " << phrases_.alpha();
-  }
-
-  CCRP_NoTable<vector<int> > phrases_;
-  CCRP_NoTable<bool> gen_;
-  vector<vector<bool> > z_;   // z_[i][j]: is there a phrase boundary after the jth word of sentence i?
-  const vector<vector<int> >& corpus_;
-  const double uniform_word_;
-  const double gen_p0_;
-  const double p_end_; // in base length distribution, p of the end of a phrase
-  const bool use_poisson_;
-};
-
-struct Reachability {
-  boost::multi_array<bool, 4> edges;  // edges[src_covered][trg_covered][src_delta][trg_delta] is this edge worth exploring?
-  boost::multi_array<short, 2> max_src_delta; // msd[src_covered][trg_covered] -- the largest src delta that's valid
-
-  Reachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len) :
-      edges(boost::extents[srclen][trglen][src_max_phrase_len+1][trg_max_phrase_len+1]),
-      max_src_delta(boost::extents[srclen][trglen]) {
-    ComputeReachability(srclen, trglen, src_max_phrase_len, trg_max_phrase_len);
-  }
-
- private:
-  struct SState {
-    SState() : prev_src_covered(), prev_trg_covered() {}
-    SState(int i, int j) : prev_src_covered(i), prev_trg_covered(j) {}
-    int prev_src_covered;
-    int prev_trg_covered;
-  };
-
-  struct NState {
-    NState() : next_src_covered(), next_trg_covered() {}
-    NState(int i, int j) : next_src_covered(i), next_trg_covered(j) {}
-    int next_src_covered;
-    int next_trg_covered;
-  };
-
-  void ComputeReachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len) {
-    typedef boost::multi_array<vector<SState>, 2> array_type;
-    array_type a(boost::extents[srclen + 1][trglen + 1]);
-    a[0][0].push_back(SState());
-    for (int i = 0; i < srclen; ++i) {
-      for (int j = 0; j < trglen; ++j) {
-        if (a[i][j].size() == 0) continue;
-        const SState prev(i,j);
-        for (int k = 1; k <= src_max_phrase_len; ++k) {
-          if ((i + k) > srclen) continue;
-          for (int l = 1; l <= trg_max_phrase_len; ++l) {
-            if ((j + l) > trglen) continue;
-            a[i + k][j + l].push_back(prev);
-          }
-        }
-      }
-    }
-    a[0][0].clear();
-    cerr << "Final cell contains " << a[srclen][trglen].size() << " back pointers\n";
-    assert(a[srclen][trglen].size() > 0);
-
-    typedef boost::multi_array<bool, 2> rarray_type;
-    rarray_type r(boost::extents[srclen + 1][trglen + 1]);
-//    typedef boost::multi_array<vector<NState>, 2> narray_type;
-//    narray_type b(boost::extents[srclen + 1][trglen + 1]);
-    r[srclen][trglen] = true;
-    for (int i = srclen; i >= 0; --i) {
-      for (int j = trglen; j >= 0; --j) {
-        vector<SState>& prevs = a[i][j];
-        if (!r[i][j]) { prevs.clear(); }
-//        const NState nstate(i,j);
-        for (int k = 0; k < prevs.size(); ++k) {
-          r[prevs[k].prev_src_covered][prevs[k].prev_trg_covered] = true;
-          int src_delta = i - prevs[k].prev_src_covered;
-          edges[prevs[k].prev_src_covered][prevs[k].prev_trg_covered][src_delta][j - prevs[k].prev_trg_covered] = true;
-          short &msd = max_src_delta[prevs[k].prev_src_covered][prevs[k].prev_trg_covered];
-          if (src_delta >
msd) msd = src_delta; -//          b[prevs[k].prev_src_covered][prevs[k].prev_trg_covered].push_back(nstate); -        } -      } -    } -    assert(!edges[0][0][1][0]); -    assert(!edges[0][0][0][1]); -    assert(!edges[0][0][0][0]); -    cerr << "  MAX SRC DELTA[0][0] = " << max_src_delta[0][0] << endl; -    assert(max_src_delta[0][0] > 0); -    //cerr << "First cell contains " << b[0][0].size() << " forward pointers\n"; -    //for (int i = 0; i < b[0][0].size(); ++i) { -    //  cerr << "  -> (" << b[0][0][i].next_src_covered << "," << b[0][0][i].next_trg_covered << ")\n"; -    //} -  } -}; - -ostream& operator<<(ostream& os, const FSTState& q); -struct FSTState { -  explicit FSTState(int src_size) : -      trg_covered_(), -      src_covered_(), -      src_coverage_(src_size) {} - -  FSTState(short trg_covered, short src_covered, const vector<bool>& src_coverage, const vector<short>& src_prefix) : -      trg_covered_(trg_covered), -      src_covered_(src_covered), -      src_coverage_(src_coverage), -      src_prefix_(src_prefix) { -    if (src_coverage_.size() == src_covered) { -      assert(src_prefix.size() == 0); -    } -  } - -  // if we extend by the word at src_position, what are -  // the next states that are reachable and lie on a valid -  // path to the final state? -  vector<FSTState> Extensions(int src_position, int src_len, int trg_len, const Reachability& r) const { -    assert(src_position < src_coverage_.size()); -    if (src_coverage_[src_position]) { -      cerr << "Trying to extend " << *this << " with position " << src_position << endl; -      abort(); -    } -    vector<bool> ncvg = src_coverage_; -    ncvg[src_position] = true; - -    vector<FSTState> res; -    const int trg_remaining = trg_len - trg_covered_; -    if (trg_remaining <= 0) { -      cerr << "Target appears to have been covered: " << *this << " (trg_len=" << trg_len << ",trg_covered=" << trg_covered_ << ")" << endl; -      abort(); -    } -    const int src_remaining = src_len - src_covered_; -    if (src_remaining <= 0) { -      cerr << "Source appears to have been covered: " << *this << endl; -      abort(); -    } - -    for (int tc = 1; tc <= kMAX_TRG_PHRASE; ++tc) { -      if (r.edges[src_covered_][trg_covered_][src_prefix_.size() + 1][tc]) { -        int nc = src_prefix_.size() + 1 + src_covered_; -        res.push_back(FSTState(trg_covered_ + tc, nc, ncvg, vector<short>())); -      } -    } - -    if ((src_prefix_.size() + 1) < r.max_src_delta[src_covered_][trg_covered_]) { -      vector<short> nsp = src_prefix_; -      nsp.push_back(src_position); -      res.push_back(FSTState(trg_covered_, src_covered_, ncvg, nsp)); -    } - -    if (res.size() == 0) { -      cerr << *this << " can't be extended!\n"; -      abort(); -    } -    return res; -  } - -  short trg_covered_, src_covered_; -  vector<bool> src_coverage_; -  vector<short> src_prefix_; -}; -bool operator<(const FSTState& q, const FSTState& r) { -  if (q.trg_covered_ != r.trg_covered_) return q.trg_covered_ < r.trg_covered_; -  if (q.src_covered_!= r.src_covered_) return q.src_covered_ < r.src_covered_; -  if (q.src_coverage_ != r.src_coverage_) return q.src_coverage_ < r.src_coverage_; -  return q.src_prefix_ < r.src_prefix_; -} - -ostream& operator<<(ostream& os, const FSTState& q) { -  os << "[" << q.trg_covered_ << " : "; -  for (int i = 0; i < q.src_coverage_.size(); ++i) -    os << q.src_coverage_[i]; -  os << " : <"; -  for (int i = 0; i < q.src_prefix_.size(); ++i) { -    if (i != 0) os << ' '; -    os << q.src_prefix_[i]; -  } 
-  return os << ">]"; -} - -struct MyModel { -  MyModel(ConditionalBase& rcp0) : rp0(rcp0) {} -  typedef unordered_map<vector<WordID>, CCRP_NoTable<TRule>, boost::hash<vector<WordID> > > SrcToRuleCRPMap; - -  void DecrementRule(const TRule& rule) { -    SrcToRuleCRPMap::iterator it = rules.find(rule.f_); -    assert(it != rules.end()); -    it->second.decrement(rule); -    if (it->second.num_customers() == 0) rules.erase(it); -  } - -  void IncrementRule(const TRule& rule) { -    SrcToRuleCRPMap::iterator it = rules.find(rule.f_); -    if (it == rules.end()) { -      CCRP_NoTable<TRule> crp(1,1); -      it = rules.insert(make_pair(rule.f_, crp)).first; -    } -    it->second.increment(rule); -  } - -  // conditioned on rule.f_ -  prob_t RuleConditionalProbability(const TRule& rule) const { -    const prob_t base = rp0(rule); -    SrcToRuleCRPMap::const_iterator it = rules.find(rule.f_); -    if (it == rules.end()) { -      return base; -    } else { -      const double lp = it->second.logprob(rule, log(base)); -      prob_t q; q.logeq(lp); -      return q; -    } -  } - -  const ConditionalBase& rp0; -  SrcToRuleCRPMap rules; -}; - -struct MyFST : public WFST { -  MyFST(const vector<WordID>& ssrc, const vector<WordID>& strg, MyModel* m) : -      src(ssrc), trg(strg), -      r(src.size(),trg.size(),kMAX_SRC_PHRASE, kMAX_TRG_PHRASE), -      model(m) { -    FSTState in(src.size()); -    cerr << " INIT: " << in << endl; -    init = GetNode(in); -    for (int i = 0; i < in.src_coverage_.size(); ++i) in.src_coverage_[i] = true; -    in.src_covered_ = src.size(); -    in.trg_covered_ = trg.size(); -    cerr << "FINAL: " << in << endl; -    final = GetNode(in); -  } -  virtual const WFSTNode* Final() const; -  virtual const WFSTNode* Initial() const; - -  const WFSTNode* GetNode(const FSTState& q); -  map<FSTState, boost::shared_ptr<WFSTNode> > m; -  const vector<WordID>& src; -  const vector<WordID>& trg; -  Reachability r; -  const WFSTNode* init; -  const WFSTNode* final; -  MyModel* model; -}; - -struct MyNode : public WFSTNode { -  MyNode(const FSTState& q, MyFST* fst) : state(q), container(fst) {} -  virtual vector<pair<const WFSTNode*, TRulePtr> > ExtendInput(unsigned srcindex) const; -  const FSTState state; -  mutable MyFST* container; -}; - -vector<pair<const WFSTNode*, TRulePtr> > MyNode::ExtendInput(unsigned srcindex) const { -  cerr << "EXTEND " << state << " with " << srcindex << endl; -  vector<FSTState> ext = state.Extensions(srcindex, container->src.size(), container->trg.size(), container->r); -  vector<pair<const WFSTNode*,TRulePtr> > res(ext.size()); -  for (unsigned i = 0; i < ext.size(); ++i) { -    res[i].first = container->GetNode(ext[i]); -    if (ext[i].src_prefix_.size() == 0) { -      const unsigned trg_from = state.trg_covered_; -      const unsigned trg_to = ext[i].trg_covered_; -      const unsigned prev_prfx_size = state.src_prefix_.size(); -      res[i].second.reset(new TRule); -      res[i].second->lhs_ = -TD::Convert("X"); -      vector<WordID>& src = res[i].second->f_; -      vector<WordID>& trg = res[i].second->e_; -      src.resize(prev_prfx_size + 1); -      for (unsigned j = 0; j < prev_prfx_size; ++j) -        src[j] = container->src[state.src_prefix_[j]]; -      src[prev_prfx_size] = container->src[srcindex]; -      for (unsigned j = trg_from; j < trg_to; ++j) -        trg.push_back(container->trg[j]); -      res[i].second->scores_.set_value(FD::Convert("Proposal"), log(container->model->RuleConditionalProbability(*res[i].second))); -    } -  } -  return 
res;
-}
-
-const WFSTNode* MyFST::GetNode(const FSTState& q) {
-  boost::shared_ptr<WFSTNode>& res = m[q];
-  if (!res) {
-    res.reset(new MyNode(q, this));
-  }
-  return &*res;
-}
-
-const WFSTNode* MyFST::Final() const {
-  return final;
-}
-
-const WFSTNode* MyFST::Initial() const {
-  return init;
-}
-
-int main(int argc, char** argv) {
-  po::variables_map conf;
-  InitCommandLine(argc, argv, &conf);
-  kMAX_TRG_PHRASE = conf["max_trg_phrase"].as<unsigned>();
-  kMAX_SRC_PHRASE = conf["max_src_phrase"].as<unsigned>();
-
-  if (!conf.count("model1")) {
-    cerr << argv[0] << ": please use --model1 to specify model 1 parameters\n";
-    return 1;
-  }
-  boost::shared_ptr<MT19937> prng;
-  if (conf.count("random_seed"))
-    prng.reset(new MT19937(conf["random_seed"].as<uint32_t>()));
-  else
-    prng.reset(new MT19937);
-  MT19937& rng = *prng;
-
-  vector<vector<int> > corpuse, corpusf;
-  set<int> vocabe, vocabf;
-  ReadParallelCorpus(conf["input"].as<string>(), &corpusf, &corpuse, &vocabf, &vocabe);
-  cerr << "f-Corpus size: " << corpusf.size() << " sentences\n";
-  cerr << "f-Vocabulary size: " << vocabf.size() << " types\n";
-  cerr << "e-Corpus size: " << corpuse.size() << " sentences\n";
-  cerr << "e-Vocabulary size: " << vocabe.size() << " types\n";
-  assert(corpusf.size() == corpuse.size());
-
-  ConditionalBase lp0(conf["model1_interpolation_weight"].as<double>(),
-                      vocabe.size(),
-                      conf["model1"].as<string>());
-  MyModel m(lp0);
-
-  TRule x("[X] ||| kAnwntR myN ||| at the convent ||| 0");
-  m.IncrementRule(x);
-  TRule y("[X] ||| nY dyN ||| gave ||| 0");
-  m.IncrementRule(y);
-
-
-  MyFST fst(corpusf[0], corpuse[0], &m);
-  ifstream in("./kimura.g");
-  assert(in);
-  CFG_WFSTComposer comp(fst);
-  Hypergraph hg;
-  bool succeed = comp.Compose(&in, &hg);
-  hg.PrintGraphviz();
-  if (succeed) { cerr << "SUCCESS.\n"; } else { cerr << "FAILURE REPORTED.\n"; }
-
-#if 0
-  ifstream in2("./amnabooks.g");
-  assert(in2);
-  MyFST fst2(corpusf[1], corpuse[1], &m);
-  CFG_WFSTComposer comp2(fst2);
-  Hypergraph hg2;
-  bool succeed2 = comp2.Compose(&in2, &hg2);
-  if (succeed2) { cerr << "SUCCESS.\n"; } else { cerr << "FAILURE REPORTED.\n"; }
-#endif
-
-  SparseVector<double> w; w.set_value(FD::Convert("Proposal"), 1.0);
-  hg.Reweight(w);
-  cerr << ViterbiFTree(hg) << endl;
-  return 0;
-}
-
diff --git a/gi/pf/cbgi.cc b/gi/pf/cbgi.cc
deleted file mode 100644
index 97f1ba34..00000000
--- a/gi/pf/cbgi.cc
+++ /dev/null
@@ -1,330 +0,0 @@
-#include <queue>
-#include <sstream>
-#include <iostream>
-
-#include <boost/unordered_map.hpp>
-#include <boost/functional/hash.hpp>
-
-#include "sampler.h"
-#include "filelib.h"
-#include "hg_io.h"
-#include "hg.h"
-#include "ccrp_nt.h"
-#include "trule.h"
-#include "inside_outside.h"
-
-using namespace std;
-using namespace std::tr1;
-
-double log_poisson(unsigned x, const double& lambda) {
-  assert(lambda > 0.0);
-  return log(lambda) * x - lgamma(x + 1) - lambda;
-}
-
-double log_decay(unsigned x, const double& b) {
-  assert(b > 1.0);
-  assert(x > 0);
-  return log(b - 1) - x * log(b);
-}
-
-struct SimpleBase {
-  SimpleBase(unsigned esize, unsigned fsize, unsigned ntsize = 144) :
-    uniform_e(-log(esize)),
-    uniform_f(-log(fsize)),
-    uniform_nt(-log(ntsize)) {
-  }
-
-  // binomial coefficient
-  static double choose(unsigned n, unsigned k) {
-    return exp(lgamma(n + 1) - lgamma(k + 1) - lgamma(n - k + 1));
-  }
-
-  // count the number of patterns of terminals and NTs in the
rule, given elen and flen -  static double log_number_of_patterns(const unsigned flen, const unsigned elen) { -    static vector<vector<double> > counts; -    if (elen >= counts.size()) counts.resize(elen + 1); -    if (flen >= counts[elen].size()) counts[elen].resize(flen + 1); -    double& count = counts[elen][flen]; -    if (count) return log(count); -    const unsigned max_arity = min(elen, flen); -    for (unsigned a = 0; a <= max_arity; ++a) -      count += choose(elen, a) * choose(flen, a); -    return log(count); -  } - -  // return logp0 of rule | LHS -  double operator()(const TRule& rule) const { -    const unsigned flen = rule.f_.size(); -    const unsigned elen = rule.e_.size(); -#if 0 -    double p = 0; -    p += log_poisson(flen, 0.5);                   // flen                 ~Pois(0.5) -    p += log_poisson(elen, flen);                  // elen | flen          ~Pois(flen) -    p -= log_number_of_patterns(flen, elen);       // pattern | flen,elen  ~Uniform -    for (unsigned i = 0; i < flen; ++i) {          // for each position in f-RHS -      if (rule.f_[i] <= 0)                         //   according to pattern -        p += uniform_nt;                           //     draw NT          ~Uniform -      else -        p += uniform_f;                            //     draw f terminal  ~Uniform -    } -    p -= lgamma(rule.Arity() + 1);                 // draw permutation     ~Uniform  -    for (unsigned i = 0; i < elen; ++i) {          // for each position in e-RHS -      if (rule.e_[i] > 0)                          //   according to pattern -        p += uniform_e;                            //     draw e|f term    ~Uniform -        // TODO this should prob be model 1 -    } -#else -    double p = 0; -    bool is_abstract = rule.f_[0] <= 0; -    p += log(0.5); -    if (is_abstract) { -      if (flen == 2) p += log(0.99); else p += log(0.01); -    } else { -      p += log_decay(flen, 3); -    } - -    for (unsigned i = 0; i < flen; ++i) {          // for each position in f-RHS -      if (rule.f_[i] <= 0)                         //   according to pattern -        p += uniform_nt;                           //     draw NT          ~Uniform -      else -        p += uniform_f;                            //     draw f terminal  ~Uniform -    } -#endif -    return p; -  } -  const double uniform_e; -  const double uniform_f; -  const double uniform_nt; -  vector<double> arities; -}; - -MT19937* rng = NULL; - -template <typename Base> -struct MHSamplerEdgeProb { -  MHSamplerEdgeProb(const Hypergraph& hg, -                  const map<int, CCRP_NoTable<TRule> >& rdp, -                  const Base& logp0, -                  const bool exclude_multiword_terminals) : edge_probs(hg.edges_.size()) { -    for (int i = 0; i < edge_probs.size(); ++i) { -      const TRule& rule = *hg.edges_[i].rule_; -      const map<int, CCRP_NoTable<TRule> >::const_iterator it = rdp.find(rule.lhs_); -      assert(it != rdp.end()); -      const CCRP_NoTable<TRule>& crp = it->second; -      edge_probs[i].logeq(crp.logprob(rule, logp0(rule))); -      if (exclude_multiword_terminals && rule.f_[0] > 0 && rule.f_.size() > 1) -        edge_probs[i] = prob_t::Zero(); -    } -  } -  inline prob_t operator()(const Hypergraph::Edge& e) const { -    return edge_probs[e.id_]; -  } -  prob_t DerivationProb(const vector<int>& d) const { -    prob_t p = prob_t::One(); -    for (unsigned i = 0; i < d.size(); ++i) -      p *= edge_probs[d[i]]; -    return p; -  } -  vector<prob_t> edge_probs; -}; - -template <typename Base> 
-struct ModelAndData {
-  ModelAndData() :
-     base_lh(prob_t::One()),
-     logp0(10000, 10000),
-     mh_samples(),
-     mh_rejects() {}
-
-  void SampleCorpus(const string& hgpath, int n);
-  void ResampleHyperparameters() {
-    for (map<int, CCRP_NoTable<TRule> >::iterator it = rules.begin(); it != rules.end(); ++it)
-      it->second.resample_hyperparameters(rng);
-  }
-
-  CCRP_NoTable<TRule>& RuleCRP(int lhs) {
-    map<int, CCRP_NoTable<TRule> >::iterator it = rules.find(lhs);
-    if (it == rules.end()) {
-      rules.insert(make_pair(lhs, CCRP_NoTable<TRule>(1,1)));
-      it = rules.find(lhs);
-    }
-    return it->second;
-  }
-
-  void IncrementRule(const TRule& rule) {
-    CCRP_NoTable<TRule>& crp = RuleCRP(rule.lhs_);
-    if (crp.increment(rule)) {
-      prob_t p; p.logeq(logp0(rule));
-      base_lh *= p;
-    }
-  }
-
-  void DecrementRule(const TRule& rule) {
-    CCRP_NoTable<TRule>& crp = RuleCRP(rule.lhs_);
-    if (crp.decrement(rule)) {
-      prob_t p; p.logeq(logp0(rule));
-      base_lh /= p;
-    }
-  }
-
-  void DecrementDerivation(const Hypergraph& hg, const vector<int>& d) {
-    for (unsigned i = 0; i < d.size(); ++i) {
-      const TRule& rule = *hg.edges_[d[i]].rule_;
-      DecrementRule(rule);
-    }
-  }
-
-  void IncrementDerivation(const Hypergraph& hg, const vector<int>& d) {
-    for (unsigned i = 0; i < d.size(); ++i) {
-      const TRule& rule = *hg.edges_[d[i]].rule_;
-      IncrementRule(rule);
-    }
-  }
-
-  prob_t Likelihood() const {
-    prob_t p = prob_t::One();
-    for (map<int, CCRP_NoTable<TRule> >::const_iterator it = rules.begin(); it != rules.end(); ++it) {
-      prob_t q; q.logeq(it->second.log_crp_prob());
-      p *= q;
-    }
-    p *= base_lh;
-    return p;
-  }
-
-  void ResampleDerivation(const Hypergraph& hg, vector<int>* sampled_derivation);
-
-  map<int, CCRP_NoTable<TRule> > rules;  // [lhs] -> distribution over RHSs
-  prob_t base_lh;
-  SimpleBase logp0;
-  vector<vector<int> > samples;   // sampled derivations
-  unsigned int mh_samples;
-  unsigned int mh_rejects;
-};
-
-template <typename Base>
-void ModelAndData<Base>::SampleCorpus(const string& hgpath, int n) {
-  vector<Hypergraph> hgs; hgs.reserve(n);
-  boost::unordered_map<TRule, unsigned> acc;
-  map<int, unsigned> tot;
-  for (int i = 0; i < n; ++i) {
-    ostringstream os;
-    os << hgpath << '/' << i << ".json.gz";
-    if (!FileExists(os.str())) continue;
-    hgs.push_back(Hypergraph());
-    ReadFile rf(os.str());
-    HypergraphIO::ReadFromJSON(rf.stream(), &hgs.back());
-  }
-  cerr << "Read " << hgs.size() << " alignment hypergraphs.\n";
-  samples.resize(hgs.size());
-  const unsigned SAMPLES = 2000;
-  const unsigned burnin = 3 * SAMPLES / 4;
-  const unsigned every = 20;
-  for (unsigned s = 0; s < SAMPLES; ++s) {
-    if (s % 10 == 0) {
-      if (s > 0) { cerr << endl; ResampleHyperparameters(); }
-      cerr << "[" << s << " LLH=" << log(Likelihood()) << " REJECTS=" << ((double)mh_rejects / mh_samples) << " LHS's=" << rules.size() << " base=" << log(base_lh) << "] ";
-    }
-    cerr << '.';
-    for (unsigned i = 0; i < hgs.size(); ++i) {
-      ResampleDerivation(hgs[i], &samples[i]);
-      if (s > burnin && s % every == 0) {
-        for (unsigned j = 0; j < samples[i].size(); ++j) {
-          const TRule& rule = *hgs[i].edges_[samples[i][j]].rule_;
-          ++acc[rule];
-          ++tot[rule.lhs_];
-        }
-      }
-    }
-  }
-  cerr << endl;
-  for (boost::unordered_map<TRule,unsigned>::iterator it = acc.begin(); it !=
acc.end(); ++it) { -    cout << it->first << " MyProb=" << log(it->second)-log(tot[it->first.lhs_]) << endl; -  } -} - -template <typename Base> -void ModelAndData<Base>::ResampleDerivation(const Hypergraph& hg, vector<int>* sampled_deriv) { -  vector<int> cur; -  cur.swap(*sampled_deriv); - -  const prob_t p_cur = Likelihood(); -  DecrementDerivation(hg, cur); -  if (cur.empty()) { -    // first iteration, create restaurants -    for (int i = 0; i < hg.edges_.size(); ++i) -      RuleCRP(hg.edges_[i].rule_->lhs_); -  } -  MHSamplerEdgeProb<SimpleBase> wf(hg, rules, logp0, cur.empty()); -//  MHSamplerEdgeProb<SimpleBase> wf(hg, rules, logp0, false); -  const prob_t q_cur = wf.DerivationProb(cur); -  vector<prob_t> node_probs; -  Inside<prob_t, MHSamplerEdgeProb<SimpleBase> >(hg, &node_probs, wf); -  queue<unsigned> q; -  q.push(hg.nodes_.size() - 3); -  while(!q.empty()) { -    unsigned cur_node_id = q.front(); -//    cerr << "NODE=" << cur_node_id << endl; -    q.pop(); -    const Hypergraph::Node& node = hg.nodes_[cur_node_id]; -    const unsigned num_in_edges = node.in_edges_.size(); -    unsigned sampled_edge = 0; -    if (num_in_edges == 1) { -      sampled_edge = node.in_edges_[0]; -    } else { -      prob_t z; -      assert(num_in_edges > 1); -      SampleSet<prob_t> ss; -      for (unsigned j = 0; j < num_in_edges; ++j) { -        const Hypergraph::Edge& edge = hg.edges_[node.in_edges_[j]]; -        prob_t p = wf.edge_probs[edge.id_];             // edge proposal prob -        for (unsigned k = 0; k < edge.tail_nodes_.size(); ++k) -          p *= node_probs[edge.tail_nodes_[k]]; -        ss.add(p); -//        cerr << log(ss[j]) << " ||| " << edge.rule_->AsString() << endl; -        z += p; -      } -//      for (unsigned j = 0; j < num_in_edges; ++j) { -//        const Hypergraph::Edge& edge = hg.edges_[node.in_edges_[j]]; -//        cerr << exp(log(ss[j] / z)) << " ||| " << edge.rule_->AsString() << endl; -//      } -//      cerr << " --- \n"; -      sampled_edge = node.in_edges_[rng->SelectSample(ss)]; -    } -    sampled_deriv->push_back(sampled_edge); -    const Hypergraph::Edge& edge = hg.edges_[sampled_edge]; -    for (unsigned j = 0; j < edge.tail_nodes_.size(); ++j) { -      q.push(edge.tail_nodes_[j]); -    } -  } -  IncrementDerivation(hg, *sampled_deriv); - -//  cerr << "sampled derivation contains " << sampled_deriv->size() << " edges\n"; -//  cerr << "DERIV:\n"; -//  for (int i = 0; i < sampled_deriv->size(); ++i) { -//    cerr << "  " << hg.edges_[(*sampled_deriv)[i]].rule_->AsString() << endl; -//  } - -  if (cur.empty()) return;  // accept first sample - -  ++mh_samples; -  // only need to do MH if proposal is different to current state -  if (cur != *sampled_deriv) { -    const prob_t q_prop = wf.DerivationProb(*sampled_deriv); -    const prob_t p_prop = Likelihood(); -    if (!rng->AcceptMetropolisHastings(p_prop, p_cur, q_prop, q_cur)) { -      ++mh_rejects; -      DecrementDerivation(hg, *sampled_deriv); -      IncrementDerivation(hg, cur); -      swap(cur, *sampled_deriv); -    } -  } -} - -int main(int argc, char** argv) { -  rng = new MT19937; -  ModelAndData<SimpleBase> m; -  m.SampleCorpus("./hgs", 50); -  // m.SampleCorpus("./btec/hgs", 5000); -  return 0; -} - diff --git a/gi/pf/cfg_wfst_composer.cc b/gi/pf/cfg_wfst_composer.cc deleted file mode 100644 index 21d5ec5b..00000000 --- a/gi/pf/cfg_wfst_composer.cc +++ /dev/null @@ -1,731 +0,0 @@ -#include "cfg_wfst_composer.h" - -#include <iostream> -#include <fstream> -#include <map> -#include <queue> 
-#include <tr1/unordered_map> -#include <tr1/unordered_set> - -#include <boost/shared_ptr.hpp> -#include <boost/program_options.hpp> -#include <boost/program_options/variables_map.hpp> -#include "fast_lexical_cast.hpp" - -#include "phrasetable_fst.h" -#include "sparse_vector.h" -#include "tdict.h" -#include "hg.h" -#include "hg_remove_eps.h" - -namespace po = boost::program_options; -using namespace std; -using namespace std::tr1; - -WFSTNode::~WFSTNode() {} -WFST::~WFST() {} - -// Define the following macro if you want to see lots of debugging output -// when you run the chart parser -#undef DEBUG_CHART_PARSER - -// A few constants used by the chart parser /////////////// -static const int kMAX_NODES = 2000000; -static const string kPHRASE_STRING = "X"; -static bool constants_need_init = true; -static WordID kUNIQUE_START; -static WordID kPHRASE; -static TRulePtr kX1X2; -static TRulePtr kX1; -static WordID kEPS; -static TRulePtr kEPSRule; - -static void InitializeConstants() { -  if (constants_need_init) { -    kPHRASE = TD::Convert(kPHRASE_STRING) * -1; -    kUNIQUE_START = TD::Convert("S") * -1; -    kX1X2.reset(new TRule("[X] ||| [X,1] [X,2] ||| [X,1] [X,2]")); -    kX1.reset(new TRule("[X] ||| [X,1] ||| [X,1]")); -    kEPSRule.reset(new TRule("[X] ||| <eps> ||| <eps>")); -    kEPS = TD::Convert("<eps>"); -    constants_need_init = false; -  } -} -//////////////////////////////////////////////////////////// - -class EGrammarNode { -  friend bool CFG_WFSTComposer::Compose(const Hypergraph& src_forest, Hypergraph* trg_forest); -  friend void AddGrammarRule(const string& r, map<WordID, EGrammarNode>* g); - public: -#ifdef DEBUG_CHART_PARSER -  string hint; -#endif -  EGrammarNode() : is_some_rule_complete(false), is_root(false) {} -  const map<WordID, EGrammarNode>& GetTerminals() const { return tptr; } -  const map<WordID, EGrammarNode>& GetNonTerminals() const { return ntptr; } -  bool HasNonTerminals() const { return (!ntptr.empty()); } -  bool HasTerminals() const { return (!tptr.empty()); } -  bool RuleCompletes() const { -    return (is_some_rule_complete || (ntptr.empty() && tptr.empty())); -  } -  bool GrammarContinues() const { -    return !(ntptr.empty() && tptr.empty()); -  } -  bool IsRoot() const { -    return is_root; -  } -  // these are the features associated with the rule from the start -  // node up to this point.  If you use these features, you must -  // not Extend() this rule. 
-  const SparseVector<double>& GetCFGProductionFeatures() const { -    return input_features; -  } - -  const EGrammarNode* Extend(const WordID& t) const { -    if (t < 0) { -      map<WordID, EGrammarNode>::const_iterator it = ntptr.find(t); -      if (it == ntptr.end()) return NULL; -      return &it->second; -    } else { -      map<WordID, EGrammarNode>::const_iterator it = tptr.find(t); -      if (it == tptr.end()) return NULL; -      return &it->second; -    } -  } - - private: -  map<WordID, EGrammarNode> tptr; -  map<WordID, EGrammarNode> ntptr; -  SparseVector<double> input_features; -  bool is_some_rule_complete; -  bool is_root; -}; -typedef map<WordID, EGrammarNode> EGrammar;    // indexed by the rule LHS - -// edges are immutable once created -struct Edge { -#ifdef DEBUG_CHART_PARSER -  static int id_count; -  const int id; -#endif -  const WordID cat;                   // lhs side of rule proved/being proved -  const EGrammarNode* const dot;      // dot position -  const WFSTNode* const q;             // start of span -  const WFSTNode* const r;             // end of span -  const Edge* const active_parent;    // back pointer, NULL for PREDICT items -  const Edge* const passive_parent;   // back pointer, NULL for SCAN and PREDICT items -  TRulePtr tps;   // translations -  boost::shared_ptr<SparseVector<double> > features; // features from CFG rule - -  bool IsPassive() const { -    // when a rule is completed, this value will be set -    return static_cast<bool>(features); -  } -  bool IsActive() const { return !IsPassive(); } -  bool IsInitial() const { -    return !(active_parent || passive_parent); -  } -  bool IsCreatedByScan() const { -    return active_parent && !passive_parent && !dot->IsRoot(); -  } -  bool IsCreatedByPredict() const { -    return dot->IsRoot(); -  } -  bool IsCreatedByComplete() const { -    return active_parent && passive_parent; -  } - -  // constructor for PREDICT -  Edge(WordID c, const EGrammarNode* d, const WFSTNode* q_and_r) : -#ifdef DEBUG_CHART_PARSER -    id(++id_count), -#endif -    cat(c), dot(d), q(q_and_r), r(q_and_r), active_parent(NULL), passive_parent(NULL), tps() {} -  Edge(WordID c, const EGrammarNode* d, const WFSTNode* q_and_r, const Edge* act_parent) : -#ifdef DEBUG_CHART_PARSER -    id(++id_count), -#endif -    cat(c), dot(d), q(q_and_r), r(q_and_r), active_parent(act_parent), passive_parent(NULL), tps() {} - -  // constructors for SCAN -  Edge(WordID c, const EGrammarNode* d, const WFSTNode* i, const WFSTNode* j, -       const Edge* act_par, const TRulePtr& translations) : -#ifdef DEBUG_CHART_PARSER -    id(++id_count), -#endif -    cat(c), dot(d), q(i), r(j), active_parent(act_par), passive_parent(NULL), tps(translations) {} - -  Edge(WordID c, const EGrammarNode* d, const WFSTNode* i, const WFSTNode* j, -       const Edge* act_par, const TRulePtr& translations, -       const SparseVector<double>& feats) : -#ifdef DEBUG_CHART_PARSER -    id(++id_count), -#endif -    cat(c), dot(d), q(i), r(j), active_parent(act_par), passive_parent(NULL), tps(translations), -    features(new SparseVector<double>(feats)) {} - -  // constructors for COMPLETE -  Edge(WordID c, const EGrammarNode* d, const WFSTNode* i, const WFSTNode* j, -       const Edge* act_par, const Edge *pas_par) : -#ifdef DEBUG_CHART_PARSER -    id(++id_count), -#endif -    cat(c), dot(d), q(i), r(j), active_parent(act_par), passive_parent(pas_par), tps() { -      assert(pas_par->IsPassive()); -      assert(act_par->IsActive()); -    } - -  Edge(WordID c, const 
EGrammarNode* d, const WFSTNode* i, const WFSTNode* j,
-       const Edge* act_par, const Edge *pas_par, const SparseVector<double>& feats) :
-#ifdef DEBUG_CHART_PARSER
-    id(++id_count),
-#endif
-    cat(c), dot(d), q(i), r(j), active_parent(act_par), passive_parent(pas_par), tps(),
-    features(new SparseVector<double>(feats)) {
-      assert(pas_par->IsPassive());
-      assert(act_par->IsActive());
-    }
-
-  // constructor for COMPLETE query
-  Edge(const WFSTNode* _r) :
-#ifdef DEBUG_CHART_PARSER
-    id(0),
-#endif
-    cat(0), dot(NULL), q(NULL),
-    r(_r), active_parent(NULL), passive_parent(NULL), tps() {}
-  // constructor for MERGE query
-  Edge(const WFSTNode* _q, int) :
-#ifdef DEBUG_CHART_PARSER
-    id(0),
-#endif
-    cat(0), dot(NULL), q(_q),
-    r(NULL), active_parent(NULL), passive_parent(NULL), tps() {}
-};
-#ifdef DEBUG_CHART_PARSER
-int Edge::id_count = 0;
-#endif
-
-ostream& operator<<(ostream& os, const Edge& e) {
-  string type = "PREDICT";
-  if (e.IsCreatedByScan())
-    type = "SCAN";
-  else if (e.IsCreatedByComplete())
-    type = "COMPLETE";
-  os << "["
-#ifdef DEBUG_CHART_PARSER
-     << '(' << e.id << ") "
-#else
-     << '(' << &e << ") "
-#endif
-     << "q=" << e.q << ", r=" << e.r
-     << ", cat="<< TD::Convert(e.cat*-1) << ", dot="
-     << e.dot
-#ifdef DEBUG_CHART_PARSER
-     << e.dot->hint
-#endif
-     << (e.IsActive() ? ", Active" : ", Passive")
-     << ", " << type;
-#ifdef DEBUG_CHART_PARSER
-  if (e.active_parent) { os << ", act.parent=(" << e.active_parent->id << ')'; }
-  if (e.passive_parent) { os << ", psv.parent=(" << e.passive_parent->id << ')'; }
-#endif
-  if (e.tps) { os << ", tps=" << e.tps->AsString(); }
-  return os << ']';
-}
-
-struct Traversal {
-  const Edge* const edge;      // result from the active / passive combination
-  const Edge* const active;
-  const Edge* const passive;
-  Traversal(const Edge* me, const Edge* a, const Edge* p) : edge(me), active(a), passive(p) {}
-};
-
-struct UniqueTraversalHash {
-  size_t operator()(const Traversal* t) const {
-    size_t x = 5381;
-    x = ((x << 5) + x) ^ reinterpret_cast<size_t>(t->active);
-    x = ((x << 5) + x) ^ reinterpret_cast<size_t>(t->passive);
-    x = ((x << 5) + x) ^ t->edge->IsActive();
-    return x;
-  }
-};
-
-struct UniqueTraversalEquals {
-  bool operator()(const Traversal* a, const Traversal* b) const {
-    return (a->passive == b->passive && a->active == b->active && a->edge->IsActive() == b->edge->IsActive());
-  }
-};
-
-struct UniqueEdgeHash {
-  size_t operator()(const Edge* e) const {
-    size_t x = 5381;
-    if (e->IsActive()) {
-      x = ((x << 5) + x) ^ reinterpret_cast<size_t>(e->dot);
-      x = ((x << 5) + x) ^ reinterpret_cast<size_t>(e->q);
-      x = ((x << 5) + x) ^ reinterpret_cast<size_t>(e->r);
-      x = ((x << 5) + x) ^ static_cast<size_t>(e->cat);
-      x += 13;
-    } else {  // with passive edges, we don't care about the dot
-      x = ((x << 5) + x) ^ reinterpret_cast<size_t>(e->q);
-      x = ((x << 5) + x) ^ reinterpret_cast<size_t>(e->r);
-      x = ((x << 5) + x) ^ static_cast<size_t>(e->cat);
-    }
-    return x;
-  }
-};
-
-struct UniqueEdgeEquals {
-  bool operator()(const Edge* a, const Edge* b) const {
-    if (a->IsActive() != b->IsActive()) return false;
-    if (a->IsActive()) {
-      return (a->cat == b->cat) && (a->dot == b->dot) && (a->q == b->q) && (a->r == b->r);
-    } else {
-      return (a->cat == b->cat) && (a->q == b->q) && (a->r == b->r);
-    }
-  }
-};
-
-struct REdgeHash {
-  size_t
operator()(const Edge* e) const { -    size_t x = 5381; -    x = ((x << 5) + x) ^ reinterpret_cast<size_t>(e->r); -    return x; -  } -}; - -struct REdgeEquals { -  bool operator()(const Edge* a, const Edge* b) const { -    return (a->r == b->r); -  } -}; - -struct QEdgeHash { -  size_t operator()(const Edge* e) const { -    size_t x = 5381; -    x = ((x << 5) + x) ^ reinterpret_cast<size_t>(e->q); -    return x; -  } -}; - -struct QEdgeEquals { -  bool operator()(const Edge* a, const Edge* b) const { -    return (a->q == b->q); -  } -}; - -struct EdgeQueue { -  queue<const Edge*> q; -  EdgeQueue() {} -  void clear() { while(!q.empty()) q.pop(); } -  bool HasWork() const { return !q.empty(); } -  const Edge* Next() { const Edge* res = q.front(); q.pop(); return res; } -  void AddEdge(const Edge* s) { q.push(s); } -}; - -class CFG_WFSTComposerImpl { - public: -  CFG_WFSTComposerImpl(WordID start_cat, -                       const WFSTNode* q_0, -                       const WFSTNode* q_final) : start_cat_(start_cat), q_0_(q_0), q_final_(q_final) {} - -  // returns false if the intersection is empty -  bool Compose(const EGrammar& g, Hypergraph* forest) { -    goal_node = NULL; -    EGrammar::const_iterator sit = g.find(start_cat_); -    forest->ReserveNodes(kMAX_NODES); -    assert(sit != g.end()); -    Edge* init = new Edge(start_cat_, &sit->second, q_0_); -    assert(IncorporateNewEdge(init)); -    while (exp_agenda.HasWork() || agenda.HasWork()) { -      while(exp_agenda.HasWork()) { -        const Edge* edge = exp_agenda.Next(); -        FinishEdge(edge, forest); -      } -      if (agenda.HasWork()) { -        const Edge* edge = agenda.Next(); -#ifdef DEBUG_CHART_PARSER -        cerr << "processing (" << edge->id << ')' << endl; -#endif -        if (edge->IsActive()) { -          if (edge->dot->HasTerminals()) -            DoScan(edge); -          if (edge->dot->HasNonTerminals()) { -            DoMergeWithPassives(edge); -            DoPredict(edge, g); -          } -        } else { -          DoComplete(edge); -        } -      } -    } -    if (goal_node) { -      forest->PruneUnreachable(goal_node->id_); -      RemoveEpsilons(forest, kEPS); -    } -    FreeAll(); -    return goal_node; -  } - -  void FreeAll() { -    for (int i = 0; i < free_list_.size(); ++i) -      delete free_list_[i]; -    free_list_.clear(); -    for (int i = 0; i < traversal_free_list_.size(); ++i) -      delete traversal_free_list_[i]; -    traversal_free_list_.clear(); -    all_traversals.clear(); -    exp_agenda.clear(); -    agenda.clear(); -    tps2node.clear(); -    edge2node.clear(); -    all_edges.clear(); -    passive_edges.clear(); -    active_edges.clear(); -  } - -  ~CFG_WFSTComposerImpl() { -    FreeAll(); -  } - -  // returns the total number of edges created during composition -  int EdgesCreated() const { -    return free_list_.size(); -  } - - private: -  void DoScan(const Edge* edge) { -    // here, we assume that the FST will potentially have many more outgoing -    // edges than the grammar, which will be just a couple.  If you want to -    // efficiently handle the case where both are relatively large, this code -    // will need to change how the intersection is done.  The best general -    // solution would probably be the Baeza-Yates double binary search. 
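-    // [Editor's note: a minimal sketch of that Baeza-Yates-style intersection,
-    // assuming (hypothetically) that both the grammar trie and the WFST node
-    // could expose their outgoing labels as sorted WordID arrays -- an API
-    // neither currently provides. Split one sorted set at its midpoint,
-    // binary-search the pivot in the other, and recurse on the two halves:
-    //
-    //   void Intersect(const WordID* a, int n, const WordID* b, int m,
-    //                  vector<WordID>* out) {
-    //     if (n == 0 || m == 0) return;
-    //     const int mid = n / 2;
-    //     const WordID* lo = lower_bound(b, b + m, a[mid]);
-    //     Intersect(a, mid, b, lo - b, out);            // left halves
-    //     if (lo != b + m && *lo == a[mid]) { out->push_back(a[mid]); ++lo; }
-    //     Intersect(a + mid + 1, n - mid - 1, lo, (b + m) - lo, out);  // right halves
-    //   }
-    //
-    // This replaces a linear scan per symbol with binary searches, which wins
-    // when one side is much larger than the other.]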
- -    const EGrammarNode* dot = edge->dot; -    const WFSTNode* r = edge->r; -    const map<WordID, EGrammarNode>& terms = dot->GetTerminals(); -    for (map<WordID, EGrammarNode>::const_iterator git = terms.begin(); -         git != terms.end(); ++git) { - -      if (!(TD::Convert(git->first)[0] >= '0' && TD::Convert(git->first)[0] <= '9')) { -        std::cerr << "TERMINAL SYMBOL: " << TD::Convert(git->first) << endl; -        abort(); -      } -      std::vector<std::pair<const WFSTNode*, TRulePtr> > extensions = r->ExtendInput(atoi(TD::Convert(git->first).c_str())); -      for (unsigned nsi = 0; nsi < extensions.size(); ++nsi) { -        const WFSTNode* next_r = extensions[nsi].first; -        const EGrammarNode* next_dot = &git->second; -        const bool grammar_continues = next_dot->GrammarContinues(); -        const bool rule_completes    = next_dot->RuleCompletes(); -        if (extensions[nsi].second) -          cerr << "!!! " << extensions[nsi].second->AsString() << endl; -        // cerr << "  rule completes: " << rule_completes << " after consuming " << TD::Convert(git->first) << endl; -        assert(grammar_continues || rule_completes); -        const SparseVector<double>& input_features = next_dot->GetCFGProductionFeatures(); -        if (rule_completes) -          IncorporateNewEdge(new Edge(edge->cat, next_dot, edge->q, next_r, edge, extensions[nsi].second, input_features)); -        if (grammar_continues) -          IncorporateNewEdge(new Edge(edge->cat, next_dot, edge->q, next_r, edge, extensions[nsi].second)); -      } -    } -  } - -  void DoPredict(const Edge* edge, const EGrammar& g) { -    const EGrammarNode* dot = edge->dot; -    const map<WordID, EGrammarNode>& non_terms = dot->GetNonTerminals(); -    for (map<WordID, EGrammarNode>::const_iterator git = non_terms.begin(); -         git != non_terms.end(); ++git) { -      const WordID nt_to_predict = git->first; -      //cerr << edge->id << " -- " << TD::Convert(nt_to_predict*-1) << endl; -      EGrammar::const_iterator egi = g.find(nt_to_predict); -      if (egi == g.end()) { -        cerr << "[ERROR] Can't find any grammar rules with a LHS of type " -             << TD::Convert(-1*nt_to_predict) << '!' 
<< endl; -        continue; -      } -      assert(edge->IsActive()); -      const EGrammarNode* new_dot = &egi->second; -      Edge* new_edge = new Edge(nt_to_predict, new_dot, edge->r, edge); -      IncorporateNewEdge(new_edge); -    } -  } - -  void DoComplete(const Edge* passive) { -#ifdef DEBUG_CHART_PARSER -    cerr << "  complete: " << *passive << endl; -#endif -    const WordID completed_nt = passive->cat; -    const WFSTNode* q = passive->q; -    const WFSTNode* next_r = passive->r; -    const Edge query(q); -    const pair<unordered_multiset<const Edge*, REdgeHash, REdgeEquals>::iterator, -         unordered_multiset<const Edge*, REdgeHash, REdgeEquals>::iterator > p = -      active_edges.equal_range(&query); -    for (unordered_multiset<const Edge*, REdgeHash, REdgeEquals>::iterator it = p.first; -         it != p.second; ++it) { -      const Edge* active = *it; -#ifdef DEBUG_CHART_PARSER -      cerr << "    pos: " << *active << endl; -#endif -      const EGrammarNode* next_dot = active->dot->Extend(completed_nt); -      if (!next_dot) continue; -      const SparseVector<double>& input_features = next_dot->GetCFGProductionFeatures(); -      // add up to 2 rules -      if (next_dot->RuleCompletes()) -        IncorporateNewEdge(new Edge(active->cat, next_dot, active->q, next_r, active, passive, input_features)); -      if (next_dot->GrammarContinues()) -        IncorporateNewEdge(new Edge(active->cat, next_dot, active->q, next_r, active, passive)); -    } -  } - -  void DoMergeWithPassives(const Edge* active) { -    // edge is active, has non-terminals, we need to find the passives that can extend it -    assert(active->IsActive()); -    assert(active->dot->HasNonTerminals()); -#ifdef DEBUG_CHART_PARSER -    cerr << "  merge active with passives: ACT=" << *active << endl; -#endif -    const Edge query(active->r, 1); -    const pair<unordered_multiset<const Edge*, QEdgeHash, QEdgeEquals>::iterator, -         unordered_multiset<const Edge*, QEdgeHash, QEdgeEquals>::iterator > p = -      passive_edges.equal_range(&query); -    for (unordered_multiset<const Edge*, QEdgeHash, QEdgeEquals>::iterator it = p.first; -         it != p.second; ++it) { -      const Edge* passive = *it; -      const EGrammarNode* next_dot = active->dot->Extend(passive->cat); -      if (!next_dot) continue; -      const WFSTNode* next_r = passive->r; -      const SparseVector<double>& input_features = next_dot->GetCFGProductionFeatures(); -      if (next_dot->RuleCompletes()) -        IncorporateNewEdge(new Edge(active->cat, next_dot, active->q, next_r, active, passive, input_features)); -      if (next_dot->GrammarContinues()) -        IncorporateNewEdge(new Edge(active->cat, next_dot, active->q, next_r, active, passive)); -    } -  } - -  // take ownership of edge memory, add to various indexes, etc -  // returns true if this edge is new -  bool IncorporateNewEdge(Edge* edge) { -    free_list_.push_back(edge); -    if (edge->passive_parent && edge->active_parent) { -      Traversal* t = new Traversal(edge, edge->active_parent, edge->passive_parent); -      traversal_free_list_.push_back(t); -      if (all_traversals.find(t) != all_traversals.end()) { -        return false; -      } else { -        all_traversals.insert(t); -      } -    } -    exp_agenda.AddEdge(edge); -    return true; -  } - -  bool FinishEdge(const Edge* edge, Hypergraph* hg) { -    bool is_new = false; -    if (all_edges.find(edge) == all_edges.end()) { -#ifdef DEBUG_CHART_PARSER -      cerr << *edge << " is NEW\n"; -#endif -      
all_edges.insert(edge); -      is_new = true; -      if (edge->IsPassive()) passive_edges.insert(edge); -      if (edge->IsActive()) active_edges.insert(edge); -      agenda.AddEdge(edge); -    } else { -#ifdef DEBUG_CHART_PARSER -      cerr << *edge << " is NOT NEW.\n"; -#endif -    } -    AddEdgeToTranslationForest(edge, hg); -    return is_new; -  } - -  // build the translation forest -  void AddEdgeToTranslationForest(const Edge* edge, Hypergraph* hg) { -    assert(hg->nodes_.size() < kMAX_NODES); -    Hypergraph::Node* tps = NULL; -    // first add any target language rules -    if (edge->tps) { -      Hypergraph::Node*& node = tps2node[(size_t)edge->tps.get()]; -      if (!node) { -        // cerr << "Creating phrases for " << edge->tps << endl; -        const TRulePtr& rule = edge->tps; -        node = hg->AddNode(kPHRASE); -        Hypergraph::Edge* hg_edge = hg->AddEdge(rule, Hypergraph::TailNodeVector()); -        hg_edge->feature_values_ += rule->GetFeatureValues(); -        hg->ConnectEdgeToHeadNode(hg_edge, node); -      } -      tps = node; -    } -    Hypergraph::Node*& head_node = edge2node[edge]; -    if (!head_node) -      head_node = hg->AddNode(kPHRASE); -    if (edge->cat == start_cat_ && edge->q == q_0_ && edge->r == q_final_ && edge->IsPassive()) { -      assert(goal_node == NULL || goal_node == head_node); -      goal_node = head_node; -    } -    Hypergraph::TailNodeVector tail; -    SparseVector<double> extra; -    if (edge->IsCreatedByPredict()) { -      // extra.set_value(FD::Convert("predict"), 1); -    } else if (edge->IsCreatedByScan()) { -      tail.push_back(edge2node[edge->active_parent]->id_); -      if (tps) { -        tail.push_back(tps->id_); -      } -      //extra.set_value(FD::Convert("scan"), 1); -    } else if (edge->IsCreatedByComplete()) { -      tail.push_back(edge2node[edge->active_parent]->id_); -      tail.push_back(edge2node[edge->passive_parent]->id_); -      //extra.set_value(FD::Convert("complete"), 1); -    } else { -      assert(!"unexpected edge type!"); -    } -    //cerr << head_node->id_ << "<--" << *edge << endl; - -#ifdef DEBUG_CHART_PARSER -      for (int i = 0; i < tail.size(); ++i) -        if (tail[i] == head_node->id_) { -          cerr << "ERROR: " << *edge << "\n   i=" << i << endl; -          if (i == 1) { cerr << "\tP: " << *edge->passive_parent << endl; } -          if (i == 0) { cerr << "\tA: " << *edge->active_parent << endl; } -          assert(!"self-loop found!"); -        } -#endif -    Hypergraph::Edge* hg_edge = NULL; -    if (tail.size() == 0) { -      hg_edge = hg->AddEdge(kEPSRule, tail); -    } else if (tail.size() == 1) { -      hg_edge = hg->AddEdge(kX1, tail); -    } else if (tail.size() == 2) { -      hg_edge = hg->AddEdge(kX1X2, tail); -    } -    if (edge->features) -      hg_edge->feature_values_ += *edge->features; -    hg_edge->feature_values_ += extra; -    hg->ConnectEdgeToHeadNode(hg_edge, head_node); -  } - -  Hypergraph::Node* goal_node; -  EdgeQueue exp_agenda; -  EdgeQueue agenda; -  unordered_map<size_t, Hypergraph::Node*> tps2node; -  unordered_map<const Edge*, Hypergraph::Node*, UniqueEdgeHash, UniqueEdgeEquals> edge2node; -  unordered_set<const Traversal*, UniqueTraversalHash, UniqueTraversalEquals> all_traversals; -  unordered_set<const Edge*, UniqueEdgeHash, UniqueEdgeEquals> all_edges; -  unordered_multiset<const Edge*, QEdgeHash, QEdgeEquals> passive_edges; -  unordered_multiset<const Edge*, REdgeHash, REdgeEquals> active_edges; -  vector<Edge*> free_list_; -  vector<Traversal*> 
traversal_free_list_; -  const WordID start_cat_; -  const WFSTNode* const q_0_; -  const WFSTNode* const q_final_; -}; - -#ifdef DEBUG_CHART_PARSER -static string TrimRule(const string& r) { -  size_t start = r.find(" |||") + 5; -  size_t end = r.rfind(" |||"); -  return r.substr(start, end - start); -} -#endif - -void AddGrammarRule(const string& r, EGrammar* g) { -  const size_t pos = r.find(" ||| "); -  if (pos == string::npos || r[0] != '[') { -    cerr << "Bad rule: " << r << endl; -    return; -  } -  const size_t rpos = r.rfind(" ||| "); -  string feats; -  string rs = r; -  if (rpos != pos) { -    feats = r.substr(rpos + 5); -    rs = r.substr(0, rpos); -  } -  string rhs = rs.substr(pos + 5); -  string trule = rs + " ||| " + rhs + " ||| " + feats; -  TRule tr(trule); -  cerr << "X: " << tr.e_[0] << endl; -#ifdef DEBUG_CHART_PARSER -  string hint_last_rule; -#endif -  EGrammarNode* cur = &(*g)[tr.GetLHS()]; -  cur->is_root = true; -  for (int i = 0; i < tr.FLength(); ++i) { -    WordID sym = tr.f()[i]; -#ifdef DEBUG_CHART_PARSER -    hint_last_rule = TD::Convert(sym < 0 ? -sym : sym); -    cur->hint += " <@@> (*" + hint_last_rule + ") " + TrimRule(tr.AsString()); -#endif -    if (sym < 0) -      cur = &cur->ntptr[sym]; -    else -      cur = &cur->tptr[sym]; -  } -#ifdef DEBUG_CHART_PARSER -  cur->hint += " <@@> (" + hint_last_rule + "*) " + TrimRule(tr.AsString()); -#endif -  cur->is_some_rule_complete = true; -  cur->input_features = tr.GetFeatureValues(); -} - -CFG_WFSTComposer::~CFG_WFSTComposer() { -  delete pimpl_; -} - -CFG_WFSTComposer::CFG_WFSTComposer(const WFST& wfst) { -  InitializeConstants(); -  pimpl_ = new CFG_WFSTComposerImpl(kUNIQUE_START, wfst.Initial(), wfst.Final()); -} - -bool CFG_WFSTComposer::Compose(const Hypergraph& src_forest, Hypergraph* trg_forest) { -  // first, convert the src forest into an EGrammar -  EGrammar g; -  const int nedges = src_forest.edges_.size(); -  const int nnodes = src_forest.nodes_.size(); -  vector<int> cats(nnodes); -  bool assign_cats = false; -  for (int i = 0; i < nnodes; ++i) -    if (assign_cats) { -      cats[i] = TD::Convert("CAT_" + boost::lexical_cast<string>(i)) * -1; -    } else { -      cats[i] = src_forest.nodes_[i].cat_; -    } -  // construct the grammar -  for (int i = 0; i < nedges; ++i) { -    const Hypergraph::Edge& edge = src_forest.edges_[i]; -    const vector<WordID>& src = edge.rule_->f(); -    EGrammarNode* cur = &g[cats[edge.head_node_]]; -    cur->is_root = true; -    int ntc = 0; -    for (int j = 0; j < src.size(); ++j) { -      WordID sym = src[j]; -      if (sym <= 0) { -        sym = cats[edge.tail_nodes_[ntc]]; -        ++ntc; -        cur = &cur->ntptr[sym]; -      } else { -        cur = &cur->tptr[sym]; -      } -    } -    cur->is_some_rule_complete = true; -    cur->input_features = edge.feature_values_; -  } -  EGrammarNode& goal_rule = g[kUNIQUE_START]; -  assert((goal_rule.ntptr.size() == 1 && goal_rule.tptr.size() == 0) || -         (goal_rule.ntptr.size() == 0 && goal_rule.tptr.size() == 1)); - -  return pimpl_->Compose(g, trg_forest); -} - -bool CFG_WFSTComposer::Compose(istream* in, Hypergraph* trg_forest) { -  EGrammar g; -  while(*in) { -    string line; -    getline(*in, line); -    if (line.empty()) continue; -    AddGrammarRule(line, &g); -  } - -  return pimpl_->Compose(g, trg_forest); -} diff --git a/gi/pf/cfg_wfst_composer.h b/gi/pf/cfg_wfst_composer.h deleted file mode 100644 index cf47f459..00000000 --- a/gi/pf/cfg_wfst_composer.h +++ /dev/null @@ -1,46 +0,0 @@ -#ifndef 
_CFG_WFST_COMPOSER_H_ -#define _CFG_WFST_COMPOSER_H_ - -#include <iostream> -#include <vector> -#include <utility> - -#include "trule.h" -#include "wordid.h" - -class CFG_WFSTComposerImpl; -class Hypergraph; - -struct WFSTNode { -  virtual ~WFSTNode(); -  // returns the next states reachable by consuming srcindex (which identifies a word) -  // paired with the output string generated by taking that transition. -  virtual std::vector<std::pair<const WFSTNode*,TRulePtr> > ExtendInput(unsigned srcindex) const = 0; -}; - -struct WFST { -  virtual ~WFST(); -  virtual const WFSTNode* Final() const = 0; -  virtual const WFSTNode* Initial() const = 0; -}; - -class CFG_WFSTComposer { - public: -  ~CFG_WFSTComposer(); -  explicit CFG_WFSTComposer(const WFST& wfst); -  bool Compose(const Hypergraph& in_forest, Hypergraph* trg_forest); - -  // reads the grammar from a file. There must be a single top-level -  // S -> X rule.  Anything else is possible. Format is: -  // [S] ||| [SS,1] -  // [SS] ||| [NP,1] [VP,2] ||| Feature1=0.2 Feature2=-2.3 -  // [SS] ||| [VP,1] [NP,2] ||| Feature1=0.8 -  // [NP] ||| [DET,1] [N,2] ||| Feature3=2 -  // ... -  bool Compose(std::istream* grammar_file, Hypergraph* trg_forest); - - private: -  CFG_WFSTComposerImpl* pimpl_; -}; - -#endif diff --git a/gi/pf/conditional_pseg.h b/gi/pf/conditional_pseg.h deleted file mode 100644 index 81ddb206..00000000 --- a/gi/pf/conditional_pseg.h +++ /dev/null @@ -1,275 +0,0 @@ -#ifndef _CONDITIONAL_PSEG_H_ -#define _CONDITIONAL_PSEG_H_ - -#include <vector> -#include <tr1/unordered_map> -#include <boost/functional/hash.hpp> -#include <iostream> - -#include "m.h" -#include "prob.h" -#include "ccrp_nt.h" -#include "mfcr.h" -#include "trule.h" -#include "base_distributions.h" -#include "tdict.h" - -template <typename ConditionalBaseMeasure> -struct MConditionalTranslationModel { -  explicit MConditionalTranslationModel(ConditionalBaseMeasure& rcp0) : -    rp0(rcp0), d(0.5), strength(1.0), lambdas(1, prob_t::One()), p0s(1) {} - -  void Summary() const { -    std::cerr << "Number of conditioning contexts: " << r.size() << std::endl; -    for (RuleModelHash::const_iterator it = r.begin(); it != r.end(); ++it) { -      std::cerr << TD::GetString(it->first) << "   \t(d=" << it->second.discount() << ",s=" << it->second.strength() << ") --------------------------" << std::endl; -      for (MFCR<1,TRule>::const_iterator i2 = it->second.begin(); i2 != it->second.end(); ++i2) -        std::cerr << "   " << i2->second.total_dish_count_ << '\t' << i2->first << std::endl; -    } -  } - -  double log_likelihood(const double& dd, const double& aa) const { -    if (aa <= -dd) return -std::numeric_limits<double>::infinity(); -    //double llh = Md::log_beta_density(dd, 10, 3) + Md::log_gamma_density(aa, 1, 1); -    double llh = Md::log_beta_density(dd, 1, 1) + -                 Md::log_gamma_density(dd + aa, 1, 1); -    typename std::tr1::unordered_map<std::vector<WordID>, MFCR<1,TRule>, boost::hash<std::vector<WordID> > >::const_iterator it; -    for (it = r.begin(); it != r.end(); ++it) -      llh += it->second.log_crp_prob(dd, aa); -    return llh; -  } - -  struct DiscountResampler { -    DiscountResampler(const MConditionalTranslationModel& m) : m_(m) {} -    const MConditionalTranslationModel& m_; -    double operator()(const double& proposed_discount) const { -      return m_.log_likelihood(proposed_discount, m_.strength); -    } -  }; - -  struct AlphaResampler { -    AlphaResampler(const MConditionalTranslationModel& m) : m_(m) {} -    const 
MConditionalTranslationModel& m_; -    double operator()(const double& proposed_strength) const { -      return m_.log_likelihood(m_.d, proposed_strength); -    } -  }; - -  void ResampleHyperparameters(MT19937* rng) { -    typename std::tr1::unordered_map<std::vector<WordID>, MFCR<1,TRule>, boost::hash<std::vector<WordID> > >::iterator it; -#if 1 -    for (it = r.begin(); it != r.end(); ++it) { -      it->second.resample_hyperparameters(rng); -    } -#else -    const unsigned nloop = 5; -    const unsigned niterations = 10; -    DiscountResampler dr(*this); -    AlphaResampler ar(*this); -    for (int iter = 0; iter < nloop; ++iter) { -      strength = slice_sampler1d(ar, strength, *rng, -d + std::numeric_limits<double>::min(), -                              std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations); -      double min_discount = std::numeric_limits<double>::min(); -      if (strength < 0.0) min_discount -= strength; -      d = slice_sampler1d(dr, d, *rng, min_discount, -                          1.0, 0.0, niterations, 100*niterations); -    } -    strength = slice_sampler1d(ar, strength, *rng, -d, -                            std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations); -    std::cerr << "MConditionalTranslationModel(d=" << d << ",s=" << strength << ") = " << log_likelihood(d, strength) << std::endl; -    for (it = r.begin(); it != r.end(); ++it) { -      it->second.set_discount(d); -      it->second.set_strength(strength); -    } -#endif -  } - -  int DecrementRule(const TRule& rule, MT19937* rng) { -    RuleModelHash::iterator it = r.find(rule.f_); -    assert(it != r.end()); -    const TableCount delta = it->second.decrement(rule, rng); -    if (delta.count) { -      if (it->second.num_customers() == 0) r.erase(it); -    } -    return delta.count; -  } - -  int IncrementRule(const TRule& rule, MT19937* rng) { -    RuleModelHash::iterator it = r.find(rule.f_); -    if (it == r.end()) { -      //it = r.insert(make_pair(rule.f_, MFCR<1,TRule>(d, strength))).first; -      it = r.insert(make_pair(rule.f_, MFCR<1,TRule>(1,1,1,1,0.6, -0.12))).first; -    } -    p0s[0] = rp0(rule);  -    TableCount delta = it->second.increment(rule, p0s.begin(), lambdas.begin(), rng); -    return delta.count; -  } - -  prob_t RuleProbability(const TRule& rule) const { -    prob_t p; -    RuleModelHash::const_iterator it = r.find(rule.f_); -    if (it == r.end()) { -      p = rp0(rule); -    } else { -      p0s[0] = rp0(rule); -      p = it->second.prob(rule, p0s.begin(), lambdas.begin()); -    } -    return p; -  } - -  prob_t Likelihood() const { -    prob_t p; p.logeq(log_likelihood(d, strength)); -    return p; -  } - -  const ConditionalBaseMeasure& rp0; -  typedef std::tr1::unordered_map<std::vector<WordID>, -                                  MFCR<1, TRule>, -                                  boost::hash<std::vector<WordID> > > RuleModelHash; -  RuleModelHash r; -  double d, strength; -  std::vector<prob_t> lambdas; -  mutable std::vector<prob_t> p0s; -}; - -template <typename ConditionalBaseMeasure> -struct ConditionalTranslationModel { -  explicit ConditionalTranslationModel(ConditionalBaseMeasure& rcp0) : -    rp0(rcp0) {} - -  void Summary() const { -    std::cerr << "Number of conditioning contexts: " << r.size() << std::endl; -    for (RuleModelHash::const_iterator it = r.begin(); it != r.end(); ++it) { -      std::cerr << TD::GetString(it->first) << "   \t(\\alpha = " << it->second.alpha() << ") --------------------------" << 
std::endl; -      for (CCRP_NoTable<TRule>::const_iterator i2 = it->second.begin(); i2 != it->second.end(); ++i2) -        std::cerr << "   " << i2->second << '\t' << i2->first << std::endl; -    } -  } - -  void ResampleHyperparameters(MT19937* rng) { -    for (RuleModelHash::iterator it = r.begin(); it != r.end(); ++it) -      it->second.resample_hyperparameters(rng); -  }  - -  int DecrementRule(const TRule& rule) { -    RuleModelHash::iterator it = r.find(rule.f_); -    assert(it != r.end());     -    int count = it->second.decrement(rule); -    if (count) { -      if (it->second.num_customers() == 0) r.erase(it); -    } -    return count; -  } - -  int IncrementRule(const TRule& rule) { -    RuleModelHash::iterator it = r.find(rule.f_); -    if (it == r.end()) { -      it = r.insert(make_pair(rule.f_, CCRP_NoTable<TRule>(1.0, 1.0, 8.0))).first; -    }  -    int count = it->second.increment(rule); -    return count; -  } - -  void IncrementRules(const std::vector<TRulePtr>& rules) { -    for (int i = 0; i < rules.size(); ++i) -      IncrementRule(*rules[i]); -  } - -  void DecrementRules(const std::vector<TRulePtr>& rules) { -    for (int i = 0; i < rules.size(); ++i) -      DecrementRule(*rules[i]); -  } - -  prob_t RuleProbability(const TRule& rule) const { -    prob_t p; -    RuleModelHash::const_iterator it = r.find(rule.f_); -    if (it == r.end()) { -      p.logeq(log(rp0(rule))); -    } else { -      p.logeq(it->second.logprob(rule, log(rp0(rule)))); -    } -    return p; -  } - -  prob_t Likelihood() const { -    prob_t p = prob_t::One(); -    for (RuleModelHash::const_iterator it = r.begin(); it != r.end(); ++it) { -      prob_t q; q.logeq(it->second.log_crp_prob()); -      p *= q; -      for (CCRP_NoTable<TRule>::const_iterator i2 = it->second.begin(); i2 != it->second.end(); ++i2) -        p *= rp0(i2->first); -    } -    return p; -  } - -  const ConditionalBaseMeasure& rp0; -  typedef std::tr1::unordered_map<std::vector<WordID>, -                                  CCRP_NoTable<TRule>, -                                  boost::hash<std::vector<WordID> > > RuleModelHash; -  RuleModelHash r; -}; - -template <typename ConditionalBaseMeasure> -struct ConditionalParallelSegementationModel { -  explicit ConditionalParallelSegementationModel(ConditionalBaseMeasure& rcp0) : -    tmodel(rcp0), base(prob_t::One()), aligns(1,1) {} - -  ConditionalTranslationModel<ConditionalBaseMeasure> tmodel; - -  void DecrementRule(const TRule& rule) { -    tmodel.DecrementRule(rule); -  } - -  void IncrementRule(const TRule& rule) { -    tmodel.IncrementRule(rule); -  } - -  void IncrementRulesAndAlignments(const std::vector<TRulePtr>& rules) { -    tmodel.IncrementRules(rules); -    for (int i = 0; i < rules.size(); ++i) { -      IncrementAlign(rules[i]->f_.size()); -    } -  } - -  void DecrementRulesAndAlignments(const std::vector<TRulePtr>& rules) { -    tmodel.DecrementRules(rules); -    for (int i = 0; i < rules.size(); ++i) { -      DecrementAlign(rules[i]->f_.size()); -    } -  } - -  prob_t RuleProbability(const TRule& rule) const { -    return tmodel.RuleProbability(rule); -  } - -  void IncrementAlign(unsigned span) { -    if (aligns.increment(span)) { -      // TODO -    } -  } - -  void DecrementAlign(unsigned span) { -    if (aligns.decrement(span)) { -      // TODO -    } -  } - -  prob_t AlignProbability(unsigned span) const { -    prob_t p; -    p.logeq(aligns.logprob(span, Md::log_poisson(span, 1.0))); -    return p; -  } - -  prob_t Likelihood() const { -    prob_t p; 
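-    // Likelihood factors: the CRP seating probability over segment span lengths, the cached base-measure mass in 'base' (still prob_t::One(); see the TODOs in IncrementAlign/DecrementAlign above), and the conditional translation model.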
p.logeq(aligns.log_crp_prob()); -    p *= base; -    p *= tmodel.Likelihood(); -    return p; -  } - -  prob_t base; -  CCRP_NoTable<unsigned> aligns; -}; - -#endif - diff --git a/gi/pf/condnaive.cc b/gi/pf/condnaive.cc deleted file mode 100644 index 419731ac..00000000 --- a/gi/pf/condnaive.cc +++ /dev/null @@ -1,298 +0,0 @@ -#include <iostream> -#include <tr1/memory> -#include <queue> - -#include <boost/multi_array.hpp> -#include <boost/program_options.hpp> -#include <boost/program_options/variables_map.hpp> - -#include "base_distributions.h" -#include "monotonic_pseg.h" -#include "conditional_pseg.h" -#include "trule.h" -#include "tdict.h" -#include "filelib.h" -#include "dict.h" -#include "sampler.h" -#include "ccrp_nt.h" -#include "corpus.h" - -using namespace std; -using namespace std::tr1; -namespace po = boost::program_options; - -static unsigned kMAX_SRC_PHRASE; -static unsigned kMAX_TRG_PHRASE; - -void InitCommandLine(int argc, char** argv, po::variables_map* conf) { -  po::options_description opts("Configuration options"); -  opts.add_options() -        ("samples,s",po::value<unsigned>()->default_value(1000),"Number of samples") -        ("input,i",po::value<string>(),"Read parallel data from") -        ("max_src_phrase",po::value<unsigned>()->default_value(4),"Maximum length of source language phrases") -        ("max_trg_phrase",po::value<unsigned>()->default_value(4),"Maximum length of target language phrases") -        ("model1,m",po::value<string>(),"Model 1 parameters (used in base distribution)") -        ("model1_interpolation_weight",po::value<double>()->default_value(0.95),"Mixing proportion of model 1 with uniform target distribution") -        ("random_seed,S",po::value<uint32_t>(), "Random seed"); -  po::options_description clo("Command line options"); -  clo.add_options() -        ("config", po::value<string>(), "Configuration file") -        ("help,h", "Print this help message and exit"); -  po::options_description dconfig_options, dcmdline_options; -  dconfig_options.add(opts); -  dcmdline_options.add(opts).add(clo); -   -  po::store(parse_command_line(argc, argv, dcmdline_options), *conf); -  if (conf->count("config")) { -    ifstream config((*conf)["config"].as<string>().c_str()); -    po::store(po::parse_config_file(config, dconfig_options), *conf); -  } -  po::notify(*conf); - -  if (conf->count("help") || (conf->count("input") == 0)) { -    cerr << dcmdline_options << endl; -    exit(1); -  } -} - -boost::shared_ptr<MT19937> prng; - -struct ModelAndData { -  explicit ModelAndData(ConditionalParallelSegementationModel<PhraseConditionalBase>& m, const vector<vector<int> >& ce, const vector<vector<int> >& cf, const set<int>& ve, const set<int>& vf) : -     model(m), -     rng(&*prng), -     corpuse(ce), -     corpusf(cf), -     vocabe(ve), -     vocabf(vf), -     mh_samples(), -     mh_rejects(), -     kX(-TD::Convert("X")), -     derivations(corpuse.size()) {} - -  void ResampleHyperparameters() { -  } - -  void InstantiateRule(const pair<short,short>& from, -                       const pair<short,short>& to, -                       const vector<int>& sentf, -                       const vector<int>& sente, -                       TRule* rule) const { -    rule->f_.clear(); -    rule->e_.clear(); -    rule->lhs_ = kX; -    for (short i = from.first; i < to.first; ++i) -      rule->f_.push_back(sentf[i]); -    for (short i = from.second; i < to.second; ++i) -      rule->e_.push_back(sente[i]); -  } - -  void DecrementDerivation(const vector<pair<short,short> >& 
d, const vector<int>& sentf, const vector<int>& sente) { -    if (d.size() < 2) return; -    TRule x; -    for (int i = 1; i < d.size(); ++i) { -      InstantiateRule(d[i], d[i-1], sentf, sente, &x); -      model.DecrementRule(x); -      model.DecrementAlign(x.f_.size()); -    } -  } - -  void PrintDerivation(const vector<pair<short,short> >& d, const vector<int>& sentf, const vector<int>& sente) { -    if (d.size() < 2) return; -    TRule x; -    for (int i = 1; i < d.size(); ++i) { -      InstantiateRule(d[i], d[i-1], sentf, sente, &x); -      cerr << i << '/' << (d.size() - 1) << ": " << x << endl; -    } -  } - -  void IncrementDerivation(const vector<pair<short,short> >& d, const vector<int>& sentf, const vector<int>& sente) { -    if (d.size() < 2) return; -    TRule x; -    for (int i = 1; i < d.size(); ++i) { -      InstantiateRule(d[i], d[i-1], sentf, sente, &x); -      model.IncrementRule(x); -      model.IncrementAlign(x.f_.size()); -    } -  } - -  prob_t Likelihood() const { -    return model.Likelihood(); -  } - -  prob_t DerivationProposalProbability(const vector<pair<short,short> >& d, const vector<int>& sentf, const vector<int>& sente) const { -    prob_t p = prob_t::One(); -    TRule x; -    for (int i = 1; i < d.size(); ++i) { -      InstantiateRule(d[i], d[i-1], sentf, sente, &x); -      p *= model.RuleProbability(x); -      p *= model.AlignProbability(x.f_.size()); -    } -    return p; -  } - -  void Sample(); - -  ConditionalParallelSegementationModel<PhraseConditionalBase>& model; -  MT19937* rng; -  const vector<vector<int> >& corpuse, corpusf; -  const set<int>& vocabe, vocabf; -  unsigned mh_samples, mh_rejects; -  const int kX; -  vector<vector<pair<short, short> > > derivations; -}; - -void ModelAndData::Sample() { -  unsigned MAXK = kMAX_SRC_PHRASE; -  unsigned MAXL = kMAX_TRG_PHRASE; -  TRule x; -  x.lhs_ = -TD::Convert("X"); - -  for (int samples = 0; samples < 1000; ++samples) { -    if (samples % 1 == 0 && samples > 0) { -      //ResampleHyperparameters(); -      cerr << " [" << samples << " LLH=" << log(Likelihood()) << " MH=" << ((double)mh_rejects / mh_samples) << "]\n"; -      for (int i = 0; i < 10 && i < (int)corpuse.size(); ++i) { -        cerr << "SENTENCE: " << TD::GetString(corpusf[i]) << " ||| " << TD::GetString(corpuse[i]) << endl; -        PrintDerivation(derivations[i], corpusf[i], corpuse[i]); -      } -      static TRule xx("[X] ||| w n ||| s h ||| X=0"); -      if (model.tmodel.r.count(xx.f_)) {  // guard: this diagnostic phrase may not have been sampled -        const CCRP_NoTable<TRule>& dcrp = model.tmodel.r.find(xx.f_)->second; -        for (CCRP_NoTable<TRule>::const_iterator it = dcrp.begin(); it != dcrp.end(); ++it) { -          cerr << "\t" << it->second << "\t" << it->first << endl; -        } -      } -    } -    cerr << '.' 
<< flush; -    for (int s = 0; s < corpuse.size(); ++s) { -      const vector<int>& sentf = corpusf[s]; -      const vector<int>& sente = corpuse[s]; -//      cerr << "  CUSTOMERS: " << rules.num_customers() << endl; -//      cerr << "SENTENCE: " << TD::GetString(sentf) << " ||| " << TD::GetString(sente) << endl; - -      vector<pair<short, short> >& deriv = derivations[s]; -      const prob_t p_cur = Likelihood(); -      DecrementDerivation(deriv, sentf, sente); - -      boost::multi_array<prob_t, 2> a(boost::extents[sentf.size() + 1][sente.size() + 1]); -      boost::multi_array<prob_t, 4> trans(boost::extents[sentf.size() + 1][sente.size() + 1][MAXK][MAXL]); -      a[0][0] = prob_t::One(); -      for (int i = 0; i < sentf.size(); ++i) { -        for (int j = 0; j < sente.size(); ++j) { -          const prob_t src_a = a[i][j]; -          x.f_.clear(); -          for (int k = 1; k <= MAXK; ++k) { -            if (i + k > sentf.size()) break; -            x.f_.push_back(sentf[i + k - 1]); -            x.e_.clear(); -            const prob_t p_span = model.AlignProbability(k);  // prob of consuming this much source -            for (int l = 1; l <= MAXL; ++l) { -              if (j + l > sente.size()) break; -              x.e_.push_back(sente[j + l - 1]); -              trans[i][j][k - 1][l - 1] = model.RuleProbability(x) * p_span; -              a[i + k][j + l] += src_a * trans[i][j][k - 1][l - 1]; -            } -          } -        } -      } -//      cerr << "Inside: " << log(a[sentf.size()][sente.size()]) << endl; -      const prob_t q_cur = DerivationProposalProbability(deriv, sentf, sente); - -      vector<pair<short,short> > newderiv; -      int cur_i = sentf.size(); -      int cur_j = sente.size(); -      while(cur_i > 0 && cur_j > 0) { -        newderiv.push_back(pair<short,short>(cur_i, cur_j)); -//        cerr << "NODE: (" << cur_i << "," << cur_j << ")\n"; -        SampleSet<prob_t> ss; -        vector<pair<short,short> > nexts; -        for (int k = 1; k <= MAXK; ++k) { -          const int hyp_i = cur_i - k; -          if (hyp_i < 0) break; -          for (int l = 1; l <= MAXL; ++l) { -            const int hyp_j = cur_j - l; -            if (hyp_j < 0) break; -            const prob_t& inside = a[hyp_i][hyp_j]; -            if (inside == prob_t::Zero()) continue; -            const prob_t& transp = trans[hyp_i][hyp_j][k - 1][l - 1]; -            if (transp == prob_t::Zero()) continue; -            const prob_t p = inside * transp; -            ss.add(p); -            nexts.push_back(pair<short,short>(hyp_i, hyp_j)); -//            cerr << "    (" << hyp_i << "," << hyp_j << ")  <--- " << log(p) << endl; -          } -        } -//        cerr << "  sample set has " << nexts.size() << " elements.\n"; -        const int selected = rng->SelectSample(ss); -        cur_i = nexts[selected].first; -        cur_j = nexts[selected].second; -      } -      newderiv.push_back(pair<short,short>(0,0)); -      const prob_t q_new = DerivationProposalProbability(newderiv, sentf, sente); -      IncrementDerivation(newderiv, sentf, sente); -//      cerr << "SANITY: " << q_new << "  " <<log(DerivationProposalProbability(newderiv, sentf, sente)) << endl; -      if (deriv.empty()) { deriv = newderiv; continue; } -      ++mh_samples; - -      if (deriv != newderiv) { -        const prob_t p_new = Likelihood(); -//        cerr << "p_cur=" << log(p_cur) << "\t p_new=" << log(p_new) << endl; -//        cerr << "q_cur=" << log(q_cur) << "\t q_new=" << log(q_new) << endl; -        if 
(!rng->AcceptMetropolisHastings(p_new, p_cur, q_new, q_cur)) { -          ++mh_rejects; -          DecrementDerivation(newderiv, sentf, sente); -          IncrementDerivation(deriv, sentf, sente); -        } else { -//          cerr << "  ACCEPT\n"; -          deriv = newderiv; -        } -      } -    } -  } -} - -int main(int argc, char** argv) { -  po::variables_map conf; -  InitCommandLine(argc, argv, &conf); -  kMAX_TRG_PHRASE = conf["max_trg_phrase"].as<unsigned>(); -  kMAX_SRC_PHRASE = conf["max_src_phrase"].as<unsigned>(); - -  if (!conf.count("model1")) { -    cerr << argv[0] << ": Please use --model1 to specify model 1 parameters\n"; -    return 1; -  } -  if (conf.count("random_seed")) -    prng.reset(new MT19937(conf["random_seed"].as<uint32_t>())); -  else -    prng.reset(new MT19937); -//  MT19937& rng = *prng; - -  vector<vector<int> > corpuse, corpusf; -  set<int> vocabe, vocabf; -  corpus::ReadParallelCorpus(conf["input"].as<string>(), &corpusf, &corpuse, &vocabf, &vocabe); -  cerr << "f-Corpus size: " << corpusf.size() << " sentences\n"; -  cerr << "f-Vocabulary size: " << vocabf.size() << " types\n"; -  cerr << "e-Corpus size: " << corpuse.size() << " sentences\n"; -  cerr << "e-Vocabulary size: " << vocabe.size() << " types\n"; -  assert(corpusf.size() == corpuse.size()); - -  Model1 m1(conf["model1"].as<string>()); - -  PhraseConditionalBase pcb0(m1, conf["model1_interpolation_weight"].as<double>(), vocabe.size()); -  ConditionalParallelSegementationModel<PhraseConditionalBase> x(pcb0);   - -  ModelAndData posterior(x, corpuse, corpusf, vocabe, vocabf); -  posterior.Sample(); - -  TRule r1("[X] ||| x ||| l e ||| X=0"); -  TRule r2("[X] ||| A ||| a d ||| X=0"); -  TRule r3("[X] ||| n ||| e r ||| X=0"); -  TRule r4("[X] ||| x A n ||| b l a g ||| X=0"); - -  PhraseConditionalUninformativeBase u0(vocabe.size()); - -  cerr << (pcb0(r1)*pcb0(r2)*pcb0(r3)) << endl; -  cerr << (u0(r4)) << endl; - -  return 0; -} - diff --git a/gi/pf/corpus.cc b/gi/pf/corpus.cc deleted file mode 100644 index cb6e4ed7..00000000 --- a/gi/pf/corpus.cc +++ /dev/null @@ -1,62 +0,0 @@ -#include "corpus.h" - -#include <set> -#include <vector> -#include <string> - -#include "tdict.h" -#include "filelib.h" - -using namespace std; - -namespace corpus { - -void ReadParallelCorpus(const string& filename, -                vector<vector<WordID> >* f, -                vector<vector<WordID> >* e, -                set<WordID>* vocab_f, -                set<WordID>* vocab_e) { -  f->clear(); -  e->clear(); -  vocab_f->clear(); -  vocab_e->clear(); -  ReadFile rf(filename); -  istream* in = rf.stream(); -  assert(*in); -  string line; -  unsigned lc = 0; -  const WordID kDIV = TD::Convert("|||"); -  vector<WordID> tmp; -  while(getline(*in, line)) { -    ++lc; -    e->push_back(vector<int>()); -    f->push_back(vector<int>()); -    vector<int>& le = e->back(); -    vector<int>& lf = f->back(); -    tmp.clear(); -    TD::ConvertSentence(line, &tmp); -    bool isf = true; -    for (unsigned i = 0; i < tmp.size(); ++i) { -      const int cur = tmp[i]; -      if (isf) { -        if (kDIV == cur) { -          isf = false; -        } else { -          lf.push_back(cur); -          vocab_f->insert(cur); -        } -      } else { -        if (cur == kDIV) { -          cerr << "ERROR in " << lc << ": " << line << endl << endl; -          abort(); -        } -        le.push_back(cur); -        vocab_e->insert(cur); -      } -    } -    assert(isf == false); -  } -} - -} - diff --git a/gi/pf/corpus.h b/gi/pf/corpus.h 
deleted file mode 100644 index e7febdb7..00000000 --- a/gi/pf/corpus.h +++ /dev/null @@ -1,19 +0,0 @@ -#ifndef _CORPUS_H_ -#define _CORPUS_H_ - -#include <string> -#include <vector> -#include <set> -#include "wordid.h" - -namespace corpus { - -void ReadParallelCorpus(const std::string& filename, -                std::vector<std::vector<WordID> >* f, -                std::vector<std::vector<WordID> >* e, -                std::set<WordID>* vocab_f, -                std::set<WordID>* vocab_e); - -} - -#endif diff --git a/gi/pf/dpnaive.cc b/gi/pf/dpnaive.cc deleted file mode 100644 index 75ccad72..00000000 --- a/gi/pf/dpnaive.cc +++ /dev/null @@ -1,301 +0,0 @@ -#include <iostream> -#include <tr1/memory> -#include <queue> - -#include <boost/multi_array.hpp> -#include <boost/program_options.hpp> -#include <boost/program_options/variables_map.hpp> - -#include "base_distributions.h" -#include "monotonic_pseg.h" -#include "trule.h" -#include "tdict.h" -#include "filelib.h" -#include "dict.h" -#include "sampler.h" -#include "ccrp_nt.h" -#include "corpus.h" - -using namespace std; -using namespace std::tr1; -namespace po = boost::program_options; - -static unsigned kMAX_SRC_PHRASE; -static unsigned kMAX_TRG_PHRASE; - -void InitCommandLine(int argc, char** argv, po::variables_map* conf) { -  po::options_description opts("Configuration options"); -  opts.add_options() -        ("samples,s",po::value<unsigned>()->default_value(1000),"Number of samples") -        ("input,i",po::value<string>(),"Read parallel data from") -        ("max_src_phrase",po::value<unsigned>()->default_value(4),"Maximum length of source language phrases") -        ("max_trg_phrase",po::value<unsigned>()->default_value(4),"Maximum length of target language phrases") -        ("model1,m",po::value<string>(),"Model 1 parameters (used in base distribution)") -        ("inverse_model1,M",po::value<string>(),"Inverse Model 1 parameters (used in base distribution)") -        ("model1_interpolation_weight",po::value<double>()->default_value(0.95),"Mixing proportion of model 1 with uniform target distribution") -        ("random_seed,S",po::value<uint32_t>(), "Random seed"); -  po::options_description clo("Command line options"); -  clo.add_options() -        ("config", po::value<string>(), "Configuration file") -        ("help,h", "Print this help message and exit"); -  po::options_description dconfig_options, dcmdline_options; -  dconfig_options.add(opts); -  dcmdline_options.add(opts).add(clo); -   -  po::store(parse_command_line(argc, argv, dcmdline_options), *conf); -  if (conf->count("config")) { -    ifstream config((*conf)["config"].as<string>().c_str()); -    po::store(po::parse_config_file(config, dconfig_options), *conf); -  } -  po::notify(*conf); - -  if (conf->count("help") || (conf->count("input") == 0)) { -    cerr << dcmdline_options << endl; -    exit(1); -  } -} - -boost::shared_ptr<MT19937> prng; - -template <typename Base> -struct ModelAndData { -  explicit ModelAndData(MonotonicParallelSegementationModel<PhraseJointBase_BiDir>& m, const Base& b, const vector<vector<int> >& ce, const vector<vector<int> >& cf, const set<int>& ve, const set<int>& vf) : -     model(m), -     rng(&*prng), -     p0(b), -     baseprob(prob_t::One()), -     corpuse(ce), -     corpusf(cf), -     vocabe(ve), -     vocabf(vf), -     mh_samples(), -     mh_rejects(), -     kX(-TD::Convert("X")), -     derivations(corpuse.size()) {} - -  void ResampleHyperparameters() { -  } - -  void InstantiateRule(const pair<short,short>& from, -               
        const pair<short,short>& to, -                       const vector<int>& sentf, -                       const vector<int>& sente, -                       TRule* rule) const { -    rule->f_.clear(); -    rule->e_.clear(); -    rule->lhs_ = kX; -    for (short i = from.first; i < to.first; ++i) -      rule->f_.push_back(sentf[i]); -    for (short i = from.second; i < to.second; ++i) -      rule->e_.push_back(sente[i]); -  } - -  void DecrementDerivation(const vector<pair<short,short> >& d, const vector<int>& sentf, const vector<int>& sente) { -    if (d.size() < 2) return; -    TRule x; -    for (int i = 1; i < d.size(); ++i) { -      InstantiateRule(d[i], d[i-1], sentf, sente, &x); -      model.DecrementRule(x); -      model.DecrementContinue(); -    } -    model.DecrementStop(); -  } - -  void PrintDerivation(const vector<pair<short,short> >& d, const vector<int>& sentf, const vector<int>& sente) { -    if (d.size() < 2) return; -    TRule x; -    for (int i = 1; i < d.size(); ++i) { -      InstantiateRule(d[i], d[i-1], sentf, sente, &x); -      cerr << i << '/' << (d.size() - 1) << ": " << x << endl; -    } -  } - -  void IncrementDerivation(const vector<pair<short,short> >& d, const vector<int>& sentf, const vector<int>& sente) { -    if (d.size() < 2) return; -    TRule x; -    for (int i = 1; i < d.size(); ++i) { -      InstantiateRule(d[i], d[i-1], sentf, sente, &x); -      model.IncrementRule(x); -      model.IncrementContinue(); -    } -    model.IncrementStop(); -  } - -  prob_t Likelihood() const { -    return model.Likelihood(); -  } - -  prob_t DerivationProposalProbability(const vector<pair<short,short> >& d, const vector<int>& sentf, const vector<int>& sente) const { -    prob_t p = model.StopProbability(); -    if (d.size() < 2) return p; -    TRule x; -    const prob_t p_cont = model.ContinueProbability(); -    for (int i = 1; i < d.size(); ++i) { -      InstantiateRule(d[i], d[i-1], sentf, sente, &x); -      p *= p_cont; -      p *= model.RuleProbability(x); -    } -    return p; -  } - -  void Sample(); - -  MonotonicParallelSegementationModel<PhraseJointBase_BiDir>& model; -  MT19937* rng; -  const Base& p0; -  prob_t baseprob; // cached value of generating the table labels from p0 -                   // this can't be used if we go to a hierarchical prior! -  const vector<vector<int> >& corpuse, corpusf; -  const set<int>& vocabe, vocabf; -  unsigned mh_samples, mh_rejects; -  const int kX; -  vector<vector<pair<short, short> > > derivations; -}; - -template <typename Base> -void ModelAndData<Base>::Sample() { -  unsigned MAXK = kMAX_SRC_PHRASE; -  unsigned MAXL = kMAX_TRG_PHRASE; -  TRule x; -  x.lhs_ = -TD::Convert("X"); -  for (int samples = 0; samples < 1000; ++samples) { -    if (samples % 1 == 0 && samples > 0) { -      //ResampleHyperparameters(); -      cerr << " [" << samples << " LLH=" << log(Likelihood()) << " MH=" << ((double)mh_rejects / mh_samples) << "]\n"; -      for (int i = 0; i < 10 && i < (int)corpuse.size(); ++i) { -        cerr << "SENTENCE: " << TD::GetString(corpusf[i]) << " ||| " << TD::GetString(corpuse[i]) << endl; -        PrintDerivation(derivations[i], corpusf[i], corpuse[i]); -      } -    } -    cerr << '.' 
<< flush; -    for (int s = 0; s < corpuse.size(); ++s) { -      const vector<int>& sentf = corpusf[s]; -      const vector<int>& sente = corpuse[s]; -//      cerr << "  CUSTOMERS: " << rules.num_customers() << endl; -//      cerr << "SENTENCE: " << TD::GetString(sentf) << " ||| " << TD::GetString(sente) << endl; - -      vector<pair<short, short> >& deriv = derivations[s]; -      const prob_t p_cur = Likelihood(); -      DecrementDerivation(deriv, sentf, sente); - -      boost::multi_array<prob_t, 2> a(boost::extents[sentf.size() + 1][sente.size() + 1]); -      boost::multi_array<prob_t, 4> trans(boost::extents[sentf.size() + 1][sente.size() + 1][MAXK][MAXL]); -      a[0][0] = prob_t::One(); -      const prob_t q_stop = model.StopProbability(); -      const prob_t q_cont = model.ContinueProbability(); -      for (int i = 0; i < sentf.size(); ++i) { -        for (int j = 0; j < sente.size(); ++j) { -          const prob_t src_a = a[i][j]; -          x.f_.clear(); -          for (int k = 1; k <= MAXK; ++k) { -            if (i + k > sentf.size()) break; -            x.f_.push_back(sentf[i + k - 1]); -            x.e_.clear(); -            for (int l = 1; l <= MAXL; ++l) { -              if (j + l > sente.size()) break; -              x.e_.push_back(sente[j + l - 1]); -              const bool stop_now = ((j + l) == sente.size()) && ((i + k) == sentf.size()); -              const prob_t& cp = stop_now ? q_stop : q_cont; -              trans[i][j][k - 1][l - 1] = model.RuleProbability(x) * cp; -              a[i + k][j + l] += src_a * trans[i][j][k - 1][l - 1]; -            } -          } -        } -      } -//      cerr << "Inside: " << log(a[sentf.size()][sente.size()]) << endl; -      const prob_t q_cur = DerivationProposalProbability(deriv, sentf, sente); - -      vector<pair<short,short> > newderiv; -      int cur_i = sentf.size(); -      int cur_j = sente.size(); -      while(cur_i > 0 && cur_j > 0) { -        newderiv.push_back(pair<short,short>(cur_i, cur_j)); -//        cerr << "NODE: (" << cur_i << "," << cur_j << ")\n"; -        SampleSet<prob_t> ss; -        vector<pair<short,short> > nexts; -        for (int k = 1; k <= MAXK; ++k) { -          const int hyp_i = cur_i - k; -          if (hyp_i < 0) break; -          for (int l = 1; l <= MAXL; ++l) { -            const int hyp_j = cur_j - l; -            if (hyp_j < 0) break; -            const prob_t& inside = a[hyp_i][hyp_j]; -            if (inside == prob_t::Zero()) continue; -            const prob_t& transp = trans[hyp_i][hyp_j][k - 1][l - 1]; -            if (transp == prob_t::Zero()) continue; -            const prob_t p = inside * transp; -            ss.add(p); -            nexts.push_back(pair<short,short>(hyp_i, hyp_j)); -//            cerr << "    (" << hyp_i << "," << hyp_j << ")  <--- " << log(p) << endl; -          } -        } -//        cerr << "  sample set has " << nexts.size() << " elements.\n"; -        const int selected = rng->SelectSample(ss); -        cur_i = nexts[selected].first; -        cur_j = nexts[selected].second; -      } -      newderiv.push_back(pair<short,short>(0,0)); -      const prob_t q_new = DerivationProposalProbability(newderiv, sentf, sente); -      IncrementDerivation(newderiv, sentf, sente); -//      cerr << "SANITY: " << q_new << "  " <<log(DerivationProposalProbability(newderiv, sentf, sente)) << endl; -      if (deriv.empty()) { deriv = newderiv; continue; } -      ++mh_samples; - -      if (deriv != newderiv) { -        const prob_t p_new = Likelihood(); -//        cerr << 
"p_cur=" << log(p_cur) << "\t p_new=" << log(p_new) << endl; -//        cerr << "q_cur=" << log(q_cur) << "\t q_new=" << log(q_new) << endl; -        if (!rng->AcceptMetropolisHastings(p_new, p_cur, q_new, q_cur)) { -          ++mh_rejects; -          DecrementDerivation(newderiv, sentf, sente); -          IncrementDerivation(deriv, sentf, sente); -        } else { -//          cerr << "  ACCEPT\n"; -          deriv = newderiv; -        } -      } -    } -  } -} - -int main(int argc, char** argv) { -  po::variables_map conf; -  InitCommandLine(argc, argv, &conf); -  kMAX_TRG_PHRASE = conf["max_trg_phrase"].as<unsigned>(); -  kMAX_SRC_PHRASE = conf["max_src_phrase"].as<unsigned>(); - -  if (!conf.count("model1")) { -    cerr << argv[0] << "Please use --model1 to specify model 1 parameters\n"; -    return 1; -  } -  if (!conf.count("inverse_model1")) { -    cerr << argv[0] << "Please use --inverse_model1 to specify inverse model 1 parameters\n"; -    return 1; -  } -  if (conf.count("random_seed")) -    prng.reset(new MT19937(conf["random_seed"].as<uint32_t>())); -  else -    prng.reset(new MT19937); -//  MT19937& rng = *prng; - -  vector<vector<int> > corpuse, corpusf; -  set<int> vocabe, vocabf; -  corpus::ReadParallelCorpus(conf["input"].as<string>(), &corpusf, &corpuse, &vocabf, &vocabe); -  cerr << "f-Corpus size: " << corpusf.size() << " sentences\n"; -  cerr << "f-Vocabulary size: " << vocabf.size() << " types\n"; -  cerr << "f-Corpus size: " << corpuse.size() << " sentences\n"; -  cerr << "f-Vocabulary size: " << vocabe.size() << " types\n"; -  assert(corpusf.size() == corpuse.size()); - -  Model1 m1(conf["model1"].as<string>()); -  Model1 invm1(conf["inverse_model1"].as<string>()); -//  PhraseJointBase lp0(m1, conf["model1_interpolation_weight"].as<double>(), vocabe.size(), vocabf.size()); -  PhraseJointBase_BiDir alp0(m1, invm1, conf["model1_interpolation_weight"].as<double>(), vocabe.size(), vocabf.size()); -  MonotonicParallelSegementationModel<PhraseJointBase_BiDir> m(alp0); - -  ModelAndData<PhraseJointBase_BiDir> posterior(m, alp0, corpuse, corpusf, vocabe, vocabf); -  posterior.Sample(); - -  return 0; -} - diff --git a/gi/pf/guess-translits.pl b/gi/pf/guess-translits.pl deleted file mode 100755 index d00c2168..00000000 --- a/gi/pf/guess-translits.pl +++ /dev/null @@ -1,72 +0,0 @@ -#!/usr/bin/perl -w -use strict; -use utf8; - -my $MIN_PMI = -3; - -my %fs; -my %es; -my %ef; - -die "Usage: $0 < input.utf8.txt\n" if scalar @ARGV > 0; - -binmode(STDIN,":utf8"); -binmode(STDOUT,":utf8"); -binmode(STDERR,":utf8"); - -my $tot = 0; -print STDERR "Reading alignments from STDIN ...\n"; -while(<STDIN>) { -  chomp; -  my ($fsent, $esent, $alsent) = split / \|\|\| /; -  die "Format should be 'foreign sentence ||| english sentence ||| 0-0 1-1 ...'\n" unless defined $fsent && defined $esent && defined $alsent; - -  my @fws = split /\s+/, $fsent;   -  my @ews = split /\s+/, $esent; -  my @as = split /\s+/, $alsent; -  my %a2b; -  my %b2a; -  for my $ap (@as) { -    my ($a,$b) = split /-/, $ap; -    die "BAD INPUT: $_\n" unless defined $a && defined $b; -    $a2b{$a}->{$b} = 1; -    $b2a{$b}->{$a} = 1; -  } -  for my $a (keys %a2b) { -    my $bref = $a2b{$a}; -    next unless scalar keys %$bref < 2; -    my $b = (keys %$bref)[0]; -    next unless scalar keys %{$b2a{$b}} < 2; -    my $f = $fws[$a]; -    next unless defined $f; -    next unless length($f) > 3; -    my $e = $ews[$b]; -    next unless defined $e; -    next unless length($e) > 3; - -    $ef{$f}->{$e}++; -    $es{$e}++; -    
$fs{$f}++; -    $tot++; -  }   -} -my $ltot = log($tot); -my $num = 0; -print STDERR "Extracting pairs for PMI > $MIN_PMI ...\n"; -for my $f (keys %fs) { -  my $logf = log($fs{$f}); -  my $esref = $ef{$f}; -  for my $e (keys %$esref) { -    my $loge = log($es{$e}); -    my $ef = $esref->{$e}; -    my $logef = log($ef); -    my $pmi = $logef - ($loge + $logf); -    next if $pmi < $MIN_PMI; -    my @flets = split //, $f; -    my @elets = split //, $e; -    print "@flets ||| @elets\n"; -    $num++; -  } -} -print STDERR "Extracted $num pairs.\n"; -print STDERR "Recommend running:\n   ../../training/model1 -v -d -t -99999 output.txt\n"; diff --git a/gi/pf/hpyp_tm.cc b/gi/pf/hpyp_tm.cc deleted file mode 100644 index f362d3f8..00000000 --- a/gi/pf/hpyp_tm.cc +++ /dev/null @@ -1,133 +0,0 @@ -#include "hpyp_tm.h" - -#include <tr1/unordered_map> -#include <iostream> -#include <queue> - -#include "tdict.h" -#include "ccrp.h" -#include "pyp_word_model.h" -#include "tied_resampler.h" - -using namespace std; -using namespace std::tr1; - -struct FreqBinner { -  FreqBinner(const std::string& fname) { fd_.Load(fname); } -  unsigned NumberOfBins() const { return fd_.Max() + 1; } -  unsigned Bin(const WordID& w) const { return fd_.LookUp(w); } -  FreqDict<unsigned> fd_; -}; - -template <typename Base, class Binner = FreqBinner> -struct ConditionalPYPWordModel { -  ConditionalPYPWordModel(Base* b, const Binner* bnr = NULL) : -      base(*b), -      binner(bnr), -      btr(binner ? binner->NumberOfBins() + 1u : 2u) {} - -  void Summary() const { -    cerr << "Number of conditioning contexts: " << r.size() << endl; -    for (RuleModelHash::const_iterator it = r.begin(); it != r.end(); ++it) { -      cerr << TD::Convert(it->first) << "   \tPYP(d=" << it->second.discount() << ",s=" << it->second.strength() << ") --------------------------" << endl; -      for (CCRP<vector<WordID> >::const_iterator i2 = it->second.begin(); i2 != it->second.end(); ++i2) -        cerr << "   " << i2->second << endl; -    } -  } - -  void ResampleHyperparameters(MT19937* rng) { -    btr.ResampleHyperparameters(rng); -  }  - -  prob_t Prob(const WordID src, const vector<WordID>& trglets) const { -    RuleModelHash::const_iterator it = r.find(src); -    if (it == r.end()) { -      return base(trglets); -    } else { -      return it->second.prob(trglets, base(trglets)); -    } -  } - -  void Increment(const WordID src, const vector<WordID>& trglets, MT19937* rng) { -    RuleModelHash::iterator it = r.find(src); -    if (it == r.end()) { -      it = r.insert(make_pair(src, CCRP<vector<WordID> >(0.5,1.0))).first; -      static const WordID kNULL = TD::Convert("NULL"); -      unsigned bin = (src == kNULL ? 
0 : 1); -      if (binner && bin) { bin = binner->Bin(src) + 1; } -      btr.Add(bin, &it->second); -    } -    if (it->second.increment(trglets, base(trglets), rng)) -      base.Increment(trglets, rng); -  } - -  void Decrement(const WordID src, const vector<WordID>& trglets, MT19937* rng) { -    RuleModelHash::iterator it = r.find(src); -    assert(it != r.end()); -    if (it->second.decrement(trglets, rng)) { -      base.Decrement(trglets, rng); -    } -  } - -  prob_t Likelihood() const { -    prob_t p = prob_t::One(); -    for (RuleModelHash::const_iterator it = r.begin(); it != r.end(); ++it) { -      prob_t q; q.logeq(it->second.log_crp_prob()); -      p *= q; -    } -    return p; -  } - -  unsigned UniqueConditioningContexts() const { -    return r.size(); -  } - -  // TODO tie PYP hyperparameters based on source word frequency bins -  Base& base; -  const Binner* binner; -  BinTiedResampler<CCRP<vector<WordID> > > btr; -  typedef unordered_map<WordID, CCRP<vector<WordID> > > RuleModelHash; -  RuleModelHash r; -}; - -HPYPLexicalTranslation::HPYPLexicalTranslation(const vector<vector<WordID> >& lets, -                                               const unsigned vocab_size, -                                               const unsigned num_letters) : -    letters(lets), -    base(vocab_size, num_letters, 5), -    up0(new PYPWordModel<PoissonUniformWordModel>(&base)), -    tmodel(new ConditionalPYPWordModel<PYPWordModel<PoissonUniformWordModel> >(up0, new FreqBinner("10k.freq"))), -    kX(-TD::Convert("X")) {} - -void HPYPLexicalTranslation::Summary() const { -  tmodel->Summary(); -  up0->Summary(); -} - -prob_t HPYPLexicalTranslation::Likelihood() const { -  prob_t p = up0->Likelihood(); -  p *= tmodel->Likelihood(); -  return p; -} - -void HPYPLexicalTranslation::ResampleHyperparameters(MT19937* rng) { -  tmodel->ResampleHyperparameters(rng); -  up0->ResampleHyperparameters(rng); -} - -unsigned HPYPLexicalTranslation::UniqueConditioningContexts() const { -  return tmodel->UniqueConditioningContexts(); -} - -prob_t HPYPLexicalTranslation::Prob(WordID src, WordID trg) const { -  return tmodel->Prob(src, letters[trg]); -} - -void HPYPLexicalTranslation::Increment(WordID src, WordID trg, MT19937* rng) { -  tmodel->Increment(src, letters[trg], rng); -} - -void HPYPLexicalTranslation::Decrement(WordID src, WordID trg, MT19937* rng) { -  tmodel->Decrement(src, letters[trg], rng); -} - diff --git a/gi/pf/hpyp_tm.h b/gi/pf/hpyp_tm.h deleted file mode 100644 index af3215ba..00000000 --- a/gi/pf/hpyp_tm.h +++ /dev/null @@ -1,38 +0,0 @@ -#ifndef HPYP_LEX_TRANS -#define HPYP_LEX_TRANS - -#include <vector> -#include "wordid.h" -#include "prob.h" -#include "sampler.h" -#include "freqdict.h" -#include "poisson_uniform_word_model.h" - -struct FreqBinner; -template <class B> struct PYPWordModel; -template <typename T, class B> struct ConditionalPYPWordModel; - -struct HPYPLexicalTranslation { -  explicit HPYPLexicalTranslation(const std::vector<std::vector<WordID> >& lets, -                                 const unsigned vocab_size, -                                 const unsigned num_letters); - -  prob_t Likelihood() const; - -  void ResampleHyperparameters(MT19937* rng); -  prob_t Prob(WordID src, WordID trg) const;  // return p(trg | src) -  void Summary() const; -  void Increment(WordID src, WordID trg, MT19937* rng); -  void Decrement(WordID src, WordID trg, MT19937* rng); -  unsigned UniqueConditioningContexts() const; - - private: -  const std::vector<std::vector<WordID> >& letters;   // 
spelling dictionary -  PoissonUniformWordModel base;  // "generator" of English types -  PYPWordModel<PoissonUniformWordModel>* up0;  // model English lexicon -  ConditionalPYPWordModel<PYPWordModel<PoissonUniformWordModel>, FreqBinner>* tmodel;  // translation distributions -                      // (model English word | French word) -  const WordID kX; -}; - -#endif diff --git a/gi/pf/itg.cc b/gi/pf/itg.cc deleted file mode 100644 index 29ec3860..00000000 --- a/gi/pf/itg.cc +++ /dev/null @@ -1,275 +0,0 @@ -#include <iostream> -#include <tr1/memory> -#include <queue> - -#include <boost/functional.hpp> -#include <boost/program_options.hpp> -#include <boost/program_options/variables_map.hpp> - -#include "viterbi.h" -#include "hg.h" -#include "trule.h" -#include "tdict.h" -#include "filelib.h" -#include "dict.h" -#include "sampler.h" -#include "ccrp_nt.h" -#include "ccrp_onetable.h" - -using namespace std; -using namespace tr1; -namespace po = boost::program_options; - -ostream& operator<<(ostream& os, const vector<WordID>& p) { -  os << '['; -  for (int i = 0; i < p.size(); ++i) -    os << (i==0 ? "" : " ") << TD::Convert(p[i]); -  return os << ']'; -} - -struct UnigramModel { -  explicit UnigramModel(const string& fname, unsigned vocab_size, double p0null = 0.05) : -      use_uniform_(fname.size() == 0), -      p0null_(p0null), -      uniform_((1.0 - p0null) / vocab_size), -      probs_(TD::NumWords() + 1) { -    if (fname.size() > 0) LoadUnigrams(fname); -    probs_[0] = p0null_; -  } - -//  -// \data\ -// ngram 1=9295 -//  -// \1-grams: -// -3.191193	" - -  void LoadUnigrams(const string& fname) { -    cerr << "Loading unigram probabilities from " << fname << " ..." << endl; -    ReadFile rf(fname); -    string line; -    istream& in = *rf.stream(); -    assert(in); -    getline(in, line); -    assert(line.empty()); -    getline(in, line); -    assert(line == "\\data\\"); -    getline(in, line); -    size_t pos = line.find("ngram 1="); -    assert(pos == 0); -    assert(line.size() > 8); -    const size_t num_unigrams = atoi(&line[8]); -    getline(in, line); -    assert(line.empty()); -    getline(in, line); -    assert(line == "\\1-grams:"); -    for (size_t i = 0; i < num_unigrams; ++i) { -      getline(in, line); -      assert(line.size() > 0); -      pos = line.find('\t'); -      assert(pos > 0); -      assert(pos + 1 < line.size()); -      const WordID w = TD::Convert(line.substr(pos + 1)); -      line[pos] = 0; -      float p = atof(&line[0]); -      const prob_t pnon_null(1.0 - p0null_.as_float()); -      if (w < probs_.size()) probs_[w].logeq(p * log(10) + log(pnon_null)); else abort(); -    } -  } - -  const prob_t& operator()(const WordID& w) const { -    if (!w) return p0null_; -    if (use_uniform_) return uniform_; -    return probs_[w]; -  } - -  const bool use_uniform_; -  const prob_t p0null_; -  const prob_t uniform_; -  vector<prob_t> probs_; -}; - -struct Model1 { -  explicit Model1(const string& fname) : -      kNULL(TD::Convert("<eps>")), -      kZERO() { -    LoadModel1(fname); -  } - -  void LoadModel1(const string& fname) { -    cerr << "Loading Model 1 parameters from " << fname << " ..." 
<< endl; -    ReadFile rf(fname); -    istream& in = *rf.stream(); -    string line; -    unsigned lc = 0; -    while(getline(in, line)) { -      ++lc; -      int cur = 0; -      int start = 0; -      while(cur < line.size() && line[cur] != ' ') { ++cur; } -      assert(cur != line.size()); -      line[cur] = 0; -      const WordID src = TD::Convert(&line[0]); -      ++cur; -      start = cur; -      while(cur < line.size() && line[cur] != ' ') { ++cur; } -      assert(cur != line.size()); -      line[cur] = 0; -      WordID trg = TD::Convert(&line[start]); -      const double logprob = strtod(&line[cur + 1], NULL); -      if (src >= ttable.size()) ttable.resize(src + 1); -      ttable[src][trg].logeq(logprob); -    } -    cerr << "  read " << lc << " parameters.\n"; -  } - -  // returns prob 0 if src or trg is not found! -  const prob_t& operator()(WordID src, WordID trg) const { -    if (src == 0) src = kNULL; -    if (src < ttable.size()) { -      const map<WordID, prob_t>& cpd = ttable[src]; -      const map<WordID, prob_t>::const_iterator it = cpd.find(trg); -      if (it != cpd.end()) -        return it->second; -    } -    return kZERO; -  } - -  const WordID kNULL; -  const prob_t kZERO; -  vector<map<WordID, prob_t> > ttable; -}; - -void InitCommandLine(int argc, char** argv, po::variables_map* conf) { -  po::options_description opts("Configuration options"); -  opts.add_options() -        ("samples,s",po::value<unsigned>()->default_value(1000),"Number of samples") -        ("particles,p",po::value<unsigned>()->default_value(25),"Number of particles") -        ("input,i",po::value<string>(),"Read parallel data from") -        ("model1,m",po::value<string>(),"Model 1 parameters (used in base distribution)") -        ("inverse_model1,M",po::value<string>(),"Inverse Model 1 parameters (used in backward estimate)") -        ("model1_interpolation_weight",po::value<double>()->default_value(0.95),"Mixing proportion of model 1 with uniform target distribution") -        ("src_unigram,u",po::value<string>()->default_value(""),"Source unigram distribution; empty for uniform") -        ("trg_unigram,U",po::value<string>()->default_value(""),"Target unigram distribution; empty for uniform") -        ("random_seed,S",po::value<uint32_t>(), "Random seed"); -  po::options_description clo("Command line options"); -  clo.add_options() -        ("config", po::value<string>(), "Configuration file") -        ("help,h", "Print this help message and exit"); -  po::options_description dconfig_options, dcmdline_options; -  dconfig_options.add(opts); -  dcmdline_options.add(opts).add(clo); -   -  po::store(parse_command_line(argc, argv, dcmdline_options), *conf); -  if (conf->count("config")) { -    ifstream config((*conf)["config"].as<string>().c_str()); -    po::store(po::parse_config_file(config, dconfig_options), *conf); -  } -  po::notify(*conf); - -  if (conf->count("help") || (conf->count("input") == 0)) { -    cerr << dcmdline_options << endl; -    exit(1); -  } -} - -void ReadParallelCorpus(const string& filename, -                vector<vector<WordID> >* f, -                vector<vector<WordID> >* e, -                set<WordID>* vocab_f, -                set<WordID>* vocab_e) { -  f->clear(); -  e->clear(); -  vocab_f->clear(); -  vocab_e->clear(); -  istream* in; -  if (filename == "-") -    in = &cin; -  else -    in = new ifstream(filename.c_str()); -  assert(*in); -  string line; -  const WordID kDIV = TD::Convert("|||"); -  vector<WordID> tmp; -  while(*in) { -    getline(*in, line); -   
 if (line.empty() && !*in) break; -    e->push_back(vector<int>()); -    f->push_back(vector<int>()); -    vector<int>& le = e->back(); -    vector<int>& lf = f->back(); -    tmp.clear(); -    TD::ConvertSentence(line, &tmp); -    bool isf = true; -    for (unsigned i = 0; i < tmp.size(); ++i) { -      const int cur = tmp[i]; -      if (isf) { -        if (kDIV == cur) { isf = false; } else { -          lf.push_back(cur); -          vocab_f->insert(cur); -        } -      } else { -        assert(cur != kDIV); -        le.push_back(cur); -        vocab_e->insert(cur); -      } -    } -    assert(isf == false); -  } -  if (in != &cin) delete in; -} - -int main(int argc, char** argv) { -  po::variables_map conf; -  InitCommandLine(argc, argv, &conf); -  const unsigned particles = conf["particles"].as<unsigned>(); -  const unsigned samples = conf["samples"].as<unsigned>(); -  TD::Convert("<s>"); -  TD::Convert("</s>"); -  TD::Convert("<unk>"); -  if (!conf.count("model1")) { -    cerr << argv[0] << ": Please use --model1 to specify model 1 parameters\n"; -    return 1; -  } -  if (!conf.count("inverse_model1")) { -    cerr << argv[0] << ": Please use --inverse_model1 to specify inverse model 1 parameters\n"; -    return 1; -  } -  boost::shared_ptr<MT19937> prng; -  if (conf.count("random_seed")) -    prng.reset(new MT19937(conf["random_seed"].as<uint32_t>())); -  else -    prng.reset(new MT19937); -  MT19937& rng = *prng; - -  vector<vector<WordID> > corpuse, corpusf; -  set<WordID> vocabe, vocabf; -  cerr << "Reading corpus...\n"; -  ReadParallelCorpus(conf["input"].as<string>(), &corpusf, &corpuse, &vocabf, &vocabe); -  cerr << "F-corpus size: " << corpusf.size() << " sentences\t (" << vocabf.size() << " word types)\n"; -  cerr << "E-corpus size: " << corpuse.size() << " sentences\t (" << vocabe.size() << " word types)\n"; -  assert(corpusf.size() == corpuse.size()); -  UnigramModel src_unigram(conf["src_unigram"].as<string>(), vocabf.size()); -  UnigramModel trg_unigram(conf["trg_unigram"].as<string>(), vocabe.size()); -  const prob_t kHALF(0.5); - -  const string kEMPTY = "NULL"; -  const int kLHS = -TD::Convert("X"); -  Model1 m1(conf["model1"].as<string>()); -  Model1 invm1(conf["inverse_model1"].as<string>()); -  for (unsigned si = 0; si < samples; ++si) { -    cerr << '.' << flush; -    for (int ci = 0; ci < corpusf.size(); ++ci) { -      const vector<WordID>& trg = corpuse[ci]; -      const vector<WordID>& src = corpusf[ci]; -      for (int i = 0; i <= trg.size(); ++i) { -        const WordID e_i = i > 0 ? trg[i-1] : 0; -        for (int j = 0; j <= src.size(); ++j) { -          const WordID f_j = j > 0 ? src[j-1] : 0; -          if (e_i == 0 && f_j == 0) continue; -          prob_t je = kHALF * src_unigram(f_j) * m1(f_j,e_i) + kHALF * trg_unigram(e_i) * invm1(e_i,f_j); -          cerr << "p( " << (e_i ? TD::Convert(e_i) : kEMPTY) << " , " << (f_j ? 
TD::Convert(f_j) : kEMPTY) << " ) = " << log(je) << endl; -          if (e_i && f_j) -            cout << "[X] ||| " << TD::Convert(f_j) << " ||| " << TD::Convert(e_i) << " ||| LogProb=" << log(je) << endl; -        } -      } -    } -  } -} - diff --git a/gi/pf/learn_cfg.cc b/gi/pf/learn_cfg.cc deleted file mode 100644 index 1d5126e4..00000000 --- a/gi/pf/learn_cfg.cc +++ /dev/null @@ -1,428 +0,0 @@ -#include <iostream> -#include <tr1/memory> -#include <queue> - -#include <boost/functional.hpp> -#include <boost/program_options.hpp> -#include <boost/program_options/variables_map.hpp> - -#include "inside_outside.h" -#include "hg.h" -#include "bottom_up_parser.h" -#include "fdict.h" -#include "grammar.h" -#include "m.h" -#include "trule.h" -#include "tdict.h" -#include "filelib.h" -#include "dict.h" -#include "sampler.h" -#include "ccrp.h" -#include "ccrp_onetable.h" - -using namespace std; -using namespace tr1; -namespace po = boost::program_options; - -boost::shared_ptr<MT19937> prng; -vector<int> nt_vocab; -vector<int> nt_id_to_index; -static unsigned kMAX_RULE_SIZE = 0; -static unsigned kMAX_ARITY = 0; -static bool kALLOW_MIXED = true;  // allow rules with mixed terminals and NTs -static bool kHIERARCHICAL_PRIOR = false; - -void InitCommandLine(int argc, char** argv, po::variables_map* conf) { -  po::options_description opts("Configuration options"); -  opts.add_options() -        ("samples,s",po::value<unsigned>()->default_value(1000),"Number of samples") -        ("input,i",po::value<string>(),"Read parallel data from") -        ("max_rule_size,m", po::value<unsigned>()->default_value(0), "Maximum rule size (0 for unlimited)") -        ("max_arity,a", po::value<unsigned>()->default_value(0), "Maximum number of nonterminals in a rule (0 for unlimited)") -        ("no_mixed_rules,M", "Do not mix terminals and nonterminals in a rule RHS") -        ("nonterminals,n", po::value<unsigned>()->default_value(1), "Size of nonterminal vocabulary") -        ("hierarchical_prior,h", "Use hierarchical prior") -        ("random_seed,S",po::value<uint32_t>(), "Random seed"); -  po::options_description clo("Command line options"); -  clo.add_options() -        ("config", po::value<string>(), "Configuration file") -        ("help", "Print this help message and exit"); -  po::options_description dconfig_options, dcmdline_options; -  dconfig_options.add(opts); -  dcmdline_options.add(opts).add(clo); -   -  po::store(parse_command_line(argc, argv, dcmdline_options), *conf); -  if (conf->count("config")) { -    ifstream config((*conf)["config"].as<string>().c_str()); -    po::store(po::parse_config_file(config, dconfig_options), *conf); -  } -  po::notify(*conf); - -  if (conf->count("help") || (conf->count("input") == 0)) { -    cerr << dcmdline_options << endl; -    exit(1); -  } -} - -unsigned ReadCorpus(const string& filename, -                    vector<vector<WordID> >* e, -                    set<WordID>* vocab_e) { -  e->clear(); -  vocab_e->clear(); -  istream* in; -  if (filename == "-") -    in = &cin; -  else -    in = new ifstream(filename.c_str()); -  assert(*in); -  string line; -  unsigned toks = 0; -  while(*in) { -    getline(*in, line); -    if (line.empty() && !*in) break; -    e->push_back(vector<int>()); -    vector<int>& le = e->back(); -    TD::ConvertSentence(line, &le); -    for (unsigned i = 0; i < le.size(); ++i) -      vocab_e->insert(le[i]); -    toks += le.size(); -  } -  if (in != &cin) delete in; -  return toks; -} - -struct Grid { -  // a b c d e -  // 0 - 0 - - -  
vector<int> grid; -}; - -struct BaseRuleModel { -  explicit BaseRuleModel(unsigned term_size, -                         unsigned nonterm_size = 1) : -      unif_term(1.0 / term_size), -      unif_nonterm(1.0 / nonterm_size) {} -  prob_t operator()(const TRule& r) const { -    prob_t p; p.logeq(Md::log_poisson(1.0, r.f_.size())); -    const prob_t term_prob((2.0 + 0.01*r.f_.size()) / (r.f_.size() + 2)); -    const prob_t nonterm_prob(1.0 - term_prob.as_float()); -    for (unsigned i = 0; i < r.f_.size(); ++i) { -      if (r.f_[i] <= 0) {     // nonterminal -        if (kALLOW_MIXED) p *= nonterm_prob; -        p *= unif_nonterm; -      } else {                // terminal -        if (kALLOW_MIXED) p *= term_prob; -        p *= unif_term; -      } -    } -    return p; -  } -  const prob_t unif_term, unif_nonterm; -}; - -struct HieroLMModel { -  explicit HieroLMModel(unsigned vocab_size, unsigned num_nts = 1) : -      base(vocab_size, num_nts), -      q0(1,1,1,1), -      nts(num_nts, CCRP<TRule>(1,1,1,1)) {} - -  prob_t Prob(const TRule& r) const { -    return nts[nt_id_to_index[-r.lhs_]].prob(r, p0(r)); -  } - -  inline prob_t p0(const TRule& r) const { -    if (kHIERARCHICAL_PRIOR) -      return q0.prob(r, base(r)); -    else -      return base(r); -  } - -  int Increment(const TRule& r, MT19937* rng) { -    const int delta = nts[nt_id_to_index[-r.lhs_]].increment(r, p0(r), rng); -    if (kHIERARCHICAL_PRIOR && delta) -      q0.increment(r, base(r), rng); -    return delta; -    // return x.increment(r); -  } - -  int Decrement(const TRule& r, MT19937* rng) { -    const int delta = nts[nt_id_to_index[-r.lhs_]].decrement(r, rng); -    if (kHIERARCHICAL_PRIOR && delta) -      q0.decrement(r, rng); -    return delta; -    //return x.decrement(r); -  } - -  prob_t Likelihood() const { -    prob_t p = prob_t::One(); -    for (unsigned i = 0; i < nts.size(); ++i) { -      prob_t q; q.logeq(nts[i].log_crp_prob()); -      p *= q; -      for (CCRP<TRule>::const_iterator it = nts[i].begin(); it != nts[i].end(); ++it) { -        prob_t tp = p0(it->first); -        tp.poweq(it->second.num_tables()); -        p *= tp; -      } -    } -    if (kHIERARCHICAL_PRIOR) { -      prob_t q; q.logeq(q0.log_crp_prob()); -      p *= q; -      for (CCRP<TRule>::const_iterator it = q0.begin(); it != q0.end(); ++it) { -        prob_t tp = base(it->first); -        tp.poweq(it->second.num_tables()); -        p *= tp; -      } -    } -    //for (CCRP_OneTable<TRule>::const_iterator it = x.begin(); it != x.end(); ++it) -    //    p *= base(it->first); -    return p; -  } - -  void ResampleHyperparameters(MT19937* rng) { -    for (unsigned i = 0; i < nts.size(); ++i) -      nts[i].resample_hyperparameters(rng); -    if (kHIERARCHICAL_PRIOR) { -      q0.resample_hyperparameters(rng); -      cerr << "[base d=" << q0.discount() << ", s=" << q0.strength() << "]"; -    } -    cerr << " d=" << nts[0].discount() << ", s=" << nts[0].strength() << endl; -  } - -  const BaseRuleModel base; -  CCRP<TRule> q0; -  vector<CCRP<TRule> > nts; -  //CCRP_OneTable<TRule> x; -}; - -vector<GrammarIter* > tofreelist; - -HieroLMModel* plm; - -struct NPGrammarIter : public GrammarIter, public RuleBin { -  NPGrammarIter() : arity() { tofreelist.push_back(this); } -  NPGrammarIter(const TRulePtr& inr, const int a, int symbol) : arity(a) { -    if (inr) { -      r.reset(new TRule(*inr)); -    } else { -      r.reset(new TRule); -    } -    TRule& rr = *r; -    rr.lhs_ = nt_vocab[0]; -    rr.f_.push_back(symbol); -    rr.e_.push_back(symbol < 0 ? 
(1-int(arity)) : symbol); -    tofreelist.push_back(this); -  } -  inline static unsigned NextArity(int cur_a, int symbol) { -    return cur_a + (symbol <= 0 ? 1 : 0); -  } -  virtual int GetNumRules() const { -    if (r) return nt_vocab.size(); else return 0; -  } -  virtual TRulePtr GetIthRule(int i) const { -    if (i == 0) return r; -    TRulePtr nr(new TRule(*r)); -    nr->lhs_ = nt_vocab[i]; -    return nr; -  } -  virtual int Arity() const { -    return arity; -  } -  virtual const RuleBin* GetRules() const { -    if (!r) return NULL; else return this; -  } -  virtual const GrammarIter* Extend(int symbol) const { -    const int next_arity = NextArity(arity, symbol); -    if (kMAX_ARITY && next_arity > kMAX_ARITY) -      return NULL; -    if (!kALLOW_MIXED && r) { -      bool t1 = r->f_.front() <= 0; -      bool t2 = symbol <= 0; -      if (t1 != t2) return NULL; -    } -    if (!kMAX_RULE_SIZE || !r || (r->f_.size() < kMAX_RULE_SIZE)) -      return new NPGrammarIter(r, next_arity, symbol); -    else -      return NULL; -  } -  const unsigned char arity; -  TRulePtr r; -}; - -struct NPGrammar : public Grammar { -  virtual const GrammarIter* GetRoot() const { -    return new NPGrammarIter; -  } -}; - -prob_t TotalProb(const Hypergraph& hg) { -  return Inside<prob_t, EdgeProb>(hg); -} - -void SampleDerivation(const Hypergraph& hg, MT19937* rng, vector<unsigned>* sampled_deriv) { -  vector<prob_t> node_probs; -  Inside<prob_t, EdgeProb>(hg, &node_probs); -  queue<unsigned> q; -  q.push(hg.nodes_.size() - 2); -  while(!q.empty()) { -    unsigned cur_node_id = q.front(); -//    cerr << "NODE=" << cur_node_id << endl; -    q.pop(); -    const Hypergraph::Node& node = hg.nodes_[cur_node_id]; -    const unsigned num_in_edges = node.in_edges_.size(); -    unsigned sampled_edge = 0; -    if (num_in_edges == 1) { -      sampled_edge = node.in_edges_[0]; -    } else { -      //prob_t z; -      assert(num_in_edges > 1); -      SampleSet<prob_t> ss; -      for (unsigned j = 0; j < num_in_edges; ++j) { -        const Hypergraph::Edge& edge = hg.edges_[node.in_edges_[j]]; -        prob_t p = edge.edge_prob_; -        for (unsigned k = 0; k < edge.tail_nodes_.size(); ++k) -          p *= node_probs[edge.tail_nodes_[k]]; -        ss.add(p); -//        cerr << log(ss[j]) << " ||| " << edge.rule_->AsString() << endl; -        //z += p; -      } -//      for (unsigned j = 0; j < num_in_edges; ++j) { -//        const Hypergraph::Edge& edge = hg.edges_[node.in_edges_[j]]; -//        cerr << exp(log(ss[j] / z)) << " ||| " << edge.rule_->AsString() << endl; -//      } -//      cerr << " --- \n"; -      sampled_edge = node.in_edges_[rng->SelectSample(ss)]; -    } -    sampled_deriv->push_back(sampled_edge); -    const Hypergraph::Edge& edge = hg.edges_[sampled_edge]; -    for (unsigned j = 0; j < edge.tail_nodes_.size(); ++j) { -      q.push(edge.tail_nodes_[j]); -    } -  } -  for (unsigned i = 0; i < sampled_deriv->size(); ++i) { -    cerr << *hg.edges_[(*sampled_deriv)[i]].rule_ << endl; -  } -} - -void IncrementDerivation(const Hypergraph& hg, const vector<unsigned>& d, HieroLMModel* plm, MT19937* rng) { -  for (unsigned i = 0; i < d.size(); ++i) -    plm->Increment(*hg.edges_[d[i]].rule_, rng); -} - -void DecrementDerivation(const Hypergraph& hg, const vector<unsigned>& d, HieroLMModel* plm, MT19937* rng) { -  for (unsigned i = 0; i < d.size(); ++i) -    plm->Decrement(*hg.edges_[d[i]].rule_, rng); -} - -int main(int argc, char** argv) { -  po::variables_map conf; - -  InitCommandLine(argc, argv, &conf); 
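-  // learn_cfg Gibbs-samples a CRP-distributed CFG: each sweep exhaustively
-  // re-parses every sentence under NPGrammar, weights each hyperedge by the
-  // current CRP probability of its rule, draws a new derivation top-down in
-  // proportion to inside scores (SampleDerivation above), and increments the
-  // drawn rules back into the model.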
-  nt_vocab.resize(conf["nonterminals"].as<unsigned>()); -  assert(nt_vocab.size() > 0); -  assert(nt_vocab.size() < 26); -  { -    string nt = "X"; -    for (unsigned i = 0; i < nt_vocab.size(); ++i) { -      if (nt_vocab.size() > 1) nt[0] = ('A' + i); -      int pid = TD::Convert(nt); -      nt_vocab[i] = -pid; -      if (pid >= nt_id_to_index.size()) { -        nt_id_to_index.resize(pid + 1, -1); -      } -      nt_id_to_index[pid] = i; -    } -  } -  vector<GrammarPtr> grammars; -  grammars.push_back(GrammarPtr(new NPGrammar)); - -  const unsigned samples = conf["samples"].as<unsigned>(); -  kMAX_RULE_SIZE = conf["max_rule_size"].as<unsigned>(); -  if (kMAX_RULE_SIZE == 1) { -    cerr << "Invalid maximum rule size: must be 0 or >1\n"; -    return 1; -  } -  kMAX_ARITY = conf["max_arity"].as<unsigned>(); -  if (kMAX_ARITY == 1) { -    cerr << "Invalid maximum arity: must be 0 or >1\n"; -    return 1; -  } -  kALLOW_MIXED = !conf.count("no_mixed_rules"); - -  kHIERARCHICAL_PRIOR = conf.count("hierarchical_prior"); - -  if (conf.count("random_seed")) -    prng.reset(new MT19937(conf["random_seed"].as<uint32_t>())); -  else -    prng.reset(new MT19937); -  MT19937& rng = *prng; -  vector<vector<WordID> > corpuse; -  set<WordID> vocabe; -  cerr << "Reading corpus...\n"; -  const unsigned toks = ReadCorpus(conf["input"].as<string>(), &corpuse, &vocabe); -  cerr << "E-corpus size: " << corpuse.size() << " sentences\t (" << vocabe.size() << " word types)\n"; -  HieroLMModel lm(vocabe.size(), nt_vocab.size()); - -  plm = &lm; -  ExhaustiveBottomUpParser parser(TD::Convert(-nt_vocab[0]), grammars); - -  Hypergraph hg; -  const int kGoal = -TD::Convert("Goal"); -  const int kLP = FD::Convert("LogProb"); -  SparseVector<double> v; v.set_value(kLP, 1.0); -  vector<vector<unsigned> > derivs(corpuse.size()); -  vector<Lattice> cl(corpuse.size()); -  for (int ci = 0; ci < corpuse.size(); ++ci) { -    vector<int>& src = corpuse[ci]; -    Lattice& lat = cl[ci]; -    lat.resize(src.size()); -    for (unsigned i = 0; i < src.size(); ++i) -      lat[i].push_back(LatticeArc(src[i], 0.0, 1)); -  } -  for (int SS=0; SS < samples; ++SS) { -    const bool is_last = ((samples - 1) == SS); -    prob_t dlh = prob_t::One(); -    for (int ci = 0; ci < corpuse.size(); ++ci) { -      const vector<int>& src = corpuse[ci]; -      const Lattice& lat = cl[ci]; -      cerr << TD::GetString(src) << endl; -      hg.clear(); -      parser.Parse(lat, &hg);  // exhaustive parse -      vector<unsigned>& d = derivs[ci]; -      if (!is_last) DecrementDerivation(hg, d, &lm, &rng); -      for (unsigned i = 0; i < hg.edges_.size(); ++i) { -        TRule& r = *hg.edges_[i].rule_; -        if (r.lhs_ == kGoal) -          hg.edges_[i].edge_prob_ = prob_t::One(); -        else -          hg.edges_[i].edge_prob_ = lm.Prob(r); -      } -      if (!is_last) { -        d.clear(); -        SampleDerivation(hg, &rng, &d); -        IncrementDerivation(hg, derivs[ci], &lm, &rng); -      } else { -        prob_t p = TotalProb(hg); -        dlh *= p; -        cerr << " p(sentence) = " << log(p) << "\t" << log(dlh) << endl; -      } -      if (tofreelist.size() > 200000) { -        cerr << "Freeing ... 
"; -        for (unsigned i = 0; i < tofreelist.size(); ++i) -          delete tofreelist[i]; -        tofreelist.clear(); -        cerr << "Freed.\n"; -      } -    } -    double llh = log(lm.Likelihood()); -    cerr << "LLH=" << llh << "\tENTROPY=" << (-llh / log(2) / toks) << "\tPPL=" << pow(2, -llh / log(2) / toks) << endl; -    if (SS % 10 == 9) lm.ResampleHyperparameters(&rng); -    if (is_last) { -      double z = log(dlh); -      cerr << "TOTAL_PROB=" << z << "\tENTROPY=" << (-z / log(2) / toks) << "\tPPL=" << pow(2, -z / log(2) / toks) << endl; -    } -  } -  for (unsigned i = 0; i < nt_vocab.size(); ++i) -    cerr << lm.nts[i] << endl; -  return 0; -} - diff --git a/gi/pf/make-freq-bins.pl b/gi/pf/make-freq-bins.pl deleted file mode 100755 index fdcd3555..00000000 --- a/gi/pf/make-freq-bins.pl +++ /dev/null @@ -1,26 +0,0 @@ -#!/usr/bin/perl -w -use strict; - -my $BASE = 6; -my $CUTOFF = 3; - -my %d; -my $num = 0; -while(<>){ - chomp; - my @words = split /\s+/; - for my $w (@words) {$d{$w}++; $num++;} -} - -my @vocab = sort {$d{$b} <=> $d{$a}} keys %d; - -for (my $i=0; $i<scalar @vocab; $i++) { -  my $most = $d{$vocab[$i]}; -  my $least = 1; - -  my $nl = -int(log($most / $num) / log($BASE) + $CUTOFF); -  if ($nl < 0) { $nl = 0; } -  print "$vocab[$i] $nl\n" -} - - diff --git a/gi/pf/mh_test.cc b/gi/pf/mh_test.cc deleted file mode 100644 index 296e7285..00000000 --- a/gi/pf/mh_test.cc +++ /dev/null @@ -1,148 +0,0 @@ -#include "ccrp.h" - -#include <vector> -#include <iostream> - -#include "tdict.h" -#include "transliterations.h" - -using namespace std; - -MT19937 rng; - -static bool verbose = false; - -struct Model { - -  Model() : bp(), base(0.2, 0.6) , ccrps(5, CCRP<int>(0.8, 0.5)) {} - -  double p0(int x) const { -    assert(x > 0); -    assert(x < 5); -    return 1.0/4.0; -  } - -  double llh() const { -    double lh = bp + base.log_crp_prob(); -    for (int ctx = 1; ctx < 5; ++ctx) -      lh += ccrps[ctx].log_crp_prob(); -    return lh; -  } - -  double prob(int ctx, int x) const { -    assert(ctx > 0 && ctx < 5); -    return ccrps[ctx].prob(x, base.prob(x, p0(x))); -  } - -  void increment(int ctx, int x) { -    assert(ctx > 0 && ctx < 5); -    if (ccrps[ctx].increment(x, base.prob(x, p0(x)), &rng)) { -      if (base.increment(x, p0(x), &rng)) { -        bp += log(1.0 / 4.0); -      } -    } -  } - -  // this is just a biased estimate -  double est_base_prob(int x) { -    return (x + 1) * x / 40.0; -  } - -  void increment_is(int ctx, int x) { -    assert(ctx > 0 && ctx < 5); -    SampleSet<double> ss; -    const int PARTICLES = 25; -    vector<CCRP<int> > s1s(PARTICLES, CCRP<int>(0.5,0.5)); -    vector<CCRP<int> > sbs(PARTICLES, CCRP<int>(0.5,0.5)); -    vector<double> sp0s(PARTICLES); - -    CCRP<int> s1 = ccrps[ctx]; -    CCRP<int> sb = base; -    double sp0 = bp; -    for (int pp = 0; pp < PARTICLES; ++pp) { -      if (pp > 0) { -        ccrps[ctx] = s1; -        base = sb; -        bp = sp0; -      } - -      double q = 1; -      double gamma = 1; -      double est_p = est_base_prob(x); -      //base.prob(x, p0(x)) + rng.next() * 0.1; -      if (ccrps[ctx].increment(x, est_p, &rng, &q)) { -        gamma = q * base.prob(x, p0(x)); -        q *= est_p; -        if (verbose) cerr << "(DP-base draw) "; -        double qq = -1; -        if (base.increment(x, p0(x), &rng, &qq)) { -          if (verbose) cerr << "(G0 draw) "; -          bp += log(p0(x)); -          qq *= p0(x); -        } -      } else { gamma = q; } -      double w = gamma / q; -      if (verbose) -        
cerr << "gamma=" << gamma << " q=" << q << "\tw=" << w << endl; -      ss.add(w); -      s1s[pp] = ccrps[ctx]; -      sbs[pp] = base; -      sp0s[pp] = bp; -    } -    int ps = rng.SelectSample(ss); -    ccrps[ctx] = s1s[ps]; -    base = sbs[ps]; -    bp = sp0s[ps]; -    if (verbose) { -      cerr << "SELECTED: " << ps << endl; -      static int cc = 0; cc++; if (cc ==10) exit(1); -    } -  } - -  void decrement(int ctx, int x) { -    assert(ctx > 0 && ctx < 5); -    if (ccrps[ctx].decrement(x, &rng)) { -      if (base.decrement(x, &rng)) { -        bp -= log(p0(x)); -      } -    } -  } - -  double bp; -  CCRP<int> base; -  vector<CCRP<int> > ccrps; - -}; - -int main(int argc, char** argv) { -  if (argc > 1) { verbose = true; } -  vector<int> counts(15, 0); -  vector<int> tcounts(15, 0); -  int points[] = {1,2, 2,2, 3,2, 4,1, 3, 4, 3, 3, 2, 3, 4, 1, 4, 1, 3, 2, 1, 3, 1, 4, 0, 0}; -  double tlh = 0; -  double tt = 0; -  for (int n = 0; n < 1000; ++n) { -    if (n % 10 == 0) cerr << '.'; -    if ((n+1) % 400 == 0) cerr << " [" << (n+1) << "]\n"; -    Model m; -    for (int *x = points; *x; x += 2) -      m.increment(x[0], x[1]); - -    for (int j = 0; j < 24; ++j) { -      for (int *x = points; *x; x += 2) { -        if (rng.next() < 0.8) { -          m.decrement(x[0], x[1]); -          m.increment_is(x[0], x[1]); -        } -      } -    } -    counts[m.base.num_customers()]++; -    tcounts[m.base.num_tables()]++; -    tlh += m.llh(); -    tt += 1.0; -  } -  cerr << "mean LLH = " << (tlh / tt) << endl; -  for (int i = 0; i < 15; ++i) -    cerr << i << ": " << (counts[i] / tt) << "\t" << (tcounts[i] / tt) << endl; -} - diff --git a/gi/pf/monotonic_pseg.h b/gi/pf/monotonic_pseg.h deleted file mode 100644 index 10d171fe..00000000 --- a/gi/pf/monotonic_pseg.h +++ /dev/null @@ -1,89 +0,0 @@ -#ifndef _MONOTONIC_PSEG_H_ -#define _MONOTONIC_PSEG_H_ - -#include <vector> - -#include "prob.h" -#include "ccrp_nt.h" -#include "trule.h" -#include "base_distributions.h" - -template <typename BaseMeasure> -struct MonotonicParallelSegementationModel { -  explicit MonotonicParallelSegementationModel(BaseMeasure& rcp0) : -    rp0(rcp0), base(prob_t::One()), rules(1,1), stop(1.0) {} - -  void DecrementRule(const TRule& rule) { -    if (rules.decrement(rule)) -      base /= rp0(rule); -  } - -  void IncrementRule(const TRule& rule) { -    if (rules.increment(rule)) -      base *= rp0(rule); -  } - -  void IncrementRulesAndStops(const std::vector<TRulePtr>& rules) { -    for (int i = 0; i < rules.size(); ++i) -      IncrementRule(*rules[i]); -    if (rules.size()) IncrementContinue(rules.size() - 1); -    IncrementStop(); -  } - -  void DecrementRulesAndStops(const std::vector<TRulePtr>& rules) { -    for (int i = 0; i < rules.size(); ++i) -      DecrementRule(*rules[i]); -    if (rules.size()) { -      DecrementContinue(rules.size() - 1); -      DecrementStop(); -    } -  } - -  prob_t RuleProbability(const TRule& rule) const { -    prob_t p; p.logeq(rules.logprob(rule, log(rp0(rule)))); -    return p; -  } - -  prob_t Likelihood() const { -    prob_t p = base; -    prob_t q; q.logeq(rules.log_crp_prob()); -    p *= q; -    q.logeq(stop.log_crp_prob()); -    p *= q; -    return p; -  } - -  void IncrementStop() { -    stop.increment(true); -  } - -  void IncrementContinue(int n = 1) { -    for (int i = 0; i < n; ++i) -      stop.increment(false); -  } - -  void DecrementStop() { -    stop.decrement(true); -  } - -  void DecrementContinue(int n = 1) { -    for (int i = 0; i < n; ++i) -      
stop.decrement(false); -  } - -  prob_t StopProbability() const { -    return prob_t(stop.prob(true, 0.5)); -  } - -  prob_t ContinueProbability() const { -    return prob_t(stop.prob(false, 0.5)); -  } - -  const BaseMeasure& rp0; -  prob_t base; -  CCRP_NoTable<TRule> rules; -  CCRP_NoTable<bool> stop; -}; - -#endif - diff --git a/gi/pf/ngram_base.cc b/gi/pf/ngram_base.cc deleted file mode 100644 index 1299f06f..00000000 --- a/gi/pf/ngram_base.cc +++ /dev/null @@ -1,69 +0,0 @@ -#include "ngram_base.h" - -#include "lm/model.hh" -#include "tdict.h" - -using namespace std; - -namespace { -struct GICSVMapper : public lm::EnumerateVocab { -  GICSVMapper(vector<lm::WordIndex>* out) : out_(out), kLM_UNKNOWN_TOKEN(0) { out_->clear(); } -  void Add(lm::WordIndex index, const StringPiece &str) { -    const WordID cdec_id = TD::Convert(str.as_string()); -    if (cdec_id >= out_->size()) -      out_->resize(cdec_id + 1, kLM_UNKNOWN_TOKEN); -    (*out_)[cdec_id] = index; -  } -  vector<lm::WordIndex>* out_; -  const lm::WordIndex kLM_UNKNOWN_TOKEN; -}; -} - -struct FixedNgramBaseImpl { -  FixedNgramBaseImpl(const string& param) { -    GICSVMapper vm(&cdec2klm_map_); -    lm::ngram::Config conf; -    conf.enumerate_vocab = &vm; -    cerr << "Reading character LM from " << param << endl; -    model = new lm::ngram::ProbingModel(param.c_str(), conf); -    order = model->Order(); -    kEOS = MapWord(TD::Convert("</s>")); -    assert(kEOS > 0); -  } - -  lm::WordIndex MapWord(const WordID w) const { -    if (w < cdec2klm_map_.size()) return cdec2klm_map_[w]; -    return 0; -  } - -  ~FixedNgramBaseImpl() { delete model; } - -  prob_t StringProbability(const vector<WordID>& s) const { -    lm::ngram::State state = model->BeginSentenceState(); -    double prob = 0; -    for (unsigned i = 0; i < s.size(); ++i) { -      const lm::ngram::State scopy(state); -      prob += model->Score(scopy, MapWord(s[i]), state); -    } -    const lm::ngram::State scopy(state); -    prob += model->Score(scopy, kEOS, state); -    prob_t p; p.logeq(prob * log(10)); -    return p; -  } - -  lm::ngram::ProbingModel* model; -  unsigned order; -  vector<lm::WordIndex> cdec2klm_map_; -  lm::WordIndex kEOS; -}; - -FixedNgramBase::~FixedNgramBase() { delete impl; } - -FixedNgramBase::FixedNgramBase(const string& lmfname) { -  impl = new FixedNgramBaseImpl(lmfname); -} - -prob_t FixedNgramBase::StringProbability(const vector<WordID>& s) const { -  return impl->StringProbability(s); -} - diff --git a/gi/pf/ngram_base.h b/gi/pf/ngram_base.h deleted file mode 100644 index 4ea999f3..00000000 --- a/gi/pf/ngram_base.h +++ /dev/null @@ -1,25 +0,0 @@ -#ifndef _NGRAM_BASE_H_ -#define _NGRAM_BASE_H_ - -#include <string> -#include <vector> -#include "trule.h" -#include "wordid.h" -#include "prob.h" - -struct FixedNgramBaseImpl; -struct FixedNgramBase { -  FixedNgramBase(const std::string& lmfname); -  ~FixedNgramBase(); -  prob_t StringProbability(const std::vector<WordID>& s) const; - -  prob_t operator()(const TRule& rule) const { -    return StringProbability(rule.e_); -  } - - private: -  FixedNgramBaseImpl* impl; - -}; - -#endif diff --git a/gi/pf/nuisance_test.cc b/gi/pf/nuisance_test.cc deleted file mode 100644 index fc0af9cb..00000000 --- a/gi/pf/nuisance_test.cc +++ /dev/null @@ -1,161 +0,0 @@ -#include "ccrp.h" - -#include <vector> -#include <iostream> - -#include "tdict.h" -#include "transliterations.h" - -using namespace std; - -MT19937 rng; - -ostream& operator<<(ostream&os, const vector<int>& v) { -  os << '[' << v[0]; -  if 
(v.size() == 2) os << ' ' << v[1]; -  return os << ']'; -} - -struct Base { -  Base() : llh(), v(2), v1(1), v2(1), crp(0.25, 0.5) {} -  inline double p0(const vector<int>& x) const { -    double p = 0.75; -    if (x.size() == 2) p = 0.25; -    p *= 1.0 / 3.0; -    if (x.size() == 2) p *= 1.0 / 3.0; -    return p; -  } -  double est_deriv_prob(int a, int b, int seg) const { -    assert(a > 0 && a < 4);  // a \in {1,2,3} -    assert(b > 0 && b < 4);  // b \in {1,2,3} -    assert(seg == 0 || seg == 1);   // seg \in {0,1} -    if (seg == 0) { -      v[0] = a; -      v[1] = b; -      return crp.prob(v, p0(v)); -    } else { -      v1[0] = a; -      v2[0] = b; -      return crp.prob(v1, p0(v1)) * crp.prob(v2, p0(v2)); -    } -  } -  double est_marginal_prob(int a, int b) const { -    return est_deriv_prob(a,b,0) + est_deriv_prob(a,b,1); -  } -  int increment(int a, int b, double* pw = NULL) { -    double p1 = est_deriv_prob(a, b, 0); -    double p2 = est_deriv_prob(a, b, 1); -    //p1 = 0.5; p2 = 0.5; -    int seg = rng.SelectSample(p1,p2); -    double tmp = 0; -    if (!pw) pw = &tmp; -    double& w = *pw; -    if (seg == 0) { -      v[0] = a; -      v[1] = b; -      w = crp.prob(v, p0(v)) / p1; -      if (crp.increment(v, p0(v), &rng)) { -        llh += log(p0(v)); -      } -    } else { -      v1[0] = a; -      w = crp.prob(v1, p0(v1)) / p2; -      if (crp.increment(v1, p0(v1), &rng)) { -        llh += log(p0(v1)); -      } -      v2[0] = b; -      w *= crp.prob(v2, p0(v2)); -      if (crp.increment(v2, p0(v2), &rng)) { -        llh += log(p0(v2)); -      } -    } -    return seg; -  } -  void increment(int a, int b, int seg) { -    if (seg == 0) { -      v[0] = a; -      v[1] = b; -      if (crp.increment(v, p0(v), &rng)) { -        llh += log(p0(v)); -      } -    } else { -      v1[0] = a; -      if (crp.increment(v1, p0(v1), &rng)) { -        llh += log(p0(v1)); -      } -      v2[0] = b; -      if (crp.increment(v2, p0(v2), &rng)) { -        llh += log(p0(v2)); -      } -    } -  } -  void decrement(int a, int b, int seg) { -    if (seg == 0) { -      v[0] = a; -      v[1] = b; -      if (crp.decrement(v, &rng)) { -        llh -= log(p0(v)); -      } -    } else { -      v1[0] = a; -      if (crp.decrement(v1, &rng)) { -        llh -= log(p0(v1)); -      } -      v2[0] = b; -      if (crp.decrement(v2, &rng)) { -        llh -= log(p0(v2)); -      } -    } -  } -  double log_likelihood() const { -    return llh + crp.log_crp_prob(); -  } -  double llh; -  mutable vector<int> v, v1, v2; -  CCRP<vector<int> > crp; -}; - -int main(int argc, char** argv) { -  double tl = 0; -  const int ITERS = 1000; -  const int PARTICLES = 20; -  const int DATAPOINTS = 50; -  WordID x = TD::Convert("souvenons"); -  WordID y = TD::Convert("remember"); -  vector<WordID> src; TD::ConvertSentence("s o u v e n o n s", &src); -  vector<WordID> trg; TD::ConvertSentence("r e m e m b e r", &trg); -//  Transliterations xx; -//  xx.Initialize(x, src, y, trg); -//  return 1; - - for (int j = 0; j < ITERS; ++j) { -  Base b; -  vector<int> segs(DATAPOINTS); -  SampleSet<double> ss; -  vector<int> sss; -  for (int i = 0; i < DATAPOINTS; i++) { -    ss.clear(); -    sss.clear(); -    int x = ((i / 10) % 3) + 1; -    int y = (i % 3) + 1; -    //double ep = b.est_marginal_prob(x,y); -    //cerr << "est p(" << x << "," << y << ") = " << ep << endl; -    for (int n = 0; n < PARTICLES; ++n) { -      double w; -      int seg = b.increment(x,y,&w); -      //cerr << seg << " w=" << w << endl; -      ss.add(w); -      
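-      // record this particle's importance weight and sampled segmentation,
-      // then decrement so every particle proposes from the same CRP state;
-      // below, one of the PARTICLES proposals is committed, chosen with
-      // probability proportional to its weight w.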
sss.push_back(seg); -      b.decrement(x,y,seg); -    } -    int seg = sss[rng.SelectSample(ss)]; -    b.increment(x, y, seg); -    //cerr << "Selected: " << seg << endl; -    //return 1; -    segs[i] = seg; -  } -  tl += b.log_likelihood(); - } -  cerr << "LLH=" << tl / ITERS << endl; -} - diff --git a/gi/pf/os_phrase.h b/gi/pf/os_phrase.h deleted file mode 100644 index dfe40cb1..00000000 --- a/gi/pf/os_phrase.h +++ /dev/null @@ -1,15 +0,0 @@ -#ifndef _OS_PHRASE_H_ -#define _OS_PHRASE_H_ - -#include <iostream> -#include <vector> -#include "tdict.h" - -inline std::ostream& operator<<(std::ostream& os, const std::vector<WordID>& p) { -  os << '['; -  for (int i = 0; i < p.size(); ++i) -    os << (i==0 ? "" : " ") << TD::Convert(p[i]); -  return os << ']'; -} - -#endif diff --git a/gi/pf/pf.h b/gi/pf/pf.h deleted file mode 100644 index ede7cda8..00000000 --- a/gi/pf/pf.h +++ /dev/null @@ -1,84 +0,0 @@ -#ifndef _PF_H_ -#define _PF_H_ - -#include <cassert> -#include <vector> -#include "sampler.h" -#include "prob.h" - -template <typename ParticleType> -struct ParticleRenormalizer { -  void operator()(std::vector<ParticleType>* pv) const { -    if (pv->empty()) return; -    prob_t z = prob_t::Zero(); -    for (unsigned i = 0; i < pv->size(); ++i) -      z += (*pv)[i].weight; -    assert(z > prob_t::Zero()); -    for (unsigned i = 0; i < pv->size(); ++i) -      (*pv)[i].weight /= z; -  } -}; - -template <typename ParticleType> -struct MultinomialResampleFilter { -  explicit MultinomialResampleFilter(MT19937* rng) : rng_(rng) {} - -  void operator()(std::vector<ParticleType>* pv) { -    if (pv->empty()) return; -    std::vector<ParticleType>& ps = *pv; -    SampleSet<prob_t> ss; -    for (int i = 0; i < ps.size(); ++i) -      ss.add(ps[i].weight); -    std::vector<ParticleType> nps; nps.reserve(ps.size()); -    const prob_t uniform_weight(1.0 / ps.size()); -    for (int i = 0; i < ps.size(); ++i) { -      nps.push_back(ps[rng_->SelectSample(ss)]); -      nps[i].weight = uniform_weight; -    } -    nps.swap(ps); -  } - - private: -  MT19937* rng_; -}; - -template <typename ParticleType> -struct SystematicResampleFilter { -  explicit SystematicResampleFilter(MT19937* rng) : rng_(rng), renorm_() {} - -  void operator()(std::vector<ParticleType>* pv) { -    if (pv->empty()) return; -    renorm_(pv); -    std::vector<ParticleType>& ps = *pv; -    std::vector<ParticleType> nps; nps.reserve(ps.size()); -    double lower = 0, upper = 0; -    const double skip = 1.0 / ps.size(); -    double u_j = rng_->next() * skip; -    //std::cerr << "u_0: " << u_j << std::endl; -    int j = 0; -    for (unsigned i = 0; i < ps.size(); ++i) { -      upper += ps[i].weight.as_float(); -      //std::cerr << "lower: " << lower << " upper: " << upper << std::endl; -      // how many children does ps[i] have? 
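-      // systematic resampling: a single uniform u_0 ~ U(0, 1/N) is advanced
-      // in steps of skip = 1/N, and particle i is copied once for each
-      // stratum point u_j falling in its cumulative-weight interval
-      // [lower, upper], so its offspring count stays within one of
-      // N * weight_i -- lower variance than multinomial resampling.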
-      while (u_j < lower) { u_j += skip; ++j; } -      while (u_j >= lower && u_j <= upper) { -        assert(j < ps.size()); -        nps.push_back(ps[i]); -        u_j += skip; -        //std::cerr << " add u_j=" << u_j << std::endl; -        ++j; -      } -      lower = upper; -    } -    //std::cerr << ps.size() << " " << nps.size() << "\n"; -    assert(ps.size() == nps.size()); -    //exit(1); -    ps.swap(nps); -  } - - private: -  MT19937* rng_; -  ParticleRenormalizer<ParticleType> renorm_; -}; - -#endif diff --git a/gi/pf/pf_test.cc b/gi/pf/pf_test.cc deleted file mode 100644 index 296e7285..00000000 --- a/gi/pf/pf_test.cc +++ /dev/null @@ -1,148 +0,0 @@ -#include "ccrp.h" - -#include <vector> -#include <iostream> - -#include "tdict.h" -#include "transliterations.h" - -using namespace std; - -MT19937 rng; - -static bool verbose = false; - -struct Model { - -  Model() : bp(), base(0.2, 0.6) , ccrps(5, CCRP<int>(0.8, 0.5)) {} - -  double p0(int x) const { -    assert(x > 0); -    assert(x < 5); -    return 1.0/4.0; -  } - -  double llh() const { -    double lh = bp + base.log_crp_prob(); -    for (int ctx = 1; ctx < 5; ++ctx) -      lh += ccrps[ctx].log_crp_prob(); -    return lh; -  } - -  double prob(int ctx, int x) const { -    assert(ctx > 0 && ctx < 5); -    return ccrps[ctx].prob(x, base.prob(x, p0(x))); -  } - -  void increment(int ctx, int x) { -    assert(ctx > 0 && ctx < 5); -    if (ccrps[ctx].increment(x, base.prob(x, p0(x)), &rng)) { -      if (base.increment(x, p0(x), &rng)) { -        bp += log(1.0 / 4.0); -      } -    } -  } - -  // this is just a biased estimate -  double est_base_prob(int x) { -    return (x + 1) * x / 40.0; -  } - -  void increment_is(int ctx, int x) { -    assert(ctx > 0 && ctx < 5); -    SampleSet<double> ss; -    const int PARTICLES = 25; -    vector<CCRP<int> > s1s(PARTICLES, CCRP<int>(0.5,0.5)); -    vector<CCRP<int> > sbs(PARTICLES, CCRP<int>(0.5,0.5)); -    vector<double> sp0s(PARTICLES); - -    CCRP<int> s1 = ccrps[ctx]; -    CCRP<int> sb = base; -    double sp0 = bp; -    for (int pp = 0; pp < PARTICLES; ++pp) { -      if (pp > 0) { -        ccrps[ctx] = s1; -        base = sb; -        bp = sp0; -      } - -      double q = 1; -      double gamma = 1; -      double est_p = est_base_prob(x); -      //base.prob(x, p0(x)) + rng.next() * 0.1; -      if (ccrps[ctx].increment(x, est_p, &rng, &q)) { -        gamma = q * base.prob(x, p0(x)); -        q *= est_p; -        if (verbose) cerr << "(DP-base draw) "; -        double qq = -1; -        if (base.increment(x, p0(x), &rng, &qq)) { -          if (verbose) cerr << "(G0 draw) "; -          bp += log(p0(x)); -          qq *= p0(x); -        } -      } else { gamma = q; } -      double w = gamma / q; -      if (verbose) -        cerr << "gamma=" << gamma << " q=" << q << "\tw=" << w << endl; -      ss.add(w); -      s1s[pp] = ccrps[ctx]; -      sbs[pp] = base; -      sp0s[pp] = bp; -    } -    int ps = rng.SelectSample(ss); -    ccrps[ctx] = s1s[ps]; -    base = sbs[ps]; -    bp = sp0s[ps]; -    if (verbose) { -      cerr << "SELECTED: " << ps << endl; -      static int cc = 0; cc++; if (cc ==10) exit(1); -    } -  } - -  void decrement(int ctx, int x) { -    assert(ctx > 0 && ctx < 5); -    if (ccrps[ctx].decrement(x, &rng)) { -      if (base.decrement(x, &rng)) { -        bp -= log(p0(x)); -      } -    } -  } - -  double bp; -  CCRP<int> base; -  vector<CCRP<int> > ccrps; - -}; - -int main(int argc, char** argv) { -  if (argc > 1) { verbose = true; } -  vector<int> counts(15, 0); -  
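-  // counts/tcounts histogram the base CRP's customer and table counts over
-  // the restarts below, presumably so the importance-sampled updates
-  // (increment_is) can be sanity-checked against plain Gibbs increments.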
vector<int> tcounts(15, 0); -  int points[] = {1,2, 2,2, 3,2, 4,1, 3, 4, 3, 3, 2, 3, 4, 1, 4, 1, 3, 2, 1, 3, 1, 4, 0, 0}; -  double tlh = 0; -  double tt = 0; -  for (int n = 0; n < 1000; ++n) { -    if (n % 10 == 0) cerr << '.'; -    if ((n+1) % 400 == 0) cerr << " [" << (n+1) << "]\n"; -    Model m; -    for (int *x = points; *x; x += 2) -      m.increment(x[0], x[1]); - -    for (int j = 0; j < 24; ++j) { -      for (int *x = points; *x; x += 2) { -        if (rng.next() < 0.8) { -          m.decrement(x[0], x[1]); -          m.increment_is(x[0], x[1]); -        } -      } -    } -    counts[m.base.num_customers()]++; -    tcounts[m.base.num_tables()]++; -    tlh += m.llh(); -    tt += 1.0; -  } -  cerr << "mean LLH = " << (tlh / tt) << endl; -  for (int i = 0; i < 15; ++i) -    cerr << i << ": " << (counts[i] / tt) << "\t" << (tcounts[i] / tt) << endl; -} - diff --git a/gi/pf/pfbrat.cc b/gi/pf/pfbrat.cc deleted file mode 100644 index 832f22cf..00000000 --- a/gi/pf/pfbrat.cc +++ /dev/null @@ -1,543 +0,0 @@ -#include <iostream> -#include <tr1/memory> -#include <queue> - -#include <boost/functional.hpp> -#include <boost/multi_array.hpp> -#include <boost/program_options.hpp> -#include <boost/program_options/variables_map.hpp> - -#include "viterbi.h" -#include "hg.h" -#include "trule.h" -#include "tdict.h" -#include "filelib.h" -#include "dict.h" -#include "sampler.h" -#include "ccrp_nt.h" -#include "cfg_wfst_composer.h" - -using namespace std; -using namespace tr1; -namespace po = boost::program_options; - -static unsigned kMAX_SRC_PHRASE; -static unsigned kMAX_TRG_PHRASE; -struct FSTState; - -double log_poisson(unsigned x, const double& lambda) { -  assert(lambda > 0.0); -  return log(lambda) * x - lgamma(x + 1) - lambda; -} - -struct ConditionalBase { -  explicit ConditionalBase(const double m1mixture, const unsigned vocab_e_size, const string& model1fname) : -      kM1MIXTURE(m1mixture), -      kUNIFORM_MIXTURE(1.0 - m1mixture), -      kUNIFORM_TARGET(1.0 / vocab_e_size), -      kNULL(TD::Convert("<eps>")) { -    assert(m1mixture >= 0.0 && m1mixture <= 1.0); -    assert(vocab_e_size > 0); -    LoadModel1(model1fname); -  } - -  void LoadModel1(const string& fname) { -    cerr << "Loading Model 1 parameters from " << fname << " ..." 
<< endl; -    ReadFile rf(fname); -    istream& in = *rf.stream(); -    string line; -    unsigned lc = 0; -    while(getline(in, line)) { -      ++lc; -      int cur = 0; -      int start = 0; -      while(cur < line.size() && line[cur] != ' ') { ++cur; } -      assert(cur != line.size()); -      line[cur] = 0; -      const WordID src = TD::Convert(&line[0]); -      ++cur; -      start = cur; -      while(cur < line.size() && line[cur] != ' ') { ++cur; } -      assert(cur != line.size()); -      line[cur] = 0; -      WordID trg = TD::Convert(&line[start]); -      const double logprob = strtod(&line[cur + 1], NULL); -      if (src >= ttable.size()) ttable.resize(src + 1); -      ttable[src][trg].logeq(logprob); -    } -    cerr << "  read " << lc << " parameters.\n"; -  } - -  // return logp0 of rule.e_ | rule.f_ -  prob_t operator()(const TRule& rule) const { -    const int flen = rule.f_.size(); -    const int elen = rule.e_.size(); -    prob_t uniform_src_alignment; uniform_src_alignment.logeq(-log(flen + 1)); -    prob_t p; -    p.logeq(log_poisson(elen, flen + 0.01));       // elen | flen          ~Pois(flen + 0.01) -    for (int i = 0; i < elen; ++i) {               // for each position i in e-RHS -      const WordID trg = rule.e_[i]; -      prob_t tp = prob_t::Zero(); -      for (int j = -1; j < flen; ++j) { -        const WordID src = j < 0 ? kNULL : rule.f_[j]; -        const map<WordID, prob_t>::const_iterator it = ttable[src].find(trg); -        if (it != ttable[src].end()) { -          tp += kM1MIXTURE * it->second; -        } -        tp += kUNIFORM_MIXTURE * kUNIFORM_TARGET; -      } -      tp *= uniform_src_alignment;                 //     draw a_i         ~uniform -      p *= tp;                                     //     draw e_i         ~Model1(f_a_i) / uniform -    } -    return p; -  } - -  const prob_t kM1MIXTURE;  // Model 1 mixture component -  const prob_t kUNIFORM_MIXTURE; // uniform mixture component -  const prob_t kUNIFORM_TARGET; -  const WordID kNULL; -  vector<map<WordID, prob_t> > ttable; -}; - -void InitCommandLine(int argc, char** argv, po::variables_map* conf) { -  po::options_description opts("Configuration options"); -  opts.add_options() -        ("samples,s",po::value<unsigned>()->default_value(1000),"Number of samples") -        ("input,i",po::value<string>(),"Read parallel data from") -        ("max_src_phrase",po::value<unsigned>()->default_value(3),"Maximum length of source language phrases") -        ("max_trg_phrase",po::value<unsigned>()->default_value(3),"Maximum length of target language phrases") -        ("model1,m",po::value<string>(),"Model 1 parameters (used in base distribution)") -        ("model1_interpolation_weight",po::value<double>()->default_value(0.95),"Mixing proportion of model 1 with uniform target distribution") -        ("random_seed,S",po::value<uint32_t>(), "Random seed"); -  po::options_description clo("Command line options"); -  clo.add_options() -        ("config", po::value<string>(), "Configuration file") -        ("help,h", "Print this help message and exit"); -  po::options_description dconfig_options, dcmdline_options; -  dconfig_options.add(opts); -  dcmdline_options.add(opts).add(clo); -   -  po::store(parse_command_line(argc, argv, dcmdline_options), *conf); -  if (conf->count("config")) { -    ifstream config((*conf)["config"].as<string>().c_str()); -    po::store(po::parse_config_file(config, dconfig_options), *conf); -  } -  po::notify(*conf); - -  if (conf->count("help") || (conf->count("input") == 0)) { - 
   cerr << dcmdline_options << endl; -    exit(1); -  } -} - -void ReadParallelCorpus(const string& filename, -                vector<vector<WordID> >* f, -                vector<vector<int> >* e, -                set<int>* vocab_f, -                set<int>* vocab_e) { -  f->clear(); -  e->clear(); -  vocab_f->clear(); -  vocab_e->clear(); -  istream* in; -  if (filename == "-") -    in = &cin; -  else -    in = new ifstream(filename.c_str()); -  assert(*in); -  string line; -  const WordID kDIV = TD::Convert("|||"); -  vector<WordID> tmp; -  while(*in) { -    getline(*in, line); -    if (line.empty() && !*in) break; -    e->push_back(vector<int>()); -    f->push_back(vector<int>()); -    vector<int>& le = e->back(); -    vector<int>& lf = f->back(); -    tmp.clear(); -    TD::ConvertSentence(line, &tmp); -    bool isf = true; -    for (unsigned i = 0; i < tmp.size(); ++i) { -      const int cur = tmp[i]; -      if (isf) { -        if (kDIV == cur) { isf = false; } else { -          lf.push_back(cur); -          vocab_f->insert(cur); -        } -      } else { -        assert(cur != kDIV); -        le.push_back(cur); -        vocab_e->insert(cur); -      } -    } -    assert(isf == false); -  } -  if (in != &cin) delete in; -} - -struct UniphraseLM { -  UniphraseLM(const vector<vector<int> >& corpus, -              const set<int>& vocab, -              const po::variables_map& conf) : -    phrases_(1,1), -    gen_(1,1), -    corpus_(corpus), -    uniform_word_(1.0 / vocab.size()), -    gen_p0_(0.5), -    p_end_(0.5), -    use_poisson_(conf.count("poisson_length") > 0) {} - -  void ResampleHyperparameters(MT19937* rng) { -    phrases_.resample_hyperparameters(rng); -    gen_.resample_hyperparameters(rng); -    cerr << " " << phrases_.alpha(); -  } - -  CCRP_NoTable<vector<int> > phrases_; -  CCRP_NoTable<bool> gen_; -  vector<vector<bool> > z_;   // z_[i] is there a phrase boundary after the ith word -  const vector<vector<int> >& corpus_; -  const double uniform_word_; -  const double gen_p0_; -  const double p_end_; // in base length distribution, p of the end of a phrase -  const bool use_poisson_; -}; - -struct Reachability { -  boost::multi_array<bool, 4> edges;  // edges[src_covered][trg_covered][x][trg_delta] is this edge worth exploring? 
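-  // filled by a forward pass that enumerates every (src,trg) phrase-pair
-  // extension up to the maximum phrase lengths, then pruned by a backward
-  // pass keeping only edges on some path to the final cell (srclen,trglen).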
-  boost::multi_array<short, 2> max_src_delta; // msd[src_covered][trg_covered] -- the largest src delta that's valid - -  Reachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len) : -      edges(boost::extents[srclen][trglen][src_max_phrase_len+1][trg_max_phrase_len+1]), -      max_src_delta(boost::extents[srclen][trglen]) { -    ComputeReachability(srclen, trglen, src_max_phrase_len, trg_max_phrase_len); -  } - - private: -  struct SState { -    SState() : prev_src_covered(), prev_trg_covered() {} -    SState(int i, int j) : prev_src_covered(i), prev_trg_covered(j) {} -    int prev_src_covered; -    int prev_trg_covered; -  }; - -  struct NState { -    NState() : next_src_covered(), next_trg_covered() {} -    NState(int i, int j) : next_src_covered(i), next_trg_covered(j) {} -    int next_src_covered; -    int next_trg_covered; -  }; - -  void ComputeReachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len) { -    typedef boost::multi_array<vector<SState>, 2> array_type; -    array_type a(boost::extents[srclen + 1][trglen + 1]); -    a[0][0].push_back(SState()); -    for (int i = 0; i < srclen; ++i) { -      for (int j = 0; j < trglen; ++j) { -        if (a[i][j].size() == 0) continue; -        const SState prev(i,j); -        for (int k = 1; k <= src_max_phrase_len; ++k) { -          if ((i + k) > srclen) continue; -          for (int l = 1; l <= trg_max_phrase_len; ++l) { -            if ((j + l) > trglen) continue; -            a[i + k][j + l].push_back(prev); -          } -        } -      } -    } -    a[0][0].clear(); -    cerr << "Final cell contains " << a[srclen][trglen].size() << " back pointers\n"; -    assert(a[srclen][trglen].size() > 0); - -    typedef boost::multi_array<bool, 2> rarray_type; -    rarray_type r(boost::extents[srclen + 1][trglen + 1]); -//    typedef boost::multi_array<vector<NState>, 2> narray_type; -//    narray_type b(boost::extents[srclen + 1][trglen + 1]); -    r[srclen][trglen] = true; -    for (int i = srclen; i >= 0; --i) { -      for (int j = trglen; j >= 0; --j) { -        vector<SState>& prevs = a[i][j]; -        if (!r[i][j]) { prevs.clear(); } -//        const NState nstate(i,j); -        for (int k = 0; k < prevs.size(); ++k) { -          r[prevs[k].prev_src_covered][prevs[k].prev_trg_covered] = true; -          int src_delta = i - prevs[k].prev_src_covered; -          edges[prevs[k].prev_src_covered][prevs[k].prev_trg_covered][src_delta][j - prevs[k].prev_trg_covered] = true; -          short &msd = max_src_delta[prevs[k].prev_src_covered][prevs[k].prev_trg_covered]; -          if (src_delta > msd) msd = src_delta; -//          b[prevs[k].prev_src_covered][prevs[k].prev_trg_covered].push_back(nstate); -        } -      } -    } -    assert(!edges[0][0][1][0]); -    assert(!edges[0][0][0][1]); -    assert(!edges[0][0][0][0]); -    cerr << "  MAX SRC DELTA[0][0] = " << max_src_delta[0][0] << endl; -    assert(max_src_delta[0][0] > 0); -    //cerr << "First cell contains " << b[0][0].size() << " forward pointers\n"; -    //for (int i = 0; i < b[0][0].size(); ++i) { -    //  cerr << "  -> (" << b[0][0][i].next_src_covered << "," << b[0][0][i].next_trg_covered << ")\n"; -    //} -  } -}; - -ostream& operator<<(ostream& os, const FSTState& q); -struct FSTState { -  explicit FSTState(int src_size) : -      trg_covered_(), -      src_covered_(), -      src_coverage_(src_size) {} - -  FSTState(short trg_covered, short src_covered, const vector<bool>& src_coverage, const vector<short>& src_prefix) : 
-      trg_covered_(trg_covered), -      src_covered_(src_covered), -      src_coverage_(src_coverage), -      src_prefix_(src_prefix) { -    if (src_coverage_.size() == src_covered) { -      assert(src_prefix.size() == 0); -    } -  } - -  // if we extend by the word at src_position, what are -  // the next states that are reachable and lie on a valid -  // path to the final state? -  vector<FSTState> Extensions(int src_position, int src_len, int trg_len, const Reachability& r) const { -    assert(src_position < src_coverage_.size()); -    if (src_coverage_[src_position]) { -      cerr << "Trying to extend " << *this << " with position " << src_position << endl; -      abort(); -    } -    vector<bool> ncvg = src_coverage_; -    ncvg[src_position] = true; - -    vector<FSTState> res; -    const int trg_remaining = trg_len - trg_covered_; -    if (trg_remaining <= 0) { -      cerr << "Target appears to have been covered: " << *this << " (trg_len=" << trg_len << ",trg_covered=" << trg_covered_ << ")" << endl; -      abort(); -    } -    const int src_remaining = src_len - src_covered_; -    if (src_remaining <= 0) { -      cerr << "Source appears to have been covered: " << *this << endl; -      abort(); -    } - -    for (int tc = 1; tc <= kMAX_TRG_PHRASE; ++tc) { -      if (r.edges[src_covered_][trg_covered_][src_prefix_.size() + 1][tc]) { -        int nc = src_prefix_.size() + 1 + src_covered_; -        res.push_back(FSTState(trg_covered_ + tc, nc, ncvg, vector<short>())); -      } -    } - -    if ((src_prefix_.size() + 1) < r.max_src_delta[src_covered_][trg_covered_]) { -      vector<short> nsp = src_prefix_; -      nsp.push_back(src_position); -      res.push_back(FSTState(trg_covered_, src_covered_, ncvg, nsp)); -    } - -    if (res.size() == 0) { -      cerr << *this << " can't be extended!\n"; -      abort(); -    } -    return res; -  } - -  short trg_covered_, src_covered_; -  vector<bool> src_coverage_; -  vector<short> src_prefix_; -}; -bool operator<(const FSTState& q, const FSTState& r) { -  if (q.trg_covered_ != r.trg_covered_) return q.trg_covered_ < r.trg_covered_; -  if (q.src_covered_!= r.src_covered_) return q.src_covered_ < r.src_covered_; -  if (q.src_coverage_ != r.src_coverage_) return q.src_coverage_ < r.src_coverage_; -  return q.src_prefix_ < r.src_prefix_; -} - -ostream& operator<<(ostream& os, const FSTState& q) { -  os << "[" << q.trg_covered_ << " : "; -  for (int i = 0; i < q.src_coverage_.size(); ++i) -    os << q.src_coverage_[i]; -  os << " : <"; -  for (int i = 0; i < q.src_prefix_.size(); ++i) { -    if (i != 0) os << ' '; -    os << q.src_prefix_[i]; -  } -  return os << ">]"; -} - -struct MyModel { -  MyModel(ConditionalBase& rcp0) : rp0(rcp0) {} -  typedef unordered_map<vector<WordID>, CCRP_NoTable<TRule>, boost::hash<vector<WordID> > > SrcToRuleCRPMap; - -  void DecrementRule(const TRule& rule) { -    SrcToRuleCRPMap::iterator it = rules.find(rule.f_); -    assert(it != rules.end()); -    it->second.decrement(rule); -    if (it->second.num_customers() == 0) rules.erase(it); -  } - -  void IncrementRule(const TRule& rule) { -    SrcToRuleCRPMap::iterator it = rules.find(rule.f_); -    if (it == rules.end()) { -      CCRP_NoTable<TRule> crp(1,1); -      it = rules.insert(make_pair(rule.f_, crp)).first; -    } -    it->second.increment(rule); -  } - -  // conditioned on rule.f_ -  prob_t RuleConditionalProbability(const TRule& rule) const { -    const prob_t base = rp0(rule); -    SrcToRuleCRPMap::const_iterator it = rules.find(rule.f_); -    if (it == 
rules.end()) { -      return base; -    } else { -      const double lp = it->second.logprob(rule, log(base)); -      prob_t q; q.logeq(lp); -      return q; -    } -  } - -  const ConditionalBase& rp0; -  SrcToRuleCRPMap rules; -}; - -struct MyFST : public WFST { -  MyFST(const vector<WordID>& ssrc, const vector<WordID>& strg, MyModel* m) : -      src(ssrc), trg(strg), -      r(src.size(),trg.size(),kMAX_SRC_PHRASE, kMAX_TRG_PHRASE), -      model(m) { -    FSTState in(src.size()); -    cerr << " INIT: " << in << endl; -    init = GetNode(in); -    for (int i = 0; i < in.src_coverage_.size(); ++i) in.src_coverage_[i] = true; -    in.src_covered_ = src.size(); -    in.trg_covered_ = trg.size(); -    cerr << "FINAL: " << in << endl; -    final = GetNode(in); -  } -  virtual const WFSTNode* Final() const; -  virtual const WFSTNode* Initial() const; - -  const WFSTNode* GetNode(const FSTState& q); -  map<FSTState, boost::shared_ptr<WFSTNode> > m; -  const vector<WordID>& src; -  const vector<WordID>& trg; -  Reachability r; -  const WFSTNode* init; -  const WFSTNode* final; -  MyModel* model; -}; - -struct MyNode : public WFSTNode { -  MyNode(const FSTState& q, MyFST* fst) : state(q), container(fst) {} -  virtual vector<pair<const WFSTNode*, TRulePtr> > ExtendInput(unsigned srcindex) const; -  const FSTState state; -  mutable MyFST* container; -}; - -vector<pair<const WFSTNode*, TRulePtr> > MyNode::ExtendInput(unsigned srcindex) const { -  cerr << "EXTEND " << state << " with " << srcindex << endl; -  vector<FSTState> ext = state.Extensions(srcindex, container->src.size(), container->trg.size(), container->r); -  vector<pair<const WFSTNode*,TRulePtr> > res(ext.size()); -  for (unsigned i = 0; i < ext.size(); ++i) { -    res[i].first = container->GetNode(ext[i]); -    if (ext[i].src_prefix_.size() == 0) { -      const unsigned trg_from = state.trg_covered_; -      const unsigned trg_to = ext[i].trg_covered_; -      const unsigned prev_prfx_size = state.src_prefix_.size(); -      res[i].second.reset(new TRule); -      res[i].second->lhs_ = -TD::Convert("X"); -      vector<WordID>& src = res[i].second->f_; -      vector<WordID>& trg = res[i].second->e_; -      src.resize(prev_prfx_size + 1); -      for (unsigned j = 0; j < prev_prfx_size; ++j) -        src[j] = container->src[state.src_prefix_[j]]; -      src[prev_prfx_size] = container->src[srcindex]; -      for (unsigned j = trg_from; j < trg_to; ++j) -        trg.push_back(container->trg[j]); -      res[i].second->scores_.set_value(FD::Convert("Proposal"), log(container->model->RuleConditionalProbability(*res[i].second))); -    } -  } -  return res; -} - -const WFSTNode* MyFST::GetNode(const FSTState& q) { -  boost::shared_ptr<WFSTNode>& res = m[q]; -  if (!res) { -    res.reset(new MyNode(q, this)); -  } -  return &*res; -} - -const WFSTNode* MyFST::Final() const { -  return final; -} - -const WFSTNode* MyFST::Initial() const { -  return init; -} - -int main(int argc, char** argv) { -  po::variables_map conf; -  InitCommandLine(argc, argv, &conf); -  kMAX_TRG_PHRASE = conf["max_trg_phrase"].as<unsigned>(); -  kMAX_SRC_PHRASE = conf["max_src_phrase"].as<unsigned>(); - -  if (!conf.count("model1")) { -    cerr << argv[0] << "Please use --model1 to specify model 1 parameters\n"; -    return 1; -  } -  boost::shared_ptr<MT19937> prng; -  if (conf.count("random_seed")) -    prng.reset(new MT19937(conf["random_seed"].as<uint32_t>())); -  else -    prng.reset(new MT19937); -  MT19937& rng = *prng; - -  vector<vector<int> > corpuse, corpusf; -  
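-  // smoke test: after reading the corpus, main seeds the model with two
-  // hand-written rules, builds a phrase-segmentation FST over sentence 0,
-  // composes it with the CFG read from ./kimura.g, and reports whether the
-  // composition succeeded.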
set<int> vocabe, vocabf; -  ReadParallelCorpus(conf["input"].as<string>(), &corpusf, &corpuse, &vocabf, &vocabe); -  cerr << "f-Corpus size: " << corpusf.size() << " sentences\n"; -  cerr << "f-Vocabulary size: " << vocabf.size() << " types\n"; -  cerr << "f-Corpus size: " << corpuse.size() << " sentences\n"; -  cerr << "f-Vocabulary size: " << vocabe.size() << " types\n"; -  assert(corpusf.size() == corpuse.size()); - -  ConditionalBase lp0(conf["model1_interpolation_weight"].as<double>(), -                      vocabe.size(), -                      conf["model1"].as<string>()); -  MyModel m(lp0); - -  TRule x("[X] ||| kAnwntR myN ||| at the convent ||| 0"); -  m.IncrementRule(x); -  TRule y("[X] ||| nY dyN ||| gave ||| 0"); -  m.IncrementRule(y); - - -  MyFST fst(corpusf[0], corpuse[0], &m); -  ifstream in("./kimura.g"); -  assert(in); -  CFG_WFSTComposer comp(fst); -  Hypergraph hg; -  bool succeed = comp.Compose(&in, &hg); -  hg.PrintGraphviz(); -  if (succeed) { cerr << "SUCCESS.\n"; } else { cerr << "FAILURE REPORTED.\n"; } - -#if 0 -  ifstream in2("./amnabooks.g"); -  assert(in2); -  MyFST fst2(corpusf[1], corpuse[1], &m); -  CFG_WFSTComposer comp2(fst2); -  Hypergraph hg2; -  bool succeed2 = comp2.Compose(&in2, &hg2); -  if (succeed2) { cerr << "SUCCESS.\n"; } else { cerr << "FAILURE REPORTED.\n"; } -#endif - -  SparseVector<double> w; w.set_value(FD::Convert("Proposal"), 1.0); -  hg.Reweight(w); -  cerr << ViterbiFTree(hg) << endl; -  return 0; -} - diff --git a/gi/pf/pfdist.cc b/gi/pf/pfdist.cc deleted file mode 100644 index a3e46064..00000000 --- a/gi/pf/pfdist.cc +++ /dev/null @@ -1,598 +0,0 @@ -#include <iostream> -#include <tr1/memory> -#include <queue> - -#include <boost/functional.hpp> -#include <boost/program_options.hpp> -#include <boost/program_options/variables_map.hpp> - -#include "pf.h" -#include "base_distributions.h" -#include "reachability.h" -#include "viterbi.h" -#include "hg.h" -#include "trule.h" -#include "tdict.h" -#include "filelib.h" -#include "dict.h" -#include "sampler.h" -#include "ccrp_nt.h" -#include "ccrp_onetable.h" - -using namespace std; -using namespace tr1; -namespace po = boost::program_options; - -boost::shared_ptr<MT19937> prng; - -void InitCommandLine(int argc, char** argv, po::variables_map* conf) { -  po::options_description opts("Configuration options"); -  opts.add_options() -        ("samples,s",po::value<unsigned>()->default_value(1000),"Number of samples") -        ("particles,p",po::value<unsigned>()->default_value(30),"Number of particles") -        ("filter_frequency,f",po::value<unsigned>()->default_value(5),"Number of time steps between filterings") -        ("input,i",po::value<string>(),"Read parallel data from") -        ("max_src_phrase",po::value<unsigned>()->default_value(5),"Maximum length of source language phrases") -        ("max_trg_phrase",po::value<unsigned>()->default_value(5),"Maximum length of target language phrases") -        ("model1,m",po::value<string>(),"Model 1 parameters (used in base distribution)") -        ("inverse_model1,M",po::value<string>(),"Inverse Model 1 parameters (used in backward estimate)") -        ("model1_interpolation_weight",po::value<double>()->default_value(0.95),"Mixing proportion of model 1 with uniform target distribution") -        ("random_seed,S",po::value<uint32_t>(), "Random seed"); -  po::options_description clo("Command line options"); -  clo.add_options() -        ("config", po::value<string>(), "Configuration file") -        ("help,h", "Print this help message and exit"); - 
 po::options_description dconfig_options, dcmdline_options; -  dconfig_options.add(opts); -  dcmdline_options.add(opts).add(clo); -   -  po::store(parse_command_line(argc, argv, dcmdline_options), *conf); -  if (conf->count("config")) { -    ifstream config((*conf)["config"].as<string>().c_str()); -    po::store(po::parse_config_file(config, dconfig_options), *conf); -  } -  po::notify(*conf); - -  if (conf->count("help") || (conf->count("input") == 0)) { -    cerr << dcmdline_options << endl; -    exit(1); -  } -} - -void ReadParallelCorpus(const string& filename, -                vector<vector<WordID> >* f, -                vector<vector<WordID> >* e, -                set<WordID>* vocab_f, -                set<WordID>* vocab_e) { -  f->clear(); -  e->clear(); -  vocab_f->clear(); -  vocab_e->clear(); -  istream* in; -  if (filename == "-") -    in = &cin; -  else -    in = new ifstream(filename.c_str()); -  assert(*in); -  string line; -  const WordID kDIV = TD::Convert("|||"); -  vector<WordID> tmp; -  while(*in) { -    getline(*in, line); -    if (line.empty() && !*in) break; -    e->push_back(vector<int>()); -    f->push_back(vector<int>()); -    vector<int>& le = e->back(); -    vector<int>& lf = f->back(); -    tmp.clear(); -    TD::ConvertSentence(line, &tmp); -    bool isf = true; -    for (unsigned i = 0; i < tmp.size(); ++i) { -      const int cur = tmp[i]; -      if (isf) { -        if (kDIV == cur) { isf = false; } else { -          lf.push_back(cur); -          vocab_f->insert(cur); -        } -      } else { -        assert(cur != kDIV); -        le.push_back(cur); -        vocab_e->insert(cur); -      } -    } -    assert(isf == false); -  } -  if (in != &cin) delete in; -} - -#if 0 -struct MyConditionalModel { -  MyConditionalModel(PhraseConditionalBase& rcp0) : rp0(&rcp0), base(prob_t::One()), src_phrases(1,1), src_jumps(200, CCRP_NoTable<int>(1,1)) {} - -  prob_t srcp0(const vector<WordID>& src) const { -    prob_t p(1.0 / 3000.0); -    p.poweq(src.size()); -    prob_t lenp; lenp.logeq(log_poisson(src.size(), 1.0)); -    p *= lenp; -    return p; -  } - -  void DecrementRule(const TRule& rule) { -    const RuleCRPMap::iterator it = rules.find(rule.f_); -    assert(it != rules.end()); -    if (it->second.decrement(rule)) { -      base /= (*rp0)(rule); -      if (it->second.num_customers() == 0) -        rules.erase(it); -    } -    if (src_phrases.decrement(rule.f_)) -      base /= srcp0(rule.f_); -  } - -  void IncrementRule(const TRule& rule) { -    RuleCRPMap::iterator it = rules.find(rule.f_); -    if (it == rules.end()) -      it = rules.insert(make_pair(rule.f_, CCRP_NoTable<TRule>(1,1))).first; -    if (it->second.increment(rule)) { -      base *= (*rp0)(rule); -    } -    if (src_phrases.increment(rule.f_)) -      base *= srcp0(rule.f_); -  } - -  void IncrementRules(const vector<TRulePtr>& rules) { -    for (int i = 0; i < rules.size(); ++i) -      IncrementRule(*rules[i]); -  } - -  void DecrementRules(const vector<TRulePtr>& rules) { -    for (int i = 0; i < rules.size(); ++i) -      DecrementRule(*rules[i]); -  } - -  void IncrementJump(int dist, unsigned src_len) { -    assert(src_len > 0); -    if (src_jumps[src_len].increment(dist)) -      base *= jp0(dist, src_len); -  } - -  void DecrementJump(int dist, unsigned src_len) { -    assert(src_len > 0); -    if (src_jumps[src_len].decrement(dist)) -      base /= jp0(dist, src_len); -  } - -  void IncrementJumps(const vector<int>& js, unsigned src_len) { -    for (unsigned i = 0; i < js.size(); ++i) -      
IncrementJump(js[i], src_len); -  } - -  void DecrementJumps(const vector<int>& js, unsigned src_len) { -    for (unsigned i = 0; i < js.size(); ++i) -      DecrementJump(js[i], src_len); -  } - -  // p(jump = dist | src_len , z) -  prob_t JumpProbability(int dist, unsigned src_len) { -    const prob_t p0 = jp0(dist, src_len); -    const double lp = src_jumps[src_len].logprob(dist, log(p0)); -    prob_t q; q.logeq(lp); -    return q; -  } - -  // p(rule.f_ | z) * p(rule.e_ | rule.f_ , z) -  prob_t RuleProbability(const TRule& rule) const { -    const prob_t p0 = (*rp0)(rule); -    prob_t srcp; srcp.logeq(src_phrases.logprob(rule.f_, log(srcp0(rule.f_)))); -    const RuleCRPMap::const_iterator it = rules.find(rule.f_); -    if (it == rules.end()) return srcp * p0; -    const double lp = it->second.logprob(rule, log(p0)); -    prob_t q; q.logeq(lp); -    return q * srcp; -  } - -  prob_t Likelihood() const { -    prob_t p = base; -    for (RuleCRPMap::const_iterator it = rules.begin(); -         it != rules.end(); ++it) { -      prob_t cl; cl.logeq(it->second.log_crp_prob()); -      p *= cl; -    } -    for (unsigned l = 1; l < src_jumps.size(); ++l) { -      if (src_jumps[l].num_customers() > 0) { -        prob_t q; -        q.logeq(src_jumps[l].log_crp_prob()); -        p *= q; -      } -    } -    return p; -  } - -  JumpBase jp0; -  const PhraseConditionalBase* rp0; -  prob_t base; -  typedef unordered_map<vector<WordID>, CCRP_NoTable<TRule>, boost::hash<vector<WordID> > > RuleCRPMap; -  RuleCRPMap rules; -  CCRP_NoTable<vector<WordID> > src_phrases; -  vector<CCRP_NoTable<int> > src_jumps; -}; - -#endif - -struct MyJointModel { -  MyJointModel(PhraseJointBase& rcp0) : -    rp0(rcp0), base(prob_t::One()), rules(1,1), src_jumps(200, CCRP_NoTable<int>(1,1)) {} - -  void DecrementRule(const TRule& rule) { -    if (rules.decrement(rule)) -      base /= rp0(rule); -  } - -  void IncrementRule(const TRule& rule) { -    if (rules.increment(rule)) -      base *= rp0(rule); -  } - -  void IncrementRules(const vector<TRulePtr>& rules) { -    for (int i = 0; i < rules.size(); ++i) -      IncrementRule(*rules[i]); -  } - -  void DecrementRules(const vector<TRulePtr>& rules) { -    for (int i = 0; i < rules.size(); ++i) -      DecrementRule(*rules[i]); -  } - -  void IncrementJump(int dist, unsigned src_len) { -    assert(src_len > 0); -    if (src_jumps[src_len].increment(dist)) -      base *= jp0(dist, src_len); -  } - -  void DecrementJump(int dist, unsigned src_len) { -    assert(src_len > 0); -    if (src_jumps[src_len].decrement(dist)) -      base /= jp0(dist, src_len); -  } - -  void IncrementJumps(const vector<int>& js, unsigned src_len) { -    for (unsigned i = 0; i < js.size(); ++i) -      IncrementJump(js[i], src_len); -  } - -  void DecrementJumps(const vector<int>& js, unsigned src_len) { -    for (unsigned i = 0; i < js.size(); ++i) -      DecrementJump(js[i], src_len); -  } - -  // p(jump = dist | src_len , z) -  prob_t JumpProbability(int dist, unsigned src_len) { -    const prob_t p0 = jp0(dist, src_len); -    const double lp = src_jumps[src_len].logprob(dist, log(p0)); -    prob_t q; q.logeq(lp); -    return q; -  } - -  // p(rule.f_ | z) * p(rule.e_ | rule.f_ , z) -  prob_t RuleProbability(const TRule& rule) const { -    prob_t p; p.logeq(rules.logprob(rule, log(rp0(rule)))); -    return p; -  } - -  prob_t Likelihood() const { -    prob_t p = base; -    prob_t q; q.logeq(rules.log_crp_prob()); -    p *= q; -    for (unsigned l = 1; l < src_jumps.size(); ++l) { -      if 
(src_jumps[l].num_customers() > 0) { -        prob_t q; -        q.logeq(src_jumps[l].log_crp_prob()); -        p *= q; -      } -    } -    return p; -  } - -  JumpBase jp0; -  const PhraseJointBase& rp0; -  prob_t base; -  CCRP_NoTable<TRule> rules; -  vector<CCRP_NoTable<int> > src_jumps; -}; - -struct BackwardEstimate { -  BackwardEstimate(const Model1& m1, const vector<WordID>& src, const vector<WordID>& trg) : -      model1_(m1), src_(src), trg_(trg) { -  } -  const prob_t& operator()(const vector<bool>& src_cov, unsigned trg_cov) const { -    assert(src_.size() == src_cov.size()); -    assert(trg_cov <= trg_.size()); -    prob_t& e = cache_[src_cov][trg_cov]; -    if (e.is_0()) { -      if (trg_cov == trg_.size()) { e = prob_t::One(); return e; } -      vector<WordID> r(src_.size() + 1); r.clear(); -      r.push_back(0);  // NULL word -      for (int i = 0; i < src_cov.size(); ++i) -        if (!src_cov[i]) r.push_back(src_[i]); -      const prob_t uniform_alignment(1.0 / r.size()); -      e.logeq(Md::log_poisson(trg_.size() - trg_cov, r.size() - 1)); // p(trg len remaining | src len remaining) -      for (unsigned j = trg_cov; j < trg_.size(); ++j) { -        prob_t p; -        for (unsigned i = 0; i < r.size(); ++i) -          p += model1_(r[i], trg_[j]); -        if (p.is_0()) { -          cerr << "ERROR: p(" << TD::Convert(trg_[j]) << " | " << TD::GetString(r) << ") = 0!\n"; -          abort(); -        } -        p *= uniform_alignment; -        e *= p; -      } -    } -    return e; -  } -  const Model1& model1_; -  const vector<WordID>& src_; -  const vector<WordID>& trg_; -  mutable unordered_map<vector<bool>, map<unsigned, prob_t>, boost::hash<vector<bool> > > cache_; -}; - -struct BackwardEstimateSym { -  BackwardEstimateSym(const Model1& m1, -                      const Model1& invm1, const vector<WordID>& src, const vector<WordID>& trg) : -      model1_(m1), invmodel1_(invm1), src_(src), trg_(trg) { -  } -  const prob_t& operator()(const vector<bool>& src_cov, unsigned trg_cov) const { -    assert(src_.size() == src_cov.size()); -    assert(trg_cov <= trg_.size()); -    prob_t& e = cache_[src_cov][trg_cov]; -    if (e.is_0()) { -      if (trg_cov == trg_.size()) { e = prob_t::One(); return e; } -      vector<WordID> r(src_.size() + 1); r.clear(); -      for (int i = 0; i < src_cov.size(); ++i) -        if (!src_cov[i]) r.push_back(src_[i]); -      r.push_back(0);  // NULL word -      const prob_t uniform_alignment(1.0 / r.size()); -      e.logeq(Md::log_poisson(trg_.size() - trg_cov, r.size() - 1)); // p(trg len remaining | src len remaining) -      for (unsigned j = trg_cov; j < trg_.size(); ++j) { -        prob_t p; -        for (unsigned i = 0; i < r.size(); ++i) -          p += model1_(r[i], trg_[j]); -        if (p.is_0()) { -          cerr << "ERROR: p(" << TD::Convert(trg_[j]) << " | " << TD::GetString(r) << ") = 0!\n"; -          abort(); -        } -        p *= uniform_alignment; -        e *= p; -      } -      r.pop_back(); -      const prob_t inv_uniform(1.0 / (trg_.size() - trg_cov + 1.0)); -      prob_t inv; -      inv.logeq(Md::log_poisson(r.size(), trg_.size() - trg_cov)); -      for (unsigned i = 0; i < r.size(); ++i) { -        prob_t p; -        for (unsigned j = trg_cov - 1; j < trg_.size(); ++j) -          p += invmodel1_(j < trg_cov ? 
0 : trg_[j], r[i]); -        if (p.is_0()) { -          cerr << "ERROR: p_inv(" << TD::Convert(r[i]) << " | " << TD::GetString(trg_) << ") = 0!\n"; -          abort(); -        } -        p *= inv_uniform; -        inv *= p; -      } -      prob_t x = pow(e * inv, 0.5); -      e = x; -      //cerr << "Forward: " << log(e) << "\tBackward: " << log(inv) << "\t prop: " << log(x) << endl; -    } -    return e; -  } -  const Model1& model1_; -  const Model1& invmodel1_; -  const vector<WordID>& src_; -  const vector<WordID>& trg_; -  mutable unordered_map<vector<bool>, map<unsigned, prob_t>, boost::hash<vector<bool> > > cache_; -}; - -struct Particle { -  Particle() : weight(prob_t::One()), src_cov(), trg_cov(), prev_pos(-1) {} -  prob_t weight; -  prob_t gamma_last; -  vector<int> src_jumps; -  vector<TRulePtr> rules; -  vector<bool> src_cv; -  int src_cov; -  int trg_cov; -  int prev_pos; -}; - -ostream& operator<<(ostream& o, const vector<bool>& v) { -  for (int i = 0; i < v.size(); ++i) -    o << (v[i] ? '1' : '0'); -  return o; -} -ostream& operator<<(ostream& o, const Particle& p) { -  o << "[cv=" << p.src_cv << "  src_cov=" << p.src_cov << " trg_cov=" << p.trg_cov << " last_pos=" << p.prev_pos << " num_rules=" << p.rules.size() << "  w=" << log(p.weight) << ']'; -  return o; -} - -int main(int argc, char** argv) { -  po::variables_map conf; -  InitCommandLine(argc, argv, &conf); -  const unsigned kMAX_TRG_PHRASE = conf["max_trg_phrase"].as<unsigned>(); -  const unsigned kMAX_SRC_PHRASE = conf["max_src_phrase"].as<unsigned>(); -  const unsigned particles = conf["particles"].as<unsigned>(); -  const unsigned samples = conf["samples"].as<unsigned>(); -  const unsigned rejuv_freq = conf["filter_frequency"].as<unsigned>(); - -  if (!conf.count("model1")) { -    cerr << argv[0] << "Please use --model1 to specify model 1 parameters\n"; -    return 1; -  } -  if (conf.count("random_seed")) -    prng.reset(new MT19937(conf["random_seed"].as<uint32_t>())); -  else -    prng.reset(new MT19937); -  MT19937& rng = *prng; - -  vector<vector<WordID> > corpuse, corpusf; -  set<WordID> vocabe, vocabf; -  cerr << "Reading corpus...\n"; -  ReadParallelCorpus(conf["input"].as<string>(), &corpusf, &corpuse, &vocabf, &vocabe); -  cerr << "F-corpus size: " << corpusf.size() << " sentences\t (" << vocabf.size() << " word types)\n"; -  cerr << "E-corpus size: " << corpuse.size() << " sentences\t (" << vocabe.size() << " word types)\n"; -  assert(corpusf.size() == corpuse.size()); - -  const int kLHS = -TD::Convert("X"); -  Model1 m1(conf["model1"].as<string>()); -  Model1 invm1(conf["inverse_model1"].as<string>()); - -#if 0 -  PhraseConditionalBase lp0(m1, conf["model1_interpolation_weight"].as<double>(), vocabe.size()); -  MyConditionalModel m(lp0); -#else -  PhraseJointBase lp0(m1, conf["model1_interpolation_weight"].as<double>(), vocabe.size(), vocabf.size()); -  MyJointModel m(lp0); -#endif - -  MultinomialResampleFilter<Particle> filter(&rng); -  cerr << "Initializing reachability limits...\n"; -  vector<Particle> ps(corpusf.size()); -  vector<Reachability> reaches; reaches.reserve(corpusf.size()); -  for (int ci = 0; ci < corpusf.size(); ++ci) -    reaches.push_back(Reachability(corpusf[ci].size(), -                                   corpuse[ci].size(), -                                   kMAX_SRC_PHRASE, -                                   kMAX_TRG_PHRASE)); -  cerr << "Sampling...\n";  -  vector<Particle> tmp_p(10000);  // work space -  SampleSet<prob_t> pfss; -  for (int SS=0; SS < samples; ++SS) { -    
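The sweep starting here is blocked Gibbs sampling with a particle-filter proposal, applied one sentence pair at a time. Schematically (cf. the loop below):

  // per sentence pair ci:
  //   1. remove old derivation:  m.DecrementRules / m.DecrementJumps
  //   2. grow `particles` weighted derivations phrase by phrase (SMC)
  //   3. draw one particle with probability proportional to its weight
  //   4. add it back:            m.IncrementRules / m.IncrementJumps

Because step 3 selects among complete derivations by importance weight, the chosen derivation is approximately a draw from the model's conditional posterior for that sentence given all the others.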
for (int ci = 0; ci < corpusf.size(); ++ci) { -      vector<int>& src = corpusf[ci]; -      vector<int>& trg = corpuse[ci]; -      m.DecrementRules(ps[ci].rules); -      m.DecrementJumps(ps[ci].src_jumps, src.size()); - -      //BackwardEstimate be(m1, src, trg); -      BackwardEstimateSym be(m1, invm1, src, trg); -      const Reachability& r = reaches[ci]; -      vector<Particle> lps(particles); - -      for (int pi = 0; pi < particles; ++pi) { -        Particle& p = lps[pi]; -        p.src_cv.resize(src.size(), false); -      } - -      bool all_complete = false; -      while(!all_complete) { -        SampleSet<prob_t> ss; - -        // all particles have now been extended a bit, we will reweight them now -        if (lps[0].trg_cov > 0) -          filter(&lps); - -        // loop over all particles and extend them -        bool done_nothing = true; -        for (int pi = 0; pi < particles; ++pi) { -          Particle& p = lps[pi]; -          int tic = 0; -          while(p.trg_cov < trg.size() && tic < rejuv_freq) { -            ++tic; -            done_nothing = false; -            ss.clear(); -            TRule x; x.lhs_ = kLHS; -            prob_t z; -            int first_uncovered = src.size(); -            int last_uncovered = -1; -            for (int i = 0; i < src.size(); ++i) { -              const bool is_uncovered = !p.src_cv[i]; -              if (i < first_uncovered && is_uncovered) first_uncovered = i; -              if (is_uncovered && i > last_uncovered) last_uncovered = i; -            } -            assert(last_uncovered > -1); -            assert(first_uncovered < src.size()); - -            for (int trg_len = 1; trg_len <= kMAX_TRG_PHRASE; ++trg_len) { -              x.e_.push_back(trg[trg_len - 1 + p.trg_cov]); -              for (int src_len = 1; src_len <= kMAX_SRC_PHRASE; ++src_len) { -                if (!r.edges[p.src_cov][p.trg_cov][src_len][trg_len]) continue; - -                const int last_possible_start = last_uncovered - src_len + 1; -                assert(last_possible_start >= 0); -                //cerr << src_len << "," << trg_len << " is allowed. 
E=" << TD::GetString(x.e_) << endl; -                //cerr << "  first_uncovered=" << first_uncovered << "  last_possible_start=" << last_possible_start << endl; -                for (int i = first_uncovered; i <= last_possible_start; ++i) { -                  if (p.src_cv[i]) continue; -                  assert(ss.size() < tmp_p.size());  // if fails increase tmp_p size -                  Particle& np = tmp_p[ss.size()]; -                  np = p; -                  x.f_.clear(); -                  int gap_add = 0; -                  bool bad = false; -                  prob_t jp = prob_t::One(); -                  int prev_pos = p.prev_pos; -                  for (int j = 0; j < src_len; ++j) { -                    if ((j + i + gap_add) == src.size()) { bad = true; break; } -                    while ((i+j+gap_add) < src.size() && p.src_cv[i + j + gap_add]) { ++gap_add; } -                    if ((j + i + gap_add) == src.size()) { bad = true; break; } -                    np.src_cv[i + j + gap_add] = true; -                    x.f_.push_back(src[i + j + gap_add]); -                    jp *= m.JumpProbability(i + j + gap_add - prev_pos, src.size()); -                    int jump = i + j + gap_add - prev_pos; -                    assert(jump != 0); -                    np.src_jumps.push_back(jump); -                    prev_pos = i + j + gap_add; -                  } -                  if (bad) continue; -                  np.prev_pos = prev_pos; -                  np.src_cov += x.f_.size(); -                  np.trg_cov += x.e_.size(); -                  if (x.f_.size() != src_len) continue; -                  prob_t rp = m.RuleProbability(x); -                  np.gamma_last = rp * jp; -                  const prob_t u = pow(np.gamma_last * be(np.src_cv, np.trg_cov), 0.2); -                  //cerr << "**rule=" << x << endl; -                  //cerr << "  u=" << log(u) << "  rule=" << rp << " jump=" << jp << endl; -                  ss.add(u); -                  np.rules.push_back(TRulePtr(new TRule(x))); -                  z += u; - -                  const bool completed = (p.trg_cov == trg.size()); -                  if (completed) { -                    int last_jump = src.size() - p.prev_pos; -                    assert(last_jump > 0); -                    p.src_jumps.push_back(last_jump); -                    p.weight *= m.JumpProbability(last_jump, src.size()); -                  } -                } -              } -            } -            cerr << "number of edges to consider: " << ss.size() << endl; -            const int sampled = rng.SelectSample(ss); -            prob_t q_n = ss[sampled] / z; -            p = tmp_p[sampled]; -            //m.IncrementRule(*p.rules.back()); -            p.weight *= p.gamma_last / q_n; -            cerr << "[w=" << log(p.weight) << "]\tsampled rule: " << p.rules.back()->AsString() << endl; -            cerr << p << endl; -          } -        } // loop over particles (pi = 0 .. 
particles) -        if (done_nothing) all_complete = true; -      } -      pfss.clear(); -      for (int i = 0; i < lps.size(); ++i) -        pfss.add(lps[i].weight); -      const int sampled = rng.SelectSample(pfss); -      ps[ci] = lps[sampled]; -      m.IncrementRules(lps[sampled].rules); -      m.IncrementJumps(lps[sampled].src_jumps, src.size()); -      for (int i = 0; i < lps[sampled].rules.size(); ++i) { cerr << "S:\t" << lps[sampled].rules[i]->AsString() << "\n"; } -      cerr << "tmp-LLH: " << log(m.Likelihood()) << endl; -    } -    cerr << "LLH: " << log(m.Likelihood()) << endl; -    for (int sni = 0; sni < 5; ++sni) { -      for (int i = 0; i < ps[sni].rules.size(); ++i) { cerr << "\t" << ps[sni].rules[i]->AsString() << endl; } -    } -  } -  return 0; -} - diff --git a/gi/pf/pfdist.new.cc b/gi/pf/pfdist.new.cc deleted file mode 100644 index 3169eb75..00000000 --- a/gi/pf/pfdist.new.cc +++ /dev/null @@ -1,620 +0,0 @@ -#include <iostream> -#include <tr1/memory> -#include <queue> - -#include <boost/functional.hpp> -#include <boost/program_options.hpp> -#include <boost/program_options/variables_map.hpp> - -#include "base_measures.h" -#include "reachability.h" -#include "viterbi.h" -#include "hg.h" -#include "trule.h" -#include "tdict.h" -#include "filelib.h" -#include "dict.h" -#include "sampler.h" -#include "ccrp_nt.h" -#include "ccrp_onetable.h" - -using namespace std; -using namespace tr1; -namespace po = boost::program_options; - -shared_ptr<MT19937> prng; - -size_t hash_value(const TRule& r) { -  size_t h = boost::hash_value(r.e_); -  boost::hash_combine(h, -r.lhs_); -  boost::hash_combine(h, boost::hash_value(r.f_)); -  return h; -} - -bool operator==(const TRule& a, const TRule& b) { -  return (a.lhs_ == b.lhs_ && a.e_ == b.e_ && a.f_ == b.f_); -} - -void InitCommandLine(int argc, char** argv, po::variables_map* conf) { -  po::options_description opts("Configuration options"); -  opts.add_options() -        ("samples,s",po::value<unsigned>()->default_value(1000),"Number of samples") -        ("particles,p",po::value<unsigned>()->default_value(25),"Number of particles") -        ("input,i",po::value<string>(),"Read parallel data from") -        ("max_src_phrase",po::value<unsigned>()->default_value(5),"Maximum length of source language phrases") -        ("max_trg_phrase",po::value<unsigned>()->default_value(5),"Maximum length of target language phrases") -        ("model1,m",po::value<string>(),"Model 1 parameters (used in base distribution)") -        ("inverse_model1,M",po::value<string>(),"Inverse Model 1 parameters (used in backward estimate)") -        ("model1_interpolation_weight",po::value<double>()->default_value(0.95),"Mixing proportion of model 1 with uniform target distribution") -        ("random_seed,S",po::value<uint32_t>(), "Random seed"); -  po::options_description clo("Command line options"); -  clo.add_options() -        ("config", po::value<string>(), "Configuration file") -        ("help,h", "Print this help message and exit"); -  po::options_description dconfig_options, dcmdline_options; -  dconfig_options.add(opts); -  dcmdline_options.add(opts).add(clo); -   -  po::store(parse_command_line(argc, argv, dcmdline_options), *conf); -  if (conf->count("config")) { -    ifstream config((*conf)["config"].as<string>().c_str()); -    po::store(po::parse_config_file(config, dconfig_options), *conf); -  } -  po::notify(*conf); - -  if (conf->count("help") || (conf->count("input") == 0)) { -    cerr << dcmdline_options << endl; -    exit(1); -  } -} - -void 
ReadParallelCorpus(const string& filename, -                vector<vector<WordID> >* f, -                vector<vector<WordID> >* e, -                set<WordID>* vocab_f, -                set<WordID>* vocab_e) { -  f->clear(); -  e->clear(); -  vocab_f->clear(); -  vocab_e->clear(); -  istream* in; -  if (filename == "-") -    in = &cin; -  else -    in = new ifstream(filename.c_str()); -  assert(*in); -  string line; -  const WordID kDIV = TD::Convert("|||"); -  vector<WordID> tmp; -  while(*in) { -    getline(*in, line); -    if (line.empty() && !*in) break; -    e->push_back(vector<int>()); -    f->push_back(vector<int>()); -    vector<int>& le = e->back(); -    vector<int>& lf = f->back(); -    tmp.clear(); -    TD::ConvertSentence(line, &tmp); -    bool isf = true; -    for (unsigned i = 0; i < tmp.size(); ++i) { -      const int cur = tmp[i]; -      if (isf) { -        if (kDIV == cur) { isf = false; } else { -          lf.push_back(cur); -          vocab_f->insert(cur); -        } -      } else { -        assert(cur != kDIV); -        le.push_back(cur); -        vocab_e->insert(cur); -      } -    } -    assert(isf == false); -  } -  if (in != &cin) delete in; -} - -#if 0 -struct MyConditionalModel { -  MyConditionalModel(PhraseConditionalBase& rcp0) : rp0(&rcp0), base(prob_t::One()), src_phrases(1,1), src_jumps(200, CCRP_NoTable<int>(1,1)) {} - -  prob_t srcp0(const vector<WordID>& src) const { -    prob_t p(1.0 / 3000.0); -    p.poweq(src.size()); -    prob_t lenp; lenp.logeq(log_poisson(src.size(), 1.0)); -    p *= lenp; -    return p; -  } - -  void DecrementRule(const TRule& rule) { -    const RuleCRPMap::iterator it = rules.find(rule.f_); -    assert(it != rules.end()); -    if (it->second.decrement(rule)) { -      base /= (*rp0)(rule); -      if (it->second.num_customers() == 0) -        rules.erase(it); -    } -    if (src_phrases.decrement(rule.f_)) -      base /= srcp0(rule.f_); -  } - -  void IncrementRule(const TRule& rule) { -    RuleCRPMap::iterator it = rules.find(rule.f_); -    if (it == rules.end()) -      it = rules.insert(make_pair(rule.f_, CCRP_NoTable<TRule>(1,1))).first; -    if (it->second.increment(rule)) { -      base *= (*rp0)(rule); -    } -    if (src_phrases.increment(rule.f_)) -      base *= srcp0(rule.f_); -  } - -  void IncrementRules(const vector<TRulePtr>& rules) { -    for (int i = 0; i < rules.size(); ++i) -      IncrementRule(*rules[i]); -  } - -  void DecrementRules(const vector<TRulePtr>& rules) { -    for (int i = 0; i < rules.size(); ++i) -      DecrementRule(*rules[i]); -  } - -  void IncrementJump(int dist, unsigned src_len) { -    assert(src_len > 0); -    if (src_jumps[src_len].increment(dist)) -      base *= jp0(dist, src_len); -  } - -  void DecrementJump(int dist, unsigned src_len) { -    assert(src_len > 0); -    if (src_jumps[src_len].decrement(dist)) -      base /= jp0(dist, src_len); -  } - -  void IncrementJumps(const vector<int>& js, unsigned src_len) { -    for (unsigned i = 0; i < js.size(); ++i) -      IncrementJump(js[i], src_len); -  } - -  void DecrementJumps(const vector<int>& js, unsigned src_len) { -    for (unsigned i = 0; i < js.size(); ++i) -      DecrementJump(js[i], src_len); -  } - -  // p(jump = dist | src_len , z) -  prob_t JumpProbability(int dist, unsigned src_len) { -    const prob_t p0 = jp0(dist, src_len); -    const double lp = src_jumps[src_len].logprob(dist, log(p0)); -    prob_t q; q.logeq(lp); -    return q; -  } - -  // p(rule.f_ | z) * p(rule.e_ | rule.f_ , z) -  prob_t RuleProbability(const TRule& 
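ReadParallelCorpus above expects one sentence pair per line, the two sides separated by a literal ||| token: everything to its left is read as the source (f) sentence, everything to its right as the target (e) sentence, with the two vocabularies collected along the way. For example (illustrative data):

  das haus ist klein ||| the house is small
  ich habe ihn gesehen ||| i have seen him

Passing - as the filename reads the same format from stdin.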
rule) const { -    const prob_t p0 = (*rp0)(rule); -    prob_t srcp; srcp.logeq(src_phrases.logprob(rule.f_, log(srcp0(rule.f_)))); -    const RuleCRPMap::const_iterator it = rules.find(rule.f_); -    if (it == rules.end()) return srcp * p0; -    const double lp = it->second.logprob(rule, log(p0)); -    prob_t q; q.logeq(lp); -    return q * srcp; -  } - -  prob_t Likelihood() const { -    prob_t p = base; -    for (RuleCRPMap::const_iterator it = rules.begin(); -         it != rules.end(); ++it) { -      prob_t cl; cl.logeq(it->second.log_crp_prob()); -      p *= cl; -    } -    for (unsigned l = 1; l < src_jumps.size(); ++l) { -      if (src_jumps[l].num_customers() > 0) { -        prob_t q; -        q.logeq(src_jumps[l].log_crp_prob()); -        p *= q; -      } -    } -    return p; -  } - -  JumpBase jp0; -  const PhraseConditionalBase* rp0; -  prob_t base; -  typedef unordered_map<vector<WordID>, CCRP_NoTable<TRule>, boost::hash<vector<WordID> > > RuleCRPMap; -  RuleCRPMap rules; -  CCRP_NoTable<vector<WordID> > src_phrases; -  vector<CCRP_NoTable<int> > src_jumps; -}; - -#endif - -struct MyJointModel { -  MyJointModel(PhraseJointBase& rcp0) : -    rp0(rcp0), base(prob_t::One()), rules(1,1), src_jumps(200, CCRP_NoTable<int>(1,1)) {} - -  void DecrementRule(const TRule& rule) { -    if (rules.decrement(rule)) -      base /= rp0(rule); -  } - -  void IncrementRule(const TRule& rule) { -    if (rules.increment(rule)) -      base *= rp0(rule); -  } - -  void IncrementRules(const vector<TRulePtr>& rules) { -    for (int i = 0; i < rules.size(); ++i) -      IncrementRule(*rules[i]); -  } - -  void DecrementRules(const vector<TRulePtr>& rules) { -    for (int i = 0; i < rules.size(); ++i) -      DecrementRule(*rules[i]); -  } - -  void IncrementJump(int dist, unsigned src_len) { -    assert(src_len > 0); -    if (src_jumps[src_len].increment(dist)) -      base *= jp0(dist, src_len); -  } - -  void DecrementJump(int dist, unsigned src_len) { -    assert(src_len > 0); -    if (src_jumps[src_len].decrement(dist)) -      base /= jp0(dist, src_len); -  } - -  void IncrementJumps(const vector<int>& js, unsigned src_len) { -    for (unsigned i = 0; i < js.size(); ++i) -      IncrementJump(js[i], src_len); -  } - -  void DecrementJumps(const vector<int>& js, unsigned src_len) { -    for (unsigned i = 0; i < js.size(); ++i) -      DecrementJump(js[i], src_len); -  } - -  // p(jump = dist | src_len , z) -  prob_t JumpProbability(int dist, unsigned src_len) { -    const prob_t p0 = jp0(dist, src_len); -    const double lp = src_jumps[src_len].logprob(dist, log(p0)); -    prob_t q; q.logeq(lp); -    return q; -  } - -  // p(rule.f_ | z) * p(rule.e_ | rule.f_ , z) -  prob_t RuleProbability(const TRule& rule) const { -    prob_t p; p.logeq(rules.logprob(rule, log(rp0(rule)))); -    return p; -  } - -  prob_t Likelihood() const { -    prob_t p = base; -    prob_t q; q.logeq(rules.log_crp_prob()); -    p *= q; -    for (unsigned l = 1; l < src_jumps.size(); ++l) { -      if (src_jumps[l].num_customers() > 0) { -        prob_t q; -        q.logeq(src_jumps[l].log_crp_prob()); -        p *= q; -      } -    } -    return p; -  } - -  JumpBase jp0; -  const PhraseJointBase& rp0; -  prob_t base; -  CCRP_NoTable<TRule> rules; -  vector<CCRP_NoTable<int> > src_jumps; -}; - -struct BackwardEstimate { -  BackwardEstimate(const Model1& m1, const vector<WordID>& src, const vector<WordID>& trg) : -      model1_(m1), src_(src), trg_(trg) { -  } -  const prob_t& operator()(const vector<bool>& src_cov, unsigned trg_cov) 
const { -    assert(src_.size() == src_cov.size()); -    assert(trg_cov <= trg_.size()); -    prob_t& e = cache_[src_cov][trg_cov]; -    if (e.is_0()) { -      if (trg_cov == trg_.size()) { e = prob_t::One(); return e; } -      vector<WordID> r(src_.size() + 1); r.clear(); -      r.push_back(0);  // NULL word -      for (int i = 0; i < src_cov.size(); ++i) -        if (!src_cov[i]) r.push_back(src_[i]); -      const prob_t uniform_alignment(1.0 / r.size()); -      e.logeq(log_poisson(trg_.size() - trg_cov, r.size() - 1)); // p(trg len remaining | src len remaining) -      for (unsigned j = trg_cov; j < trg_.size(); ++j) { -        prob_t p; -        for (unsigned i = 0; i < r.size(); ++i) -          p += model1_(r[i], trg_[j]); -        if (p.is_0()) { -          cerr << "ERROR: p(" << TD::Convert(trg_[j]) << " | " << TD::GetString(r) << ") = 0!\n"; -          abort(); -        } -        p *= uniform_alignment; -        e *= p; -      } -    } -    return e; -  } -  const Model1& model1_; -  const vector<WordID>& src_; -  const vector<WordID>& trg_; -  mutable unordered_map<vector<bool>, map<unsigned, prob_t>, boost::hash<vector<bool> > > cache_; -}; - -struct BackwardEstimateSym { -  BackwardEstimateSym(const Model1& m1, -                      const Model1& invm1, const vector<WordID>& src, const vector<WordID>& trg) : -      model1_(m1), invmodel1_(invm1), src_(src), trg_(trg) { -  } -  const prob_t& operator()(const vector<bool>& src_cov, unsigned trg_cov) const { -    assert(src_.size() == src_cov.size()); -    assert(trg_cov <= trg_.size()); -    prob_t& e = cache_[src_cov][trg_cov]; -    if (e.is_0()) { -      if (trg_cov == trg_.size()) { e = prob_t::One(); return e; } -      vector<WordID> r(src_.size() + 1); r.clear(); -      for (int i = 0; i < src_cov.size(); ++i) -        if (!src_cov[i]) r.push_back(src_[i]); -      r.push_back(0);  // NULL word -      const prob_t uniform_alignment(1.0 / r.size()); -      e.logeq(log_poisson(trg_.size() - trg_cov, r.size() - 1)); // p(trg len remaining | src len remaining) -      for (unsigned j = trg_cov; j < trg_.size(); ++j) { -        prob_t p; -        for (unsigned i = 0; i < r.size(); ++i) -          p += model1_(r[i], trg_[j]); -        if (p.is_0()) { -          cerr << "ERROR: p(" << TD::Convert(trg_[j]) << " | " << TD::GetString(r) << ") = 0!\n"; -          abort(); -        } -        p *= uniform_alignment; -        e *= p; -      } -      r.pop_back(); -      const prob_t inv_uniform(1.0 / (trg_.size() - trg_cov + 1.0)); -      prob_t inv; -      inv.logeq(log_poisson(r.size(), trg_.size() - trg_cov)); -      for (unsigned i = 0; i < r.size(); ++i) { -        prob_t p; -        for (unsigned j = trg_cov - 1; j < trg_.size(); ++j) -          p += invmodel1_(j < trg_cov ? 
0 : trg_[j], r[i]); -        if (p.is_0()) { -          cerr << "ERROR: p_inv(" << TD::Convert(r[i]) << " | " << TD::GetString(trg_) << ") = 0!\n"; -          abort(); -        } -        p *= inv_uniform; -        inv *= p; -      } -      prob_t x = pow(e * inv, 0.5); -      e = x; -      //cerr << "Forward: " << log(e) << "\tBackward: " << log(inv) << "\t prop: " << log(x) << endl; -    } -    return e; -  } -  const Model1& model1_; -  const Model1& invmodel1_; -  const vector<WordID>& src_; -  const vector<WordID>& trg_; -  mutable unordered_map<vector<bool>, map<unsigned, prob_t>, boost::hash<vector<bool> > > cache_; -}; - -struct Particle { -  Particle() : weight(prob_t::One()), src_cov(), trg_cov(), prev_pos(-1) {} -  prob_t weight; -  prob_t gamma_last; -  vector<int> src_jumps; -  vector<TRulePtr> rules; -  vector<bool> src_cv; -  int src_cov; -  int trg_cov; -  int prev_pos; -}; - -ostream& operator<<(ostream& o, const vector<bool>& v) { -  for (int i = 0; i < v.size(); ++i) -    o << (v[i] ? '1' : '0'); -  return o; -} -ostream& operator<<(ostream& o, const Particle& p) { -  o << "[cv=" << p.src_cv << "  src_cov=" << p.src_cov << " trg_cov=" << p.trg_cov << " last_pos=" << p.prev_pos << " num_rules=" << p.rules.size() << "  w=" << log(p.weight) << ']'; -  return o; -} - -void FilterCrapParticlesAndReweight(vector<Particle>* pps) { -  vector<Particle>& ps = *pps; -  SampleSet<prob_t> ss; -  for (int i = 0; i < ps.size(); ++i) -    ss.add(ps[i].weight); -  vector<Particle> nps; nps.reserve(ps.size()); -  const prob_t uniform_weight(1.0 / ps.size()); -  for (int i = 0; i < ps.size(); ++i) { -    nps.push_back(ps[prng->SelectSample(ss)]); -    nps[i].weight = uniform_weight; -  } -  nps.swap(ps); -} - -int main(int argc, char** argv) { -  po::variables_map conf; -  InitCommandLine(argc, argv, &conf); -  const unsigned kMAX_TRG_PHRASE = conf["max_trg_phrase"].as<unsigned>(); -  const unsigned kMAX_SRC_PHRASE = conf["max_src_phrase"].as<unsigned>(); -  const unsigned particles = conf["particles"].as<unsigned>(); -  const unsigned samples = conf["samples"].as<unsigned>(); - -  if (!conf.count("model1")) { -    cerr << argv[0] << "Please use --model1 to specify model 1 parameters\n"; -    return 1; -  } -  if (conf.count("random_seed")) -    prng.reset(new MT19937(conf["random_seed"].as<uint32_t>())); -  else -    prng.reset(new MT19937); -  MT19937& rng = *prng; - -  vector<vector<WordID> > corpuse, corpusf; -  set<WordID> vocabe, vocabf; -  cerr << "Reading corpus...\n"; -  ReadParallelCorpus(conf["input"].as<string>(), &corpusf, &corpuse, &vocabf, &vocabe); -  cerr << "F-corpus size: " << corpusf.size() << " sentences\t (" << vocabf.size() << " word types)\n"; -  cerr << "E-corpus size: " << corpuse.size() << " sentences\t (" << vocabe.size() << " word types)\n"; -  assert(corpusf.size() == corpuse.size()); - -  const int kLHS = -TD::Convert("X"); -  Model1 m1(conf["model1"].as<string>()); -  Model1 invm1(conf["inverse_model1"].as<string>()); - -#if 0 -  PhraseConditionalBase lp0(m1, conf["model1_interpolation_weight"].as<double>(), vocabe.size()); -  MyConditionalModel m(lp0); -#else -  PhraseJointBase lp0(m1, conf["model1_interpolation_weight"].as<double>(), vocabe.size(), vocabf.size()); -  MyJointModel m(lp0); -#endif - -  cerr << "Initializing reachability limits...\n"; -  vector<Particle> ps(corpusf.size()); -  vector<Reachability> reaches; reaches.reserve(corpusf.size()); -  for (int ci = 0; ci < corpusf.size(); ++ci) -    reaches.push_back(Reachability(corpusf[ci].size(), -  
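FilterCrapParticlesAndReweight above is multinomial resampling: it draws a new population of the same size with replacement, proportionally to weight, and resets every survivor's weight to uniform, so degenerate low-weight particles are discarded before they waste further proposal steps. The pfnaive.cc variant later in this commit uses a SystematicResampleFilter instead; a minimal sketch of systematic resampling for contrast, assuming weights normalized to sum to one (it has lower variance because a single stratified uniform draw places all N selection points):

  #include <vector>

  // u0 must lie in [0, 1/N); point k is u0 + k/N, and particle i is copied
  // once for every point that falls inside its cumulative-weight segment.
  std::vector<int> SystematicResample(const std::vector<double>& w, double u0) {
    const size_t N = w.size();
    std::vector<int> picks(N);
    double cum = w[0];
    size_t i = 0;
    for (size_t k = 0; k < N; ++k) {
      const double point = u0 + static_cast<double>(k) / N;
      while (point >= cum && i + 1 < N) cum += w[++i];
      picks[k] = static_cast<int>(i);
    }
    return picks;
  }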
                                 corpuse[ci].size(), -                                   kMAX_SRC_PHRASE, -                                   kMAX_TRG_PHRASE)); -  cerr << "Sampling...\n";  -  vector<Particle> tmp_p(10000);  // work space -  SampleSet<prob_t> pfss; -  for (int SS=0; SS < samples; ++SS) { -    for (int ci = 0; ci < corpusf.size(); ++ci) { -      vector<int>& src = corpusf[ci]; -      vector<int>& trg = corpuse[ci]; -      m.DecrementRules(ps[ci].rules); -      m.DecrementJumps(ps[ci].src_jumps, src.size()); - -      //BackwardEstimate be(m1, src, trg); -      BackwardEstimateSym be(m1, invm1, src, trg); -      const Reachability& r = reaches[ci]; -      vector<Particle> lps(particles); - -      for (int pi = 0; pi < particles; ++pi) { -        Particle& p = lps[pi]; -        p.src_cv.resize(src.size(), false); -      } - -      bool all_complete = false; -      while(!all_complete) { -        SampleSet<prob_t> ss; - -        // all particles have now been extended a bit, we will reweight them now -        if (lps[0].trg_cov > 0) -          FilterCrapParticlesAndReweight(&lps); - -        // loop over all particles and extend them -        bool done_nothing = true; -        for (int pi = 0; pi < particles; ++pi) { -          Particle& p = lps[pi]; -          int tic = 0; -          const int rejuv_freq = 1; -          while(p.trg_cov < trg.size() && tic < rejuv_freq) { -            ++tic; -            done_nothing = false; -            ss.clear(); -            TRule x; x.lhs_ = kLHS; -            prob_t z; -            int first_uncovered = src.size(); -            int last_uncovered = -1; -            for (int i = 0; i < src.size(); ++i) { -              const bool is_uncovered = !p.src_cv[i]; -              if (i < first_uncovered && is_uncovered) first_uncovered = i; -              if (is_uncovered && i > last_uncovered) last_uncovered = i; -            } -            assert(last_uncovered > -1); -            assert(first_uncovered < src.size()); - -            for (int trg_len = 1; trg_len <= kMAX_TRG_PHRASE; ++trg_len) { -              x.e_.push_back(trg[trg_len - 1 + p.trg_cov]); -              for (int src_len = 1; src_len <= kMAX_SRC_PHRASE; ++src_len) { -                if (!r.edges[p.src_cov][p.trg_cov][src_len][trg_len]) continue; - -                const int last_possible_start = last_uncovered - src_len + 1; -                assert(last_possible_start >= 0); -                //cerr << src_len << "," << trg_len << " is allowed. 
E=" << TD::GetString(x.e_) << endl; -                //cerr << "  first_uncovered=" << first_uncovered << "  last_possible_start=" << last_possible_start << endl; -                for (int i = first_uncovered; i <= last_possible_start; ++i) { -                  if (p.src_cv[i]) continue; -                  assert(ss.size() < tmp_p.size());  // if fails increase tmp_p size -                  Particle& np = tmp_p[ss.size()]; -                  np = p; -                  x.f_.clear(); -                  int gap_add = 0; -                  bool bad = false; -                  prob_t jp = prob_t::One(); -                  int prev_pos = p.prev_pos; -                  for (int j = 0; j < src_len; ++j) { -                    if ((j + i + gap_add) == src.size()) { bad = true; break; } -                    while ((i+j+gap_add) < src.size() && p.src_cv[i + j + gap_add]) { ++gap_add; } -                    if ((j + i + gap_add) == src.size()) { bad = true; break; } -                    np.src_cv[i + j + gap_add] = true; -                    x.f_.push_back(src[i + j + gap_add]); -                    jp *= m.JumpProbability(i + j + gap_add - prev_pos, src.size()); -                    int jump = i + j + gap_add - prev_pos; -                    assert(jump != 0); -                    np.src_jumps.push_back(jump); -                    prev_pos = i + j + gap_add; -                  } -                  if (bad) continue; -                  np.prev_pos = prev_pos; -                  np.src_cov += x.f_.size(); -                  np.trg_cov += x.e_.size(); -                  if (x.f_.size() != src_len) continue; -                  prob_t rp = m.RuleProbability(x); -                  np.gamma_last = rp * jp; -                  const prob_t u = pow(np.gamma_last * be(np.src_cv, np.trg_cov), 0.2); -                  //cerr << "**rule=" << x << endl; -                  //cerr << "  u=" << log(u) << "  rule=" << rp << " jump=" << jp << endl; -                  ss.add(u); -                  np.rules.push_back(TRulePtr(new TRule(x))); -                  z += u; - -                  const bool completed = (p.trg_cov == trg.size()); -                  if (completed) { -                    int last_jump = src.size() - p.prev_pos; -                    assert(last_jump > 0); -                    p.src_jumps.push_back(last_jump); -                    p.weight *= m.JumpProbability(last_jump, src.size()); -                  } -                } -              } -            } -            cerr << "number of edges to consider: " << ss.size() << endl; -            const int sampled = rng.SelectSample(ss); -            prob_t q_n = ss[sampled] / z; -            p = tmp_p[sampled]; -            //m.IncrementRule(*p.rules.back()); -            p.weight *= p.gamma_last / q_n; -            cerr << "[w=" << log(p.weight) << "]\tsampled rule: " << p.rules.back()->AsString() << endl; -            cerr << p << endl; -          } -        } // loop over particles (pi = 0 .. 
particles) -        if (done_nothing) all_complete = true; -      } -      pfss.clear(); -      for (int i = 0; i < lps.size(); ++i) -        pfss.add(lps[i].weight); -      const int sampled = rng.SelectSample(pfss); -      ps[ci] = lps[sampled]; -      m.IncrementRules(lps[sampled].rules); -      m.IncrementJumps(lps[sampled].src_jumps, src.size()); -      for (int i = 0; i < lps[sampled].rules.size(); ++i) { cerr << "S:\t" << lps[sampled].rules[i]->AsString() << "\n"; } -      cerr << "tmp-LLH: " << log(m.Likelihood()) << endl; -    } -    cerr << "LLH: " << log(m.Likelihood()) << endl; -    for (int sni = 0; sni < 5; ++sni) { -      for (int i = 0; i < ps[sni].rules.size(); ++i) { cerr << "\t" << ps[sni].rules[i]->AsString() << endl; } -    } -  } -  return 0; -} - diff --git a/gi/pf/pfnaive.cc b/gi/pf/pfnaive.cc deleted file mode 100644 index 958ec4e2..00000000 --- a/gi/pf/pfnaive.cc +++ /dev/null @@ -1,284 +0,0 @@ -#include <iostream> -#include <tr1/memory> -#include <queue> - -#include <boost/functional.hpp> -#include <boost/program_options.hpp> -#include <boost/program_options/variables_map.hpp> - -#include "pf.h" -#include "base_distributions.h" -#include "monotonic_pseg.h" -#include "reachability.h" -#include "viterbi.h" -#include "hg.h" -#include "trule.h" -#include "tdict.h" -#include "filelib.h" -#include "dict.h" -#include "sampler.h" -#include "ccrp_nt.h" -#include "ccrp_onetable.h" -#include "corpus.h" - -using namespace std; -using namespace tr1; -namespace po = boost::program_options; - -boost::shared_ptr<MT19937> prng; - -void InitCommandLine(int argc, char** argv, po::variables_map* conf) { -  po::options_description opts("Configuration options"); -  opts.add_options() -        ("samples,s",po::value<unsigned>()->default_value(1000),"Number of samples") -        ("particles,p",po::value<unsigned>()->default_value(30),"Number of particles") -        ("filter_frequency,f",po::value<unsigned>()->default_value(5),"Number of time steps between filterings") -        ("input,i",po::value<string>(),"Read parallel data from") -        ("max_src_phrase",po::value<unsigned>()->default_value(5),"Maximum length of source language phrases") -        ("max_trg_phrase",po::value<unsigned>()->default_value(5),"Maximum length of target language phrases") -        ("model1,m",po::value<string>(),"Model 1 parameters (used in base distribution)") -        ("inverse_model1,M",po::value<string>(),"Inverse Model 1 parameters (used in backward estimate)") -        ("model1_interpolation_weight",po::value<double>()->default_value(0.95),"Mixing proportion of model 1 with uniform target distribution") -        ("random_seed,S",po::value<uint32_t>(), "Random seed"); -  po::options_description clo("Command line options"); -  clo.add_options() -        ("config", po::value<string>(), "Configuration file") -        ("help,h", "Print this help message and exit"); -  po::options_description dconfig_options, dcmdline_options; -  dconfig_options.add(opts); -  dcmdline_options.add(opts).add(clo); -   -  po::store(parse_command_line(argc, argv, dcmdline_options), *conf); -  if (conf->count("config")) { -    ifstream config((*conf)["config"].as<string>().c_str()); -    po::store(po::parse_config_file(config, dconfig_options), *conf); -  } -  po::notify(*conf); - -  if (conf->count("help") || (conf->count("input") == 0)) { -    cerr << dcmdline_options << endl; -    exit(1); -  } -} - -struct BackwardEstimateSym { -  BackwardEstimateSym(const Model1& m1, -                      const Model1& invm1, const 
vector<WordID>& src, const vector<WordID>& trg) : -      model1_(m1), invmodel1_(invm1), src_(src), trg_(trg) { -  } -  const prob_t& operator()(unsigned src_cov, unsigned trg_cov) const { -    assert(src_cov <= src_.size()); -    assert(trg_cov <= trg_.size()); -    prob_t& e = cache_[src_cov][trg_cov]; -    if (e.is_0()) { -      if (trg_cov == trg_.size()) { e = prob_t::One(); return e; } -      vector<WordID> r(src_.size() + 1); r.clear(); -      for (int i = src_cov; i < src_.size(); ++i) -        r.push_back(src_[i]); -      r.push_back(0);  // NULL word -      const prob_t uniform_alignment(1.0 / r.size()); -      e.logeq(Md::log_poisson(trg_.size() - trg_cov, r.size() - 1)); // p(trg len remaining | src len remaining) -      for (unsigned j = trg_cov; j < trg_.size(); ++j) { -        prob_t p; -        for (unsigned i = 0; i < r.size(); ++i) -          p += model1_(r[i], trg_[j]); -        if (p.is_0()) { -          cerr << "ERROR: p(" << TD::Convert(trg_[j]) << " | " << TD::GetString(r) << ") = 0!\n"; -          abort(); -        } -        p *= uniform_alignment; -        e *= p; -      } -      r.pop_back(); -      const prob_t inv_uniform(1.0 / (trg_.size() - trg_cov + 1.0)); -      prob_t inv; -      inv.logeq(Md::log_poisson(r.size(), trg_.size() - trg_cov)); -      for (unsigned i = 0; i < r.size(); ++i) { -        prob_t p; -        for (unsigned j = trg_cov - 1; j < trg_.size(); ++j) -          p += invmodel1_(j < trg_cov ? 0 : trg_[j], r[i]); -        if (p.is_0()) { -          cerr << "ERROR: p_inv(" << TD::Convert(r[i]) << " | " << TD::GetString(trg_) << ") = 0!\n"; -          abort(); -        } -        p *= inv_uniform; -        inv *= p; -      } -      prob_t x = pow(e * inv, 0.5); -      e = x; -      //cerr << "Forward: " << log(e) << "\tBackward: " << log(inv) << "\t prop: " << log(x) << endl; -    } -    return e; -  } -  const Model1& model1_; -  const Model1& invmodel1_; -  const vector<WordID>& src_; -  const vector<WordID>& trg_; -  mutable unordered_map<unsigned, map<unsigned, prob_t> > cache_; -}; - -struct Particle { -  Particle() : weight(prob_t::One()), src_cov(), trg_cov() {} -  prob_t weight; -  prob_t gamma_last; -  vector<TRulePtr> rules; -  int src_cov; -  int trg_cov; -}; - -ostream& operator<<(ostream& o, const vector<bool>& v) { -  for (int i = 0; i < v.size(); ++i) -    o << (v[i] ? 
'1' : '0'); -  return o; -} -ostream& operator<<(ostream& o, const Particle& p) { -  o << "[src_cov=" << p.src_cov << " trg_cov=" << p.trg_cov << " num_rules=" << p.rules.size() << "  w=" << log(p.weight) << ']'; -  return o; -} - -int main(int argc, char** argv) { -  po::variables_map conf; -  InitCommandLine(argc, argv, &conf); -  const unsigned kMAX_TRG_PHRASE = conf["max_trg_phrase"].as<unsigned>(); -  const unsigned kMAX_SRC_PHRASE = conf["max_src_phrase"].as<unsigned>(); -  const unsigned particles = conf["particles"].as<unsigned>(); -  const unsigned samples = conf["samples"].as<unsigned>(); -  const unsigned rejuv_freq = conf["filter_frequency"].as<unsigned>(); - -  if (!conf.count("model1")) { -    cerr << argv[0] << "Please use --model1 to specify model 1 parameters\n"; -    return 1; -  } -  if (conf.count("random_seed")) -    prng.reset(new MT19937(conf["random_seed"].as<uint32_t>())); -  else -    prng.reset(new MT19937); -  MT19937& rng = *prng; - -  vector<vector<WordID> > corpuse, corpusf; -  set<WordID> vocabe, vocabf; -  cerr << "Reading corpus...\n"; -  corpus::ReadParallelCorpus(conf["input"].as<string>(), &corpusf, &corpuse, &vocabf, &vocabe); -  cerr << "F-corpus size: " << corpusf.size() << " sentences\t (" << vocabf.size() << " word types)\n"; -  cerr << "E-corpus size: " << corpuse.size() << " sentences\t (" << vocabe.size() << " word types)\n"; -  assert(corpusf.size() == corpuse.size()); - -  const int kLHS = -TD::Convert("X"); -  Model1 m1(conf["model1"].as<string>()); -  Model1 invm1(conf["inverse_model1"].as<string>()); - -  PhraseJointBase lp0(m1, conf["model1_interpolation_weight"].as<double>(), vocabe.size(), vocabf.size()); -  PhraseJointBase_BiDir alp0(m1, invm1, conf["model1_interpolation_weight"].as<double>(), vocabe.size(), vocabf.size()); -  MonotonicParallelSegementationModel<PhraseJointBase_BiDir> m(alp0); -  TRule xx("[X] ||| ms. kimura ||| MS. KIMURA ||| X=0"); -  cerr << xx << endl << lp0(xx) << " " << alp0(xx) << endl; -  TRule xx12("[X] ||| . ||| PHARMACY . ||| X=0"); -  TRule xx21("[X] ||| pharmacy . ||| . ||| X=0"); -//  TRule xx22("[X] ||| . ||| . ||| X=0"); -  TRule xx22("[X] ||| . ||| THE . 
||| X=0"); -  cerr << xx12 << "\t" << lp0(xx12) << " " << alp0(xx12) << endl; -  cerr << xx21 << "\t" << lp0(xx21) << " " << alp0(xx21) << endl; -  cerr << xx22 << "\t" << lp0(xx22) << " " << alp0(xx22) << endl; - -  cerr << "Initializing reachability limits...\n"; -  vector<Particle> ps(corpusf.size()); -  vector<Reachability> reaches; reaches.reserve(corpusf.size()); -  for (int ci = 0; ci < corpusf.size(); ++ci) -    reaches.push_back(Reachability(corpusf[ci].size(), -                                   corpuse[ci].size(), -                                   kMAX_SRC_PHRASE, -                                   kMAX_TRG_PHRASE)); -  cerr << "Sampling...\n";  -  vector<Particle> tmp_p(10000);  // work space -  SampleSet<prob_t> pfss; -  SystematicResampleFilter<Particle> filter(&rng); -  // MultinomialResampleFilter<Particle> filter(&rng); -  for (int SS=0; SS < samples; ++SS) { -    for (int ci = 0; ci < corpusf.size(); ++ci) { -      vector<int>& src = corpusf[ci]; -      vector<int>& trg = corpuse[ci]; -      m.DecrementRulesAndStops(ps[ci].rules); -      const prob_t q_stop = m.StopProbability(); -      const prob_t q_cont = m.ContinueProbability(); -      cerr << "P(stop)=" << q_stop << "\tP(continue)=" <<q_cont << endl; - -      BackwardEstimateSym be(m1, invm1, src, trg); -      const Reachability& r = reaches[ci]; -      vector<Particle> lps(particles); - -      bool all_complete = false; -      while(!all_complete) { -        SampleSet<prob_t> ss; - -        // all particles have now been extended a bit, we will reweight them now -        if (lps[0].trg_cov > 0) -          filter(&lps); - -        // loop over all particles and extend them -        bool done_nothing = true; -        for (int pi = 0; pi < particles; ++pi) { -          Particle& p = lps[pi]; -          int tic = 0; -          while(p.trg_cov < trg.size() && tic < rejuv_freq) { -            ++tic; -            done_nothing = false; -            ss.clear(); -            TRule x; x.lhs_ = kLHS; -            prob_t z; - -            for (int trg_len = 1; trg_len <= kMAX_TRG_PHRASE; ++trg_len) { -              x.e_.push_back(trg[trg_len - 1 + p.trg_cov]); -              for (int src_len = 1; src_len <= kMAX_SRC_PHRASE; ++src_len) { -                if (!r.edges[p.src_cov][p.trg_cov][src_len][trg_len]) continue; - -                int i = p.src_cov; -                assert(ss.size() < tmp_p.size());  // if fails increase tmp_p size -                Particle& np = tmp_p[ss.size()]; -                np = p; -                x.f_.clear(); -                for (int j = 0; j < src_len; ++j) -                  x.f_.push_back(src[i + j]); -                np.src_cov += x.f_.size(); -                np.trg_cov += x.e_.size(); -                const bool stop_now = (np.src_cov == src_len && np.trg_cov == trg_len); -                prob_t rp = m.RuleProbability(x) * (stop_now ? 
q_stop : q_cont); -                np.gamma_last = rp; -                const prob_t u = pow(np.gamma_last * pow(be(np.src_cov, np.trg_cov), 1.2), 0.1); -                //cerr << "**rule=" << x << endl; -                //cerr << "  u=" << log(u) << "  rule=" << rp << endl; -                ss.add(u); -                np.rules.push_back(TRulePtr(new TRule(x))); -                z += u; -              } -            } -            //cerr << "number of edges to consider: " << ss.size() << endl; -            const int sampled = rng.SelectSample(ss); -            prob_t q_n = ss[sampled] / z; -            p = tmp_p[sampled]; -            //m.IncrementRule(*p.rules.back()); -            p.weight *= p.gamma_last / q_n; -            //cerr << "[w=" << log(p.weight) << "]\tsampled rule: " << p.rules.back()->AsString() << endl; -            //cerr << p << endl; -          } -        } // loop over particles (pi = 0 .. particles) -        if (done_nothing) all_complete = true; -        prob_t wv = prob_t::Zero(); -        for (int pp = 0; pp < lps.size(); ++pp) -          wv += lps[pp].weight; -        for (int pp = 0; pp < lps.size(); ++pp) -          lps[pp].weight /= wv; -      } -      pfss.clear(); -      for (int i = 0; i < lps.size(); ++i) -        pfss.add(lps[i].weight); -      const int sampled = rng.SelectSample(pfss); -      ps[ci] = lps[sampled]; -      m.IncrementRulesAndStops(lps[sampled].rules); -      for (int i = 0; i < lps[sampled].rules.size(); ++i) { cerr << "S:\t" << lps[sampled].rules[i]->AsString() << "\n"; } -      cerr << "tmp-LLH: " << log(m.Likelihood()) << endl; -    } -    cerr << "LLH: " << log(m.Likelihood()) << endl; -  } -  return 0; -} - diff --git a/gi/pf/poisson_uniform_word_model.h b/gi/pf/poisson_uniform_word_model.h deleted file mode 100644 index 76204a0e..00000000 --- a/gi/pf/poisson_uniform_word_model.h +++ /dev/null @@ -1,50 +0,0 @@ -#ifndef _POISSON_UNIFORM_WORD_MODEL_H_ -#define _POISSON_UNIFORM_WORD_MODEL_H_ - -#include <cmath> -#include <vector> -#include "prob.h" -#include "m.h" - -// len ~ Poisson(lambda) -//   for (1..len) -//     e_i ~ Uniform({Vocabulary}) -struct PoissonUniformWordModel { -  explicit PoissonUniformWordModel(const unsigned vocab_size, -                                   const unsigned alphabet_size, -                                   const double mean_len = 5) : -    lh(prob_t::One()), -    v0(-std::log(vocab_size)), -    u0(-std::log(alphabet_size)), -    mean_length(mean_len) {} - -  void ResampleHyperparameters(MT19937*) {} - -  inline prob_t operator()(const std::vector<WordID>& s) const { -    prob_t p; -    p.logeq(Md::log_poisson(s.size(), mean_length) + s.size() * u0); -    //p.logeq(v0); -    return p; -  } - -  inline void Increment(const std::vector<WordID>& w, MT19937*) { -    lh *= (*this)(w); -  } - -  inline void Decrement(const std::vector<WordID>& w, MT19937 *) { -    lh /= (*this)(w); -  } - -  inline prob_t Likelihood() const { return lh; } - -  void Summary() const {} - - private: - -  prob_t lh;  // keeps track of the draws from the base distribution -  const double v0;  // uniform log prob of generating a word -  const double u0;  // uniform log prob of generating a letter -  const double mean_length;  // mean length of a word in the base distribution -}; - -#endif diff --git a/gi/pf/pyp_lm.cc b/gi/pf/pyp_lm.cc deleted file mode 100644 index 605d8206..00000000 --- a/gi/pf/pyp_lm.cc +++ /dev/null @@ -1,273 +0,0 @@ -#include <iostream> -#include <tr1/memory> -#include <queue> - -#include <boost/functional.hpp> 
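pyp_lm.cc, whose deletion begins just above, implements the hierarchical Pitman-Yor n-gram language model of Teh (2006): the CRP for an order-N context backs off to the distribution conditioning on one fewer preceding word, bottoming out in a uniform vocabulary model (or one of the word-spelling models defined below). The predictive probability computed by the recursion is, in the usual notation,

  P(w | u) = (c_uw - d * t_uw) / (theta + c_u)
           + ((theta + d * t_u) / (theta + c_u)) * P(w | pi(u))

where c are customer counts, t table counts, d the discount, theta the strength, and pi(u) the context shortened by its oldest word.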
-#include <boost/program_options.hpp> -#include <boost/program_options/variables_map.hpp> - -#include "gamma_poisson.h" -#include "corpus_tools.h" -#include "m.h" -#include "tdict.h" -#include "sampler.h" -#include "ccrp.h" -#include "tied_resampler.h" - -// A not very memory-efficient implementation of an N-gram LM based on PYPs -// as described in Y.-W. Teh. (2006) A Hierarchical Bayesian Language Model -// based on Pitman-Yor Processes. In Proc. ACL. - -// I use templates to handle the recursive formalation of the prior, so -// the order of the model has to be specified here, at compile time: -#define kORDER 3 - -using namespace std; -using namespace tr1; -namespace po = boost::program_options; - -boost::shared_ptr<MT19937> prng; - -void InitCommandLine(int argc, char** argv, po::variables_map* conf) { -  po::options_description opts("Configuration options"); -  opts.add_options() -        ("samples,n",po::value<unsigned>()->default_value(300),"Number of samples") -        ("train,i",po::value<string>(),"Training data file") -        ("test,T",po::value<string>(),"Test data file") -        ("discount_prior_a,a",po::value<double>()->default_value(1.0), "discount ~ Beta(a,b): a=this") -        ("discount_prior_b,b",po::value<double>()->default_value(1.0), "discount ~ Beta(a,b): b=this") -        ("strength_prior_s,s",po::value<double>()->default_value(1.0), "strength ~ Gamma(s,r): s=this") -        ("strength_prior_r,r",po::value<double>()->default_value(1.0), "strength ~ Gamma(s,r): r=this") -        ("random_seed,S",po::value<uint32_t>(), "Random seed"); -  po::options_description clo("Command line options"); -  clo.add_options() -        ("config", po::value<string>(), "Configuration file") -        ("help", "Print this help message and exit"); -  po::options_description dconfig_options, dcmdline_options; -  dconfig_options.add(opts); -  dcmdline_options.add(opts).add(clo); -   -  po::store(parse_command_line(argc, argv, dcmdline_options), *conf); -  if (conf->count("config")) { -    ifstream config((*conf)["config"].as<string>().c_str()); -    po::store(po::parse_config_file(config, dconfig_options), *conf); -  } -  po::notify(*conf); - -  if (conf->count("help") || (conf->count("train") == 0)) { -    cerr << dcmdline_options << endl; -    exit(1); -  } -} - -// uniform distribution over a fixed vocabulary -struct UniformVocabulary { -  UniformVocabulary(unsigned vs, double, double, double, double) : p0(1.0 / vs), draws() {} -  void increment(WordID, const vector<WordID>&, MT19937*) { ++draws; } -  void decrement(WordID, const vector<WordID>&, MT19937*) { --draws; assert(draws >= 0); } -  double prob(WordID, const vector<WordID>&) const { return p0; } -  void resample_hyperparameters(MT19937*) {} -  double log_likelihood() const { return draws * log(p0); } -  const double p0; -  int draws; -}; - -// Lord Rothschild. 1986. THE DISTRIBUTION OF ENGLISH DICTIONARY WORD LENGTHS. 
-// Journal of Statistical Planning and Inference 14 (1986) 311-322 -struct PoissonLengthUniformCharWordModel { -  explicit PoissonLengthUniformCharWordModel(unsigned vocab_size, double, double, double, double) : plen(5,5), uc(-log(95)), llh() {} -  void increment(WordID w, const vector<WordID>& v, MT19937*) { -    llh += log(prob(w, v)); // this isn't quite right -    plen.increment(TD::Convert(w).size() - 1); -  } -  void decrement(WordID w, const vector<WordID>& v, MT19937*) { -    plen.decrement(TD::Convert(w).size() - 1); -    llh -= log(prob(w, v)); // this isn't quite right -  } -  double prob(WordID w, const vector<WordID>&) const { -    const unsigned len = TD::Convert(w).size(); -    return plen.prob(len - 1) * exp(uc * len); -  } -  double log_likelihood() const { return llh; } -  void resample_hyperparameters(MT19937*) {} -  GammaPoisson plen; -  const double uc; -  double llh; -}; - -struct PYPAdaptedPoissonLengthUniformCharWordModel { -  explicit PYPAdaptedPoissonLengthUniformCharWordModel(unsigned vocab_size, double, double, double, double) : -    base(vocab_size,1,1,1,1), -    crp(1,1,1,1) {} -  void increment(WordID w, const vector<WordID>& v, MT19937* rng) { -    double p0 = base.prob(w, v); -    if (crp.increment(w, p0, rng)) -      base.increment(w, v, rng); -  } -  void decrement(WordID w, const vector<WordID>& v, MT19937* rng) { -    if (crp.decrement(w, rng)) -      base.decrement(w, v, rng); -  } -  double prob(WordID w, const vector<WordID>& v) const { -    double p0 = base.prob(w, v); -    return crp.prob(w, p0); -  } -  double log_likelihood() const { return crp.log_crp_prob() + base.log_likelihood(); } -  void resample_hyperparameters(MT19937* rng) { crp.resample_hyperparameters(rng); } -  PoissonLengthUniformCharWordModel base; -  CCRP<WordID> crp; -}; - -template <unsigned N> struct PYPLM; - -#if 1 -template<> struct PYPLM<0> : public UniformVocabulary { -  PYPLM(unsigned vs, double a, double b, double c, double d) : -    UniformVocabulary(vs, a, b, c, d) {} -}; -#else -#if 0 -template<> struct PYPLM<0> : public PoissonLengthUniformCharWordModel { -  PYPLM(unsigned vs, double a, double b, double c, double d) : -    PoissonLengthUniformCharWordModel(vs, a, b, c, d) {} -}; -#else -template<> struct PYPLM<0> : public PYPAdaptedPoissonLengthUniformCharWordModel { -  PYPLM(unsigned vs, double a, double b, double c, double d) : -    PYPAdaptedPoissonLengthUniformCharWordModel(vs, a, b, c, d) {} -}; -#endif -#endif - -// represents an N-gram LM -template <unsigned N> struct PYPLM { -  PYPLM(unsigned vs, double da, double db, double ss, double sr) : -      backoff(vs, da, db, ss, sr), -      tr(da, db, ss, sr, 0.8, 1.0), -      lookup(N-1) {} -  void increment(WordID w, const vector<WordID>& context, MT19937* rng) { -    const double bo = backoff.prob(w, context); -    for (unsigned i = 0; i < N-1; ++i) -      lookup[i] = context[context.size() - 1 - i]; -    typename unordered_map<vector<WordID>, CCRP<WordID>, boost::hash<vector<WordID> > >::iterator it = p.find(lookup); -    if (it == p.end()) { -      it = p.insert(make_pair(lookup, CCRP<WordID>(0.5,1))).first; -      tr.Add(&it->second);  // add to resampler -    } -    if (it->second.increment(w, bo, rng)) -      backoff.increment(w, context, rng); -  } -  void decrement(WordID w, const vector<WordID>& context, MT19937* rng) { -    for (unsigned i = 0; i < N-1; ++i) -      lookup[i] = context[context.size() - 1 - i]; -    typename unordered_map<vector<WordID>, CCRP<WordID>, boost::hash<vector<WordID> > 
>::iterator it = p.find(lookup); -    assert(it != p.end()); -    if (it->second.decrement(w, rng)) -      backoff.decrement(w, context, rng); -  } -  double prob(WordID w, const vector<WordID>& context) const { -    const double bo = backoff.prob(w, context); -    for (unsigned i = 0; i < N-1; ++i) -      lookup[i] = context[context.size() - 1 - i]; -    typename unordered_map<vector<WordID>, CCRP<WordID>, boost::hash<vector<WordID> > >::const_iterator it = p.find(lookup); -    if (it == p.end()) return bo; -    return it->second.prob(w, bo); -  } - -  double log_likelihood() const { -    double llh = backoff.log_likelihood(); -    typename unordered_map<vector<WordID>, CCRP<WordID>, boost::hash<vector<WordID> > >::const_iterator it; -    for (it = p.begin(); it != p.end(); ++it) -      llh += it->second.log_crp_prob(); -    llh += tr.LogLikelihood(); -    return llh; -  } - -  void resample_hyperparameters(MT19937* rng) { -    tr.ResampleHyperparameters(rng); -    backoff.resample_hyperparameters(rng); -  } - -  PYPLM<N-1> backoff; -  TiedResampler<CCRP<WordID> > tr; -  double discount_a, discount_b, strength_s, strength_r; -  double d, strength; -  mutable vector<WordID> lookup;  // thread-local -  unordered_map<vector<WordID>, CCRP<WordID>, boost::hash<vector<WordID> > > p; -}; - -int main(int argc, char** argv) { -  po::variables_map conf; - -  InitCommandLine(argc, argv, &conf); -  const unsigned samples = conf["samples"].as<unsigned>(); -  if (conf.count("random_seed")) -    prng.reset(new MT19937(conf["random_seed"].as<uint32_t>())); -  else -    prng.reset(new MT19937); -  MT19937& rng = *prng; -  vector<vector<WordID> > corpuse; -  set<WordID> vocabe; -  const WordID kEOS = TD::Convert("</s>"); -  cerr << "Reading corpus...\n"; -  CorpusTools::ReadFromFile(conf["train"].as<string>(), &corpuse, &vocabe); -  cerr << "E-corpus size: " << corpuse.size() << " sentences\t (" << vocabe.size() << " word types)\n"; -  vector<vector<WordID> > test; -  if (conf.count("test")) -    CorpusTools::ReadFromFile(conf["test"].as<string>(), &test); -  else -    test = corpuse; -  PYPLM<kORDER> lm(vocabe.size(), -                   conf["discount_prior_a"].as<double>(), -                   conf["discount_prior_b"].as<double>(), -                   conf["strength_prior_s"].as<double>(), -                   conf["strength_prior_r"].as<double>()); -  vector<WordID> ctx(kORDER - 1, TD::Convert("<s>")); -  for (int SS=0; SS < samples; ++SS) { -    for (int ci = 0; ci < corpuse.size(); ++ci) { -      ctx.resize(kORDER - 1); -      const vector<WordID>& s = corpuse[ci]; -      for (int i = 0; i <= s.size(); ++i) { -        WordID w = (i < s.size() ? s[i] : kEOS); -        if (SS > 0) lm.decrement(w, ctx, &rng); -        lm.increment(w, ctx, &rng); -        ctx.push_back(w); -      } -    } -    if (SS % 10 == 9) { -      cerr << " [LLH=" << lm.log_likelihood() << "]" << endl; -      if (SS % 30 == 29) lm.resample_hyperparameters(&rng); -    } else { cerr << '.' << flush; } -  } -  double llh = 0; -  unsigned cnt = 0; -  unsigned oovs = 0; -  for (int ci = 0; ci < test.size(); ++ci) { -    ctx.resize(kORDER - 1); -    const vector<WordID>& s = test[ci]; -    for (int i = 0; i <= s.size(); ++i) { -      WordID w = (i < s.size() ? 
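The update resumed below is the collapsed-Gibbs pattern for seating arrangements: on the first pass (SS == 0) tokens are only incremented, since they have never been seated, while every later sweep first decrements a token (unseating its customer) and immediately re-increments it under the current state of all other tokens. A schematic of the sweep, assuming any model exposing this increment/decrement interface:

  #include <utility>
  #include <vector>

  // One Gibbs sweep over (word, context) pairs; names illustrative.
  template <class LM, class RNG>
  void GibbsSweep(LM* lm,
                  const std::vector<std::pair<int, std::vector<int> > >& data,
                  bool first_sweep, RNG* rng) {
    for (size_t t = 0; t < data.size(); ++t) {
      if (!first_sweep) lm->decrement(data[t].first, data[t].second, rng);  // unseat
      lm->increment(data[t].first, data[t].second, rng);                    // reseat
    }
  }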
s[i] : kEOS); -      double lp = log(lm.prob(w, ctx)) / log(2); -      if (i < s.size() && vocabe.count(w) == 0) { -        cerr << "**OOV "; -        ++oovs; -        lp = 0; -      } -      cerr << "p(" << TD::Convert(w) << " |"; -      for (int j = ctx.size() + 1 - kORDER; j < ctx.size(); ++j) -        cerr << ' ' << TD::Convert(ctx[j]); -      cerr << ") = " << lp << endl; -      ctx.push_back(w); -      llh -= lp; -      cnt++; -    } -  } -  cerr << "  Log_10 prob: " << (-llh * log(2) / log(10)) << endl; -  cerr << "        Count: " << cnt << endl; -  cerr << "         OOVs: " << oovs << endl; -  cerr << "Cross-entropy: " << (llh / cnt) << endl; -  cerr << "   Perplexity: " << pow(2, llh / cnt) << endl; -  return 0; -} - - diff --git a/gi/pf/pyp_tm.cc b/gi/pf/pyp_tm.cc deleted file mode 100644 index 37b9a604..00000000 --- a/gi/pf/pyp_tm.cc +++ /dev/null @@ -1,128 +0,0 @@ -#include "pyp_tm.h" - -#include <tr1/unordered_map> -#include <iostream> -#include <queue> - -#include "tdict.h" -#include "ccrp.h" -#include "pyp_word_model.h" -#include "tied_resampler.h" - -using namespace std; -using namespace std::tr1; - -struct FreqBinner { -  FreqBinner(const std::string& fname) { fd_.Load(fname); } -  unsigned NumberOfBins() const { return fd_.Max() + 1; } -  unsigned Bin(const WordID& w) const { return fd_.LookUp(w); } -  FreqDict<unsigned> fd_; -}; - -template <typename Base, class Binner = FreqBinner> -struct ConditionalPYPWordModel { -  ConditionalPYPWordModel(Base* b, const Binner* bnr = NULL) : -      base(*b), -      binner(bnr), -      btr(binner ? binner->NumberOfBins() + 1u : 2u) {} - -  void Summary() const { -    cerr << "Number of conditioning contexts: " << r.size() << endl; -    for (RuleModelHash::const_iterator it = r.begin(); it != r.end(); ++it) { -      cerr << TD::Convert(it->first) << "   \tPYP(d=" << it->second.discount() << ",s=" << it->second.strength() << ") --------------------------" << endl; -      for (CCRP<vector<WordID> >::const_iterator i2 = it->second.begin(); i2 != it->second.end(); ++i2) -        cerr << "   " << i2->second << '\t' << TD::GetString(i2->first) << endl; -    } -  } - -  void ResampleHyperparameters(MT19937* rng) { -    btr.ResampleHyperparameters(rng); -  }  - -  prob_t Prob(const WordID src, const vector<WordID>& trglets) const { -    RuleModelHash::const_iterator it = r.find(src); -    if (it == r.end()) { -      return base(trglets); -    } else { -      return it->second.prob(trglets, base(trglets)); -    } -  } - -  void Increment(const WordID src, const vector<WordID>& trglets, MT19937* rng) { -    RuleModelHash::iterator it = r.find(src); -    if (it == r.end()) { -      it = r.insert(make_pair(src, CCRP<vector<WordID> >(0.5,1.0))).first; -      static const WordID kNULL = TD::Convert("NULL"); -      unsigned bin = (src == kNULL ? 
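The ternary completed below routes each source word to a hyperparameter bin: NULL gets the dedicated bin 0, and when a FreqBinner is present every other word is assigned bin Bin(src) + 1, i.e. grouped by its corpus frequency (loaded from the 10k.freq table). BinTiedResampler then resamples one discount/strength pair per bin, so rare and frequent source words can learn different PYP hyperparameters while the CRPs within a bin pool their evidence. Schematically:

  bin(src) = 0                        if src == NULL
  bin(src) = FreqBinner::Bin(src) + 1 otherwise, when a binner is supplied
  bin(src) = 1                        otherwise, with no binner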
0 : 1); -      if (binner && bin) { bin = binner->Bin(src) + 1; } -      btr.Add(bin, &it->second); -    } -    if (it->second.increment(trglets, base(trglets), rng)) -      base.Increment(trglets, rng); -  } - -  void Decrement(const WordID src, const vector<WordID>& trglets, MT19937* rng) { -    RuleModelHash::iterator it = r.find(src); -    assert(it != r.end()); -    if (it->second.decrement(trglets, rng)) { -      base.Decrement(trglets, rng); -    } -  } - -  prob_t Likelihood() const { -    prob_t p = prob_t::One(); -    for (RuleModelHash::const_iterator it = r.begin(); it != r.end(); ++it) { -      prob_t q; q.logeq(it->second.log_crp_prob()); -      p *= q; -    } -    return p; -  } - -  unsigned UniqueConditioningContexts() const { -    return r.size(); -  } - -  // TODO tie PYP hyperparameters based on source word frequency bins -  Base& base; -  const Binner* binner; -  BinTiedResampler<CCRP<vector<WordID> > > btr; -  typedef unordered_map<WordID, CCRP<vector<WordID> > > RuleModelHash; -  RuleModelHash r; -}; - -PYPLexicalTranslation::PYPLexicalTranslation(const vector<vector<WordID> >& lets, -                                             const unsigned vocab_size, -                                             const unsigned num_letters) : -    letters(lets), -    base(vocab_size, num_letters, 5), -    tmodel(new ConditionalPYPWordModel<PoissonUniformWordModel>(&base, new FreqBinner("10k.freq"))), -    kX(-TD::Convert("X")) {} - -void PYPLexicalTranslation::Summary() const { -  tmodel->Summary(); -} - -prob_t PYPLexicalTranslation::Likelihood() const { -  return tmodel->Likelihood() * base.Likelihood(); -} - -void PYPLexicalTranslation::ResampleHyperparameters(MT19937* rng) { -  tmodel->ResampleHyperparameters(rng); -} - -unsigned PYPLexicalTranslation::UniqueConditioningContexts() const { -  return tmodel->UniqueConditioningContexts(); -} - -prob_t PYPLexicalTranslation::Prob(WordID src, WordID trg) const { -  return tmodel->Prob(src, letters[trg]); -} - -void PYPLexicalTranslation::Increment(WordID src, WordID trg, MT19937* rng) { -  tmodel->Increment(src, letters[trg], rng); -} - -void PYPLexicalTranslation::Decrement(WordID src, WordID trg, MT19937* rng) { -  tmodel->Decrement(src, letters[trg], rng); -} - diff --git a/gi/pf/pyp_tm.h b/gi/pf/pyp_tm.h deleted file mode 100644 index 2b076a25..00000000 --- a/gi/pf/pyp_tm.h +++ /dev/null @@ -1,36 +0,0 @@ -#ifndef PYP_LEX_TRANS -#define PYP_LEX_TRANS - -#include <vector> -#include "wordid.h" -#include "prob.h" -#include "sampler.h" -#include "freqdict.h" -#include "poisson_uniform_word_model.h" - -struct FreqBinner; -template <typename T, class B> struct ConditionalPYPWordModel; - -struct PYPLexicalTranslation { -  explicit PYPLexicalTranslation(const std::vector<std::vector<WordID> >& lets, -                                 const unsigned vocab_size, -                                 const unsigned num_letters); - -  prob_t Likelihood() const; - -  void ResampleHyperparameters(MT19937* rng); -  prob_t Prob(WordID src, WordID trg) const;  // return p(trg | src) -  void Summary() const; -  void Increment(WordID src, WordID trg, MT19937* rng); -  void Decrement(WordID src, WordID trg, MT19937* rng); -  unsigned UniqueConditioningContexts() const; - - private: -  const std::vector<std::vector<WordID> >& letters;   // spelling dictionary -  PoissonUniformWordModel base;  // "generator" of English types -  ConditionalPYPWordModel<PoissonUniformWordModel, FreqBinner>* tmodel;  // translation distributions -                      // 
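Note: Increment above routes each source word's restaurant into a bin (bin 0 is reserved for NULL) so that BinTiedResampler ties PYP hyperparameters within a bin. The real FreqBinner loads precomputed bins from "10k.freq"; the binner below is a hypothetical log-scale stand-in, shown only to illustrate the idea:

#include <cmath>
#include <cstdio>

// Hypothetical stand-in for FreqBinner: map a word's corpus frequency to
// a small bin id on a log scale, so rare and common words get separately
// tied hyperparameters. Not the codebase's file-driven binner.
unsigned FreqBin(unsigned count, unsigned num_bins) {
  unsigned b = (unsigned)std::log2((double)count + 1.0);
  return b < num_bins ? b : num_bins - 1;
}

int main() {
  const unsigned counts[] = {1, 10, 100, 10000};
  for (unsigned i = 0; i < 4; ++i)
    std::printf("count %u -> bin %u\n", counts[i], FreqBin(counts[i], 8));
  return 0;
}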
(model English word | French word) -  const WordID kX; -}; - -#endif diff --git a/gi/pf/pyp_word_model.h b/gi/pf/pyp_word_model.h deleted file mode 100644 index 0bebb751..00000000 --- a/gi/pf/pyp_word_model.h +++ /dev/null @@ -1,61 +0,0 @@ -#ifndef _PYP_WORD_MODEL_H_ -#define _PYP_WORD_MODEL_H_ - -#include <iostream> -#include <cmath> -#include <vector> -#include "prob.h" -#include "ccrp.h" -#include "m.h" -#include "tdict.h" -#include "os_phrase.h" - -// PYP(d,s,poisson-uniform) represented as a CRP -template <class Base> -struct PYPWordModel { -  explicit PYPWordModel(Base* b) : -      base(*b), -      r(1,1,1,1,0.66,50.0) -    {} - -  void ResampleHyperparameters(MT19937* rng) { -    r.resample_hyperparameters(rng); -    std::cerr << " PYPWordModel(d=" << r.discount() << ",s=" << r.strength() << ")\n"; -  } - -  inline prob_t operator()(const std::vector<WordID>& s) const { -    return r.prob(s, base(s)); -  } - -  inline void Increment(const std::vector<WordID>& s, MT19937* rng) { -    if (r.increment(s, base(s), rng)) -      base.Increment(s, rng); -  } - -  inline void Decrement(const std::vector<WordID>& s, MT19937 *rng) { -    if (r.decrement(s, rng)) -      base.Decrement(s, rng); -  } - -  inline prob_t Likelihood() const { -    prob_t p; p.logeq(r.log_crp_prob()); -    p *= base.Likelihood(); -    return p; -  } - -  void Summary() const { -    std::cerr << "PYPWordModel: generations=" << r.num_customers() -         << " PYP(d=" << r.discount() << ",s=" << r.strength() << ')' << std::endl; -    for (typename CCRP<std::vector<WordID> >::const_iterator it = r.begin(); it != r.end(); ++it) { -      std::cerr << "  " << it->second -                << TD::GetString(it->first) << std::endl; -    } -  } - - private: - -  Base& base;  // keeps track of the draws from the base distribution -  CCRP<std::vector<WordID> > r; -}; - -#endif diff --git a/gi/pf/quasi_model2.h b/gi/pf/quasi_model2.h deleted file mode 100644 index 4075affe..00000000 --- a/gi/pf/quasi_model2.h +++ /dev/null @@ -1,177 +0,0 @@ -#ifndef _QUASI_MODEL2_H_ -#define _QUASI_MODEL2_H_ - -#include <vector> -#include <cmath> -#include <tr1/unordered_map> -#include "boost/functional.hpp" -#include "prob.h" -#include "array2d.h" -#include "slice_sampler.h" -#include "m.h" -#include "have_64_bits.h" - -struct AlignmentObservation { -  AlignmentObservation() : src_len(), trg_len(), j(), a_j() {} -  AlignmentObservation(unsigned sl, unsigned tl, unsigned tw, unsigned sw) : -      src_len(sl), trg_len(tl), j(tw), a_j(sw) {} -  unsigned short src_len; -  unsigned short trg_len; -  unsigned short j; -  unsigned short a_j; -}; - -#ifdef HAVE_64_BITS -inline size_t hash_value(const AlignmentObservation& o) { -  return reinterpret_cast<const size_t&>(o); -} -inline bool operator==(const AlignmentObservation& a, const AlignmentObservation& b) { -  return hash_value(a) == hash_value(b); -} -#else -inline size_t hash_value(const AlignmentObservation& o) { -  size_t h = 1; -  boost::hash_combine(h, o.src_len); -  boost::hash_combine(h, o.trg_len); -  boost::hash_combine(h, o.j); -  boost::hash_combine(h, o.a_j); -  return h; -} -#endif - -struct QuasiModel2 { -  explicit QuasiModel2(double alpha, double pnull = 0.1) : -      alpha_(alpha), -      pnull_(pnull), -      pnotnull_(1 - pnull) {} - -  // a_j = 0 => NULL; src_len does *not* include null -  prob_t Prob(unsigned a_j, unsigned j, unsigned src_len, unsigned trg_len) const { -    if (!a_j) return pnull_; -    return pnotnull_ * -       prob_t(UnnormalizedProb(a_j, j, src_len, 
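Note: the HAVE_64_BITS fast path above hashes an AlignmentObservation by reinterpreting its four unsigned shorts as a single size_t, which is only sound because the struct occupies exactly 8 padding-free bytes on a 64-bit target. A guarded sketch of the same trick; memcpy is used instead of reinterpret_cast to sidestep strict-aliasing concerns:

#include <cstddef>
#include <cstdio>
#include <cstring>

// Four 16-bit fields pack into exactly 8 bytes, so the whole observation
// can be read back as one size_t and used directly as the hash (and for
// equality), as quasi_model2.h does above. The static_assert mirrors the
// HAVE_64_BITS guard: on a 32-bit target this path is unavailable.
struct Obs {
  unsigned short src_len, trg_len, j, a_j;
};
static_assert(sizeof(Obs) == sizeof(std::size_t),
              "packed-hash trick requires an 8-byte struct on a 64-bit target");

inline std::size_t hash_value(const Obs& o) {
  std::size_t h;
  std::memcpy(&h, &o, sizeof(h));  // well-defined bit copy
  return h;
}

int main() {
  Obs a = {5, 7, 2, 3}, b = {5, 7, 2, 3};
  std::printf("equal hashes: %d\n", hash_value(a) == hash_value(b));  // 1
  return 0;
}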
trg_len, alpha_) / GetOrComputeZ(j, src_len, trg_len)); -  } - -  void Increment(unsigned a_j, unsigned j, unsigned src_len, unsigned trg_len) { -    assert(a_j <= src_len); -    assert(j < trg_len); -    ++obs_[AlignmentObservation(src_len, trg_len, j, a_j)]; -  } - -  void Decrement(unsigned a_j, unsigned j, unsigned src_len, unsigned trg_len) { -    const AlignmentObservation ao(src_len, trg_len, j, a_j); -    int &cc = obs_[ao]; -    assert(cc > 0); -    --cc; -    if (!cc) obs_.erase(ao); -  } - -  struct PNullResampler { -    PNullResampler(const QuasiModel2& m) : m_(m) {} -    const QuasiModel2& m_; -    double operator()(const double& proposed_pnull) const { -      return log(m_.Likelihood(m_.alpha_, proposed_pnull)); -    } -  }; - -  struct AlphaResampler { -    AlphaResampler(const QuasiModel2& m) : m_(m) {} -    const QuasiModel2& m_; -    double operator()(const double& proposed_alpha) const { -      return log(m_.Likelihood(proposed_alpha, m_.pnull_.as_float())); -    } -  }; - -  void ResampleHyperparameters(MT19937* rng, const unsigned nloop = 5, const unsigned niterations = 10) { -    const PNullResampler dr(*this); -    const AlphaResampler ar(*this); -    for (unsigned i = 0; i < nloop; ++i) { -      double pnull = slice_sampler1d(dr, pnull_.as_float(), *rng, 0.00000001, -                            1.0, 0.0, niterations, 100*niterations); -      pnull_ = prob_t(pnull); -      alpha_ = slice_sampler1d(ar, alpha_, *rng, 0.00000001, -                              std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations); -    } -    std::cerr << "QuasiModel2(alpha=" << alpha_ << ",p_null=" -              << pnull_.as_float() << ") = " << Likelihood() << std::endl; -    zcache_.clear(); -  } - -  prob_t Likelihood() const { -    return Likelihood(alpha_, pnull_.as_float()); -  } - -  prob_t Likelihood(double alpha, double ppnull) const { -    const prob_t pnull(ppnull); -    const prob_t pnotnull(1 - ppnull); - -    prob_t p; -    p.logeq(Md::log_gamma_density(alpha, 0.1, 25));  // TODO configure -    assert(!p.is_0()); -    prob_t prob_of_ppnull; prob_of_ppnull.logeq(Md::log_beta_density(ppnull, 2, 10)); -    assert(!prob_of_ppnull.is_0()); -    p *= prob_of_ppnull; -    for (ObsCount::const_iterator it = obs_.begin(); it != obs_.end(); ++it) { -      const AlignmentObservation& ao = it->first; -      if (ao.a_j) { -        prob_t u = XUnnormalizedProb(ao.a_j, ao.j, ao.src_len, ao.trg_len, alpha); -        prob_t z = XComputeZ(ao.j, ao.src_len, ao.trg_len, alpha); -        prob_t pa(u / z); -        pa *= pnotnull; -        pa.poweq(it->second); -        p *= pa; -      } else { -        p *= pnull.pow(it->second); -      } -    } -    return p; -  } - - private: -  static prob_t XUnnormalizedProb(unsigned a_j, unsigned j, unsigned src_len, unsigned trg_len, double alpha) { -    prob_t p; -    p.logeq(-fabs(double(a_j - 1) / src_len - double(j) / trg_len) * alpha); -    return p; -  } - -  static prob_t XComputeZ(unsigned j, unsigned src_len, unsigned trg_len, double alpha) { -    prob_t z = prob_t::Zero(); -    for (int a_j = 1; a_j <= src_len; ++a_j) -      z += XUnnormalizedProb(a_j, j, src_len, trg_len, alpha); -    return z; -  } - -  static double UnnormalizedProb(unsigned a_j, unsigned j, unsigned src_len, unsigned trg_len, double alpha) { -    return exp(-fabs(double(a_j - 1) / src_len - double(j) / trg_len) * alpha); -  } - -  static double ComputeZ(unsigned j, unsigned src_len, unsigned trg_len, double alpha) { -    double z = 0; -    for (int 
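Note: UnnormalizedProb above scores alignment link a_j by its distance from the diagonal, p(a_j | j, n, m) proportional to exp(-alpha * |(a_j-1)/n - j/m|) for a_j in 1..n, with a fixed p_null for the NULL link a_j = 0. A toy that materializes the normalized distribution and shows the effect of alpha:

#include <cmath>
#include <cstdio>
#include <vector>

// Diagonal-biased alignment prior from QuasiModel2 above, fully normalized.
std::vector<double> AlignDist(unsigned j, unsigned n, unsigned m,
                              double alpha, double p_null) {
  std::vector<double> p(n + 1);
  p[0] = p_null;
  double z = 0;
  for (unsigned a = 1; a <= n; ++a) {
    p[a] = std::exp(-std::fabs(double(a - 1) / n - double(j) / m) * alpha);
    z += p[a];
  }
  for (unsigned a = 1; a <= n; ++a) p[a] *= (1 - p_null) / z;
  return p;
}

int main() {
  // Larger alpha concentrates mass near the diagonal.
  const double alphas[] = {0.5, 4.0};
  for (int k = 0; k < 2; ++k) {
    std::vector<double> p = AlignDist(2, 5, 5, alphas[k], 0.08);
    std::printf("alpha=%.1f:", alphas[k]);
    for (size_t a = 0; a < p.size(); ++a) std::printf(" %.3f", p[a]);
    std::printf("\n");
  }
  return 0;
}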
a_j = 1; a_j <= src_len; ++a_j) -      z += UnnormalizedProb(a_j, j, src_len, trg_len, alpha); -    return z; -  } - -  const double& GetOrComputeZ(unsigned j, unsigned src_len, unsigned trg_len) const { -    if (src_len >= zcache_.size()) -      zcache_.resize(src_len + 1); -    if (trg_len >= zcache_[src_len].size()) -      zcache_[src_len].resize(trg_len + 1); -    std::vector<double>& zv = zcache_[src_len][trg_len]; -    if (zv.size() == 0) -      zv.resize(trg_len); -    double& z = zv[j]; -    if (!z) -      z = ComputeZ(j, src_len, trg_len, alpha_); -    return z; -  } - -  double alpha_; -  prob_t pnull_; -  prob_t pnotnull_; -  mutable std::vector<std::vector<std::vector<double> > > zcache_; -  typedef std::tr1::unordered_map<AlignmentObservation, int, boost::hash<AlignmentObservation> > ObsCount; -  ObsCount obs_; -}; - -#endif diff --git a/gi/pf/reachability.cc b/gi/pf/reachability.cc deleted file mode 100644 index 7d0d04ac..00000000 --- a/gi/pf/reachability.cc +++ /dev/null @@ -1,74 +0,0 @@ -#include "reachability.h" - -#include <vector> -#include <iostream> - -using namespace std; - -struct SState { -  SState() : prev_src_covered(), prev_trg_covered() {} -  SState(int i, int j) : prev_src_covered(i), prev_trg_covered(j) {} -  int prev_src_covered; -  int prev_trg_covered; -}; - -void Reachability::ComputeReachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len) { -    typedef boost::multi_array<vector<SState>, 2> array_type; -    array_type a(boost::extents[srclen + 1][trglen + 1]); -    a[0][0].push_back(SState()); -    for (int i = 0; i < srclen; ++i) { -      for (int j = 0; j < trglen; ++j) { -        if (a[i][j].size() == 0) continue; -        const SState prev(i,j); -        for (int k = 1; k <= src_max_phrase_len; ++k) { -          if ((i + k) > srclen) continue; -          for (int l = 1; l <= trg_max_phrase_len; ++l) { -            if ((j + l) > trglen) continue; -            a[i + k][j + l].push_back(prev); -          } -        } -      } -    } -    a[0][0].clear(); -    //cerr << srclen << "," << trglen << ": Final cell contains " << a[srclen][trglen].size() << " back pointers\n"; -    if (a[srclen][trglen].empty()) { -      cerr << "Sequence pair with lengths (" << srclen << ',' << trglen << ") violates reachability constraints\n"; -      nodes = 0; -      return; -    } - -    typedef boost::multi_array<bool, 2> rarray_type; -    rarray_type r(boost::extents[srclen + 1][trglen + 1]); -    r[srclen][trglen] = true; -    nodes = 0; -    for (int i = srclen; i >= 0; --i) { -      for (int j = trglen; j >= 0; --j) { -        vector<SState>& prevs = a[i][j]; -        if (!r[i][j]) { prevs.clear(); } -        for (int k = 0; k < prevs.size(); ++k) { -          r[prevs[k].prev_src_covered][prevs[k].prev_trg_covered] = true; -          int src_delta = i - prevs[k].prev_src_covered; -          edges[prevs[k].prev_src_covered][prevs[k].prev_trg_covered][src_delta][j - prevs[k].prev_trg_covered] = true; -          valid_deltas[prevs[k].prev_src_covered][prevs[k].prev_trg_covered].push_back(make_pair<short,short>(src_delta,j - prevs[k].prev_trg_covered)); -          short &msd = max_src_delta[prevs[k].prev_src_covered][prevs[k].prev_trg_covered]; -          if (src_delta > msd) msd = src_delta; -        } -      } -    } -    assert(!edges[0][0][1][0]); -    assert(!edges[0][0][0][1]); -    assert(!edges[0][0][0][0]); -    assert(max_src_delta[0][0] > 0); -    nodes = 0; -    for (int i = 0; i < srclen; ++i) { -      for (int j = 0; j < trglen; 
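Note: ComputeReachability above runs the forward pass below (reduced here to booleans): a coverage cell (i,j) is reachable iff some phrase pair of size (k,l) within the length limits extends a reachable predecessor. The real code then walks backward from (srclen,trglen) to keep only cells on a complete path, assigns node addresses, and records valid deltas. Toy lengths, illustration only:

#include <cstdio>
#include <vector>

int main() {
  const int n = 4, m = 5, K = 2, L = 2;  // toy sentence lengths, phrase limits
  std::vector<std::vector<bool> > fwd(n + 1, std::vector<bool>(m + 1, false));
  fwd[0][0] = true;
  for (int i = 0; i < n; ++i)
    for (int j = 0; j < m; ++j) {
      if (!fwd[i][j]) continue;
      for (int k = 1; k <= K && i + k <= n; ++k)
        for (int l = 1; l <= L && j + l <= m; ++l)
          fwd[i + k][j + l] = true;  // extend coverage by a (k,l) phrase pair
    }
  std::printf("(%d,%d) reachable: %d\n", n, m, (int)fwd[n][m]);  // 1
  return 0;
}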
++j) { -        if (valid_deltas[i][j].size() > 0) { -          node_addresses[i][j] = nodes++; -        } else { -          node_addresses[i][j] = -1; -        } -      } -    } -    cerr << "Sequence pair with lengths (" << srclen << ',' << trglen << ") has " << valid_deltas[0][0].size() << " out edges in its root node, " << nodes << " nodes in total, and outside estimate matrix will require " << sizeof(float)*nodes << " bytes\n"; -  } - diff --git a/gi/pf/reachability.h b/gi/pf/reachability.h deleted file mode 100644 index 1e22c76a..00000000 --- a/gi/pf/reachability.h +++ /dev/null @@ -1,34 +0,0 @@ -#ifndef _REACHABILITY_H_ -#define _REACHABILITY_H_ - -#include "boost/multi_array.hpp" - -// determines minimum and maximum lengths of outgoing edges from all -// coverage positions such that the alignment path respects src and -// trg maximum phrase sizes -// -// runs in O(n^2 * src_max * trg_max) time but should be relatively fast -// -// currently forbids 0 -> n and n -> 0 alignments - -struct Reachability { -  unsigned nodes; -  boost::multi_array<bool, 4> edges;  // edges[src_covered][trg_covered][src_delta][trg_delta] is this edge worth exploring? -  boost::multi_array<short, 2> max_src_delta; // msd[src_covered][trg_covered] -- the largest src delta that's valid -  boost::multi_array<short, 2> node_addresses; // na[src_covered][trg_covered] -- the index of the node in a one-dimensional array (of size "nodes") -  boost::multi_array<std::vector<std::pair<short,short> >, 2> valid_deltas; // valid_deltas[src_covered][trg_covered] list of valid transitions leaving a particular node - -  Reachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len) : -      nodes(), -      edges(boost::extents[srclen][trglen][src_max_phrase_len+1][trg_max_phrase_len+1]), -      max_src_delta(boost::extents[srclen][trglen]), -      node_addresses(boost::extents[srclen][trglen]), -      valid_deltas(boost::extents[srclen][trglen]) { -    ComputeReachability(srclen, trglen, src_max_phrase_len, trg_max_phrase_len); -  } - - private: -  void ComputeReachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len); -}; - -#endif diff --git a/gi/pf/tied_resampler.h b/gi/pf/tied_resampler.h deleted file mode 100644 index a4f4af36..00000000 --- a/gi/pf/tied_resampler.h +++ /dev/null @@ -1,122 +0,0 @@ -#ifndef _TIED_RESAMPLER_H_ -#define _TIED_RESAMPLER_H_ - -#include <set> -#include <vector> -#include "sampler.h" -#include "slice_sampler.h" -#include "m.h" - -template <class CRP> -struct TiedResampler { -  explicit TiedResampler(double da, double db, double ss, double sr, double d=0.5, double s=1.0) : -      d_alpha(da), -      d_beta(db), -      s_shape(ss), -      s_rate(sr), -      discount(d), -      strength(s) {} - -  void Add(CRP* crp) { -    crps.insert(crp); -    crp->set_discount(discount); -    crp->set_strength(strength); -    assert(!crp->has_discount_prior()); -    assert(!crp->has_strength_prior()); -  } - -  void Remove(CRP* crp) { -    crps.erase(crp); -  } - -  size_t size() const { -    return crps.size(); -  } - -  double LogLikelihood(double d, double s) const { -    if (s <= -d) return -std::numeric_limits<double>::infinity(); -    double llh = Md::log_beta_density(d, d_alpha, d_beta) + -                 Md::log_gamma_density(d + s, s_shape, s_rate); -    for (typename std::set<CRP*>::iterator it = crps.begin(); it != crps.end(); ++it) -      llh += (*it)->log_crp_prob(d, s); -    return llh; -  } - -  double LogLikelihood() const { -    return 
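Note: TiedResampler::LogLikelihood above puts a Beta prior on the discount d and a Gamma prior on d + s, then adds every tied restaurant's log CRP probability. The stand-in log densities below show the formulas being evaluated; they are not the codebase's m.h, whose exact parameterization (rate vs. scale) should be checked against that header:

#include <cmath>
#include <cstdio>

// log Beta(x; a, b) and log Gamma(x; shape, rate) densities.
double log_beta_density(double x, double a, double b) {
  return (a - 1) * std::log(x) + (b - 1) * std::log(1 - x)
         + std::lgamma(a + b) - std::lgamma(a) - std::lgamma(b);
}
double log_gamma_density(double x, double shape, double rate) {
  return shape * std::log(rate) + (shape - 1) * std::log(x)
         - rate * x - std::lgamma(shape);
}

int main() {
  // Joint prior over a PYP discount d in (0,1) and strength s > -d,
  // combined exactly as in LogLikelihood above.
  double d = 0.5, s = 1.0;
  std::printf("log p(d,s) = %g\n",
              log_beta_density(d, 1, 1) + log_gamma_density(d + s, 1, 1));  // -1.5
  return 0;
}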
LogLikelihood(discount, strength); -  } - -  struct DiscountResampler { -    DiscountResampler(const TiedResampler& m) : m_(m) {} -    const TiedResampler& m_; -    double operator()(const double& proposed_discount) const { -      return m_.LogLikelihood(proposed_discount, m_.strength); -    } -  }; - -  struct AlphaResampler { -    AlphaResampler(const TiedResampler& m) : m_(m) {} -    const TiedResampler& m_; -    double operator()(const double& proposed_strength) const { -      return m_.LogLikelihood(m_.discount, proposed_strength); -    } -  }; - -  void ResampleHyperparameters(MT19937* rng, const unsigned nloop = 5, const unsigned niterations = 10) { -    if (size() == 0) { std::cerr << "EMPTY - not resampling\n"; return; } -    const DiscountResampler dr(*this); -    const AlphaResampler ar(*this); -    for (int iter = 0; iter < nloop; ++iter) { -      strength = slice_sampler1d(ar, strength, *rng, -discount + std::numeric_limits<double>::min(), -                              std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations); -      double min_discount = std::numeric_limits<double>::min(); -      if (strength < 0.0) min_discount -= strength; -      discount = slice_sampler1d(dr, discount, *rng, min_discount, -                          1.0, 0.0, niterations, 100*niterations); -    } -    strength = slice_sampler1d(ar, strength, *rng, -discount + std::numeric_limits<double>::min(), -                            std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations); -    std::cerr << "TiedCRPs(d=" << discount << ",s=" -              << strength << ") = " << LogLikelihood(discount, strength) << std::endl; -    for (typename std::set<CRP*>::iterator it = crps.begin(); it != crps.end(); ++it) -      (*it)->set_hyperparameters(discount, strength); -  } - private: -  std::set<CRP*> crps; -  const double d_alpha, d_beta, s_shape, s_rate; -  double discount, strength; -}; - -// split according to some criterion -template <class CRP> -struct BinTiedResampler { -  explicit BinTiedResampler(unsigned nbins) : -      resamplers(nbins, TiedResampler<CRP>(1,1,1,1)) {} - -  void Add(unsigned bin, CRP* crp) { -    resamplers[bin].Add(crp); -  } - -  void Remove(unsigned bin, CRP* crp) { -    resamplers[bin].Remove(crp); -  } - -  void ResampleHyperparameters(MT19937* rng) { -    for (unsigned i = 0; i < resamplers.size(); ++i) { -      std::cerr << "BIN " << i << " (" << resamplers[i].size() << " CRPs): " << std::flush; -      resamplers[i].ResampleHyperparameters(rng); -    } -  } - -  double LogLikelihood() const { -    double llh = 0; -    for (unsigned i = 0; i < resamplers.size(); ++i) -      llh += resamplers[i].LogLikelihood(); -    return llh; -  } - - private: -  std::vector<TiedResampler<CRP> > resamplers; -}; - -#endif diff --git a/gi/pf/tpf.cc b/gi/pf/tpf.cc deleted file mode 100644 index 7348d21c..00000000 --- a/gi/pf/tpf.cc +++ /dev/null @@ -1,99 +0,0 @@ -#include <iostream> -#include <tr1/memory> -#include <queue> - -#include "sampler.h" - -using namespace std; -using namespace tr1; - -shared_ptr<MT19937> prng; - -struct Particle { -  Particle() : weight(prob_t::One()) {} -  vector<int> states; -  prob_t weight; -  prob_t gamma_last; -}; - -ostream& operator<<(ostream& os, const Particle& p) { -  os << "["; -  for (int i = 0; i < p.states.size(); ++i) os << p.states[i] << ' '; -  os << "| w=" << log(p.weight) << ']'; -  return os; -} - -void Rejuvenate(vector<Particle>& pps) { -  SampleSet<prob_t> ss; -  vector<Particle> nps(pps.size()); -  
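Note: slice_sampler1d above resamples strength and discount alternately within their supports. Below is an illustrative reimplementation of univariate slice sampling (stepping-out plus shrinkage, after Neal 2003); it is a sketch of the technique, not the utils/slice_sampler.h interface:

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <random>

// Draw one slice-sampling update for target density exp(f(x)) on (lo, hi).
template <class LogF, class RNG>
double slice_sample(const LogF& f, double x0, RNG& rng,
                    double lo, double hi, double w = 1.0, int max_steps = 50) {
  std::uniform_real_distribution<double> u01(0.0, 1.0);
  double logy = f(x0) + std::log(u01(rng));      // slice level under f(x0)
  double l = std::max(lo, x0 - w * u01(rng));    // randomly placed window
  double r = std::min(hi, l + w);
  for (int i = 0; i < max_steps && l > lo && f(l) > logy; ++i) l = std::max(lo, l - w);
  for (int i = 0; i < max_steps && r < hi && f(r) > logy; ++i) r = std::min(hi, r + w);
  for (;;) {                                     // shrink until accepted
    double x1 = l + (r - l) * u01(rng);
    if (f(x1) > logy) return x1;
    (x1 < x0 ? l : r) = x1;
  }
}

int main() {
  std::mt19937 rng(1);
  auto logpdf = [](double x) { return -0.5 * x * x; };  // unnormalized N(0,1)
  double x = 0.1, sum = 0;
  for (int i = 0; i < 10000; ++i) { x = slice_sample(logpdf, x, rng, -10, 10); sum += x; }
  std::printf("mean ~ %g (expect ~0)\n", sum / 10000);
  return 0;
}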
for (int i = 0; i < pps.size(); ++i) { -//    cerr << pps[i] << endl; -    ss.add(pps[i].weight); -  } -//  cerr << "REJUVINATING...\n"; -  for (int i = 0; i < pps.size(); ++i) { -    nps[i] = pps[prng->SelectSample(ss)]; -    nps[i].weight = prob_t(1.0 / pps.size()); -//    cerr << nps[i] << endl; -  } -  nps.swap(pps); -//  exit(1); -} - -int main(int argc, char** argv) { -  const unsigned particles = 100; -  prng.reset(new MT19937); -  MT19937& rng = *prng; - -  // q(a) = 0.8 -  // q(b) = 0.8 -  // q(c) = 0.4 -  SampleSet<double> ssq; -  ssq.add(0.4); -  ssq.add(0.6); -  ssq.add(0); -  double qz = 1; - -  // p(a) = 0.2 -  // p(b) = 0.8 -  vector<double> p(3); -  p[0] = 0.2; -  p[1] = 0.8; -  p[2] = 0; - -  vector<int> counts(3); -  int tot = 0; - -  vector<Particle> pps(particles); -  SampleSet<prob_t> ppss; -  int LEN = 12; -  int PP = 1; -  while (pps[0].states.size() < LEN) { -    for (int pi = 0; pi < particles; ++pi) { -      Particle& prt = pps[pi]; - -      bool redo = true; -      const Particle savedp = prt; -      while (redo) { -        redo = false; -        for (int i = 0; i < PP; ++i) { -          int s = rng.SelectSample(ssq); -          double gamma_last = p[s]; -          if (!gamma_last) { redo = true; break; } -          double q = ssq[s] / qz; -          prt.states.push_back(s); -          prt.weight *= prob_t(gamma_last / q); -        } -        if (redo) { prt = savedp; continue; } -      } -    } -    Rejuvenate(pps); -  } -  ppss.clear(); -  for (int i = 0; i < particles; ++i) { ppss.add(pps[i].weight); } -  int sp = rng.SelectSample(ppss); -  cerr << pps[sp] << endl; - -  return 0; -} - diff --git a/gi/pf/transliterations.cc b/gi/pf/transliterations.cc deleted file mode 100644 index b2996f65..00000000 --- a/gi/pf/transliterations.cc +++ /dev/null @@ -1,334 +0,0 @@ -#include "transliterations.h" - -#include <iostream> -#include <vector> - -#include "boost/shared_ptr.hpp" - -#include "backward.h" -#include "filelib.h" -#include "tdict.h" -#include "trule.h" -#include "filelib.h" -#include "ccrp_nt.h" -#include "m.h" -#include "reachability.h" - -using namespace std; -using namespace std::tr1; - -struct TruncatedConditionalLengthModel { -  TruncatedConditionalLengthModel(unsigned max_src_size, unsigned max_trg_size, double expected_src_to_trg_ratio) : -      plens(max_src_size+1, vector<prob_t>(max_trg_size+1, 0.0)) { -    for (unsigned i = 1; i <= max_src_size; ++i) { -      prob_t z = prob_t::Zero(); -      for (unsigned j = 1; j <= max_trg_size; ++j) -        z += (plens[i][j] = prob_t(0.01 + exp(Md::log_poisson(j, i * expected_src_to_trg_ratio)))); -      for (unsigned j = 1; j <= max_trg_size; ++j) -        plens[i][j] /= z; -      //for (unsigned j = 1; j <= max_trg_size; ++j) -      //  cerr << "P(trg_len=" << j << " | src_len=" << i << ") = " << plens[i][j] << endl; -    } -  } - -  // return p(tlen | slen) for *chunks* not full words -  inline const prob_t& operator()(int slen, int tlen) const { -    return plens[slen][tlen]; -  } - -  vector<vector<prob_t> > plens; -}; - -struct CondBaseDist { -  CondBaseDist(unsigned max_src_size, unsigned max_trg_size, double expected_src_to_trg_ratio) : -    tclm(max_src_size, max_trg_size, expected_src_to_trg_ratio) {} - -  prob_t operator()(const vector<WordID>& src, unsigned sf, unsigned st, -                    const vector<WordID>& trg, unsigned tf, unsigned tt) const { -    prob_t p = tclm(st - sf, tt - tf);  // target len | source length ~ TCLM(source len) -    assert(!"not impl"); -    return p; -  } -  inline 
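Note: tpf.cc above is a toy sequential importance sampler: each particle extends its state from proposal q, multiplies its weight by p/q, and Rejuvenate resamples particles in proportion to weight, resetting weights to uniform. A self-contained version showing that resampled states track the target p rather than the proposal q:

#include <algorithm>
#include <cstdio>
#include <random>
#include <vector>

int main() {
  std::mt19937 rng(42);
  const int N = 1000, STEPS = 12;
  std::vector<double> q = {0.4, 0.6}, p = {0.2, 0.8};  // proposal vs. target
  std::discrete_distribution<int> draw(q.begin(), q.end());
  std::vector<double> w(N, 1.0);
  std::vector<int> last(N);
  for (int t = 0; t < STEPS; ++t) {
    for (int i = 0; i < N; ++i) {
      int s = draw(rng);
      w[i] *= p[s] / q[s];  // importance weight update, as in the main loop above
      last[i] = s;
    }
    // Multinomial resampling ("rejuvenation"): draw N particles
    // with probability proportional to weight, then reset weights.
    std::discrete_distribution<int> res(w.begin(), w.end());
    std::vector<int> nl(N);
    for (int i = 0; i < N; ++i) nl[i] = last[res(rng)];
    last.swap(nl);
    std::fill(w.begin(), w.end(), 1.0);
  }
  int ones = 0;
  for (int i = 0; i < N; ++i) ones += last[i];
  std::printf("P(state=1) ~ %.2f (target 0.8)\n", double(ones) / N);
  return 0;
}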
prob_t operator()(const vector<WordID>& src, const vector<WordID>& trg) const { -    return (*this)(src, 0, src.size(), trg, 0, trg.size()); -  } -  TruncatedConditionalLengthModel tclm; -}; - -// represents transliteration phrase probabilities, e.g. -//   p( a l - | A l ) , p( o | A w ) , ... -struct TransliterationChunkConditionalModel { -  explicit TransliterationChunkConditionalModel(const CondBaseDist& pp0) : -      d(0.0), -      strength(1.0), -      rp0(pp0) { -  } - -  void Summary() const { -    std::cerr << "Number of conditioning contexts: " << r.size() << std::endl; -    for (RuleModelHash::const_iterator it = r.begin(); it != r.end(); ++it) { -      std::cerr << TD::GetString(it->first) << "   \t(\\alpha = " << it->second.alpha() << ") --------------------------" << std::endl; -      for (CCRP_NoTable<TRule>::const_iterator i2 = it->second.begin(); i2 != it->second.end(); ++i2) -        std::cerr << "   " << i2->second << '\t' << i2->first << std::endl; -    } -  } - -  int DecrementRule(const TRule& rule) { -    RuleModelHash::iterator it = r.find(rule.f_); -    assert(it != r.end());     -    int count = it->second.decrement(rule); -    if (count) { -      if (it->second.num_customers() == 0) r.erase(it); -    } -    return count; -  } - -  int IncrementRule(const TRule& rule) { -    RuleModelHash::iterator it = r.find(rule.f_); -    if (it == r.end()) { -      it = r.insert(make_pair(rule.f_, CCRP_NoTable<TRule>(strength))).first; -    }  -    int count = it->second.increment(rule); -    return count; -  } - -  void IncrementRules(const std::vector<TRulePtr>& rules) { -    for (int i = 0; i < rules.size(); ++i) -      IncrementRule(*rules[i]); -  } - -  void DecrementRules(const std::vector<TRulePtr>& rules) { -    for (int i = 0; i < rules.size(); ++i) -      DecrementRule(*rules[i]); -  } - -  prob_t RuleProbability(const TRule& rule) const { -    prob_t p; -    RuleModelHash::const_iterator it = r.find(rule.f_); -    if (it == r.end()) { -      p = rp0(rule.f_, rule.e_); -    } else { -      p = it->second.prob(rule, rp0(rule.f_, rule.e_)); -    } -    return p; -  } - -  double LogLikelihood(const double& dd, const double& aa) const { -    if (aa <= -dd) return -std::numeric_limits<double>::infinity(); -    //double llh = Md::log_beta_density(dd, 10, 3) + Md::log_gamma_density(aa, 1, 1); -    double llh = //Md::log_beta_density(dd, 1, 1) + -                 Md::log_gamma_density(dd + aa, 1, 1); -    std::tr1::unordered_map<std::vector<WordID>, CCRP_NoTable<TRule>, boost::hash<std::vector<WordID> > >::const_iterator it; -    for (it = r.begin(); it != r.end(); ++it) -      llh += it->second.log_crp_prob(aa); -    return llh; -  } - -  struct AlphaResampler { -    AlphaResampler(const TransliterationChunkConditionalModel& m) : m_(m) {} -    const TransliterationChunkConditionalModel& m_; -    double operator()(const double& proposed_strength) const { -      return m_.LogLikelihood(m_.d, proposed_strength); -    } -  }; - -  void ResampleHyperparameters(MT19937* rng) { -    std::tr1::unordered_map<std::vector<WordID>, CCRP_NoTable<TRule>, boost::hash<std::vector<WordID> > >::iterator it; -    //const unsigned nloop = 5; -    const unsigned niterations = 10; -    //DiscountResampler dr(*this); -    AlphaResampler ar(*this); -#if 0 -    for (int iter = 0; iter < nloop; ++iter) { -      strength = slice_sampler1d(ar, strength, *rng, -d + std::numeric_limits<double>::min(), -                              std::numeric_limits<double>::infinity(), 0.0, niterations, 
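Note: CCRP_NoTable, which backs the chunk rule model above, is the Dirichlet-process predictive rule with plain counts and no table bookkeeping: p(x) = (count(x) + alpha * p0(x)) / (total + alpha). A toy equivalent:

#include <cstdio>
#include <map>

struct TinyCRPNoTable {
  double alpha;
  std::map<int, int> counts;
  int total;
  explicit TinyCRPNoTable(double a) : alpha(a), total(0) {}
  void increment(int x) { ++counts[x]; ++total; }
  double prob(int x, double p0) const {
    std::map<int, int>::const_iterator it = counts.find(x);
    double c = (it == counts.end()) ? 0.0 : it->second;
    return (c + alpha * p0) / (total + alpha);
  }
};

int main() {
  TinyCRPNoTable crp(1.0);
  crp.increment(3); crp.increment(3); crp.increment(5);
  std::printf("p(3) = %.3f\n", crp.prob(3, 0.1));  // (2 + 0.1) / 4 = 0.525
  std::printf("p(9) = %.3f\n", crp.prob(9, 0.1));  // (0 + 0.1) / 4 = 0.025
  return 0;
}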
100*niterations); -      double min_discount = std::numeric_limits<double>::min(); -      if (strength < 0.0) min_discount -= strength; -      d = slice_sampler1d(dr, d, *rng, min_discount, -                          1.0, 0.0, niterations, 100*niterations); -    } -#endif -    strength = slice_sampler1d(ar, strength, *rng, -d, -                            std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations); -    std::cerr << "CTMModel(alpha=" << strength << ") = " << LogLikelihood(d, strength) << std::endl; -    for (it = r.begin(); it != r.end(); ++it) { -#if 0 -      it->second.set_discount(d); -#endif -      it->second.set_alpha(strength); -    } -  } - -  prob_t Likelihood() const { -    prob_t p; p.logeq(LogLikelihood(d, strength)); -    return p; -  } - -  const CondBaseDist& rp0; -  typedef std::tr1::unordered_map<std::vector<WordID>, -                                  CCRP_NoTable<TRule>, -                                  boost::hash<std::vector<WordID> > > RuleModelHash; -  RuleModelHash r; -  double d, strength; -}; - -struct GraphStructure { -  GraphStructure() : r() {} -  // leak memory - these are basically static -  const Reachability* r; -  bool IsReachable() const { return r->nodes > 0; } -}; - -struct ProbabilityEstimates { -  ProbabilityEstimates() : gs(), backward() {} -  explicit ProbabilityEstimates(const GraphStructure& g) : -      gs(&g), backward() { -    if (g.r->nodes > 0) -      backward = new float[g.r->nodes]; -  } -  // leak memory, these are static - -  // returns an estimate of the marginal probability -  double MarginalEstimate() const { -    if (!backward) return 0; -    return backward[0]; -  } - -  // returns an backward estimate -  double Backward(int src_covered, int trg_covered) const { -    if (!backward) return 0; -    int ind = gs->r->node_addresses[src_covered][trg_covered]; -    if (ind < 0) return 0; -    return backward[ind]; -  } - -  prob_t estp; -  float* backward; - private: -  const GraphStructure* gs; -}; - -struct TransliterationsImpl { -  TransliterationsImpl(int max_src, int max_trg, double sr, const BackwardEstimator& b) : -      cp0(max_src, max_trg, sr), -      tccm(cp0), -      be(b), -      kMAX_SRC_CHUNK(max_src), -      kMAX_TRG_CHUNK(max_trg), -      kS2T_RATIO(sr), -      tot_pairs(), tot_mem() { -  } -  const CondBaseDist cp0; -  TransliterationChunkConditionalModel tccm; -  const BackwardEstimator& be; - -  void Initialize(WordID src, const vector<WordID>& src_lets, WordID trg, const vector<WordID>& trg_lets) { -    const size_t src_len = src_lets.size(); -    const size_t trg_len = trg_lets.size(); - -    // init graph structure -    if (src_len >= graphs.size()) graphs.resize(src_len + 1); -    if (trg_len >= graphs[src_len].size()) graphs[src_len].resize(trg_len + 1); -    GraphStructure& gs = graphs[src_len][trg_len]; -    if (!gs.r) { -      double rat = exp(fabs(log(trg_len / (src_len * kS2T_RATIO)))); -      if (rat > 1.5 || (rat > 2.4 && src_len < 6)) { -        cerr << " ** Forbidding transliterations of size " << src_len << "," << trg_len << ": " << rat << endl; -        gs.r = new Reachability(src_len, trg_len, 0, 0); -      } else { -        gs.r = new Reachability(src_len, trg_len, kMAX_SRC_CHUNK, kMAX_TRG_CHUNK); -      } -    } - -    const Reachability& r = *gs.r; - -    // init backward estimates -    if (src >= ests.size()) ests.resize(src + 1); -    unordered_map<WordID, ProbabilityEstimates>::iterator it = ests[src].find(trg); -    if (it != ests[src].end()) return; // already 
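Note: Initialize above forbids a transliteration pair when its length ratio strays too far from the expected source-to-target ratio. The statistic is symmetric in log space and is always >= 1, equaling 1 on a perfect match. The computation in isolation:

#include <cmath>
#include <cstdio>

// rat = exp(|log(trg_len / (src_len * ratio))|), as in Initialize above.
double LengthRatio(unsigned src_len, unsigned trg_len, double s2t_ratio) {
  return std::exp(std::fabs(std::log(trg_len / (src_len * s2t_ratio))));
}

int main() {
  // With an expected source-to-target ratio of 1.0:
  std::printf("%.2f\n", LengthRatio(4, 4, 1.0));  // 1.00, perfect match
  std::printf("%.2f\n", LengthRatio(4, 8, 1.0));  // 2.00, twice too long
  std::printf("%.2f\n", LengthRatio(8, 4, 1.0));  // 2.00, symmetric
  return 0;
}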
initialized - -    it = ests[src].insert(make_pair(trg, ProbabilityEstimates(gs))).first; -    ProbabilityEstimates& est = it->second; -    if (!gs.r->nodes) return;  // not derivable subject to length constraints - -    be.InitializeGrid(src_lets, trg_lets, r, kS2T_RATIO, est.backward); -    cerr << TD::GetString(src_lets) << " ||| " << TD::GetString(trg_lets) << " ||| " << (est.backward[0] / trg_lets.size()) << endl; -    tot_pairs++; -    tot_mem += sizeof(float) * gs.r->nodes; -  } - -  void Forbid(WordID src, const vector<WordID>& src_lets, WordID trg, const vector<WordID>& trg_lets) { -    const size_t src_len = src_lets.size(); -    const size_t trg_len = trg_lets.size(); -    // TODO -  } - -  prob_t EstimateProbability(WordID s, const vector<WordID>& src, WordID t, const vector<WordID>& trg) const { -    assert(src.size() < graphs.size()); -    const vector<GraphStructure>& tv = graphs[src.size()]; -    assert(trg.size() < tv.size()); -    const GraphStructure& gs = tv[trg.size()]; -    if (gs.r->nodes == 0) -      return prob_t::Zero(); -    const unordered_map<WordID, ProbabilityEstimates>::const_iterator it = ests[s].find(t); -    assert(it != ests[s].end()); -    return it->second.estp; -  } - -  void GraphSummary() const { -    double to = 0; -    double tn = 0; -    double tt = 0; -    for (int i = 0; i < graphs.size(); ++i) { -      const vector<GraphStructure>& vt = graphs[i]; -      for (int j = 0; j < vt.size(); ++j) { -        const GraphStructure& gs = vt[j]; -        if (!gs.r) continue; -        tt++; -        for (int k = 0; k < i; ++k) { -          for (int l = 0; l < j; ++l) { -            size_t c = gs.r->valid_deltas[k][l].size(); -            if (c) { -              tn += 1; -              to += c; -            } -          } -        } -      } -    } -    cerr << "     Average nodes = " << (tn / tt) << endl; -    cerr << "Average out-degree = " << (to / tn) << endl; -    cerr << " Unique structures = " << tt << endl; -    cerr << "      Unique pairs = " << tot_pairs << endl; -    cerr << "          BEs size = " << (tot_mem / (1024.0*1024.0)) << " MB" << endl; -  } - -  const int kMAX_SRC_CHUNK; -  const int kMAX_TRG_CHUNK; -  const double kS2T_RATIO; -  unsigned tot_pairs; -  size_t tot_mem; -  vector<vector<GraphStructure> > graphs; // graphs[src_len][trg_len] -  vector<unordered_map<WordID, ProbabilityEstimates> > ests; // ests[src][trg] -}; - -Transliterations::Transliterations(int max_src, int max_trg, double sr, const BackwardEstimator& be) : -    pimpl_(new TransliterationsImpl(max_src, max_trg, sr, be)) {} -Transliterations::~Transliterations() { delete pimpl_; } - -void Transliterations::Initialize(WordID src, const vector<WordID>& src_lets, WordID trg, const vector<WordID>& trg_lets) { -  pimpl_->Initialize(src, src_lets, trg, trg_lets); -} - -prob_t Transliterations::EstimateProbability(WordID s, const vector<WordID>& src, WordID t, const vector<WordID>& trg) const { -  return pimpl_->EstimateProbability(s, src,t, trg); -} - -void Transliterations::Forbid(WordID src, const vector<WordID>& src_lets, WordID trg, const vector<WordID>& trg_lets) { -  pimpl_->Forbid(src, src_lets, trg, trg_lets); -} - -void Transliterations::GraphSummary() const { -  pimpl_->GraphSummary(); -} - diff --git a/gi/pf/transliterations.h b/gi/pf/transliterations.h deleted file mode 100644 index 49d14684..00000000 --- a/gi/pf/transliterations.h +++ /dev/null @@ -1,24 +0,0 @@ -#ifndef _TRANSLITERATIONS_H_ -#define _TRANSLITERATIONS_H_ - -#include <vector> -#include "wordid.h" 
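Note: transliterations.h hides TransliterationsImpl behind an opaque pointer — the pimpl idiom — so client code never pulls in the tr1/boost machinery behind it, and the forwarding methods above are the entire public surface. A minimal sketch of the pattern; std::unique_ptr stands in for the original's raw pointer plus explicit delete, which is a modernizing substitution, not the original interface:

#include <iostream>
#include <memory>

// --- header: only a forward declaration of the implementation ---
class Counter {
 public:
  Counter();
  ~Counter();  // must be defined where Impl is complete
  void Add(int x);
  int Total() const;
 private:
  struct Impl;
  std::unique_ptr<Impl> pimpl_;
};

// --- implementation file: Impl and all member definitions ---
struct Counter::Impl { int total = 0; };
Counter::Counter() : pimpl_(new Impl) {}
Counter::~Counter() {}
void Counter::Add(int x) { pimpl_->total += x; }
int Counter::Total() const { return pimpl_->total; }

int main() {
  Counter c;
  c.Add(2); c.Add(3);
  std::cout << c.Total() << "\n";  // 5
  return 0;
}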
-#include "prob.h" - -struct BackwardEstimator; -struct TransliterationsImpl; -struct Transliterations { -  // max_src and max_trg indicate how big the transliteration phrases can be -  // see reachability.h for information about filter_ratio -  explicit Transliterations(int max_src, int max_trg, double s2t_rat, const BackwardEstimator& be); -  ~Transliterations(); -  void Initialize(WordID src, const std::vector<WordID>& src_lets, WordID trg, const std::vector<WordID>& trg_lets); -  void Forbid(WordID src, const std::vector<WordID>& src_lets, WordID trg, const std::vector<WordID>& trg_lets); -  void GraphSummary() const; -  prob_t EstimateProbability(WordID s, const std::vector<WordID>& src, WordID t, const std::vector<WordID>& trg) const; - private: -  TransliterationsImpl* pimpl_; -}; - -#endif - diff --git a/gi/pf/unigrams.cc b/gi/pf/unigrams.cc deleted file mode 100644 index 40829775..00000000 --- a/gi/pf/unigrams.cc +++ /dev/null @@ -1,80 +0,0 @@ -#include "unigrams.h" - -#include <string> -#include <cmath> - -#include "stringlib.h" -#include "filelib.h" - -using namespace std; - -void UnigramModel::LoadUnigrams(const string& fname) { -  cerr << "Loading unigram probabilities from " << fname << " ..." << endl; -  ReadFile rf(fname); -  string line; -  istream& in = *rf.stream(); -  assert(in); -  getline(in, line); -  assert(line.empty()); -  getline(in, line); -  assert(line == "\\data\\"); -  getline(in, line); -  size_t pos = line.find("ngram 1="); -  assert(pos == 0); -  assert(line.size() > 8); -  const size_t num_unigrams = atoi(&line[8]); -  getline(in, line); -  assert(line.empty()); -  getline(in, line); -  assert(line == "\\1-grams:"); -  for (size_t i = 0; i < num_unigrams; ++i) { -    getline(in, line); -    assert(line.size() > 0); -    pos = line.find('\t'); -    assert(pos > 0); -    assert(pos + 1 < line.size()); -    const WordID w = TD::Convert(line.substr(pos + 1)); -    line[pos] = 0; -    float p = atof(&line[0]); -    if (w < probs_.size()) probs_[w].logeq(p * log(10)); else cerr << "WARNING: don't know about '" << TD::Convert(w) << "'\n"; -  } -} - -void UnigramWordModel::LoadUnigrams(const string& fname) { -  cerr << "Loading unigram probabilities from " << fname << " ..." 
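Note: LoadUnigrams above parses the ARPA header, then one "logprob<TAB>word" entry per line; since ARPA stores log10 probabilities, logeq(p * log(10)) rescales to natural log (ln x = log10(x) * ln 10). The conversion for a single entry:

#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <string>

int main() {
  std::string line = "-1.30103\tcat";  // log10 p = -1.30103, i.e. p = 0.05
  size_t pos = line.find('\t');
  double log10p = std::atof(line.substr(0, pos).c_str());
  std::string word = line.substr(pos + 1);
  double lnp = log10p * std::log(10.0);  // what logeq() receives above
  std::printf("word=%s  p=%g\n", word.c_str(), std::exp(lnp));  // ~0.05
  return 0;
}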
<< endl; -  ReadFile rf(fname); -  string line; -  istream& in = *rf.stream(); -  assert(in); -  getline(in, line); -  assert(line.empty()); -  getline(in, line); -  assert(line == "\\data\\"); -  getline(in, line); -  size_t pos = line.find("ngram 1="); -  assert(pos == 0); -  assert(line.size() > 8); -  const size_t num_unigrams = atoi(&line[8]); -  getline(in, line); -  assert(line.empty()); -  getline(in, line); -  assert(line == "\\1-grams:"); -  for (size_t i = 0; i < num_unigrams; ++i) { -    getline(in, line); -    assert(line.size() > 0); -    pos = line.find('\t'); -    assert(pos > 0); -    assert(pos + 1 < line.size()); -    size_t cur = pos + 1; -    vector<WordID> w; -    while (cur < line.size()) { -      const size_t len = UTF8Len(line[cur]); -      w.push_back(TD::Convert(line.substr(cur, len))); -      cur += len; -    } -    line[pos] = 0; -    float p = atof(&line[0]); -    probs_[w].logeq(p * log(10.0)); -  } -} - diff --git a/gi/pf/unigrams.h b/gi/pf/unigrams.h deleted file mode 100644 index 1660d1ed..00000000 --- a/gi/pf/unigrams.h +++ /dev/null @@ -1,69 +0,0 @@ -#ifndef _UNIGRAMS_H_ -#define _UNIGRAMS_H_ - -#include <vector> -#include <string> -#include <tr1/unordered_map> -#include <boost/functional.hpp> - -#include "wordid.h" -#include "prob.h" -#include "tdict.h" - -struct UnigramModel { -  explicit UnigramModel(const std::string& fname, unsigned vocab_size) : -      use_uniform_(fname.size() == 0), -      uniform_(1.0 / vocab_size), -      probs_() { -    if (fname.size() > 0) { -      probs_.resize(TD::NumWords() + 1); -      LoadUnigrams(fname); -    } -  } - -  const prob_t& operator()(const WordID& w) const { -    assert(w); -    if (use_uniform_) return uniform_; -    return probs_[w]; -  } - - private: -  void LoadUnigrams(const std::string& fname); - -  const bool use_uniform_; -  const prob_t uniform_; -  std::vector<prob_t> probs_; -}; - - -// reads an ARPA unigram file and converts words like 'cat' into a string 'c a t' -struct UnigramWordModel { -  explicit UnigramWordModel(const std::string& fname) : -      use_uniform_(false), -      uniform_(1.0), -      probs_() { -    LoadUnigrams(fname); -  } - -  explicit UnigramWordModel(const unsigned vocab_size) : -      use_uniform_(true), -      uniform_(1.0 / vocab_size), -      probs_() {} - -  const prob_t& operator()(const std::vector<WordID>& s) const { -    if (use_uniform_) return uniform_; -    const VectorProbHash::const_iterator it = probs_.find(s); -    assert(it != probs_.end()); -    return it->second; -  } - - private: -  void LoadUnigrams(const std::string& fname); - -  const bool use_uniform_; -  const prob_t uniform_; -  typedef std::tr1::unordered_map<std::vector<WordID>, prob_t, boost::hash<std::vector<WordID> > > VectorProbHash; -  VectorProbHash probs_; -}; - -#endif  | 
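Note: UnigramWordModel keys its hash map on a word's letter sequence, splitting each entry on UTF-8 code-point boundaries with UTF8Len (from stringlib.h, not shown in this diff). A stand-in decoder, assuming each inspected byte is a valid lead byte:

#include <cstdio>
#include <string>
#include <vector>

// Byte length of a UTF-8 code point, read from its lead byte.
// Continuation bytes are not handled; valid lead bytes are assumed.
unsigned Utf8Len(unsigned char b) {
  if (b < 0x80) return 1;
  if (b < 0xE0) return 2;
  if (b < 0xF0) return 3;
  return 4;
}

int main() {
  std::string word = "caté";  // 'é' occupies two bytes in UTF-8
  std::vector<std::string> letters;
  for (size_t cur = 0; cur < word.size(); ) {
    unsigned len = Utf8Len((unsigned char)word[cur]);
    letters.push_back(word.substr(cur, len));
    cur += len;
  }
  for (size_t i = 0; i < letters.size(); ++i)
    std::printf("%s ", letters[i].c_str());  // c a t é
  std::printf("\n");
  return 0;
}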