Diffstat (limited to 'gi/pf')
37 files changed, 3654 insertions, 71 deletions
| diff --git a/gi/pf/Makefile.am b/gi/pf/Makefile.am index 42758939..f9c979d0 100644 --- a/gi/pf/Makefile.am +++ b/gi/pf/Makefile.am @@ -1,10 +1,26 @@ -bin_PROGRAMS = cbgi brat dpnaive pfbrat pfdist itg pfnaive +bin_PROGRAMS = cbgi brat dpnaive pfbrat pfdist itg pfnaive condnaive align-lexonly-pyp learn_cfg pyp_lm nuisance_test align-tl  noinst_LIBRARIES = libpf.a -libpf_a_SOURCES = base_measures.cc reachability.cc cfg_wfst_composer.cc corpus.cc + +libpf_a_SOURCES = base_distributions.cc reachability.cc cfg_wfst_composer.cc corpus.cc unigrams.cc ngram_base.cc transliterations.cc backward.cc pyp_word_model.cc pyp_tm.cc + +nuisance_test_SOURCES = nuisance_test.cc +nuisance_test_LDADD = libpf.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a -lz + +align_lexonly_pyp_SOURCES = align-lexonly-pyp.cc +align_lexonly_pyp_LDADD = libpf.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a -lz + +align_tl_SOURCES = align-tl.cc +align_tl_LDADD = libpf.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a -lz  itg_SOURCES = itg.cc +pyp_lm_SOURCES = pyp_lm.cc + +learn_cfg_SOURCES = learn_cfg.cc + +condnaive_SOURCES = condnaive.cc +  dpnaive_SOURCES = dpnaive.cc  pfdist_SOURCES = pfdist.cc @@ -17,5 +33,6 @@ brat_SOURCES = brat.cc  pfbrat_SOURCES = pfbrat.cc -AM_CPPFLAGS = -W -Wall -Wno-sign-compare -funroll-loops -I$(top_srcdir)/utils $(GTEST_CPPFLAGS) -I$(top_srcdir)/decoder -AM_LDFLAGS = libpf.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/utils/libutils.a -lz +AM_CPPFLAGS = -W -Wall -Wno-sign-compare -funroll-loops -I$(top_srcdir)/utils $(GTEST_CPPFLAGS) -I$(top_srcdir)/decoder -I$(top_srcdir)/klm + +AM_LDFLAGS = libpf.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a $(top_srcdir)/utils/libutils.a -lz diff --git a/gi/pf/align-lexonly-pyp.cc b/gi/pf/align-lexonly-pyp.cc new file mode 100644 index 00000000..942dcf51 --- /dev/null +++ b/gi/pf/align-lexonly-pyp.cc @@ -0,0 +1,239 @@ +#include <iostream> +#include <queue> + +#include <boost/program_options.hpp> +#include <boost/program_options/variables_map.hpp> + +#include "tdict.h" +#include "stringlib.h" +#include "filelib.h" +#include "array2d.h" +#include "sampler.h" +#include "corpus.h" +#include "pyp_tm.h" +#include "quasi_model2.h" + +using namespace std; +namespace po = boost::program_options; + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { +  po::options_description opts("Configuration options"); +  opts.add_options() +        ("samples,s",po::value<unsigned>()->default_value(1000),"Number of samples") +        ("infer_alignment_hyperparameters,I", "Infer alpha and p_null, otherwise fixed values will be assumed") +        ("p_null,0", po::value<double>()->default_value(0.08), "probability of aligning to null") +        ("align_alpha,a", po::value<double>()->default_value(4.0), "how 'tight' is the bias toward be along the diagonal?") +        ("input,i",po::value<string>(),"Read parallel data from") +        ("random_seed,S",po::value<uint32_t>(), "Random seed"); +  po::options_description clo("Command line options"); +  clo.add_options() +        ("config", po::value<string>(), "Configuration file") +        ("help,h", "Print this help 
message and exit"); +  po::options_description dconfig_options, dcmdline_options; +  dconfig_options.add(opts); +  dcmdline_options.add(opts).add(clo); +   +  po::store(parse_command_line(argc, argv, dcmdline_options), *conf); +  if (conf->count("config")) { +    ifstream config((*conf)["config"].as<string>().c_str()); +    po::store(po::parse_config_file(config, dconfig_options), *conf); +  } +  po::notify(*conf); + +  if (conf->count("help") || (conf->count("input") == 0)) { +    cerr << dcmdline_options << endl; +    exit(1); +  } +} + +MT19937* prng; + +struct LexicalAlignment { +  unsigned char src_index; +  bool is_transliteration; +  vector<pair<short, short> > derivation; +}; + +struct AlignedSentencePair { +  vector<WordID> src; +  vector<WordID> trg; +  vector<LexicalAlignment> a; +  Array2D<short> posterior; +}; + +struct Aligner { +  Aligner(const vector<vector<WordID> >& lets, +          int num_letters, +          const po::variables_map& conf, +          vector<AlignedSentencePair>* c) : +      corpus(*c), +      paj_model(conf["align_alpha"].as<double>(), conf["p_null"].as<double>()), +      infer_paj(conf.count("infer_alignment_hyperparameters") > 0), +      model(lets, num_letters), +      kNULL(TD::Convert("NULL")) { +    assert(lets[kNULL].size() == 0); +  } + +  vector<AlignedSentencePair>& corpus; +  QuasiModel2 paj_model; +  const bool infer_paj; +  PYPLexicalTranslation model; +  const WordID kNULL; + +  void ResampleHyperparameters() { +    model.ResampleHyperparameters(prng); +    if (infer_paj) paj_model.ResampleHyperparameters(prng); +  } + +  void InitializeRandom() { +    cerr << "Initializing with random alignments ...\n"; +    for (unsigned i = 0; i < corpus.size(); ++i) { +      AlignedSentencePair& asp = corpus[i]; +      asp.a.resize(asp.trg.size()); +      for (unsigned j = 0; j < asp.trg.size(); ++j) { +        unsigned char& a_j = asp.a[j].src_index; +        a_j = prng->next() * (1 + asp.src.size()); +        const WordID f_a_j = (a_j ? asp.src[a_j - 1] : kNULL); +        model.Increment(f_a_j, asp.trg[j], &*prng); +        paj_model.Increment(a_j, j, asp.src.size(), asp.trg.size()); +      } +    } +    cerr << "Corpus intialized randomly." << endl; +    cerr << "LLH = " << Likelihood() << "    \t(Amodel=" << paj_model.Likelihood() +         << " TModel=" << model.Likelihood() << ") contexts=" << model.UniqueConditioningContexts() << endl; +  } + +  void ResampleCorpus() { +    for (unsigned i = 0; i < corpus.size(); ++i) { +      AlignedSentencePair& asp = corpus[i]; +      SampleSet<prob_t> ss; ss.resize(asp.src.size() + 1); +      for (unsigned j = 0; j < asp.trg.size(); ++j) { +        unsigned char& a_j = asp.a[j].src_index; +        const WordID e_j = asp.trg[j]; +        WordID f_a_j = (a_j ? asp.src[a_j - 1] : kNULL); +        model.Decrement(f_a_j, e_j, prng); +        paj_model.Decrement(a_j, j, asp.src.size(), asp.trg.size()); + +        for (unsigned prop_a_j = 0; prop_a_j <= asp.src.size(); ++prop_a_j) { +          const WordID prop_f = (prop_a_j ? asp.src[prop_a_j - 1] : kNULL); +          ss[prop_a_j] = model.Prob(prop_f, e_j); +          ss[prop_a_j] *= paj_model.Prob(prop_a_j, j, asp.src.size(), asp.trg.size()); +        } +        a_j = prng->SelectSample(ss); +        f_a_j = (a_j ? 
asp.src[a_j - 1] : kNULL); +        model.Increment(f_a_j, e_j, prng); +        paj_model.Increment(a_j, j, asp.src.size(), asp.trg.size()); +      } +    } +  } + +  prob_t Likelihood() const { +    return model.Likelihood() * paj_model.Likelihood(); +  } +}; + +void ExtractLetters(const set<WordID>& v, vector<vector<WordID> >* l, set<WordID>* letset = NULL) { +  for (set<WordID>::const_iterator it = v.begin(); it != v.end(); ++it) { +    vector<WordID>& letters = (*l)[*it]; +    if (letters.size()) continue;   // if e and f have the same word + +    const string& w = TD::Convert(*it); +     +    size_t cur = 0; +    while (cur < w.size()) { +      const size_t len = UTF8Len(w[cur]); +      letters.push_back(TD::Convert(w.substr(cur, len))); +      if (letset) letset->insert(letters.back()); +      cur += len; +    } +  } +} + +void Debug(const AlignedSentencePair& asp) { +  cerr << TD::GetString(asp.src) << endl << TD::GetString(asp.trg) << endl; +  Array2D<bool> a(asp.src.size(), asp.trg.size()); +  for (unsigned j = 0; j < asp.trg.size(); ++j) { +    assert(asp.a[j].src_index <= asp.src.size()); +    if (asp.a[j].src_index) a(asp.a[j].src_index - 1, j) = true; +  } +  cerr << a << endl; +} + +void AddSample(AlignedSentencePair* asp) { +  for (unsigned j = 0; j < asp->trg.size(); ++j) +    asp->posterior(asp->a[j].src_index, j)++; +} + +void WriteAlignments(const AlignedSentencePair& asp) { +  bool first = true; +  for (unsigned j = 0; j < asp.trg.size(); ++j) { +    int src_index = -1; +    int mc = -1; +    for (unsigned i = 0; i <= asp.src.size(); ++i) { +      if (asp.posterior(i, j) > mc) { +        mc = asp.posterior(i, j); +        src_index = i; +      } +    } + +    if (src_index) { +      if (first) first = false; else cout << ' '; +      cout << (src_index - 1) << '-' << j; +    } +  } +  cout << endl; +} + +int main(int argc, char** argv) { +  po::variables_map conf; +  InitCommandLine(argc, argv, &conf); + +  if (conf.count("random_seed")) +    prng = new MT19937(conf["random_seed"].as<uint32_t>()); +  else +    prng = new MT19937; + +  vector<vector<int> > corpuse, corpusf; +  set<int> vocabe, vocabf; +  corpus::ReadParallelCorpus(conf["input"].as<string>(), &corpusf, &corpuse, &vocabf, &vocabe); +  cerr << "f-Corpus size: " << corpusf.size() << " sentences\n"; +  cerr << "f-Vocabulary size: " << vocabf.size() << " types\n"; +  cerr << "f-Corpus size: " << corpuse.size() << " sentences\n"; +  cerr << "f-Vocabulary size: " << vocabe.size() << " types\n"; +  assert(corpusf.size() == corpuse.size()); + +  vector<AlignedSentencePair> corpus(corpuse.size()); +  for (unsigned i = 0; i < corpuse.size(); ++i) { +    corpus[i].src.swap(corpusf[i]); +    corpus[i].trg.swap(corpuse[i]); +    corpus[i].posterior.resize(corpus[i].src.size() + 1, corpus[i].trg.size()); +  } +  corpusf.clear(); corpuse.clear(); + +  vocabf.insert(TD::Convert("NULL")); +  vector<vector<WordID> > letters(TD::NumWords()); +  set<WordID> letset; +  ExtractLetters(vocabe, &letters, &letset); +  ExtractLetters(vocabf, &letters, NULL); +  letters[TD::Convert("NULL")].clear(); + +  Aligner aligner(letters, letset.size(), conf, &corpus); +  aligner.InitializeRandom(); + +  const unsigned samples = conf["samples"].as<unsigned>(); +  for (int i = 0; i < samples; ++i) { +    for (int j = 65; j < 67; ++j) Debug(corpus[j]); +    if (i % 10 == 9) { +      aligner.ResampleHyperparameters(); +      cerr << "LLH = " << aligner.Likelihood() << "    \t(Amodel=" << aligner.paj_model.Likelihood() +           << " TModel=" << 
aligner.model.Likelihood() << ") contexts=" << aligner.model.UniqueConditioningContexts() << endl; +    } +    aligner.ResampleCorpus(); +    if (i > (samples / 5) && (i % 6 == 5)) for (int j = 0; j < corpus.size(); ++j) AddSample(&corpus[j]); +  } +  for (unsigned i = 0; i < corpus.size(); ++i) +    WriteAlignments(corpus[i]); +  aligner.model.Summary(); + +  return 0; +} diff --git a/gi/pf/align-tl.cc b/gi/pf/align-tl.cc new file mode 100644 index 00000000..cbe8c6c8 --- /dev/null +++ b/gi/pf/align-tl.cc @@ -0,0 +1,339 @@ +#include <iostream> +#include <tr1/memory> +#include <queue> + +#include <boost/multi_array.hpp> +#include <boost/program_options.hpp> +#include <boost/program_options/variables_map.hpp> + +#include "backward.h" +#include "array2d.h" +#include "base_distributions.h" +#include "monotonic_pseg.h" +#include "conditional_pseg.h" +#include "trule.h" +#include "tdict.h" +#include "stringlib.h" +#include "filelib.h" +#include "dict.h" +#include "sampler.h" +#include "mfcr.h" +#include "corpus.h" +#include "ngram_base.h" +#include "transliterations.h" + +using namespace std; +using namespace tr1; +namespace po = boost::program_options; + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { +  po::options_description opts("Configuration options"); +  opts.add_options() +        ("samples,s",po::value<unsigned>()->default_value(1000),"Number of samples") +        ("input,i",po::value<string>(),"Read parallel data from") +        ("s2t", po::value<string>(), "character level source-to-target prior transliteration probabilities") +        ("t2s", po::value<string>(), "character level target-to-source prior transliteration probabilities") +        ("max_src_chunk", po::value<unsigned>()->default_value(4), "Maximum size of translitered chunk in source") +        ("max_trg_chunk", po::value<unsigned>()->default_value(4), "Maximum size of translitered chunk in target") +        ("expected_src_to_trg_ratio", po::value<double>()->default_value(1.0), "If a word is transliterated, what is the expected length ratio from source to target?") +        ("random_seed,S",po::value<uint32_t>(), "Random seed"); +  po::options_description clo("Command line options"); +  clo.add_options() +        ("config", po::value<string>(), "Configuration file") +        ("help,h", "Print this help message and exit"); +  po::options_description dconfig_options, dcmdline_options; +  dconfig_options.add(opts); +  dcmdline_options.add(opts).add(clo); +   +  po::store(parse_command_line(argc, argv, dcmdline_options), *conf); +  if (conf->count("config")) { +    ifstream config((*conf)["config"].as<string>().c_str()); +    po::store(po::parse_config_file(config, dconfig_options), *conf); +  } +  po::notify(*conf); + +  if (conf->count("help") || (conf->count("input") == 0)) { +    cerr << dcmdline_options << endl; +    exit(1); +  } +} + +shared_ptr<MT19937> prng; + +struct LexicalAlignment { +  unsigned char src_index; +  bool is_transliteration; +  vector<pair<short, short> > derivation; +}; + +struct AlignedSentencePair { +  vector<WordID> src; +  vector<WordID> trg; +  vector<LexicalAlignment> a; +  Array2D<short> posterior; +}; + +struct HierarchicalWordBase { +  explicit HierarchicalWordBase(const unsigned vocab_e_size) : +      base(prob_t::One()), r(1,1,1,1,0.66,50.0), u0(-log(vocab_e_size)), l(1,prob_t::One()), v(1, prob_t::Zero()) {} + +  void ResampleHyperparameters(MT19937* rng) { +    r.resample_hyperparameters(rng); +  } + +  inline double logp0(const vector<WordID>& s) const { +    
return Md::log_poisson(s.size(), 7.5) + s.size() * u0; +  } + +  // return p0 of rule.e_ +  prob_t operator()(const TRule& rule) const { +    v[0].logeq(logp0(rule.e_)); +    return r.prob(rule.e_, v.begin(), l.begin()); +  } + +  void Increment(const TRule& rule) { +    v[0].logeq(logp0(rule.e_)); +    if (r.increment(rule.e_, v.begin(), l.begin(), &*prng).count) { +      base *= v[0] * l[0]; +    } +  } + +  void Decrement(const TRule& rule) { +    if (r.decrement(rule.e_, &*prng).count) { +      base /= prob_t(exp(logp0(rule.e_))); +    } +  } + +  prob_t Likelihood() const { +    prob_t p; p.logeq(r.log_crp_prob()); +    p *= base; +    return p; +  } + +  void Summary() const { +    cerr << "NUMBER OF CUSTOMERS: " << r.num_customers() << "  (d=" << r.discount() << ",s=" << r.strength() << ')' << endl; +    for (MFCR<1,vector<WordID> >::const_iterator it = r.begin(); it != r.end(); ++it) +      cerr << "   " << it->second.total_dish_count_ << " (on " << it->second.table_counts_.size() << " tables) " << TD::GetString(it->first) << endl; +  } + +  prob_t base; +  MFCR<1,vector<WordID> > r; +  const double u0; +  const vector<prob_t> l; +  mutable vector<prob_t> v; +}; + +struct BasicLexicalAlignment { +  explicit BasicLexicalAlignment(const vector<vector<WordID> >& lets, +                                 const unsigned words_e, +                                 const unsigned letters_e, +                                 vector<AlignedSentencePair>* corp) : +      letters(lets), +      corpus(*corp), +      //up0(words_e), +      //up0("en.chars.1gram", letters_e), +      //up0("en.words.1gram"), +      up0(letters_e), +      //up0("en.chars.2gram"), +      tmodel(up0) { +  } + +  void InstantiateRule(const WordID src, +                       const WordID trg, +                       TRule* rule) const { +    static const WordID kX = TD::Convert("X") * -1; +    rule->lhs_ = kX; +    rule->e_ = letters[trg]; +    rule->f_ = letters[src]; +  } + +  void InitializeRandom() { +    const WordID kNULL = TD::Convert("NULL"); +    cerr << "Initializing with random alignments ...\n"; +    for (unsigned i = 0; i < corpus.size(); ++i) { +      AlignedSentencePair& asp = corpus[i]; +      asp.a.resize(asp.trg.size()); +      for (unsigned j = 0; j < asp.trg.size(); ++j) { +        const unsigned char a_j = prng->next() * (1 + asp.src.size()); +        const WordID f_a_j = (a_j ? 
asp.src[a_j - 1] : kNULL); +        TRule r; +        InstantiateRule(f_a_j, asp.trg[j], &r); +        asp.a[j].is_transliteration = false; +        asp.a[j].src_index = a_j; +        if (tmodel.IncrementRule(r, &*prng)) +          up0.Increment(r); +      } +    } +    cerr << "  LLH = " << Likelihood() << endl; +  } + +  prob_t Likelihood() const { +    prob_t p = tmodel.Likelihood(); +    p *= up0.Likelihood(); +    return p; +  } + +  void ResampleHyperparemeters() { +    tmodel.ResampleHyperparameters(&*prng); +    up0.ResampleHyperparameters(&*prng); +    cerr << "  (base d=" << up0.r.discount() << ",s=" << up0.r.strength() << ")\n"; +  } + +  void ResampleCorpus(); + +  const vector<vector<WordID> >& letters; // spelling dictionary +  vector<AlignedSentencePair>& corpus; +  //PhraseConditionalUninformativeBase up0; +  //PhraseConditionalUninformativeUnigramBase up0; +  //UnigramWordBase up0; +  //HierarchicalUnigramBase up0; +  HierarchicalWordBase up0; +  //CompletelyUniformBase up0; +  //FixedNgramBase up0; +  //ConditionalTranslationModel<PhraseConditionalUninformativeBase> tmodel; +  //ConditionalTranslationModel<PhraseConditionalUninformativeUnigramBase> tmodel; +  //ConditionalTranslationModel<UnigramWordBase> tmodel; +  //ConditionalTranslationModel<HierarchicalUnigramBase> tmodel; +  MConditionalTranslationModel<HierarchicalWordBase> tmodel; +  //ConditionalTranslationModel<FixedNgramBase> tmodel; +  //ConditionalTranslationModel<CompletelyUniformBase> tmodel; +}; + +void BasicLexicalAlignment::ResampleCorpus() { +  static const WordID kNULL = TD::Convert("NULL"); +  for (unsigned i = 0; i < corpus.size(); ++i) { +    AlignedSentencePair& asp = corpus[i]; +    SampleSet<prob_t> ss; ss.resize(asp.src.size() + 1); +    for (unsigned j = 0; j < asp.trg.size(); ++j) { +      TRule r; +      unsigned char& a_j = asp.a[j].src_index; +      WordID f_a_j = (a_j ? asp.src[a_j - 1] : kNULL); +      InstantiateRule(f_a_j, asp.trg[j], &r); +      if (tmodel.DecrementRule(r, &*prng)) +        up0.Decrement(r); + +      for (unsigned prop_a_j = 0; prop_a_j <= asp.src.size(); ++prop_a_j) { +        const WordID prop_f = (prop_a_j ? asp.src[prop_a_j - 1] : kNULL); +        InstantiateRule(prop_f, asp.trg[j], &r); +        ss[prop_a_j] = tmodel.RuleProbability(r); +      } +      a_j = prng->SelectSample(ss); +      f_a_j = (a_j ? 
asp.src[a_j - 1] : kNULL); +      InstantiateRule(f_a_j, asp.trg[j], &r); +      if (tmodel.IncrementRule(r, &*prng)) +        up0.Increment(r); +    } +  } +  cerr << "  LLH = " << Likelihood() << endl; +} + +void ExtractLetters(const set<WordID>& v, vector<vector<WordID> >* l, set<WordID>* letset = NULL) { +  for (set<WordID>::const_iterator it = v.begin(); it != v.end(); ++it) { +    vector<WordID>& letters = (*l)[*it]; +    if (letters.size()) continue;   // if e and f have the same word + +    const string& w = TD::Convert(*it); +     +    size_t cur = 0; +    while (cur < w.size()) { +      const size_t len = UTF8Len(w[cur]); +      letters.push_back(TD::Convert(w.substr(cur, len))); +      if (letset) letset->insert(letters.back()); +      cur += len; +    } +  } +} + +void Debug(const AlignedSentencePair& asp) { +  cerr << TD::GetString(asp.src) << endl << TD::GetString(asp.trg) << endl; +  Array2D<bool> a(asp.src.size(), asp.trg.size()); +  for (unsigned j = 0; j < asp.trg.size(); ++j) +    if (asp.a[j].src_index) a(asp.a[j].src_index - 1, j) = true; +  cerr << a << endl; +} + +void AddSample(AlignedSentencePair* asp) { +  for (unsigned j = 0; j < asp->trg.size(); ++j) +    asp->posterior(asp->a[j].src_index, j)++; +} + +void WriteAlignments(const AlignedSentencePair& asp) { +  bool first = true; +  for (unsigned j = 0; j < asp.trg.size(); ++j) { +    int src_index = -1; +    int mc = -1; +    for (unsigned i = 0; i <= asp.src.size(); ++i) { +      if (asp.posterior(i, j) > mc) { +        mc = asp.posterior(i, j); +        src_index = i; +      } +    } + +    if (src_index) { +      if (first) first = false; else cout << ' '; +      cout << (src_index - 1) << '-' << j; +    } +  } +  cout << endl; +} + +int main(int argc, char** argv) { +  po::variables_map conf; +  InitCommandLine(argc, argv, &conf); + +  if (conf.count("random_seed")) +    prng.reset(new MT19937(conf["random_seed"].as<uint32_t>())); +  else +    prng.reset(new MT19937); +//  MT19937& rng = *prng; + +  vector<vector<int> > corpuse, corpusf; +  set<int> vocabe, vocabf; +  corpus::ReadParallelCorpus(conf["input"].as<string>(), &corpusf, &corpuse, &vocabf, &vocabe); +  cerr << "f-Corpus size: " << corpusf.size() << " sentences\n"; +  cerr << "f-Vocabulary size: " << vocabf.size() << " types\n"; +  cerr << "f-Corpus size: " << corpuse.size() << " sentences\n"; +  cerr << "f-Vocabulary size: " << vocabe.size() << " types\n"; +  assert(corpusf.size() == corpuse.size()); + +  vector<AlignedSentencePair> corpus(corpuse.size()); +  for (unsigned i = 0; i < corpuse.size(); ++i) { +    corpus[i].src.swap(corpusf[i]); +    corpus[i].trg.swap(corpuse[i]); +    corpus[i].posterior.resize(corpus[i].src.size() + 1, corpus[i].trg.size()); +  } +  corpusf.clear(); corpuse.clear(); + +  vocabf.insert(TD::Convert("NULL")); +  vector<vector<WordID> > letters(TD::NumWords() + 1); +  set<WordID> letset; +  ExtractLetters(vocabe, &letters, &letset); +  ExtractLetters(vocabf, &letters, NULL); +  letters[TD::Convert("NULL")].clear(); + +  // TODO configure this +  const int max_src_chunk = conf["max_src_chunk"].as<unsigned>(); +  const int max_trg_chunk = conf["max_trg_chunk"].as<unsigned>(); +  const double s2t_rat = conf["expected_src_to_trg_ratio"].as<double>(); +  const BackwardEstimator be(conf["s2t"].as<string>(), conf["t2s"].as<string>()); +  Transliterations tl(max_src_chunk, max_trg_chunk, s2t_rat, be);  + +  cerr << "Initializing transliteration graph structures ...\n"; +  for (int i = 0; i < corpus.size(); ++i) { +    const 
vector<int>& src = corpus[i].src; +    const vector<int>& trg = corpus[i].trg; +    for (int j = 0; j < src.size(); ++j) { +      const vector<int>& src_let = letters[src[j]]; +      for (int k = 0; k < trg.size(); ++k) { +        const vector<int>& trg_let = letters[trg[k]]; +        tl.Initialize(src[j], src_let, trg[k], trg_let); +        //if (src_let.size() < min_trans_src) +        //  tl.Forbid(src[j], src_let, trg[k], trg_let); +      } +    } +  } +  cerr << endl; +  tl.GraphSummary(); + +  return 0; +} diff --git a/gi/pf/backward.cc b/gi/pf/backward.cc new file mode 100644 index 00000000..b92629fd --- /dev/null +++ b/gi/pf/backward.cc @@ -0,0 +1,89 @@ +#include "backward.h" + +#include <queue> +#include <utility> + +#include "array2d.h" +#include "reachability.h" +#include "base_distributions.h" + +using namespace std; + +BackwardEstimator::BackwardEstimator(const string& s2t, +                    const string& t2s) : m1(new Model1(s2t)), m1inv(new Model1(t2s)) {} + +BackwardEstimator::~BackwardEstimator() { +  delete m1; m1 = NULL; +  delete m1inv; m1inv = NULL; +} + +float BackwardEstimator::ComputeBackwardProb(const std::vector<WordID>& src, +                                             const std::vector<WordID>& trg, +                                             unsigned src_covered, +                                             unsigned trg_covered, +                                             double s2t_ratio) const { +  if (src_covered == src.size() || trg_covered == trg.size()) { +    assert(src_covered == src.size()); +    assert(trg_covered == trg.size()); +    return 0; +  } +  static const WordID kNULL = TD::Convert("<eps>"); +  const prob_t uniform_alignment(1.0 / (src.size() - src_covered + 1)); +  // TODO factor in expected length ratio +  prob_t e; e.logeq(Md::log_poisson(trg.size() - trg_covered, (src.size() - src_covered) * s2t_ratio)); // p(trg len remaining | src len remaining) +  for (unsigned j = trg_covered; j < trg.size(); ++j) { +    prob_t p = (*m1)(kNULL, trg[j]) + prob_t(1e-12); +    for (unsigned i = src_covered; i < src.size(); ++i) +      p += (*m1)(src[i], trg[j]); +    if (p.is_0()) { +      cerr << "ERROR: p(" << TD::Convert(trg[j]) << " | " << TD::GetString(src) << ") = 0!\n"; +      assert(!"failed"); +    } +    p *= uniform_alignment; +    e *= p; +  } +  // TODO factor in expected length ratio +  const prob_t inv_uniform(1.0 / (trg.size() - trg_covered + 1.0)); +  prob_t inv; +  inv.logeq(Md::log_poisson(src.size() - src_covered, (trg.size() - trg_covered) / s2t_ratio)); +  for (unsigned i = src_covered; i < src.size(); ++i) { +    prob_t p = (*m1inv)(kNULL, src[i]) + prob_t(1e-12); +    for (unsigned j = trg_covered; j < trg.size(); ++j) +      p += (*m1inv)(trg[j], src[i]); +    if (p.is_0()) { +      cerr << "ERROR: p_inv(" << TD::Convert(src[i]) << " | " << TD::GetString(trg) << ") = 0!\n"; +      assert(!"failed"); +    } +    p *= inv_uniform; +    inv *= p; +  } +  return (log(e) + log(inv)) / 2; +} + +void BackwardEstimator::InitializeGrid(const vector<WordID>& src, +                      const vector<WordID>& trg, +                      const Reachability& r, +                      double s2t_ratio, +                      float* grid) const { +  queue<pair<int,int> > q; +  q.push(make_pair(0,0)); +  Array2D<bool> done(src.size()+1, trg.size()+1, false); +  //cerr << TD::GetString(src) << " ||| " << TD::GetString(trg) << endl; +  while(!q.empty()) { +    const pair<int,int> n = q.front(); +    q.pop(); +    if 
(done(n.first,n.second)) continue; +    done(n.first,n.second) = true; + +    float lp = ComputeBackwardProb(src, trg, n.first, n.second, s2t_ratio); +    if (n.first == 0 && n.second == 0) grid[0] = lp; +    //cerr << "  " << n.first << "," << n.second << "\t" << lp << endl; + +    if (n.first == src.size() || n.second == trg.size()) continue; +    const vector<pair<short,short> >& edges = r.valid_deltas[n.first][n.second]; +    for (int i = 0; i < edges.size(); ++i) +      q.push(make_pair(n.first + edges[i].first, n.second + edges[i].second)); +  } +  //static int cc = 0; ++cc; if (cc == 80) exit(1); +} + diff --git a/gi/pf/backward.h b/gi/pf/backward.h new file mode 100644 index 00000000..e67eff0c --- /dev/null +++ b/gi/pf/backward.h @@ -0,0 +1,33 @@ +#ifndef _BACKWARD_H_ +#define _BACKWARD_H_ + +#include <vector> +#include <string> +#include "wordid.h" + +struct Reachability; +struct Model1; + +struct BackwardEstimator { +  BackwardEstimator(const std::string& s2t, +                    const std::string& t2s); +  ~BackwardEstimator(); + +  void InitializeGrid(const std::vector<WordID>& src, +                      const std::vector<WordID>& trg, +                      const Reachability& r, +                      double src2trg_ratio, +                      float* grid) const; + + private: +  float ComputeBackwardProb(const std::vector<WordID>& src, +                            const std::vector<WordID>& trg, +                            unsigned src_covered, +                            unsigned trg_covered, +                            double src2trg_ratio) const; + +  Model1* m1; +  Model1* m1inv; +}; + +#endif diff --git a/gi/pf/base_measures.cc b/gi/pf/base_distributions.cc index 8adb37d7..d9761005 100644 --- a/gi/pf/base_measures.cc +++ b/gi/pf/base_distributions.cc @@ -1,4 +1,4 @@ -#include "base_measures.h" +#include "base_distributions.h"  #include <iostream> @@ -6,6 +6,79 @@  using namespace std; +TableLookupBase::TableLookupBase(const string& fname) { +  cerr << "TableLookupBase reading from " << fname << " ..." << endl; +  ReadFile rf(fname); +  istream& in = *rf.stream(); +  string line; +  unsigned lc = 0; +  const WordID kDIV = TD::Convert("|||"); +  vector<WordID> tmp; +  vector<int> le, lf; +  TRule x; +  x.lhs_ = -TD::Convert("X"); +  bool flag = false; +  while(getline(in, line)) { +    ++lc; +    if (lc % 1000000 == 0) { cerr << " [" << lc << ']' << endl; flag = false; } +    else if (lc % 25000 == 0) { cerr << '.' 
<< flush; flag = true; } +    tmp.clear(); +    TD::ConvertSentence(line, &tmp); +    x.f_.clear(); +    x.e_.clear(); +    size_t pos = 0; +    int cc = 0; +    while(pos < tmp.size()) { +      const WordID cur = tmp[pos++]; +      if (cur == kDIV) { +        ++cc; +      } else if (cc == 0) { +        x.f_.push_back(cur);     +      } else if (cc == 1) { +        x.e_.push_back(cur); +      } else if (cc == 2) { +        table[x].logeq(atof(TD::Convert(cur))); +        ++cc; +      } else { +        if (flag) cerr << endl; +        cerr << "Bad format in " << lc << ": " << line << endl; abort(); +      } +    } +    if (cc != 3) { +      if (flag) cerr << endl; +      cerr << "Bad format in " << lc << ": " << line << endl; abort(); +    } +  } +  if (flag) cerr << endl; +  cerr << " read " << lc << " entries\n"; +} + +prob_t PhraseConditionalUninformativeUnigramBase::p0(const vector<WordID>& vsrc, +                                                     const vector<WordID>& vtrg, +                                                     int start_src, int start_trg) const { +  const int flen = vsrc.size() - start_src; +  const int elen = vtrg.size() - start_trg; +  prob_t p; +  p.logeq(Md::log_poisson(elen, flen + 0.01));       // elen | flen          ~Pois(flen + 0.01) +  //p.logeq(log_poisson(elen, 1));       // elen | flen          ~Pois(flen + 0.01) +  for (int i = 0; i < elen; ++i) +    p *= u(vtrg[i + start_trg]);                        // draw e_i             ~Uniform +  return p; +} + +prob_t PhraseConditionalUninformativeBase::p0(const vector<WordID>& vsrc, +                                              const vector<WordID>& vtrg, +                                              int start_src, int start_trg) const { +  const int flen = vsrc.size() - start_src; +  const int elen = vtrg.size() - start_trg; +  prob_t p; +  //p.logeq(log_poisson(elen, flen + 0.01));       // elen | flen          ~Pois(flen + 0.01) +  p.logeq(Md::log_poisson(elen, 1));       // elen | flen          ~Pois(flen + 0.01) +  for (int i = 0; i < elen; ++i) +    p *= kUNIFORM_TARGET;                        // draw e_i             ~Uniform +  return p; +} +  void Model1::LoadModel1(const string& fname) {    cerr << "Loading Model 1 parameters from " << fname << " ..." 
<< endl;    ReadFile rf(fname); @@ -40,7 +113,7 @@ prob_t PhraseConditionalBase::p0(const vector<WordID>& vsrc,    const int elen = vtrg.size() - start_trg;    prob_t uniform_src_alignment; uniform_src_alignment.logeq(-log(flen + 1));    prob_t p; -  p.logeq(log_poisson(elen, flen + 0.01));       // elen | flen          ~Pois(flen + 0.01) +  p.logeq(Md::log_poisson(elen, flen + 0.01));       // elen | flen          ~Pois(flen + 0.01)    for (int i = 0; i < elen; ++i) {               // for each position i in e-RHS      const WordID trg = vtrg[i + start_trg];      prob_t tp = prob_t::Zero(); @@ -66,9 +139,9 @@ prob_t PhraseJointBase::p0(const vector<WordID>& vsrc,    const int elen = vtrg.size() - start_trg;    prob_t uniform_src_alignment; uniform_src_alignment.logeq(-log(flen + 1));    prob_t p; -  p.logeq(log_poisson(flen, 1.0));               // flen                 ~Pois(1) +  p.logeq(Md::log_poisson(flen, 1.0));               // flen                 ~Pois(1)                                                   // elen | flen          ~Pois(flen + 0.01) -  prob_t ptrglen; ptrglen.logeq(log_poisson(elen, flen + 0.01)); +  prob_t ptrglen; ptrglen.logeq(Md::log_poisson(elen, flen + 0.01));    p *= ptrglen;    p *= kUNIFORM_SOURCE.pow(flen);                // each f in F ~Uniform    for (int i = 0; i < elen; ++i) {               // for each position i in E @@ -98,9 +171,9 @@ prob_t PhraseJointBase_BiDir::p0(const vector<WordID>& vsrc,    prob_t uniform_trg_alignment; uniform_trg_alignment.logeq(-log(elen + 1));    prob_t p1; -  p1.logeq(log_poisson(flen, 1.0));               // flen                 ~Pois(1) +  p1.logeq(Md::log_poisson(flen, 1.0));               // flen                 ~Pois(1)                                                   // elen | flen          ~Pois(flen + 0.01) -  prob_t ptrglen; ptrglen.logeq(log_poisson(elen, flen + 0.01)); +  prob_t ptrglen; ptrglen.logeq(Md::log_poisson(elen, flen + 0.01));    p1 *= ptrglen;    p1 *= kUNIFORM_SOURCE.pow(flen);                // each f in F ~Uniform    for (int i = 0; i < elen; ++i) {               // for each position i in E @@ -120,9 +193,9 @@ prob_t PhraseJointBase_BiDir::p0(const vector<WordID>& vsrc,    }    prob_t p2; -  p2.logeq(log_poisson(elen, 1.0));               // elen                 ~Pois(1) +  p2.logeq(Md::log_poisson(elen, 1.0));               // elen                 ~Pois(1)                                                   // flen | elen          ~Pois(flen + 0.01) -  prob_t psrclen; psrclen.logeq(log_poisson(flen, elen + 0.01)); +  prob_t psrclen; psrclen.logeq(Md::log_poisson(flen, elen + 0.01));    p2 *= psrclen;    p2 *= kUNIFORM_TARGET.pow(elen);                // each f in F ~Uniform    for (int i = 0; i < flen; ++i) {               // for each position i in E @@ -154,9 +227,9 @@ JumpBase::JumpBase() : p(200) {      for (int j = min_jump; j <= max_jump; ++j) {        prob_t& cp = cpd[j];        if (j < 0) -        cp.logeq(log_poisson(1.5-j, 1)); +        cp.logeq(Md::log_poisson(1.5-j, 1));        else if (j > 0) -        cp.logeq(log_poisson(j, 1)); +        cp.logeq(Md::log_poisson(j, 1));        cp.poweq(0.2);        z += cp;      } diff --git a/gi/pf/base_measures.h b/gi/pf/base_distributions.h index 7ce7e2e6..84dacdf2 100644 --- a/gi/pf/base_measures.h +++ b/gi/pf/base_distributions.h @@ -6,22 +6,15 @@  #include <string>  #include <cmath>  #include <iostream> +#include <cassert> +#include "unigrams.h"  #include "trule.h"  #include "prob.h"  #include "tdict.h" - -inline double log_poisson(unsigned x, 
const double& lambda) { -  assert(lambda > 0.0); -  return log(lambda) * x - lgamma(x + 1) - lambda; -} - -inline std::ostream& operator<<(std::ostream& os, const std::vector<WordID>& p) { -  os << '['; -  for (int i = 0; i < p.size(); ++i) -    os << (i==0 ? "" : " ") << TD::Convert(p[i]); -  return os << ']'; -} +#include "sampler.h" +#include "m.h" +#include "os_phrase.h"  struct Model1 {    explicit Model1(const std::string& fname) : @@ -49,6 +42,104 @@ struct Model1 {    std::vector<std::map<WordID, prob_t> > ttable;  }; +struct PoissonUniformUninformativeBase { +  explicit PoissonUniformUninformativeBase(const unsigned ves) : kUNIFORM(1.0 / ves) {} +  prob_t operator()(const TRule& r) const { +    prob_t p; p.logeq(Md::log_poisson(r.e_.size(), 1.0)); +    prob_t q = kUNIFORM; q.poweq(r.e_.size()); +    p *= q; +    return p; +  } +  void Summary() const {} +  void ResampleHyperparameters(MT19937*) {} +  void Increment(const TRule&) {} +  void Decrement(const TRule&) {} +  prob_t Likelihood() const { return prob_t::One(); } +  const prob_t kUNIFORM; +}; + +struct CompletelyUniformBase { +  explicit CompletelyUniformBase(const unsigned ves) : kUNIFORM(1.0 / ves) {} +  prob_t operator()(const TRule&) const { +    return kUNIFORM; +  } +  void Summary() const {} +  void ResampleHyperparameters(MT19937*) {} +  void Increment(const TRule&) {} +  void Decrement(const TRule&) {} +  prob_t Likelihood() const { return prob_t::One(); } +  const prob_t kUNIFORM; +}; + +struct UnigramWordBase { +  explicit UnigramWordBase(const std::string& fname) : un(fname) {} +  prob_t operator()(const TRule& r) const { +    return un(r.e_); +  } +  const UnigramWordModel un; +}; + +struct RuleHasher { +  size_t operator()(const TRule& r) const { +    return hash_value(r); +  } +}; + +struct TableLookupBase { +  TableLookupBase(const std::string& fname); + +  prob_t operator()(const TRule& rule) const { +    const std::tr1::unordered_map<TRule,prob_t>::const_iterator it = table.find(rule); +    if (it == table.end()) { +      std::cerr << rule << " not found\n"; +      abort(); +    } +    return it->second; +  } + +  void ResampleHyperparameters(MT19937*) {} +  void Increment(const TRule&) {} +  void Decrement(const TRule&) {} +  prob_t Likelihood() const { return prob_t::One(); } +  void Summary() const {} + +  std::tr1::unordered_map<TRule,prob_t,RuleHasher> table; +}; + +struct PhraseConditionalUninformativeBase { +  explicit PhraseConditionalUninformativeBase(const unsigned vocab_e_size) : +      kUNIFORM_TARGET(1.0 / vocab_e_size) { +    assert(vocab_e_size > 0); +  } + +  // return p0 of rule.e_ | rule.f_ +  prob_t operator()(const TRule& rule) const { +    return p0(rule.f_, rule.e_, 0, 0); +  } + +  prob_t p0(const std::vector<WordID>& vsrc, const std::vector<WordID>& vtrg, int start_src, int start_trg) const; + +  void Summary() const {} +  void ResampleHyperparameters(MT19937*) {} +  void Increment(const TRule&) {} +  void Decrement(const TRule&) {} +  prob_t Likelihood() const { return prob_t::One(); } +  const prob_t kUNIFORM_TARGET; +}; + +struct PhraseConditionalUninformativeUnigramBase { +  explicit PhraseConditionalUninformativeUnigramBase(const std::string& file, const unsigned vocab_e_size) : u(file, vocab_e_size) {} + +  // return p0 of rule.e_ | rule.f_ +  prob_t operator()(const TRule& rule) const { +    return p0(rule.f_, rule.e_, 0, 0); +  } + +  prob_t p0(const std::vector<WordID>& vsrc, const std::vector<WordID>& vtrg, int start_src, int start_trg) const; + +  const UnigramModel u; 
+}; +  struct PhraseConditionalBase {    explicit PhraseConditionalBase(const Model1& m1, const double m1mixture, const unsigned vocab_e_size) :        model1(m1), @@ -83,7 +174,7 @@ struct PhraseJointBase {      assert(vocab_e_size > 0);    } -  // return p0 of rule.e_ | rule.f_ +  // return p0 of rule.e_ , rule.f_    prob_t operator()(const TRule& rule) const {      return p0(rule.f_, rule.e_, 0, 0);    } @@ -113,7 +204,7 @@ struct PhraseJointBase_BiDir {      assert(vocab_e_size > 0);    } -  // return p0 of rule.e_ | rule.f_ +  // return p0 of rule.e_ , rule.f_    prob_t operator()(const TRule& rule) const {      return p0(rule.f_, rule.e_, 0, 0);    } diff --git a/gi/pf/brat.cc b/gi/pf/brat.cc index 7b60ef23..c2c52760 100644 --- a/gi/pf/brat.cc +++ b/gi/pf/brat.cc @@ -191,7 +191,7 @@ struct UniphraseLM {    void ResampleHyperparameters(MT19937* rng) {      phrases_.resample_hyperparameters(rng);      gen_.resample_hyperparameters(rng); -    cerr << " " << phrases_.concentration(); +    cerr << " " << phrases_.alpha();    }    CCRP_NoTable<vector<int> > phrases_; diff --git a/gi/pf/conditional_pseg.h b/gi/pf/conditional_pseg.h new file mode 100644 index 00000000..81ddb206 --- /dev/null +++ b/gi/pf/conditional_pseg.h @@ -0,0 +1,275 @@ +#ifndef _CONDITIONAL_PSEG_H_ +#define _CONDITIONAL_PSEG_H_ + +#include <vector> +#include <tr1/unordered_map> +#include <boost/functional/hash.hpp> +#include <iostream> + +#include "m.h" +#include "prob.h" +#include "ccrp_nt.h" +#include "mfcr.h" +#include "trule.h" +#include "base_distributions.h" +#include "tdict.h" + +template <typename ConditionalBaseMeasure> +struct MConditionalTranslationModel { +  explicit MConditionalTranslationModel(ConditionalBaseMeasure& rcp0) : +    rp0(rcp0), d(0.5), strength(1.0), lambdas(1, prob_t::One()), p0s(1) {} + +  void Summary() const { +    std::cerr << "Number of conditioning contexts: " << r.size() << std::endl; +    for (RuleModelHash::const_iterator it = r.begin(); it != r.end(); ++it) { +      std::cerr << TD::GetString(it->first) << "   \t(d=" << it->second.discount() << ",s=" << it->second.strength() << ") --------------------------" << std::endl; +      for (MFCR<1,TRule>::const_iterator i2 = it->second.begin(); i2 != it->second.end(); ++i2) +        std::cerr << "   " << i2->second.total_dish_count_ << '\t' << i2->first << std::endl; +    } +  } + +  double log_likelihood(const double& dd, const double& aa) const { +    if (aa <= -dd) return -std::numeric_limits<double>::infinity(); +    //double llh = Md::log_beta_density(dd, 10, 3) + Md::log_gamma_density(aa, 1, 1); +    double llh = Md::log_beta_density(dd, 1, 1) + +                 Md::log_gamma_density(dd + aa, 1, 1); +    typename std::tr1::unordered_map<std::vector<WordID>, MFCR<1,TRule>, boost::hash<std::vector<WordID> > >::const_iterator it; +    for (it = r.begin(); it != r.end(); ++it) +      llh += it->second.log_crp_prob(dd, aa); +    return llh; +  } + +  struct DiscountResampler { +    DiscountResampler(const MConditionalTranslationModel& m) : m_(m) {} +    const MConditionalTranslationModel& m_; +    double operator()(const double& proposed_discount) const { +      return m_.log_likelihood(proposed_discount, m_.strength); +    } +  }; + +  struct AlphaResampler { +    AlphaResampler(const MConditionalTranslationModel& m) : m_(m) {} +    const MConditionalTranslationModel& m_; +    double operator()(const double& proposed_strength) const { +      return m_.log_likelihood(m_.d, proposed_strength); +    } +  }; + +  void 
ResampleHyperparameters(MT19937* rng) { +    typename std::tr1::unordered_map<std::vector<WordID>, MFCR<1,TRule>, boost::hash<std::vector<WordID> > >::iterator it; +#if 1 +    for (it = r.begin(); it != r.end(); ++it) { +      it->second.resample_hyperparameters(rng); +    } +#else +    const unsigned nloop = 5; +    const unsigned niterations = 10; +    DiscountResampler dr(*this); +    AlphaResampler ar(*this); +    for (int iter = 0; iter < nloop; ++iter) { +      strength = slice_sampler1d(ar, strength, *rng, -d + std::numeric_limits<double>::min(), +                              std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations); +      double min_discount = std::numeric_limits<double>::min(); +      if (strength < 0.0) min_discount -= strength; +      d = slice_sampler1d(dr, d, *rng, min_discount, +                          1.0, 0.0, niterations, 100*niterations); +    } +    strength = slice_sampler1d(ar, strength, *rng, -d, +                            std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations); +    std::cerr << "MConditionalTranslationModel(d=" << d << ",s=" << strength << ") = " << log_likelihood(d, strength) << std::endl; +    for (it = r.begin(); it != r.end(); ++it) { +      it->second.set_discount(d); +      it->second.set_strength(strength); +    } +#endif +  } + +  int DecrementRule(const TRule& rule, MT19937* rng) { +    RuleModelHash::iterator it = r.find(rule.f_); +    assert(it != r.end()); +    const TableCount delta = it->second.decrement(rule, rng); +    if (delta.count) { +      if (it->second.num_customers() == 0) r.erase(it); +    } +    return delta.count; +  } + +  int IncrementRule(const TRule& rule, MT19937* rng) { +    RuleModelHash::iterator it = r.find(rule.f_); +    if (it == r.end()) { +      //it = r.insert(make_pair(rule.f_, MFCR<1,TRule>(d, strength))).first; +      it = r.insert(make_pair(rule.f_, MFCR<1,TRule>(1,1,1,1,0.6, -0.12))).first; +    } +    p0s[0] = rp0(rule);  +    TableCount delta = it->second.increment(rule, p0s.begin(), lambdas.begin(), rng); +    return delta.count; +  } + +  prob_t RuleProbability(const TRule& rule) const { +    prob_t p; +    RuleModelHash::const_iterator it = r.find(rule.f_); +    if (it == r.end()) { +      p = rp0(rule); +    } else { +      p0s[0] = rp0(rule); +      p = it->second.prob(rule, p0s.begin(), lambdas.begin()); +    } +    return p; +  } + +  prob_t Likelihood() const { +    prob_t p; p.logeq(log_likelihood(d, strength)); +    return p; +  } + +  const ConditionalBaseMeasure& rp0; +  typedef std::tr1::unordered_map<std::vector<WordID>, +                                  MFCR<1, TRule>, +                                  boost::hash<std::vector<WordID> > > RuleModelHash; +  RuleModelHash r; +  double d, strength; +  std::vector<prob_t> lambdas; +  mutable std::vector<prob_t> p0s; +}; + +template <typename ConditionalBaseMeasure> +struct ConditionalTranslationModel { +  explicit ConditionalTranslationModel(ConditionalBaseMeasure& rcp0) : +    rp0(rcp0) {} + +  void Summary() const { +    std::cerr << "Number of conditioning contexts: " << r.size() << std::endl; +    for (RuleModelHash::const_iterator it = r.begin(); it != r.end(); ++it) { +      std::cerr << TD::GetString(it->first) << "   \t(\\alpha = " << it->second.alpha() << ") --------------------------" << std::endl; +      for (CCRP_NoTable<TRule>::const_iterator i2 = it->second.begin(); i2 != it->second.end(); ++i2) +        std::cerr << "   " << i2->second << '\t' << i2->first << 
std::endl; +    } +  } + +  void ResampleHyperparameters(MT19937* rng) { +    for (RuleModelHash::iterator it = r.begin(); it != r.end(); ++it) +      it->second.resample_hyperparameters(rng); +  }  + +  int DecrementRule(const TRule& rule) { +    RuleModelHash::iterator it = r.find(rule.f_); +    assert(it != r.end());     +    int count = it->second.decrement(rule); +    if (count) { +      if (it->second.num_customers() == 0) r.erase(it); +    } +    return count; +  } + +  int IncrementRule(const TRule& rule) { +    RuleModelHash::iterator it = r.find(rule.f_); +    if (it == r.end()) { +      it = r.insert(make_pair(rule.f_, CCRP_NoTable<TRule>(1.0, 1.0, 8.0))).first; +    }  +    int count = it->second.increment(rule); +    return count; +  } + +  void IncrementRules(const std::vector<TRulePtr>& rules) { +    for (int i = 0; i < rules.size(); ++i) +      IncrementRule(*rules[i]); +  } + +  void DecrementRules(const std::vector<TRulePtr>& rules) { +    for (int i = 0; i < rules.size(); ++i) +      DecrementRule(*rules[i]); +  } + +  prob_t RuleProbability(const TRule& rule) const { +    prob_t p; +    RuleModelHash::const_iterator it = r.find(rule.f_); +    if (it == r.end()) { +      p.logeq(log(rp0(rule))); +    } else { +      p.logeq(it->second.logprob(rule, log(rp0(rule)))); +    } +    return p; +  } + +  prob_t Likelihood() const { +    prob_t p = prob_t::One(); +    for (RuleModelHash::const_iterator it = r.begin(); it != r.end(); ++it) { +      prob_t q; q.logeq(it->second.log_crp_prob()); +      p *= q; +      for (CCRP_NoTable<TRule>::const_iterator i2 = it->second.begin(); i2 != it->second.end(); ++i2) +        p *= rp0(i2->first); +    } +    return p; +  } + +  const ConditionalBaseMeasure& rp0; +  typedef std::tr1::unordered_map<std::vector<WordID>, +                                  CCRP_NoTable<TRule>, +                                  boost::hash<std::vector<WordID> > > RuleModelHash; +  RuleModelHash r; +}; + +template <typename ConditionalBaseMeasure> +struct ConditionalParallelSegementationModel { +  explicit ConditionalParallelSegementationModel(ConditionalBaseMeasure& rcp0) : +    tmodel(rcp0), base(prob_t::One()), aligns(1,1) {} + +  ConditionalTranslationModel<ConditionalBaseMeasure> tmodel; + +  void DecrementRule(const TRule& rule) { +    tmodel.DecrementRule(rule); +  } + +  void IncrementRule(const TRule& rule) { +    tmodel.IncrementRule(rule); +  } + +  void IncrementRulesAndAlignments(const std::vector<TRulePtr>& rules) { +    tmodel.IncrementRules(rules); +    for (int i = 0; i < rules.size(); ++i) { +      IncrementAlign(rules[i]->f_.size()); +    } +  } + +  void DecrementRulesAndAlignments(const std::vector<TRulePtr>& rules) { +    tmodel.DecrementRules(rules); +    for (int i = 0; i < rules.size(); ++i) { +      DecrementAlign(rules[i]->f_.size()); +    } +  } + +  prob_t RuleProbability(const TRule& rule) const { +    return tmodel.RuleProbability(rule); +  } + +  void IncrementAlign(unsigned span) { +    if (aligns.increment(span)) { +      // TODO +    } +  } + +  void DecrementAlign(unsigned span) { +    if (aligns.decrement(span)) { +      // TODO +    } +  } + +  prob_t AlignProbability(unsigned span) const { +    prob_t p; +    p.logeq(aligns.logprob(span, Md::log_poisson(span, 1.0))); +    return p; +  } + +  prob_t Likelihood() const { +    prob_t p; p.logeq(aligns.log_crp_prob()); +    p *= base; +    p *= tmodel.Likelihood(); +    return p; +  } + +  prob_t base; +  CCRP_NoTable<unsigned> aligns; +}; + +#endif + diff --git 
a/gi/pf/condnaive.cc b/gi/pf/condnaive.cc new file mode 100644 index 00000000..3ea88016 --- /dev/null +++ b/gi/pf/condnaive.cc @@ -0,0 +1,298 @@ +#include <iostream> +#include <tr1/memory> +#include <queue> + +#include <boost/multi_array.hpp> +#include <boost/program_options.hpp> +#include <boost/program_options/variables_map.hpp> + +#include "base_distributions.h" +#include "monotonic_pseg.h" +#include "conditional_pseg.h" +#include "trule.h" +#include "tdict.h" +#include "filelib.h" +#include "dict.h" +#include "sampler.h" +#include "ccrp_nt.h" +#include "corpus.h" + +using namespace std; +using namespace std::tr1; +namespace po = boost::program_options; + +static unsigned kMAX_SRC_PHRASE; +static unsigned kMAX_TRG_PHRASE; + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { +  po::options_description opts("Configuration options"); +  opts.add_options() +        ("samples,s",po::value<unsigned>()->default_value(1000),"Number of samples") +        ("input,i",po::value<string>(),"Read parallel data from") +        ("max_src_phrase",po::value<unsigned>()->default_value(4),"Maximum length of source language phrases") +        ("max_trg_phrase",po::value<unsigned>()->default_value(4),"Maximum length of target language phrases") +        ("model1,m",po::value<string>(),"Model 1 parameters (used in base distribution)") +        ("model1_interpolation_weight",po::value<double>()->default_value(0.95),"Mixing proportion of model 1 with uniform target distribution") +        ("random_seed,S",po::value<uint32_t>(), "Random seed"); +  po::options_description clo("Command line options"); +  clo.add_options() +        ("config", po::value<string>(), "Configuration file") +        ("help,h", "Print this help message and exit"); +  po::options_description dconfig_options, dcmdline_options; +  dconfig_options.add(opts); +  dcmdline_options.add(opts).add(clo); +   +  po::store(parse_command_line(argc, argv, dcmdline_options), *conf); +  if (conf->count("config")) { +    ifstream config((*conf)["config"].as<string>().c_str()); +    po::store(po::parse_config_file(config, dconfig_options), *conf); +  } +  po::notify(*conf); + +  if (conf->count("help") || (conf->count("input") == 0)) { +    cerr << dcmdline_options << endl; +    exit(1); +  } +} + +shared_ptr<MT19937> prng; + +struct ModelAndData { +  explicit ModelAndData(ConditionalParallelSegementationModel<PhraseConditionalBase>& m, const vector<vector<int> >& ce, const vector<vector<int> >& cf, const set<int>& ve, const set<int>& vf) : +     model(m), +     rng(&*prng), +     corpuse(ce), +     corpusf(cf), +     vocabe(ve), +     vocabf(vf), +     mh_samples(), +     mh_rejects(), +     kX(-TD::Convert("X")), +     derivations(corpuse.size()) {} + +  void ResampleHyperparameters() { +  } + +  void InstantiateRule(const pair<short,short>& from, +                       const pair<short,short>& to, +                       const vector<int>& sentf, +                       const vector<int>& sente, +                       TRule* rule) const { +    rule->f_.clear(); +    rule->e_.clear(); +    rule->lhs_ = kX; +    for (short i = from.first; i < to.first; ++i) +      rule->f_.push_back(sentf[i]); +    for (short i = from.second; i < to.second; ++i) +      rule->e_.push_back(sente[i]); +  } + +  void DecrementDerivation(const vector<pair<short,short> >& d, const vector<int>& sentf, const vector<int>& sente) { +    if (d.size() < 2) return; +    TRule x; +    for (int i = 1; i < d.size(); ++i) { +      InstantiateRule(d[i], d[i-1], sentf, 
sente, &x); +      model.DecrementRule(x); +      model.DecrementAlign(x.f_.size()); +    } +  } + +  void PrintDerivation(const vector<pair<short,short> >& d, const vector<int>& sentf, const vector<int>& sente) { +    if (d.size() < 2) return; +    TRule x; +    for (int i = 1; i < d.size(); ++i) { +      InstantiateRule(d[i], d[i-1], sentf, sente, &x); +      cerr << i << '/' << (d.size() - 1) << ": " << x << endl; +    } +  } + +  void IncrementDerivation(const vector<pair<short,short> >& d, const vector<int>& sentf, const vector<int>& sente) { +    if (d.size() < 2) return; +    TRule x; +    for (int i = 1; i < d.size(); ++i) { +      InstantiateRule(d[i], d[i-1], sentf, sente, &x); +      model.IncrementRule(x); +      model.IncrementAlign(x.f_.size()); +    } +  } + +  prob_t Likelihood() const { +    return model.Likelihood(); +  } + +  prob_t DerivationProposalProbability(const vector<pair<short,short> >& d, const vector<int>& sentf, const vector<int>& sente) const { +    prob_t p = prob_t::One(); +    TRule x; +    for (int i = 1; i < d.size(); ++i) { +      InstantiateRule(d[i], d[i-1], sentf, sente, &x); +      p *= model.RuleProbability(x); +      p *= model.AlignProbability(x.f_.size()); +    } +    return p; +  } + +  void Sample(); + +  ConditionalParallelSegementationModel<PhraseConditionalBase>& model; +  MT19937* rng; +  const vector<vector<int> >& corpuse, corpusf; +  const set<int>& vocabe, vocabf; +  unsigned mh_samples, mh_rejects; +  const int kX; +  vector<vector<pair<short, short> > > derivations; +}; + +void ModelAndData::Sample() { +  unsigned MAXK = kMAX_SRC_PHRASE; +  unsigned MAXL = kMAX_TRG_PHRASE; +  TRule x; +  x.lhs_ = -TD::Convert("X"); + +  for (int samples = 0; samples < 1000; ++samples) { +    if (samples % 1 == 0 && samples > 0) { +      //ResampleHyperparameters(); +      cerr << " [" << samples << " LLH=" << log(Likelihood()) << " MH=" << ((double)mh_rejects / mh_samples) << "]\n"; +      for (int i = 0; i < 10; ++i) { +        cerr << "SENTENCE: " << TD::GetString(corpusf[i]) << " ||| " << TD::GetString(corpuse[i]) << endl; +        PrintDerivation(derivations[i], corpusf[i], corpuse[i]); +      } +      static TRule xx("[X] ||| w n ||| s h ||| X=0"); +      const CCRP_NoTable<TRule>& dcrp = model.tmodel.r.find(xx.f_)->second; +      for (CCRP_NoTable<TRule>::const_iterator it = dcrp.begin(); it != dcrp.end(); ++it) { +        cerr << "\t" << it->second << "\t" << it->first << endl; +      } +    } +    cerr << '.' 
<< flush; +    for (int s = 0; s < corpuse.size(); ++s) { +      const vector<int>& sentf = corpusf[s]; +      const vector<int>& sente = corpuse[s]; +//      cerr << "  CUSTOMERS: " << rules.num_customers() << endl; +//      cerr << "SENTENCE: " << TD::GetString(sentf) << " ||| " << TD::GetString(sente) << endl; + +      vector<pair<short, short> >& deriv = derivations[s]; +      const prob_t p_cur = Likelihood(); +      DecrementDerivation(deriv, sentf, sente); + +      boost::multi_array<prob_t, 2> a(boost::extents[sentf.size() + 1][sente.size() + 1]); +      boost::multi_array<prob_t, 4> trans(boost::extents[sentf.size() + 1][sente.size() + 1][MAXK][MAXL]); +      a[0][0] = prob_t::One(); +      for (int i = 0; i < sentf.size(); ++i) { +        for (int j = 0; j < sente.size(); ++j) { +          const prob_t src_a = a[i][j]; +          x.f_.clear(); +          for (int k = 1; k <= MAXK; ++k) { +            if (i + k > sentf.size()) break; +            x.f_.push_back(sentf[i + k - 1]); +            x.e_.clear(); +            const prob_t p_span = model.AlignProbability(k);  // prob of consuming this much source +            for (int l = 1; l <= MAXL; ++l) { +              if (j + l > sente.size()) break; +              x.e_.push_back(sente[j + l - 1]); +              trans[i][j][k - 1][l - 1] = model.RuleProbability(x) * p_span; +              a[i + k][j + l] += src_a * trans[i][j][k - 1][l - 1]; +            } +          } +        } +      } +//      cerr << "Inside: " << log(a[sentf.size()][sente.size()]) << endl; +      const prob_t q_cur = DerivationProposalProbability(deriv, sentf, sente); + +      vector<pair<short,short> > newderiv; +      int cur_i = sentf.size(); +      int cur_j = sente.size(); +      while(cur_i > 0 && cur_j > 0) { +        newderiv.push_back(pair<short,short>(cur_i, cur_j)); +//        cerr << "NODE: (" << cur_i << "," << cur_j << ")\n"; +        SampleSet<prob_t> ss; +        vector<pair<short,short> > nexts; +        for (int k = 1; k <= MAXK; ++k) { +          const int hyp_i = cur_i - k; +          if (hyp_i < 0) break; +          for (int l = 1; l <= MAXL; ++l) { +            const int hyp_j = cur_j - l; +            if (hyp_j < 0) break; +            const prob_t& inside = a[hyp_i][hyp_j]; +            if (inside == prob_t::Zero()) continue; +            const prob_t& transp = trans[hyp_i][hyp_j][k - 1][l - 1]; +            if (transp == prob_t::Zero()) continue; +            const prob_t p = inside * transp; +            ss.add(p); +            nexts.push_back(pair<short,short>(hyp_i, hyp_j)); +//            cerr << "    (" << hyp_i << "," << hyp_j << ")  <--- " << log(p) << endl; +          } +        } +//        cerr << "  sample set has " << nexts.size() << " elements.\n"; +        const int selected = rng->SelectSample(ss); +        cur_i = nexts[selected].first; +        cur_j = nexts[selected].second; +      } +      newderiv.push_back(pair<short,short>(0,0)); +      const prob_t q_new = DerivationProposalProbability(newderiv, sentf, sente); +      IncrementDerivation(newderiv, sentf, sente); +//      cerr << "SANITY: " << q_new << "  " <<log(DerivationProposalProbability(newderiv, sentf, sente)) << endl; +      if (deriv.empty()) { deriv = newderiv; continue; } +      ++mh_samples; + +      if (deriv != newderiv) { +        const prob_t p_new = Likelihood(); +//        cerr << "p_cur=" << log(p_cur) << "\t p_new=" << log(p_new) << endl; +//        cerr << "q_cur=" << log(q_cur) << "\t q_new=" << log(q_new) << endl; +        if 
(!rng->AcceptMetropolisHastings(p_new, p_cur, q_new, q_cur)) { +          ++mh_rejects; +          DecrementDerivation(newderiv, sentf, sente); +          IncrementDerivation(deriv, sentf, sente); +        } else { +//          cerr << "  ACCEPT\n"; +          deriv = newderiv; +        } +      } +    } +  } +} + +int main(int argc, char** argv) { +  po::variables_map conf; +  InitCommandLine(argc, argv, &conf); +  kMAX_TRG_PHRASE = conf["max_trg_phrase"].as<unsigned>(); +  kMAX_SRC_PHRASE = conf["max_src_phrase"].as<unsigned>(); + +  if (!conf.count("model1")) { +    cerr << argv[0] << "Please use --model1 to specify model 1 parameters\n"; +    return 1; +  } +  if (conf.count("random_seed")) +    prng.reset(new MT19937(conf["random_seed"].as<uint32_t>())); +  else +    prng.reset(new MT19937); +//  MT19937& rng = *prng; + +  vector<vector<int> > corpuse, corpusf; +  set<int> vocabe, vocabf; +  corpus::ReadParallelCorpus(conf["input"].as<string>(), &corpusf, &corpuse, &vocabf, &vocabe); +  cerr << "f-Corpus size: " << corpusf.size() << " sentences\n"; +  cerr << "f-Vocabulary size: " << vocabf.size() << " types\n"; +  cerr << "f-Corpus size: " << corpuse.size() << " sentences\n"; +  cerr << "f-Vocabulary size: " << vocabe.size() << " types\n"; +  assert(corpusf.size() == corpuse.size()); + +  Model1 m1(conf["model1"].as<string>()); + +  PhraseConditionalBase pcb0(m1, conf["model1_interpolation_weight"].as<double>(), vocabe.size()); +  ConditionalParallelSegementationModel<PhraseConditionalBase> x(pcb0);   + +  ModelAndData posterior(x, corpuse, corpusf, vocabe, vocabf); +  posterior.Sample(); + +  TRule r1("[X] ||| x ||| l e ||| X=0"); +  TRule r2("[X] ||| A ||| a d ||| X=0"); +  TRule r3("[X] ||| n ||| e r ||| X=0"); +  TRule r4("[X] ||| x A n ||| b l a g ||| X=0"); + +  PhraseConditionalUninformativeBase u0(vocabe.size()); + +  cerr << (pcb0(r1)*pcb0(r2)*pcb0(r3)) << endl; +  cerr << (u0(r4)) << endl; + +  return 0; +} + diff --git a/gi/pf/corpus.cc b/gi/pf/corpus.cc index a408e7cf..cb6e4ed7 100644 --- a/gi/pf/corpus.cc +++ b/gi/pf/corpus.cc @@ -24,11 +24,11 @@ void ReadParallelCorpus(const string& filename,    istream* in = rf.stream();    assert(*in);    string line; +  unsigned lc = 0;    const WordID kDIV = TD::Convert("|||");    vector<WordID> tmp; -  while(*in) { -    getline(*in, line); -    if (line.empty() && !*in) break; +  while(getline(*in, line)) { +    ++lc;      e->push_back(vector<int>());      f->push_back(vector<int>());      vector<int>& le = e->back(); @@ -39,12 +39,17 @@ void ReadParallelCorpus(const string& filename,      for (unsigned i = 0; i < tmp.size(); ++i) {        const int cur = tmp[i];        if (isf) { -        if (kDIV == cur) { isf = false; } else { +        if (kDIV == cur) { +          isf = false; +        } else {            lf.push_back(cur);            vocab_f->insert(cur);          }        } else { -        assert(cur != kDIV); +        if (cur == kDIV) { +          cerr << "ERROR in " << lc << ": " << line << endl << endl; +          abort(); +        }          le.push_back(cur);          vocab_e->insert(cur);        } diff --git a/gi/pf/dpnaive.cc b/gi/pf/dpnaive.cc index db1c43c7..469dff5c 100644 --- a/gi/pf/dpnaive.cc +++ b/gi/pf/dpnaive.cc @@ -6,7 +6,7 @@  #include <boost/program_options.hpp>  #include <boost/program_options/variables_map.hpp> -#include "base_measures.h" +#include "base_distributions.h"  #include "monotonic_pseg.h"  #include "trule.h"  #include "tdict.h" diff --git a/gi/pf/guess-translits.pl b/gi/pf/guess-translits.pl 
new file mode 100755 index 00000000..d00c2168 --- /dev/null +++ b/gi/pf/guess-translits.pl @@ -0,0 +1,72 @@ +#!/usr/bin/perl -w +use strict; +use utf8; + +my $MIN_PMI = -3; + +my %fs; +my %es; +my %ef; + +die "Usage: $0 < input.utf8.txt\n" if scalar @ARGV > 0; + +binmode(STDIN,":utf8"); +binmode(STDOUT,":utf8"); +binmode(STDERR,":utf8"); + +my $tot = 0; +print STDERR "Reading alignments from STDIN ...\n"; +while(<STDIN>) { +  chomp; +  my ($fsent, $esent, $alsent) = split / \|\|\| /; +  die "Format should be 'foreign sentence ||| english sentence ||| 0-0 1-1 ...'\n" unless defined $fsent && defined $esent && defined $alsent; + +  my @fws = split /\s+/, $fsent;   +  my @ews = split /\s+/, $esent; +  my @as = split /\s+/, $alsent; +  my %a2b; +  my %b2a; +  for my $ap (@as) { +    my ($a,$b) = split /-/, $ap; +    die "BAD INPUT: $_\n" unless defined $a && defined $b; +    $a2b{$a}->{$b} = 1; +    $b2a{$b}->{$a} = 1; +  } +  for my $a (keys %a2b) { +    my $bref = $a2b{$a}; +    next unless scalar keys %$bref < 2; +    my $b = (keys %$bref)[0]; +    next unless scalar keys %{$b2a{$b}} < 2; +    my $f = $fws[$a]; +    next unless defined $f; +    next unless length($f) > 3; +    my $e = $ews[$b]; +    next unless defined $e; +    next unless length($e) > 3; + +    $ef{$f}->{$e}++; +    $es{$e}++; +    $fs{$f}++; +    $tot++; +  }   +} +my $ltot = log($tot); +my $num = 0; +print STDERR "Extracting pairs for PMI > $MIN_PMI ...\n"; +for my $f (keys %fs) { +  my $logf = log($fs{$f}); +  my $esref = $ef{$f}; +  for my $e (keys %$esref) { +    my $loge = log($es{$e}); +    my $ef = $esref->{$e}; +    my $logef = log($ef); +    my $pmi = $logef - ($loge + $logf); +    next if $pmi < $MIN_PMI; +    my @flets = split //, $f; +    my @elets = split //, $e; +    print "@flets ||| @elets\n"; +    $num++; +  } +} +print STDERR "Extracted $num pairs.\n"; +print STDERR "Recommend running:\n   ../../training/model1 -v -d -t -99999 output.txt\n"; diff --git a/gi/pf/itg.cc b/gi/pf/itg.cc index ac3c16a3..a38fe672 100644 --- a/gi/pf/itg.cc +++ b/gi/pf/itg.cc @@ -27,10 +27,67 @@ ostream& operator<<(ostream& os, const vector<WordID>& p) {    return os << ']';  } -double log_poisson(unsigned x, const double& lambda) { -  assert(lambda > 0.0); -  return log(lambda) * x - lgamma(x + 1) - lambda; -} +struct UnigramModel { +  explicit UnigramModel(const string& fname, unsigned vocab_size, double p0null = 0.05) : +      use_uniform_(fname.size() == 0), +      p0null_(p0null), +      uniform_((1.0 - p0null) / vocab_size), +      probs_(TD::NumWords() + 1) { +    if (fname.size() > 0) LoadUnigrams(fname); +    probs_[0] = p0null_; +  } + +//  +// \data\ +// ngram 1=9295 +//  +// \1-grams: +// -3.191193	" + +  void LoadUnigrams(const string& fname) { +    cerr << "Loading unigram probabilities from " << fname << " ..." 
<< endl; +    ReadFile rf(fname); +    string line; +    istream& in = *rf.stream(); +    assert(in); +    getline(in, line); +    assert(line.empty()); +    getline(in, line); +    assert(line == "\\data\\"); +    getline(in, line); +    size_t pos = line.find("ngram 1="); +    assert(pos == 0); +    assert(line.size() > 8); +    const size_t num_unigrams = atoi(&line[8]); +    getline(in, line); +    assert(line.empty()); +    getline(in, line); +    assert(line == "\\1-grams:"); +    for (size_t i = 0; i < num_unigrams; ++i) { +      getline(in, line); +      assert(line.size() > 0); +      pos = line.find('\t'); +      assert(pos > 0); +      assert(pos + 1 < line.size()); +      const WordID w = TD::Convert(line.substr(pos + 1)); +      line[pos] = 0; +      float p = atof(&line[0]); +      const prob_t pnon_null(1.0 - p0null_.as_float()); +      if (w < probs_.size()) probs_[w].logeq(p * log(10) + log(pnon_null)); else abort(); +    } +  } + +  const prob_t& operator()(const WordID& w) const { +    if (!w) return p0null_; +    if (use_uniform_) return uniform_; +    return probs_[w]; +  } + +  const bool use_uniform_; +  const prob_t p0null_; +  const prob_t uniform_; +  vector<prob_t> probs_; +};  struct Model1 {    explicit Model1(const string& fname) : @@ -89,11 +146,11 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) {          ("samples,s",po::value<unsigned>()->default_value(1000),"Number of samples")          ("particles,p",po::value<unsigned>()->default_value(25),"Number of particles")          ("input,i",po::value<string>(),"Read parallel data from") -        ("max_src_phrase",po::value<unsigned>()->default_value(7),"Maximum length of source language phrases") -        ("max_trg_phrase",po::value<unsigned>()->default_value(7),"Maximum length of target language phrases")          ("model1,m",po::value<string>(),"Model 1 parameters (used in base distribution)")          ("inverse_model1,M",po::value<string>(),"Inverse Model 1 parameters (used in backward estimate)")          ("model1_interpolation_weight",po::value<double>()->default_value(0.95),"Mixing proportion of model 1 with uniform target distribution") +        ("src_unigram,u",po::value<string>()->default_value(""),"Source unigram distribution; empty for uniform") +        ("trg_unigram,U",po::value<string>()->default_value(""),"Target unigram distribution; empty for uniform")          ("random_seed,S",po::value<uint32_t>(), "Random seed");    po::options_description clo("Command line options");    clo.add_options() @@ -165,11 +222,11 @@ void ReadParallelCorpus(const string& filename,  int main(int argc, char** argv) {    po::variables_map conf;    InitCommandLine(argc, argv, &conf); -  const size_t kMAX_TRG_PHRASE = conf["max_trg_phrase"].as<unsigned>(); -  const size_t kMAX_SRC_PHRASE = conf["max_src_phrase"].as<unsigned>();    const unsigned particles = conf["particles"].as<unsigned>();    const unsigned samples = conf["samples"].as<unsigned>(); - +  TD::Convert("<s>"); +  TD::Convert("</s>"); +  TD::Convert("<unk>");    if (!conf.count("model1")) {      cerr << argv[0] << "Please use --model1 to specify model 1 parameters\n";      return 1; @@ -188,23 +245,28 @@ int main(int argc, char** argv) {    cerr << "F-corpus size: " << corpusf.size() << " sentences\t (" << vocabf.size() << " word types)\n";    cerr << "E-corpus size: " << corpuse.size() << " sentences\t (" << vocabe.size() << " word types)\n";    assert(corpusf.size() == corpuse.size()); +  UnigramModel 
src_unigram(conf["src_unigram"].as<string>(), vocabf.size()); +  UnigramModel trg_unigram(conf["trg_unigram"].as<string>(), vocabe.size()); +  const prob_t kHALF(0.5); +  const string kEMPTY = "NULL";    const int kLHS = -TD::Convert("X");    Model1 m1(conf["model1"].as<string>());    Model1 invm1(conf["inverse_model1"].as<string>());    for (int si = 0; si < conf["samples"].as<unsigned>(); ++si) {      cerr << '.' << flush;      for (int ci = 0; ci < corpusf.size(); ++ci) { -      const vector<WordID>& src = corpusf[ci];        const vector<WordID>& trg = corpuse[ci]; -      for (int i = 0; i < src.size(); ++i) { -        for (int j = 0; j < trg.size(); ++j) { -          const int eff_max_src = min(src.size() - i, kMAX_SRC_PHRASE); -          for (int k = 0; k < eff_max_src; ++k) { -            const int eff_max_trg = (k == 0 ? 1 : min(trg.size() - j, kMAX_TRG_PHRASE)); -            for (int l = 0; l < eff_max_trg; ++l) { -            } -          } +      const vector<WordID>& src = corpusf[ci]; +      for (int i = 0; i <= trg.size(); ++i) { +        const WordID e_i = i > 0 ? trg[i-1] : 0; +        for (int j = 0; j <= src.size(); ++j) { +          const WordID f_j = j > 0 ? src[j-1] : 0; +          if (e_i == 0 && f_j == 0) continue; +          prob_t je = kHALF * src_unigram(f_j) * m1(f_j,e_i) + kHALF * trg_unigram(e_i) * invm1(e_i,f_j); +          cerr << "p( " << (e_i ? TD::Convert(e_i) : kEMPTY) << " , " << (f_j ? TD::Convert(f_j) : kEMPTY) << " ) = " << log(je) << endl; +          if (e_i && f_j) +            cout << "[X] ||| " << TD::Convert(f_j) << " ||| " << TD::Convert(e_i) << " ||| LogProb=" << log(je) << endl;          }        }      } diff --git a/gi/pf/learn_cfg.cc b/gi/pf/learn_cfg.cc new file mode 100644 index 00000000..ed1772bf --- /dev/null +++ b/gi/pf/learn_cfg.cc @@ -0,0 +1,428 @@ +#include <iostream> +#include <tr1/memory> +#include <queue> + +#include <boost/functional.hpp> +#include <boost/program_options.hpp> +#include <boost/program_options/variables_map.hpp> + +#include "inside_outside.h" +#include "hg.h" +#include "bottom_up_parser.h" +#include "fdict.h" +#include "grammar.h" +#include "m.h" +#include "trule.h" +#include "tdict.h" +#include "filelib.h" +#include "dict.h" +#include "sampler.h" +#include "ccrp.h" +#include "ccrp_onetable.h" + +using namespace std; +using namespace tr1; +namespace po = boost::program_options; + +shared_ptr<MT19937> prng; +vector<int> nt_vocab; +vector<int> nt_id_to_index; +static unsigned kMAX_RULE_SIZE = 0; +static unsigned kMAX_ARITY = 0; +static bool kALLOW_MIXED = true;  // allow rules with mixed terminals and NTs +static bool kHIERARCHICAL_PRIOR = false; + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { +  po::options_description opts("Configuration options"); +  opts.add_options() +        ("samples,s",po::value<unsigned>()->default_value(1000),"Number of samples") +        ("input,i",po::value<string>(),"Read parallel data from") +        ("max_rule_size,m", po::value<unsigned>()->default_value(0), "Maximum rule size (0 for unlimited)") +        ("max_arity,a", po::value<unsigned>()->default_value(0), "Maximum number of nonterminals in a rule (0 for unlimited)") +        ("no_mixed_rules,M", "Do not mix terminals and nonterminals in a rule RHS") +        ("nonterminals,n", po::value<unsigned>()->default_value(1), "Size of nonterminal vocabulary") +        ("hierarchical_prior,h", "Use hierarchical prior") +        ("random_seed,S",po::value<uint32_t>(), "Random seed"); +  po::options_description 
clo("Command line options"); +  clo.add_options() +        ("config", po::value<string>(), "Configuration file") +        ("help", "Print this help message and exit"); +  po::options_description dconfig_options, dcmdline_options; +  dconfig_options.add(opts); +  dcmdline_options.add(opts).add(clo); +   +  po::store(parse_command_line(argc, argv, dcmdline_options), *conf); +  if (conf->count("config")) { +    ifstream config((*conf)["config"].as<string>().c_str()); +    po::store(po::parse_config_file(config, dconfig_options), *conf); +  } +  po::notify(*conf); + +  if (conf->count("help") || (conf->count("input") == 0)) { +    cerr << dcmdline_options << endl; +    exit(1); +  } +} + +unsigned ReadCorpus(const string& filename, +                    vector<vector<WordID> >* e, +                    set<WordID>* vocab_e) { +  e->clear(); +  vocab_e->clear(); +  istream* in; +  if (filename == "-") +    in = &cin; +  else +    in = new ifstream(filename.c_str()); +  assert(*in); +  string line; +  unsigned toks = 0; +  while(*in) { +    getline(*in, line); +    if (line.empty() && !*in) break; +    e->push_back(vector<int>()); +    vector<int>& le = e->back(); +    TD::ConvertSentence(line, &le); +    for (unsigned i = 0; i < le.size(); ++i) +      vocab_e->insert(le[i]); +    toks += le.size(); +  } +  if (in != &cin) delete in; +  return toks; +} + +struct Grid { +  // a b c d e +  // 0 - 0 - - +  vector<int> grid; +}; + +struct BaseRuleModel { +  explicit BaseRuleModel(unsigned term_size, +                         unsigned nonterm_size = 1) : +      unif_term(1.0 / term_size), +      unif_nonterm(1.0 / nonterm_size) {} +  prob_t operator()(const TRule& r) const { +    prob_t p; p.logeq(Md::log_poisson(1.0, r.f_.size())); +    const prob_t term_prob((2.0 + 0.01*r.f_.size()) / (r.f_.size() + 2)); +    const prob_t nonterm_prob(1.0 - term_prob.as_float()); +    for (unsigned i = 0; i < r.f_.size(); ++i) { +      if (r.f_[i] <= 0) {     // nonterminal +        if (kALLOW_MIXED) p *= nonterm_prob; +        p *= unif_nonterm; +      } else {                // terminal +        if (kALLOW_MIXED) p *= term_prob; +        p *= unif_term; +      } +    } +    return p; +  } +  const prob_t unif_term, unif_nonterm; +}; + +struct HieroLMModel { +  explicit HieroLMModel(unsigned vocab_size, unsigned num_nts = 1) : +      base(vocab_size, num_nts), +      q0(1,1,1,1), +      nts(num_nts, CCRP<TRule>(1,1,1,1)) {} + +  prob_t Prob(const TRule& r) const { +    return nts[nt_id_to_index[-r.lhs_]].prob(r, p0(r)); +  } + +  inline prob_t p0(const TRule& r) const { +    if (kHIERARCHICAL_PRIOR) +      return q0.prob(r, base(r)); +    else +      return base(r); +  } + +  int Increment(const TRule& r, MT19937* rng) { +    const int delta = nts[nt_id_to_index[-r.lhs_]].increment(r, p0(r), rng); +    if (kHIERARCHICAL_PRIOR && delta) +      q0.increment(r, base(r), rng); +    return delta; +    // return x.increment(r); +  } + +  int Decrement(const TRule& r, MT19937* rng) { +    const int delta = nts[nt_id_to_index[-r.lhs_]].decrement(r, rng); +    if (kHIERARCHICAL_PRIOR && delta) +      q0.decrement(r, rng); +    return delta; +    //return x.decrement(r); +  } + +  prob_t Likelihood() const { +    prob_t p = prob_t::One(); +    for (unsigned i = 0; i < nts.size(); ++i) { +      prob_t q; q.logeq(nts[i].log_crp_prob()); +      p *= q; +      for (CCRP<TRule>::const_iterator it = nts[i].begin(); it != nts[i].end(); ++it) { +        prob_t tp = p0(it->first); +        tp.poweq(it->second.table_counts_.size()); +  
      p *= tp; +      } +    } +    if (kHIERARCHICAL_PRIOR) { +      prob_t q; q.logeq(q0.log_crp_prob()); +      p *= q; +      for (CCRP<TRule>::const_iterator it = q0.begin(); it != q0.end(); ++it) { +        prob_t tp = base(it->first); +        tp.poweq(it->second.table_counts_.size()); +        p *= tp; +      } +    } +    //for (CCRP_OneTable<TRule>::const_iterator it = x.begin(); it != x.end(); ++it) +    //    p *= base(it->first); +    return p; +  } + +  void ResampleHyperparameters(MT19937* rng) { +    for (unsigned i = 0; i < nts.size(); ++i) +      nts[i].resample_hyperparameters(rng); +    if (kHIERARCHICAL_PRIOR) { +      q0.resample_hyperparameters(rng); +      cerr << "[base d=" << q0.discount() << ", s=" << q0.strength() << "]"; +    } +    cerr << " d=" << nts[0].discount() << ", s=" << nts[0].strength() << endl; +  } + +  const BaseRuleModel base; +  CCRP<TRule> q0; +  vector<CCRP<TRule> > nts; +  //CCRP_OneTable<TRule> x; +}; + +vector<GrammarIter* > tofreelist; + +HieroLMModel* plm; + +struct NPGrammarIter : public GrammarIter, public RuleBin { +  NPGrammarIter() : arity() { tofreelist.push_back(this); } +  NPGrammarIter(const TRulePtr& inr, const int a, int symbol) : arity(a) { +    if (inr) { +      r.reset(new TRule(*inr)); +    } else { +      r.reset(new TRule); +    } +    TRule& rr = *r; +    rr.lhs_ = nt_vocab[0]; +    rr.f_.push_back(symbol); +    rr.e_.push_back(symbol < 0 ? (1-int(arity)) : symbol); +    tofreelist.push_back(this); +  } +  inline static unsigned NextArity(int cur_a, int symbol) { +    return cur_a + (symbol <= 0 ? 1 : 0); +  } +  virtual int GetNumRules() const { +    if (r) return nt_vocab.size(); else return 0; +  } +  virtual TRulePtr GetIthRule(int i) const { +    if (i == 0) return r; +    TRulePtr nr(new TRule(*r)); +    nr->lhs_ = nt_vocab[i]; +    return nr; +  } +  virtual int Arity() const { +    return arity; +  } +  virtual const RuleBin* GetRules() const { +    if (!r) return NULL; else return this; +  } +  virtual const GrammarIter* Extend(int symbol) const { +    const int next_arity = NextArity(arity, symbol); +    if (kMAX_ARITY && next_arity > kMAX_ARITY) +      return NULL; +    if (!kALLOW_MIXED && r) { +      bool t1 = r->f_.front() <= 0; +      bool t2 = symbol <= 0; +      if (t1 != t2) return NULL; +    } +    if (!kMAX_RULE_SIZE || !r || (r->f_.size() < kMAX_RULE_SIZE)) +      return new NPGrammarIter(r, next_arity, symbol); +    else +      return NULL; +  } +  const unsigned char arity; +  TRulePtr r; +}; + +struct NPGrammar : public Grammar { +  virtual const GrammarIter* GetRoot() const { +    return new NPGrammarIter; +  } +}; + +prob_t TotalProb(const Hypergraph& hg) { +  return Inside<prob_t, EdgeProb>(hg); +} + +void SampleDerivation(const Hypergraph& hg, MT19937* rng, vector<unsigned>* sampled_deriv) { +  vector<prob_t> node_probs; +  Inside<prob_t, EdgeProb>(hg, &node_probs); +  queue<unsigned> q; +  q.push(hg.nodes_.size() - 2); +  while(!q.empty()) { +    unsigned cur_node_id = q.front(); +//    cerr << "NODE=" << cur_node_id << endl; +    q.pop(); +    const Hypergraph::Node& node = hg.nodes_[cur_node_id]; +    const unsigned num_in_edges = node.in_edges_.size(); +    unsigned sampled_edge = 0; +    if (num_in_edges == 1) { +      sampled_edge = node.in_edges_[0]; +    } else { +      //prob_t z; +      assert(num_in_edges > 1); +      SampleSet<prob_t> ss; +      for (unsigned j = 0; j < num_in_edges; ++j) { +        const Hypergraph::Edge& edge = hg.edges_[node.in_edges_[j]]; +        prob_t p = 
edge.edge_prob_; +        for (unsigned k = 0; k < edge.tail_nodes_.size(); ++k) +          p *= node_probs[edge.tail_nodes_[k]]; +        ss.add(p); +//        cerr << log(ss[j]) << " ||| " << edge.rule_->AsString() << endl; +        //z += p; +      } +//      for (unsigned j = 0; j < num_in_edges; ++j) { +//        const Hypergraph::Edge& edge = hg.edges_[node.in_edges_[j]]; +//        cerr << exp(log(ss[j] / z)) << " ||| " << edge.rule_->AsString() << endl; +//      } +//      cerr << " --- \n"; +      sampled_edge = node.in_edges_[rng->SelectSample(ss)]; +    } +    sampled_deriv->push_back(sampled_edge); +    const Hypergraph::Edge& edge = hg.edges_[sampled_edge]; +    for (unsigned j = 0; j < edge.tail_nodes_.size(); ++j) { +      q.push(edge.tail_nodes_[j]); +    } +  } +  for (unsigned i = 0; i < sampled_deriv->size(); ++i) { +    cerr << *hg.edges_[(*sampled_deriv)[i]].rule_ << endl; +  } +} + +void IncrementDerivation(const Hypergraph& hg, const vector<unsigned>& d, HieroLMModel* plm, MT19937* rng) { +  for (unsigned i = 0; i < d.size(); ++i) +    plm->Increment(*hg.edges_[d[i]].rule_, rng); +} + +void DecrementDerivation(const Hypergraph& hg, const vector<unsigned>& d, HieroLMModel* plm, MT19937* rng) { +  for (unsigned i = 0; i < d.size(); ++i) +    plm->Decrement(*hg.edges_[d[i]].rule_, rng); +} + +int main(int argc, char** argv) { +  po::variables_map conf; + +  InitCommandLine(argc, argv, &conf); +  nt_vocab.resize(conf["nonterminals"].as<unsigned>()); +  assert(nt_vocab.size() > 0); +  assert(nt_vocab.size() < 26); +  { +    string nt = "X"; +    for (unsigned i = 0; i < nt_vocab.size(); ++i) { +      if (nt_vocab.size() > 1) nt[0] = ('A' + i); +      int pid = TD::Convert(nt); +      nt_vocab[i] = -pid; +      if (pid >= nt_id_to_index.size()) { +        nt_id_to_index.resize(pid + 1, -1); +      } +      nt_id_to_index[pid] = i; +    } +  } +  vector<GrammarPtr> grammars; +  grammars.push_back(GrammarPtr(new NPGrammar)); + +  const unsigned samples = conf["samples"].as<unsigned>(); +  kMAX_RULE_SIZE = conf["max_rule_size"].as<unsigned>(); +  if (kMAX_RULE_SIZE == 1) { +    cerr << "Invalid maximum rule size: must be 0 or >1\n"; +    return 1; +  } +  kMAX_ARITY = conf["max_arity"].as<unsigned>(); +  if (kMAX_ARITY == 1) { +    cerr << "Invalid maximum arity: must be 0 or >1\n"; +    return 1; +  } +  kALLOW_MIXED = !conf.count("no_mixed_rules"); + +  kHIERARCHICAL_PRIOR = conf.count("hierarchical_prior"); + +  if (conf.count("random_seed")) +    prng.reset(new MT19937(conf["random_seed"].as<uint32_t>())); +  else +    prng.reset(new MT19937); +  MT19937& rng = *prng; +  vector<vector<WordID> > corpuse; +  set<WordID> vocabe; +  cerr << "Reading corpus...\n"; +  const unsigned toks = ReadCorpus(conf["input"].as<string>(), &corpuse, &vocabe); +  cerr << "E-corpus size: " << corpuse.size() << " sentences\t (" << vocabe.size() << " word types)\n"; +  HieroLMModel lm(vocabe.size(), nt_vocab.size()); + +  plm = &lm; +  ExhaustiveBottomUpParser parser(TD::Convert(-nt_vocab[0]), grammars); + +  Hypergraph hg; +  const int kGoal = -TD::Convert("Goal"); +  const int kLP = FD::Convert("LogProb"); +  SparseVector<double> v; v.set_value(kLP, 1.0); +  vector<vector<unsigned> > derivs(corpuse.size()); +  vector<Lattice> cl(corpuse.size()); +  for (int ci = 0; ci < corpuse.size(); ++ci) { +    vector<int>& src = corpuse[ci]; +    Lattice& lat = cl[ci]; +    lat.resize(src.size()); +    for (unsigned i = 0; i < src.size(); ++i) +      lat[i].push_back(LatticeArc(src[i], 0.0, 1)); +  } +  
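  // Outline of one pass of the loop below: every sentence's lattice is
  // re-parsed exhaustively, the counts of the previously sampled derivation
  // are removed from the CRPs (skipped on the final pass), every non-goal
  // hypergraph edge is rescored with lm.Prob(rule), a fresh derivation is
  // drawn top-down from the inside scores via SampleDerivation, and its
  // counts are added back with IncrementDerivation; the final pass instead
  // accumulates the total inside probability of each sentence.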
for (int SS=0; SS < samples; ++SS) { +    const bool is_last = ((samples - 1) == SS); +    prob_t dlh = prob_t::One(); +    for (int ci = 0; ci < corpuse.size(); ++ci) { +      const vector<int>& src = corpuse[ci]; +      const Lattice& lat = cl[ci]; +      cerr << TD::GetString(src) << endl; +      hg.clear(); +      parser.Parse(lat, &hg);  // exhaustive parse +      vector<unsigned>& d = derivs[ci]; +      if (!is_last) DecrementDerivation(hg, d, &lm, &rng); +      for (unsigned i = 0; i < hg.edges_.size(); ++i) { +        TRule& r = *hg.edges_[i].rule_; +        if (r.lhs_ == kGoal) +          hg.edges_[i].edge_prob_ = prob_t::One(); +        else +          hg.edges_[i].edge_prob_ = lm.Prob(r); +      } +      if (!is_last) { +        d.clear(); +        SampleDerivation(hg, &rng, &d); +        IncrementDerivation(hg, derivs[ci], &lm, &rng); +      } else { +        prob_t p = TotalProb(hg); +        dlh *= p; +        cerr << " p(sentence) = " << log(p) << "\t" << log(dlh) << endl; +      } +      if (tofreelist.size() > 200000) { +        cerr << "Freeing ... "; +        for (unsigned i = 0; i < tofreelist.size(); ++i) +          delete tofreelist[i]; +        tofreelist.clear(); +        cerr << "Freed.\n"; +      } +    } +    double llh = log(lm.Likelihood()); +    cerr << "LLH=" << llh << "\tENTROPY=" << (-llh / log(2) / toks) << "\tPPL=" << pow(2, -llh / log(2) / toks) << endl; +    if (SS % 10 == 9) lm.ResampleHyperparameters(&rng); +    if (is_last) { +      double z = log(dlh); +      cerr << "TOTAL_PROB=" << z << "\tENTROPY=" << (-z / log(2) / toks) << "\tPPL=" << pow(2, -z / log(2) / toks) << endl; +    } +  } +  for (unsigned i = 0; i < nt_vocab.size(); ++i) +    cerr << lm.nts[i] << endl; +  return 0; +} + diff --git a/gi/pf/make-freq-bins.pl b/gi/pf/make-freq-bins.pl new file mode 100755 index 00000000..fdcd3555 --- /dev/null +++ b/gi/pf/make-freq-bins.pl @@ -0,0 +1,26 @@ +#!/usr/bin/perl -w +use strict; + +my $BASE = 6; +my $CUTOFF = 3; + +my %d; +my $num = 0; +while(<>){ + chomp; + my @words = split /\s+/; + for my $w (@words) {$d{$w}++; $num++;} +} + +my @vocab = sort {$d{$b} <=> $d{$a}} keys %d; + +for (my $i=0; $i<scalar @vocab; $i++) { +  my $most = $d{$vocab[$i]}; +  my $least = 1; + +  my $nl = -int(log($most / $num) / log($BASE) + $CUTOFF); +  if ($nl < 0) { $nl = 0; } +  print "$vocab[$i] $nl\n" +} + + diff --git a/gi/pf/monotonic_pseg.h b/gi/pf/monotonic_pseg.h index 301aa6d8..10d171fe 100644 --- a/gi/pf/monotonic_pseg.h +++ b/gi/pf/monotonic_pseg.h @@ -6,7 +6,7 @@  #include "prob.h"  #include "ccrp_nt.h"  #include "trule.h" -#include "base_measures.h" +#include "base_distributions.h"  template <typename BaseMeasure>  struct MonotonicParallelSegementationModel { diff --git a/gi/pf/ngram_base.cc b/gi/pf/ngram_base.cc new file mode 100644 index 00000000..1299f06f --- /dev/null +++ b/gi/pf/ngram_base.cc @@ -0,0 +1,69 @@ +#include "ngram_base.h" + +#include "lm/model.hh" +#include "tdict.h" + +using namespace std; + +namespace { +struct GICSVMapper : public lm::EnumerateVocab { +  GICSVMapper(vector<lm::WordIndex>* out) : out_(out), kLM_UNKNOWN_TOKEN(0) { out_->clear(); } +  void Add(lm::WordIndex index, const StringPiece &str) { +    const WordID cdec_id = TD::Convert(str.as_string()); +    if (cdec_id >= out_->size()) +      out_->resize(cdec_id + 1, kLM_UNKNOWN_TOKEN); +    (*out_)[cdec_id] = index; +  } +  vector<lm::WordIndex>* out_; +  const lm::WordIndex kLM_UNKNOWN_TOKEN; +}; +} + +struct FixedNgramBaseImpl { +  FixedNgramBaseImpl(const string& param) { + 
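    // The constructor loads a KenLM model over character sequences and, through
    // the EnumerateVocab callback (GICSVMapper above), records the KenLM index
    // of every cdec word id so MapWord() can translate between the two vocabularies.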
   GICSVMapper vm(&cdec2klm_map_); +    lm::ngram::Config conf; +    conf.enumerate_vocab = &vm; +    cerr << "Reading character LM from " << param << endl; +    model = new lm::ngram::ProbingModel(param.c_str(), conf); +    order = model->Order(); +    kEOS = MapWord(TD::Convert("</s>")); +    assert(kEOS > 0); +  } + +  lm::WordIndex MapWord(const WordID w) const { +    if (w < cdec2klm_map_.size()) return cdec2klm_map_[w]; +    return 0; +  } + +  ~FixedNgramBaseImpl() { delete model; } + +  prob_t StringProbability(const vector<WordID>& s) const { +    lm::ngram::State state = model->BeginSentenceState(); +    double prob = 0; +    for (unsigned i = 0; i < s.size(); ++i) { +      const lm::ngram::State scopy(state); +      prob += model->Score(scopy, MapWord(s[i]), state); +    } +    const lm::ngram::State scopy(state); +    prob += model->Score(scopy, kEOS, state); +    prob_t p; p.logeq(prob * log(10)); +    return p; +  } + +  lm::ngram::ProbingModel* model; +  unsigned order; +  vector<lm::WordIndex> cdec2klm_map_; +  lm::WordIndex kEOS; +}; + +FixedNgramBase::~FixedNgramBase() { delete impl; } + +FixedNgramBase::FixedNgramBase(const string& lmfname) { +  impl = new FixedNgramBaseImpl(lmfname); +} + +prob_t FixedNgramBase::StringProbability(const vector<WordID>& s) const { +  return impl->StringProbability(s); +} + diff --git a/gi/pf/ngram_base.h b/gi/pf/ngram_base.h new file mode 100644 index 00000000..4ea999f3 --- /dev/null +++ b/gi/pf/ngram_base.h @@ -0,0 +1,25 @@ +#ifndef _NGRAM_BASE_H_ +#define _NGRAM_BASE_H_ + +#include <string> +#include <vector> +#include "trule.h" +#include "wordid.h" +#include "prob.h" + +struct FixedNgramBaseImpl; +struct FixedNgramBase { +  FixedNgramBase(const std::string& lmfname); +  ~FixedNgramBase(); +  prob_t StringProbability(const std::vector<WordID>& s) const; + +  prob_t operator()(const TRule& rule) const { +    return StringProbability(rule.e_); +  } + + private: +  FixedNgramBaseImpl* impl; + +}; + +#endif diff --git a/gi/pf/nuisance_test.cc b/gi/pf/nuisance_test.cc new file mode 100644 index 00000000..fc0af9cb --- /dev/null +++ b/gi/pf/nuisance_test.cc @@ -0,0 +1,161 @@ +#include "ccrp.h" + +#include <vector> +#include <iostream> + +#include "tdict.h" +#include "transliterations.h" + +using namespace std; + +MT19937 rng; + +ostream& operator<<(ostream&os, const vector<int>& v) { +  os << '[' << v[0]; +  if (v.size() == 2) os << ' ' << v[1]; +  return os << ']'; +} + +struct Base { +  Base() : llh(), v(2), v1(1), v2(1), crp(0.25, 0.5) {} +  inline double p0(const vector<int>& x) const { +    double p = 0.75; +    if (x.size() == 2) p = 0.25; +    p *= 1.0 / 3.0; +    if (x.size() == 2) p *= 1.0 / 3.0; +    return p; +  } +  double est_deriv_prob(int a, int b, int seg) const { +    assert(a > 0 && a < 4);  // a \in {1,2,3} +    assert(b > 0 && b < 4);  // b \in {1,2,3} +    assert(seg == 0 || seg == 1);   // seg \in {0,1} +    if (seg == 0) { +      v[0] = a; +      v[1] = b; +      return crp.prob(v, p0(v)); +    } else { +      v1[0] = a; +      v2[0] = b; +      return crp.prob(v1, p0(v1)) * crp.prob(v2, p0(v2)); +    } +  } +  double est_marginal_prob(int a, int b) const { +    return est_deriv_prob(a,b,0) + est_deriv_prob(a,b,1); +  } +  int increment(int a, int b, double* pw = NULL) { +    double p1 = est_deriv_prob(a, b, 0); +    double p2 = est_deriv_prob(a, b, 1); +    //p1 = 0.5; p2 = 0.5; +    int seg = rng.SelectSample(p1,p2); +    double tmp = 0; +    if (!pw) pw = &tmp; +    double& w = *pw; +    if (seg == 0) { +      v[0] = 
a; +      v[1] = b; +      w = crp.prob(v, p0(v)) / p1; +      if (crp.increment(v, p0(v), &rng)) { +        llh += log(p0(v)); +      } +    } else { +      v1[0] = a; +      w = crp.prob(v1, p0(v1)) / p2; +      if (crp.increment(v1, p0(v1), &rng)) { +        llh += log(p0(v1)); +      } +      v2[0] = b; +      w *= crp.prob(v2, p0(v2)); +      if (crp.increment(v2, p0(v2), &rng)) { +        llh += log(p0(v2)); +      } +    } +    return seg; +  } +  void increment(int a, int b, int seg) { +    if (seg == 0) { +      v[0] = a; +      v[1] = b; +      if (crp.increment(v, p0(v), &rng)) { +        llh += log(p0(v)); +      } +    } else { +      v1[0] = a; +      if (crp.increment(v1, p0(v1), &rng)) { +        llh += log(p0(v1)); +      } +      v2[0] = b; +      if (crp.increment(v2, p0(v2), &rng)) { +        llh += log(p0(v2)); +      } +    } +  } +  void decrement(int a, int b, int seg) { +    if (seg == 0) { +      v[0] = a; +      v[1] = b; +      if (crp.decrement(v, &rng)) { +        llh -= log(p0(v)); +      } +    } else { +      v1[0] = a; +      if (crp.decrement(v1, &rng)) { +        llh -= log(p0(v1)); +      } +      v2[0] = b; +      if (crp.decrement(v2, &rng)) { +        llh -= log(p0(v2)); +      } +    } +  } +  double log_likelihood() const { +    return llh + crp.log_crp_prob(); +  } +  double llh; +  mutable vector<int> v, v1, v2; +  CCRP<vector<int> > crp; +}; + +int main(int argc, char** argv) { +  double tl = 0; +  const int ITERS = 1000; +  const int PARTICLES = 20; +  const int DATAPOINTS = 50; +  WordID x = TD::Convert("souvenons"); +  WordID y = TD::Convert("remember"); +  vector<WordID> src; TD::ConvertSentence("s o u v e n o n s", &src); +  vector<WordID> trg; TD::ConvertSentence("r e m e m b e r", &trg); +//  Transliterations xx; +//  xx.Initialize(x, src, y, trg); +//  return 1; + + for (int j = 0; j < ITERS; ++j) { +  Base b; +  vector<int> segs(DATAPOINTS); +  SampleSet<double> ss; +  vector<int> sss; +  for (int i = 0; i < DATAPOINTS; i++) { +    ss.clear(); +    sss.clear(); +    int x = ((i / 10) % 3) + 1; +    int y = (i % 3) + 1; +    //double ep = b.est_marginal_prob(x,y); +    //cerr << "est p(" << x << "," << y << ") = " << ep << endl; +    for (int n = 0; n < PARTICLES; ++n) { +      double w; +      int seg = b.increment(x,y,&w); +      //cerr << seg << " w=" << w << endl; +      ss.add(w); +      sss.push_back(seg); +      b.decrement(x,y,seg); +    } +    int seg = sss[rng.SelectSample(ss)]; +    b.increment(x, y, seg); +    //cerr << "Selected: " << seg << endl; +    //return 1; +    segs[i] = seg; +  } +  tl += b.log_likelihood(); + } +  cerr << "LLH=" << tl / ITERS << endl; +} + diff --git a/gi/pf/os_phrase.h b/gi/pf/os_phrase.h new file mode 100644 index 00000000..dfe40cb1 --- /dev/null +++ b/gi/pf/os_phrase.h @@ -0,0 +1,15 @@ +#ifndef _OS_PHRASE_H_ +#define _OS_PHRASE_H_ + +#include <iostream> +#include <vector> +#include "tdict.h" + +inline std::ostream& operator<<(std::ostream& os, const std::vector<WordID>& p) { +  os << '['; +  for (int i = 0; i < p.size(); ++i) +    os << (i==0 ? 
"" : " ") << TD::Convert(p[i]); +  return os << ']'; +} + +#endif diff --git a/gi/pf/pfbrat.cc b/gi/pf/pfbrat.cc index 7b60ef23..c2c52760 100644 --- a/gi/pf/pfbrat.cc +++ b/gi/pf/pfbrat.cc @@ -191,7 +191,7 @@ struct UniphraseLM {    void ResampleHyperparameters(MT19937* rng) {      phrases_.resample_hyperparameters(rng);      gen_.resample_hyperparameters(rng); -    cerr << " " << phrases_.concentration(); +    cerr << " " << phrases_.alpha();    }    CCRP_NoTable<vector<int> > phrases_; diff --git a/gi/pf/pfdist.cc b/gi/pf/pfdist.cc index aae5f798..3d578db2 100644 --- a/gi/pf/pfdist.cc +++ b/gi/pf/pfdist.cc @@ -7,7 +7,7 @@  #include <boost/program_options/variables_map.hpp>  #include "pf.h" -#include "base_measures.h" +#include "base_distributions.h"  #include "reachability.h"  #include "viterbi.h"  #include "hg.h" @@ -315,7 +315,7 @@ struct BackwardEstimate {        for (int i = 0; i < src_cov.size(); ++i)          if (!src_cov[i]) r.push_back(src_[i]);        const prob_t uniform_alignment(1.0 / r.size()); -      e.logeq(log_poisson(trg_.size() - trg_cov, r.size() - 1)); // p(trg len remaining | src len remaining) +      e.logeq(Md::log_poisson(trg_.size() - trg_cov, r.size() - 1)); // p(trg len remaining | src len remaining)        for (unsigned j = trg_cov; j < trg_.size(); ++j) {          prob_t p;          for (unsigned i = 0; i < r.size(); ++i) @@ -352,7 +352,7 @@ struct BackwardEstimateSym {          if (!src_cov[i]) r.push_back(src_[i]);        r.push_back(0);  // NULL word        const prob_t uniform_alignment(1.0 / r.size()); -      e.logeq(log_poisson(trg_.size() - trg_cov, r.size() - 1)); // p(trg len remaining | src len remaining) +      e.logeq(Md::log_poisson(trg_.size() - trg_cov, r.size() - 1)); // p(trg len remaining | src len remaining)        for (unsigned j = trg_cov; j < trg_.size(); ++j) {          prob_t p;          for (unsigned i = 0; i < r.size(); ++i) @@ -367,7 +367,7 @@ struct BackwardEstimateSym {        r.pop_back();        const prob_t inv_uniform(1.0 / (trg_.size() - trg_cov + 1.0));        prob_t inv; -      inv.logeq(log_poisson(r.size(), trg_.size() - trg_cov)); +      inv.logeq(Md::log_poisson(r.size(), trg_.size() - trg_cov));        for (unsigned i = 0; i < r.size(); ++i) {          prob_t p;          for (unsigned j = trg_cov - 1; j < trg_.size(); ++j) diff --git a/gi/pf/pfnaive.cc b/gi/pf/pfnaive.cc index 728ec00d..e1a53f5c 100644 --- a/gi/pf/pfnaive.cc +++ b/gi/pf/pfnaive.cc @@ -7,7 +7,7 @@  #include <boost/program_options/variables_map.hpp>  #include "pf.h" -#include "base_measures.h" +#include "base_distributions.h"  #include "monotonic_pseg.h"  #include "reachability.h"  #include "viterbi.h" @@ -77,7 +77,7 @@ struct BackwardEstimateSym {          r.push_back(src_[i]);        r.push_back(0);  // NULL word        const prob_t uniform_alignment(1.0 / r.size()); -      e.logeq(log_poisson(trg_.size() - trg_cov, r.size() - 1)); // p(trg len remaining | src len remaining) +      e.logeq(Md::log_poisson(trg_.size() - trg_cov, r.size() - 1)); // p(trg len remaining | src len remaining)        for (unsigned j = trg_cov; j < trg_.size(); ++j) {          prob_t p;          for (unsigned i = 0; i < r.size(); ++i) @@ -92,7 +92,7 @@ struct BackwardEstimateSym {        r.pop_back();        const prob_t inv_uniform(1.0 / (trg_.size() - trg_cov + 1.0));        prob_t inv; -      inv.logeq(log_poisson(r.size(), trg_.size() - trg_cov)); +      inv.logeq(Md::log_poisson(r.size(), trg_.size() - trg_cov));        for (unsigned i = 0; i < r.size(); ++i) {          
prob_t p;          for (unsigned j = trg_cov - 1; j < trg_.size(); ++j) diff --git a/gi/pf/pyp_lm.cc b/gi/pf/pyp_lm.cc new file mode 100644 index 00000000..91029688 --- /dev/null +++ b/gi/pf/pyp_lm.cc @@ -0,0 +1,209 @@ +#include <iostream> +#include <tr1/memory> +#include <queue> + +#include <boost/functional.hpp> +#include <boost/program_options.hpp> +#include <boost/program_options/variables_map.hpp> + +#include "corpus_tools.h" +#include "m.h" +#include "tdict.h" +#include "sampler.h" +#include "ccrp.h" +#include "tied_resampler.h" + +// A not very memory-efficient implementation of an N-gram LM based on PYPs +// as described in Y.-W. Teh. (2006) A Hierarchical Bayesian Language Model +// based on Pitman-Yor Processes. In Proc. ACL. + +// I use templates to handle the recursive formalation of the prior, so +// the order of the model has to be specified here, at compile time: +#define kORDER 3 + +using namespace std; +using namespace tr1; +namespace po = boost::program_options; + +shared_ptr<MT19937> prng; + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { +  po::options_description opts("Configuration options"); +  opts.add_options() +        ("samples,n",po::value<unsigned>()->default_value(300),"Number of samples") +        ("train,i",po::value<string>(),"Training data file") +        ("test,T",po::value<string>(),"Test data file") +        ("discount_prior_a,a",po::value<double>()->default_value(1.0), "discount ~ Beta(a,b): a=this") +        ("discount_prior_b,b",po::value<double>()->default_value(1.0), "discount ~ Beta(a,b): b=this") +        ("strength_prior_s,s",po::value<double>()->default_value(1.0), "strength ~ Gamma(s,r): s=this") +        ("strength_prior_r,r",po::value<double>()->default_value(1.0), "strength ~ Gamma(s,r): r=this") +        ("random_seed,S",po::value<uint32_t>(), "Random seed"); +  po::options_description clo("Command line options"); +  clo.add_options() +        ("config", po::value<string>(), "Configuration file") +        ("help", "Print this help message and exit"); +  po::options_description dconfig_options, dcmdline_options; +  dconfig_options.add(opts); +  dcmdline_options.add(opts).add(clo); +   +  po::store(parse_command_line(argc, argv, dcmdline_options), *conf); +  if (conf->count("config")) { +    ifstream config((*conf)["config"].as<string>().c_str()); +    po::store(po::parse_config_file(config, dconfig_options), *conf); +  } +  po::notify(*conf); + +  if (conf->count("help") || (conf->count("train") == 0)) { +    cerr << dcmdline_options << endl; +    exit(1); +  } +} + +template <unsigned N> struct PYPLM; + +// uniform base distribution (0-gram model) +template<> struct PYPLM<0> { +  PYPLM(unsigned vs, double, double, double, double) : p0(1.0 / vs), draws() {} +  void increment(WordID, const vector<WordID>&, MT19937*) { ++draws; } +  void decrement(WordID, const vector<WordID>&, MT19937*) { --draws; assert(draws >= 0); } +  double prob(WordID, const vector<WordID>&) const { return p0; } +  void resample_hyperparameters(MT19937*) {} +  double log_likelihood() const { return draws * log(p0); } +  const double p0; +  int draws; +}; + +// represents an N-gram LM +template <unsigned N> struct PYPLM { +  PYPLM(unsigned vs, double da, double db, double ss, double sr) : +      backoff(vs, da, db, ss, sr), +      tr(da, db, ss, sr, 0.8, 1.0), +      lookup(N-1) {} +  void increment(WordID w, const vector<WordID>& context, MT19937* rng) { +    const double bo = backoff.prob(w, context); +    for (unsigned i = 0; i < N-1; ++i) +   
   lookup[i] = context[context.size() - 1 - i]; +    typename unordered_map<vector<WordID>, CCRP<WordID>, boost::hash<vector<WordID> > >::iterator it = p.find(lookup); +    if (it == p.end()) { +      it = p.insert(make_pair(lookup, CCRP<WordID>(0.5,1))).first; +      tr.Add(&it->second);  // add to resampler +    } +    if (it->second.increment(w, bo, rng)) +      backoff.increment(w, context, rng); +  } +  void decrement(WordID w, const vector<WordID>& context, MT19937* rng) { +    for (unsigned i = 0; i < N-1; ++i) +      lookup[i] = context[context.size() - 1 - i]; +    typename unordered_map<vector<WordID>, CCRP<WordID>, boost::hash<vector<WordID> > >::iterator it = p.find(lookup); +    assert(it != p.end()); +    if (it->second.decrement(w, rng)) +      backoff.decrement(w, context, rng); +  } +  double prob(WordID w, const vector<WordID>& context) const { +    const double bo = backoff.prob(w, context); +    for (unsigned i = 0; i < N-1; ++i) +      lookup[i] = context[context.size() - 1 - i]; +    typename unordered_map<vector<WordID>, CCRP<WordID>, boost::hash<vector<WordID> > >::const_iterator it = p.find(lookup); +    if (it == p.end()) return bo; +    return it->second.prob(w, bo); +  } + +  double log_likelihood() const { +    double llh = backoff.log_likelihood(); +    typename unordered_map<vector<WordID>, CCRP<WordID>, boost::hash<vector<WordID> > >::const_iterator it; +    for (it = p.begin(); it != p.end(); ++it) +      llh += it->second.log_crp_prob(); +    llh += tr.LogLikelihood(); +    return llh; +  } + +  void resample_hyperparameters(MT19937* rng) { +    tr.ResampleHyperparameters(rng); +    backoff.resample_hyperparameters(rng); +  } + +  PYPLM<N-1> backoff; +  TiedResampler<CCRP<WordID> > tr; +  double discount_a, discount_b, strength_s, strength_r; +  double d, strength; +  mutable vector<WordID> lookup;  // thread-local +  unordered_map<vector<WordID>, CCRP<WordID>, boost::hash<vector<WordID> > > p; +}; + +int main(int argc, char** argv) { +  po::variables_map conf; + +  InitCommandLine(argc, argv, &conf); +  const unsigned samples = conf["samples"].as<unsigned>(); +  if (conf.count("random_seed")) +    prng.reset(new MT19937(conf["random_seed"].as<uint32_t>())); +  else +    prng.reset(new MT19937); +  MT19937& rng = *prng; +  vector<vector<WordID> > corpuse; +  set<WordID> vocabe; +  const WordID kEOS = TD::Convert("</s>"); +  cerr << "Reading corpus...\n"; +  CorpusTools::ReadFromFile(conf["train"].as<string>(), &corpuse, &vocabe); +  cerr << "E-corpus size: " << corpuse.size() << " sentences\t (" << vocabe.size() << " word types)\n"; +  vector<vector<WordID> > test; +  if (conf.count("test")) +    CorpusTools::ReadFromFile(conf["test"].as<string>(), &test); +  else +    test = corpuse; +  PYPLM<kORDER> lm(vocabe.size(), +                   conf["discount_prior_a"].as<double>(), +                   conf["discount_prior_b"].as<double>(), +                   conf["strength_prior_s"].as<double>(), +                   conf["strength_prior_r"].as<double>()); +  vector<WordID> ctx(kORDER - 1, TD::Convert("<s>")); +  for (int SS=0; SS < samples; ++SS) { +    for (int ci = 0; ci < corpuse.size(); ++ci) { +      ctx.resize(kORDER - 1); +      const vector<WordID>& s = corpuse[ci]; +      for (int i = 0; i <= s.size(); ++i) { +        WordID w = (i < s.size() ? 
s[i] : kEOS); +        if (SS > 0) lm.decrement(w, ctx, &rng); +        lm.increment(w, ctx, &rng); +        ctx.push_back(w); +      } +      if (SS > 0) lm.decrement(kEOS, ctx, &rng); +      lm.increment(kEOS, ctx, &rng); +    } +    if (SS % 10 == 9) { +      cerr << " [LLH=" << lm.log_likelihood() << "]" << endl; +      if (SS % 20 == 19) lm.resample_hyperparameters(&rng); +    } else { cerr << '.' << flush; } +  } +  double llh = 0; +  unsigned cnt = 0; +  unsigned oovs = 0; +  for (int ci = 0; ci < test.size(); ++ci) { +    ctx.resize(kORDER - 1); +    const vector<WordID>& s = test[ci]; +    for (int i = 0; i <= s.size(); ++i) { +      WordID w = (i < s.size() ? s[i] : kEOS); +      double lp = log(lm.prob(w, ctx)) / log(2); +      if (i < s.size() && vocabe.count(w) == 0) { +        cerr << "**OOV "; +        ++oovs; +        lp = 0; +      } +      cerr << "p(" << TD::Convert(w) << " |"; +      for (int j = ctx.size() + 1 - kORDER; j < ctx.size(); ++j) +        cerr << ' ' << TD::Convert(ctx[j]); +      cerr << ") = " << lp << endl; +      ctx.push_back(w); +      llh -= lp; +      cnt++; +    } +  } +  cerr << "  Log_10 prob: " << (-llh * log(2) / log(10)) << endl; +  cerr << "        Count: " << cnt << endl; +  cerr << "         OOVs: " << oovs << endl; +  cerr << "Cross-entropy: " << (llh / cnt) << endl; +  cerr << "   Perplexity: " << pow(2, llh / cnt) << endl; +  return 0; +} + + diff --git a/gi/pf/pyp_tm.cc b/gi/pf/pyp_tm.cc new file mode 100644 index 00000000..e21f0267 --- /dev/null +++ b/gi/pf/pyp_tm.cc @@ -0,0 +1,131 @@ +#include "pyp_tm.h" + +#include <tr1/unordered_map> +#include <iostream> +#include <queue> + +#include "tdict.h" +#include "ccrp.h" +#include "pyp_word_model.h" +#include "tied_resampler.h" + +using namespace std; +using namespace std::tr1; + +struct FreqBinner { +  FreqBinner(const std::string& fname) { fd_.Load(fname); } +  unsigned NumberOfBins() const { return fd_.Max() + 1; } +  unsigned Bin(const WordID& w) const { return fd_.LookUp(w); } +  FreqDict<unsigned> fd_; +}; + +template <typename Base, class Binner = FreqBinner> +struct ConditionalPYPWordModel { +  ConditionalPYPWordModel(Base* b, const Binner* bnr = NULL) : +      base(*b), +      binner(bnr), +      btr(binner ? binner->NumberOfBins() + 1u : 2u) {} + +  void Summary() const { +    cerr << "Number of conditioning contexts: " << r.size() << endl; +    for (RuleModelHash::const_iterator it = r.begin(); it != r.end(); ++it) { +      cerr << TD::Convert(it->first) << "   \tPYP(d=" << it->second.discount() << ",s=" << it->second.strength() << ") --------------------------" << endl; +      for (CCRP<vector<WordID> >::const_iterator i2 = it->second.begin(); i2 != it->second.end(); ++i2) +        cerr << "   " << i2->second.total_dish_count_ << '\t' << TD::GetString(i2->first) << endl; +    } +  } + +  void ResampleHyperparameters(MT19937* rng) { +    btr.ResampleHyperparameters(rng); +  }  + +  prob_t Prob(const WordID src, const vector<WordID>& trglets) const { +    RuleModelHash::const_iterator it = r.find(src); +    if (it == r.end()) { +      return base(trglets); +    } else { +      return it->second.prob(trglets, base(trglets)); +    } +  } + +  void Increment(const WordID src, const vector<WordID>& trglets, MT19937* rng) { +    RuleModelHash::iterator it = r.find(src); +    if (it == r.end()) { +      it = r.insert(make_pair(src, CCRP<vector<WordID> >(0.5,1.0))).first; +      static const WordID kNULL = TD::Convert("NULL"); +      unsigned bin = (src == kNULL ? 
0 : 1); +      if (binner && bin) { bin = binner->Bin(src) + 1; } +      btr.Add(bin, &it->second); +    } +    if (it->second.increment(trglets, base(trglets), rng)) +      base.Increment(trglets, rng); +  } + +  void Decrement(const WordID src, const vector<WordID>& trglets, MT19937* rng) { +    RuleModelHash::iterator it = r.find(src); +    assert(it != r.end()); +    if (it->second.decrement(trglets, rng)) { +      base.Decrement(trglets, rng); +    } +  } + +  prob_t Likelihood() const { +    prob_t p = prob_t::One(); +    for (RuleModelHash::const_iterator it = r.begin(); it != r.end(); ++it) { +      prob_t q; q.logeq(it->second.log_crp_prob()); +      p *= q; +    } +    return p; +  } + +  unsigned UniqueConditioningContexts() const { +    return r.size(); +  } + +  // TODO tie PYP hyperparameters based on source word frequency bins +  Base& base; +  const Binner* binner; +  BinTiedResampler<CCRP<vector<WordID> > > btr; +  typedef unordered_map<WordID, CCRP<vector<WordID> > > RuleModelHash; +  RuleModelHash r; +}; + +PYPLexicalTranslation::PYPLexicalTranslation(const vector<vector<WordID> >& lets, +                                             const unsigned num_letters) : +    letters(lets), +    up0(new PYPWordModel(num_letters)), +    tmodel(new ConditionalPYPWordModel<PYPWordModel>(up0, new FreqBinner("10k.freq"))), +    kX(-TD::Convert("X")) {} + +void PYPLexicalTranslation::Summary() const { +  tmodel->Summary(); +  up0->Summary(); +} + +prob_t PYPLexicalTranslation::Likelihood() const { +  prob_t p = up0->Likelihood(); +  p *= tmodel->Likelihood(); +  return p; +} + +void PYPLexicalTranslation::ResampleHyperparameters(MT19937* rng) { +  tmodel->ResampleHyperparameters(rng); +  up0->ResampleHyperparameters(rng); +} + +unsigned PYPLexicalTranslation::UniqueConditioningContexts() const { +  return tmodel->UniqueConditioningContexts(); +} + +prob_t PYPLexicalTranslation::Prob(WordID src, WordID trg) const { +  return tmodel->Prob(src, letters[trg]); +} + +void PYPLexicalTranslation::Increment(WordID src, WordID trg, MT19937* rng) { +  tmodel->Increment(src, letters[trg], rng); +} + +void PYPLexicalTranslation::Decrement(WordID src, WordID trg, MT19937* rng) { +  tmodel->Decrement(src, letters[trg], rng); +} + diff --git a/gi/pf/pyp_tm.h b/gi/pf/pyp_tm.h new file mode 100644 index 00000000..63e7c96d --- /dev/null +++ b/gi/pf/pyp_tm.h @@ -0,0 +1,35 @@ +#ifndef PYP_LEX_TRANS +#define PYP_LEX_TRANS + +#include <vector> +#include "wordid.h" +#include "prob.h" +#include "sampler.h" +#include "freqdict.h" + +struct FreqBinner; +struct PYPWordModel; +template <typename T, class B> struct ConditionalPYPWordModel; + +struct PYPLexicalTranslation { +  explicit PYPLexicalTranslation(const std::vector<std::vector<WordID> >& lets, +                                 const unsigned num_letters); + +  prob_t Likelihood() const; + +  void ResampleHyperparameters(MT19937* rng); +  prob_t Prob(WordID src, WordID trg) const;  // return p(trg | src) +  void Summary() const; +  void Increment(WordID src, WordID trg, MT19937* rng); +  void Decrement(WordID src, WordID trg, MT19937* rng); +  unsigned UniqueConditioningContexts() const; + + private: +  const std::vector<std::vector<WordID> >& letters;   // spelling dictionary +  PYPWordModel* up0;  // base distribuction (model English word) +  ConditionalPYPWordModel<PYPWordModel, FreqBinner>* tmodel;  // translation distributions +                      // (model English word | French word) +  const WordID kX; +}; + +#endif diff --git 
a/gi/pf/pyp_word_model.cc b/gi/pf/pyp_word_model.cc new file mode 100644 index 00000000..12df4abf --- /dev/null +++ b/gi/pf/pyp_word_model.cc @@ -0,0 +1,20 @@ +#include "pyp_word_model.h" + +#include <iostream> + +using namespace std; + +void PYPWordModel::ResampleHyperparameters(MT19937* rng) { +  r.resample_hyperparameters(rng); +  cerr << " PYPWordModel(d=" << r.discount() << ",s=" << r.strength() << ")\n"; +} + +void PYPWordModel::Summary() const { +  cerr << "PYPWordModel: generations=" << r.num_customers() +       << " PYP(d=" << r.discount() << ",s=" << r.strength() << ')' << endl; +  for (CCRP<vector<WordID> >::const_iterator it = r.begin(); it != r.end(); ++it) +    cerr << "   " << it->second.total_dish_count_ +              << " (on " << it->second.table_counts_.size() << " tables) " +              << TD::GetString(it->first) << endl; +} + diff --git a/gi/pf/pyp_word_model.h b/gi/pf/pyp_word_model.h new file mode 100644 index 00000000..ff366865 --- /dev/null +++ b/gi/pf/pyp_word_model.h @@ -0,0 +1,58 @@ +#ifndef _PYP_WORD_MODEL_H_ +#define _PYP_WORD_MODEL_H_ + +#include <iostream> +#include <cmath> +#include <vector> +#include "prob.h" +#include "ccrp.h" +#include "m.h" +#include "tdict.h" +#include "os_phrase.h" + +// PYP(d,s,poisson-uniform) represented as a CRP +struct PYPWordModel { +  explicit PYPWordModel(const unsigned vocab_e_size, const double mean_len = 5) : +      base(prob_t::One()), r(1,1,1,1,0.66,50.0), u0(-std::log(vocab_e_size)), mean_length(mean_len) {} + +  void ResampleHyperparameters(MT19937* rng); + +  inline prob_t operator()(const std::vector<WordID>& s) const { +    return r.prob(s, p0(s)); +  } + +  inline void Increment(const std::vector<WordID>& s, MT19937* rng) { +    if (r.increment(s, p0(s), rng)) +      base *= p0(s); +  } + +  inline void Decrement(const std::vector<WordID>& s, MT19937 *rng) { +    if (r.decrement(s, rng)) +      base /= p0(s); +  } + +  inline prob_t Likelihood() const { +    prob_t p; p.logeq(r.log_crp_prob()); +    p *= base; +    return p; +  } + +  void Summary() const; + + private: +  inline double logp0(const std::vector<WordID>& s) const { +    return Md::log_poisson(s.size(), mean_length) + s.size() * u0; +  } + +  inline prob_t p0(const std::vector<WordID>& s) const { +    prob_t p; p.logeq(logp0(s)); +    return p; +  } + +  prob_t base;  // keeps track of the draws from the base distribution +  CCRP<std::vector<WordID> > r; +  const double u0;  // uniform log prob of generating a letter +  const double mean_length;  // mean length of a word in the base distribution +}; + +#endif diff --git a/gi/pf/quasi_model2.h b/gi/pf/quasi_model2.h new file mode 100644 index 00000000..588c8f84 --- /dev/null +++ b/gi/pf/quasi_model2.h @@ -0,0 +1,166 @@ +#ifndef _QUASI_MODEL2_H_ +#define _QUASI_MODEL2_H_ + +#include <vector> +#include <cmath> +#include <tr1/unordered_map> +#include "boost/functional.hpp" +#include "prob.h" +#include "array2d.h" +#include "slice_sampler.h" +#include "m.h" + +struct AlignmentObservation { +  AlignmentObservation() : src_len(), trg_len(), j(), a_j() {} +  AlignmentObservation(unsigned sl, unsigned tl, unsigned tw, unsigned sw) : +      src_len(sl), trg_len(tl), j(tw), a_j(sw) {} +  unsigned short src_len; +  unsigned short trg_len; +  unsigned short j; +  unsigned short a_j; +}; + +inline size_t hash_value(const AlignmentObservation& o) { +  return reinterpret_cast<const size_t&>(o); +} + +inline bool operator==(const AlignmentObservation& a, const AlignmentObservation& b) { +  return hash_value(a) == 
hash_value(b); +} + +struct QuasiModel2 { +  explicit QuasiModel2(double alpha, double pnull = 0.1) : +      alpha_(alpha), +      pnull_(pnull), +      pnotnull_(1 - pnull) {} + +  // a_j = 0 => NULL; src_len does *not* include null +  prob_t Prob(unsigned a_j, unsigned j, unsigned src_len, unsigned trg_len) const { +    if (!a_j) return pnull_; +    return pnotnull_ * +       prob_t(UnnormalizedProb(a_j, j, src_len, trg_len, alpha_) / GetOrComputeZ(j, src_len, trg_len)); +  } + +  void Increment(unsigned a_j, unsigned j, unsigned src_len, unsigned trg_len) { +    assert(a_j <= src_len); +    assert(j < trg_len); +    ++obs_[AlignmentObservation(src_len, trg_len, j, a_j)]; +  } + +  void Decrement(unsigned a_j, unsigned j, unsigned src_len, unsigned trg_len) { +    const AlignmentObservation ao(src_len, trg_len, j, a_j); +    int &cc = obs_[ao]; +    assert(cc > 0); +    --cc; +    if (!cc) obs_.erase(ao); +  } + +  struct PNullResampler { +    PNullResampler(const QuasiModel2& m) : m_(m) {} +    const QuasiModel2& m_; +    double operator()(const double& proposed_pnull) const { +      return log(m_.Likelihood(m_.alpha_, proposed_pnull)); +    } +  }; + +  struct AlphaResampler { +    AlphaResampler(const QuasiModel2& m) : m_(m) {} +    const QuasiModel2& m_; +    double operator()(const double& proposed_alpha) const { +      return log(m_.Likelihood(proposed_alpha, m_.pnull_.as_float())); +    } +  }; + +  void ResampleHyperparameters(MT19937* rng, const unsigned nloop = 5, const unsigned niterations = 10) { +    const PNullResampler dr(*this); +    const AlphaResampler ar(*this); +    for (unsigned i = 0; i < nloop; ++i) { +      double pnull = slice_sampler1d(dr, pnull_.as_float(), *rng, 0.00000001, +                            1.0, 0.0, niterations, 100*niterations); +      pnull_ = prob_t(pnull); +      alpha_ = slice_sampler1d(ar, alpha_, *rng, 0.00000001, +                              std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations); +    } +    std::cerr << "QuasiModel2(alpha=" << alpha_ << ",p_null=" +              << pnull_.as_float() << ") = " << Likelihood() << std::endl; +    zcache_.clear(); +  } + +  prob_t Likelihood() const { +    return Likelihood(alpha_, pnull_.as_float()); +  } + +  prob_t Likelihood(double alpha, double ppnull) const { +    const prob_t pnull(ppnull); +    const prob_t pnotnull(1 - ppnull); + +    prob_t p; +    p.logeq(Md::log_gamma_density(alpha, 0.1, 25));  // TODO configure +    assert(!p.is_0()); +    prob_t prob_of_ppnull; prob_of_ppnull.logeq(Md::log_beta_density(ppnull, 2, 10)); +    assert(!prob_of_ppnull.is_0()); +    p *= prob_of_ppnull; +    for (ObsCount::const_iterator it = obs_.begin(); it != obs_.end(); ++it) { +      const AlignmentObservation& ao = it->first; +      if (ao.a_j) { +        prob_t u = XUnnormalizedProb(ao.a_j, ao.j, ao.src_len, ao.trg_len, alpha); +        prob_t z = XComputeZ(ao.j, ao.src_len, ao.trg_len, alpha); +        prob_t pa(u / z); +        pa *= pnotnull; +        pa.poweq(it->second); +        p *= pa; +      } else { +        p *= pnull.pow(it->second); +      } +    } +    return p; +  } + + private: +  static prob_t XUnnormalizedProb(unsigned a_j, unsigned j, unsigned src_len, unsigned trg_len, double alpha) { +    prob_t p; +    p.logeq(-fabs(double(a_j - 1) / src_len - double(j) / trg_len) * alpha); +    return p; +  } + +  static prob_t XComputeZ(unsigned j, unsigned src_len, unsigned trg_len, double alpha) { +    prob_t z = prob_t::Zero(); +    for (int a_j = 1; a_j <= 
src_len; ++a_j) +      z += XUnnormalizedProb(a_j, j, src_len, trg_len, alpha); +    return z; +  } + +  static double UnnormalizedProb(unsigned a_j, unsigned j, unsigned src_len, unsigned trg_len, double alpha) { +    return exp(-fabs(double(a_j - 1) / src_len - double(j) / trg_len) * alpha); +  } + +  static double ComputeZ(unsigned j, unsigned src_len, unsigned trg_len, double alpha) { +    double z = 0; +    for (int a_j = 1; a_j <= src_len; ++a_j) +      z += UnnormalizedProb(a_j, j, src_len, trg_len, alpha); +    return z; +  } + +  const double& GetOrComputeZ(unsigned j, unsigned src_len, unsigned trg_len) const { +    if (src_len >= zcache_.size()) +      zcache_.resize(src_len + 1); +    if (trg_len >= zcache_[src_len].size()) +      zcache_[src_len].resize(trg_len + 1); +    std::vector<double>& zv = zcache_[src_len][trg_len]; +    if (zv.size() == 0) +      zv.resize(trg_len); +    double& z = zv[j]; +    if (!z) +      z = ComputeZ(j, src_len, trg_len, alpha_); +    return z; +  } + +  double alpha_; +  prob_t pnull_; +  prob_t pnotnull_; +  mutable std::vector<std::vector<std::vector<double> > > zcache_; +  typedef std::tr1::unordered_map<AlignmentObservation, int, boost::hash<AlignmentObservation> > ObsCount; +  ObsCount obs_; +}; + +#endif diff --git a/gi/pf/reachability.cc b/gi/pf/reachability.cc index 73dd8d39..7d0d04ac 100644 --- a/gi/pf/reachability.cc +++ b/gi/pf/reachability.cc @@ -30,15 +30,17 @@ void Reachability::ComputeReachability(int srclen, int trglen, int src_max_phras        }      }      a[0][0].clear(); -    //cerr << "Final cell contains " << a[srclen][trglen].size() << " back pointers\n"; -    if (a[srclen][trglen].size() == 0) { -      cerr << "Sentence with length (" << srclen << ',' << trglen << ") violates reachability constraints\n"; +    //cerr << srclen << "," << trglen << ": Final cell contains " << a[srclen][trglen].size() << " back pointers\n"; +    if (a[srclen][trglen].empty()) { +      cerr << "Sequence pair with lengths (" << srclen << ',' << trglen << ") violates reachability constraints\n"; +      nodes = 0;        return;      }      typedef boost::multi_array<bool, 2> rarray_type;      rarray_type r(boost::extents[srclen + 1][trglen + 1]);      r[srclen][trglen] = true; +    nodes = 0;      for (int i = srclen; i >= 0; --i) {        for (int j = trglen; j >= 0; --j) {          vector<SState>& prevs = a[i][j]; @@ -47,6 +49,7 @@ void Reachability::ComputeReachability(int srclen, int trglen, int src_max_phras            r[prevs[k].prev_src_covered][prevs[k].prev_trg_covered] = true;            int src_delta = i - prevs[k].prev_src_covered;            edges[prevs[k].prev_src_covered][prevs[k].prev_trg_covered][src_delta][j - prevs[k].prev_trg_covered] = true; +          valid_deltas[prevs[k].prev_src_covered][prevs[k].prev_trg_covered].push_back(make_pair<short,short>(src_delta,j - prevs[k].prev_trg_covered));            short &msd = max_src_delta[prevs[k].prev_src_covered][prevs[k].prev_trg_covered];            if (src_delta > msd) msd = src_delta;          } @@ -56,9 +59,16 @@ void Reachability::ComputeReachability(int srclen, int trglen, int src_max_phras      assert(!edges[0][0][0][1]);      assert(!edges[0][0][0][0]);      assert(max_src_delta[0][0] > 0); -    //cerr << "First cell contains " << b[0][0].size() << " forward pointers\n"; -    //for (int i = 0; i < b[0][0].size(); ++i) { -    //  cerr << "  -> (" << b[0][0][i].next_src_covered << "," << b[0][0][i].next_trg_covered << ")\n"; -    //} +    nodes = 0; +    for (int i = 0; i < 
srclen; ++i) { +      for (int j = 0; j < trglen; ++j) { +        if (valid_deltas[i][j].size() > 0) { +          node_addresses[i][j] = nodes++; +        } else { +          node_addresses[i][j] = -1; +        } +      } +    } +    cerr << "Sequence pair with lengths (" << srclen << ',' << trglen << ") has " << valid_deltas[0][0].size() << " out edges in its root node, " << nodes << " nodes in total, and outside estimate matrix will require " << sizeof(float)*nodes << " bytes\n";    } diff --git a/gi/pf/reachability.h b/gi/pf/reachability.h index 98450ec1..1e22c76a 100644 --- a/gi/pf/reachability.h +++ b/gi/pf/reachability.h @@ -12,12 +12,18 @@  // currently forbids 0 -> n and n -> 0 alignments  struct Reachability { -  boost::multi_array<bool, 4> edges;  // edges[src_covered][trg_covered][x][trg_delta] is this edge worth exploring? +  unsigned nodes; +  boost::multi_array<bool, 4> edges;  // edges[src_covered][trg_covered][src_delta][trg_delta] is this edge worth exploring?    boost::multi_array<short, 2> max_src_delta; // msd[src_covered][trg_covered] -- the largest src delta that's valid +  boost::multi_array<short, 2> node_addresses; // na[src_covered][trg_covered] -- the index of the node in a one-dimensional array (of size "nodes") +  boost::multi_array<std::vector<std::pair<short,short> >, 2> valid_deltas; // valid_deltas[src_covered][trg_covered] list of valid transitions leaving a particular node    Reachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len) : +      nodes(),        edges(boost::extents[srclen][trglen][src_max_phrase_len+1][trg_max_phrase_len+1]), -      max_src_delta(boost::extents[srclen][trglen]) { +      max_src_delta(boost::extents[srclen][trglen]), +      node_addresses(boost::extents[srclen][trglen]), +      valid_deltas(boost::extents[srclen][trglen]) {      ComputeReachability(srclen, trglen, src_max_phrase_len, trg_max_phrase_len);    } diff --git a/gi/pf/tied_resampler.h b/gi/pf/tied_resampler.h new file mode 100644 index 00000000..6f45fbce --- /dev/null +++ b/gi/pf/tied_resampler.h @@ -0,0 +1,124 @@ +#ifndef _TIED_RESAMPLER_H_ +#define _TIED_RESAMPLER_H_ + +#include <set> +#include <vector> +#include "sampler.h" +#include "slice_sampler.h" +#include "m.h" + +template <class CRP> +struct TiedResampler { +  explicit TiedResampler(double da, double db, double ss, double sr, double d=0.5, double s=1.0) : +      d_alpha(da), +      d_beta(db), +      s_shape(ss), +      s_rate(sr), +      discount(d), +      strength(s) {} + +  void Add(CRP* crp) { +    crps.insert(crp); +    crp->set_discount(discount); +    crp->set_strength(strength); +    assert(!crp->has_discount_prior()); +    assert(!crp->has_strength_prior()); +  } + +  void Remove(CRP* crp) { +    crps.erase(crp); +  } + +  size_t size() const { +    return crps.size(); +  } + +  double LogLikelihood(double d, double s) const { +    if (s <= -d) return -std::numeric_limits<double>::infinity(); +    double llh = Md::log_beta_density(d, d_alpha, d_beta) + +                 Md::log_gamma_density(d + s, s_shape, s_rate); +    for (typename std::set<CRP*>::iterator it = crps.begin(); it != crps.end(); ++it) +      llh += (*it)->log_crp_prob(d, s); +    return llh; +  } + +  double LogLikelihood() const { +    return LogLikelihood(discount, strength); +  } + +  struct DiscountResampler { +    DiscountResampler(const TiedResampler& m) : m_(m) {} +    const TiedResampler& m_; +    double operator()(const double& proposed_discount) const { +      return 
m_.LogLikelihood(proposed_discount, m_.strength); +    } +  }; + +  struct AlphaResampler { +    AlphaResampler(const TiedResampler& m) : m_(m) {} +    const TiedResampler& m_; +    double operator()(const double& proposed_strength) const { +      return m_.LogLikelihood(m_.discount, proposed_strength); +    } +  }; + +  void ResampleHyperparameters(MT19937* rng, const unsigned nloop = 5, const unsigned niterations = 10) { +    if (size() == 0) { std::cerr << "EMPTY - not resampling\n"; return; } +    const DiscountResampler dr(*this); +    const AlphaResampler ar(*this); +    for (int iter = 0; iter < nloop; ++iter) { +      strength = slice_sampler1d(ar, strength, *rng, -discount + std::numeric_limits<double>::min(), +                              std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations); +      double min_discount = std::numeric_limits<double>::min(); +      if (strength < 0.0) min_discount -= strength; +      discount = slice_sampler1d(dr, discount, *rng, min_discount, +                          1.0, 0.0, niterations, 100*niterations); +    } +    strength = slice_sampler1d(ar, strength, *rng, -discount + std::numeric_limits<double>::min(), +                            std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations); +    std::cerr << "TiedCRPs(d=" << discount << ",s=" +              << strength << ") = " << LogLikelihood(discount, strength) << std::endl; +    for (typename std::set<CRP*>::iterator it = crps.begin(); it != crps.end(); ++it) { +      (*it)->set_discount(discount); +      (*it)->set_strength(strength); +    } +  } + private: +  std::set<CRP*> crps; +  const double d_alpha, d_beta, s_shape, s_rate; +  double discount, strength; +}; + +// split according to some criterion +template <class CRP> +struct BinTiedResampler { +  explicit BinTiedResampler(unsigned nbins) : +      resamplers(nbins, TiedResampler<CRP>(1,1,1,1)) {} + +  void Add(unsigned bin, CRP* crp) { +    resamplers[bin].Add(crp); +  } + +  void Remove(unsigned bin, CRP* crp) { +    resamplers[bin].Remove(crp); +  } + +  void ResampleHyperparameters(MT19937* rng) { +    for (unsigned i = 0; i < resamplers.size(); ++i) { +      std::cerr << "BIN " << i << " (" << resamplers[i].size() << " CRPs): " << std::flush; +      resamplers[i].ResampleHyperparameters(rng); +    } +  } + +  double LogLikelihood() const { +    double llh = 0; +    for (unsigned i = 0; i < resamplers.size(); ++i) +      llh += resamplers[i].LogLikelihood(); +    return llh; +  } + + private: +  std::vector<TiedResampler<CRP> > resamplers; +}; + +#endif diff --git a/gi/pf/transliterations.cc b/gi/pf/transliterations.cc new file mode 100644 index 00000000..2200715e --- /dev/null +++ b/gi/pf/transliterations.cc @@ -0,0 +1,334 @@ +#include "transliterations.h" + +#include <iostream> +#include <vector> + +#include "boost/shared_ptr.hpp" + +#include "backward.h" +#include "filelib.h" +#include "tdict.h" +#include "trule.h" +#include "filelib.h" +#include "ccrp_nt.h" +#include "m.h" +#include "reachability.h" + +using namespace std; +using namespace std::tr1; + +struct TruncatedConditionalLengthModel { +  TruncatedConditionalLengthModel(unsigned max_src_size, unsigned max_trg_size, double expected_src_to_trg_ratio) : +      plens(max_src_size+1, vector<prob_t>(max_trg_size+1, 0.0)) { +    for (unsigned i = 1; i <= max_src_size; ++i) { +      prob_t z = prob_t::Zero(); +      for (unsigned j = 1; j <= max_trg_size; ++j) +        z += (plens[i][j] = prob_t(0.01 + exp(Md::log_poisson(j, i * 
expected_src_to_trg_ratio)))); +      for (unsigned j = 1; j <= max_trg_size; ++j) +        plens[i][j] /= z; +      //for (unsigned j = 1; j <= max_trg_size; ++j) +      //  cerr << "P(trg_len=" << j << " | src_len=" << i << ") = " << plens[i][j] << endl; +    } +  } + +  // return p(tlen | slen) for *chunks* not full words +  inline const prob_t& operator()(int slen, int tlen) const { +    return plens[slen][tlen]; +  } + +  vector<vector<prob_t> > plens; +}; + +struct CondBaseDist { +  CondBaseDist(unsigned max_src_size, unsigned max_trg_size, double expected_src_to_trg_ratio) : +    tclm(max_src_size, max_trg_size, expected_src_to_trg_ratio) {} + +  prob_t operator()(const vector<WordID>& src, unsigned sf, unsigned st, +                    const vector<WordID>& trg, unsigned tf, unsigned tt) const { +    prob_t p = tclm(st - sf, tt - tf);  // target len | source length ~ TCLM(source len) +    assert(!"not impl"); +    return p; +  } +  inline prob_t operator()(const vector<WordID>& src, const vector<WordID>& trg) const { +    return (*this)(src, 0, src.size(), trg, 0, trg.size()); +  } +  TruncatedConditionalLengthModel tclm; +}; + +// represents transliteration phrase probabilities, e.g. +//   p( a l - | A l ) , p( o | A w ) , ... +struct TransliterationChunkConditionalModel { +  explicit TransliterationChunkConditionalModel(const CondBaseDist& pp0) : +      d(0.0), +      strength(1.0), +      rp0(pp0) { +  } + +  void Summary() const { +    std::cerr << "Number of conditioning contexts: " << r.size() << std::endl; +    for (RuleModelHash::const_iterator it = r.begin(); it != r.end(); ++it) { +      std::cerr << TD::GetString(it->first) << "   \t(\\alpha = " << it->second.alpha() << ") --------------------------" << std::endl; +      for (CCRP_NoTable<TRule>::const_iterator i2 = it->second.begin(); i2 != it->second.end(); ++i2) +        std::cerr << "   " << i2->second << '\t' << i2->first << std::endl; +    } +  } + +  int DecrementRule(const TRule& rule) { +    RuleModelHash::iterator it = r.find(rule.f_); +    assert(it != r.end());     +    int count = it->second.decrement(rule); +    if (count) { +      if (it->second.num_customers() == 0) r.erase(it); +    } +    return count; +  } + +  int IncrementRule(const TRule& rule) { +    RuleModelHash::iterator it = r.find(rule.f_); +    if (it == r.end()) { +      it = r.insert(make_pair(rule.f_, CCRP_NoTable<TRule>(strength))).first; +    }  +    int count = it->second.increment(rule); +    return count; +  } + +  void IncrementRules(const std::vector<TRulePtr>& rules) { +    for (int i = 0; i < rules.size(); ++i) +      IncrementRule(*rules[i]); +  } + +  void DecrementRules(const std::vector<TRulePtr>& rules) { +    for (int i = 0; i < rules.size(); ++i) +      DecrementRule(*rules[i]); +  } + +  prob_t RuleProbability(const TRule& rule) const { +    prob_t p; +    RuleModelHash::const_iterator it = r.find(rule.f_); +    if (it == r.end()) { +      p = rp0(rule.f_, rule.e_); +    } else { +      p = it->second.prob(rule, rp0(rule.f_, rule.e_)); +    } +    return p; +  } + +  double LogLikelihood(const double& dd, const double& aa) const { +    if (aa <= -dd) return -std::numeric_limits<double>::infinity(); +    //double llh = Md::log_beta_density(dd, 10, 3) + Md::log_gamma_density(aa, 1, 1); +    double llh = //Md::log_beta_density(dd, 1, 1) + +                 Md::log_gamma_density(dd + aa, 1, 1); +    typename std::tr1::unordered_map<std::vector<WordID>, CCRP_NoTable<TRule>, boost::hash<std::vector<WordID> > >::const_iterator 
it; +    for (it = r.begin(); it != r.end(); ++it) +      llh += it->second.log_crp_prob(aa); +    return llh; +  } + +  struct AlphaResampler { +    AlphaResampler(const TransliterationChunkConditionalModel& m) : m_(m) {} +    const TransliterationChunkConditionalModel& m_; +    double operator()(const double& proposed_strength) const { +      return m_.LogLikelihood(m_.d, proposed_strength); +    } +  }; + +  void ResampleHyperparameters(MT19937* rng) { +    typename std::tr1::unordered_map<std::vector<WordID>, CCRP_NoTable<TRule>, boost::hash<std::vector<WordID> > >::iterator it; +    //const unsigned nloop = 5; +    const unsigned niterations = 10; +    //DiscountResampler dr(*this); +    AlphaResampler ar(*this); +#if 0 +    for (int iter = 0; iter < nloop; ++iter) { +      strength = slice_sampler1d(ar, strength, *rng, -d + std::numeric_limits<double>::min(), +                              std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations); +      double min_discount = std::numeric_limits<double>::min(); +      if (strength < 0.0) min_discount -= strength; +      d = slice_sampler1d(dr, d, *rng, min_discount, +                          1.0, 0.0, niterations, 100*niterations); +    } +#endif +    strength = slice_sampler1d(ar, strength, *rng, -d, +                            std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations); +    std::cerr << "CTMModel(alpha=" << strength << ") = " << LogLikelihood(d, strength) << std::endl; +    for (it = r.begin(); it != r.end(); ++it) { +#if 0 +      it->second.set_discount(d); +#endif +      it->second.set_alpha(strength); +    } +  } + +  prob_t Likelihood() const { +    prob_t p; p.logeq(LogLikelihood(d, strength)); +    return p; +  } + +  const CondBaseDist& rp0; +  typedef std::tr1::unordered_map<std::vector<WordID>, +                                  CCRP_NoTable<TRule>, +                                  boost::hash<std::vector<WordID> > > RuleModelHash; +  RuleModelHash r; +  double d, strength; +}; + +struct GraphStructure { +  GraphStructure() : r() {} +  // leak memory - these are basically static +  const Reachability* r; +  bool IsReachable() const { return r->nodes > 0; } +}; + +struct ProbabilityEstimates { +  ProbabilityEstimates() : gs(), backward() {} +  explicit ProbabilityEstimates(const GraphStructure& g) : +      gs(&g), backward() { +    if (g.r->nodes > 0) +      backward = new float[g.r->nodes]; +  } +  // leak memory, these are static + +  // returns an estimate of the marginal probability +  double MarginalEstimate() const { +    if (!backward) return 0; +    return backward[0]; +  } + +  // returns an backward estimate +  double Backward(int src_covered, int trg_covered) const { +    if (!backward) return 0; +    int ind = gs->r->node_addresses[src_covered][trg_covered]; +    if (ind < 0) return 0; +    return backward[ind]; +  } + +  prob_t estp; +  float* backward; + private: +  const GraphStructure* gs; +}; + +struct TransliterationsImpl { +  TransliterationsImpl(int max_src, int max_trg, double sr, const BackwardEstimator& b) : +      cp0(max_src, max_trg, sr), +      tccm(cp0), +      be(b), +      kMAX_SRC_CHUNK(max_src), +      kMAX_TRG_CHUNK(max_trg), +      kS2T_RATIO(sr), +      tot_pairs(), tot_mem() { +  } +  const CondBaseDist cp0; +  TransliterationChunkConditionalModel tccm; +  const BackwardEstimator& be; + +  void Initialize(WordID src, const vector<WordID>& src_lets, WordID trg, const vector<WordID>& trg_lets) { +    const size_t src_len = 
src_lets.size(); +    const size_t trg_len = trg_lets.size(); + +    // init graph structure +    if (src_len >= graphs.size()) graphs.resize(src_len + 1); +    if (trg_len >= graphs[src_len].size()) graphs[src_len].resize(trg_len + 1); +    GraphStructure& gs = graphs[src_len][trg_len]; +    if (!gs.r) { +      double rat = exp(fabs(log(trg_len / (src_len * kS2T_RATIO)))); +      if (rat > 1.5 || (rat > 2.4 && src_len < 6)) { +        cerr << " ** Forbidding transliterations of size " << src_len << "," << trg_len << ": " << rat << endl; +        gs.r = new Reachability(src_len, trg_len, 0, 0); +      } else { +        gs.r = new Reachability(src_len, trg_len, kMAX_SRC_CHUNK, kMAX_TRG_CHUNK); +      } +    } + +    const Reachability& r = *gs.r; + +    // init backward estimates +    if (src >= ests.size()) ests.resize(src + 1); +    unordered_map<WordID, ProbabilityEstimates>::iterator it = ests[src].find(trg); +    if (it != ests[src].end()) return; // already initialized + +    it = ests[src].insert(make_pair(trg, ProbabilityEstimates(gs))).first; +    ProbabilityEstimates& est = it->second; +    if (!gs.r->nodes) return;  // not derivable subject to length constraints + +    be.InitializeGrid(src_lets, trg_lets, r, kS2T_RATIO, est.backward); +    cerr << TD::GetString(src_lets) << " ||| " << TD::GetString(trg_lets) << " ||| " << (est.backward[0] / trg_lets.size()) << endl; +    tot_pairs++; +    tot_mem += sizeof(float) * gs.r->nodes; +  } + +  void Forbid(WordID src, const vector<WordID>& src_lets, WordID trg, const vector<WordID>& trg_lets) { +    const size_t src_len = src_lets.size(); +    const size_t trg_len = trg_lets.size(); +    // TODO +  } + +  prob_t EstimateProbability(WordID s, const vector<WordID>& src, WordID t, const vector<WordID>& trg) const { +    assert(src.size() < graphs.size()); +    const vector<GraphStructure>& tv = graphs[src.size()]; +    assert(trg.size() < tv.size()); +    const GraphStructure& gs = tv[trg.size()]; +    if (gs.r->nodes == 0) +      return prob_t::Zero(); +    const unordered_map<WordID, ProbabilityEstimates>::const_iterator it = ests[s].find(t); +    assert(it != ests[s].end()); +    return it->second.estp; +  } + +  void GraphSummary() const { +    double to = 0; +    double tn = 0; +    double tt = 0; +    for (int i = 0; i < graphs.size(); ++i) { +      const vector<GraphStructure>& vt = graphs[i]; +      for (int j = 0; j < vt.size(); ++j) { +        const GraphStructure& gs = vt[j]; +        if (!gs.r) continue; +        tt++; +        for (int k = 0; k < i; ++k) { +          for (int l = 0; l < j; ++l) { +            size_t c = gs.r->valid_deltas[k][l].size(); +            if (c) { +              tn += 1; +              to += c; +            } +          } +        } +      } +    } +    cerr << "     Average nodes = " << (tn / tt) << endl; +    cerr << "Average out-degree = " << (to / tn) << endl; +    cerr << " Unique structures = " << tt << endl; +    cerr << "      Unique pairs = " << tot_pairs << endl; +    cerr << "          BEs size = " << (tot_mem / (1024.0*1024.0)) << " MB" << endl; +  } + +  const int kMAX_SRC_CHUNK; +  const int kMAX_TRG_CHUNK; +  const double kS2T_RATIO; +  unsigned tot_pairs; +  size_t tot_mem; +  vector<vector<GraphStructure> > graphs; // graphs[src_len][trg_len] +  vector<unordered_map<WordID, ProbabilityEstimates> > ests; // ests[src][trg] +}; + +Transliterations::Transliterations(int max_src, int max_trg, double sr, const BackwardEstimator& be) : +    pimpl_(new TransliterationsImpl(max_src, max_trg, 
sr, be)) {} +Transliterations::~Transliterations() { delete pimpl_; } + +void Transliterations::Initialize(WordID src, const vector<WordID>& src_lets, WordID trg, const vector<WordID>& trg_lets) { +  pimpl_->Initialize(src, src_lets, trg, trg_lets); +} + +prob_t Transliterations::EstimateProbability(WordID s, const vector<WordID>& src, WordID t, const vector<WordID>& trg) const { +  return pimpl_->EstimateProbability(s, src,t, trg); +} + +void Transliterations::Forbid(WordID src, const vector<WordID>& src_lets, WordID trg, const vector<WordID>& trg_lets) { +  pimpl_->Forbid(src, src_lets, trg, trg_lets); +} + +void Transliterations::GraphSummary() const { +  pimpl_->GraphSummary(); +} + diff --git a/gi/pf/transliterations.h b/gi/pf/transliterations.h new file mode 100644 index 00000000..49d14684 --- /dev/null +++ b/gi/pf/transliterations.h @@ -0,0 +1,24 @@ +#ifndef _TRANSLITERATIONS_H_ +#define _TRANSLITERATIONS_H_ + +#include <vector> +#include "wordid.h" +#include "prob.h" + +struct BackwardEstimator; +struct TransliterationsImpl; +struct Transliterations { +  // max_src and max_trg indicate how big the transliteration phrases can be +  // see reachability.h for information about filter_ratio +  explicit Transliterations(int max_src, int max_trg, double s2t_rat, const BackwardEstimator& be); +  ~Transliterations(); +  void Initialize(WordID src, const std::vector<WordID>& src_lets, WordID trg, const std::vector<WordID>& trg_lets); +  void Forbid(WordID src, const std::vector<WordID>& src_lets, WordID trg, const std::vector<WordID>& trg_lets); +  void GraphSummary() const; +  prob_t EstimateProbability(WordID s, const std::vector<WordID>& src, WordID t, const std::vector<WordID>& trg) const; + private: +  TransliterationsImpl* pimpl_; +}; + +#endif + diff --git a/gi/pf/unigrams.cc b/gi/pf/unigrams.cc new file mode 100644 index 00000000..40829775 --- /dev/null +++ b/gi/pf/unigrams.cc @@ -0,0 +1,80 @@ +#include "unigrams.h" + +#include <string> +#include <cmath> + +#include "stringlib.h" +#include "filelib.h" + +using namespace std; + +void UnigramModel::LoadUnigrams(const string& fname) { +  cerr << "Loading unigram probabilities from " << fname << " ..." << endl; +  ReadFile rf(fname); +  string line; +  istream& in = *rf.stream(); +  assert(in); +  getline(in, line); +  assert(line.empty()); +  getline(in, line); +  assert(line == "\\data\\"); +  getline(in, line); +  size_t pos = line.find("ngram 1="); +  assert(pos == 0); +  assert(line.size() > 8); +  const size_t num_unigrams = atoi(&line[8]); +  getline(in, line); +  assert(line.empty()); +  getline(in, line); +  assert(line == "\\1-grams:"); +  for (size_t i = 0; i < num_unigrams; ++i) { +    getline(in, line); +    assert(line.size() > 0); +    pos = line.find('\t'); +    assert(pos > 0); +    assert(pos + 1 < line.size()); +    const WordID w = TD::Convert(line.substr(pos + 1)); +    line[pos] = 0; +    float p = atof(&line[0]); +    if (w < probs_.size()) probs_[w].logeq(p * log(10)); else cerr << "WARNING: don't know about '" << TD::Convert(w) << "'\n"; +  } +} + +void UnigramWordModel::LoadUnigrams(const string& fname) { +  cerr << "Loading unigram probabilities from " << fname << " ..." 
<< endl; +  ReadFile rf(fname); +  string line; +  istream& in = *rf.stream(); +  assert(in); +  getline(in, line); +  assert(line.empty()); +  getline(in, line); +  assert(line == "\\data\\"); +  getline(in, line); +  size_t pos = line.find("ngram 1="); +  assert(pos == 0); +  assert(line.size() > 8); +  const size_t num_unigrams = atoi(&line[8]); +  getline(in, line); +  assert(line.empty()); +  getline(in, line); +  assert(line == "\\1-grams:"); +  for (size_t i = 0; i < num_unigrams; ++i) { +    getline(in, line); +    assert(line.size() > 0); +    pos = line.find('\t'); +    assert(pos > 0); +    assert(pos + 1 < line.size()); +    size_t cur = pos + 1; +    vector<WordID> w; +    while (cur < line.size()) { +      const size_t len = UTF8Len(line[cur]); +      w.push_back(TD::Convert(line.substr(cur, len))); +      cur += len; +    } +    line[pos] = 0; +    float p = atof(&line[0]); +    probs_[w].logeq(p * log(10.0)); +  } +} + diff --git a/gi/pf/unigrams.h b/gi/pf/unigrams.h new file mode 100644 index 00000000..1660d1ed --- /dev/null +++ b/gi/pf/unigrams.h @@ -0,0 +1,69 @@ +#ifndef _UNIGRAMS_H_ +#define _UNIGRAMS_H_ + +#include <vector> +#include <string> +#include <tr1/unordered_map> +#include <boost/functional.hpp> + +#include "wordid.h" +#include "prob.h" +#include "tdict.h" + +struct UnigramModel { +  explicit UnigramModel(const std::string& fname, unsigned vocab_size) : +      use_uniform_(fname.size() == 0), +      uniform_(1.0 / vocab_size), +      probs_() { +    if (fname.size() > 0) { +      probs_.resize(TD::NumWords() + 1); +      LoadUnigrams(fname); +    } +  } + +  const prob_t& operator()(const WordID& w) const { +    assert(w); +    if (use_uniform_) return uniform_; +    return probs_[w]; +  } + + private: +  void LoadUnigrams(const std::string& fname); + +  const bool use_uniform_; +  const prob_t uniform_; +  std::vector<prob_t> probs_; +}; + + +// reads an ARPA unigram file and converts words like 'cat' into a string 'c a t' +struct UnigramWordModel { +  explicit UnigramWordModel(const std::string& fname) : +      use_uniform_(false), +      uniform_(1.0), +      probs_() { +    LoadUnigrams(fname); +  } + +  explicit UnigramWordModel(const unsigned vocab_size) : +      use_uniform_(true), +      uniform_(1.0 / vocab_size), +      probs_() {} + +  const prob_t& operator()(const std::vector<WordID>& s) const { +    if (use_uniform_) return uniform_; +    const VectorProbHash::const_iterator it = probs_.find(s); +    assert(it != probs_.end()); +    return it->second; +  } + + private: +  void LoadUnigrams(const std::string& fname); + +  const bool use_uniform_; +  const prob_t uniform_; +  typedef std::tr1::unordered_map<std::vector<WordID>, prob_t, boost::hash<std::vector<WordID> > > VectorProbHash; +  VectorProbHash probs_; +}; + +#endif | 
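The core of the new quasi_model2.h above is a diagonal-biased distortion model: target position j prefers source positions a_j close to the diagonal, alpha controls how sharp that preference is, and a_j = 0 is the NULL word, which keeps a fixed probability p_null. The following is a minimal standalone sketch of the probability the class computes, not part of the patch: it uses a hypothetical free function name, plain doubles instead of prob_t, and omits the Z cache that GetOrComputeZ maintains.

#include <cmath>
#include <cassert>

// p(a_j | j, src_len, trg_len); a_j = 0 means NULL, src_len excludes NULL.
double QuasiModel2Prob(unsigned a_j, unsigned j, unsigned src_len,
                       unsigned trg_len, double alpha, double p_null) {
  if (a_j == 0) return p_null;            // NULL alignment gets fixed mass
  assert(a_j <= src_len && j < trg_len);
  double z = 0.0, u = 0.0;
  for (unsigned k = 1; k <= src_len; ++k) {
    // distance of source position k from the diagonal for target position j
    const double d = std::fabs(double(k - 1) / src_len - double(j) / trg_len);
    const double w = std::exp(-alpha * d);
    z += w;                                // normalizer over non-NULL positions
    if (k == a_j) u = w;                   // unnormalized weight of the chosen position
  }
  return (1.0 - p_null) * u / z;           // remaining mass, renormalized
}

For intuition, with alpha = 4 and src_len = trg_len = 10, each step away from the diagonal multiplies the unnormalized weight by exp(-0.4), roughly 0.67, so most of the non-NULL mass concentrates on source positions near the diagonal.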

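The loaders in the new unigrams.cc (UnigramModel::LoadUnigrams and UnigramWordModel::LoadUnigrams) accept only a narrow ARPA-style unigram section. Judging from the asserts, the file must begin with a blank line, then \data\, an "ngram 1=N" count line, another blank line, the \1-grams: header, and N entries consisting of a base-10 log probability, a tab, and the word; everything after the first tab is taken as the word, so entries must not carry a backoff column. UnigramWordModel additionally splits each word into its UTF-8 letters, so 'cat' is keyed as the sequence c a t. A hypothetical three-word file the parser would accept (leading blank line omitted here for readability, words and probabilities invented):

\data\
ngram 1=3

\1-grams:
-1.2041	cat
-0.9031	dog
-1.5051	fish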