Diffstat (limited to 'gi')
 -rw-r--r--  gi/pf/Makefile.am          |   4 +-
 -rw-r--r--  gi/pf/align-lexonly-pyp.cc | 327 ++++++++++++++++++++++++++++++++
 -rw-r--r--  gi/pf/base_measures.cc     |  47 +++++
 -rw-r--r--  gi/pf/base_measures.h      |  18 ++
 -rw-r--r--  gi/pf/conditional_pseg.h   |  74 ++++++
 5 files changed, 469 insertions(+), 1 deletion(-)
diff --git a/gi/pf/Makefile.am b/gi/pf/Makefile.am
index 7c8e89d0..28367e67 100644
--- a/gi/pf/Makefile.am
+++ b/gi/pf/Makefile.am
@@ -1,10 +1,12 @@
-bin_PROGRAMS = cbgi brat dpnaive pfbrat pfdist itg pfnaive condnaive align-lexonly
+bin_PROGRAMS = cbgi brat dpnaive pfbrat pfdist itg pfnaive condnaive align-lexonly align-lexonly-pyp
 
 noinst_LIBRARIES = libpf.a
 libpf_a_SOURCES = base_measures.cc reachability.cc cfg_wfst_composer.cc corpus.cc unigrams.cc ngram_base.cc
 
 align_lexonly_SOURCES = align-lexonly.cc
+align_lexonly_pyp_SOURCES = align-lexonly-pyp.cc
+
 
 itg_SOURCES = itg.cc
 
 condnaive_SOURCES = condnaive.cc
diff --git a/gi/pf/align-lexonly-pyp.cc b/gi/pf/align-lexonly-pyp.cc
new file mode 100644
index 00000000..d2630a2b
--- /dev/null
+++ b/gi/pf/align-lexonly-pyp.cc
@@ -0,0 +1,327 @@
+#include <iostream>
+#include <tr1/memory>
+#include <queue>
+
+#include <boost/multi_array.hpp>
+#include <boost/program_options.hpp>
+#include <boost/program_options/variables_map.hpp>
+
+#include "array2d.h"
+#include "base_measures.h"
+#include "monotonic_pseg.h"
+#include "conditional_pseg.h"
+#include "trule.h"
+#include "tdict.h"
+#include "stringlib.h"
+#include "filelib.h"
+#include "dict.h"
+#include "sampler.h"
+#include "mfcr.h"
+#include "corpus.h"
+#include "ngram_base.h"
+
+using namespace std;
+using namespace tr1;
+namespace po = boost::program_options;
+
+void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
+  po::options_description opts("Configuration options");
+  opts.add_options()
+        ("samples,s",po::value<unsigned>()->default_value(1000),"Number of samples")
+        ("input,i",po::value<string>(),"Read parallel data from")
+        ("random_seed,S",po::value<uint32_t>(), "Random seed");
+  po::options_description clo("Command line options");
+  clo.add_options()
+        ("config", po::value<string>(), "Configuration file")
+        ("help,h", "Print this help message and exit");
+  po::options_description dconfig_options, dcmdline_options;
+  dconfig_options.add(opts);
+  dcmdline_options.add(opts).add(clo);
+
+  po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
+  if (conf->count("config")) {
+    ifstream config((*conf)["config"].as<string>().c_str());
+    po::store(po::parse_config_file(config, dconfig_options), *conf);
+  }
+  po::notify(*conf);
+
+  if (conf->count("help") || (conf->count("input") == 0)) {
+    cerr << dcmdline_options << endl;
+    exit(1);
+  }
+}
+
+shared_ptr<MT19937> prng;
+
+struct LexicalAlignment {
+  unsigned char src_index;
+  bool is_transliteration;
+  vector<pair<short, short> > derivation;
+};
+
+struct AlignedSentencePair {
+  vector<WordID> src;
+  vector<WordID> trg;
+  vector<LexicalAlignment> a;
+  Array2D<short> posterior;
+};
+
+struct HierarchicalWordBase {
+  explicit HierarchicalWordBase(const unsigned vocab_e_size) :
+      base(prob_t::One()), r(1,1,1,25,25), u0(-log(vocab_e_size)), l(1,1.0), v(1, 0.0) {}
+
+  void ResampleHyperparameters(MT19937* rng) {
+    r.resample_hyperparameters(rng);
+  }
+
+  inline double logp0(const vector<WordID>& s) const {
+    return s.size() * u0;
+  }
+
+  // return p0 of rule.e_
+  prob_t operator()(const TRule& rule) const {
+    v[0] = exp(logp0(rule.e_));
+    return prob_t(r.prob(rule.e_, v, l));
+  }
+
+  void Increment(const TRule& rule) {
+    v[0] = exp(logp0(rule.e_));
+    if (r.increment(rule.e_, v, l, &*prng).count) {
+      base *= prob_t(v[0] * l[0]);
+    }
+  }
+
+  void Decrement(const TRule& rule) {
+    if (r.decrement(rule.e_, &*prng).count) {
+      base /= prob_t(exp(logp0(rule.e_)));
+    }
+  }
+
+  prob_t Likelihood() const {
+    prob_t p; p.logeq(r.log_crp_prob());
+    p *= base;
+    return p;
+  }
+
+  void Summary() const {
+    cerr << "NUMBER OF CUSTOMERS: " << r.num_customers() << "  (d=" << r.d() << ",\\alpha=" << r.alpha() << ')' << endl;
+    for (MFCR<vector<WordID> >::const_iterator it = r.begin(); it != r.end(); ++it)
+      cerr << "   " << it->second.total_dish_count_ << " (on " << it->second.table_counts_.size() << " tables)" << TD::GetString(it->first) << endl;
+  }
+
+  prob_t base;
+  MFCR<vector<WordID> > r;
+  const double u0;
+  const vector<double> l;
+  mutable vector<double> v;
+};
+
+struct BasicLexicalAlignment {
+  explicit BasicLexicalAlignment(const vector<vector<WordID> >& lets,
+                                 const unsigned words_e,
+                                 const unsigned letters_e,
+                                 vector<AlignedSentencePair>* corp) :
+      letters(lets),
+      corpus(*corp),
+      //up0(words_e),
+      //up0("en.chars.1gram", letters_e),
+      //up0("en.words.1gram"),
+      up0(letters_e),
+      //up0("en.chars.2gram"),
+      tmodel(up0) {
+  }
+
+  void InstantiateRule(const WordID src,
+                       const WordID trg,
+                       TRule* rule) const {
+    static const WordID kX = TD::Convert("X") * -1;
+    rule->lhs_ = kX;
+    rule->e_ = letters[trg];
+    rule->f_ = letters[src];
+  }
+
+  void InitializeRandom() {
+    const WordID kNULL = TD::Convert("NULL");
+    cerr << "Initializing with random alignments ...\n";
+    for (unsigned i = 0; i < corpus.size(); ++i) {
+      AlignedSentencePair& asp = corpus[i];
+      asp.a.resize(asp.trg.size());
+      for (unsigned j = 0; j < asp.trg.size(); ++j) {
+        const unsigned char a_j = prng->next() * (1 + asp.src.size());
+        const WordID f_a_j = (a_j ? asp.src[a_j - 1] : kNULL);
+        TRule r;
+        InstantiateRule(f_a_j, asp.trg[j], &r);
+        asp.a[j].is_transliteration = false;
+        asp.a[j].src_index = a_j;
+        if (tmodel.IncrementRule(r, &*prng))
+          up0.Increment(r);
+      }
+    }
+    cerr << "  LLH = " << Likelihood() << endl;
+  }
+
+  prob_t Likelihood() const {
+    prob_t p = tmodel.Likelihood();
+    p *= up0.Likelihood();
+    return p;
+  }
+
+  void ResampleHyperparameters() {
+    cerr << "  LLH_prev = " << Likelihood() << flush;
+    tmodel.ResampleHyperparameters(&*prng);
+    up0.ResampleHyperparameters(&*prng);
+    cerr << "\tLLH_post = " << Likelihood() << endl;
+  }
+
+  void ResampleCorpus();
+
+  const vector<vector<WordID> >& letters; // spelling dictionary
+  vector<AlignedSentencePair>& corpus;
+  //PhraseConditionalUninformativeBase up0;
+  //PhraseConditionalUninformativeUnigramBase up0;
+  //UnigramWordBase up0;
+  //HierarchicalUnigramBase up0;
+  HierarchicalWordBase up0;
+  //CompletelyUniformBase up0;
+  //FixedNgramBase up0;
+  //ConditionalTranslationModel<PhraseConditionalUninformativeBase> tmodel;
+  //ConditionalTranslationModel<PhraseConditionalUninformativeUnigramBase> tmodel;
+  //ConditionalTranslationModel<UnigramWordBase> tmodel;
+  //ConditionalTranslationModel<HierarchicalUnigramBase> tmodel;
+  MConditionalTranslationModel<HierarchicalWordBase> tmodel;
+  //ConditionalTranslationModel<FixedNgramBase> tmodel;
+  //ConditionalTranslationModel<CompletelyUniformBase> tmodel;
+};
+
+void BasicLexicalAlignment::ResampleCorpus() {
+  static const WordID kNULL = TD::Convert("NULL");
+  for (unsigned i = 0; i < corpus.size(); ++i) {
+    AlignedSentencePair& asp = corpus[i];
+    SampleSet<prob_t> ss; ss.resize(asp.src.size() + 1);
+    for (unsigned j = 0; j < asp.trg.size(); ++j) {
+      TRule r;
+      unsigned char& a_j = asp.a[j].src_index;
+      WordID f_a_j = (a_j ? asp.src[a_j - 1] : kNULL);
+      InstantiateRule(f_a_j, asp.trg[j], &r);
+      if (tmodel.DecrementRule(r, &*prng))
+        up0.Decrement(r);
+
+      for (unsigned prop_a_j = 0; prop_a_j <= asp.src.size(); ++prop_a_j) {
+        const WordID prop_f = (prop_a_j ? asp.src[prop_a_j - 1] : kNULL);
+        InstantiateRule(prop_f, asp.trg[j], &r);
+        ss[prop_a_j] = tmodel.RuleProbability(r);
+      }
+      a_j = prng->SelectSample(ss);
+      f_a_j = (a_j ? asp.src[a_j - 1] : kNULL);
+      InstantiateRule(f_a_j, asp.trg[j], &r);
+      if (tmodel.IncrementRule(r, &*prng))
+        up0.Increment(r);
+    }
+  }
+  cerr << "  LLH = " << tmodel.Likelihood() << endl;
+}
+
+void ExtractLetters(const set<WordID>& v, vector<vector<WordID> >* l, set<WordID>* letset = NULL) {
+  for (set<WordID>::const_iterator it = v.begin(); it != v.end(); ++it) {
+    vector<WordID>& letters = (*l)[*it];
+    if (letters.size()) continue;   // if e and f have the same word
+
+    const string& w = TD::Convert(*it);
+
+    size_t cur = 0;
+    while (cur < w.size()) {
+      const size_t len = UTF8Len(w[cur]);
+      letters.push_back(TD::Convert(w.substr(cur, len)));
+      if (letset) letset->insert(letters.back());
+      cur += len;
+    }
+  }
+}
+
+void Debug(const AlignedSentencePair& asp) {
+  cerr << TD::GetString(asp.src) << endl << TD::GetString(asp.trg) << endl;
+  Array2D<bool> a(asp.src.size(), asp.trg.size());
+  for (unsigned j = 0; j < asp.trg.size(); ++j)
+    if (asp.a[j].src_index) a(asp.a[j].src_index - 1, j) = true;
+  cerr << a << endl;
+}
+
+void AddSample(AlignedSentencePair* asp) {
+  for (unsigned j = 0; j < asp->trg.size(); ++j)
+    asp->posterior(asp->a[j].src_index, j)++;
+}
+
+void WriteAlignments(const AlignedSentencePair& asp) {
+  bool first = true;
+  for (unsigned j = 0; j < asp.trg.size(); ++j) {
+    int src_index = -1;
+    int mc = -1;
+    for (unsigned i = 0; i <= asp.src.size(); ++i) {
+      if (asp.posterior(i, j) > mc) {
+        mc = asp.posterior(i, j);
+        src_index = i;
+      }
+    }
+
+    if (src_index) {
+      if (first) first = false; else cout << ' ';
+      cout << (src_index - 1) << '-' << j;
+    }
+  }
+  cout << endl;
+}
+
+int main(int argc, char** argv) {
+  po::variables_map conf;
+  InitCommandLine(argc, argv, &conf);
+
+  if (conf.count("random_seed"))
+    prng.reset(new MT19937(conf["random_seed"].as<uint32_t>()));
+  else
+    prng.reset(new MT19937);
+//  MT19937& rng = *prng;
+
+  vector<vector<int> > corpuse, corpusf;
+  set<int> vocabe, vocabf;
+  corpus::ReadParallelCorpus(conf["input"].as<string>(), &corpusf, &corpuse, &vocabf, &vocabe);
+  cerr << "f-Corpus size: " << corpusf.size() << " sentences\n";
+  cerr << "f-Vocabulary size: " << vocabf.size() << " types\n";
+  cerr << "e-Corpus size: " << corpuse.size() << " sentences\n";
+  cerr << "e-Vocabulary size: " << vocabe.size() << " types\n";
+  assert(corpusf.size() == corpuse.size());
+
+  vector<AlignedSentencePair> corpus(corpuse.size());
+  for (unsigned i = 0; i < corpuse.size(); ++i) {
+    corpus[i].src.swap(corpusf[i]);
+    corpus[i].trg.swap(corpuse[i]);
+    corpus[i].posterior.resize(corpus[i].src.size() + 1, corpus[i].trg.size());
+  }
+  corpusf.clear(); corpuse.clear();
+
+  vocabf.insert(TD::Convert("NULL"));
+  vector<vector<WordID> > letters(TD::NumWords());
+  set<WordID> letset;
+  ExtractLetters(vocabe, &letters, &letset);
+  ExtractLetters(vocabf, &letters, NULL);
+  letters[TD::Convert("NULL")].clear();
+
+  BasicLexicalAlignment x(letters, vocabe.size(), letset.size(), &corpus);
+  x.InitializeRandom();
+  const unsigned samples = conf["samples"].as<unsigned>();
+  for (unsigned i = 0; i < samples; ++i) {
+    for (int j = 65; j < 67; ++j) Debug(corpus[j]);
+    cerr << i << "\t" << x.tmodel.r.size() << "\t";
+    if (i % 10 == 0) x.ResampleHyperparameters();
+    x.ResampleCorpus();
+    if (i > (samples / 5) && (i % 10 == 9)) for (unsigned j = 0; j < corpus.size(); ++j) AddSample(&corpus[j]);
+  }
+  for (unsigned i = 0; i < corpus.size(); ++i)
+    WriteAlignments(corpus[i]);
+  //ModelAndData posterior(x, &corpus, vocabe, vocabf);
+  x.tmodel.Summary();
+  x.up0.Summary();
+
+  //posterior.Sample();
+
+  return 0;
+}
diff --git a/gi/pf/base_measures.cc b/gi/pf/base_measures.cc
index 97b4e698..7894d3e7 100644
--- a/gi/pf/base_measures.cc
+++ b/gi/pf/base_measures.cc
@@ -6,6 +6,53 @@
 
 using namespace std;
+TableLookupBase::TableLookupBase(const string& fname) {
+  cerr << "TableLookupBase reading from " << fname << " ..." << endl;
+  ReadFile rf(fname);
+  istream& in = *rf.stream();
+  string line;
+  unsigned lc = 0;
+  const WordID kDIV = TD::Convert("|||");
+  vector<WordID> tmp;
+  vector<int> le, lf;
+  TRule x;
+  x.lhs_ = -TD::Convert("X");
+  bool flag = false;
+  while(getline(in, line)) {
+    ++lc;
+    if (lc % 1000000 == 0) { cerr << " [" << lc << ']' << endl; flag = false; }
+    else if (lc % 25000 == 0) { cerr << '.' << flush; flag = true; }
+    tmp.clear();
+    TD::ConvertSentence(line, &tmp);
+    x.f_.clear();
+    x.e_.clear();
+    size_t pos = 0;
+    int cc = 0;
+    while(pos < tmp.size()) {
+      const WordID cur = tmp[pos++];
+      if (cur == kDIV) {
+        ++cc;
+      } else if (cc == 0) {
+        x.f_.push_back(cur);
+      } else if (cc == 1) {
+        x.e_.push_back(cur);
+      } else if (cc == 2) {
+        table[x] = atof(TD::Convert(cur));
+        ++cc;
+      } else {
+        if (flag) cerr << endl;
+        cerr << "Bad format in " << lc << ": " << line << endl; abort();
+      }
+    }
+    if (cc != 3) {
+      if (flag) cerr << endl;
+      cerr << "Bad format in " << lc << ": " << line << endl; abort();
+    }
+  }
+  if (flag) cerr << endl;
+  cerr << " read " << lc << " entries\n";
+}
+
 
 prob_t PhraseConditionalUninformativeUnigramBase::p0(const vector<WordID>& vsrc,
                                                      const vector<WordID>& vtrg,
                                                      int start_src, int start_trg) const {
diff --git a/gi/pf/base_measures.h b/gi/pf/base_measures.h
index a4e9ac28..7214aa22 100644
--- a/gi/pf/base_measures.h
+++ b/gi/pf/base_measures.h
@@ -72,6 +72,24 @@ struct UnigramWordBase {
   const UnigramWordModel un;
 };
 
+struct RuleHasher {
+  size_t operator()(const TRule& r) const {
+    return hash_value(r);
+  }
+};
+
+struct TableLookupBase {
+  TableLookupBase(const std::string& fname);
+
+  prob_t operator()(const TRule& rule) const {
+    const std::tr1::unordered_map<TRule,prob_t,RuleHasher>::const_iterator it = table.find(rule);
+    assert(it != table.end());
+    return it->second;
+  }
+
+  std::tr1::unordered_map<TRule,prob_t,RuleHasher> table;
+};
+
 struct PhraseConditionalUninformativeBase {
   explicit PhraseConditionalUninformativeBase(const unsigned vocab_e_size) :
       kUNIFORM_TARGET(1.0 / vocab_e_size) {
diff --git a/gi/pf/conditional_pseg.h b/gi/pf/conditional_pseg.h
index edcdc813..db951d15 100644
--- a/gi/pf/conditional_pseg.h
+++ b/gi/pf/conditional_pseg.h
@@ -8,11 +8,85 @@
 
 #include "prob.h"
 #include "ccrp_nt.h"
+#include "mfcr.h"
 #include "trule.h"
 #include "base_measures.h"
 #include "tdict.h"
 
 template <typename ConditionalBaseMeasure>
+struct MConditionalTranslationModel {
+  explicit MConditionalTranslationModel(ConditionalBaseMeasure& rcp0) :
+    rp0(rcp0), lambdas(1, 1.0), p0s(1) {}
+
+  void Summary() const {
+    std::cerr << "Number of conditioning contexts: " << r.size() << std::endl;
+    for (RuleModelHash::const_iterator it = r.begin(); it != r.end(); ++it) {
+      std::cerr << TD::GetString(it->first) << "   \t(d=" << it->second.d() << ",\\alpha = " << it->second.alpha() << ") --------------------------" << std::endl;
+      for (MFCR<TRule>::const_iterator i2 = it->second.begin(); i2 != it->second.end(); ++i2)
+        std::cerr << "   " << -1 << '\t' << i2->first << std::endl;
+    }
+  }
+
+  void ResampleHyperparameters(MT19937* rng) {
+    for (RuleModelHash::iterator it = r.begin(); it != r.end(); ++it)
+      it->second.resample_hyperparameters(rng);
+  }
+
+  int DecrementRule(const TRule& rule, MT19937* rng) {
+    RuleModelHash::iterator it = r.find(rule.f_);
+    assert(it != r.end());
+    const TableCount delta = it->second.decrement(rule, rng);
+    if (delta.count) {
+      if (it->second.num_customers() == 0) r.erase(it);
+    }
+    return delta.count;
+  }
+
+  int IncrementRule(const TRule& rule, MT19937* rng) {
+    RuleModelHash::iterator it = r.find(rule.f_);
+    if (it == r.end()) {
+      it = r.insert(make_pair(rule.f_, MFCR<TRule>(1, 1.0, 1.0, 1.0, 1.0, 1e-9, 4.0))).first;
+    }
+    p0s[0] = rp0(rule).as_float();
+    TableCount delta = it->second.increment(rule, p0s, lambdas, rng);
+    return delta.count;
+  }
+
+  prob_t RuleProbability(const TRule& rule) const {
+    prob_t p;
+    RuleModelHash::const_iterator it = r.find(rule.f_);
+    if (it == r.end()) {
+      p.logeq(log(rp0(rule)));
+    } else {
+      p0s[0] = rp0(rule).as_float();
+      p = prob_t(it->second.prob(rule, p0s, lambdas));
+    }
+    return p;
+  }
+
+  prob_t Likelihood() const {
+    prob_t p = prob_t::One();
+#if 0
+    for (RuleModelHash::const_iterator it = r.begin(); it != r.end(); ++it) {
+      prob_t q; q.logeq(it->second.log_crp_prob());
+      p *= q;
+      for (CCRP_NoTable<TRule>::const_iterator i2 = it->second.begin(); i2 != it->second.end(); ++i2)
+        p *= rp0(i2->first);
+    }
+#endif
+    return p;
+  }
+
+  const ConditionalBaseMeasure& rp0;
+  typedef std::tr1::unordered_map<std::vector<WordID>,
+                                  MFCR<TRule>,
+                                  boost::hash<std::vector<WordID> > > RuleModelHash;
+  RuleModelHash r;
+  std::vector<double> lambdas;
+  mutable std::vector<double> p0s;
+};
+
+template <typename ConditionalBaseMeasure>
 struct ConditionalTranslationModel {
   explicit ConditionalTranslationModel(ConditionalBaseMeasure& rcp0) :
     rp0(rcp0) {}
