diff options
| author | Chris Dyer <cdyer@cs.cmu.edu> | 2012-10-11 14:06:32 -0400 | 
|---|---|---|
| committer | Chris Dyer <cdyer@cs.cmu.edu> | 2012-10-11 14:06:32 -0400 | 
| commit | 07ea7b64b6f85e5798a8068453ed9fd2b97396db (patch) | |
| tree | 644496a1690d84d82a396bbc1e39160788beb2cd /gi/markov_al | |
| parent | 37b9e45e5cb29d708f7249dbe0b0fb27685282a0 (diff) | |
| parent | a36fcc5d55c1de84ae68c1091ebff2b1c32dc3b7 (diff) | |
Merge branch 'master' of https://github.com/redpony/cdec
Diffstat (limited to 'gi/markov_al')
| -rw-r--r-- | gi/markov_al/Makefile.am | 6 | ||||
| -rw-r--r-- | gi/markov_al/README | 2 | ||||
| -rw-r--r-- | gi/markov_al/ml.cc | 470 | 
3 files changed, 0 insertions, 478 deletions
| diff --git a/gi/markov_al/Makefile.am b/gi/markov_al/Makefile.am deleted file mode 100644 index fe3e3349..00000000 --- a/gi/markov_al/Makefile.am +++ /dev/null @@ -1,6 +0,0 @@ -bin_PROGRAMS = ml - -ml_SOURCES = ml.cc - -AM_CPPFLAGS = -W -Wall -Wno-sign-compare -funroll-loops -I$(top_srcdir)/utils $(GTEST_CPPFLAGS) -I$(top_srcdir)/decoder -AM_LDFLAGS = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/utils/libutils.a -lz diff --git a/gi/markov_al/README b/gi/markov_al/README deleted file mode 100644 index 9c10f7cd..00000000 --- a/gi/markov_al/README +++ /dev/null @@ -1,2 +0,0 @@ -Experimental translation models with Markovian dependencies. - diff --git a/gi/markov_al/ml.cc b/gi/markov_al/ml.cc deleted file mode 100644 index 1e71edd6..00000000 --- a/gi/markov_al/ml.cc +++ /dev/null @@ -1,470 +0,0 @@ -#include <iostream> -#include <tr1/unordered_map> - -#include <boost/shared_ptr.hpp> -#include <boost/functional.hpp> -#include <boost/program_options.hpp> -#include <boost/program_options/variables_map.hpp> - -#include "tdict.h" -#include "filelib.h" -#include "sampler.h" -#include "ccrp_onetable.h" -#include "array2d.h" - -using namespace std; -using namespace std::tr1; -namespace po = boost::program_options; - -void PrintTopCustomers(const CCRP_OneTable<WordID>& crp) { -  for (CCRP_OneTable<WordID>::const_iterator it = crp.begin(); it != crp.end(); ++it) { -    cerr << "  " << TD::Convert(it->first) << " = " << it->second << endl; -  } -} - -void PrintAlignment(const vector<WordID>& src, const vector<WordID>& trg, const vector<unsigned char>& a) { -  cerr << TD::GetString(src) << endl << TD::GetString(trg) << endl; -  Array2D<bool> al(src.size(), trg.size()); -  for (int i = 0; i < a.size(); ++i) -    if (a[i] != 255) al(a[i], i) = true; -  cerr << al << endl; -} - -void InitCommandLine(int argc, char** argv, po::variables_map* conf) { -  po::options_description opts("Configuration options"); -  opts.add_options() -        ("samples,s",po::value<unsigned>()->default_value(1000),"Number of samples") -        ("input,i",po::value<string>(),"Read parallel data from") -        ("random_seed,S",po::value<uint32_t>(), "Random seed"); -  po::options_description clo("Command line options"); -  clo.add_options() -        ("config", po::value<string>(), "Configuration file") -        ("help,h", "Print this help message and exit"); -  po::options_description dconfig_options, dcmdline_options; -  dconfig_options.add(opts); -  dcmdline_options.add(opts).add(clo); -   -  po::store(parse_command_line(argc, argv, dcmdline_options), *conf); -  if (conf->count("config")) { -    ifstream config((*conf)["config"].as<string>().c_str()); -    po::store(po::parse_config_file(config, dconfig_options), *conf); -  } -  po::notify(*conf); - -  if (conf->count("help") || (conf->count("input") == 0)) { -    cerr << dcmdline_options << endl; -    exit(1); -  } -} - -struct Unigram; -struct Bigram { -  Bigram() : trg(), cond() {} -  Bigram(WordID prev, WordID cur, WordID t) : trg(t) { cond.first = prev; cond.second = cur; } -  const pair<WordID,WordID>& ConditioningPair() const { -    return cond; -  } -  WordID& prev_src() { return cond.first; } -  WordID& cur_src() { return cond.second; } -  const WordID& prev_src() const { return cond.first; } -  const WordID& cur_src() const { return cond.second; } -  WordID trg; - private: -  pair<WordID, WordID> cond; -}; - -struct Unigram { -  Unigram() : cur_src(), trg() {} -  Unigram(WordID s, WordID t) : cur_src(s), trg(t) {} -  WordID cur_src; -  WordID trg; -}; - -ostream& operator<<(ostream& os, const Bigram& b) { -  os << "( " << TD::Convert(b.trg) << " | " << TD::Convert(b.prev_src()) << " , " << TD::Convert(b.cur_src()) << " )"; -  return os; -} - -ostream& operator<<(ostream& os, const Unigram& u) { -  os << "( " << TD::Convert(u.trg) << " | " << TD::Convert(u.cur_src) << " )"; -  return os; -} - -bool operator==(const Bigram& a, const Bigram& b) { -  return a.trg == b.trg && a.cur_src() == b.cur_src() && a.prev_src() == b.prev_src(); -} - -bool operator==(const Unigram& a, const Unigram& b) { -  return a.trg == b.trg && a.cur_src == b.cur_src; -} - -size_t hash_value(const Bigram& b) { -  size_t h = boost::hash_value(b.prev_src()); -  boost::hash_combine(h, boost::hash_value(b.cur_src())); -  boost::hash_combine(h, boost::hash_value(b.trg)); -  return h; -} - -size_t hash_value(const Unigram& u) { -  size_t h = boost::hash_value(u.cur_src); -  boost::hash_combine(h, boost::hash_value(u.trg)); -  return h; -} - -void ReadParallelCorpus(const string& filename, -                vector<vector<WordID> >* f, -                vector<vector<WordID> >* e, -                set<WordID>* vocab_f, -                set<WordID>* vocab_e) { -  f->clear(); -  e->clear(); -  vocab_f->clear(); -  vocab_e->clear(); -  istream* in; -  if (filename == "-") -    in = &cin; -  else -    in = new ifstream(filename.c_str()); -  assert(*in); -  string line; -  const WordID kDIV = TD::Convert("|||"); -  vector<WordID> tmp; -  while(*in) { -    getline(*in, line); -    if (line.empty() && !*in) break; -    e->push_back(vector<int>()); -    f->push_back(vector<int>()); -    vector<int>& le = e->back(); -    vector<int>& lf = f->back(); -    tmp.clear(); -    TD::ConvertSentence(line, &tmp); -    bool isf = true; -    for (unsigned i = 0; i < tmp.size(); ++i) { -      const int cur = tmp[i]; -      if (isf) { -        if (kDIV == cur) { isf = false; } else { -          lf.push_back(cur); -          vocab_f->insert(cur); -        } -      } else { -        assert(cur != kDIV); -        le.push_back(cur); -        vocab_e->insert(cur); -      } -    } -    assert(isf == false); -  } -  if (in != &cin) delete in; -} - -struct UnigramModel { -  UnigramModel(size_t src_voc_size, size_t trg_voc_size) : -    unigrams(TD::NumWords() + 1, CCRP_OneTable<WordID>(1,1,1,1)), -    p0(1.0 / trg_voc_size) {} - -  void increment(const Bigram& b) { -    unigrams[b.cur_src()].increment(b.trg); -  } - -  void decrement(const Bigram& b) { -    unigrams[b.cur_src()].decrement(b.trg); -  } - -  double prob(const Bigram& b) const { -    const double q0 = unigrams[b.cur_src()].prob(b.trg, p0); -    return q0; -  } - -  double LogLikelihood() const { -    double llh = 0; -    for (unsigned i = 0; i < unigrams.size(); ++i) { -      const CCRP_OneTable<WordID>& crp = unigrams[i]; -      if (crp.num_customers() > 0) { -        llh += crp.log_crp_prob(); -        llh += crp.num_tables() * log(p0); -      } -    } -    return llh; -  } - -  void ResampleHyperparameters(MT19937* rng) { -    for (unsigned i = 0; i < unigrams.size(); ++i) -      unigrams[i].resample_hyperparameters(rng); -  } - -  vector<CCRP_OneTable<WordID> > unigrams;  // unigrams[src].prob(trg, p0) = p(trg|src) - -  const double p0; -}; - -struct BigramModel { -  BigramModel(size_t src_voc_size, size_t trg_voc_size) : -    unigrams(TD::NumWords() + 1, CCRP_OneTable<WordID>(1,1,1,1)), -    p0(1.0 / trg_voc_size) {} - -  void increment(const Bigram& b) { -    BigramMap::iterator it = bigrams.find(b.ConditioningPair()); -    if (it == bigrams.end()) { -      it = bigrams.insert(make_pair(b.ConditioningPair(), CCRP_OneTable<WordID>(1,1,1,1))).first; -    } -    if (it->second.increment(b.trg)) -      unigrams[b.cur_src()].increment(b.trg); -  } - -  void decrement(const Bigram& b) { -    BigramMap::iterator it = bigrams.find(b.ConditioningPair()); -    assert(it != bigrams.end()); -    if (it->second.decrement(b.trg)) { -      unigrams[b.cur_src()].decrement(b.trg); -      if (it->second.num_customers() == 0) -        bigrams.erase(it); -    } -  } - -  double prob(const Bigram& b) const { -    const double q0 = unigrams[b.cur_src()].prob(b.trg, p0); -    const BigramMap::const_iterator it = bigrams.find(b.ConditioningPair()); -    if (it == bigrams.end()) return q0; -    return it->second.prob(b.trg, q0); -  } - -  double LogLikelihood() const { -    double llh = 0; -    for (unsigned i = 0; i < unigrams.size(); ++i) { -      const CCRP_OneTable<WordID>& crp = unigrams[i]; -      if (crp.num_customers() > 0) { -        llh += crp.log_crp_prob(); -        llh += crp.num_tables() * log(p0); -      } -    } -    for (BigramMap::const_iterator it = bigrams.begin(); it != bigrams.end(); ++it) { -      const CCRP_OneTable<WordID>& crp = it->second; -      const WordID cur_src = it->first.second; -      llh += crp.log_crp_prob(); -      for (CCRP_OneTable<WordID>::const_iterator bit = crp.begin(); bit != crp.end(); ++bit) { -        llh += log(unigrams[cur_src].prob(bit->second, p0)); -      } -    } -    return llh; -  } - -  void ResampleHyperparameters(MT19937* rng) { -    for (unsigned i = 0; i < unigrams.size(); ++i) -      unigrams[i].resample_hyperparameters(rng); -    for (BigramMap::iterator it = bigrams.begin(); it != bigrams.end(); ++it) -      it->second.resample_hyperparameters(rng); -  } - -  typedef unordered_map<pair<WordID,WordID>, CCRP_OneTable<WordID>, boost::hash<pair<WordID,WordID> > > BigramMap; -  BigramMap bigrams;  // bigrams[(src-1,src)].prob(trg, q0) = p(trg|src,src-1) -  vector<CCRP_OneTable<WordID> > unigrams;  // unigrams[src].prob(trg, p0) = p(trg|src) - -  const double p0; -}; - -struct BigramAlignmentModel { -  BigramAlignmentModel(size_t src_voc_size, size_t trg_voc_size) : bigrams(TD::NumWords() + 1, CCRP_OneTable<WordID>(1,1,1,1)), p0(1.0 / src_voc_size) {} -  void increment(WordID prev, WordID next) { -    bigrams[prev].increment(next);  // hierarchy? -  } -  void decrement(WordID prev, WordID next) { -    bigrams[prev].decrement(next);  // hierarchy? -  } -  double prob(WordID prev, WordID next) { -    return bigrams[prev].prob(next, p0); -  } -  double LogLikelihood() const { -    double llh = 0; -    for (unsigned i = 0; i < bigrams.size(); ++i) { -      const CCRP_OneTable<WordID>& crp = bigrams[i]; -      if (crp.num_customers() > 0) { -        llh += crp.log_crp_prob(); -        llh += crp.num_tables() * log(p0); -      } -    } -    return llh; -  } - -  vector<CCRP_OneTable<WordID> > bigrams;  // bigrams[prev].prob(next, p0) = p(next|prev) -  const double p0; -}; - -struct Alignment { -  vector<unsigned char> a; -}; - -int main(int argc, char** argv) { -  po::variables_map conf; -  InitCommandLine(argc, argv, &conf); -  const unsigned samples = conf["samples"].as<unsigned>(); - -  boost::shared_ptr<MT19937> prng; -  if (conf.count("random_seed")) -    prng.reset(new MT19937(conf["random_seed"].as<uint32_t>())); -  else -    prng.reset(new MT19937); -  MT19937& rng = *prng; - -  vector<vector<WordID> > corpuse, corpusf; -  set<WordID> vocabe, vocabf; -  cerr << "Reading corpus...\n"; -  ReadParallelCorpus(conf["input"].as<string>(), &corpusf, &corpuse, &vocabf, &vocabe); -  cerr << "F-corpus size: " << corpusf.size() << " sentences\t (" << vocabf.size() << " word types)\n"; -  cerr << "E-corpus size: " << corpuse.size() << " sentences\t (" << vocabe.size() << " word types)\n"; -  assert(corpusf.size() == corpuse.size()); -  const size_t corpus_len = corpusf.size(); -  const WordID kNULL = TD::Convert("<eps>"); -  const WordID kBOS = TD::Convert("<s>"); -  const WordID kEOS = TD::Convert("</s>"); -  Bigram TT(kBOS, TD::Convert("我"), TD::Convert("i")); -  Bigram TT2(kBOS, TD::Convert("要"), TD::Convert("i")); - -  UnigramModel model(vocabf.size(), vocabe.size()); -  vector<Alignment> alignments(corpus_len); -  for (unsigned ci = 0; ci < corpus_len; ++ci) { -    const vector<WordID>& src = corpusf[ci]; -    const vector<WordID>& trg = corpuse[ci]; -    vector<unsigned char>& alg = alignments[ci].a; -    alg.resize(trg.size()); -    int lenp1 = src.size() + 1; -    WordID prev_src = kBOS; -    for (int j = 0; j < trg.size(); ++j) { -      int samp = lenp1 * rng.next(); -      --samp; -      if (samp < 0) samp = 255; -      alg[j] = samp; -      WordID cur_src = (samp == 255 ? kNULL : src[alg[j]]); -      Bigram b(prev_src, cur_src, trg[j]); -      model.increment(b); -      prev_src = cur_src; -    } -    Bigram b(prev_src, kEOS, kEOS); -    model.increment(b); -  } -  cerr << "Initial LLH: " << model.LogLikelihood() << endl; - -  SampleSet<double> ss; -  for (unsigned si = 0; si < 50; ++si) { -    for (unsigned ci = 0; ci < corpus_len; ++ci) { -      const vector<WordID>& src = corpusf[ci]; -      const vector<WordID>& trg = corpuse[ci]; -      vector<unsigned char>& alg = alignments[ci].a; -      WordID prev_src = kBOS; -      for (unsigned j = 0; j < trg.size(); ++j) { -        unsigned char& a_j = alg[j]; -        WordID cur_e_a_j = (a_j == 255 ? kNULL : src[a_j]); -        Bigram b(prev_src, cur_e_a_j, trg[j]); -        //cerr << "DEC: " << b << "\t" << nextb << endl; -        model.decrement(b); -        ss.clear(); -        for (unsigned i = 0; i <= src.size(); ++i) { -          const WordID cur_src = (i ? src[i-1] : kNULL); -          b.cur_src() = cur_src; -          ss.add(model.prob(b)); -        } -        int sampled_a_j = rng.SelectSample(ss); -        a_j = (sampled_a_j ? sampled_a_j - 1 : 255); -        cur_e_a_j = (a_j == 255 ? kNULL : src[a_j]); -        b.cur_src() = cur_e_a_j; -        //cerr << "INC: " << b << "\t" << nextb << endl; -        model.increment(b); -        prev_src = cur_e_a_j; -      } -    } -    cerr << '.' << flush; -    if (si % 10 == 9) { -      cerr << "[LLH prev=" << model.LogLikelihood(); -      //model.ResampleHyperparameters(&rng); -      cerr << " new=" << model.LogLikelihood() << "]\n"; -      //pair<WordID,WordID> xx = make_pair(kBOS, TD::Convert("我")); -      //PrintTopCustomers(model.bigrams.find(xx)->second); -      cerr << "p(" << TT << ") = " << model.prob(TT) << endl; -      cerr << "p(" << TT2 << ") = " << model.prob(TT2) << endl; -      PrintAlignment(corpusf[0], corpuse[0], alignments[0].a); -    } -  } -  { -  // MODEL 2 -  BigramModel model(vocabf.size(), vocabe.size()); -  BigramAlignmentModel amodel(vocabf.size(), vocabe.size()); -  for (unsigned ci = 0; ci < corpus_len; ++ci) { -    const vector<WordID>& src = corpusf[ci]; -    const vector<WordID>& trg = corpuse[ci]; -    vector<unsigned char>& alg = alignments[ci].a; -    WordID prev_src = kBOS; -    for (int j = 0; j < trg.size(); ++j) { -      WordID cur_src = (alg[j] == 255 ? kNULL : src[alg[j]]); -      Bigram b(prev_src, cur_src, trg[j]); -      model.increment(b); -      amodel.increment(prev_src, cur_src); -      prev_src = cur_src; -    } -    amodel.increment(prev_src, kEOS); -    Bigram b(prev_src, kEOS, kEOS); -    model.increment(b); -  } -  cerr << "Initial LLH: " << model.LogLikelihood() << " " << amodel.LogLikelihood() << endl; - -  SampleSet<double> ss; -  for (unsigned si = 0; si < samples; ++si) { -    for (unsigned ci = 0; ci < corpus_len; ++ci) { -      const vector<WordID>& src = corpusf[ci]; -      const vector<WordID>& trg = corpuse[ci]; -      vector<unsigned char>& alg = alignments[ci].a; -      WordID prev_src = kBOS; -      for (unsigned j = 0; j < trg.size(); ++j) { -        unsigned char& a_j = alg[j]; -        WordID cur_e_a_j = (a_j == 255 ? kNULL : src[a_j]); -        Bigram b(prev_src, cur_e_a_j, trg[j]); -        WordID next_src = kEOS; -        WordID next_trg = kEOS; -        if (j < (trg.size() - 1)) { -          next_src = (alg[j+1] == 255 ? kNULL : src[alg[j + 1]]); -          next_trg = trg[j + 1]; -        } -        Bigram nextb(cur_e_a_j, next_src, next_trg); -        //cerr << "DEC: " << b << "\t" << nextb << endl; -        model.decrement(b); -        model.decrement(nextb); -        amodel.decrement(prev_src, cur_e_a_j); -        amodel.decrement(cur_e_a_j, next_src); -        ss.clear(); -        for (unsigned i = 0; i <= src.size(); ++i) { -          const WordID cur_src = (i ? src[i-1] : kNULL); -          b.cur_src() = cur_src; -          ss.add(model.prob(b) * model.prob(nextb) * amodel.prob(prev_src, cur_src) * amodel.prob(cur_src, next_src)); -          //cerr << log(ss[ss.size() - 1]) << "\t" << b << endl; -        } -        int sampled_a_j = rng.SelectSample(ss); -        a_j = (sampled_a_j ? sampled_a_j - 1 : 255); -        cur_e_a_j = (a_j == 255 ? kNULL : src[a_j]); -        b.cur_src() = cur_e_a_j; -        nextb.prev_src() = cur_e_a_j; -        //cerr << "INC: " << b << "\t" << nextb << endl; -        //exit(1); -        model.increment(b); -        model.increment(nextb); -        amodel.increment(prev_src, cur_e_a_j); -        amodel.increment(cur_e_a_j, next_src); -        prev_src = cur_e_a_j; -      } -    } -    cerr << '.' << flush; -    if (si % 10 == 9) { -      cerr << "[LLH prev=" << (model.LogLikelihood() + amodel.LogLikelihood()); -      //model.ResampleHyperparameters(&rng); -      cerr << " new=" << model.LogLikelihood() << "]\n"; -      pair<WordID,WordID> xx = make_pair(kBOS, TD::Convert("我")); -      cerr << "p(" << TT << ") = " << model.prob(TT) << endl; -      cerr << "p(" << TT2 << ") = " << model.prob(TT2) << endl; -      pair<WordID,WordID> xx2 = make_pair(kBOS, TD::Convert("要")); -      PrintTopCustomers(model.bigrams.find(xx)->second); -      //PrintTopCustomers(amodel.bigrams[TD::Convert("<s>")]); -      //PrintTopCustomers(model.unigrams[TD::Convert("<eps>")]); -      PrintAlignment(corpusf[0], corpuse[0], alignments[0].a); -    } -  } -  } -  return 0; -} - | 
