Diffstat (limited to 'gi/pf')
 gi/pf/Makefile.am      |  13
 gi/pf/align-lexonly.cc | 356
 gi/pf/base_measures.cc |  26
 gi/pf/base_measures.h  |  50
 gi/pf/itg.cc           |  98
 gi/pf/unigrams.cc      |  80
 gi/pf/unigrams.h       |  69
7 files changed, 668 insertions, 24 deletions
diff --git a/gi/pf/Makefile.am b/gi/pf/Makefile.am
index 42758939..7c8e89d0 100644
--- a/gi/pf/Makefile.am
+++ b/gi/pf/Makefile.am
@@ -1,10 +1,14 @@
-bin_PROGRAMS = cbgi brat dpnaive pfbrat pfdist itg pfnaive
+bin_PROGRAMS = cbgi brat dpnaive pfbrat pfdist itg pfnaive condnaive align-lexonly
 
 noinst_LIBRARIES = libpf.a
-libpf_a_SOURCES = base_measures.cc reachability.cc cfg_wfst_composer.cc corpus.cc
+libpf_a_SOURCES = base_measures.cc reachability.cc cfg_wfst_composer.cc corpus.cc unigrams.cc ngram_base.cc
+
+align_lexonly_SOURCES = align-lexonly.cc
 
 itg_SOURCES = itg.cc
 
+condnaive_SOURCES = condnaive.cc
+
 dpnaive_SOURCES = dpnaive.cc
 
 pfdist_SOURCES = pfdist.cc
@@ -17,5 +21,6 @@ brat_SOURCES = brat.cc
 
 pfbrat_SOURCES = pfbrat.cc
 
-AM_CPPFLAGS = -W -Wall -Wno-sign-compare -funroll-loops -I$(top_srcdir)/utils $(GTEST_CPPFLAGS) -I$(top_srcdir)/decoder
-AM_LDFLAGS = libpf.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/utils/libutils.a -lz
+AM_CPPFLAGS = -W -Wall -Wno-sign-compare -funroll-loops -I$(top_srcdir)/utils $(GTEST_CPPFLAGS) -I$(top_srcdir)/decoder -I$(top_srcdir)/klm
+
+AM_LDFLAGS = libpf.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a $(top_srcdir)/utils/libutils.a -lz
diff --git a/gi/pf/align-lexonly.cc b/gi/pf/align-lexonly.cc
new file mode 100644
index 00000000..91a3cfcf
--- /dev/null
+++ b/gi/pf/align-lexonly.cc
@@ -0,0 +1,356 @@
+#include <iostream>
+#include <tr1/memory>
+#include <queue>
+
+#include <boost/multi_array.hpp>
+#include <boost/program_options.hpp>
+#include <boost/program_options/variables_map.hpp>
+
+#include "array2d.h"
+#include "base_measures.h"
+#include "monotonic_pseg.h"
+#include "conditional_pseg.h"
+#include "trule.h"
+#include "tdict.h"
+#include "stringlib.h"
+#include "filelib.h"
+#include "dict.h"
+#include "sampler.h"
+#include "ccrp_nt.h"
+#include "corpus.h"
+#include "ngram_base.h"
+
+using namespace std;
+using namespace tr1;
+namespace po = boost::program_options;
+
+void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
+  po::options_description opts("Configuration options");
+  opts.add_options()
+        ("samples,s",po::value<unsigned>()->default_value(1000),"Number of samples")
+        ("input,i",po::value<string>(),"Read parallel data from")
+        ("random_seed,S",po::value<uint32_t>(), "Random seed");
+  po::options_description clo("Command line options");
+  clo.add_options()
+        ("config", po::value<string>(), "Configuration file")
+        ("help,h", "Print this help message and exit");
+  po::options_description dconfig_options, dcmdline_options;
+  dconfig_options.add(opts);
+  dcmdline_options.add(opts).add(clo);
+
+  po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
+  if (conf->count("config")) {
+    ifstream config((*conf)["config"].as<string>().c_str());
+    po::store(po::parse_config_file(config, dconfig_options), *conf);
+  }
+  po::notify(*conf);
+
+  if (conf->count("help") || (conf->count("input") == 0)) {
+    cerr << dcmdline_options << endl;
+    exit(1);
+  }
+}
+
+shared_ptr<MT19937> prng;
+
+struct LexicalAlignment {
+  unsigned char src_index;
+  bool is_transliteration;
+  vector<pair<short, short> > derivation;
+};
+
+struct AlignedSentencePair {
+  vector<WordID> src;
+  vector<WordID> trg;
+  vector<LexicalAlignment> a;
+  Array2D<short> posterior;
+};
+
+struct HierarchicalUnigramBase {
+  explicit HierarchicalUnigramBase(const unsigned vocab_e_size) : r(5,5), u0(1.0 / vocab_e_size) {}
+
+  // return p0 of rule.e_
+  prob_t operator()(const TRule& rule) const {
+    prob_t p = prob_t::One();
+    prob_t q;
+    for (unsigned i = 0; i < rule.e_.size(); ++i) {
+      q.logeq(r.logprob(rule.e_[i], log(u0)));
+      p *= q;
+    }
+    q.logeq(r.logprob(TD::Convert("</s>"), log(u0)));
+    p *= q;
+    return p;
+  }
+
+  void Increment(const TRule& rule) {
+    for (unsigned i = 0; i < rule.e_.size(); ++i)
+      r.increment(rule.e_[i]);
+    r.increment(TD::Convert("</s>"));
+  }
+
+  void Decrement(const TRule& rule) {
+    for (unsigned i = 0; i < rule.e_.size(); ++i)
+      r.decrement(rule.e_[i]);
+    r.decrement(TD::Convert("</s>"));
+  }
+
+  CCRP_NoTable<WordID> r;
+  prob_t u0;
+};
+
+struct HierarchicalWordBase {
+  explicit HierarchicalWordBase(const unsigned vocab_e_size) :
+      base(prob_t::One()), r(15,15), u0(-log(vocab_e_size)) {}
+
+  void ResampleHyperparameters(MT19937* rng) {
+    r.resample_hyperparameters(rng);
+  }
+
+  inline double logp0(const vector<WordID>& s) const {
+    return s.size() * u0;
+  }
+
+  // return p0 of rule.e_
+  prob_t operator()(const TRule& rule) const {
+    prob_t p; p.logeq(r.logprob(rule.e_, logp0(rule.e_)));
+    return p;
+  }
+
+  void Increment(const TRule& rule) {
+    if (r.increment(rule.e_)) {
+      prob_t p; p.logeq(logp0(rule.e_));
+      base *= p;
+    }
+  }
+
+  void Decrement(const TRule& rule) {
+    if (r.decrement(rule.e_)) {
+      prob_t p; p.logeq(logp0(rule.e_));
+      base /= p;
+    }
+  }
+
+  prob_t Likelihood() const {
+    prob_t p; p.logeq(r.log_crp_prob());
+    p *= base;
+    return p;
+  }
+
+  void Summary() const {
+    cerr << "NUMBER OF CUSTOMERS: " << r.num_customers() << endl;
+    for (CCRP_NoTable<vector<WordID> >::const_iterator it = r.begin(); it != r.end(); ++it)
+      cerr << "   " << it->second << '\t' << TD::GetString(it->first) << endl;
+  }
+
+  prob_t base;
+  CCRP_NoTable<vector<WordID> > r;
+  const double u0;
+};
+
+struct BasicLexicalAlignment {
+  explicit BasicLexicalAlignment(const vector<vector<WordID> >& lets,
+                                 const unsigned letters_e,
+                                 vector<AlignedSentencePair>* corp) :
+      letters(lets),
+      corpus(*corp),
+      //up0("en.chars.1gram", letters_e),
+      //up0("en.words.1gram"),
+      up0(letters_e),
+      //up0("en.chars.2gram"),
+      tmodel(up0) {
+  }
+
+  void InstantiateRule(const WordID src,
+                       const WordID trg,
+                       TRule* rule) const {
+    static const WordID kX = TD::Convert("X") * -1;
+    rule->lhs_ = kX;
+    rule->e_ = letters[trg];
+    rule->f_ = letters[src];
+  }
+
+  void InitializeRandom() {
+    const WordID kNULL = TD::Convert("NULL");
+    cerr << "Initializing with random alignments ...\n";
+    for (unsigned i = 0; i < corpus.size(); ++i) {
+      AlignedSentencePair& asp = corpus[i];
+      asp.a.resize(asp.trg.size());
+      for (unsigned j = 0; j < asp.trg.size(); ++j) {
+        const unsigned char a_j = prng->next() * (1 + asp.src.size());
+        const WordID f_a_j = (a_j ? asp.src[a_j - 1] : kNULL);
+        TRule r;
+        InstantiateRule(f_a_j, asp.trg[j], &r);
+        asp.a[j].is_transliteration = false;
+        asp.a[j].src_index = a_j;
+        if (tmodel.IncrementRule(r))
+          up0.Increment(r);
+      }
+    }
+    cerr << "  LLH = " << Likelihood() << endl;
+  }
+
+  prob_t Likelihood() const {
+    prob_t p = tmodel.Likelihood();
+    p *= up0.Likelihood();
+    return p;
+  }
+
+  void ResampleHyperparameters() {
+    cerr << "  LLH_prev = " << Likelihood() << flush;
+    tmodel.ResampleHyperparameters(&*prng);
+    up0.ResampleHyperparameters(&*prng);
+    cerr << "\tLLH_post = " << Likelihood() << endl;
+  }
+
+  void ResampleCorpus();
+
+  const vector<vector<WordID> >& letters; // spelling dictionary
+  vector<AlignedSentencePair>& corpus;
+  //PhraseConditionalUninformativeBase up0;
+  //PhraseConditionalUninformativeUnigramBase up0;
+  //UnigramWordBase up0;
+  //HierarchicalUnigramBase up0;
+  HierarchicalWordBase up0;
+  //CompletelyUniformBase up0;
+  //FixedNgramBase up0;
+  //ConditionalTranslationModel<PhraseConditionalUninformativeBase> tmodel;
+  //ConditionalTranslationModel<PhraseConditionalUninformativeUnigramBase> tmodel;
+  //ConditionalTranslationModel<UnigramWordBase> tmodel;
+  //ConditionalTranslationModel<HierarchicalUnigramBase> tmodel;
+  ConditionalTranslationModel<HierarchicalWordBase> tmodel;
+  //ConditionalTranslationModel<FixedNgramBase> tmodel;
+  //ConditionalTranslationModel<CompletelyUniformBase> tmodel;
+};
+
+void BasicLexicalAlignment::ResampleCorpus() {
+  static const WordID kNULL = TD::Convert("NULL");
+  for (unsigned i = 0; i < corpus.size(); ++i) {
+    AlignedSentencePair& asp = corpus[i];
+    SampleSet<prob_t> ss; ss.resize(asp.src.size() + 1);
+    for (unsigned j = 0; j < asp.trg.size(); ++j) {
+      TRule r;
+      unsigned char& a_j = asp.a[j].src_index;
+      WordID f_a_j = (a_j ? asp.src[a_j - 1] : kNULL);
+      InstantiateRule(f_a_j, asp.trg[j], &r);
+      if (tmodel.DecrementRule(r))
+        up0.Decrement(r);
+
+      for (unsigned prop_a_j = 0; prop_a_j <= asp.src.size(); ++prop_a_j) {
+        const WordID prop_f = (prop_a_j ? asp.src[prop_a_j - 1] : kNULL);
+        InstantiateRule(prop_f, asp.trg[j], &r);
+        ss[prop_a_j] = tmodel.RuleProbability(r);
+      }
+      a_j = prng->SelectSample(ss);
+      f_a_j = (a_j ? asp.src[a_j - 1] : kNULL);
+      InstantiateRule(f_a_j, asp.trg[j], &r);
+      if (tmodel.IncrementRule(r))
+        up0.Increment(r);
+    }
+  }
+  cerr << "  LLH = " << tmodel.Likelihood() << endl;
+}
+
+void ExtractLetters(const set<WordID>& v, vector<vector<WordID> >* l, set<WordID>* letset = NULL) {
+  for (set<WordID>::const_iterator it = v.begin(); it != v.end(); ++it) {
+    vector<WordID>& letters = (*l)[*it];
+    if (letters.size()) continue;   // if e and f have the same word
+
+    const string& w = TD::Convert(*it);
+
+    size_t cur = 0;
+    while (cur < w.size()) {
+      const size_t len = UTF8Len(w[cur]);
+      letters.push_back(TD::Convert(w.substr(cur, len)));
+      if (letset) letset->insert(letters.back());
+      cur += len;
+    }
+  }
+}
+
+void Debug(const AlignedSentencePair& asp) {
+  cerr << TD::GetString(asp.src) << endl << TD::GetString(asp.trg) << endl;
+  Array2D<bool> a(asp.src.size(), asp.trg.size());
+  for (unsigned j = 0; j < asp.trg.size(); ++j)
+    if (asp.a[j].src_index) a(asp.a[j].src_index - 1, j) = true;
+  cerr << a << endl;
+}
+
+void AddSample(AlignedSentencePair* asp) {
+  for (unsigned j = 0; j < asp->trg.size(); ++j)
+    asp->posterior(asp->a[j].src_index, j)++;
+}
+
+void WriteAlignments(const AlignedSentencePair& asp) {
+  bool first = true;
+  for (unsigned j = 0; j < asp.trg.size(); ++j) {
+    int src_index = -1;
+    int mc = -1;
+    for (unsigned i = 0; i <= asp.src.size(); ++i) {
+      if (asp.posterior(i, j) > mc) {
+        mc = asp.posterior(i, j);
+        src_index = i;
+      }
+    }
+
+    if (src_index) {
+      if (first) first = false; else cout << ' ';
+      cout << (src_index - 1) << '-' << j;
+    }
+  }
+  cout << endl;
+}
+
+int main(int argc, char** argv) {
+  po::variables_map conf;
+  InitCommandLine(argc, argv, &conf);
+
+  if (conf.count("random_seed"))
+    prng.reset(new MT19937(conf["random_seed"].as<uint32_t>()));
+  else
+    prng.reset(new MT19937);
+//  MT19937& rng = *prng;
+
+  vector<vector<int> > corpuse, corpusf;
+  set<int> vocabe, vocabf;
+  corpus::ReadParallelCorpus(conf["input"].as<string>(), &corpusf, &corpuse, &vocabf, &vocabe);
+  cerr << "f-Corpus size: " << corpusf.size() << " sentences\n";
+  cerr << "f-Vocabulary size: " << vocabf.size() << " types\n";
+  cerr << "e-Corpus size: " << corpuse.size() << " sentences\n";
+  cerr << "e-Vocabulary size: " << vocabe.size() << " types\n";
+  assert(corpusf.size() == corpuse.size());
+
+  vector<AlignedSentencePair> corpus(corpuse.size());
+  for (unsigned i = 0; i < corpuse.size(); ++i) {
+    corpus[i].src.swap(corpusf[i]);
+    corpus[i].trg.swap(corpuse[i]);
+    corpus[i].posterior.resize(corpus[i].src.size() + 1, corpus[i].trg.size());
+  }
+  corpusf.clear(); corpuse.clear();
+
+  vocabf.insert(TD::Convert("NULL"));
+  vector<vector<WordID> > letters(TD::NumWords());
+  set<WordID> letset;
+  ExtractLetters(vocabe, &letters, &letset);
+  ExtractLetters(vocabf, &letters, NULL);
+  letters[TD::Convert("NULL")].clear();
+
+  BasicLexicalAlignment x(letters, letset.size(), &corpus);
+  x.InitializeRandom();
+  const unsigned samples = conf["samples"].as<unsigned>();
+  for (int i = 0; i < samples; ++i) {
+    for (int j = 431; j < 433; ++j) Debug(corpus[j]);
+    cerr << i << "\t" << x.tmodel.r.size() << "\t";
+    if (i % 10 == 0) x.ResampleHyperparameters();
+    x.ResampleCorpus();
+    if (i > (samples / 5) && (i % 10 == 9)) for (int j = 0; j < corpus.size(); ++j) AddSample(&corpus[j]);
+  }
+  for (unsigned i = 0; i < corpus.size(); ++i)
+    WriteAlignments(corpus[i]);
+  //ModelAndData posterior(x, &corpus, vocabe, vocabf);
+  x.tmodel.Summary();
+  x.up0.Summary();
+
+  //posterior.Sample();
+
+  return 0;
+}
diff --git a/gi/pf/base_measures.cc b/gi/pf/base_measures.cc
index 8adb37d7..97b4e698 100644
--- a/gi/pf/base_measures.cc
+++ b/gi/pf/base_measures.cc
@@ -6,6 +6,32 @@
 
 using namespace std;
 
+prob_t PhraseConditionalUninformativeUnigramBase::p0(const vector<WordID>& vsrc,
+                                                     const vector<WordID>& vtrg,
+                                                     int start_src, int start_trg) const {
+  const int flen = vsrc.size() - start_src;
+  const int elen = vtrg.size() - start_trg;
+  prob_t p;
+  p.logeq(log_poisson(elen, flen + 0.01));  // elen | flen  ~Pois(flen + 0.01)
+  //p.logeq(log_poisson(elen, 1));          // elen | flen  ~Pois(1)
+  for (int i = 0; i < elen; ++i)
+    p *= u(vtrg[i + start_trg]);            // draw e_i     ~Unigram
+  return p;
+}
+
+prob_t PhraseConditionalUninformativeBase::p0(const vector<WordID>& vsrc,
+                                              const vector<WordID>& vtrg,
+                                              int start_src, int start_trg) const {
+  const int flen = vsrc.size() - start_src;
+  const int elen = vtrg.size() - start_trg;
+  prob_t p;
+  //p.logeq(log_poisson(elen, flen + 0.01));  // elen | flen  ~Pois(flen + 0.01)
+  p.logeq(log_poisson(elen, 1));              // elen | flen  ~Pois(1)
+  for (int i = 0; i < elen; ++i)
+    p *= kUNIFORM_TARGET;                     // draw e_i     ~Uniform
+  return p;
+}
+
 void Model1::LoadModel1(const string& fname) {
   cerr << "Loading Model 1 parameters from " << fname << " ..." << endl;
   ReadFile rf(fname);
diff --git a/gi/pf/base_measures.h b/gi/pf/base_measures.h
index 7ce7e2e6..fbd1c3ad 100644
--- a/gi/pf/base_measures.h
+++ b/gi/pf/base_measures.h
@@ -7,6 +7,7 @@
 #include <cmath>
 #include <iostream>
 
+#include "unigrams.h"
 #include "trule.h"
 #include "prob.h"
 #include "tdict.h"
@@ -49,6 +50,51 @@ struct Model1 {
   std::vector<std::map<WordID, prob_t> > ttable;
 };
 
+struct CompletelyUniformBase {
+  explicit CompletelyUniformBase(const unsigned ves) : kUNIFORM(1.0 / ves) {}
+  prob_t operator()(const TRule&) const {
+    return kUNIFORM;
+  }
+  const prob_t kUNIFORM;
+};
+
+struct UnigramWordBase {
+  explicit UnigramWordBase(const std::string& fname) : un(fname) {}
+  prob_t operator()(const TRule& r) const {
+    return un(r.e_);
+  }
+  const UnigramWordModel un;
+};
+
+struct PhraseConditionalUninformativeBase {
+  explicit PhraseConditionalUninformativeBase(const unsigned vocab_e_size) :
+      kUNIFORM_TARGET(1.0 / vocab_e_size) {
+    assert(vocab_e_size > 0);
+  }
+
+  // return p0 of rule.e_ | rule.f_
+  prob_t operator()(const TRule& rule) const {
+    return p0(rule.f_, rule.e_, 0, 0);
+  }
+
+  prob_t p0(const std::vector<WordID>& vsrc, const std::vector<WordID>& vtrg, int start_src, int start_trg) const;
+
+  const prob_t kUNIFORM_TARGET;
+};
+
+struct PhraseConditionalUninformativeUnigramBase {
+  explicit PhraseConditionalUninformativeUnigramBase(const std::string& file, const unsigned vocab_e_size) : u(file, vocab_e_size) {}
+
+  // return p0 of rule.e_ | rule.f_
+  prob_t operator()(const TRule& rule) const {
+    return p0(rule.f_, rule.e_, 0, 0);
+  }
+
+  prob_t p0(const std::vector<WordID>& vsrc, const std::vector<WordID>& vtrg, int start_src, int start_trg) const;
+
+  const UnigramModel u;
+};
+
 struct PhraseConditionalBase {
   explicit PhraseConditionalBase(const Model1& m1, const double m1mixture, const unsigned vocab_e_size) :
       model1(m1),
@@ -83,7 +129,7 @@ struct PhraseJointBase {
     assert(vocab_e_size > 0);
   }
 
-  // return p0 of rule.e_ | rule.f_
+  // return p0 of rule.e_ , rule.f_
   prob_t operator()(const TRule& rule) const {
     return p0(rule.f_, rule.e_, 0, 0);
   }
@@ -113,7 +159,7 @@ struct PhraseJointBase_BiDir {
     assert(vocab_e_size > 0);
   }
 
-  // return p0 of rule.e_ | rule.f_
+  // return p0 of rule.e_ , rule.f_
   prob_t operator()(const TRule& rule) const {
     return p0(rule.f_, rule.e_, 0, 0);
   }
diff --git a/gi/pf/itg.cc b/gi/pf/itg.cc
index ac3c16a3..a38fe672 100644
--- a/gi/pf/itg.cc
+++ b/gi/pf/itg.cc
@@ -27,10 +27,67 @@ ostream& operator<<(ostream& os, const vector<WordID>& p) {
   return os << ']';
 }
 
-double log_poisson(unsigned x, const double& lambda) {
-  assert(lambda > 0.0);
-  return log(lambda) * x - lgamma(x + 1) - lambda;
-}
+struct UnigramModel {
+  explicit UnigramModel(const string& fname, unsigned vocab_size, double p0null = 0.05) :
+      use_uniform_(fname.size() == 0),
+      p0null_(p0null),
+      uniform_((1.0 - p0null) / vocab_size),
+      probs_(TD::NumWords() + 1) {
+    if (fname.size() > 0) LoadUnigrams(fname);
+    probs_[0] = p0null_;
+  }
+
+// expected ARPA header, e.g.:
+// \data\
+// ngram 1=9295
+//
+// \1-grams:
+// -3.191193	"
+
+  void LoadUnigrams(const string& fname) {
+    cerr << "Loading unigram probabilities from " << fname << " ..." << endl;
+    ReadFile rf(fname);
+    string line;
+    istream& in = *rf.stream();
+    assert(in);
+    getline(in, line);
+    assert(line.empty());
+    getline(in, line);
+    assert(line == "\\data\\");
+    getline(in, line);
+    size_t pos = line.find("ngram 1=");
+    assert(pos == 0);
+    assert(line.size() > 8);
+    const size_t num_unigrams = atoi(&line[8]);
+    getline(in, line);
+    assert(line.empty());
+    getline(in, line);
+    assert(line == "\\1-grams:");
+    for (size_t i = 0; i < num_unigrams; ++i) {
+      getline(in, line);
+      assert(line.size() > 0);
+      pos = line.find('\t');
+      assert(pos > 0);
+      assert(pos + 1 < line.size());
+      const WordID w = TD::Convert(line.substr(pos + 1));
+      line[pos] = 0;
+      float p = atof(&line[0]);
+      const prob_t pnon_null(1.0 - p0null_.as_float());
+      if (w < probs_.size()) probs_[w].logeq(p * log(10) + log(pnon_null)); else abort();
+    }
+  }
+
+  const prob_t& operator()(const WordID& w) const {
+    if (!w) return p0null_;
+    if (use_uniform_) return uniform_;
+    return probs_[w];
+  }
+
+  const bool use_uniform_;
+  const prob_t p0null_;
+  const prob_t uniform_;
+  vector<prob_t> probs_;
+};
 
 struct Model1 {
   explicit Model1(const string& fname) :
@@ -89,11 +146,11 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
         ("samples,s",po::value<unsigned>()->default_value(1000),"Number of samples")
         ("particles,p",po::value<unsigned>()->default_value(25),"Number of particles")
         ("input,i",po::value<string>(),"Read parallel data from")
-        ("max_src_phrase",po::value<unsigned>()->default_value(7),"Maximum length of source language phrases")
-        ("max_trg_phrase",po::value<unsigned>()->default_value(7),"Maximum length of target language phrases")
         ("model1,m",po::value<string>(),"Model 1 parameters (used in base distribution)")
         ("inverse_model1,M",po::value<string>(),"Inverse Model 1 parameters (used in backward estimate)")
         ("model1_interpolation_weight",po::value<double>()->default_value(0.95),"Mixing proportion of model 1 with uniform target distribution")
+        ("src_unigram,u",po::value<string>()->default_value(""),"Source unigram distribution; empty for uniform")
+        ("trg_unigram,U",po::value<string>()->default_value(""),"Target unigram distribution; empty for uniform")
         ("random_seed,S",po::value<uint32_t>(), "Random seed");
   po::options_description clo("Command line options");
   clo.add_options()
@@ -165,11 +222,11 @@ void ReadParallelCorpus(const string& filename,
 int main(int argc, char** argv) {
   po::variables_map conf;
   InitCommandLine(argc, argv, &conf);
-  const size_t kMAX_TRG_PHRASE = conf["max_trg_phrase"].as<unsigned>();
-  const size_t kMAX_SRC_PHRASE = conf["max_src_phrase"].as<unsigned>();
   const unsigned particles = conf["particles"].as<unsigned>();
   const unsigned samples = conf["samples"].as<unsigned>();
-
+  TD::Convert("<s>");
+  TD::Convert("</s>");
+  TD::Convert("<unk>");
   if (!conf.count("model1")) {
     cerr << argv[0] << "Please use --model1 to specify model 1 parameters\n";
     return 1;
@@ -188,23 +245,28 @@ int main(int argc, char** argv) {
   cerr << "F-corpus size: " << corpusf.size() << " sentences\t (" << vocabf.size() << " word types)\n";
   cerr << "E-corpus size: " << corpuse.size() << " sentences\t (" << vocabe.size() << " word types)\n";
   assert(corpusf.size() == corpuse.size());
+  UnigramModel src_unigram(conf["src_unigram"].as<string>(), vocabf.size());
+  UnigramModel trg_unigram(conf["trg_unigram"].as<string>(), vocabe.size());
+  const prob_t kHALF(0.5);
+  const string kEMPTY = "NULL";
   const int kLHS = -TD::Convert("X");
   Model1 m1(conf["model1"].as<string>());
   Model1 invm1(conf["inverse_model1"].as<string>());
   for (int si = 0; si < conf["samples"].as<unsigned>(); ++si) {
     cerr << '.' << flush;
     for (int ci = 0; ci < corpusf.size(); ++ci) {
-      const vector<WordID>& src = corpusf[ci];
       const vector<WordID>& trg = corpuse[ci];
-      for (int i = 0; i < src.size(); ++i) {
-        for (int j = 0; j < trg.size(); ++j) {
-          const int eff_max_src = min(src.size() - i, kMAX_SRC_PHRASE);
-          for (int k = 0; k < eff_max_src; ++k) {
-            const int eff_max_trg = (k == 0 ? 1 : min(trg.size() - j, kMAX_TRG_PHRASE));
-            for (int l = 0; l < eff_max_trg; ++l) {
-            }
-          }
+      const vector<WordID>& src = corpusf[ci];
+      for (int i = 0; i <= trg.size(); ++i) {
+        const WordID e_i = i > 0 ? trg[i-1] : 0;
+        for (int j = 0; j <= src.size(); ++j) {
+          const WordID f_j = j > 0 ? src[j-1] : 0;
+          if (e_i == 0 && f_j == 0) continue;
+          prob_t je = kHALF * src_unigram(f_j) * m1(f_j,e_i) + kHALF * trg_unigram(e_i) * invm1(e_i,f_j);
+          cerr << "p( " << (e_i ? TD::Convert(e_i) : kEMPTY) << " , " << (f_j ? TD::Convert(f_j) : kEMPTY) << " ) = " << log(je) << endl;
+          if (e_i && f_j)
+            cout << "[X] ||| " << TD::Convert(f_j) << " ||| " << TD::Convert(e_i) << " ||| LogProb=" << log(je) << endl;
         }
       }
     }
   }
diff --git a/gi/pf/unigrams.cc b/gi/pf/unigrams.cc
new file mode 100644
index 00000000..40829775
--- /dev/null
+++ b/gi/pf/unigrams.cc
@@ -0,0 +1,80 @@
+#include "unigrams.h"
+
+#include <string>
+#include <cmath>
+
+#include "stringlib.h"
+#include "filelib.h"
+
+using namespace std;
+
+void UnigramModel::LoadUnigrams(const string& fname) {
+  cerr << "Loading unigram probabilities from " << fname << " ..." << endl;
+  ReadFile rf(fname);
+  string line;
+  istream& in = *rf.stream();
+  assert(in);
+  getline(in, line);
+  assert(line.empty());
+  getline(in, line);
+  assert(line == "\\data\\");
+  getline(in, line);
+  size_t pos = line.find("ngram 1=");
+  assert(pos == 0);
+  assert(line.size() > 8);
+  const size_t num_unigrams = atoi(&line[8]);
+  getline(in, line);
+  assert(line.empty());
+  getline(in, line);
+  assert(line == "\\1-grams:");
+  for (size_t i = 0; i < num_unigrams; ++i) {
+    getline(in, line);
+    assert(line.size() > 0);
+    pos = line.find('\t');
+    assert(pos > 0);
+    assert(pos + 1 < line.size());
+    const WordID w = TD::Convert(line.substr(pos + 1));
+    line[pos] = 0;
+    float p = atof(&line[0]);
+    if (w < probs_.size()) probs_[w].logeq(p * log(10)); else cerr << "WARNING: don't know about '" << TD::Convert(w) << "'\n";
+  }
+}
+
+void UnigramWordModel::LoadUnigrams(const string& fname) {
+  cerr << "Loading unigram probabilities from " << fname << " ..." << endl;
+  ReadFile rf(fname);
+  string line;
+  istream& in = *rf.stream();
+  assert(in);
+  getline(in, line);
+  assert(line.empty());
+  getline(in, line);
+  assert(line == "\\data\\");
+  getline(in, line);
+  size_t pos = line.find("ngram 1=");
+  assert(pos == 0);
+  assert(line.size() > 8);
+  const size_t num_unigrams = atoi(&line[8]);
+  getline(in, line);
+  assert(line.empty());
+  getline(in, line);
+  assert(line == "\\1-grams:");
+  for (size_t i = 0; i < num_unigrams; ++i) {
+    getline(in, line);
+    assert(line.size() > 0);
+    pos = line.find('\t');
+    assert(pos > 0);
+    assert(pos + 1 < line.size());
+    size_t cur = pos + 1;
+    vector<WordID> w;
+    while (cur < line.size()) {
+      const size_t len = UTF8Len(line[cur]);
+      w.push_back(TD::Convert(line.substr(cur, len)));
+      cur += len;
+    }
+    line[pos] = 0;
+    float p = atof(&line[0]);
+    probs_[w].logeq(p * log(10.0));
+  }
+}
+
diff --git a/gi/pf/unigrams.h b/gi/pf/unigrams.h
new file mode 100644
index 00000000..1660d1ed
--- /dev/null
+++ b/gi/pf/unigrams.h
@@ -0,0 +1,69 @@
+#ifndef _UNIGRAMS_H_
+#define _UNIGRAMS_H_
+
+#include <vector>
+#include <string>
+#include <tr1/unordered_map>
+#include <boost/functional.hpp>
+
+#include "wordid.h"
+#include "prob.h"
+#include "tdict.h"
+
+struct UnigramModel {
+  explicit UnigramModel(const std::string& fname, unsigned vocab_size) :
+      use_uniform_(fname.size() == 0),
+      uniform_(1.0 / vocab_size),
+      probs_() {
+    if (fname.size() > 0) {
+      probs_.resize(TD::NumWords() + 1);
+      LoadUnigrams(fname);
+    }
+  }
+
+  const prob_t& operator()(const WordID& w) const {
+    assert(w);
+    if (use_uniform_) return uniform_;
+    return probs_[w];
+  }
+
+ private:
+  void LoadUnigrams(const std::string& fname);
+
+  const bool use_uniform_;
+  const prob_t uniform_;
+  std::vector<prob_t> probs_;
+};
+
+
+// reads an ARPA unigram file and converts words like 'cat' into a string 'c a t'
+struct UnigramWordModel {
+  explicit UnigramWordModel(const std::string& fname) :
+      use_uniform_(false),
+      uniform_(1.0),
+      probs_() {
+    LoadUnigrams(fname);
+  }
+
+  explicit UnigramWordModel(const unsigned vocab_size) :
+      use_uniform_(true),
+      uniform_(1.0 / vocab_size),
+      probs_() {}
+
+  const prob_t& operator()(const std::vector<WordID>& s) const {
+    if (use_uniform_) return uniform_;
+    const VectorProbHash::const_iterator it = probs_.find(s);
+    assert(it != probs_.end());
+    return it->second;
+  }
+
+ private:
+  void LoadUnigrams(const std::string& fname);
+
+  const bool use_uniform_;
+  const prob_t uniform_;
+  typedef std::tr1::unordered_map<std::vector<WordID>, prob_t, boost::hash<std::vector<WordID> > > VectorProbHash;
+  VectorProbHash probs_;
+};
+
+#endif
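
A note on the unigram files consumed by the three LoadUnigrams parsers above (one in itg.cc, two in unigrams.cc): the asserts encode a rigid subset of the ARPA language-model format, namely a blank first line, a \data\ marker, an "ngram 1=N" count, another blank line, a \1-grams: header, and then exactly N lines of the form log10-probability, a literal tab, and the word. A minimal file the parsers would accept is sketched below; the first line is blank, and the counts and probabilities are invented for illustration, except the -3.191193 entry for the double-quote word, which comes from the comment in itg.cc:

    
    \data\
    ngram 1=3
    
    \1-grams:
    -3.191193	"
    -1.204120	the
    -2.301030	cat

Each log10 value p is moved into the natural-log domain by probs_[w].logeq(p * log(10)), which is just the change of base ln(10^p) = p * ln(10). UnigramWordModel additionally splits each word into its UTF-8 characters, so its keys are character sequences such as 'c a t'.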
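The scoring loop added to itg.cc mixes the two Model 1 directions symmetrically. Assuming m1(f, e) evaluates to the Model 1 translation probability p(e | f), which is how the ttable appears to be indexed elsewhere in this directory, each printed word-pair score is

    p(e_i, f_j) = 1/2 * p_src(f_j) * p_m1(e_i | f_j) + 1/2 * p_trg(e_i) * p_invm1(f_j | e_i)

where index 0 stands for the NULL word on either side, so every word is also scored against the other side's empty word.

As a usage sketch for the new aligner, the option names below come from InitCommandLine in align-lexonly.cc; the file names are hypothetical, and the input is assumed to be the one-sentence-pair-per-line "source ||| target" format read by corpus::ReadParallelCorpus:

    ./align-lexonly -i corpus.de-en -s 1000 -S 42 > alignments.txt

This draws 1000 Gibbs samples; after a burn-in of the first fifth of the iterations, every tenth sample is accumulated into each sentence's posterior grid, and WriteAlignments then prints one line of posterior-mode i-j alignment pairs per sentence, omitting target words whose mode is NULL.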
