From 03478c6e7307b66ebde1e76801edd06062d8039c Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Thu, 29 Dec 2011 21:08:30 -0500 Subject: lexical alignment samplers --- gi/pf/itg.cc | 98 +++++++++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 80 insertions(+), 18 deletions(-) (limited to 'gi/pf/itg.cc') diff --git a/gi/pf/itg.cc b/gi/pf/itg.cc index ac3c16a3..a38fe672 100644 --- a/gi/pf/itg.cc +++ b/gi/pf/itg.cc @@ -27,10 +27,67 @@ ostream& operator<<(ostream& os, const vector& p) { return os << ']'; } -double log_poisson(unsigned x, const double& lambda) { - assert(lambda > 0.0); - return log(lambda) * x - lgamma(x + 1) - lambda; -} +struct UnigramModel { + explicit UnigramModel(const string& fname, unsigned vocab_size, double p0null = 0.05) : + use_uniform_(fname.size() == 0), + p0null_(p0null), + uniform_((1.0 - p0null) / vocab_size), + probs_(TD::NumWords() + 1) { + if (fname.size() > 0) LoadUnigrams(fname); + probs_[0] = p0null_; + } + +// +// \data\ +// ngram 1=9295 +// +// \1-grams: +// -3.191193 " + + void LoadUnigrams(const string& fname) { + cerr << "Loading unigram probabilities from " << fname << " ..." << endl; + ReadFile rf(fname); + string line; + istream& in = *rf.stream(); + assert(in); + getline(in, line); + assert(line.empty()); + getline(in, line); + assert(line == "\\data\\"); + getline(in, line); + size_t pos = line.find("ngram 1="); + assert(pos == 0); + assert(line.size() > 8); + const size_t num_unigrams = atoi(&line[8]); + getline(in, line); + assert(line.empty()); + getline(in, line); + assert(line == "\\1-grams:"); + for (size_t i = 0; i < num_unigrams; ++i) { + getline(in, line); + assert(line.size() > 0); + pos = line.find('\t'); + assert(pos > 0); + assert(pos + 1 < line.size()); + const WordID w = TD::Convert(line.substr(pos + 1)); + line[pos] = 0; + float p = atof(&line[0]); + const prob_t pnon_null(1.0 - p0null_.as_float()); + if (w < probs_.size()) probs_[w].logeq(p * log(10) + log(pnon_null)); else abort(); + } + } + + const prob_t& operator()(const WordID& w) const { + if (!w) return p0null_; + if (use_uniform_) return uniform_; + return probs_[w]; + } + + const bool use_uniform_; + const prob_t p0null_; + const prob_t uniform_; + vector probs_; +}; struct Model1 { explicit Model1(const string& fname) : @@ -89,11 +146,11 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) { ("samples,s",po::value()->default_value(1000),"Number of samples") ("particles,p",po::value()->default_value(25),"Number of particles") ("input,i",po::value(),"Read parallel data from") - ("max_src_phrase",po::value()->default_value(7),"Maximum length of source language phrases") - ("max_trg_phrase",po::value()->default_value(7),"Maximum length of target language phrases") ("model1,m",po::value(),"Model 1 parameters (used in base distribution)") ("inverse_model1,M",po::value(),"Inverse Model 1 parameters (used in backward estimate)") ("model1_interpolation_weight",po::value()->default_value(0.95),"Mixing proportion of model 1 with uniform target distribution") + ("src_unigram,u",po::value()->default_value(""),"Source unigram distribution; empty for uniform") + ("trg_unigram,U",po::value()->default_value(""),"Target unigram distribution; empty for uniform") ("random_seed,S",po::value(), "Random seed"); po::options_description clo("Command line options"); clo.add_options() @@ -165,11 +222,11 @@ void ReadParallelCorpus(const string& filename, int main(int argc, char** argv) { po::variables_map conf; InitCommandLine(argc, argv, &conf); - const size_t kMAX_TRG_PHRASE = conf["max_trg_phrase"].as(); - const size_t kMAX_SRC_PHRASE = conf["max_src_phrase"].as(); const unsigned particles = conf["particles"].as(); const unsigned samples = conf["samples"].as(); - + TD::Convert(""); + TD::Convert(""); + TD::Convert(""); if (!conf.count("model1")) { cerr << argv[0] << "Please use --model1 to specify model 1 parameters\n"; return 1; @@ -188,23 +245,28 @@ int main(int argc, char** argv) { cerr << "F-corpus size: " << corpusf.size() << " sentences\t (" << vocabf.size() << " word types)\n"; cerr << "E-corpus size: " << corpuse.size() << " sentences\t (" << vocabe.size() << " word types)\n"; assert(corpusf.size() == corpuse.size()); + UnigramModel src_unigram(conf["src_unigram"].as(), vocabf.size()); + UnigramModel trg_unigram(conf["trg_unigram"].as(), vocabe.size()); + const prob_t kHALF(0.5); + const string kEMPTY = "NULL"; const int kLHS = -TD::Convert("X"); Model1 m1(conf["model1"].as()); Model1 invm1(conf["inverse_model1"].as()); for (int si = 0; si < conf["samples"].as(); ++si) { cerr << '.' << flush; for (int ci = 0; ci < corpusf.size(); ++ci) { - const vector& src = corpusf[ci]; const vector& trg = corpuse[ci]; - for (int i = 0; i < src.size(); ++i) { - for (int j = 0; j < trg.size(); ++j) { - const int eff_max_src = min(src.size() - i, kMAX_SRC_PHRASE); - for (int k = 0; k < eff_max_src; ++k) { - const int eff_max_trg = (k == 0 ? 1 : min(trg.size() - j, kMAX_TRG_PHRASE)); - for (int l = 0; l < eff_max_trg; ++l) { - } - } + const vector& src = corpusf[ci]; + for (int i = 0; i <= trg.size(); ++i) { + const WordID e_i = i > 0 ? trg[i-1] : 0; + for (int j = 0; j <= src.size(); ++j) { + const WordID f_j = j > 0 ? src[j-1] : 0; + if (e_i == 0 && f_j == 0) continue; + prob_t je = kHALF * src_unigram(f_j) * m1(f_j,e_i) + kHALF * trg_unigram(e_i) * invm1(e_i,f_j); + cerr << "p( " << (e_i ? TD::Convert(e_i) : kEMPTY) << " , " << (f_j ? TD::Convert(f_j) : kEMPTY) << " ) = " << log(je) << endl; + if (e_i && f_j) + cout << "[X] ||| " << TD::Convert(f_j) << " ||| " << TD::Convert(e_i) << " ||| LogProb=" << log(je) << endl; } } } -- cgit v1.2.3