summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorChris Dyer <cdyer@cs.cmu.edu>2011-12-29 21:08:30 -0500
committerChris Dyer <cdyer@cs.cmu.edu>2011-12-29 21:08:30 -0500
commitaac3ef3e3fdf636406fc61a40096cee6381e5461 (patch)
treee014b029915614a23a8b2691a37fa14204a84a89
parentc96125ad4860aa823ff2f17aa3ccb565c8f99c0e (diff)
lexical alignment samplers
-rw-r--r--gi/pf/Makefile.am13
-rw-r--r--gi/pf/align-lexonly.cc356
-rw-r--r--gi/pf/base_measures.cc26
-rw-r--r--gi/pf/base_measures.h50
-rw-r--r--gi/pf/itg.cc98
-rw-r--r--gi/pf/unigrams.cc80
-rw-r--r--gi/pf/unigrams.h69
7 files changed, 668 insertions, 24 deletions
diff --git a/gi/pf/Makefile.am b/gi/pf/Makefile.am
index 42758939..7c8e89d0 100644
--- a/gi/pf/Makefile.am
+++ b/gi/pf/Makefile.am
@@ -1,10 +1,14 @@
-bin_PROGRAMS = cbgi brat dpnaive pfbrat pfdist itg pfnaive
+bin_PROGRAMS = cbgi brat dpnaive pfbrat pfdist itg pfnaive condnaive align-lexonly
noinst_LIBRARIES = libpf.a
-libpf_a_SOURCES = base_measures.cc reachability.cc cfg_wfst_composer.cc corpus.cc
+libpf_a_SOURCES = base_measures.cc reachability.cc cfg_wfst_composer.cc corpus.cc unigrams.cc ngram_base.cc
+
+align_lexonly_SOURCES = align-lexonly.cc
itg_SOURCES = itg.cc
+condnaive_SOURCES = condnaive.cc
+
dpnaive_SOURCES = dpnaive.cc
pfdist_SOURCES = pfdist.cc
@@ -17,5 +21,6 @@ brat_SOURCES = brat.cc
pfbrat_SOURCES = pfbrat.cc
-AM_CPPFLAGS = -W -Wall -Wno-sign-compare -funroll-loops -I$(top_srcdir)/utils $(GTEST_CPPFLAGS) -I$(top_srcdir)/decoder
-AM_LDFLAGS = libpf.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/utils/libutils.a -lz
+AM_CPPFLAGS = -W -Wall -Wno-sign-compare -funroll-loops -I$(top_srcdir)/utils $(GTEST_CPPFLAGS) -I$(top_srcdir)/decoder -I$(top_srcdir)/klm
+
+AM_LDFLAGS = libpf.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a $(top_srcdir)/utils/libutils.a -lz
diff --git a/gi/pf/align-lexonly.cc b/gi/pf/align-lexonly.cc
new file mode 100644
index 00000000..91a3cfcf
--- /dev/null
+++ b/gi/pf/align-lexonly.cc
@@ -0,0 +1,356 @@
+#include <iostream>
+#include <tr1/memory>
+#include <queue>
+
+#include <boost/multi_array.hpp>
+#include <boost/program_options.hpp>
+#include <boost/program_options/variables_map.hpp>
+
+#include "array2d.h"
+#include "base_measures.h"
+#include "monotonic_pseg.h"
+#include "conditional_pseg.h"
+#include "trule.h"
+#include "tdict.h"
+#include "stringlib.h"
+#include "filelib.h"
+#include "dict.h"
+#include "sampler.h"
+#include "ccrp_nt.h"
+#include "corpus.h"
+#include "ngram_base.h"
+
+using namespace std;
+using namespace tr1;
+namespace po = boost::program_options;
+
+void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
+ po::options_description opts("Configuration options");
+ opts.add_options()
+ ("samples,s",po::value<unsigned>()->default_value(1000),"Number of samples")
+ ("input,i",po::value<string>(),"Read parallel data from")
+ ("random_seed,S",po::value<uint32_t>(), "Random seed");
+ po::options_description clo("Command line options");
+ clo.add_options()
+ ("config", po::value<string>(), "Configuration file")
+ ("help,h", "Print this help message and exit");
+ po::options_description dconfig_options, dcmdline_options;
+ dconfig_options.add(opts);
+ dcmdline_options.add(opts).add(clo);
+
+ po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
+ if (conf->count("config")) {
+ ifstream config((*conf)["config"].as<string>().c_str());
+ po::store(po::parse_config_file(config, dconfig_options), *conf);
+ }
+ po::notify(*conf);
+
+ if (conf->count("help") || (conf->count("input") == 0)) {
+ cerr << dcmdline_options << endl;
+ exit(1);
+ }
+}
+
+shared_ptr<MT19937> prng;
+
+struct LexicalAlignment {
+ unsigned char src_index;
+ bool is_transliteration;
+ vector<pair<short, short> > derivation;
+};
+
+struct AlignedSentencePair {
+ vector<WordID> src;
+ vector<WordID> trg;
+ vector<LexicalAlignment> a;
+ Array2D<short> posterior;
+};
+
+struct HierarchicalUnigramBase {
+ explicit HierarchicalUnigramBase(const unsigned vocab_e_size) : r(5,5), u0(1.0 / vocab_e_size) {}
+
+ // return p0 of rule.e_
+ prob_t operator()(const TRule& rule) const {
+ prob_t p = prob_t::One();
+ prob_t q;
+ for (unsigned i = 0; i < rule.e_.size(); ++i) {
+ q.logeq(r.logprob(rule.e_[i], log(u0)));
+ p *= q;
+ }
+ q.logeq(r.logprob(TD::Convert("</s>"), log(u0)));
+ p *= q;
+ return p;
+ }
+
+ void Increment(const TRule& rule) {
+ for (unsigned i = 0; i < rule.e_.size(); ++i)
+ r.increment(rule.e_[i]);
+ r.increment(TD::Convert("</s>"));
+ }
+
+ void Decrement(const TRule& rule) {
+ for (unsigned i = 0; i < rule.e_.size(); ++i)
+ r.decrement(rule.e_[i]);
+ r.decrement(TD::Convert("</s>"));
+ }
+
+ CCRP_NoTable<WordID> r;
+ prob_t u0;
+};
+
+struct HierarchicalWordBase {
+ explicit HierarchicalWordBase(const unsigned vocab_e_size) :
+ base(prob_t::One()), r(15,15), u0(-log(vocab_e_size)) {}
+
+ void ResampleHyperparameters(MT19937* rng) {
+ r.resample_hyperparameters(rng);
+ }
+
+ inline double logp0(const vector<WordID>& s) const {
+ return s.size() * u0;
+ }
+
+ // return p0 of rule.e_
+ prob_t operator()(const TRule& rule) const {
+ prob_t p; p.logeq(r.logprob(rule.e_, logp0(rule.e_)));
+ return p;
+ }
+
+ void Increment(const TRule& rule) {
+ if (r.increment(rule.e_)) {
+ prob_t p; p.logeq(logp0(rule.e_));
+ base *= p;
+ }
+ }
+
+ void Decrement(const TRule& rule) {
+ if (r.decrement(rule.e_)) {
+ prob_t p; p.logeq(logp0(rule.e_));
+ base /= p;
+ }
+ }
+
+ prob_t Likelihood() const {
+ prob_t p; p.logeq(r.log_crp_prob());
+ p *= base;
+ return p;
+ }
+
+ void Summary() const {
+ cerr << "NUMBER OF CUSTOMERS: " << r.num_customers() << endl;
+ for (CCRP_NoTable<vector<WordID> >::const_iterator it = r.begin(); it != r.end(); ++it)
+ cerr << " " << it->second << '\t' << TD::GetString(it->first) << endl;
+ }
+
+ prob_t base;
+ CCRP_NoTable<vector<WordID> > r;
+ const double u0;
+};
+
+struct BasicLexicalAlignment {
+ explicit BasicLexicalAlignment(const vector<vector<WordID> >& lets,
+ const unsigned letters_e,
+ vector<AlignedSentencePair>* corp) :
+ letters(lets),
+ corpus(*corp),
+ //up0("en.chars.1gram", letters_e),
+ //up0("en.words.1gram"),
+ up0(letters_e),
+ //up0("en.chars.2gram"),
+ tmodel(up0) {
+ }
+
+ void InstantiateRule(const WordID src,
+ const WordID trg,
+ TRule* rule) const {
+ static const WordID kX = TD::Convert("X") * -1;
+ rule->lhs_ = kX;
+ rule->e_ = letters[trg];
+ rule->f_ = letters[src];
+ }
+
+ void InitializeRandom() {
+ const WordID kNULL = TD::Convert("NULL");
+ cerr << "Initializing with random alignments ...\n";
+ for (unsigned i = 0; i < corpus.size(); ++i) {
+ AlignedSentencePair& asp = corpus[i];
+ asp.a.resize(asp.trg.size());
+ for (unsigned j = 0; j < asp.trg.size(); ++j) {
+ const unsigned char a_j = prng->next() * (1 + asp.src.size());
+ const WordID f_a_j = (a_j ? asp.src[a_j - 1] : kNULL);
+ TRule r;
+ InstantiateRule(f_a_j, asp.trg[j], &r);
+ asp.a[j].is_transliteration = false;
+ asp.a[j].src_index = a_j;
+ if (tmodel.IncrementRule(r))
+ up0.Increment(r);
+ }
+ }
+ cerr << " LLH = " << Likelihood() << endl;
+ }
+
+ prob_t Likelihood() const {
+ prob_t p = tmodel.Likelihood();
+ p *= up0.Likelihood();
+ return p;
+ }
+
+ void ResampleHyperparemeters() {
+ cerr << " LLH_prev = " << Likelihood() << flush;
+ tmodel.ResampleHyperparameters(&*prng);
+ up0.ResampleHyperparameters(&*prng);
+ cerr << "\tLLH_post = " << Likelihood() << endl;
+ }
+
+ void ResampleCorpus();
+
+ const vector<vector<WordID> >& letters; // spelling dictionary
+ vector<AlignedSentencePair>& corpus;
+ //PhraseConditionalUninformativeBase up0;
+ //PhraseConditionalUninformativeUnigramBase up0;
+ //UnigramWordBase up0;
+ //HierarchicalUnigramBase up0;
+ HierarchicalWordBase up0;
+ //CompletelyUniformBase up0;
+ //FixedNgramBase up0;
+ //ConditionalTranslationModel<PhraseConditionalUninformativeBase> tmodel;
+ //ConditionalTranslationModel<PhraseConditionalUninformativeUnigramBase> tmodel;
+ //ConditionalTranslationModel<UnigramWordBase> tmodel;
+ //ConditionalTranslationModel<HierarchicalUnigramBase> tmodel;
+ ConditionalTranslationModel<HierarchicalWordBase> tmodel;
+ //ConditionalTranslationModel<FixedNgramBase> tmodel;
+ //ConditionalTranslationModel<CompletelyUniformBase> tmodel;
+};
+
+void BasicLexicalAlignment::ResampleCorpus() {
+ static const WordID kNULL = TD::Convert("NULL");
+ for (unsigned i = 0; i < corpus.size(); ++i) {
+ AlignedSentencePair& asp = corpus[i];
+ SampleSet<prob_t> ss; ss.resize(asp.src.size() + 1);
+ for (unsigned j = 0; j < asp.trg.size(); ++j) {
+ TRule r;
+ unsigned char& a_j = asp.a[j].src_index;
+ WordID f_a_j = (a_j ? asp.src[a_j - 1] : kNULL);
+ InstantiateRule(f_a_j, asp.trg[j], &r);
+ if (tmodel.DecrementRule(r))
+ up0.Decrement(r);
+
+ for (unsigned prop_a_j = 0; prop_a_j <= asp.src.size(); ++prop_a_j) {
+ const WordID prop_f = (prop_a_j ? asp.src[prop_a_j - 1] : kNULL);
+ InstantiateRule(prop_f, asp.trg[j], &r);
+ ss[prop_a_j] = tmodel.RuleProbability(r);
+ }
+ a_j = prng->SelectSample(ss);
+ f_a_j = (a_j ? asp.src[a_j - 1] : kNULL);
+ InstantiateRule(f_a_j, asp.trg[j], &r);
+ if (tmodel.IncrementRule(r))
+ up0.Increment(r);
+ }
+ }
+ cerr << " LLH = " << tmodel.Likelihood() << endl;
+}
+
+void ExtractLetters(const set<WordID>& v, vector<vector<WordID> >* l, set<WordID>* letset = NULL) {
+ for (set<WordID>::const_iterator it = v.begin(); it != v.end(); ++it) {
+ vector<WordID>& letters = (*l)[*it];
+ if (letters.size()) continue; // if e and f have the same word
+
+ const string& w = TD::Convert(*it);
+
+ size_t cur = 0;
+ while (cur < w.size()) {
+ const size_t len = UTF8Len(w[cur]);
+ letters.push_back(TD::Convert(w.substr(cur, len)));
+ if (letset) letset->insert(letters.back());
+ cur += len;
+ }
+ }
+}
+
+void Debug(const AlignedSentencePair& asp) {
+ cerr << TD::GetString(asp.src) << endl << TD::GetString(asp.trg) << endl;
+ Array2D<bool> a(asp.src.size(), asp.trg.size());
+ for (unsigned j = 0; j < asp.trg.size(); ++j)
+ if (asp.a[j].src_index) a(asp.a[j].src_index - 1, j) = true;
+ cerr << a << endl;
+}
+
+void AddSample(AlignedSentencePair* asp) {
+ for (unsigned j = 0; j < asp->trg.size(); ++j)
+ asp->posterior(asp->a[j].src_index, j)++;
+}
+
+void WriteAlignments(const AlignedSentencePair& asp) {
+ bool first = true;
+ for (unsigned j = 0; j < asp.trg.size(); ++j) {
+ int src_index = -1;
+ int mc = -1;
+ for (unsigned i = 0; i <= asp.src.size(); ++i) {
+ if (asp.posterior(i, j) > mc) {
+ mc = asp.posterior(i, j);
+ src_index = i;
+ }
+ }
+
+ if (src_index) {
+ if (first) first = false; else cout << ' ';
+ cout << (src_index - 1) << '-' << j;
+ }
+ }
+ cout << endl;
+}
+
+int main(int argc, char** argv) {
+ po::variables_map conf;
+ InitCommandLine(argc, argv, &conf);
+
+ if (conf.count("random_seed"))
+ prng.reset(new MT19937(conf["random_seed"].as<uint32_t>()));
+ else
+ prng.reset(new MT19937);
+// MT19937& rng = *prng;
+
+ vector<vector<int> > corpuse, corpusf;
+ set<int> vocabe, vocabf;
+ corpus::ReadParallelCorpus(conf["input"].as<string>(), &corpusf, &corpuse, &vocabf, &vocabe);
+ cerr << "f-Corpus size: " << corpusf.size() << " sentences\n";
+ cerr << "f-Vocabulary size: " << vocabf.size() << " types\n";
+ cerr << "f-Corpus size: " << corpuse.size() << " sentences\n";
+ cerr << "f-Vocabulary size: " << vocabe.size() << " types\n";
+ assert(corpusf.size() == corpuse.size());
+
+ vector<AlignedSentencePair> corpus(corpuse.size());
+ for (unsigned i = 0; i < corpuse.size(); ++i) {
+ corpus[i].src.swap(corpusf[i]);
+ corpus[i].trg.swap(corpuse[i]);
+ corpus[i].posterior.resize(corpus[i].src.size() + 1, corpus[i].trg.size());
+ }
+ corpusf.clear(); corpuse.clear();
+
+ vocabf.insert(TD::Convert("NULL"));
+ vector<vector<WordID> > letters(TD::NumWords());
+ set<WordID> letset;
+ ExtractLetters(vocabe, &letters, &letset);
+ ExtractLetters(vocabf, &letters, NULL);
+ letters[TD::Convert("NULL")].clear();
+
+ BasicLexicalAlignment x(letters, letset.size(), &corpus);
+ x.InitializeRandom();
+ const unsigned samples = conf["samples"].as<unsigned>();
+ for (int i = 0; i < samples; ++i) {
+ for (int j = 431; j < 433; ++j) Debug(corpus[j]);
+ cerr << i << "\t" << x.tmodel.r.size() << "\t";
+ if (i % 10 == 0) x.ResampleHyperparemeters();
+ x.ResampleCorpus();
+ if (i > (samples / 5) && (i % 10 == 9)) for (int j = 0; j < corpus.size(); ++j) AddSample(&corpus[j]);
+ }
+ for (unsigned i = 0; i < corpus.size(); ++i)
+ WriteAlignments(corpus[i]);
+ //ModelAndData posterior(x, &corpus, vocabe, vocabf);
+ x.tmodel.Summary();
+ x.up0.Summary();
+
+ //posterior.Sample();
+
+ return 0;
+}
diff --git a/gi/pf/base_measures.cc b/gi/pf/base_measures.cc
index 8adb37d7..97b4e698 100644
--- a/gi/pf/base_measures.cc
+++ b/gi/pf/base_measures.cc
@@ -6,6 +6,32 @@
using namespace std;
+prob_t PhraseConditionalUninformativeUnigramBase::p0(const vector<WordID>& vsrc,
+ const vector<WordID>& vtrg,
+ int start_src, int start_trg) const {
+ const int flen = vsrc.size() - start_src;
+ const int elen = vtrg.size() - start_trg;
+ prob_t p;
+ p.logeq(log_poisson(elen, flen + 0.01)); // elen | flen ~Pois(flen + 0.01)
+ //p.logeq(log_poisson(elen, 1)); // elen | flen ~Pois(flen + 0.01)
+ for (int i = 0; i < elen; ++i)
+ p *= u(vtrg[i + start_trg]); // draw e_i ~Uniform
+ return p;
+}
+
+prob_t PhraseConditionalUninformativeBase::p0(const vector<WordID>& vsrc,
+ const vector<WordID>& vtrg,
+ int start_src, int start_trg) const {
+ const int flen = vsrc.size() - start_src;
+ const int elen = vtrg.size() - start_trg;
+ prob_t p;
+ //p.logeq(log_poisson(elen, flen + 0.01)); // elen | flen ~Pois(flen + 0.01)
+ p.logeq(log_poisson(elen, 1)); // elen | flen ~Pois(flen + 0.01)
+ for (int i = 0; i < elen; ++i)
+ p *= kUNIFORM_TARGET; // draw e_i ~Uniform
+ return p;
+}
+
void Model1::LoadModel1(const string& fname) {
cerr << "Loading Model 1 parameters from " << fname << " ..." << endl;
ReadFile rf(fname);
diff --git a/gi/pf/base_measures.h b/gi/pf/base_measures.h
index 7ce7e2e6..fbd1c3ad 100644
--- a/gi/pf/base_measures.h
+++ b/gi/pf/base_measures.h
@@ -7,6 +7,7 @@
#include <cmath>
#include <iostream>
+#include "unigrams.h"
#include "trule.h"
#include "prob.h"
#include "tdict.h"
@@ -49,6 +50,51 @@ struct Model1 {
std::vector<std::map<WordID, prob_t> > ttable;
};
+struct CompletelyUniformBase {
+ explicit CompletelyUniformBase(const unsigned ves) : kUNIFORM(1.0 / ves) {}
+ prob_t operator()(const TRule&) const {
+ return kUNIFORM;
+ }
+ const prob_t kUNIFORM;
+};
+
+struct UnigramWordBase {
+ explicit UnigramWordBase(const std::string& fname) : un(fname) {}
+ prob_t operator()(const TRule& r) const {
+ return un(r.e_);
+ }
+ const UnigramWordModel un;
+};
+
+struct PhraseConditionalUninformativeBase {
+ explicit PhraseConditionalUninformativeBase(const unsigned vocab_e_size) :
+ kUNIFORM_TARGET(1.0 / vocab_e_size) {
+ assert(vocab_e_size > 0);
+ }
+
+ // return p0 of rule.e_ | rule.f_
+ prob_t operator()(const TRule& rule) const {
+ return p0(rule.f_, rule.e_, 0, 0);
+ }
+
+ prob_t p0(const std::vector<WordID>& vsrc, const std::vector<WordID>& vtrg, int start_src, int start_trg) const;
+
+ const prob_t kUNIFORM_TARGET;
+};
+
+struct PhraseConditionalUninformativeUnigramBase {
+ explicit PhraseConditionalUninformativeUnigramBase(const std::string& file, const unsigned vocab_e_size) : u(file, vocab_e_size) {}
+
+ // return p0 of rule.e_ | rule.f_
+ prob_t operator()(const TRule& rule) const {
+ return p0(rule.f_, rule.e_, 0, 0);
+ }
+
+ prob_t p0(const std::vector<WordID>& vsrc, const std::vector<WordID>& vtrg, int start_src, int start_trg) const;
+
+ const UnigramModel u;
+};
+
struct PhraseConditionalBase {
explicit PhraseConditionalBase(const Model1& m1, const double m1mixture, const unsigned vocab_e_size) :
model1(m1),
@@ -83,7 +129,7 @@ struct PhraseJointBase {
assert(vocab_e_size > 0);
}
- // return p0 of rule.e_ | rule.f_
+ // return p0 of rule.e_ , rule.f_
prob_t operator()(const TRule& rule) const {
return p0(rule.f_, rule.e_, 0, 0);
}
@@ -113,7 +159,7 @@ struct PhraseJointBase_BiDir {
assert(vocab_e_size > 0);
}
- // return p0 of rule.e_ | rule.f_
+ // return p0 of rule.e_ , rule.f_
prob_t operator()(const TRule& rule) const {
return p0(rule.f_, rule.e_, 0, 0);
}
diff --git a/gi/pf/itg.cc b/gi/pf/itg.cc
index ac3c16a3..a38fe672 100644
--- a/gi/pf/itg.cc
+++ b/gi/pf/itg.cc
@@ -27,10 +27,67 @@ ostream& operator<<(ostream& os, const vector<WordID>& p) {
return os << ']';
}
-double log_poisson(unsigned x, const double& lambda) {
- assert(lambda > 0.0);
- return log(lambda) * x - lgamma(x + 1) - lambda;
-}
+struct UnigramModel {
+ explicit UnigramModel(const string& fname, unsigned vocab_size, double p0null = 0.05) :
+ use_uniform_(fname.size() == 0),
+ p0null_(p0null),
+ uniform_((1.0 - p0null) / vocab_size),
+ probs_(TD::NumWords() + 1) {
+ if (fname.size() > 0) LoadUnigrams(fname);
+ probs_[0] = p0null_;
+ }
+
+//
+// \data\
+// ngram 1=9295
+//
+// \1-grams:
+// -3.191193 "
+
+ void LoadUnigrams(const string& fname) {
+ cerr << "Loading unigram probabilities from " << fname << " ..." << endl;
+ ReadFile rf(fname);
+ string line;
+ istream& in = *rf.stream();
+ assert(in);
+ getline(in, line);
+ assert(line.empty());
+ getline(in, line);
+ assert(line == "\\data\\");
+ getline(in, line);
+ size_t pos = line.find("ngram 1=");
+ assert(pos == 0);
+ assert(line.size() > 8);
+ const size_t num_unigrams = atoi(&line[8]);
+ getline(in, line);
+ assert(line.empty());
+ getline(in, line);
+ assert(line == "\\1-grams:");
+ for (size_t i = 0; i < num_unigrams; ++i) {
+ getline(in, line);
+ assert(line.size() > 0);
+ pos = line.find('\t');
+ assert(pos > 0);
+ assert(pos + 1 < line.size());
+ const WordID w = TD::Convert(line.substr(pos + 1));
+ line[pos] = 0;
+ float p = atof(&line[0]);
+ const prob_t pnon_null(1.0 - p0null_.as_float());
+ if (w < probs_.size()) probs_[w].logeq(p * log(10) + log(pnon_null)); else abort();
+ }
+ }
+
+ const prob_t& operator()(const WordID& w) const {
+ if (!w) return p0null_;
+ if (use_uniform_) return uniform_;
+ return probs_[w];
+ }
+
+ const bool use_uniform_;
+ const prob_t p0null_;
+ const prob_t uniform_;
+ vector<prob_t> probs_;
+};
struct Model1 {
explicit Model1(const string& fname) :
@@ -89,11 +146,11 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
("samples,s",po::value<unsigned>()->default_value(1000),"Number of samples")
("particles,p",po::value<unsigned>()->default_value(25),"Number of particles")
("input,i",po::value<string>(),"Read parallel data from")
- ("max_src_phrase",po::value<unsigned>()->default_value(7),"Maximum length of source language phrases")
- ("max_trg_phrase",po::value<unsigned>()->default_value(7),"Maximum length of target language phrases")
("model1,m",po::value<string>(),"Model 1 parameters (used in base distribution)")
("inverse_model1,M",po::value<string>(),"Inverse Model 1 parameters (used in backward estimate)")
("model1_interpolation_weight",po::value<double>()->default_value(0.95),"Mixing proportion of model 1 with uniform target distribution")
+ ("src_unigram,u",po::value<string>()->default_value(""),"Source unigram distribution; empty for uniform")
+ ("trg_unigram,U",po::value<string>()->default_value(""),"Target unigram distribution; empty for uniform")
("random_seed,S",po::value<uint32_t>(), "Random seed");
po::options_description clo("Command line options");
clo.add_options()
@@ -165,11 +222,11 @@ void ReadParallelCorpus(const string& filename,
int main(int argc, char** argv) {
po::variables_map conf;
InitCommandLine(argc, argv, &conf);
- const size_t kMAX_TRG_PHRASE = conf["max_trg_phrase"].as<unsigned>();
- const size_t kMAX_SRC_PHRASE = conf["max_src_phrase"].as<unsigned>();
const unsigned particles = conf["particles"].as<unsigned>();
const unsigned samples = conf["samples"].as<unsigned>();
-
+ TD::Convert("<s>");
+ TD::Convert("</s>");
+ TD::Convert("<unk>");
if (!conf.count("model1")) {
cerr << argv[0] << "Please use --model1 to specify model 1 parameters\n";
return 1;
@@ -188,23 +245,28 @@ int main(int argc, char** argv) {
cerr << "F-corpus size: " << corpusf.size() << " sentences\t (" << vocabf.size() << " word types)\n";
cerr << "E-corpus size: " << corpuse.size() << " sentences\t (" << vocabe.size() << " word types)\n";
assert(corpusf.size() == corpuse.size());
+ UnigramModel src_unigram(conf["src_unigram"].as<string>(), vocabf.size());
+ UnigramModel trg_unigram(conf["trg_unigram"].as<string>(), vocabe.size());
+ const prob_t kHALF(0.5);
+ const string kEMPTY = "NULL";
const int kLHS = -TD::Convert("X");
Model1 m1(conf["model1"].as<string>());
Model1 invm1(conf["inverse_model1"].as<string>());
for (int si = 0; si < conf["samples"].as<unsigned>(); ++si) {
cerr << '.' << flush;
for (int ci = 0; ci < corpusf.size(); ++ci) {
- const vector<WordID>& src = corpusf[ci];
const vector<WordID>& trg = corpuse[ci];
- for (int i = 0; i < src.size(); ++i) {
- for (int j = 0; j < trg.size(); ++j) {
- const int eff_max_src = min(src.size() - i, kMAX_SRC_PHRASE);
- for (int k = 0; k < eff_max_src; ++k) {
- const int eff_max_trg = (k == 0 ? 1 : min(trg.size() - j, kMAX_TRG_PHRASE));
- for (int l = 0; l < eff_max_trg; ++l) {
- }
- }
+ const vector<WordID>& src = corpusf[ci];
+ for (int i = 0; i <= trg.size(); ++i) {
+ const WordID e_i = i > 0 ? trg[i-1] : 0;
+ for (int j = 0; j <= src.size(); ++j) {
+ const WordID f_j = j > 0 ? src[j-1] : 0;
+ if (e_i == 0 && f_j == 0) continue;
+ prob_t je = kHALF * src_unigram(f_j) * m1(f_j,e_i) + kHALF * trg_unigram(e_i) * invm1(e_i,f_j);
+ cerr << "p( " << (e_i ? TD::Convert(e_i) : kEMPTY) << " , " << (f_j ? TD::Convert(f_j) : kEMPTY) << " ) = " << log(je) << endl;
+ if (e_i && f_j)
+ cout << "[X] ||| " << TD::Convert(f_j) << " ||| " << TD::Convert(e_i) << " ||| LogProb=" << log(je) << endl;
}
}
}
diff --git a/gi/pf/unigrams.cc b/gi/pf/unigrams.cc
new file mode 100644
index 00000000..40829775
--- /dev/null
+++ b/gi/pf/unigrams.cc
@@ -0,0 +1,80 @@
+#include "unigrams.h"
+
+#include <string>
+#include <cmath>
+
+#include "stringlib.h"
+#include "filelib.h"
+
+using namespace std;
+
+void UnigramModel::LoadUnigrams(const string& fname) {
+ cerr << "Loading unigram probabilities from " << fname << " ..." << endl;
+ ReadFile rf(fname);
+ string line;
+ istream& in = *rf.stream();
+ assert(in);
+ getline(in, line);
+ assert(line.empty());
+ getline(in, line);
+ assert(line == "\\data\\");
+ getline(in, line);
+ size_t pos = line.find("ngram 1=");
+ assert(pos == 0);
+ assert(line.size() > 8);
+ const size_t num_unigrams = atoi(&line[8]);
+ getline(in, line);
+ assert(line.empty());
+ getline(in, line);
+ assert(line == "\\1-grams:");
+ for (size_t i = 0; i < num_unigrams; ++i) {
+ getline(in, line);
+ assert(line.size() > 0);
+ pos = line.find('\t');
+ assert(pos > 0);
+ assert(pos + 1 < line.size());
+ const WordID w = TD::Convert(line.substr(pos + 1));
+ line[pos] = 0;
+ float p = atof(&line[0]);
+ if (w < probs_.size()) probs_[w].logeq(p * log(10)); else cerr << "WARNING: don't know about '" << TD::Convert(w) << "'\n";
+ }
+}
+
+void UnigramWordModel::LoadUnigrams(const string& fname) {
+ cerr << "Loading unigram probabilities from " << fname << " ..." << endl;
+ ReadFile rf(fname);
+ string line;
+ istream& in = *rf.stream();
+ assert(in);
+ getline(in, line);
+ assert(line.empty());
+ getline(in, line);
+ assert(line == "\\data\\");
+ getline(in, line);
+ size_t pos = line.find("ngram 1=");
+ assert(pos == 0);
+ assert(line.size() > 8);
+ const size_t num_unigrams = atoi(&line[8]);
+ getline(in, line);
+ assert(line.empty());
+ getline(in, line);
+ assert(line == "\\1-grams:");
+ for (size_t i = 0; i < num_unigrams; ++i) {
+ getline(in, line);
+ assert(line.size() > 0);
+ pos = line.find('\t');
+ assert(pos > 0);
+ assert(pos + 1 < line.size());
+ size_t cur = pos + 1;
+ vector<WordID> w;
+ while (cur < line.size()) {
+ const size_t len = UTF8Len(line[cur]);
+ w.push_back(TD::Convert(line.substr(cur, len)));
+ cur += len;
+ }
+ line[pos] = 0;
+ float p = atof(&line[0]);
+ probs_[w].logeq(p * log(10.0));
+ }
+}
+
diff --git a/gi/pf/unigrams.h b/gi/pf/unigrams.h
new file mode 100644
index 00000000..1660d1ed
--- /dev/null
+++ b/gi/pf/unigrams.h
@@ -0,0 +1,69 @@
+#ifndef _UNIGRAMS_H_
+#define _UNIGRAMS_H_
+
+#include <vector>
+#include <string>
+#include <tr1/unordered_map>
+#include <boost/functional.hpp>
+
+#include "wordid.h"
+#include "prob.h"
+#include "tdict.h"
+
+struct UnigramModel {
+ explicit UnigramModel(const std::string& fname, unsigned vocab_size) :
+ use_uniform_(fname.size() == 0),
+ uniform_(1.0 / vocab_size),
+ probs_() {
+ if (fname.size() > 0) {
+ probs_.resize(TD::NumWords() + 1);
+ LoadUnigrams(fname);
+ }
+ }
+
+ const prob_t& operator()(const WordID& w) const {
+ assert(w);
+ if (use_uniform_) return uniform_;
+ return probs_[w];
+ }
+
+ private:
+ void LoadUnigrams(const std::string& fname);
+
+ const bool use_uniform_;
+ const prob_t uniform_;
+ std::vector<prob_t> probs_;
+};
+
+
+// reads an ARPA unigram file and converts words like 'cat' into a string 'c a t'
+struct UnigramWordModel {
+ explicit UnigramWordModel(const std::string& fname) :
+ use_uniform_(false),
+ uniform_(1.0),
+ probs_() {
+ LoadUnigrams(fname);
+ }
+
+ explicit UnigramWordModel(const unsigned vocab_size) :
+ use_uniform_(true),
+ uniform_(1.0 / vocab_size),
+ probs_() {}
+
+ const prob_t& operator()(const std::vector<WordID>& s) const {
+ if (use_uniform_) return uniform_;
+ const VectorProbHash::const_iterator it = probs_.find(s);
+ assert(it != probs_.end());
+ return it->second;
+ }
+
+ private:
+ void LoadUnigrams(const std::string& fname);
+
+ const bool use_uniform_;
+ const prob_t uniform_;
+ typedef std::tr1::unordered_map<std::vector<WordID>, prob_t, boost::hash<std::vector<WordID> > > VectorProbHash;
+ VectorProbHash probs_;
+};
+
+#endif