From c294227a928672bf108eed81106063a194c872ca Mon Sep 17 00:00:00 2001
From: Chris Dyer
Date: Thu, 12 Apr 2012 18:57:44 -0400
Subject: unigram pyp lm added

---
 utils/alias_sampler.h   |  47 ++++++++++++++
 utils/unigram_pyp_lm.cc | 168 ++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 215 insertions(+)
 create mode 100644 utils/alias_sampler.h
 create mode 100644 utils/unigram_pyp_lm.cc

diff --git a/utils/alias_sampler.h b/utils/alias_sampler.h
new file mode 100644
index 00000000..85da9944
--- /dev/null
+++ b/utils/alias_sampler.h
@@ -0,0 +1,47 @@
+#ifndef _ALIAS_SAMPLER_H_
+#define _ALIAS_SAMPLER_H_
+
+#include <vector>
+#include <limits>
+
+// R. A. Kronmal and A. V. Peterson, Jr. (1979) On the alias method for
+// generating random variables from a discrete distribution. In The American
+// Statistician, Vol. 33, No. 4. Pages 214--218.
+//
+// Intuition: a multinomial with N outcomes can be rewritten as a uniform
+// mixture of N Bernoulli distributions. The ith Bernoulli returns i with
+// probability F[i], otherwise it returns an "alias" value L[i]. The
+// constructor computes the F's and L's given an arbitrary multinomial p in
+// O(n) time and Draw returns samples in O(1) time.
+struct AliasSampler {
+  explicit AliasSampler(const std::vector<double>& p) :
+      cutoffs_(p.size()),
+      aliases_(p.size(), std::numeric_limits<unsigned>::max()) {
+    const unsigned N = p.size();
+    std::vector<unsigned> s,g;
+    for (unsigned i = 0; i < N; ++i) {
+      const double cutoff = cutoffs_[i] = N * p[i];
+      if (cutoff >= 1.0) g.push_back(i); else s.push_back(i);
+    }
+    while(!s.empty() && !g.empty()) {
+      const unsigned k = g.back();
+      const unsigned j = s.back();
+      aliases_[j] = k;
+      cutoffs_[k] -= 1.0 - cutoffs_[j];
+      s.pop_back();
+      if (cutoffs_[k] < 1.0) {
+        g.pop_back();
+        s.push_back(k);
+      }
+    }
+  }
+  template <typename Uniform01Generator>
+  unsigned Draw(Uniform01Generator& u01) const {
+    const unsigned n = u01() * cutoffs_.size();
+    if (u01() > cutoffs_[n]) return aliases_[n]; else return n;
+  }
+  std::vector<double> cutoffs_;    // F
+  std::vector<unsigned> aliases_;  // L
+};
+
+#endif
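A quick usage sketch of the sampler above: build it from a normalized multinomial and call Draw with any functor returning doubles in [0,1) (the Uniform01 wrapper below is illustrative and not part of this patch; in the repository, cdec's MT19937 is passed instead). Drawing many samples and tallying frequencies is a cheap sanity check that the F/L tables were built correctly.

    #include <cstdlib>
    #include <iostream>
    #include <vector>
    #include "alias_sampler.h"

    // Illustrative uniform [0,1) generator; any functor with a
    // double operator() in [0,1) can drive AliasSampler::Draw.
    struct Uniform01 {
      double operator()() const { return std::rand() / (RAND_MAX + 1.0); }
    };

    int main() {
      std::vector<double> p;  // must be normalized
      p.push_back(0.5); p.push_back(0.3); p.push_back(0.2);
      AliasSampler as(p);
      Uniform01 u01;
      std::vector<int> counts(p.size(), 0);
      for (int i = 0; i < 100000; ++i) ++counts[as.Draw(u01)];  // O(1) per draw
      for (unsigned k = 0; k < counts.size(); ++k)
        std::cout << "p_hat[" << k << "] = " << counts[k] / 100000.0 << std::endl;
      return 0;
    }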
diff --git a/utils/unigram_pyp_lm.cc b/utils/unigram_pyp_lm.cc
new file mode 100644
index 00000000..510e8839
--- /dev/null
+++ b/utils/unigram_pyp_lm.cc
@@ -0,0 +1,168 @@
+#include <iostream>
+#include <tr1/memory>
+#include <queue>
+
+#include <boost/functional.hpp>
+#include <boost/program_options.hpp>
+#include <boost/program_options/variables_map.hpp>
+
+#include "corpus_tools.h"
+#include "m.h"
+#include "tdict.h"
+#include "sampler.h"
+#include "ccrp.h"
+
+// A not very memory-efficient implementation of a 1-gram LM based on PYPs
+// as described in Y.-W. Teh. (2006) A Hierarchical Bayesian Language Model
+// based on Pitman-Yor Processes. In Proc. ACL.
+
+using namespace std;
+using namespace tr1;
+namespace po = boost::program_options;
+
+boost::shared_ptr<MT19937> prng;
+
+void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
+  po::options_description opts("Configuration options");
+  opts.add_options()
+        ("samples,n",po::value<unsigned>()->default_value(50),"Number of samples")
+        ("train,i",po::value<string>(),"Training data file")
+        ("test,T",po::value<string>(),"Test data file")
+        ("discount_prior_a,a",po::value<double>()->default_value(1.0), "discount ~ Beta(a,b): a=this")
+        ("discount_prior_b,b",po::value<double>()->default_value(1.0), "discount ~ Beta(a,b): b=this")
+        ("strength_prior_s,s",po::value<double>()->default_value(1.0), "strength ~ Gamma(s,r): s=this")
+        ("strength_prior_r,r",po::value<double>()->default_value(1.0), "strength ~ Gamma(s,r): r=this")
+        ("random_seed,S",po::value<uint32_t>(), "Random seed");
+  po::options_description clo("Command line options");
+  clo.add_options()
+        ("config", po::value<string>(), "Configuration file")
+        ("help", "Print this help message and exit");
+  po::options_description dconfig_options, dcmdline_options;
+  dconfig_options.add(opts);
+  dcmdline_options.add(opts).add(clo);
+
+  po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
+  if (conf->count("config")) {
+    ifstream config((*conf)["config"].as<string>().c_str());
+    po::store(po::parse_config_file(config, dconfig_options), *conf);
+  }
+  po::notify(*conf);
+
+  if (conf->count("help") || (conf->count("train") == 0)) {
+    cerr << dcmdline_options << endl;
+    exit(1);
+  }
+}
+
+// uniform base distribution (0-gram model)
+struct UniformWordModel {
+  explicit UniformWordModel(unsigned vocab_size) : p0(1.0 / vocab_size), draws() {}
+  void increment() { ++draws; }
+  void decrement() { --draws; assert(draws >= 0); }
+  double prob(WordID) const { return p0; } // all words have equal prob
+  double log_likelihood() const { return draws * log(p0); }
+  const double p0;
+  int draws;
+};
+
+// represents a unigram LM
+struct UnigramLM {
+  UnigramLM(unsigned vs, double da, double db, double ss, double sr) :
+      uniform_vocab(vs),
+      crp(da, db, ss, sr, 0.8, 1.0) {}
+  void increment(WordID w, MT19937* rng) {
+    const double backoff = uniform_vocab.prob(w);
+    if (crp.increment(w, backoff, rng))
+      uniform_vocab.increment();
+  }
+  void decrement(WordID w, MT19937* rng) {
+    if (crp.decrement(w, rng))
+      uniform_vocab.decrement();
+  }
+  double prob(WordID w) const {
+    const double backoff = uniform_vocab.prob(w);
+    return crp.prob(w, backoff);
+  }
+
+  double log_likelihood() const {
+    double llh = uniform_vocab.log_likelihood();
+    llh += crp.log_crp_prob();
+    return llh;
+  }
+
+  void resample_hyperparameters(MT19937* rng) {
+    crp.resample_hyperparameters(rng);
+  }
+
+  double discount_a, discount_b, strength_s, strength_r;
+  double d, strength;
+  UniformWordModel uniform_vocab;
+  CCRP<WordID> crp;
+};
+
+int main(int argc, char** argv) {
+  po::variables_map conf;
+
+  InitCommandLine(argc, argv, &conf);
+  const unsigned samples = conf["samples"].as<unsigned>();
+  if (conf.count("random_seed"))
+    prng.reset(new MT19937(conf["random_seed"].as<uint32_t>()));
+  else
+    prng.reset(new MT19937);
+  MT19937& rng = *prng;
+  vector<vector<WordID> > corpuse;
+  set<WordID> vocabe;
+  const WordID kEOS = TD::Convert("</s>");
+  cerr << "Reading corpus...\n";
+  CorpusTools::ReadFromFile(conf["train"].as<string>(), &corpuse, &vocabe);
+  cerr << "E-corpus size: " << corpuse.size() << " sentences\t (" << vocabe.size() << " word types)\n";
+  vector<vector<WordID> > test;
+  if (conf.count("test"))
+    CorpusTools::ReadFromFile(conf["test"].as<string>(), &test);
+  else
+    test = corpuse;
+  UnigramLM lm(vocabe.size(),
+               conf["discount_prior_a"].as<double>(),
+               conf["discount_prior_b"].as<double>(),
+               conf["strength_prior_s"].as<double>(),
+               conf["strength_prior_r"].as<double>());
+  for (int SS=0; SS < samples; ++SS) {
+    for (int ci = 0; ci < corpuse.size(); ++ci) {
+      const vector<WordID>& s = corpuse[ci];
+      for (int i = 0; i <= s.size(); ++i) {
+        WordID w = (i < s.size() ? s[i] : kEOS);
+        if (SS > 0) lm.decrement(w, &rng);
+        lm.increment(w, &rng);
+      }
+      if (SS > 0) lm.decrement(kEOS, &rng);
+      lm.increment(kEOS, &rng);
+    }
+    cerr << "LLH=" << lm.log_likelihood() << endl;
+    //if (SS % 10 == 9) lm.resample_hyperparameters(&rng);
+  }
+  double llh = 0;
+  unsigned cnt = 0;
+  unsigned oovs = 0;
+  for (int ci = 0; ci < test.size(); ++ci) {
+    const vector<WordID>& s = test[ci];
+    for (int i = 0; i <= s.size(); ++i) {
+      WordID w = (i < s.size() ? s[i] : kEOS);
+      double lp = log(lm.prob(w)) / log(2);
+      if (i < s.size() && vocabe.count(w) == 0) {
+        cerr << "**OOV ";
+        ++oovs;
+        lp = 0;
+      }
+      cerr << "p(" << TD::Convert(w) << ") = " << lp << endl;
+      llh -= lp;
+      cnt++;
+    }
+  }
+  cerr << "  Log_10 prob: " << (-llh * log(2) / log(10)) << endl;
+  cerr << "        Count: " << cnt << endl;
+  cerr << "         OOVs: " << oovs << endl;
+  cerr << "Cross-entropy: " << (llh / cnt) << endl;
+  cerr << "   Perplexity: " << pow(2, llh / cnt) << endl;
+  return 0;
+}
--
cgit v1.2.3
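The quantity crp.prob(w, backoff) in the patch above is the standard Pitman-Yor posterior predictive probability (Teh, 2006). A self-contained restatement of that formula follows; the counts are assumptions about the restaurant's internal state (customers and tables), not the actual CCRP API:

    #include <cmath>

    // Standard Pitman-Yor predictive probability. Given c_w customers
    // seated for word w, t_w tables labeled w, n total customers, T total
    // tables, discount d, strength theta, and base probability p0:
    //
    //   p(w) = (c_w - d*t_w + (theta + d*T) * p0) / (theta + n)
    //
    // Illustrative restatement only, not cdec's implementation.
    double pyp_predictive(double c_w, double t_w, double n, double T,
                          double d, double theta, double p0) {
      return (c_w - d * t_w + (theta + d * T) * p0) / (theta + n);
    }

With no observations (all counts zero) this reduces to p0, the uniform base distribution above; as counts grow, the empirical distribution dominates while the discount d reserves mass for unseen events.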
From d5a2a9c3bf18c1e414f79a757c1662fe422e2f5c Mon Sep 17 00:00:00 2001
From: Chris Dyer
Date: Mon, 16 Apr 2012 14:11:02 -0400
Subject: switch to log domain for matrix operations

---
 rst_parser/arc_factored.h            |  2 +-
 rst_parser/arc_factored_marginals.cc | 24 +++++++++++++-----------
 rst_parser/mst_train.cc              | 29 +++++++++++++++++++++--------
 rst_parser/rst_test.cc               | 16 +++++++++++++---
 utils/logval.h                       | 13 ++++++++++---
 5 files changed, 58 insertions(+), 26 deletions(-)

diff --git a/rst_parser/arc_factored.h b/rst_parser/arc_factored.h
index d9a0bb24..4de38b66 100644
--- a/rst_parser/arc_factored.h
+++ b/rst_parser/arc_factored.h
@@ -56,7 +56,7 @@ class ArcFactoredForest {
 
   // Reweight edges so that edge_prob is the edge's marginals
   // optionally returns log partition
-  void EdgeMarginals(double* p_log_z = NULL);
+  void EdgeMarginals(prob_t* p_log_z = NULL);
 
   // This may not return a tree
   void PickBestParentForEachWord(EdgeSubset* st) const;

diff --git a/rst_parser/arc_factored_marginals.cc b/rst_parser/arc_factored_marginals.cc
index 16360b0d..acb8102a 100644
--- a/rst_parser/arc_factored_marginals.cc
+++ b/rst_parser/arc_factored_marginals.cc
@@ -9,37 +9,39 @@ using namespace std;
 
 #if HAVE_EIGEN
 #include <Eigen/Dense>
-typedef Eigen::Matrix<double, Eigen::Dynamic, Eigen::Dynamic> ArcMatrix;
-typedef Eigen::Matrix<double, Eigen::Dynamic, 1> RootVector;
+typedef Eigen::Matrix<prob_t, Eigen::Dynamic, Eigen::Dynamic> ArcMatrix;
+typedef Eigen::Matrix<prob_t, Eigen::Dynamic, 1> RootVector;
 
-void ArcFactoredForest::EdgeMarginals(double *plog_z) {
+void ArcFactoredForest::EdgeMarginals(prob_t *plog_z) {
   ArcMatrix A(num_words_,num_words_);
   RootVector r(num_words_);
   for (int h = 0; h < num_words_; ++h) {
     for (int m = 0; m < num_words_; ++m) {
       if (h != m)
-        A(h,m) = edges_(h,m).edge_prob.as_float();
+        A(h,m) = edges_(h,m).edge_prob;
       else
-        A(h,m) = 0;
+        A(h,m) = prob_t::Zero();
     }
-    r(h) = root_edges_[h].edge_prob.as_float();
+    r(h) = root_edges_[h].edge_prob;
   }
 
   ArcMatrix L = -A;
   L.diagonal() = A.colwise().sum();
   L.row(0) = r;
   ArcMatrix Linv = L.inverse();
-  if (plog_z) *plog_z = log(Linv.determinant());
+  if (plog_z) *plog_z = Linv.determinant();
   RootVector rootMarginals = r.cwiseProduct(Linv.col(0));
+  static const prob_t ZERO(0);
+  static const prob_t ONE(1);
 //  ArcMatrix T = Linv;
   for (int h = 0; h < num_words_; ++h) {
     for (int m = 0; m < num_words_; ++m) {
-      const double marginal = (m == 0 ? 0.0 : 1.0) * A(h,m) * Linv(m,m) -
-                              (h == 0 ? 0.0 : 1.0) * A(h,m) * Linv(m,h);
-      edges_(h,m).edge_prob = prob_t(marginal);
+      const prob_t marginal = (m == 0 ? ZERO : ONE) * A(h,m) * Linv(m,m) -
+                              (h == 0 ? ZERO : ONE) * A(h,m) * Linv(m,h);
+      edges_(h,m).edge_prob = marginal;
 //      T(h,m) = marginal;
     }
-    root_edges_[h].edge_prob = prob_t(rootMarginals(h));
+    root_edges_[h].edge_prob = rootMarginals(h);
   }
 //  cerr << "ROOT MARGINALS: " << rootMarginals.transpose() << endl;
 //  cerr << "M:\n" << T << endl;
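The computation above is the Matrix-Tree theorem for rooted directed spanning trees, as popularized for dependency parsing in 2007 (Koo et al.; Smith & Smith): build the graph Laplacian from arc weights, replace row 0 with the root-attachment weights, and read the partition function off det(L) and the edge marginals off L^{-1}. A plain-double Eigen sketch of the same recipe, with illustrative names and the h==0/m==0 boundary corrections elided:

    #include <iostream>
    #include <Eigen/Dense>

    // Matrix-Tree sketch: A(h,m) is the weight of arc h->m, r(m) the weight
    // of attaching m directly to the root. det(L) is the total weight Z of
    // all rooted spanning trees; marginals fall out of the inverse.
    int main() {
      const int n = 3;
      Eigen::MatrixXd A(n, n);
      A << 0.0, 0.2, 0.3,
           0.4, 0.0, 0.6,
           0.7, 0.8, 0.0;
      Eigen::VectorXd r(n);
      r << 0.5, 0.1, 0.2;
      Eigen::MatrixXd L = -A;
      for (int m = 0; m < n; ++m) L(m, m) = A.col(m).sum();  // in-degree sums
      L.row(0) = r.transpose();       // root weights replace row 0
      const double Z = L.determinant();
      const Eigen::MatrixXd Linv = L.inverse();
      // marginal probability of arc h->m (h,m > 0):
      //   A(h,m) * (Linv(m,m) - Linv(m,h))
      std::cout << "Z = " << Z << "\n";
      std::cout << "p(2->1) = " << A(2,1) * (Linv(1,1) - Linv(1,2)) << "\n";
      return 0;
    }

The motivation for this commit is numerical: doing the determinant and inverse in plain doubles underflows on long sentences, so the matrix entries become prob_t (log-domain) values instead.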
diff --git a/rst_parser/mst_train.cc b/rst_parser/mst_train.cc
index b5114726..c5cab6ec 100644
--- a/rst_parser/mst_train.cc
+++ b/rst_parser/mst_train.cc
@@ -23,7 +23,9 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
   string cfg_file;
   opts.add_options()
         ("training_data,t",po::value<string>()->default_value("-"), "File containing training data (jsent format)")
-        ("feature_function,F",po::value<vector<string> >()->composing(), "feature function")
+        ("feature_function,F",po::value<vector<string> >()->composing(), "feature function (multiple permitted)")
+        ("weights,w",po::value<string>(), "Optional starting weights")
+        ("output_every_i_iterations,I",po::value<unsigned>()->default_value(1), "Write weights every I iterations")
         ("regularization_strength,C",po::value<double>()->default_value(1.0), "Regularization strength")
         ("correction_buffers,m", po::value<int>()->default_value(10), "LBFGS correction buffers");
   po::options_description clo("Command line options");
@@ -161,9 +163,13 @@ int main(int argc, char** argv) {
   if (flag) cerr << endl;
   //cerr << "EMP: " << empirical << endl; //DE
   vector<weight_t> weights(FD::NumFeats(), 0.0);
+  if (conf.count("weights"))
+    Weights::InitFromFile(conf["weights"].as<string>(), &weights);
   vector<weight_t> g(FD::NumFeats(), 0.0);
   cerr << "features initialized\noptimizing...\n";
   boost::shared_ptr<LBFGSOptimizer> o;
+  int every = corpus.size() / 20;
+  if (!every) ++every;
   o.reset(new LBFGSOptimizer(g.size(), conf["correction_buffers"].as<int>()));
   int iterations = 1000;
   for (int iter = 0; iter < iterations; ++iter) {
@@ -174,11 +180,12 @@ int main(int argc, char** argv) {
     double obj = -empirical.dot(weights);
     // SparseVector<double> mfm; //DE
     for (int i = 0; i < corpus.size(); ++i) {
+      if ((i + 1) % every == 0) cerr << '.' << flush;
       const int num_words = corpus[i].ts.words.size();
       forests[i].Reweight(weights);
-      double lz;
-      forests[i].EdgeMarginals(&lz);
-      obj -= lz;
+      prob_t z;
+      forests[i].EdgeMarginals(&z);
+      obj -= log(z);
       //cerr << "  O = " << (-corpus[i].features.dot(weights)) << " D=" << -lz << "  OO= " << (-corpus[i].features.dot(weights) - lz) << endl;
       //cerr << " ZZ = " << zz << endl;
       for (int h = -1; h < num_words; ++h) {
@@ -202,14 +209,20 @@ int main(int argc, char** argv) {
       gnorm += g[i]*g[i];
     ostringstream ll;
     ll << "ITER=" << (iter+1) << "\tOBJ=" << (obj+r) << "\t[F=" << obj << " R=" << r << "]\tGnorm=" << sqrt(gnorm);
-    cerr << endl << ll.str() << endl;
+    cerr << ' ' << ll.str().substr(ll.str().find('\t')+1) << endl;
     obj += r;
     assert(obj >= 0);
     o->Optimize(obj, g, &weights);
     Weights::ShowLargestFeatures(weights);
-    string sl = ll.str();
-    Weights::WriteToFile(o->HasConverged() ? "weights.final.gz" : "weights.cur.gz", weights, true, &sl);
-    if (o->HasConverged()) { cerr << "CONVERGED\n"; break; }
+    const bool converged = o->HasConverged();
+    const char* ofname = converged ? "weights.final.gz" : "weights.cur.gz";
+    if (converged || ((iter+1) % conf["output_every_i_iterations"].as<unsigned>()) == 0) {
+      cerr << "writing..." << flush;
+      const string sl = ll.str();
+      Weights::WriteToFile(ofname, weights, true, &sl);
+      cerr << "done" << endl;
+    }
+    if (converged) { cerr << "CONVERGED\n"; break; }
   }
   forests[0].Reweight(weights);
   TreeSampler ts(forests[0]);

diff --git a/rst_parser/rst_test.cc b/rst_parser/rst_test.cc
index 7e6fb2c1..3bb95759 100644
--- a/rst_parser/rst_test.cc
+++ b/rst_parser/rst_test.cc
@@ -2,6 +2,8 @@
 
 #include <iostream>
 
+#include <Eigen/Dense>
+
 using namespace std;
 
 int main(int argc, char** argv) {
@@ -28,11 +30,19 @@ int main(int argc, char** argv) {
   af(-1,2).edge_prob.logeq(9);
   EdgeSubset tree;
 //  af.MaximumEdgeSubset(&tree);
-  double lz;
-  af.EdgeMarginals(&lz);
-  cerr << "Z = " << lz << endl;
+  prob_t z;
+  af.EdgeMarginals(&z);
+  cerr << "Z = " << abs(z) << endl;
   af.PickBestParentForEachWord(&tree);
   cerr << tree << endl;
+  typedef Eigen::Matrix<prob_t, 2, 2> M3;
+  M3 A = M3::Zero();
+  A(0,0) = prob_t(1);
+  A(1,0) = prob_t(3);
+  A(0,1) = prob_t(2);
+  A(1,1) = prob_t(4);
+  prob_t det = A.determinant();
+  cerr << det.as_float() << endl;
   return 0;
 }

diff --git a/utils/logval.h b/utils/logval.h
index 8a59d0b1..ec1f6acd 100644
--- a/utils/logval.h
+++ b/utils/logval.h
@@ -30,8 +30,6 @@ class LogVal {
   LogVal(init_minus_1) : s_(true),v_(0) { }
   LogVal(init_1) : s_(),v_(0) { }
   LogVal(init_0) : s_(),v_(LOGVAL_LOG0) { }
-  explicit LogVal(int x) : s_(x<0), v_(s_ ? std::log(-x) : std::log(x)) {}
-  explicit LogVal(unsigned x) : s_(0), v_(std::log(x)) { }
   LogVal(double lnx,bool sign) : s_(sign),v_(lnx) {}
   LogVal(double lnx,init_lnx) : s_(),v_(lnx) {}
   static Self exp(T lnx) { return Self(lnx,false); }
@@ -126,7 +124,7 @@ class LogVal {
   }
 
   Self operator-() const {
-    return Self(v_,-s_);
+    return Self(v_,!s_);
   }
   void negate() { s_ = !s_; }
 
@@ -193,6 +191,15 @@ T log(const LogVal<T>& o) {
   return o.v_;
 }
 
+template <class T>
+LogVal<T> abs(const LogVal<T>& o) {
+  if (o.s_) {
+    LogVal<T> res = o;
+    res.s_ = false;
+    return res;
+  } else { return o; }
+}
+
 template <class T>
 LogVal<T> pow(const LogVal<T>& b, const T& e) {
   return b.pow(e);
--
cgit v1.2.3
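Why LogVal needs a sign bit (and an abs, and the operator-() fix above): determinants mix subtraction into otherwise log-domain arithmetic, so intermediate values must be representable as (sign, log|x|) pairs. A toy restatement of signed log-space addition, not cdec's implementation:

    #include <cmath>
    #include <cstdio>

    // Toy signed log-domain number: x = (s ? -1 : +1) * exp(v).
    struct SLog {
      bool s;    // sign bit: true means negative
      double v;  // natural log of |x|
    };

    // Log-sum-exp with sign handling: when signs differ this is a
    // subtraction of magnitudes, and the larger magnitude wins the sign.
    SLog add(SLog a, SLog b) {
      if (a.v < b.v) { SLog t = a; a = b; b = t; }  // ensure |a| >= |b|
      SLog r;
      r.s = a.s;
      if (a.s == b.s)
        r.v = a.v + log1p(exp(b.v - a.v));   // |a| + |b|
      else
        r.v = a.v + log1p(-exp(b.v - a.v));  // |a| - |b|
      return r;
    }

    int main() {
      SLog a = { false, log(3.0) };  //  3
      SLog b = { true,  log(5.0) };  // -5
      SLog c = add(a, b);            // expect -2
      printf("%c%g\n", c.s ? '-' : '+', exp(c.v));
      return 0;
    }

The rst_test.cc change above exercises exactly this: the determinant of [[1,2],[3,4]] is -2, which a sign-less log representation cannot express.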
From 4b38556c88c739de82b9c298261a262ec620280e Mon Sep 17 00:00:00 2001
From: Chris Dyer
Date: Mon, 16 Apr 2012 18:20:33 -0400
Subject: rst sampler

---
 rst_parser/Makefile.am     |   7 ++-
 rst_parser/dep_training.cc |  56 ++++++++++++++++++++
 rst_parser/dep_training.h  |  17 ++++++
 rst_parser/mst_train.cc    |  58 +--------------------
 rst_parser/rst.cc          |  56 +++++++++++++++-----
 rst_parser/rst.h           |   8 ++-
 rst_parser/rst_parse.cc    | 126 +++++++++++++++++++++++++++++++++++++++++++++
 utils/weights.cc           |   4 +-
 8 files changed, 260 insertions(+), 72 deletions(-)
 create mode 100644 rst_parser/dep_training.cc
 create mode 100644 rst_parser/dep_training.h
 create mode 100644 rst_parser/rst_parse.cc

diff --git a/rst_parser/Makefile.am b/rst_parser/Makefile.am
index 2b64b43a..6e884f53 100644
--- a/rst_parser/Makefile.am
+++ b/rst_parser/Makefile.am
@@ -1,5 +1,5 @@
 bin_PROGRAMS = \
-  mst_train
+  mst_train rst_parse
 
 noinst_PROGRAMS = \
   rst_test
@@ -8,11 +8,14 @@ TESTS = rst_test
 
 noinst_LIBRARIES = librst.a
 
-librst_a_SOURCES = arc_factored.cc arc_factored_marginals.cc rst.cc arc_ff.cc
+librst_a_SOURCES = arc_factored.cc arc_factored_marginals.cc rst.cc arc_ff.cc dep_training.cc
 
 mst_train_SOURCES = mst_train.cc
 mst_train_LDADD = librst.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a ../training/optimize.o -lz
 
+rst_parse_SOURCES = rst_parse.cc
+rst_parse_LDADD = librst.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz
+
 rst_test_SOURCES = rst_test.cc
 rst_test_LDADD = librst.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz

diff --git a/rst_parser/dep_training.cc b/rst_parser/dep_training.cc
new file mode 100644
index 00000000..de431ebc
--- /dev/null
+++ b/rst_parser/dep_training.cc
@@ -0,0 +1,56 @@
+#include "dep_training.h"
+
+#include <vector>
+#include <iostream>
+
+#include "stringlib.h"
+#include "filelib.h"
+#include "tdict.h"
+#include "picojson.h"
+
+using namespace std;
+
+void TrainingInstance::ReadTraining(const string& fname, vector<TrainingInstance>* corpus, int rank, int size) {
+  ReadFile rf(fname);
+  istream& in = *rf.stream();
+  string line;
+  string err;
+  int lc = 0;
+  bool flag = false;
+  while(getline(in, line)) {
+    ++lc;
+    if ((lc-1) % size != rank) continue;
+    if (rank == 0 && lc % 10 == 0) { cerr << '.' << flush; flag = true; }
+    if (rank == 0 && lc % 400 == 0) { cerr << " [" << lc << "]\n"; flag = false; }
+    size_t pos = line.rfind('\t');
+    assert(pos != string::npos);
+    picojson::value obj;
+    picojson::parse(obj, line.begin() + pos, line.end(), &err);
+    if (err.size() > 0) { cerr << "JSON parse error in " << lc << ": " << err << endl; abort(); }
+    corpus->push_back(TrainingInstance());
+    TrainingInstance& cur = corpus->back();
+    TaggedSentence& ts = cur.ts;
+    EdgeSubset& tree = cur.tree;
+    assert(obj.is<picojson::object>());
+    const picojson::object& d = obj.get<picojson::object>();
+    const picojson::array& ta = d.find("tokens")->second.get<picojson::array>();
+    for (unsigned i = 0; i < ta.size(); ++i) {
+      ts.words.push_back(TD::Convert(ta[i].get<picojson::array>()[0].get<string>()));
+      ts.pos.push_back(TD::Convert(ta[i].get<picojson::array>()[1].get<string>()));
+    }
+    const picojson::array& da = d.find("deps")->second.get<picojson::array>();
+    for (unsigned i = 0; i < da.size(); ++i) {
+      const picojson::array& thm = da[i].get<picojson::array>();
+      // get dep type here
+      short h = thm[2].get<double>();
+      short m = thm[1].get<double>();
+      if (h < 0)
+        tree.roots.push_back(m);
+      else
+        tree.h_m_pairs.push_back(make_pair(h,m));
+    }
+    //cerr << TD::GetString(ts.words) << endl << TD::GetString(ts.pos) << endl << tree << endl;
+  }
+  if (flag) cerr << "\nRead " << lc << " training instances\n";
+}

diff --git a/rst_parser/dep_training.h b/rst_parser/dep_training.h
new file mode 100644
index 00000000..73ffd298
--- /dev/null
+++ b/rst_parser/dep_training.h
@@ -0,0 +1,17 @@
+#ifndef _DEP_TRAINING_H_
+#define _DEP_TRAINING_H_
+
+#include <string>
+#include <vector>
+#include "arc_factored.h"
+#include "weights.h"
+
+struct TrainingInstance {
+  TaggedSentence ts;
+  EdgeSubset tree;
+  SparseVector<weight_t> features;
+  // reads a "Jsent" formatted dependency file
+  static void ReadTraining(const std::string& fname, std::vector<TrainingInstance>* corpus, int rank = 0, int size = 1);
+};
+
+#endif
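For orientation, each jsent line as consumed by ReadTraining above carries arbitrary text, a final tab, then a JSON object: "tokens" holds [word, POS] pairs, and each "deps" entry is read as [label, modifier, head], with head = -1 marking a root attachment (the label slot is skipped by the code, per its "get dep type here" comment). A hypothetical line consistent with that parsing, with <TAB> standing in for the tab character:

    the dog barks<TAB>{"tokens": [["the","DT"],["dog","NN"],["barks","VBZ"]], "deps": [["det",0,1],["nsubj",1,2],["root",2,-1]]}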
diff --git a/rst_parser/mst_train.cc b/rst_parser/mst_train.cc
index c5cab6ec..f0403d7e 100644
--- a/rst_parser/mst_train.cc
+++ b/rst_parser/mst_train.cc
@@ -10,10 +10,9 @@
 #include "stringlib.h"
 #include "filelib.h"
 #include "tdict.h"
-#include "picojson.h"
+#include "dep_training.h"
 #include "optimize.h"
 #include "weights.h"
-#include "rst.h"
 
 using namespace std;
 namespace po = boost::program_options;
@@ -47,56 +46,6 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
   }
 }
 
-struct TrainingInstance {
-  TaggedSentence ts;
-  EdgeSubset tree;
-  SparseVector<weight_t> features;
-};
-
-void ReadTraining(const string& fname, vector<TrainingInstance>* corpus, int rank = 0, int size = 1) {
-  ReadFile rf(fname);
-  istream& in = *rf.stream();
-  string line;
-  string err;
-  int lc = 0;
-  bool flag = false;
-  while(getline(in, line)) {
-    ++lc;
-    if ((lc-1) % size != rank) continue;
-    if (rank == 0 && lc % 10 == 0) { cerr << '.' << flush; flag = true; }
-    if (rank == 0 && lc % 400 == 0) { cerr << " [" << lc << "]\n"; flag = false; }
-    size_t pos = line.rfind('\t');
-    assert(pos != string::npos);
-    picojson::value obj;
-    picojson::parse(obj, line.begin() + pos, line.end(), &err);
-    if (err.size() > 0) { cerr << "JSON parse error in " << lc << ": " << err << endl; abort(); }
-    corpus->push_back(TrainingInstance());
-    TrainingInstance& cur = corpus->back();
-    TaggedSentence& ts = cur.ts;
-    EdgeSubset& tree = cur.tree;
-    assert(obj.is<picojson::object>());
-    const picojson::object& d = obj.get<picojson::object>();
-    const picojson::array& ta = d.find("tokens")->second.get<picojson::array>();
-    for (unsigned i = 0; i < ta.size(); ++i) {
-      ts.words.push_back(TD::Convert(ta[i].get<picojson::array>()[0].get<string>()));
-      ts.pos.push_back(TD::Convert(ta[i].get<picojson::array>()[1].get<string>()));
-    }
-    const picojson::array& da = d.find("deps")->second.get<picojson::array>();
-    for (unsigned i = 0; i < da.size(); ++i) {
-      const picojson::array& thm = da[i].get<picojson::array>();
-      // get dep type here
-      short h = thm[2].get<double>();
-      short m = thm[1].get<double>();
-      if (h < 0)
-        tree.roots.push_back(m);
-      else
-        tree.h_m_pairs.push_back(make_pair(h,m));
-    }
-    //cerr << TD::GetString(ts.words) << endl << TD::GetString(ts.pos) << endl << tree << endl;
-  }
-  if (flag) cerr << "\nRead " << lc << " training instances\n";
-}
-
 void AddFeatures(double prob, const SparseVector<double>& fmap, vector<double>* g) {
   for (SparseVector<double>::const_iterator it = fmap.begin(); it != fmap.end(); ++it)
     (*g)[it->first] += it->second * prob;
@@ -131,7 +80,7 @@ int main(int argc, char** argv) {
   vector<TrainingInstance> corpus;
   vector<boost::shared_ptr<ArcFeatureFunction> > ffs;
   ffs.push_back(boost::shared_ptr<ArcFeatureFunction>(new DistancePenalty("")));
-  ReadTraining(conf["training_data"].as<string>(), &corpus, rank, size);
+  TrainingInstance::ReadTraining(conf["training_data"].as<string>(), &corpus, rank, size);
   vector<ArcFactoredForest> forests(corpus.size());
   SparseVector<weight_t> empirical;
   bool flag = false;
@@ -224,9 +173,6 @@ int main(int argc, char** argv) {
     }
     if (converged) { cerr << "CONVERGED\n"; break; }
   }
-  forests[0].Reweight(weights);
-  TreeSampler ts(forests[0]);
-  EdgeSubset tt; ts.SampleRandomSpanningTree(&tt);
   return 0;
 }
diff --git a/rst_parser/rst.cc b/rst_parser/rst.cc
index c4ce898e..bc91330b 100644
--- a/rst_parser/rst.cc
+++ b/rst_parser/rst.cc
@@ -3,45 +3,77 @@ using namespace std;
 
 // David B. Wilson. Generating Random Spanning Trees More Quickly than the
 // Cover Time.
-
+// this is an awesome algorithm
 TreeSampler::TreeSampler(const ArcFactoredForest& af) : forest(af), usucc(af.size() + 1) {
-  // edges are directed from modifiers to heads, to the root
+  // edges are directed from modifiers to heads, and finally to the root
+  vector<double> p;
   for (int m = 1; m <= forest.size(); ++m) {
+#if USE_ALIAS_SAMPLER
+    p.clear();
+#else
     SampleSet<double>& ss = usucc[m];
-    for (int h = 0; h <= forest.size(); ++h)
-      ss.add(forest(h-1,m-1).edge_prob.as_float());
+#endif
+    double z = 0;
+    for (int h = 0; h <= forest.size(); ++h) {
+      double u = forest(h-1,m-1).edge_prob.as_float();
+      z += u;
+#if USE_ALIAS_SAMPLER
+      p.push_back(u);
+#else
+      ss.add(u);
+#endif
+    }
+#if USE_ALIAS_SAMPLER
+    for (int i = 0; i < p.size(); ++i) { p[i] /= z; }
+    usucc[m].Init(p);
+#endif
   }
 }
 
-void TreeSampler::SampleRandomSpanningTree(EdgeSubset* tree) {
-  MT19937 rng;
+void TreeSampler::SampleRandomSpanningTree(EdgeSubset* tree, MT19937* prng) {
+  MT19937& rng = *prng;
   const int r = 0;
   bool success = false;
   while (!success) {
     int roots = 0;
+    tree->h_m_pairs.clear();
+    tree->roots.clear();
     vector<int> next(forest.size() + 1, -1);
    vector<char> in_tree(forest.size() + 1, 0);
     in_tree[r] = 1;
-    for (int i = 0; i < forest.size(); ++i) {
+    //cerr << "Forest size: " << forest.size() << endl;
+    for (int i = 0; i <= forest.size(); ++i) {
+      //cerr << "Sampling starting at u=" << i << endl;
       int u = i;
       if (in_tree[u]) continue;
       while(!in_tree[u]) {
+#if USE_ALIAS_SAMPLER
+        next[u] = usucc[u].Draw(rng);
+#else
         next[u] = rng.SelectSample(usucc[u]);
+#endif
         u = next[u];
       }
       u = i;
-      cerr << (u-1);
+      //cerr << (u-1);
+      int prev = u-1;
       while(!in_tree[u]) {
         in_tree[u] = true;
         u = next[u];
-        cerr << " > " << (u-1);
-        if (u == r) { ++roots; }
+        //cerr << " > " << (u-1);
+        if (u == r) {
+          ++roots;
+          tree->roots.push_back(prev);
+        } else {
+          tree->h_m_pairs.push_back(make_pair(u-1,prev));
+        }
+        prev = u-1;
       }
-      cerr << endl;
+      //cerr << endl;
     }
     assert(roots > 0);
     if (roots > 1) {
-      cerr << "FAILURE\n";
+      //cerr << "FAILURE\n";
     } else {
       success = true;
     }

diff --git a/rst_parser/rst.h b/rst_parser/rst.h
index a269ff9b..8bf389f7 100644
--- a/rst_parser/rst.h
+++ b/rst_parser/rst.h
@@ -4,12 +4,18 @@
 #include <vector>
 #include "sampler.h"
 #include "arc_factored.h"
+#include "alias_sampler.h"
 
 struct TreeSampler {
   explicit TreeSampler(const ArcFactoredForest& af);
-  void SampleRandomSpanningTree(EdgeSubset* tree);
+  void SampleRandomSpanningTree(EdgeSubset* tree, MT19937* rng);
   const ArcFactoredForest& forest;
+#define USE_ALIAS_SAMPLER 1
+#if USE_ALIAS_SAMPLER
+  std::vector<AliasSampler> usucc;
+#else
   std::vector<SampleSet<double> > usucc;
+#endif
 };
 
 #endif
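Wilson's algorithm, for readers following the loops above: from each node not yet in the tree, take a random walk toward the tree, overwriting next[u] on every revisit (which implicitly erases any loop the walk formed), then freeze the walked path. A self-contained sketch on a small complete digraph with uniform successor choice; the parser version draws each successor from the per-modifier arc distribution (hence the AliasSampler) instead:

    #include <cstdlib>
    #include <iostream>
    #include <vector>

    // Uniform random successor distinct from u among nodes 0..n-1.
    int step(int u, int n) {
      int v = std::rand() % (n - 1);
      return v >= u ? v + 1 : v;
    }

    int main() {
      const int n = 6;  // nodes 0..5, node 0 is the root
      std::vector<int> next(n, -1);
      std::vector<bool> in_tree(n, false);
      in_tree[0] = true;
      for (int i = 0; i < n; ++i) {
        int u = i;
        while (!in_tree[u]) {  // random walk until the tree is hit;
          next[u] = step(u, n);  // overwriting next[u] erases loops
          u = next[u];
        }
        u = i;
        while (!in_tree[u]) {  // freeze the loop-erased path
          in_tree[u] = true;
          u = next[u];
        }
      }
      for (int v = 1; v < n; ++v)
        std::cout << "parent(" << v << ") = " << next[v] << "\n";
      return 0;
    }

The retry loop in the patch (success/roots) exists because the parser wants exactly one root attachment; Wilson's algorithm itself only guarantees a spanning tree of the walk graph.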
distribution") + ("samples,n",po::value()->default_value(1000), "Number of samples"); + po::options_description clo("Command line options"); + clo.add_options() + ("config,c", po::value(&cfg_file), "Configuration file") + ("help,?", "Print this help message and exit"); + + po::options_description dconfig_options, dcmdline_options; + dconfig_options.add(opts); + dcmdline_options.add(dconfig_options).add(clo); + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + if (cfg_file.size() > 0) { + ReadFile rf(cfg_file); + po::store(po::parse_config_file(*rf.stream(), dconfig_options), *conf); + } + if (conf->count("help")) { + cerr << dcmdline_options << endl; + exit(1); + } +} + +int main(int argc, char** argv) { + po::variables_map conf; + InitCommandLine(argc, argv, &conf); + ArcFactoredForest af(5); + ArcFFRegistry reg; + reg.Register("DistancePenalty", new ArcFFFactory); + vector corpus; + vector > ffs; + ffs.push_back(boost::shared_ptr(new DistancePenalty(""))); + TrainingInstance::ReadTraining(conf["training_data"].as(), &corpus); + vector forests(corpus.size()); + SparseVector empirical; + bool flag = false; + for (int i = 0; i < corpus.size(); ++i) { + TrainingInstance& cur = corpus[i]; + if ((i+1) % 10 == 0) { cerr << '.' << flush; flag = true; } + if ((i+1) % 400 == 0) { cerr << " [" << (i+1) << "]\n"; flag = false; } + for (int fi = 0; fi < ffs.size(); ++fi) { + ArcFeatureFunction& ff = *ffs[fi]; + ff.PrepareForInput(cur.ts); + SparseVector efmap; + for (int j = 0; j < cur.tree.h_m_pairs.size(); ++j) { + efmap.clear(); + ff.EgdeFeatures(cur.ts, cur.tree.h_m_pairs[j].first, + cur.tree.h_m_pairs[j].second, + &efmap); + cur.features += efmap; + } + for (int j = 0; j < cur.tree.roots.size(); ++j) { + efmap.clear(); + ff.EgdeFeatures(cur.ts, -1, cur.tree.roots[j], &efmap); + cur.features += efmap; + } + } + empirical += cur.features; + forests[i].resize(cur.ts.words.size()); + forests[i].ExtractFeatures(cur.ts, ffs); + } + if (flag) cerr << endl; + vector weights(FD::NumFeats(), 0.0); + Weights::InitFromFile(conf["q_weights"].as(), &weights); + MT19937 rng; + SparseVector model_exp; + SparseVector sampled_exp; + int samples = conf["samples"].as(); + for (int i = 0; i < corpus.size(); ++i) { + const int num_words = corpus[i].ts.words.size(); + forests[i].Reweight(weights); + forests[i].EdgeMarginals(); + model_exp.clear(); + for (int h = -1; h < num_words; ++h) { + for (int m = 0; m < num_words; ++m) { + if (h == m) continue; + const ArcFactoredForest::Edge& edge = forests[i](h,m); + const SparseVector& fmap = edge.features; + double prob = edge.edge_prob.as_float(); + model_exp += fmap * prob; + } + } + //cerr << "TRUE EXP: " << model_exp << endl; + + forests[i].Reweight(weights); + TreeSampler ts(forests[i]); + sampled_exp.clear(); + //ostringstream os; os << "Samples_" << samples; + //Timer t(os.str()); + for (int n = 0; n < samples; ++n) { + EdgeSubset tree; + ts.SampleRandomSpanningTree(&tree, &rng); + SparseVector feats; + tree.ExtractFeatures(corpus[i].ts, ffs, &feats); + sampled_exp += feats; + } + sampled_exp /= samples; + cerr << "L2 norm of diff @ " << samples << " samples: " << (model_exp - sampled_exp).l2norm() << endl; + } + return 0; +} + diff --git a/utils/weights.cc b/utils/weights.cc index ac407dfb..39c18474 100644 --- a/utils/weights.cc +++ b/utils/weights.cc @@ -144,8 +144,10 @@ void Weights::ShowLargestFeatures(const vector& w) { vector fnums(w.size()); for (int i = 0; i < w.size(); ++i) fnums[i] = i; + int nf = FD::NumFeats(); + if (nf > 10) nf = 10; 
From 8aff3bd109b82b57c32a0b14a019c99c1ec35705 Mon Sep 17 00:00:00 2001
From: Chris Dyer
Date: Mon, 16 Apr 2012 18:37:04 -0400
Subject: improved alias sampler

---
 utils/alias_sampler.h | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/utils/alias_sampler.h b/utils/alias_sampler.h
index 85da9944..81541f7a 100644
--- a/utils/alias_sampler.h
+++ b/utils/alias_sampler.h
@@ -14,10 +14,13 @@
 // constructor computes the F's and L's given an arbitrary multinomial p in
 // O(n) time and Draw returns samples in O(1) time.
 struct AliasSampler {
-  explicit AliasSampler(const std::vector<double>& p) :
-      cutoffs_(p.size()),
-      aliases_(p.size(), std::numeric_limits<unsigned>::max()) {
+  AliasSampler() {}
+  explicit AliasSampler(const std::vector<double>& p) { Init(p); }
+  void Init(const std::vector<double>& p) {
     const unsigned N = p.size();
+    cutoffs_.resize(p.size());
+    aliases_.clear();
+    aliases_.resize(p.size(), std::numeric_limits<unsigned>::max());
     std::vector<unsigned> s,g;
     for (unsigned i = 0; i < N; ++i) {
       const double cutoff = cutoffs_[i] = N * p[i];
--
cgit v1.2.3
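The point of the default-constructor/Init split in this last commit is visible in rst.cc above: TreeSampler holds a std::vector of samplers sized up front (usucc(af.size() + 1)) and fills each element in place with usucc[m].Init(p), which requires the elements to be default-constructible. A minimal sketch of that pattern, with toy distributions:

    #include <vector>
    #include "alias_sampler.h"

    int main() {
      // Size the container first, then initialize each sampler in place,
      // mirroring TreeSampler::TreeSampler. Init clears and resizes its
      // tables, so re-initializing an element is also safe.
      std::vector<AliasSampler> samplers(4);   // needs the default ctor
      std::vector<double> p(3, 1.0 / 3.0);     // toy normalized multinomial
      for (unsigned m = 0; m < samplers.size(); ++m)
        samplers[m].Init(p);
      return 0;
    }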