From c294227a928672bf108eed81106063a194c872ca Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Thu, 12 Apr 2012 18:57:44 -0400 Subject: unigram pyp lm added --- utils/alias_sampler.h | 47 ++++++++++++++ utils/unigram_pyp_lm.cc | 168 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 215 insertions(+) create mode 100644 utils/alias_sampler.h create mode 100644 utils/unigram_pyp_lm.cc diff --git a/utils/alias_sampler.h b/utils/alias_sampler.h new file mode 100644 index 00000000..85da9944 --- /dev/null +++ b/utils/alias_sampler.h @@ -0,0 +1,47 @@ +#ifndef _ALIAS_SAMPLER_H_ +#define _ALIAS_SAMPLER_H_ + +#include +#include + +// R. A. Kronmal and A. V. Peterson, Jr. (1979) On the alias method for +// generating random variables from a discrete distribution. In The American +// Statistician, Vol. 33, No. 4. Pages 214--218. +// +// Intuition: a multinomial with N outcomes can be rewritten as a uniform +// mixture of N Bernoulli distributions. The ith Bernoulli returns i with +// probability F[i], otherwise it returns an "alias" value L[i]. The +// constructor computes the F's and L's given an arbitrary multinomial p in +// O(n) time and Draw returns samples in O(1) time. 
+struct AliasSampler { /* Alias table over outcomes 0..p.size()-1: the constructor fills cutoffs_ and aliases_ from p, Draw() samples from them. */ + explicit AliasSampler(const std::vector& p) : + cutoffs_(p.size()), + aliases_(p.size(), std::numeric_limits::max()) { /* every alias starts as the max() sentinel */ + const unsigned N = p.size(); + std::vector s,g; /* s collects indices whose scaled mass N*p[i] is below 1, g collects those at or above 1 */ + for (unsigned i = 0; i < N; ++i) { + const double cutoff = cutoffs_[i] = N * p[i]; + if (cutoff >= 1.0) g.push_back(i); else s.push_back(i); + } + while(!s.empty() && !g.empty()) { /* pair the last under-full slot j with the last over-full slot k */ + const unsigned k = g.back(); + const unsigned j = s.back(); + aliases_[j] = k; + cutoffs_[k] -= 1.0 - cutoffs_[j]; /* k gives up the mass that tops slot j up to 1 */ + s.pop_back(); + if (cutoffs_[k] < 1.0) { /* k has become under-full itself, so move it from g to s */ + g.pop_back(); + s.push_back(k); + } + } /* NOTE(review): indices still left in s or g here keep the max() sentinel in aliases_ and an unadjusted cutoff; if rounding leaves such a cutoff below 1, Draw() can return the sentinel -- confirm whether these cutoffs should be forced to 1. */ + } + template + unsigned Draw(Uniform01Generator& u01) const { /* calls u01() twice: once to pick slot n, once to choose between n and aliases_[n]; assumes u01() yields values in [0,1) so that n is a valid index -- TODO confirm */ + const unsigned n = u01() * cutoffs_.size(); + if (u01() > cutoffs_[n]) return aliases_[n]; else return n; + } + std::vector cutoffs_; // F + std::vector aliases_; // L +}; + +#endif diff --git a/utils/unigram_pyp_lm.cc b/utils/unigram_pyp_lm.cc new file mode 100644 index 00000000..510e8839 --- /dev/null +++ b/utils/unigram_pyp_lm.cc @@ -0,0 +1,168 @@ +#include +#include +#include + +#include +#include +#include + +#include "corpus_tools.h" +#include "m.h" +#include "tdict.h" +#include "sampler.h" +#include "ccrp.h" + +// A not very memory-efficient implementation of a 1-gram LM based on PYPs +// as described in Y.-W. Teh. (2006) A Hierarchical Bayesian Language Model +// based on Pitman-Yor Processes. In Proc. ACL. 
+ +using namespace std; +using namespace tr1; +namespace po = boost::program_options; + +boost::shared_ptr prng; + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { /* Parses argv, and the file named by --config when that option is given, into *conf. Prints the option summary to stderr and exits with status 1 when --help is present or --train is missing. */ + po::options_description opts("Configuration options"); + opts.add_options() + ("samples,n",po::value()->default_value(50),"Number of samples") + ("train,i",po::value(),"Training data file") + ("test,T",po::value(),"Test data file") + ("discount_prior_a,a",po::value()->default_value(1.0), "discount ~ Beta(a,b): a=this") + ("discount_prior_b,b",po::value()->default_value(1.0), "discount ~ Beta(a,b): b=this") + ("strength_prior_s,s",po::value()->default_value(1.0), "strength ~ Gamma(s,r): s=this") + ("strength_prior_r,r",po::value()->default_value(1.0), "strength ~ Gamma(s,r): r=this") + ("random_seed,S",po::value(), "Random seed"); + po::options_description clo("Command line options"); + clo.add_options() + ("config", po::value(), "Configuration file") + ("help", "Print this help message and exit"); + po::options_description dconfig_options, dcmdline_options; /* the config file accepts only opts; the command line accepts opts plus clo */ + dconfig_options.add(opts); + dcmdline_options.add(opts).add(clo); + + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + if (conf->count("config")) { + ifstream config((*conf)["config"].as().c_str()); + po::store(po::parse_config_file(config, dconfig_options), *conf); + } + po::notify(*conf); + + if (conf->count("help") || (conf->count("train") == 0)) { + cerr << dcmdline_options << endl; + exit(1); + } +} + +// uniform base distribution (0-gram model) +struct UniformWordModel { /* gives every word probability 1/vocab_size and keeps a count of draws; log_likelihood() is draws * log(p0) */ + explicit UniformWordModel(unsigned vocab_size) : p0(1.0 / vocab_size), draws() {} + void increment() { ++draws; } + void decrement() { --draws; assert(draws >= 0); } + double prob(WordID) const { return p0; } // all words have equal prob + double log_likelihood() const { return draws * log(p0); } + const double p0; + int draws; +}; + +// represents a unigram LM +struct UnigramLM { + UnigramLM(unsigned vs, double da, double db, 
double ss, double sr) : + uniform_vocab(vs), + crp(da, db, ss, sr, 0.8, 1.0) {} /* presumably 0.8 and 1.0 are the initial discount and strength handed to CCRP -- TODO confirm against ccrp.h */ + void increment(WordID w, MT19937* rng) { /* adds w to the CRP; when crp.increment() returns true, one draw is also charged to the uniform base model */ + const double backoff = uniform_vocab.prob(w); + if (crp.increment(w, backoff, rng)) + uniform_vocab.increment(); + } + void decrement(WordID w, MT19937* rng) { /* removes w from the CRP; when crp.decrement() returns true, one draw is also taken back from the uniform base model */ + if (crp.decrement(w, rng)) + uniform_vocab.decrement(); + } + double prob(WordID w) const { /* CRP probability of w, with the uniform model as backoff */ + const double backoff = uniform_vocab.prob(w); + return crp.prob(w, backoff); + } + + double log_likelihood() const { /* base-model log likelihood plus crp.log_crp_prob() */ + double llh = uniform_vocab.log_likelihood(); + llh += crp.log_crp_prob(); + return llh; + } + + void resample_hyperparameters(MT19937* rng) { + crp.resample_hyperparameters(rng); + } + + double discount_a, discount_b, strength_s, strength_r; + double d, strength; /* NOTE(review): these six double members are neither initialized nor read anywhere in the code shown here */ + UniformWordModel uniform_vocab; + CCRP crp; +}; + +int main(int argc, char** argv) { /* Trains by making the configured number of passes over the corpus, removing (from the second pass on) and re-adding every token, then prints per-word log2 probabilities, cross-entropy and perplexity for the test set; the training corpus doubles as the test set when --test is absent. */ + po::variables_map conf; + + InitCommandLine(argc, argv, &conf); + const unsigned samples = conf["samples"].as(); + if (conf.count("random_seed")) + prng.reset(new MT19937(conf["random_seed"].as())); + else + prng.reset(new MT19937); + MT19937& rng = *prng; + vector > corpuse; + set vocabe; + const WordID kEOS = TD::Convert(""); + cerr << "Reading corpus...\n"; + CorpusTools::ReadFromFile(conf["train"].as(), &corpuse, &vocabe); + cerr << "E-corpus size: " << corpuse.size() << " sentences\t (" << vocabe.size() << " word types)\n"; + vector > test; + if (conf.count("test")) + CorpusTools::ReadFromFile(conf["test"].as(), &test); + else + test = corpuse; + UnigramLM lm(vocabe.size(), + conf["discount_prior_a"].as(), + conf["discount_prior_b"].as(), + conf["strength_prior_s"].as(), + conf["strength_prior_r"].as()); + for (int SS=0; SS < samples; ++SS) { + for (int ci = 0; ci < corpuse.size(); ++ci) { + const vector& s = corpuse[ci]; + for (int i = 0; i <= s.size(); ++i) { /* i == s.size() stands for the end-of-sentence token */ + WordID w = (i < s.size() ? 
s[i] : kEOS); + if (SS > 0) lm.decrement(w, &rng); + lm.increment(w, &rng); + } + if (SS > 0) lm.decrement(kEOS, &rng); + lm.increment(kEOS, &rng); /* NOTE(review): the inner loop above already handles kEOS at i == s.size(), so this adds a second end-of-sentence event per training sentence, while the test loop below scores only one -- confirm this is intended */ + } + cerr << "LLH=" << lm.log_likelihood() << endl; + //if (SS % 10 == 9) lm.resample_hyperparameters(&rng); + } + double llh = 0; + unsigned cnt = 0; + unsigned oovs = 0; + for (int ci = 0; ci < test.size(); ++ci) { + const vector& s = test[ci]; + for (int i = 0; i <= s.size(); ++i) { + WordID w = (i < s.size() ? s[i] : kEOS); + double lp = log(lm.prob(w)) / log(2); + if (i < s.size() && vocabe.count(w) == 0) { + cerr << "**OOV "; + ++oovs; + lp = 0; /* OOV tokens add 0 to llh but are still counted in cnt */ + } + cerr << "p(" << TD::Convert(w) << ") = " << lp << endl; + llh -= lp; + cnt++; + } + } + cerr << " Log_10 prob: " << (-llh * log(2) / log(10)) << endl; + cerr << " Count: " << cnt << endl; + cerr << " OOVs: " << oovs << endl; + cerr << "Cross-entropy: " << (llh / cnt) << endl; + cerr << " Perplexity: " << pow(2, llh / cnt) << endl; + return 0; +} + -- cgit v1.2.3 From c22e9248a1fa24b0255a55d21afb94a9ed3ddc22 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Sat, 14 Apr 2012 01:52:14 -0400 Subject: matrix tree theorem stuff --- rst_parser/Makefile.am | 2 +- rst_parser/arc_factored.cc | 98 ++++++++++++++++++++++++++++++++++-- rst_parser/arc_factored.h | 27 ++++++++-- rst_parser/arc_factored_marginals.cc | 52 +++++++++++++++++++ rst_parser/rst_test.cc | 9 +++- 5 files changed, 177 insertions(+), 11 deletions(-) create mode 100644 rst_parser/arc_factored_marginals.cc diff --git a/rst_parser/Makefile.am b/rst_parser/Makefile.am index e97ab5c5..b61a20dd 100644 --- a/rst_parser/Makefile.am +++ b/rst_parser/Makefile.am @@ -8,7 +8,7 @@ TESTS = rst_test noinst_LIBRARIES = librst.a -librst_a_SOURCES = arc_factored.cc rst.cc +librst_a_SOURCES = arc_factored.cc arc_factored_marginals.cc rst.cc mst_train_SOURCES = mst_train.cc mst_train_LDADD = librst.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a 
../klm/util/libklm_util.a -lz diff --git a/rst_parser/arc_factored.cc b/rst_parser/arc_factored.cc index 1e75600b..b2c2c427 100644 --- a/rst_parser/arc_factored.cc +++ b/rst_parser/arc_factored.cc @@ -1,31 +1,121 @@ #include "arc_factored.h" #include +#include #include +#include using namespace std; +using namespace std::tr1; using namespace boost; +void ArcFactoredForest::PickBestParentForEachWord(EdgeSubset* st) const { + for (int m = 1; m <= num_words_; ++m) { + int best_head = -1; + prob_t best_score; + for (int h = 0; h <= num_words_; ++h) { + const Edge& edge = (*this)(h,m); + if (best_head < 0 || edge.edge_prob > best_score) { + best_score = edge.edge_prob; + best_head = h; + } + } + assert(best_head >= 0); + if (best_head) + st->h_m_pairs.push_back(make_pair(best_head, m)); + else + st->roots.push_back(m); + } +} + +struct WeightedEdge { + WeightedEdge() : h(), m(), weight() {} + WeightedEdge(short hh, short mm, float w) : h(hh), m(mm), weight(w) {} + short h, m; + float weight; + inline bool operator==(const WeightedEdge& o) const { + return h == o.h && m == o.m && weight == o.weight; + } + inline bool operator!=(const WeightedEdge& o) const { + return h != o.h || m != o.m || weight != o.weight; + } +}; +inline bool operator<(const WeightedEdge& l, const WeightedEdge& o) { return l.weight < o.weight; } +inline size_t hash_value(const WeightedEdge& e) { return reinterpret_cast(e); } + + +struct PriorityQueue { + void push(const WeightedEdge& e) {} + const WeightedEdge& top() const { + static WeightedEdge w(1,2,3); + return w; + } + void pop() {} + void increment_all(float p) {} +}; + // based on Trajan 1977 -void ArcFactoredForest::MaximumSpanningTree(SpanningTree* st) const { +void ArcFactoredForest::MaximumEdgeSubset(EdgeSubset* st) const { typedef disjoint_sets_with_storage DisjointSet; DisjointSet strongly(num_words_ + 1); DisjointSet weakly(num_words_ + 1); - set roots, h, rset; - vector > enter(num_words_ + 1); + set roots, rset; + unordered_set > h; 
+ vector qs(num_words_ + 1); + vector enter(num_words_ + 1); + vector mins(num_words_ + 1); + const WeightedEdge kDUMMY(0,0,0.0f); for (unsigned i = 0; i <= num_words_; ++i) { + if (i > 0) { + // I(i) incidence on i -- all incoming edges + for (unsigned j = 0; j <= num_words_; ++j) { + qs[i].push(WeightedEdge(j, i, Weight(j,i))); + } + } strongly.make_set(i); weakly.make_set(i); roots.insert(i); + enter[i] = kDUMMY; + mins[i] = i; } while(!roots.empty()) { set::iterator it = roots.begin(); const unsigned k = *it; roots.erase(it); cerr << "k=" << k << endl; - pair ij; // TODO = Max(k); + WeightedEdge ij = qs[k].top(); // MAX(k) + qs[k].pop(); + if (ij.weight <= 0) { + rset.insert(k); + } else { + if (strongly.find_set(ij.h) == k) { + roots.insert(k); + } else { + h.insert(ij); + if (weakly.find_set(ij.h) != weakly.find_set(ij.m)) { + weakly.union_set(ij.h, ij.m); + enter[k] = ij; + } else { + unsigned vertex = 0; + float val = 99999999999; + WeightedEdge xy = ij; + while(xy != kDUMMY) { + if (xy.weight < val) { + val = xy.weight; + vertex = strongly.find_set(xy.m); + } + xy = enter[strongly.find_set(xy.h)]; + } + qs[k].increment_all(val - ij.weight); + mins[k] = mins[vertex]; + xy = enter[strongly.find_set(ij.h)]; + while (xy != kDUMMY) { + } + } + } + } } } diff --git a/rst_parser/arc_factored.h b/rst_parser/arc_factored.h index e99be482..3003a86e 100644 --- a/rst_parser/arc_factored.h +++ b/rst_parser/arc_factored.h @@ -10,11 +10,11 @@ #include "prob.h" #include "weights.h" -struct SpanningTree { - SpanningTree() : roots(1, -1) {} +struct EdgeSubset { + EdgeSubset() {} std::vector roots; // unless multiroot trees are supported, this // will have a single member - std::vector > h_m_pairs; + std::vector > h_m_pairs; // h,m start at *1* }; class ArcFactoredForest { @@ -35,7 +35,14 @@ class ArcFactoredForest { // compute the maximum spanning tree based on the current weighting // using the O(n^2) CLE algorithm - void MaximumSpanningTree(SpanningTree* st) const; + void 
MaximumEdgeSubset(EdgeSubset* st) const; + + // Reweight edges so that edge_prob is the edge's marginals + // optionally returns log partition + void EdgeMarginals(double* p_log_z = NULL); + + // This may not return a tree + void PickBestParentForEachWord(EdgeSubset* st) const; struct Edge { Edge() : h(), m(), features(), edge_prob(prob_t::Zero()) {} @@ -61,6 +68,10 @@ class ArcFactoredForest { return h ? edges_(h - 1, m - 1) : root_edges_[m - 1]; } + float Weight(short h, short m) const { + return log((*this)(h,m).edge_prob); + } + template void Reweight(const V& weights) { for (int m = 0; m < num_words_; ++m) { @@ -85,4 +96,12 @@ inline std::ostream& operator<<(std::ostream& os, const ArcFactoredForest::Edge& return os << "(" << edge.h << " < " << edge.m << ")"; } +inline std::ostream& operator<<(std::ostream& os, const EdgeSubset& ss) { + for (unsigned i = 0; i < ss.roots.size(); ++i) + os << "ROOT < " << ss.roots[i] << std::endl; + for (unsigned i = 0; i < ss.h_m_pairs.size(); ++i) + os << ss.h_m_pairs[i].first << " < " << ss.h_m_pairs[i].second << std::endl; + return os; +} + #endif diff --git a/rst_parser/arc_factored_marginals.cc b/rst_parser/arc_factored_marginals.cc new file mode 100644 index 00000000..9851b59a --- /dev/null +++ b/rst_parser/arc_factored_marginals.cc @@ -0,0 +1,52 @@ +#include "arc_factored.h" + +#include + +#include "config.h" + +using namespace std; + +#if HAVE_EIGEN + +#include +typedef Eigen::Matrix ArcMatrix; +typedef Eigen::Matrix RootVector; + +void ArcFactoredForest::EdgeMarginals(double *plog_z) { + ArcMatrix A(num_words_,num_words_); + RootVector r(num_words_); + for (int h = 0; h < num_words_; ++h) { + for (int m = 0; m < num_words_; ++m) { + if (h != m) + A(h,m) = edges_(h,m).edge_prob.as_float(); + else + A(h,m) = 0; + } + r(h) = root_edges_[h].edge_prob.as_float(); + } + + ArcMatrix L = -A; + L.diagonal() = A.colwise().sum(); + L.row(0) = r; + ArcMatrix Linv = L.inverse(); + if (plog_z) *plog_z = log(Linv.determinant()); + 
RootVector rootMarginals = r.cwiseProduct(Linv.col(0)); + for (int h = 0; h < num_words_; ++h) { + for (int m = 0; m < num_words_; ++m) { + edges_(h,m).edge_prob = prob_t((m == 0 ? 0.0 : 1.0) * A(h,m) * Linv(m,m) - + (h == 0 ? 0.0 : 1.0) * A(h,m) * Linv(m,h)); + } + root_edges_[h].edge_prob = prob_t(rootMarginals(h)); + } + // cerr << "ROOT MARGINALS: " << rootMarginals.transpose() << endl; +} + +#else + +void ArcFactoredForest::EdgeMarginals(double*) { + cerr << "EdgeMarginals() requires --with-eigen!\n"; + abort(); +} + +#endif + diff --git a/rst_parser/rst_test.cc b/rst_parser/rst_test.cc index e8fe706e..8995515f 100644 --- a/rst_parser/rst_test.cc +++ b/rst_parser/rst_test.cc @@ -26,8 +26,13 @@ int main(int argc, char** argv) { af(0,2).edge_prob.logeq(10); af(0,1).edge_prob.logeq(9); af(0,3).edge_prob.logeq(9); - SpanningTree tree; - af.MaximumSpanningTree(&tree); + EdgeSubset tree; +// af.MaximumEdgeSubset(&tree); + double lz; + af.EdgeMarginals(&lz); + cerr << "Z = " << lz << endl; + af.PickBestParentForEachWord(&tree); + cerr << tree << endl; return 0; } -- cgit v1.2.3 From 8d51973c21337a1633e559cd09a649265600cc4c Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Sun, 15 Apr 2012 17:28:08 -0400 Subject: crf training of arc-factored dep parser --- rst_parser/Makefile.am | 6 +- rst_parser/arc_factored.cc | 29 +++-- rst_parser/arc_factored.h | 53 ++++++---- rst_parser/arc_factored_marginals.cc | 10 +- rst_parser/arc_ff.cc | 64 +++++++++++ rst_parser/arc_ff.h | 43 ++++++++ rst_parser/mst_train.cc | 200 ++++++++++++++++++++++++++++++++++- rst_parser/rst_test.cc | 18 ++-- 8 files changed, 379 insertions(+), 44 deletions(-) create mode 100644 rst_parser/arc_ff.cc create mode 100644 rst_parser/arc_ff.h diff --git a/rst_parser/Makefile.am b/rst_parser/Makefile.am index b61a20dd..2b64b43a 100644 --- a/rst_parser/Makefile.am +++ b/rst_parser/Makefile.am @@ -8,12 +8,12 @@ TESTS = rst_test noinst_LIBRARIES = librst.a -librst_a_SOURCES = arc_factored.cc 
arc_factored_marginals.cc rst.cc +librst_a_SOURCES = arc_factored.cc arc_factored_marginals.cc rst.cc arc_ff.cc mst_train_SOURCES = mst_train.cc -mst_train_LDADD = librst.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz +mst_train_LDADD = librst.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a ../training/optimize.o -lz rst_test_SOURCES = rst_test.cc rst_test_LDADD = librst.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz -AM_CPPFLAGS = -W -Wall -Wno-sign-compare $(GTEST_CPPFLAGS) -I$(top_srcdir)/decoder -I$(top_srcdir)/utils -I$(top_srcdir)/mteval -I../klm +AM_CPPFLAGS = -W -Wall -Wno-sign-compare $(GTEST_CPPFLAGS) -I$(top_srcdir)/decoder -I$(top_srcdir)/training -I$(top_srcdir)/utils -I$(top_srcdir)/mteval -I../klm diff --git a/rst_parser/arc_factored.cc b/rst_parser/arc_factored.cc index b2c2c427..44e769b8 100644 --- a/rst_parser/arc_factored.cc +++ b/rst_parser/arc_factored.cc @@ -6,23 +6,38 @@ #include #include +#include "arc_ff.h" + using namespace std; using namespace std::tr1; using namespace boost; +void ArcFactoredForest::ExtractFeatures(const TaggedSentence& sentence, + const std::vector >& ffs) { + for (int i = 0; i < ffs.size(); ++i) { + const ArcFeatureFunction& ff = *ffs[i]; + for (int m = 0; m < num_words_; ++m) { + for (int h = 0; h < num_words_; ++h) { + ff.EgdeFeatures(sentence, h, m, &edges_(h,m).features); + } + ff.EgdeFeatures(sentence, -1, m, &root_edges_[m].features); + } + } +} + void ArcFactoredForest::PickBestParentForEachWord(EdgeSubset* st) const { - for (int m = 1; m <= num_words_; ++m) { - int best_head = -1; + for (int m = 0; m < num_words_; ++m) { + int best_head = -2; prob_t best_score; - for (int h = 0; h <= num_words_; ++h) { + for (int h = -1; 
h < num_words_; ++h) { const Edge& edge = (*this)(h,m); - if (best_head < 0 || edge.edge_prob > best_score) { + if (best_head < -1 || edge.edge_prob > best_score) { best_score = edge.edge_prob; best_head = h; } } - assert(best_head >= 0); - if (best_head) + assert(best_head >= -1); + if (best_head >= 0) st->h_m_pairs.push_back(make_pair(best_head, m)); else st->roots.push_back(m); @@ -56,7 +71,7 @@ struct PriorityQueue { }; // based on Trajan 1977 -void ArcFactoredForest::MaximumEdgeSubset(EdgeSubset* st) const { +void ArcFactoredForest::MaximumSpanningTree(EdgeSubset* st) const { typedef disjoint_sets_with_storage DisjointSet; DisjointSet strongly(num_words_ + 1); diff --git a/rst_parser/arc_factored.h b/rst_parser/arc_factored.h index 3003a86e..a95f8230 100644 --- a/rst_parser/arc_factored.h +++ b/rst_parser/arc_factored.h @@ -5,37 +5,52 @@ #include #include #include +#include #include "array2d.h" #include "sparse_vector.h" #include "prob.h" #include "weights.h" +#include "wordid.h" + +struct TaggedSentence { + std::vector words; + std::vector pos; +}; struct EdgeSubset { EdgeSubset() {} std::vector roots; // unless multiroot trees are supported, this // will have a single member - std::vector > h_m_pairs; // h,m start at *1* + std::vector > h_m_pairs; // h,m start at 0 }; +struct ArcFeatureFunction; class ArcFactoredForest { public: - explicit ArcFactoredForest(short num_words) : - num_words_(num_words), - root_edges_(num_words), - edges_(num_words, num_words) { + ArcFactoredForest() : num_words_() {} + explicit ArcFactoredForest(short num_words) { + resize(num_words); + } + + void resize(unsigned num_words) { + num_words_ = num_words; + root_edges_.clear(); + edges_.clear(); + root_edges_.resize(num_words); + edges_.resize(num_words, num_words); for (int h = 0; h < num_words; ++h) { for (int m = 0; m < num_words; ++m) { - edges_(h, m).h = h + 1; - edges_(h, m).m = m + 1; + edges_(h, m).h = h; + edges_(h, m).m = m; } - root_edges_[h].h = 0; - root_edges_[h].m = 
h + 1; + root_edges_[h].h = -1; + root_edges_[h].m = h; } } // compute the maximum spanning tree based on the current weighting // using the O(n^2) CLE algorithm - void MaximumEdgeSubset(EdgeSubset* st) const; + void MaximumSpanningTree(EdgeSubset* st) const; // Reweight edges so that edge_prob is the edge's marginals // optionally returns log partition @@ -52,20 +67,16 @@ class ArcFactoredForest { prob_t edge_prob; }; + // set eges_[*].features + void ExtractFeatures(const TaggedSentence& sentence, + const std::vector >& ffs); + const Edge& operator()(short h, short m) const { - assert(m > 0); - assert(m <= num_words_); - assert(h >= 0); - assert(h <= num_words_); - return h ? edges_(h - 1, m - 1) : root_edges_[m - 1]; + return h >= 0 ? edges_(h, m) : root_edges_[m]; } Edge& operator()(short h, short m) { - assert(m > 0); - assert(m <= num_words_); - assert(h >= 0); - assert(h <= num_words_); - return h ? edges_(h - 1, m - 1) : root_edges_[m - 1]; + return h >= 0 ? edges_(h, m) : root_edges_[m]; } float Weight(short h, short m) const { @@ -87,7 +98,7 @@ class ArcFactoredForest { } private: - unsigned num_words_; + int num_words_; std::vector root_edges_; Array2D edges_; }; diff --git a/rst_parser/arc_factored_marginals.cc b/rst_parser/arc_factored_marginals.cc index 9851b59a..16360b0d 100644 --- a/rst_parser/arc_factored_marginals.cc +++ b/rst_parser/arc_factored_marginals.cc @@ -31,14 +31,18 @@ void ArcFactoredForest::EdgeMarginals(double *plog_z) { ArcMatrix Linv = L.inverse(); if (plog_z) *plog_z = log(Linv.determinant()); RootVector rootMarginals = r.cwiseProduct(Linv.col(0)); +// ArcMatrix T = Linv; for (int h = 0; h < num_words_; ++h) { for (int m = 0; m < num_words_; ++m) { - edges_(h,m).edge_prob = prob_t((m == 0 ? 0.0 : 1.0) * A(h,m) * Linv(m,m) - - (h == 0 ? 0.0 : 1.0) * A(h,m) * Linv(m,h)); + const double marginal = (m == 0 ? 0.0 : 1.0) * A(h,m) * Linv(m,m) - + (h == 0 ? 
0.0 : 1.0) * A(h,m) * Linv(m,h); + edges_(h,m).edge_prob = prob_t(marginal); +// T(h,m) = marginal; } root_edges_[h].edge_prob = prob_t(rootMarginals(h)); } - // cerr << "ROOT MARGINALS: " << rootMarginals.transpose() << endl; +// cerr << "ROOT MARGINALS: " << rootMarginals.transpose() << endl; +// cerr << "M:\n" << T << endl; } #else diff --git a/rst_parser/arc_ff.cc b/rst_parser/arc_ff.cc new file mode 100644 index 00000000..f9effbda --- /dev/null +++ b/rst_parser/arc_ff.cc @@ -0,0 +1,64 @@ +#include "arc_ff.h" + +#include "tdict.h" +#include "fdict.h" +#include "sentence_metadata.h" + +using namespace std; + +ArcFeatureFunction::~ArcFeatureFunction() {} + +void ArcFeatureFunction::PrepareForInput(const TaggedSentence&) {} + +DistancePenalty::DistancePenalty(const string&) : fidw_(FD::Convert("Distance")), fidr_(FD::Convert("RootDistance")) {} + +void DistancePenalty::EdgeFeaturesImpl(const TaggedSentence& sent, + short h, + short m, + SparseVector* features) const { + const bool dir = m < h; + const bool is_root = (h == -1); + int v = m - h; + if (v < 0) { + v= -1 - int(log(-v) / log(2)); + } else { + v= int(log(v) / log(2)); + } + static map lenmap; + int& lenfid = lenmap[v]; + if (!lenfid) { + ostringstream os; + if (v < 0) os << "LenL" << -v; else os << "LenR" << v; + lenfid = FD::Convert(os.str()); + } + features->set_value(lenfid, 1.0); + const string& lenstr = FD::Convert(lenfid); + if (!is_root) { + static int modl = FD::Convert("ModLeft"); + static int modr = FD::Convert("ModRight"); + if (dir) features->set_value(modl, 1); + else features->set_value(modr, 1); + } + if (is_root) { + ostringstream os; + os << "ROOT:" << TD::Convert(sent.pos[m]); + features->set_value(FD::Convert(os.str()), 1.0); + os << "_" << lenstr; + features->set_value(FD::Convert(os.str()), 1.0); + } else { // not root + ostringstream os; + os << "HM:" << TD::Convert(sent.pos[h]) << '_' << TD::Convert(sent.pos[m]); + features->set_value(FD::Convert(os.str()), 1.0); + os << '_' << 
dir; + features->set_value(FD::Convert(os.str()), 1.0); + os << '_' << lenstr; + features->set_value(FD::Convert(os.str()), 1.0); + ostringstream os2; + os2 << "LexHM:" << TD::Convert(sent.words[h]) << '_' << TD::Convert(sent.words[m]); + features->set_value(FD::Convert(os2.str()), 1.0); + os2 << '_' << dir; + features->set_value(FD::Convert(os2.str()), 1.0); + os2 << '_' << lenstr; + features->set_value(FD::Convert(os2.str()), 1.0); + } +} diff --git a/rst_parser/arc_ff.h b/rst_parser/arc_ff.h new file mode 100644 index 00000000..bc51fef4 --- /dev/null +++ b/rst_parser/arc_ff.h @@ -0,0 +1,43 @@ +#ifndef _ARC_FF_H_ +#define _ARC_FF_H_ + +#include +#include "sparse_vector.h" +#include "weights.h" +#include "arc_factored.h" + +struct TaggedSentence; +class ArcFeatureFunction { + public: + virtual ~ArcFeatureFunction(); + + // called once, per input, before any calls to EdgeFeatures + // used to initialize sentence-specific data structures + virtual void PrepareForInput(const TaggedSentence& sentence); + + inline void EgdeFeatures(const TaggedSentence& sentence, + short h, + short m, + SparseVector* features) const { + EdgeFeaturesImpl(sentence, h, m, features); + } + protected: + virtual void EdgeFeaturesImpl(const TaggedSentence& sentence, + short h, + short m, + SparseVector* features) const = 0; +}; + +class DistancePenalty : public ArcFeatureFunction { + public: + DistancePenalty(const std::string& param); + protected: + virtual void EdgeFeaturesImpl(const TaggedSentence& sentence, + short h, + short m, + SparseVector* features) const; + private: + const int fidw_, fidr_; +}; + +#endif diff --git a/rst_parser/mst_train.cc b/rst_parser/mst_train.cc index 7b5af4c1..def23edb 100644 --- a/rst_parser/mst_train.cc +++ b/rst_parser/mst_train.cc @@ -1,12 +1,210 @@ #include "arc_factored.h" +#include #include +#include +#include + +#include "arc_ff.h" +#include "arc_ff_factory.h" +#include "stringlib.h" +#include "filelib.h" +#include "tdict.h" +#include "picojson.h" 
+#include "optimize.h" +#include "weights.h" using namespace std; +namespace po = boost::program_options; + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { + po::options_description opts("Configuration options"); + string cfg_file; + opts.add_options() + ("training_data,t",po::value()->default_value("-"), "File containing training data (jsent format)") + ("feature_function,F",po::value >()->composing(), "feature function") + ("regularization_strength,C",po::value()->default_value(1.0), "Regularization strength") + ("correction_buffers,m", po::value()->default_value(10), "LBFGS correction buffers"); + po::options_description clo("Command line options"); + clo.add_options() + ("config,c", po::value(&cfg_file), "Configuration file") + ("help,?", "Print this help message and exit"); + + po::options_description dconfig_options, dcmdline_options; + dconfig_options.add(opts); + dcmdline_options.add(dconfig_options).add(clo); + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + if (cfg_file.size() > 0) { + ReadFile rf(cfg_file); + po::store(po::parse_config_file(*rf.stream(), dconfig_options), *conf); + } + if (conf->count("help")) { + cerr << dcmdline_options << endl; + exit(1); + } +} + +struct TrainingInstance { + TaggedSentence ts; + EdgeSubset tree; + SparseVector features; +}; + +void ReadTraining(const string& fname, vector* corpus, int rank = 0, int size = 1) { + ReadFile rf(fname); + istream& in = *rf.stream(); + string line; + string err; + int lc = 0; + bool flag = false; + while(getline(in, line)) { + ++lc; + if ((lc-1) % size != rank) continue; + if (rank == 0 && lc % 10 == 0) { cerr << '.' 
<< flush; flag = true; } + if (rank == 0 && lc % 400 == 0) { cerr << " [" << lc << "]\n"; flag = false; } + size_t pos = line.rfind('\t'); + assert(pos != string::npos); + picojson::value obj; + picojson::parse(obj, line.begin() + pos, line.end(), &err); + if (err.size() > 0) { cerr << "JSON parse error in " << lc << ": " << err << endl; abort(); } + corpus->push_back(TrainingInstance()); + TrainingInstance& cur = corpus->back(); + TaggedSentence& ts = cur.ts; + EdgeSubset& tree = cur.tree; + assert(obj.is()); + const picojson::object& d = obj.get(); + const picojson::array& ta = d.find("tokens")->second.get(); + for (unsigned i = 0; i < ta.size(); ++i) { + ts.words.push_back(TD::Convert(ta[i].get()[0].get())); + ts.pos.push_back(TD::Convert(ta[i].get()[1].get())); + } + const picojson::array& da = d.find("deps")->second.get(); + for (unsigned i = 0; i < da.size(); ++i) { + const picojson::array& thm = da[i].get(); + // get dep type here + short h = thm[2].get(); + short m = thm[1].get(); + if (h < 0) + tree.roots.push_back(m); + else + tree.h_m_pairs.push_back(make_pair(h,m)); + } + //cerr << TD::GetString(ts.words) << endl << TD::GetString(ts.pos) << endl << tree << endl; + } + if (flag) cerr << "\nRead " << lc << " training instances\n"; +} + +void AddFeatures(double prob, const SparseVector& fmap, vector* g) { + for (SparseVector::const_iterator it = fmap.begin(); it != fmap.end(); ++it) + (*g)[it->first] += it->second * prob; +} + +double ApplyRegularizationTerms(const double C, + const vector& weights, + vector* g) { + assert(weights.size() == g->size()); + double reg = 0; + for (size_t i = 0; i < weights.size(); ++i) { +// const double prev_w_i = (i < prev_weights.size() ? 
prev_weights[i] : 0.0); + const double& w_i = weights[i]; + double& g_i = (*g)[i]; + reg += C * w_i * w_i; + g_i += 2 * C * w_i; + +// reg += T * (w_i - prev_w_i) * (w_i - prev_w_i); +// g_i += 2 * T * (w_i - prev_w_i); + } + return reg; +} int main(int argc, char** argv) { + int rank = 0; + int size = 1; + po::variables_map conf; + InitCommandLine(argc, argv, &conf); ArcFactoredForest af(5); - cerr << af(0,3) << endl; + ArcFFRegistry reg; + reg.Register("DistancePenalty", new ArcFFFactory); + vector corpus; + vector > ffs; + ffs.push_back(boost::shared_ptr(new DistancePenalty(""))); + ReadTraining(conf["training_data"].as(), &corpus, rank, size); + vector forests(corpus.size()); + SparseVector empirical; + bool flag = false; + for (int i = 0; i < corpus.size(); ++i) { + TrainingInstance& cur = corpus[i]; + if (rank == 0 && (i+1) % 10 == 0) { cerr << '.' << flush; flag = true; } + if (rank == 0 && (i+1) % 400 == 0) { cerr << " [" << (i+1) << "]\n"; flag = false; } + for (int fi = 0; fi < ffs.size(); ++fi) { + ArcFeatureFunction& ff = *ffs[fi]; + ff.PrepareForInput(cur.ts); + SparseVector efmap; + for (int j = 0; j < cur.tree.h_m_pairs.size(); ++j) { + efmap.clear(); + ff.EgdeFeatures(cur.ts, cur.tree.h_m_pairs[j].first, + cur.tree.h_m_pairs[j].second, + &efmap); + cur.features += efmap; + } + for (int j = 0; j < cur.tree.roots.size(); ++j) { + efmap.clear(); + ff.EgdeFeatures(cur.ts, -1, cur.tree.roots[j], &efmap); + cur.features += efmap; + } + } + empirical += cur.features; + forests[i].resize(cur.ts.words.size()); + forests[i].ExtractFeatures(cur.ts, ffs); + } + if (flag) cerr << endl; + //cerr << "EMP: " << empirical << endl; //DE + vector weights(FD::NumFeats(), 0.0); + vector g(FD::NumFeats(), 0.0); + cerr << "features initialized\noptimizing...\n"; + boost::shared_ptr o; + o.reset(new LBFGSOptimizer(g.size(), conf["correction_buffers"].as())); + int iterations = 1000; + for (int iter = 0; iter < iterations; ++iter) { + cerr << "ITERATION " << iter << " " << 
flush; + fill(g.begin(), g.end(), 0.0); + for (SparseVector::const_iterator it = empirical.begin(); it != empirical.end(); ++it) + g[it->first] = -it->second; + double obj = -empirical.dot(weights); + // SparseVector mfm; //DE + for (int i = 0; i < corpus.size(); ++i) { + forests[i].Reweight(weights); + double logz; + forests[i].EdgeMarginals(&logz); + //cerr << " O = " << (-corpus[i].features.dot(weights)) << " D=" << -logz << " OO= " << (-corpus[i].features.dot(weights) - logz) << endl; + obj -= logz; + int num_words = corpus[i].ts.words.size(); + for (int h = -1; h < num_words; ++h) { + for (int m = 0; m < num_words; ++m) { + if (h == m) continue; + const ArcFactoredForest::Edge& edge = forests[i](h,m); + const SparseVector& fmap = edge.features; + double prob = edge.edge_prob.as_float(); + if (prob < -0.000001) { cerr << "Prob < 0: " << prob << endl; prob = 0; } + if (prob > 1.000001) { cerr << "Prob > 1: " << prob << endl; prob = 1; } + AddFeatures(prob, fmap, &g); + //mfm += fmap * prob; // DE + } + } + } + //cerr << endl << "E: " << empirical << endl; // DE + //cerr << "M: " << mfm << endl; // DE + double r = ApplyRegularizationTerms(conf["regularization_strength"].as(), weights, &g); + double gnorm = 0; + for (int i = 0; i < g.size(); ++i) + gnorm += g[i]*g[i]; + cerr << "OBJ=" << (obj+r) << "\t[F=" << obj << " R=" << r << "]\tGnorm=" << sqrt(gnorm) << endl; + obj += r; + assert(obj >= 0); + o->Optimize(obj, g, &weights); + Weights::ShowLargestFeatures(weights); + if (o->HasConverged()) { cerr << "CONVERGED\n"; break; } + } return 0; } diff --git a/rst_parser/rst_test.cc b/rst_parser/rst_test.cc index 8995515f..7e6fb2c1 100644 --- a/rst_parser/rst_test.cc +++ b/rst_parser/rst_test.cc @@ -17,15 +17,15 @@ int main(int argc, char** argv) { // (0, 1) 9 // (0, 3) 9 ArcFactoredForest af(3); - af(1,2).edge_prob.logeq(20); - af(1,3).edge_prob.logeq(3); - af(2,1).edge_prob.logeq(20); - af(2,3).edge_prob.logeq(30); - af(3,2).edge_prob.logeq(0); - 
af(3,1).edge_prob.logeq(11); - af(0,2).edge_prob.logeq(10); - af(0,1).edge_prob.logeq(9); - af(0,3).edge_prob.logeq(9); + af(0,1).edge_prob.logeq(20); + af(0,2).edge_prob.logeq(3); + af(1,0).edge_prob.logeq(20); + af(1,2).edge_prob.logeq(30); + af(2,1).edge_prob.logeq(0); + af(2,0).edge_prob.logeq(11); + af(-1,1).edge_prob.logeq(10); + af(-1,0).edge_prob.logeq(9); + af(-1,2).edge_prob.logeq(9); EdgeSubset tree; // af.MaximumEdgeSubset(&tree); double lz; -- cgit v1.2.3 From ad068c70c8d608dd0dca7e9c7a94b51b2d6a28ba Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Sun, 15 Apr 2012 17:29:31 -0400 Subject: turn on dep parser build --- Makefile.am | 3 +-- configure.ac | 3 ++- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Makefile.am b/Makefile.am index c0fcb1f6..2ecb60df 100644 --- a/Makefile.am +++ b/Makefile.am @@ -1,11 +1,10 @@ # warning - the subdirectories in the following list should # be kept in topologically sorted order. Also, DO NOT introduce # cyclic dependencies between these directories! -SUBDIRS = utils mteval klm/util klm/lm decoder phrasinator training mira dpmert pro-train extools gi/pf gi/markov_al +SUBDIRS = utils mteval klm/util klm/lm decoder phrasinator training mira dpmert pro-train extools gi/pf gi/markov_al rst_parser #gi/pyp-topics/src gi/clda/src gi/posterior-regularisation/prjava AUTOMAKE_OPTIONS = foreign ACLOCAL_AMFLAGS = -I m4 AM_CPPFLAGS = -D_GLIBCXX_PARALLEL - diff --git a/configure.ac b/configure.ac index 869f7cf9..f03f97f0 100644 --- a/configure.ac +++ b/configure.ac @@ -6,6 +6,7 @@ AC_PROG_LEX case $LEX in :) AC_MSG_ERROR([No lex (Flex, lex, etc.) 
program found]);; esac +# CPPFLAGS="$CPPFLAGS -std=c++0x" AC_PROG_CC AC_PROG_CXX AC_LANG_CPLUSPLUS @@ -129,4 +130,4 @@ then AM_CONDITIONAL([GLC], true) fi -AC_OUTPUT(Makefile utils/Makefile mteval/Makefile extools/Makefile decoder/Makefile phrasinator/Makefile training/Makefile dpmert/Makefile pro-train/Makefile klm/util/Makefile klm/lm/Makefile mira/Makefile gi/pyp-topics/src/Makefile gi/clda/src/Makefile gi/pf/Makefile gi/markov_al/Makefile) +AC_OUTPUT(Makefile rst_parser/Makefile utils/Makefile mteval/Makefile extools/Makefile decoder/Makefile phrasinator/Makefile training/Makefile dpmert/Makefile pro-train/Makefile klm/util/Makefile klm/lm/Makefile mira/Makefile gi/pyp-topics/src/Makefile gi/clda/src/Makefile gi/pf/Makefile gi/markov_al/Makefile) -- cgit v1.2.3 From ebf49f78a6b88dcea9110df24a65e54eb8c39a37 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Sun, 15 Apr 2012 17:32:51 -0400 Subject: forgotten file --- rst_parser/arc_ff_factory.h | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) create mode 100644 rst_parser/arc_ff_factory.h diff --git a/rst_parser/arc_ff_factory.h b/rst_parser/arc_ff_factory.h new file mode 100644 index 00000000..4237fd5d --- /dev/null +++ b/rst_parser/arc_ff_factory.h @@ -0,0 +1,42 @@ +#ifndef _ARC_FF_FACTORY_H_ +#define _ARC_FF_FACTORY_H_ + +#include +#include +#include + +struct ArcFFFactoryBase { + virtual boost::shared_ptr Create(const std::string& param) const = 0; +}; + +template +struct ArcFFFactory : public ArcFFFactoryBase { + boost::shared_ptr Create(const std::string& param) const { + return boost::shared_ptr(new FF(param)); + } +}; + +struct ArcFFRegistry { + boost::shared_ptr Create(const std::string& name, const std::string& param) const { + std::map::const_iterator it = facts.find(name); + assert(it != facts.end()); + return it->second->Create(param); + } + + void Register(const std::string& name, ArcFFFactoryBase* fact) { + ArcFFFactoryBase*& f = facts[name]; + assert(f == NULL); + f = 
fact; + } + std::map facts; +}; + +std::ostream& operator<<(std::ostream& os, const ArcFFRegistry& reg) { + for (std::map::const_iterator it = reg.facts.begin(); + it != reg.facts.end(); ++it) { + os << " " << it->first << std::endl; + } + return os; +} + +#endif -- cgit v1.2.3 From daa182defda1a97cb66b45b4ebf2a223948d950b Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Sun, 15 Apr 2012 17:33:17 -0400 Subject: small json library --- rst_parser/picojson.h | 979 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 979 insertions(+) create mode 100644 rst_parser/picojson.h diff --git a/rst_parser/picojson.h b/rst_parser/picojson.h new file mode 100644 index 00000000..bdb26057 --- /dev/null +++ b/rst_parser/picojson.h @@ -0,0 +1,979 @@ +/* + * Copyright 2009-2010 Cybozu Labs, Inc. + * Copyright 2011 Kazuho Oku + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY CYBOZU LABS, INC. ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO + * EVENT SHALL CYBOZU LABS, INC. 
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * The views and conclusions contained in the software and documentation are + * those of the authors and should not be interpreted as representing official + * policies, either expressed or implied, of Cybozu Labs, Inc. + * + */ +#ifndef picojson_h +#define picojson_h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef _MSC_VER + #define SNPRINTF _snprintf_s + #pragma warning(push) + #pragma warning(disable : 4244) // conversion from int to char +#else + #define SNPRINTF snprintf +#endif + +namespace picojson { + + enum { + null_type, + boolean_type, + number_type, + string_type, + array_type, + object_type + }; + + struct null {}; + + class value { + public: + typedef std::vector array; + typedef std::map object; + protected: + int type_; + union { + bool boolean_; + double number_; + std::string* string_; + array* array_; + object* object_; + }; + public: + value(); + value(int type, bool); + explicit value(bool b); + explicit value(double n); + explicit value(const std::string& s); + explicit value(const array& a); + explicit value(const object& o); + explicit value(const char* s); + value(const char* s, size_t len); + ~value(); + value(const value& x); + value& operator=(const value& x); + template bool is() const; + template const T& get() const; + template T& get(); + bool evaluate_as_boolean() const; + const value& get(size_t idx) const; + const value& get(const std::string& key) const; + bool 
contains(size_t idx) const; + bool contains(const std::string& key) const; + std::string to_str() const; + template void serialize(Iter os) const; + std::string serialize() const; + private: + template value(const T*); // intentionally defined to block implicit conversion of pointer to bool + }; + + typedef value::array array; + typedef value::object object; + + inline value::value() : type_(null_type) {} + + inline value::value(int type, bool) : type_(type) { + switch (type) { +#define INIT(p, v) case p##type: p = v; break + INIT(boolean_, false); + INIT(number_, 0.0); + INIT(string_, new std::string()); + INIT(array_, new array()); + INIT(object_, new object()); +#undef INIT + default: break; + } + } + + inline value::value(bool b) : type_(boolean_type) { + boolean_ = b; + } + + inline value::value(double n) : type_(number_type) { + number_ = n; + } + + inline value::value(const std::string& s) : type_(string_type) { + string_ = new std::string(s); + } + + inline value::value(const array& a) : type_(array_type) { + array_ = new array(a); + } + + inline value::value(const object& o) : type_(object_type) { + object_ = new object(o); + } + + inline value::value(const char* s) : type_(string_type) { + string_ = new std::string(s); + } + + inline value::value(const char* s, size_t len) : type_(string_type) { + string_ = new std::string(s, len); + } + + inline value::~value() { + switch (type_) { +#define DEINIT(p) case p##type: delete p; break + DEINIT(string_); + DEINIT(array_); + DEINIT(object_); +#undef DEINIT + default: break; + } + } + + inline value::value(const value& x) : type_(x.type_) { + switch (type_) { +#define INIT(p, v) case p##type: p = v; break + INIT(boolean_, x.boolean_); + INIT(number_, x.number_); + INIT(string_, new std::string(*x.string_)); + INIT(array_, new array(*x.array_)); + INIT(object_, new object(*x.object_)); +#undef INIT + default: break; + } + } + + inline value& value::operator=(const value& x) { + if (this != &x) { + this->~value(); 
+ new (this) value(x); + } + return *this; + } + +#define IS(ctype, jtype) \ + template <> inline bool value::is() const { \ + return type_ == jtype##_type; \ + } + IS(null, null) + IS(bool, boolean) + IS(int, number) + IS(double, number) + IS(std::string, string) + IS(array, array) + IS(object, object) +#undef IS + +#define GET(ctype, var) \ + template <> inline const ctype& value::get() const { \ + assert("type mismatch! call vis() before get()" \ + && is()); \ + return var; \ + } \ + template <> inline ctype& value::get() { \ + assert("type mismatch! call is() before get()" \ + && is()); \ + return var; \ + } + GET(bool, boolean_) + GET(double, number_) + GET(std::string, *string_) + GET(array, *array_) + GET(object, *object_) +#undef GET + + inline bool value::evaluate_as_boolean() const { + switch (type_) { + case null_type: + return false; + case boolean_type: + return boolean_; + case number_type: + return number_ != 0; + case string_type: + return ! string_->empty(); + default: + return true; + } + } + + inline const value& value::get(size_t idx) const { + static value s_null; + assert(is()); + return idx < array_->size() ? (*array_)[idx] : s_null; + } + + inline const value& value::get(const std::string& key) const { + static value s_null; + assert(is()); + object::const_iterator i = object_->find(key); + return i != object_->end() ? i->second : s_null; + } + + inline bool value::contains(size_t idx) const { + assert(is()); + return idx < array_->size(); + } + + inline bool value::contains(const std::string& key) const { + assert(is()); + object::const_iterator i = object_->find(key); + return i != object_->end(); + } + + inline std::string value::to_str() const { + switch (type_) { + case null_type: return "null"; + case boolean_type: return boolean_ ? "true" : "false"; + case number_type: { + char buf[256]; + double tmp; + SNPRINTF(buf, sizeof(buf), modf(number_, &tmp) == 0 ? 
"%.f" : "%f", number_); + return buf; + } + case string_type: return *string_; + case array_type: return "array"; + case object_type: return "object"; + default: assert(0); +#ifdef _MSC_VER + __assume(0); +#endif + } + } + + template void copy(const std::string& s, Iter oi) { + std::copy(s.begin(), s.end(), oi); + } + + template void serialize_str(const std::string& s, Iter oi) { + *oi++ = '"'; + for (std::string::const_iterator i = s.begin(); i != s.end(); ++i) { + switch (*i) { +#define MAP(val, sym) case val: copy(sym, oi); break + MAP('"', "\\\""); + MAP('\\', "\\\\"); + MAP('/', "\\/"); + MAP('\b', "\\b"); + MAP('\f', "\\f"); + MAP('\n', "\\n"); + MAP('\r', "\\r"); + MAP('\t', "\\t"); +#undef MAP + default: + if ((unsigned char)*i < 0x20 || *i == 0x7f) { + char buf[7]; + SNPRINTF(buf, sizeof(buf), "\\u%04x", *i & 0xff); + copy(buf, buf + 6, oi); + } else { + *oi++ = *i; + } + break; + } + } + *oi++ = '"'; + } + + template void value::serialize(Iter oi) const { + switch (type_) { + case string_type: + serialize_str(*string_, oi); + break; + case array_type: { + *oi++ = '['; + for (array::const_iterator i = array_->begin(); i != array_->end(); ++i) { + if (i != array_->begin()) { + *oi++ = ','; + } + i->serialize(oi); + } + *oi++ = ']'; + break; + } + case object_type: { + *oi++ = '{'; + for (object::const_iterator i = object_->begin(); + i != object_->end(); + ++i) { + if (i != object_->begin()) { + *oi++ = ','; + } + serialize_str(i->first, oi); + *oi++ = ':'; + i->second.serialize(oi); + } + *oi++ = '}'; + break; + } + default: + copy(to_str(), oi); + break; + } + } + + inline std::string value::serialize() const { + std::string s; + serialize(std::back_inserter(s)); + return s; + } + + template class input { + protected: + Iter cur_, end_; + int last_ch_; + bool ungot_; + int line_; + public: + input(const Iter& first, const Iter& last) : cur_(first), end_(last), last_ch_(-1), ungot_(false), line_(1) {} + int getc() { + if (ungot_) { + ungot_ = false; + 
return last_ch_; + } + if (cur_ == end_) { + last_ch_ = -1; + return -1; + } + if (last_ch_ == '\n') { + line_++; + } + last_ch_ = *cur_++ & 0xff; + return last_ch_; + } + void ungetc() { + if (last_ch_ != -1) { + assert(! ungot_); + ungot_ = true; + } + } + Iter cur() const { return cur_; } + int line() const { return line_; } + void skip_ws() { + while (1) { + int ch = getc(); + if (! (ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r')) { + ungetc(); + break; + } + } + } + int expect(int expect) { + skip_ws(); + if (getc() != expect) { + ungetc(); + return false; + } + return true; + } + bool match(const std::string& pattern) { + for (std::string::const_iterator pi(pattern.begin()); + pi != pattern.end(); + ++pi) { + if (getc() != *pi) { + ungetc(); + return false; + } + } + return true; + } + }; + + template inline int _parse_quadhex(input &in) { + int uni_ch = 0, hex; + for (int i = 0; i < 4; i++) { + if ((hex = in.getc()) == -1) { + return -1; + } + if ('0' <= hex && hex <= '9') { + hex -= '0'; + } else if ('A' <= hex && hex <= 'F') { + hex -= 'A' - 0xa; + } else if ('a' <= hex && hex <= 'f') { + hex -= 'a' - 0xa; + } else { + in.ungetc(); + return -1; + } + uni_ch = uni_ch * 16 + hex; + } + return uni_ch; + } + + template inline bool _parse_codepoint(String& out, input& in) { + int uni_ch; + if ((uni_ch = _parse_quadhex(in)) == -1) { + return false; + } + if (0xd800 <= uni_ch && uni_ch <= 0xdfff) { + if (0xdc00 <= uni_ch) { + // a second 16-bit of a surrogate pair appeared + return false; + } + // first 16-bit of surrogate pair, get the next one + if (in.getc() != '\\' || in.getc() != 'u') { + in.ungetc(); + return false; + } + int second = _parse_quadhex(in); + if (! 
(0xdc00 <= second && second <= 0xdfff)) { + return false; + } + uni_ch = ((uni_ch - 0xd800) << 10) | ((second - 0xdc00) & 0x3ff); + uni_ch += 0x10000; + } + if (uni_ch < 0x80) { + out.push_back(uni_ch); + } else { + if (uni_ch < 0x800) { + out.push_back(0xc0 | (uni_ch >> 6)); + } else { + if (uni_ch < 0x10000) { + out.push_back(0xe0 | (uni_ch >> 12)); + } else { + out.push_back(0xf0 | (uni_ch >> 18)); + out.push_back(0x80 | ((uni_ch >> 12) & 0x3f)); + } + out.push_back(0x80 | ((uni_ch >> 6) & 0x3f)); + } + out.push_back(0x80 | (uni_ch & 0x3f)); + } + return true; + } + + template inline bool _parse_string(String& out, input& in) { + while (1) { + int ch = in.getc(); + if (ch < ' ') { + in.ungetc(); + return false; + } else if (ch == '"') { + return true; + } else if (ch == '\\') { + if ((ch = in.getc()) == -1) { + return false; + } + switch (ch) { +#define MAP(sym, val) case sym: out.push_back(val); break + MAP('"', '\"'); + MAP('\\', '\\'); + MAP('/', '/'); + MAP('b', '\b'); + MAP('f', '\f'); + MAP('n', '\n'); + MAP('r', '\r'); + MAP('t', '\t'); +#undef MAP + case 'u': + if (! _parse_codepoint(out, in)) { + return false; + } + break; + default: + return false; + } + } else { + out.push_back(ch); + } + } + return false; + } + + template inline bool _parse_array(Context& ctx, input& in) { + if (! ctx.parse_array_start()) { + return false; + } + if (in.expect(']')) { + return true; + } + size_t idx = 0; + do { + if (! ctx.parse_array_item(in, idx)) { + return false; + } + idx++; + } while (in.expect(',')); + return in.expect(']'); + } + + template inline bool _parse_object(Context& ctx, input& in) { + if (! ctx.parse_object_start()) { + return false; + } + if (in.expect('}')) { + return true; + } + do { + std::string key; + if (! in.expect('"') + || ! _parse_string(key, in) + || ! in.expect(':')) { + return false; + } + if (! 
ctx.parse_object_item(in, key)) { + return false; + } + } while (in.expect(',')); + return in.expect('}'); + } + + template inline bool _parse_number(double& out, input& in) { + std::string num_str; + while (1) { + int ch = in.getc(); + if (('0' <= ch && ch <= '9') || ch == '+' || ch == '-' || ch == '.' + || ch == 'e' || ch == 'E') { + num_str.push_back(ch); + } else { + in.ungetc(); + break; + } + } + char* endp; + out = strtod(num_str.c_str(), &endp); + return endp == num_str.c_str() + num_str.size(); + } + + template inline bool _parse(Context& ctx, input& in) { + in.skip_ws(); + int ch = in.getc(); + switch (ch) { +#define IS(ch, text, op) case ch: \ + if (in.match(text) && op) { \ + return true; \ + } else { \ + return false; \ + } + IS('n', "ull", ctx.set_null()); + IS('f', "alse", ctx.set_bool(false)); + IS('t', "rue", ctx.set_bool(true)); +#undef IS + case '"': + return ctx.parse_string(in); + case '[': + return _parse_array(ctx, in); + case '{': + return _parse_object(ctx, in); + default: + if (('0' <= ch && ch <= '9') || ch == '-') { + in.ungetc(); + double f; + if (_parse_number(f, in)) { + ctx.set_number(f); + return true; + } else { + return false; + } + } + break; + } + in.ungetc(); + return false; + } + + class deny_parse_context { + public: + bool set_null() { return false; } + bool set_bool(bool) { return false; } + bool set_number(double) { return false; } + template bool parse_string(input&) { return false; } + bool parse_array_start() { return false; } + template bool parse_array_item(input&, size_t) { + return false; + } + bool parse_object_start() { return false; } + template bool parse_object_item(input&, const std::string&) { + return false; + } + }; + + class default_parse_context { + protected: + value* out_; + public: + default_parse_context(value* out) : out_(out) {} + bool set_null() { + *out_ = value(); + return true; + } + bool set_bool(bool b) { + *out_ = value(b); + return true; + } + bool set_number(double f) { + *out_ = value(f); 
+ return true; + } + template bool parse_string(input& in) { + *out_ = value(string_type, false); + return _parse_string(out_->get(), in); + } + bool parse_array_start() { + *out_ = value(array_type, false); + return true; + } + template bool parse_array_item(input& in, size_t) { + array& a = out_->get(); + a.push_back(value()); + default_parse_context ctx(&a.back()); + return _parse(ctx, in); + } + bool parse_object_start() { + *out_ = value(object_type, false); + return true; + } + template bool parse_object_item(input& in, const std::string& key) { + object& o = out_->get(); + default_parse_context ctx(&o[key]); + return _parse(ctx, in); + } + private: + default_parse_context(const default_parse_context&); + default_parse_context& operator=(const default_parse_context&); + }; + + class null_parse_context { + public: + struct dummy_str { + void push_back(int) {} + }; + public: + null_parse_context() {} + bool set_null() { return true; } + bool set_bool(bool) { return true; } + bool set_number(double) { return true; } + template bool parse_string(input& in) { + dummy_str s; + return _parse_string(s, in); + } + bool parse_array_start() { return true; } + template bool parse_array_item(input& in, size_t) { + return _parse(*this, in); + } + bool parse_object_start() { return true; } + template bool parse_object_item(input& in, const std::string&) { + return _parse(*this, in); + } + private: + null_parse_context(const null_parse_context&); + null_parse_context& operator=(const null_parse_context&); + }; + + // obsolete, use the version below + template inline std::string parse(value& out, Iter& pos, const Iter& last) { + std::string err; + pos = parse(out, pos, last, &err); + return err; + } + + template inline Iter _parse(Context& ctx, const Iter& first, const Iter& last, std::string* err) { + input in(first, last); + if (! 
_parse(ctx, in) && err != NULL) { + char buf[64]; + SNPRINTF(buf, sizeof(buf), "syntax error at line %d near: ", in.line()); + *err = buf; + while (1) { + int ch = in.getc(); + if (ch == -1 || ch == '\n') { + break; + } else if (ch >= ' ') { + err->push_back(ch); + } + } + } + return in.cur(); + } + + template inline Iter parse(value& out, const Iter& first, const Iter& last, std::string* err) { + default_parse_context ctx(&out); + return _parse(ctx, first, last, err); + } + + inline std::string parse(value& out, std::istream& is) { + std::string err; + parse(out, std::istreambuf_iterator(is.rdbuf()), + std::istreambuf_iterator(), &err); + return err; + } + + template struct last_error_t { + static std::string s; + }; + template std::string last_error_t::s; + + inline void set_last_error(const std::string& s) { + last_error_t::s = s; + } + + inline const std::string& get_last_error() { + return last_error_t::s; + } + + inline bool operator==(const value& x, const value& y) { + if (x.is()) + return y.is(); +#define PICOJSON_CMP(type) \ + if (x.is()) \ + return y.is() && x.get() == y.get() + PICOJSON_CMP(bool); + PICOJSON_CMP(double); + PICOJSON_CMP(std::string); + PICOJSON_CMP(array); + PICOJSON_CMP(object); +#undef PICOJSON_CMP + assert(0); +#ifdef _MSC_VER + __assume(0); +#endif + return false; + } + + inline bool operator!=(const value& x, const value& y) { + return ! (x == y); + } +} + +inline std::istream& operator>>(std::istream& is, picojson::value& x) +{ + picojson::set_last_error(std::string()); + std::string err = picojson::parse(x, is); + if (! 
err.empty()) { + picojson::set_last_error(err); + is.setstate(std::ios::failbit); + } + return is; +} + +inline std::ostream& operator<<(std::ostream& os, const picojson::value& x) +{ + x.serialize(std::ostream_iterator(os)); + return os; +} +#ifdef _MSC_VER + #pragma warning(pop) +#endif + +#endif +#ifdef TEST_PICOJSON +#ifdef _MSC_VER + #pragma warning(disable : 4127) // conditional expression is constant +#endif + +using namespace std; + +static void plan(int num) +{ + printf("1..%d\n", num); +} + +static bool success = true; + +static void ok(bool b, const char* name = "") +{ + static int n = 1; + if (! b) + success = false; + printf("%s %d - %s\n", b ? "ok" : "ng", n++, name); +} + +template void is(const T& x, const T& y, const char* name = "") +{ + if (x == y) { + ok(true, name); + } else { + ok(false, name); + } +} + +#include + +int main(void) +{ + plan(75); + + // constructors +#define TEST(expr, expected) \ + is(picojson::value expr .serialize(), string(expected), "picojson::value" #expr) + + TEST( (true), "true"); + TEST( (false), "false"); + TEST( (42.0), "42"); + TEST( (string("hello")), "\"hello\""); + TEST( ("hello"), "\"hello\""); + TEST( ("hello", 4), "\"hell\""); + +#undef TEST + +#define TEST(in, type, cmp, serialize_test) { \ + picojson::value v; \ + const char* s = in; \ + string err = picojson::parse(v, s, s + strlen(s)); \ + ok(err.empty(), in " no error"); \ + ok(v.is(), in " check type"); \ + is(v.get(), cmp, in " correct output"); \ + is(*s, '\0', in " read to eof"); \ + if (serialize_test) { \ + is(v.serialize(), string(in), in " serialize"); \ + } \ + } + TEST("false", bool, false, true); + TEST("true", bool, true, true); + TEST("90.5", double, 90.5, false); + TEST("\"hello\"", string, string("hello"), true); + TEST("\"\\\"\\\\\\/\\b\\f\\n\\r\\t\"", string, string("\"\\/\b\f\n\r\t"), + true); + TEST("\"\\u0061\\u30af\\u30ea\\u30b9\"", string, + string("a\xe3\x82\xaf\xe3\x83\xaa\xe3\x82\xb9"), false); + TEST("\"\\ud840\\udc0b\"", string, 
string("\xf0\xa0\x80\x8b"), false); +#undef TEST + +#define TEST(type, expr) { \ + picojson::value v; \ + const char *s = expr; \ + string err = picojson::parse(v, s, s + strlen(s)); \ + ok(err.empty(), "empty " #type " no error"); \ + ok(v.is(), "empty " #type " check type"); \ + ok(v.get().empty(), "check " #type " array size"); \ + } + TEST(array, "[]"); + TEST(object, "{}"); +#undef TEST + + { + picojson::value v; + const char *s = "[1,true,\"hello\"]"; + string err = picojson::parse(v, s, s + strlen(s)); + ok(err.empty(), "array no error"); + ok(v.is(), "array check type"); + is(v.get().size(), size_t(3), "check array size"); + ok(v.contains(0), "check contains array[0]"); + ok(v.get(0).is(), "check array[0] type"); + is(v.get(0).get(), 1.0, "check array[0] value"); + ok(v.contains(1), "check contains array[1]"); + ok(v.get(1).is(), "check array[1] type"); + ok(v.get(1).get(), "check array[1] value"); + ok(v.contains(2), "check contains array[2]"); + ok(v.get(2).is(), "check array[2] type"); + is(v.get(2).get(), string("hello"), "check array[2] value"); + ok(!v.contains(3), "check not contains array[3]"); + } + + { + picojson::value v; + const char *s = "{ \"a\": true }"; + string err = picojson::parse(v, s, s + strlen(s)); + ok(err.empty(), "object no error"); + ok(v.is(), "object check type"); + is(v.get().size(), size_t(1), "check object size"); + ok(v.contains("a"), "check contains property"); + ok(v.get("a").is(), "check bool property exists"); + is(v.get("a").get(), true, "check bool property value"); + is(v.serialize(), string("{\"a\":true}"), "serialize object"); + ok(!v.contains("z"), "check not contains property"); + } + +#define TEST(json, msg) do { \ + picojson::value v; \ + const char *s = json; \ + string err = picojson::parse(v, s, s + strlen(s)); \ + is(err, string("syntax error at line " msg), msg); \ + } while (0) + TEST("falsoa", "1 near: oa"); + TEST("{]", "1 near: ]"); + TEST("\n\bbell", "2 near: bell"); + TEST("\"abc\nd\"", "1 near: "); 
+#undef TEST + + { + picojson::value v1, v2; + const char *s; + string err; + s = "{ \"b\": true, \"a\": [1,2,\"three\"], \"d\": 2 }"; + err = picojson::parse(v1, s, s + strlen(s)); + s = "{ \"d\": 2.0, \"b\": true, \"a\": [1,2,\"three\"] }"; + err = picojson::parse(v2, s, s + strlen(s)); + ok((v1 == v2), "check == operator in deep comparison"); + } + + { + picojson::value v1, v2; + const char *s; + string err; + s = "{ \"b\": true, \"a\": [1,2,\"three\"], \"d\": 2 }"; + err = picojson::parse(v1, s, s + strlen(s)); + s = "{ \"d\": 2.0, \"a\": [1,\"three\"], \"b\": true }"; + err = picojson::parse(v2, s, s + strlen(s)); + ok((v1 != v2), "check != operator for array in deep comparison"); + } + + { + picojson::value v1, v2; + const char *s; + string err; + s = "{ \"b\": true, \"a\": [1,2,\"three\"], \"d\": 2 }"; + err = picojson::parse(v1, s, s + strlen(s)); + s = "{ \"d\": 2.0, \"a\": [1,2,\"three\"], \"b\": false }"; + err = picojson::parse(v2, s, s + strlen(s)); + ok((v1 != v2), "check != operator for object in deep comparison"); + } + + { + picojson::value v1, v2; + const char *s; + string err; + s = "{ \"b\": true, \"a\": [1,2,\"three\"], \"d\": 2 }"; + err = picojson::parse(v1, s, s + strlen(s)); + picojson::object& o = v1.get(); + o.erase("b"); + picojson::array& a = o["a"].get(); + picojson::array::iterator i; + i = std::remove(a.begin(), a.end(), picojson::value(std::string("three"))); + a.erase(i, a.end()); + s = "{ \"a\": [1,2], \"d\": 2 }"; + err = picojson::parse(v2, s, s + strlen(s)); + ok((v1 == v2), "check erase()"); + } + + ok(picojson::value(3.0).serialize() == "3", + "integral number should be serialized as a integer"); + + { + const char* s = "{ \"a\": [1,2], \"d\": 2 }"; + picojson::null_parse_context ctx; + string err; + picojson::_parse(ctx, s, s + strlen(s), &err); + ok(err.empty(), "null_parse_context"); + } + + return success ? 
0 : 1; +} + +#endif -- cgit v1.2.3 From fa47b549e5ac7c16dce9e40d52328ffd51b60dc6 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Mon, 16 Apr 2012 00:18:20 -0400 Subject: rst algorithm --- rst_parser/arc_factored.h | 4 +++- rst_parser/mst_train.cc | 21 +++++++++++++++------ rst_parser/rst.cc | 45 ++++++++++++++++++++++++++++++++++++++++++++- rst_parser/rst.h | 9 +++++++-- 4 files changed, 69 insertions(+), 10 deletions(-) diff --git a/rst_parser/arc_factored.h b/rst_parser/arc_factored.h index a95f8230..d9a0bb24 100644 --- a/rst_parser/arc_factored.h +++ b/rst_parser/arc_factored.h @@ -28,10 +28,12 @@ struct ArcFeatureFunction; class ArcFactoredForest { public: ArcFactoredForest() : num_words_() {} - explicit ArcFactoredForest(short num_words) { + explicit ArcFactoredForest(short num_words) : num_words_(num_words) { resize(num_words); } + unsigned size() const { return num_words_; } + void resize(unsigned num_words) { num_words_ = num_words; root_edges_.clear(); diff --git a/rst_parser/mst_train.cc b/rst_parser/mst_train.cc index def23edb..b5114726 100644 --- a/rst_parser/mst_train.cc +++ b/rst_parser/mst_train.cc @@ -13,6 +13,7 @@ #include "picojson.h" #include "optimize.h" #include "weights.h" +#include "rst.h" using namespace std; namespace po = boost::program_options; @@ -173,12 +174,13 @@ int main(int argc, char** argv) { double obj = -empirical.dot(weights); // SparseVector mfm; //DE for (int i = 0; i < corpus.size(); ++i) { + const int num_words = corpus[i].ts.words.size(); forests[i].Reweight(weights); - double logz; - forests[i].EdgeMarginals(&logz); - //cerr << " O = " << (-corpus[i].features.dot(weights)) << " D=" << -logz << " OO= " << (-corpus[i].features.dot(weights) - logz) << endl; - obj -= logz; - int num_words = corpus[i].ts.words.size(); + double lz; + forests[i].EdgeMarginals(&lz); + obj -= lz; + //cerr << " O = " << (-corpus[i].features.dot(weights)) << " D=" << -lz << " OO= " << (-corpus[i].features.dot(weights) - lz) << endl; + //cerr << " ZZ 
= " << zz << endl; for (int h = -1; h < num_words; ++h) { for (int m = 0; m < num_words; ++m) { if (h == m) continue; @@ -198,13 +200,20 @@ int main(int argc, char** argv) { double gnorm = 0; for (int i = 0; i < g.size(); ++i) gnorm += g[i]*g[i]; - cerr << "OBJ=" << (obj+r) << "\t[F=" << obj << " R=" << r << "]\tGnorm=" << sqrt(gnorm) << endl; + ostringstream ll; + ll << "ITER=" << (iter+1) << "\tOBJ=" << (obj+r) << "\t[F=" << obj << " R=" << r << "]\tGnorm=" << sqrt(gnorm); + cerr << endl << ll.str() << endl; obj += r; assert(obj >= 0); o->Optimize(obj, g, &weights); Weights::ShowLargestFeatures(weights); + string sl = ll.str(); + Weights::WriteToFile(o->HasConverged() ? "weights.final.gz" : "weights.cur.gz", weights, true, &sl); if (o->HasConverged()) { cerr << "CONVERGED\n"; break; } } + forests[0].Reweight(weights); + TreeSampler ts(forests[0]); + EdgeSubset tt; ts.SampleRandomSpanningTree(&tt); return 0; } diff --git a/rst_parser/rst.cc b/rst_parser/rst.cc index f6b295b3..c4ce898e 100644 --- a/rst_parser/rst.cc +++ b/rst_parser/rst.cc @@ -2,6 +2,49 @@ using namespace std; -StochasticForest::StochasticForest(const ArcFactoredForest& af) { +// David B. Wilson. Generating Random Spanning Trees More Quickly than the Cover Time. 
+ +TreeSampler::TreeSampler(const ArcFactoredForest& af) : forest(af), usucc(af.size() + 1) { + // edges are directed from modifiers to heads, to the root + for (int m = 1; m <= forest.size(); ++m) { + SampleSet& ss = usucc[m]; + for (int h = 0; h <= forest.size(); ++h) + ss.add(forest(h-1,m-1).edge_prob.as_float()); + } } +void TreeSampler::SampleRandomSpanningTree(EdgeSubset* tree) { + MT19937 rng; + const int r = 0; + bool success = false; + while (!success) { + int roots = 0; + vector next(forest.size() + 1, -1); + vector in_tree(forest.size() + 1, 0); + in_tree[r] = 1; + for (int i = 0; i < forest.size(); ++i) { + int u = i; + if (in_tree[u]) continue; + while(!in_tree[u]) { + next[u] = rng.SelectSample(usucc[u]); + u = next[u]; + } + u = i; + cerr << (u-1); + while(!in_tree[u]) { + in_tree[u] = true; + u = next[u]; + cerr << " > " << (u-1); + if (u == r) { ++roots; } + } + cerr << endl; + } + assert(roots > 0); + if (roots > 1) { + cerr << "FAILURE\n"; + } else { + success = true; + } + } +}; + diff --git a/rst_parser/rst.h b/rst_parser/rst.h index 865871eb..a269ff9b 100644 --- a/rst_parser/rst.h +++ b/rst_parser/rst.h @@ -1,10 +1,15 @@ #ifndef _RST_H_ #define _RST_H_ +#include +#include "sampler.h" #include "arc_factored.h" -struct StochasticForest { - explicit StochasticForest(const ArcFactoredForest& af); +struct TreeSampler { + explicit TreeSampler(const ArcFactoredForest& af); + void SampleRandomSpanningTree(EdgeSubset* tree); + const ArcFactoredForest& forest; + std::vector > usucc; }; #endif -- cgit v1.2.3 From d5a2a9c3bf18c1e414f79a757c1662fe422e2f5c Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Mon, 16 Apr 2012 14:11:02 -0400 Subject: switch to log domain for matrix operations --- rst_parser/arc_factored.h | 2 +- rst_parser/arc_factored_marginals.cc | 24 +++++++++++++----------- rst_parser/mst_train.cc | 29 +++++++++++++++++++++-------- rst_parser/rst_test.cc | 16 +++++++++++++--- utils/logval.h | 13 ++++++++++--- 5 files changed, 58 insertions(+), 
26 deletions(-) diff --git a/rst_parser/arc_factored.h b/rst_parser/arc_factored.h index d9a0bb24..4de38b66 100644 --- a/rst_parser/arc_factored.h +++ b/rst_parser/arc_factored.h @@ -56,7 +56,7 @@ class ArcFactoredForest { // Reweight edges so that edge_prob is the edge's marginals // optionally returns log partition - void EdgeMarginals(double* p_log_z = NULL); + void EdgeMarginals(prob_t* p_log_z = NULL); // This may not return a tree void PickBestParentForEachWord(EdgeSubset* st) const; diff --git a/rst_parser/arc_factored_marginals.cc b/rst_parser/arc_factored_marginals.cc index 16360b0d..acb8102a 100644 --- a/rst_parser/arc_factored_marginals.cc +++ b/rst_parser/arc_factored_marginals.cc @@ -9,37 +9,39 @@ using namespace std; #if HAVE_EIGEN #include -typedef Eigen::Matrix ArcMatrix; -typedef Eigen::Matrix RootVector; +typedef Eigen::Matrix ArcMatrix; +typedef Eigen::Matrix RootVector; -void ArcFactoredForest::EdgeMarginals(double *plog_z) { +void ArcFactoredForest::EdgeMarginals(prob_t *plog_z) { ArcMatrix A(num_words_,num_words_); RootVector r(num_words_); for (int h = 0; h < num_words_; ++h) { for (int m = 0; m < num_words_; ++m) { if (h != m) - A(h,m) = edges_(h,m).edge_prob.as_float(); + A(h,m) = edges_(h,m).edge_prob; else - A(h,m) = 0; + A(h,m) = prob_t::Zero(); } - r(h) = root_edges_[h].edge_prob.as_float(); + r(h) = root_edges_[h].edge_prob; } ArcMatrix L = -A; L.diagonal() = A.colwise().sum(); L.row(0) = r; ArcMatrix Linv = L.inverse(); - if (plog_z) *plog_z = log(Linv.determinant()); + if (plog_z) *plog_z = Linv.determinant(); RootVector rootMarginals = r.cwiseProduct(Linv.col(0)); + static const prob_t ZERO(0); + static const prob_t ONE(1); // ArcMatrix T = Linv; for (int h = 0; h < num_words_; ++h) { for (int m = 0; m < num_words_; ++m) { - const double marginal = (m == 0 ? 0.0 : 1.0) * A(h,m) * Linv(m,m) - - (h == 0 ? 0.0 : 1.0) * A(h,m) * Linv(m,h); - edges_(h,m).edge_prob = prob_t(marginal); + const prob_t marginal = (m == 0 ? 
ZERO : ONE) * A(h,m) * Linv(m,m) - + (h == 0 ? ZERO : ONE) * A(h,m) * Linv(m,h); + edges_(h,m).edge_prob = marginal; // T(h,m) = marginal; } - root_edges_[h].edge_prob = prob_t(rootMarginals(h)); + root_edges_[h].edge_prob = rootMarginals(h); } // cerr << "ROOT MARGINALS: " << rootMarginals.transpose() << endl; // cerr << "M:\n" << T << endl; diff --git a/rst_parser/mst_train.cc b/rst_parser/mst_train.cc index b5114726..c5cab6ec 100644 --- a/rst_parser/mst_train.cc +++ b/rst_parser/mst_train.cc @@ -23,7 +23,9 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) { string cfg_file; opts.add_options() ("training_data,t",po::value()->default_value("-"), "File containing training data (jsent format)") - ("feature_function,F",po::value >()->composing(), "feature function") + ("feature_function,F",po::value >()->composing(), "feature function (multiple permitted)") + ("weights,w",po::value(), "Optional starting weights") + ("output_every_i_iterations,I",po::value()->default_value(1), "Write weights every I iterations") ("regularization_strength,C",po::value()->default_value(1.0), "Regularization strength") ("correction_buffers,m", po::value()->default_value(10), "LBFGS correction buffers"); po::options_description clo("Command line options"); @@ -161,9 +163,13 @@ int main(int argc, char** argv) { if (flag) cerr << endl; //cerr << "EMP: " << empirical << endl; //DE vector weights(FD::NumFeats(), 0.0); + if (conf.count("weights")) + Weights::InitFromFile(conf["weights"].as(), &weights); vector g(FD::NumFeats(), 0.0); cerr << "features initialized\noptimizing...\n"; boost::shared_ptr o; + int every = corpus.size() / 20; + if (!every) ++every; o.reset(new LBFGSOptimizer(g.size(), conf["correction_buffers"].as())); int iterations = 1000; for (int iter = 0; iter < iterations; ++iter) { @@ -174,11 +180,12 @@ int main(int argc, char** argv) { double obj = -empirical.dot(weights); // SparseVector mfm; //DE for (int i = 0; i < corpus.size(); ++i) { + if ((i + 1) 
% every == 0) cerr << '.' << flush; const int num_words = corpus[i].ts.words.size(); forests[i].Reweight(weights); - double lz; - forests[i].EdgeMarginals(&lz); - obj -= lz; + prob_t z; + forests[i].EdgeMarginals(&z); + obj -= log(z); //cerr << " O = " << (-corpus[i].features.dot(weights)) << " D=" << -lz << " OO= " << (-corpus[i].features.dot(weights) - lz) << endl; //cerr << " ZZ = " << zz << endl; for (int h = -1; h < num_words; ++h) { @@ -202,14 +209,20 @@ int main(int argc, char** argv) { gnorm += g[i]*g[i]; ostringstream ll; ll << "ITER=" << (iter+1) << "\tOBJ=" << (obj+r) << "\t[F=" << obj << " R=" << r << "]\tGnorm=" << sqrt(gnorm); - cerr << endl << ll.str() << endl; + cerr << ' ' << ll.str().substr(ll.str().find('\t')+1) << endl; obj += r; assert(obj >= 0); o->Optimize(obj, g, &weights); Weights::ShowLargestFeatures(weights); - string sl = ll.str(); - Weights::WriteToFile(o->HasConverged() ? "weights.final.gz" : "weights.cur.gz", weights, true, &sl); - if (o->HasConverged()) { cerr << "CONVERGED\n"; break; } + const bool converged = o->HasConverged(); + const char* ofname = converged ? "weights.final.gz" : "weights.cur.gz"; + if (converged || ((iter+1) % conf["output_every_i_iterations"].as()) == 0) { + cerr << "writing..." 
<< flush; + const string sl = ll.str(); + Weights::WriteToFile(ofname, weights, true, &sl); + cerr << "done" << endl; + } + if (converged) { cerr << "CONVERGED\n"; break; } } forests[0].Reweight(weights); TreeSampler ts(forests[0]); diff --git a/rst_parser/rst_test.cc b/rst_parser/rst_test.cc index 7e6fb2c1..3bb95759 100644 --- a/rst_parser/rst_test.cc +++ b/rst_parser/rst_test.cc @@ -2,6 +2,8 @@ #include +#include + using namespace std; int main(int argc, char** argv) { @@ -28,11 +30,19 @@ int main(int argc, char** argv) { af(-1,2).edge_prob.logeq(9); EdgeSubset tree; // af.MaximumEdgeSubset(&tree); - double lz; - af.EdgeMarginals(&lz); - cerr << "Z = " << lz << endl; + prob_t z; + af.EdgeMarginals(&z); + cerr << "Z = " << abs(z) << endl; af.PickBestParentForEachWord(&tree); cerr << tree << endl; + typedef Eigen::Matrix M3; + M3 A = M3::Zero(); + A(0,0) = prob_t(1); + A(1,0) = prob_t(3); + A(0,1) = prob_t(2); + A(1,1) = prob_t(4); + prob_t det = A.determinant(); + cerr << det.as_float() << endl; return 0; } diff --git a/utils/logval.h b/utils/logval.h index 8a59d0b1..ec1f6acd 100644 --- a/utils/logval.h +++ b/utils/logval.h @@ -30,8 +30,6 @@ class LogVal { LogVal(init_minus_1) : s_(true),v_(0) { } LogVal(init_1) : s_(),v_(0) { } LogVal(init_0) : s_(),v_(LOGVAL_LOG0) { } - explicit LogVal(int x) : s_(x<0), v_(s_ ? 
std::log(-x) : std::log(x)) {} - explicit LogVal(unsigned x) : s_(0), v_(std::log(x)) { } LogVal(double lnx,bool sign) : s_(sign),v_(lnx) {} LogVal(double lnx,init_lnx) : s_(),v_(lnx) {} static Self exp(T lnx) { return Self(lnx,false); } @@ -126,7 +124,7 @@ class LogVal { } Self operator-() const { - return Self(v_,-s_); + return Self(v_,!s_); } void negate() { s_ = !s_; } @@ -193,6 +191,15 @@ T log(const LogVal& o) { return o.v_; } +template +LogVal abs(const LogVal& o) { + if (o.s_) { + LogVal res = o; + res.s_ = false; + return res; + } else { return o; } +} + template LogVal pow(const LogVal& b, const T& e) { return b.pow(e); -- cgit v1.2.3 From f2fcf9e8aa0e5dee75fd08ee915488ec1a741975 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Mon, 16 Apr 2012 19:39:36 +0100 Subject: feature extraction helper --- rst_parser/arc_factored.cc | 21 +++++++++++++++++++++ rst_parser/arc_factored.h | 5 ++++- 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/rst_parser/arc_factored.cc b/rst_parser/arc_factored.cc index 44e769b8..34c689f4 100644 --- a/rst_parser/arc_factored.cc +++ b/rst_parser/arc_factored.cc @@ -12,6 +12,27 @@ using namespace std; using namespace std::tr1; using namespace boost; +void EdgeSubset::ExtractFeatures(const TaggedSentence& sentence, + const std::vector >& ffs, + SparseVector* features) const { + SparseVector efmap; + for (int i = 0; i < ffs.size(); ++i) { + const ArcFeatureFunction& ff= *ffs[i]; + for (int j = 0; j < h_m_pairs.size(); ++j) { + efmap.clear(); + ff.EgdeFeatures(sentence, h_m_pairs[j].first, + h_m_pairs[j].second, + &efmap); + (*features) += efmap; + } + for (int j = 0; j < roots.size(); ++j) { + efmap.clear(); + ff.EgdeFeatures(sentence, -1, roots[j], &efmap); + (*features) += efmap; + } + } +} + void ArcFactoredForest::ExtractFeatures(const TaggedSentence& sentence, const std::vector >& ffs) { for (int i = 0; i < ffs.size(); ++i) { diff --git a/rst_parser/arc_factored.h b/rst_parser/arc_factored.h index 4de38b66..a271c8d4 
100644 --- a/rst_parser/arc_factored.h +++ b/rst_parser/arc_factored.h @@ -17,14 +17,17 @@ struct TaggedSentence { std::vector pos; }; +struct ArcFeatureFunction; struct EdgeSubset { EdgeSubset() {} std::vector roots; // unless multiroot trees are supported, this // will have a single member std::vector > h_m_pairs; // h,m start at 0 + void ExtractFeatures(const TaggedSentence& sentence, + const std::vector >& ffs, + SparseVector* features) const; }; -struct ArcFeatureFunction; class ArcFactoredForest { public: ArcFactoredForest() : num_words_() {} -- cgit v1.2.3 From 4b38556c88c739de82b9c298261a262ec620280e Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Mon, 16 Apr 2012 18:20:33 -0400 Subject: rst sampler --- rst_parser/Makefile.am | 7 ++- rst_parser/dep_training.cc | 56 ++++++++++++++++++++ rst_parser/dep_training.h | 17 ++++++ rst_parser/mst_train.cc | 58 +-------------------- rst_parser/rst.cc | 56 +++++++++++++++----- rst_parser/rst.h | 8 ++- rst_parser/rst_parse.cc | 126 +++++++++++++++++++++++++++++++++++++++++++++ utils/weights.cc | 4 +- 8 files changed, 260 insertions(+), 72 deletions(-) create mode 100644 rst_parser/dep_training.cc create mode 100644 rst_parser/dep_training.h create mode 100644 rst_parser/rst_parse.cc diff --git a/rst_parser/Makefile.am b/rst_parser/Makefile.am index 2b64b43a..6e884f53 100644 --- a/rst_parser/Makefile.am +++ b/rst_parser/Makefile.am @@ -1,5 +1,5 @@ bin_PROGRAMS = \ - mst_train + mst_train rst_parse noinst_PROGRAMS = \ rst_test @@ -8,11 +8,14 @@ TESTS = rst_test noinst_LIBRARIES = librst.a -librst_a_SOURCES = arc_factored.cc arc_factored_marginals.cc rst.cc arc_ff.cc +librst_a_SOURCES = arc_factored.cc arc_factored_marginals.cc rst.cc arc_ff.cc dep_training.cc mst_train_SOURCES = mst_train.cc mst_train_LDADD = librst.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a ../training/optimize.o -lz +rst_parse_SOURCES = rst_parse.cc 
+rst_parse_LDADD = librst.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz + rst_test_SOURCES = rst_test.cc rst_test_LDADD = librst.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz diff --git a/rst_parser/dep_training.cc b/rst_parser/dep_training.cc new file mode 100644 index 00000000..de431ebc --- /dev/null +++ b/rst_parser/dep_training.cc @@ -0,0 +1,56 @@ +#include "dep_training.h" + +#include +#include + +#include "stringlib.h" +#include "filelib.h" +#include "tdict.h" +#include "picojson.h" + +using namespace std; + +void TrainingInstance::ReadTraining(const string& fname, vector* corpus, int rank, int size) { + ReadFile rf(fname); + istream& in = *rf.stream(); + string line; + string err; + int lc = 0; + bool flag = false; + while(getline(in, line)) { + ++lc; + if ((lc-1) % size != rank) continue; + if (rank == 0 && lc % 10 == 0) { cerr << '.' 
<< flush; flag = true; } + if (rank == 0 && lc % 400 == 0) { cerr << " [" << lc << "]\n"; flag = false; } + size_t pos = line.rfind('\t'); + assert(pos != string::npos); + picojson::value obj; + picojson::parse(obj, line.begin() + pos, line.end(), &err); + if (err.size() > 0) { cerr << "JSON parse error in " << lc << ": " << err << endl; abort(); } + corpus->push_back(TrainingInstance()); + TrainingInstance& cur = corpus->back(); + TaggedSentence& ts = cur.ts; + EdgeSubset& tree = cur.tree; + assert(obj.is()); + const picojson::object& d = obj.get(); + const picojson::array& ta = d.find("tokens")->second.get(); + for (unsigned i = 0; i < ta.size(); ++i) { + ts.words.push_back(TD::Convert(ta[i].get()[0].get())); + ts.pos.push_back(TD::Convert(ta[i].get()[1].get())); + } + const picojson::array& da = d.find("deps")->second.get(); + for (unsigned i = 0; i < da.size(); ++i) { + const picojson::array& thm = da[i].get(); + // get dep type here + short h = thm[2].get(); + short m = thm[1].get(); + if (h < 0) + tree.roots.push_back(m); + else + tree.h_m_pairs.push_back(make_pair(h,m)); + } + //cerr << TD::GetString(ts.words) << endl << TD::GetString(ts.pos) << endl << tree << endl; + } + if (flag) cerr << "\nRead " << lc << " training instances\n"; +} + diff --git a/rst_parser/dep_training.h b/rst_parser/dep_training.h new file mode 100644 index 00000000..73ffd298 --- /dev/null +++ b/rst_parser/dep_training.h @@ -0,0 +1,17 @@ +#ifndef _DEP_TRAINING_H_ +#define _DEP_TRAINING_H_ + +#include +#include +#include "arc_factored.h" +#include "weights.h" + +struct TrainingInstance { + TaggedSentence ts; + EdgeSubset tree; + SparseVector features; + // reads a "Jsent" formatted dependency file + static void ReadTraining(const std::string& fname, std::vector* corpus, int rank = 0, int size = 1); +}; + +#endif diff --git a/rst_parser/mst_train.cc b/rst_parser/mst_train.cc index c5cab6ec..f0403d7e 100644 --- a/rst_parser/mst_train.cc +++ b/rst_parser/mst_train.cc @@ -10,10 +10,9 @@ 
#include "stringlib.h" #include "filelib.h" #include "tdict.h" -#include "picojson.h" +#include "dep_training.h" #include "optimize.h" #include "weights.h" -#include "rst.h" using namespace std; namespace po = boost::program_options; @@ -47,56 +46,6 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) { } } -struct TrainingInstance { - TaggedSentence ts; - EdgeSubset tree; - SparseVector features; -}; - -void ReadTraining(const string& fname, vector* corpus, int rank = 0, int size = 1) { - ReadFile rf(fname); - istream& in = *rf.stream(); - string line; - string err; - int lc = 0; - bool flag = false; - while(getline(in, line)) { - ++lc; - if ((lc-1) % size != rank) continue; - if (rank == 0 && lc % 10 == 0) { cerr << '.' << flush; flag = true; } - if (rank == 0 && lc % 400 == 0) { cerr << " [" << lc << "]\n"; flag = false; } - size_t pos = line.rfind('\t'); - assert(pos != string::npos); - picojson::value obj; - picojson::parse(obj, line.begin() + pos, line.end(), &err); - if (err.size() > 0) { cerr << "JSON parse error in " << lc << ": " << err << endl; abort(); } - corpus->push_back(TrainingInstance()); - TrainingInstance& cur = corpus->back(); - TaggedSentence& ts = cur.ts; - EdgeSubset& tree = cur.tree; - assert(obj.is()); - const picojson::object& d = obj.get(); - const picojson::array& ta = d.find("tokens")->second.get(); - for (unsigned i = 0; i < ta.size(); ++i) { - ts.words.push_back(TD::Convert(ta[i].get()[0].get())); - ts.pos.push_back(TD::Convert(ta[i].get()[1].get())); - } - const picojson::array& da = d.find("deps")->second.get(); - for (unsigned i = 0; i < da.size(); ++i) { - const picojson::array& thm = da[i].get(); - // get dep type here - short h = thm[2].get(); - short m = thm[1].get(); - if (h < 0) - tree.roots.push_back(m); - else - tree.h_m_pairs.push_back(make_pair(h,m)); - } - //cerr << TD::GetString(ts.words) << endl << TD::GetString(ts.pos) << endl << tree << endl; - } - if (flag) cerr << "\nRead " << lc << " training 
instances\n"; -} - void AddFeatures(double prob, const SparseVector& fmap, vector* g) { for (SparseVector::const_iterator it = fmap.begin(); it != fmap.end(); ++it) (*g)[it->first] += it->second * prob; @@ -131,7 +80,7 @@ int main(int argc, char** argv) { vector corpus; vector > ffs; ffs.push_back(boost::shared_ptr(new DistancePenalty(""))); - ReadTraining(conf["training_data"].as(), &corpus, rank, size); + TrainingInstance::ReadTraining(conf["training_data"].as(), &corpus, rank, size); vector forests(corpus.size()); SparseVector empirical; bool flag = false; @@ -224,9 +173,6 @@ int main(int argc, char** argv) { } if (converged) { cerr << "CONVERGED\n"; break; } } - forests[0].Reweight(weights); - TreeSampler ts(forests[0]); - EdgeSubset tt; ts.SampleRandomSpanningTree(&tt); return 0; } diff --git a/rst_parser/rst.cc b/rst_parser/rst.cc index c4ce898e..bc91330b 100644 --- a/rst_parser/rst.cc +++ b/rst_parser/rst.cc @@ -3,45 +3,77 @@ using namespace std; // David B. Wilson. Generating Random Spanning Trees More Quickly than the Cover Time. 
- +// this is an awesome algorithm TreeSampler::TreeSampler(const ArcFactoredForest& af) : forest(af), usucc(af.size() + 1) { - // edges are directed from modifiers to heads, to the root + // edges are directed from modifiers to heads, and finally to the root + vector p; for (int m = 1; m <= forest.size(); ++m) { +#if USE_ALIAS_SAMPLER + p.clear(); +#else SampleSet& ss = usucc[m]; - for (int h = 0; h <= forest.size(); ++h) - ss.add(forest(h-1,m-1).edge_prob.as_float()); +#endif + double z = 0; + for (int h = 0; h <= forest.size(); ++h) { + double u = forest(h-1,m-1).edge_prob.as_float(); + z += u; +#if USE_ALIAS_SAMPLER + p.push_back(u); +#else + ss.add(u); +#endif + } +#if USE_ALIAS_SAMPLER + for (int i = 0; i < p.size(); ++i) { p[i] /= z; } + usucc[m].Init(p); +#endif } } -void TreeSampler::SampleRandomSpanningTree(EdgeSubset* tree) { - MT19937 rng; +void TreeSampler::SampleRandomSpanningTree(EdgeSubset* tree, MT19937* prng) { + MT19937& rng = *prng; const int r = 0; bool success = false; while (!success) { int roots = 0; + tree->h_m_pairs.clear(); + tree->roots.clear(); vector next(forest.size() + 1, -1); vector in_tree(forest.size() + 1, 0); in_tree[r] = 1; - for (int i = 0; i < forest.size(); ++i) { + //cerr << "Forest size: " << forest.size() << endl; + for (int i = 0; i <= forest.size(); ++i) { + //cerr << "Sampling starting at u=" << i << endl; int u = i; if (in_tree[u]) continue; while(!in_tree[u]) { +#if USE_ALIAS_SAMPLER + next[u] = usucc[u].Draw(rng); +#else next[u] = rng.SelectSample(usucc[u]); +#endif u = next[u]; } u = i; - cerr << (u-1); + //cerr << (u-1); + int prev = u-1; while(!in_tree[u]) { in_tree[u] = true; u = next[u]; - cerr << " > " << (u-1); - if (u == r) { ++roots; } + //cerr << " > " << (u-1); + if (u == r) { + ++roots; + tree->roots.push_back(prev); + } else { + tree->h_m_pairs.push_back(make_pair(u-1,prev)); + } + prev = u-1; } - cerr << endl; + //cerr << endl; } assert(roots > 0); if (roots > 1) { - cerr << "FAILURE\n"; + //cerr << 
"FAILURE\n"; } else { success = true; } diff --git a/rst_parser/rst.h b/rst_parser/rst.h index a269ff9b..8bf389f7 100644 --- a/rst_parser/rst.h +++ b/rst_parser/rst.h @@ -4,12 +4,18 @@ #include #include "sampler.h" #include "arc_factored.h" +#include "alias_sampler.h" struct TreeSampler { explicit TreeSampler(const ArcFactoredForest& af); - void SampleRandomSpanningTree(EdgeSubset* tree); + void SampleRandomSpanningTree(EdgeSubset* tree, MT19937* rng); const ArcFactoredForest& forest; +#define USE_ALIAS_SAMPLER 1 +#if USE_ALIAS_SAMPLER + std::vector usucc; +#else std::vector > usucc; +#endif }; #endif diff --git a/rst_parser/rst_parse.cc b/rst_parser/rst_parse.cc new file mode 100644 index 00000000..9cc1359a --- /dev/null +++ b/rst_parser/rst_parse.cc @@ -0,0 +1,126 @@ +#include "arc_factored.h" + +#include +#include +#include +#include + +#include "timing_stats.h" +#include "arc_ff.h" +#include "arc_ff_factory.h" +#include "dep_training.h" +#include "stringlib.h" +#include "filelib.h" +#include "tdict.h" +#include "weights.h" +#include "rst.h" + +using namespace std; +namespace po = boost::program_options; + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { + po::options_description opts("Configuration options"); + string cfg_file; + opts.add_options() + ("training_data,t",po::value()->default_value("-"), "File containing training data (jsent format)") + ("feature_function,F",po::value >()->composing(), "feature function (multiple permitted)") + ("q_weights,q",po::value(), "Arc-factored weights for proposal distribution") + ("samples,n",po::value()->default_value(1000), "Number of samples"); + po::options_description clo("Command line options"); + clo.add_options() + ("config,c", po::value(&cfg_file), "Configuration file") + ("help,?", "Print this help message and exit"); + + po::options_description dconfig_options, dcmdline_options; + dconfig_options.add(opts); + dcmdline_options.add(dconfig_options).add(clo); + 
po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + if (cfg_file.size() > 0) { + ReadFile rf(cfg_file); + po::store(po::parse_config_file(*rf.stream(), dconfig_options), *conf); + } + if (conf->count("help")) { + cerr << dcmdline_options << endl; + exit(1); + } +} + +int main(int argc, char** argv) { + po::variables_map conf; + InitCommandLine(argc, argv, &conf); + ArcFactoredForest af(5); + ArcFFRegistry reg; + reg.Register("DistancePenalty", new ArcFFFactory); + vector corpus; + vector > ffs; + ffs.push_back(boost::shared_ptr(new DistancePenalty(""))); + TrainingInstance::ReadTraining(conf["training_data"].as(), &corpus); + vector forests(corpus.size()); + SparseVector empirical; + bool flag = false; + for (int i = 0; i < corpus.size(); ++i) { + TrainingInstance& cur = corpus[i]; + if ((i+1) % 10 == 0) { cerr << '.' << flush; flag = true; } + if ((i+1) % 400 == 0) { cerr << " [" << (i+1) << "]\n"; flag = false; } + for (int fi = 0; fi < ffs.size(); ++fi) { + ArcFeatureFunction& ff = *ffs[fi]; + ff.PrepareForInput(cur.ts); + SparseVector efmap; + for (int j = 0; j < cur.tree.h_m_pairs.size(); ++j) { + efmap.clear(); + ff.EgdeFeatures(cur.ts, cur.tree.h_m_pairs[j].first, + cur.tree.h_m_pairs[j].second, + &efmap); + cur.features += efmap; + } + for (int j = 0; j < cur.tree.roots.size(); ++j) { + efmap.clear(); + ff.EgdeFeatures(cur.ts, -1, cur.tree.roots[j], &efmap); + cur.features += efmap; + } + } + empirical += cur.features; + forests[i].resize(cur.ts.words.size()); + forests[i].ExtractFeatures(cur.ts, ffs); + } + if (flag) cerr << endl; + vector weights(FD::NumFeats(), 0.0); + Weights::InitFromFile(conf["q_weights"].as(), &weights); + MT19937 rng; + SparseVector model_exp; + SparseVector sampled_exp; + int samples = conf["samples"].as(); + for (int i = 0; i < corpus.size(); ++i) { + const int num_words = corpus[i].ts.words.size(); + forests[i].Reweight(weights); + forests[i].EdgeMarginals(); + model_exp.clear(); + for (int h = -1; h < 
num_words; ++h) { + for (int m = 0; m < num_words; ++m) { + if (h == m) continue; + const ArcFactoredForest::Edge& edge = forests[i](h,m); + const SparseVector& fmap = edge.features; + double prob = edge.edge_prob.as_float(); + model_exp += fmap * prob; + } + } + //cerr << "TRUE EXP: " << model_exp << endl; + + forests[i].Reweight(weights); + TreeSampler ts(forests[i]); + sampled_exp.clear(); + //ostringstream os; os << "Samples_" << samples; + //Timer t(os.str()); + for (int n = 0; n < samples; ++n) { + EdgeSubset tree; + ts.SampleRandomSpanningTree(&tree, &rng); + SparseVector feats; + tree.ExtractFeatures(corpus[i].ts, ffs, &feats); + sampled_exp += feats; + } + sampled_exp /= samples; + cerr << "L2 norm of diff @ " << samples << " samples: " << (model_exp - sampled_exp).l2norm() << endl; + } + return 0; +} + diff --git a/utils/weights.cc b/utils/weights.cc index ac407dfb..39c18474 100644 --- a/utils/weights.cc +++ b/utils/weights.cc @@ -144,8 +144,10 @@ void Weights::ShowLargestFeatures(const vector& w) { vector fnums(w.size()); for (int i = 0; i < w.size(); ++i) fnums[i] = i; + int nf = FD::NumFeats(); + if (nf > 10) nf = 10; vector::iterator mid = fnums.begin(); - mid += (w.size() > 10 ? 10 : w.size()); + mid += nf; partial_sort(fnums.begin(), mid, fnums.end(), FComp(w)); cerr << "TOP FEATURES:"; for (vector::iterator i = fnums.begin(); i != mid; ++i) { -- cgit v1.2.3 From 8aff3bd109b82b57c32a0b14a019c99c1ec35705 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Mon, 16 Apr 2012 18:37:04 -0400 Subject: improved alias sampler --- utils/alias_sampler.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/utils/alias_sampler.h b/utils/alias_sampler.h index 85da9944..81541f7a 100644 --- a/utils/alias_sampler.h +++ b/utils/alias_sampler.h @@ -14,10 +14,13 @@ // constructor computes the F's and L's given an arbitrary multionimial p in // O(n) time and Draw returns samples in O(1) time. 
struct AliasSampler { - explicit AliasSampler(const std::vector& p) : - cutoffs_(p.size()), - aliases_(p.size(), std::numeric_limits::max()) { + AliasSampler() {} + explicit AliasSampler(const std::vector& p) { Init(p); } + void Init(const std::vector& p) { const unsigned N = p.size(); + cutoffs_.resize(p.size()); + aliases_.clear(); + aliases_.resize(p.size(), std::numeric_limits::max()); std::vector s,g; for (unsigned i = 0; i < N; ++i) { const double cutoff = cutoffs_[i] = N * p[i]; -- cgit v1.2.3 From fa80a23079d642a3e984712c9dfa9ac47d2457fa Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Mon, 16 Apr 2012 22:42:24 -0400 Subject: refactor some code, simplify, fix typos --- rst_parser/Makefile.am | 16 ++--- rst_parser/arc_factored.cc | 40 ++++++------ rst_parser/arc_factored.h | 7 ++- rst_parser/arc_ff.cc | 120 +++++++++++++++++++++--------------- rst_parser/arc_ff.h | 35 +++-------- rst_parser/arc_ff_factory.h | 42 ------------- rst_parser/mst_train.cc | 37 +++++------- rst_parser/rst_parse.cc | 126 -------------------------------------- rst_parser/rst_test.cc | 48 --------------- rst_parser/rst_train.cc | 144 ++++++++++++++++++++++++++++++++++++++++++++ 10 files changed, 264 insertions(+), 351 deletions(-) delete mode 100644 rst_parser/arc_ff_factory.h delete mode 100644 rst_parser/rst_parse.cc delete mode 100644 rst_parser/rst_test.cc create mode 100644 rst_parser/rst_train.cc diff --git a/rst_parser/Makefile.am b/rst_parser/Makefile.am index 6e884f53..876c2237 100644 --- a/rst_parser/Makefile.am +++ b/rst_parser/Makefile.am @@ -1,22 +1,14 @@ bin_PROGRAMS = \ - mst_train rst_parse - -noinst_PROGRAMS = \ - rst_test - -TESTS = rst_test + mst_train rst_train noinst_LIBRARIES = librst.a -librst_a_SOURCES = arc_factored.cc arc_factored_marginals.cc rst.cc arc_ff.cc dep_training.cc +librst_a_SOURCES = arc_factored.cc arc_factored_marginals.cc rst.cc arc_ff.cc dep_training.cc global_ff.cc mst_train_SOURCES = mst_train.cc mst_train_LDADD = librst.a 
$(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a ../training/optimize.o -lz -rst_parse_SOURCES = rst_parse.cc -rst_parse_LDADD = librst.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz - -rst_test_SOURCES = rst_test.cc -rst_test_LDADD = librst.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz +rst_train_SOURCES = rst_train.cc +rst_train_LDADD = librst.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz AM_CPPFLAGS = -W -Wall -Wno-sign-compare $(GTEST_CPPFLAGS) -I$(top_srcdir)/decoder -I$(top_srcdir)/training -I$(top_srcdir)/utils -I$(top_srcdir)/mteval -I../klm diff --git a/rst_parser/arc_factored.cc b/rst_parser/arc_factored.cc index 34c689f4..74bf7516 100644 --- a/rst_parser/arc_factored.cc +++ b/rst_parser/arc_factored.cc @@ -13,36 +13,30 @@ using namespace std::tr1; using namespace boost; void EdgeSubset::ExtractFeatures(const TaggedSentence& sentence, - const std::vector >& ffs, + const ArcFeatureFunctions& ffs, SparseVector* features) const { SparseVector efmap; - for (int i = 0; i < ffs.size(); ++i) { - const ArcFeatureFunction& ff= *ffs[i]; - for (int j = 0; j < h_m_pairs.size(); ++j) { - efmap.clear(); - ff.EgdeFeatures(sentence, h_m_pairs[j].first, - h_m_pairs[j].second, - &efmap); - (*features) += efmap; - } - for (int j = 0; j < roots.size(); ++j) { - efmap.clear(); - ff.EgdeFeatures(sentence, -1, roots[j], &efmap); - (*features) += efmap; - } + for (int j = 0; j < h_m_pairs.size(); ++j) { + efmap.clear(); + ffs.EdgeFeatures(sentence, h_m_pairs[j].first, + h_m_pairs[j].second, + &efmap); + (*features) += efmap; + } + for (int j = 0; j < roots.size(); ++j) { + efmap.clear(); + 
ffs.EdgeFeatures(sentence, -1, roots[j], &efmap); + (*features) += efmap; } } void ArcFactoredForest::ExtractFeatures(const TaggedSentence& sentence, - const std::vector >& ffs) { - for (int i = 0; i < ffs.size(); ++i) { - const ArcFeatureFunction& ff = *ffs[i]; - for (int m = 0; m < num_words_; ++m) { - for (int h = 0; h < num_words_; ++h) { - ff.EgdeFeatures(sentence, h, m, &edges_(h,m).features); - } - ff.EgdeFeatures(sentence, -1, m, &root_edges_[m].features); + const ArcFeatureFunctions& ffs) { + for (int m = 0; m < num_words_; ++m) { + for (int h = 0; h < num_words_; ++h) { + ffs.EdgeFeatures(sentence, h, m, &edges_(h,m).features); } + ffs.EdgeFeatures(sentence, -1, m, &root_edges_[m].features); } } diff --git a/rst_parser/arc_factored.h b/rst_parser/arc_factored.h index a271c8d4..c5481d80 100644 --- a/rst_parser/arc_factored.h +++ b/rst_parser/arc_factored.h @@ -17,14 +17,15 @@ struct TaggedSentence { std::vector pos; }; -struct ArcFeatureFunction; +struct ArcFeatureFunctions; struct EdgeSubset { EdgeSubset() {} std::vector roots; // unless multiroot trees are supported, this // will have a single member std::vector > h_m_pairs; // h,m start at 0 + // assumes ArcFeatureFunction::PrepareForInput has already been called void ExtractFeatures(const TaggedSentence& sentence, - const std::vector >& ffs, + const ArcFeatureFunctions& ffs, SparseVector* features) const; }; @@ -74,7 +75,7 @@ class ArcFactoredForest { // set eges_[*].features void ExtractFeatures(const TaggedSentence& sentence, - const std::vector >& ffs); + const ArcFeatureFunctions& ffs); const Edge& operator()(short h, short m) const { return h >= 0 ? 
edges_(h, m) : root_edges_[m]; diff --git a/rst_parser/arc_ff.cc b/rst_parser/arc_ff.cc index f9effbda..10885716 100644 --- a/rst_parser/arc_ff.cc +++ b/rst_parser/arc_ff.cc @@ -6,59 +6,81 @@ using namespace std; -ArcFeatureFunction::~ArcFeatureFunction() {} +struct ArcFFImpl { + ArcFFImpl() : kROOT("ROOT") {} + const string kROOT; -void ArcFeatureFunction::PrepareForInput(const TaggedSentence&) {} + void PrepareForInput(const TaggedSentence& sentence) { + (void) sentence; + } + + void EdgeFeatures(const TaggedSentence& sent, + short h, + short m, + SparseVector* features) const { + const bool is_root = (h == -1); + const string& head_word = (is_root ? kROOT : TD::Convert(sent.words[h])); + const string& head_pos = (is_root ? kROOT : TD::Convert(sent.pos[h])); + const string& mod_word = TD::Convert(sent.words[m]); + const string& mod_pos = TD::Convert(sent.pos[m]); + const bool dir = m < h; + int v = m - h; + if (v < 0) { + v= -1 - int(log(-v) / log(2)); + } else { + v= int(log(v) / log(2)); + } + static map lenmap; + int& lenfid = lenmap[v]; + if (!lenfid) { + ostringstream os; + if (v < 0) os << "LenL" << -v; else os << "LenR" << v; + lenfid = FD::Convert(os.str()); + } + features->set_value(lenfid, 1.0); + const string& lenstr = FD::Convert(lenfid); + if (!is_root) { + static int modl = FD::Convert("ModLeft"); + static int modr = FD::Convert("ModRight"); + if (dir) features->set_value(modl, 1); + else features->set_value(modr, 1); + } + if (is_root) { + ostringstream os; + os << "ROOT:" << mod_pos; + features->set_value(FD::Convert(os.str()), 1.0); + os << "_" << lenstr; + features->set_value(FD::Convert(os.str()), 1.0); + } else { // not root + ostringstream os; + os << "HM:" << head_pos << '_' << mod_pos; + features->set_value(FD::Convert(os.str()), 1.0); + os << '_' << dir; + features->set_value(FD::Convert(os.str()), 1.0); + os << '_' << lenstr; + features->set_value(FD::Convert(os.str()), 1.0); + ostringstream os2; + os2 << "LexHM:" << head_word << '_' << 
mod_word; + features->set_value(FD::Convert(os2.str()), 1.0); + os2 << '_' << dir; + features->set_value(FD::Convert(os2.str()), 1.0); + os2 << '_' << lenstr; + features->set_value(FD::Convert(os2.str()), 1.0); + } + } +}; -DistancePenalty::DistancePenalty(const string&) : fidw_(FD::Convert("Distance")), fidr_(FD::Convert("RootDistance")) {} +ArcFeatureFunctions::ArcFeatureFunctions() : pimpl(new ArcFFImpl) {} +ArcFeatureFunctions::~ArcFeatureFunctions() { delete pimpl; } + +void ArcFeatureFunctions::PrepareForInput(const TaggedSentence& sentence) { + pimpl->PrepareForInput(sentence); +} -void DistancePenalty::EdgeFeaturesImpl(const TaggedSentence& sent, +void ArcFeatureFunctions::EdgeFeatures(const TaggedSentence& sentence, short h, short m, SparseVector* features) const { - const bool dir = m < h; - const bool is_root = (h == -1); - int v = m - h; - if (v < 0) { - v= -1 - int(log(-v) / log(2)); - } else { - v= int(log(v) / log(2)); - } - static map lenmap; - int& lenfid = lenmap[v]; - if (!lenfid) { - ostringstream os; - if (v < 0) os << "LenL" << -v; else os << "LenR" << v; - lenfid = FD::Convert(os.str()); - } - features->set_value(lenfid, 1.0); - const string& lenstr = FD::Convert(lenfid); - if (!is_root) { - static int modl = FD::Convert("ModLeft"); - static int modr = FD::Convert("ModRight"); - if (dir) features->set_value(modl, 1); - else features->set_value(modr, 1); - } - if (is_root) { - ostringstream os; - os << "ROOT:" << TD::Convert(sent.pos[m]); - features->set_value(FD::Convert(os.str()), 1.0); - os << "_" << lenstr; - features->set_value(FD::Convert(os.str()), 1.0); - } else { // not root - ostringstream os; - os << "HM:" << TD::Convert(sent.pos[h]) << '_' << TD::Convert(sent.pos[m]); - features->set_value(FD::Convert(os.str()), 1.0); - os << '_' << dir; - features->set_value(FD::Convert(os.str()), 1.0); - os << '_' << lenstr; - features->set_value(FD::Convert(os.str()), 1.0); - ostringstream os2; - os2 << "LexHM:" << TD::Convert(sent.words[h]) << 
'_' << TD::Convert(sent.words[m]); - features->set_value(FD::Convert(os2.str()), 1.0); - os2 << '_' << dir; - features->set_value(FD::Convert(os2.str()), 1.0); - os2 << '_' << lenstr; - features->set_value(FD::Convert(os2.str()), 1.0); - } + pimpl->EdgeFeatures(sentence, h, m, features); } + diff --git a/rst_parser/arc_ff.h b/rst_parser/arc_ff.h index bc51fef4..52f311d2 100644 --- a/rst_parser/arc_ff.h +++ b/rst_parser/arc_ff.h @@ -7,37 +7,22 @@ #include "arc_factored.h" struct TaggedSentence; -class ArcFeatureFunction { +struct ArcFFImpl; +class ArcFeatureFunctions { public: - virtual ~ArcFeatureFunction(); + ArcFeatureFunctions(); + ~ArcFeatureFunctions(); // called once, per input, before any calls to EdgeFeatures // used to initialize sentence-specific data structures - virtual void PrepareForInput(const TaggedSentence& sentence); + void PrepareForInput(const TaggedSentence& sentence); - inline void EgdeFeatures(const TaggedSentence& sentence, - short h, - short m, - SparseVector* features) const { - EdgeFeaturesImpl(sentence, h, m, features); - } - protected: - virtual void EdgeFeaturesImpl(const TaggedSentence& sentence, - short h, - short m, - SparseVector* features) const = 0; -}; - -class DistancePenalty : public ArcFeatureFunction { - public: - DistancePenalty(const std::string& param); - protected: - virtual void EdgeFeaturesImpl(const TaggedSentence& sentence, - short h, - short m, - SparseVector* features) const; + void EdgeFeatures(const TaggedSentence& sentence, + short h, + short m, + SparseVector* features) const; private: - const int fidw_, fidr_; + ArcFFImpl* pimpl; }; #endif diff --git a/rst_parser/arc_ff_factory.h b/rst_parser/arc_ff_factory.h deleted file mode 100644 index 4237fd5d..00000000 --- a/rst_parser/arc_ff_factory.h +++ /dev/null @@ -1,42 +0,0 @@ -#ifndef _ARC_FF_FACTORY_H_ -#define _ARC_FF_FACTORY_H_ - -#include -#include -#include - -struct ArcFFFactoryBase { - virtual boost::shared_ptr Create(const std::string& param) const = 0; 
-}; - -template -struct ArcFFFactory : public ArcFFFactoryBase { - boost::shared_ptr Create(const std::string& param) const { - return boost::shared_ptr(new FF(param)); - } -}; - -struct ArcFFRegistry { - boost::shared_ptr Create(const std::string& name, const std::string& param) const { - std::map::const_iterator it = facts.find(name); - assert(it != facts.end()); - return it->second->Create(param); - } - - void Register(const std::string& name, ArcFFFactoryBase* fact) { - ArcFFFactoryBase*& f = facts[name]; - assert(f == NULL); - f = fact; - } - std::map facts; -}; - -std::ostream& operator<<(std::ostream& os, const ArcFFRegistry& reg) { - for (std::map::const_iterator it = reg.facts.begin(); - it != reg.facts.end(); ++it) { - os << " " << it->first << std::endl; - } - return os; -} - -#endif diff --git a/rst_parser/mst_train.cc b/rst_parser/mst_train.cc index f0403d7e..0709e7c9 100644 --- a/rst_parser/mst_train.cc +++ b/rst_parser/mst_train.cc @@ -6,7 +6,6 @@ #include #include "arc_ff.h" -#include "arc_ff_factory.h" #include "stringlib.h" #include "filelib.h" #include "tdict.h" @@ -22,7 +21,6 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) { string cfg_file; opts.add_options() ("training_data,t",po::value()->default_value("-"), "File containing training data (jsent format)") - ("feature_function,F",po::value >()->composing(), "feature function (multiple permitted)") ("weights,w",po::value(), "Optional starting weights") ("output_every_i_iterations,I",po::value()->default_value(1), "Write weights every I iterations") ("regularization_strength,C",po::value()->default_value(1.0), "Regularization strength") @@ -74,12 +72,8 @@ int main(int argc, char** argv) { int size = 1; po::variables_map conf; InitCommandLine(argc, argv, &conf); - ArcFactoredForest af(5); - ArcFFRegistry reg; - reg.Register("DistancePenalty", new ArcFFFactory); + ArcFeatureFunctions ffs; vector corpus; - vector > ffs; - ffs.push_back(boost::shared_ptr(new 
DistancePenalty(""))); TrainingInstance::ReadTraining(conf["training_data"].as(), &corpus, rank, size); vector forests(corpus.size()); SparseVector empirical; @@ -88,22 +82,19 @@ int main(int argc, char** argv) { TrainingInstance& cur = corpus[i]; if (rank == 0 && (i+1) % 10 == 0) { cerr << '.' << flush; flag = true; } if (rank == 0 && (i+1) % 400 == 0) { cerr << " [" << (i+1) << "]\n"; flag = false; } - for (int fi = 0; fi < ffs.size(); ++fi) { - ArcFeatureFunction& ff = *ffs[fi]; - ff.PrepareForInput(cur.ts); - SparseVector efmap; - for (int j = 0; j < cur.tree.h_m_pairs.size(); ++j) { - efmap.clear(); - ff.EgdeFeatures(cur.ts, cur.tree.h_m_pairs[j].first, - cur.tree.h_m_pairs[j].second, - &efmap); - cur.features += efmap; - } - for (int j = 0; j < cur.tree.roots.size(); ++j) { - efmap.clear(); - ff.EgdeFeatures(cur.ts, -1, cur.tree.roots[j], &efmap); - cur.features += efmap; - } + ffs.PrepareForInput(cur.ts); + SparseVector efmap; + for (int j = 0; j < cur.tree.h_m_pairs.size(); ++j) { + efmap.clear(); + ffs.EdgeFeatures(cur.ts, cur.tree.h_m_pairs[j].first, + cur.tree.h_m_pairs[j].second, + &efmap); + cur.features += efmap; + } + for (int j = 0; j < cur.tree.roots.size(); ++j) { + efmap.clear(); + ffs.EdgeFeatures(cur.ts, -1, cur.tree.roots[j], &efmap); + cur.features += efmap; } empirical += cur.features; forests[i].resize(cur.ts.words.size()); diff --git a/rst_parser/rst_parse.cc b/rst_parser/rst_parse.cc deleted file mode 100644 index 9cc1359a..00000000 --- a/rst_parser/rst_parse.cc +++ /dev/null @@ -1,126 +0,0 @@ -#include "arc_factored.h" - -#include -#include -#include -#include - -#include "timing_stats.h" -#include "arc_ff.h" -#include "arc_ff_factory.h" -#include "dep_training.h" -#include "stringlib.h" -#include "filelib.h" -#include "tdict.h" -#include "weights.h" -#include "rst.h" - -using namespace std; -namespace po = boost::program_options; - -void InitCommandLine(int argc, char** argv, po::variables_map* conf) { - po::options_description 
opts("Configuration options"); - string cfg_file; - opts.add_options() - ("training_data,t",po::value()->default_value("-"), "File containing training data (jsent format)") - ("feature_function,F",po::value >()->composing(), "feature function (multiple permitted)") - ("q_weights,q",po::value(), "Arc-factored weights for proposal distribution") - ("samples,n",po::value()->default_value(1000), "Number of samples"); - po::options_description clo("Command line options"); - clo.add_options() - ("config,c", po::value(&cfg_file), "Configuration file") - ("help,?", "Print this help message and exit"); - - po::options_description dconfig_options, dcmdline_options; - dconfig_options.add(opts); - dcmdline_options.add(dconfig_options).add(clo); - po::store(parse_command_line(argc, argv, dcmdline_options), *conf); - if (cfg_file.size() > 0) { - ReadFile rf(cfg_file); - po::store(po::parse_config_file(*rf.stream(), dconfig_options), *conf); - } - if (conf->count("help")) { - cerr << dcmdline_options << endl; - exit(1); - } -} - -int main(int argc, char** argv) { - po::variables_map conf; - InitCommandLine(argc, argv, &conf); - ArcFactoredForest af(5); - ArcFFRegistry reg; - reg.Register("DistancePenalty", new ArcFFFactory); - vector corpus; - vector > ffs; - ffs.push_back(boost::shared_ptr(new DistancePenalty(""))); - TrainingInstance::ReadTraining(conf["training_data"].as(), &corpus); - vector forests(corpus.size()); - SparseVector empirical; - bool flag = false; - for (int i = 0; i < corpus.size(); ++i) { - TrainingInstance& cur = corpus[i]; - if ((i+1) % 10 == 0) { cerr << '.' 
<< flush; flag = true; } - if ((i+1) % 400 == 0) { cerr << " [" << (i+1) << "]\n"; flag = false; } - for (int fi = 0; fi < ffs.size(); ++fi) { - ArcFeatureFunction& ff = *ffs[fi]; - ff.PrepareForInput(cur.ts); - SparseVector efmap; - for (int j = 0; j < cur.tree.h_m_pairs.size(); ++j) { - efmap.clear(); - ff.EgdeFeatures(cur.ts, cur.tree.h_m_pairs[j].first, - cur.tree.h_m_pairs[j].second, - &efmap); - cur.features += efmap; - } - for (int j = 0; j < cur.tree.roots.size(); ++j) { - efmap.clear(); - ff.EgdeFeatures(cur.ts, -1, cur.tree.roots[j], &efmap); - cur.features += efmap; - } - } - empirical += cur.features; - forests[i].resize(cur.ts.words.size()); - forests[i].ExtractFeatures(cur.ts, ffs); - } - if (flag) cerr << endl; - vector weights(FD::NumFeats(), 0.0); - Weights::InitFromFile(conf["q_weights"].as(), &weights); - MT19937 rng; - SparseVector model_exp; - SparseVector sampled_exp; - int samples = conf["samples"].as(); - for (int i = 0; i < corpus.size(); ++i) { - const int num_words = corpus[i].ts.words.size(); - forests[i].Reweight(weights); - forests[i].EdgeMarginals(); - model_exp.clear(); - for (int h = -1; h < num_words; ++h) { - for (int m = 0; m < num_words; ++m) { - if (h == m) continue; - const ArcFactoredForest::Edge& edge = forests[i](h,m); - const SparseVector& fmap = edge.features; - double prob = edge.edge_prob.as_float(); - model_exp += fmap * prob; - } - } - //cerr << "TRUE EXP: " << model_exp << endl; - - forests[i].Reweight(weights); - TreeSampler ts(forests[i]); - sampled_exp.clear(); - //ostringstream os; os << "Samples_" << samples; - //Timer t(os.str()); - for (int n = 0; n < samples; ++n) { - EdgeSubset tree; - ts.SampleRandomSpanningTree(&tree, &rng); - SparseVector feats; - tree.ExtractFeatures(corpus[i].ts, ffs, &feats); - sampled_exp += feats; - } - sampled_exp /= samples; - cerr << "L2 norm of diff @ " << samples << " samples: " << (model_exp - sampled_exp).l2norm() << endl; - } - return 0; -} - diff --git 
a/rst_parser/rst_test.cc b/rst_parser/rst_test.cc deleted file mode 100644 index 3bb95759..00000000 --- a/rst_parser/rst_test.cc +++ /dev/null @@ -1,48 +0,0 @@ -#include "arc_factored.h" - -#include - -#include - -using namespace std; - -int main(int argc, char** argv) { - // John saw Mary - // (H -> M) - // (1 -> 2) 20 - // (1 -> 3) 3 - // (2 -> 1) 20 - // (2 -> 3) 30 - // (3 -> 2) 0 - // (3 -> 1) 11 - // (0, 2) 10 - // (0, 1) 9 - // (0, 3) 9 - ArcFactoredForest af(3); - af(0,1).edge_prob.logeq(20); - af(0,2).edge_prob.logeq(3); - af(1,0).edge_prob.logeq(20); - af(1,2).edge_prob.logeq(30); - af(2,1).edge_prob.logeq(0); - af(2,0).edge_prob.logeq(11); - af(-1,1).edge_prob.logeq(10); - af(-1,0).edge_prob.logeq(9); - af(-1,2).edge_prob.logeq(9); - EdgeSubset tree; -// af.MaximumEdgeSubset(&tree); - prob_t z; - af.EdgeMarginals(&z); - cerr << "Z = " << abs(z) << endl; - af.PickBestParentForEachWord(&tree); - cerr << tree << endl; - typedef Eigen::Matrix M3; - M3 A = M3::Zero(); - A(0,0) = prob_t(1); - A(1,0) = prob_t(3); - A(0,1) = prob_t(2); - A(1,1) = prob_t(4); - prob_t det = A.determinant(); - cerr << det.as_float() << endl; - return 0; -} - diff --git a/rst_parser/rst_train.cc b/rst_parser/rst_train.cc new file mode 100644 index 00000000..16673cdc --- /dev/null +++ b/rst_parser/rst_train.cc @@ -0,0 +1,144 @@ +#include "arc_factored.h" + +#include +#include +#include +#include + +#include "timing_stats.h" +#include "arc_ff.h" +#include "dep_training.h" +#include "stringlib.h" +#include "filelib.h" +#include "tdict.h" +#include "weights.h" +#include "rst.h" +#include "global_ff.h" + +using namespace std; +namespace po = boost::program_options; + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { + po::options_description opts("Configuration options"); + string cfg_file; + opts.add_options() + ("training_data,t",po::value()->default_value("-"), "File containing training data (jsent format)") + ("q_weights,q",po::value(), "Arc-factored weights for 
proposal distribution") + ("samples,n",po::value()->default_value(1000), "Number of samples"); + po::options_description clo("Command line options"); + clo.add_options() + ("config,c", po::value(&cfg_file), "Configuration file") + ("help,?", "Print this help message and exit"); + + po::options_description dconfig_options, dcmdline_options; + dconfig_options.add(opts); + dcmdline_options.add(dconfig_options).add(clo); + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + if (cfg_file.size() > 0) { + ReadFile rf(cfg_file); + po::store(po::parse_config_file(*rf.stream(), dconfig_options), *conf); + } + if (conf->count("help")) { + cerr << dcmdline_options << endl; + exit(1); + } +} + +int main(int argc, char** argv) { + po::variables_map conf; + InitCommandLine(argc, argv, &conf); + vector qweights(FD::NumFeats(), 0.0); + Weights::InitFromFile(conf["q_weights"].as(), &qweights); + vector corpus; + ArcFeatureFunctions ffs; + GlobalFeatureFunctions gff; + TrainingInstance::ReadTraining(conf["training_data"].as(), &corpus); + vector forests(corpus.size()); + vector zs(corpus.size()); + SparseVector empirical; + bool flag = false; + for (int i = 0; i < corpus.size(); ++i) { + TrainingInstance& cur = corpus[i]; + if ((i+1) % 10 == 0) { cerr << '.' 
<< flush; flag = true; } + if ((i+1) % 400 == 0) { cerr << " [" << (i+1) << "]\n"; flag = false; } + SparseVector efmap; + ffs.PrepareForInput(cur.ts); + gff.PrepareForInput(cur.ts); + for (int j = 0; j < cur.tree.h_m_pairs.size(); ++j) { + efmap.clear(); + ffs.EdgeFeatures(cur.ts, cur.tree.h_m_pairs[j].first, + cur.tree.h_m_pairs[j].second, + &efmap); + cur.features += efmap; + } + for (int j = 0; j < cur.tree.roots.size(); ++j) { + efmap.clear(); + ffs.EdgeFeatures(cur.ts, -1, cur.tree.roots[j], &efmap); + cur.features += efmap; + } + efmap.clear(); + gff.Features(cur.ts, cur.tree, &efmap); + cur.features += efmap; + empirical += cur.features; + forests[i].resize(cur.ts.words.size()); + forests[i].ExtractFeatures(cur.ts, ffs); + forests[i].Reweight(qweights); + forests[i].EdgeMarginals(&zs[i]); + zs[i] = prob_t::One() / zs[i]; + // cerr << zs[i] << endl; + forests[i].Reweight(qweights); // EdgeMarginals overwrites edge_prob + } + if (flag) cerr << endl; + MT19937 rng; + SparseVector model_exp; + SparseVector weights; + Weights::InitSparseVector(qweights, &weights); + int samples = conf["samples"].as(); + for (int i = 0; i < corpus.size(); ++i) { +#if 0 + forests[i].EdgeMarginals(); + model_exp.clear(); + for (int h = -1; h < num_words; ++h) { + for (int m = 0; m < num_words; ++m) { + if (h == m) continue; + const ArcFactoredForest::Edge& edge = forests[i](h,m); + const SparseVector& fmap = edge.features; + double prob = edge.edge_prob.as_float(); + model_exp += fmap * prob; + } + } + cerr << "TRUE EXP: " << model_exp << endl; + forests[i].Reweight(weights); +#endif + + TreeSampler ts(forests[i]); + prob_t zhat = prob_t::Zero(); + SparseVector sampled_exp; + for (int n = 0; n < samples; ++n) { + EdgeSubset tree; + ts.SampleRandomSpanningTree(&tree, &rng); + SparseVector qfeats, gfeats; + tree.ExtractFeatures(corpus[i].ts, ffs, &qfeats); + prob_t u; u.logeq(qfeats.dot(qweights)); + const prob_t q = u / zs[i]; // proposal mass + gff.Features(corpus[i].ts, tree, 
&gfeats); + SparseVector tot_feats = qfeats + gfeats; + u.logeq(tot_feats.dot(weights)); + prob_t w = u / q; + zhat += w; + for (SparseVector::const_iterator it = tot_feats.begin(); it != tot_feats.end(); ++it) + sampled_exp.add_value(it->first, w * prob_t(it->second)); + } + sampled_exp /= zhat; + SparseVector tot_m; + for (SparseVector::const_iterator it = sampled_exp.begin(); it != sampled_exp.end(); ++it) + tot_m.add_value(it->first, it->second.as_float()); + //cerr << "DIFF: " << (tot_m - corpus[i].features) << endl; + const double eta = 0.03; + weights -= (tot_m - corpus[i].features) * eta; + } + cerr << "WEIGHTS.\n"; + cerr << weights << endl; + return 0; +} + -- cgit v1.2.3 From ba74af911877726a346e7f5f570fe20de5117253 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Mon, 16 Apr 2012 22:43:29 -0400 Subject: global features --- rst_parser/global_ff.cc | 43 +++++++++++++++++++++++++++++++++++++++++++ rst_parser/global_ff.h | 18 ++++++++++++++++++ 2 files changed, 61 insertions(+) create mode 100644 rst_parser/global_ff.cc create mode 100644 rst_parser/global_ff.h diff --git a/rst_parser/global_ff.cc b/rst_parser/global_ff.cc new file mode 100644 index 00000000..bb715f18 --- /dev/null +++ b/rst_parser/global_ff.cc @@ -0,0 +1,43 @@ +#include "global_ff.h" + +#include + +#include "tdict.h" + +using namespace std; + +struct GFFImpl { + void PrepareForInput(const TaggedSentence& sentence) { + } + void Features(const TaggedSentence& sentence, + const EdgeSubset& tree, + SparseVector* feats) const { + const vector& words = sentence.words; + const vector& tags = sentence.pos; + const vector >& hms = tree.h_m_pairs; + assert(words.size() == tags.size()); + vector mods(words.size()); + for (int i = 0; i < hms.size(); ++i) { + mods[hms[i].first]++; // first = head, second = modifier + } + for (int i = 0; i < mods.size(); ++i) { + ostringstream os; + os << "NM:" << TD::Convert(tags[i]) << "_" << mods[i]; + feats->add_value(FD::Convert(os.str()), 1.0); + } + } +}; + 
+GlobalFeatureFunctions::GlobalFeatureFunctions() {} +GlobalFeatureFunctions::~GlobalFeatureFunctions() { delete pimpl; } + +void GlobalFeatureFunctions::PrepareForInput(const TaggedSentence& sentence) { + pimpl->PrepareForInput(sentence); +} + +void GlobalFeatureFunctions::Features(const TaggedSentence& sentence, + const EdgeSubset& tree, + SparseVector* feats) const { + pimpl->Features(sentence, tree, feats); +} + diff --git a/rst_parser/global_ff.h b/rst_parser/global_ff.h new file mode 100644 index 00000000..d71d0fa1 --- /dev/null +++ b/rst_parser/global_ff.h @@ -0,0 +1,18 @@ +#ifndef _GLOBAL_FF_H_ +#define _GLOBAL_FF_H_ + +#include "arc_factored.h" + +struct GFFImpl; +struct GlobalFeatureFunctions { + GlobalFeatureFunctions(); + ~GlobalFeatureFunctions(); + void PrepareForInput(const TaggedSentence& sentence); + void Features(const TaggedSentence& sentence, + const EdgeSubset& tree, + SparseVector* feats) const; + private: + GFFImpl* pimpl; +}; + +#endif -- cgit v1.2.3 From ea830224a256aa85702f74d3a0073839d30b6747 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Tue, 17 Apr 2012 23:12:18 -0400 Subject: most features --- rst_parser/arc_ff.cc | 110 ++++++++++++++++++++++++++++++++++----------------- 1 file changed, 73 insertions(+), 37 deletions(-) diff --git a/rst_parser/arc_ff.cc b/rst_parser/arc_ff.cc index 10885716..fba36e24 100644 --- a/rst_parser/arc_ff.cc +++ b/rst_parser/arc_ff.cc @@ -7,11 +7,48 @@ using namespace std; struct ArcFFImpl { - ArcFFImpl() : kROOT("ROOT") {} + ArcFFImpl() : kROOT("ROOT"), kLEFT_POS("LEFT"), kRIGHT_POS("RIGHT") {} const string kROOT; + const string kLEFT_POS; + const string kRIGHT_POS; - void PrepareForInput(const TaggedSentence& sentence) { - (void) sentence; + void PrepareForInput(const TaggedSentence& sent) { + (void) sent; + } + + template + static void Fire(SparseVector* v, const A& a) { + ostringstream os; + os << a; + v->set_value(FD::Convert(os.str()), 1); + } + + template + static void Fire(SparseVector* v, const A& a, 
const B& b) { + ostringstream os; + os << a << '_' << b; + v->set_value(FD::Convert(os.str()), 1); + } + + template + static void Fire(SparseVector* v, const A& a, const B& b, const C& c) { + ostringstream os; + os << a << '_' << b << '_' << c; + v->set_value(FD::Convert(os.str()), 1); + } + + template + static void Fire(SparseVector* v, const A& a, const B& b, const C& c, const D& d) { + ostringstream os; + os << a << '_' << b << '_' << c << '_' << d; + v->set_value(FD::Convert(os.str()), 1); + } + + template + static void Fire(SparseVector* v, const A& a, const B& b, const C& c, const D& d, const E& e) { + ostringstream os; + os << a << '_' << b << '_' << c << '_' << d << '_' << e; + v->set_value(FD::Convert(os.str()), 1); } void EdgeFeatures(const TaggedSentence& sent, @@ -19,10 +56,13 @@ struct ArcFFImpl { short m, SparseVector* features) const { const bool is_root = (h == -1); + int num_words = sent.words.size(); const string& head_word = (is_root ? kROOT : TD::Convert(sent.words[h])); const string& head_pos = (is_root ? kROOT : TD::Convert(sent.pos[h])); const string& mod_word = TD::Convert(sent.words[m]); const string& mod_pos = TD::Convert(sent.pos[m]); + const string& mod_pos_L = (m > 0 ? TD::Convert(sent.pos[m-1]) : kLEFT_POS); + const string& mod_pos_R = (m < sent.pos.size() - 1 ? 
TD::Convert(sent.pos[m]) : kRIGHT_POS); const bool dir = m < h; int v = m - h; if (v < 0) { @@ -30,42 +70,38 @@ struct ArcFFImpl { } else { v= int(log(v) / log(2)); } - static map lenmap; - int& lenfid = lenmap[v]; - if (!lenfid) { - ostringstream os; - if (v < 0) os << "LenL" << -v; else os << "LenR" << v; - lenfid = FD::Convert(os.str()); - } - features->set_value(lenfid, 1.0); - const string& lenstr = FD::Convert(lenfid); - if (!is_root) { - static int modl = FD::Convert("ModLeft"); - static int modr = FD::Convert("ModRight"); - if (dir) features->set_value(modl, 1); - else features->set_value(modr, 1); - } + ostringstream os; + if (v < 0) os << "LenL" << -v; else os << "LenR" << v; + const string lenstr = os.str(); if (is_root) { - ostringstream os; - os << "ROOT:" << mod_pos; - features->set_value(FD::Convert(os.str()), 1.0); - os << "_" << lenstr; - features->set_value(FD::Convert(os.str()), 1.0); + Fire(features, "ROOT", mod_pos); + Fire(features, "DROOT", mod_pos, lenstr); + Fire(features, "LROOT", mod_pos_L); + Fire(features, "RROOT", mod_pos_R); + Fire(features, "LROOT", mod_pos_L, mod_pos); + Fire(features, "RROOT", mod_pos_R, mod_pos); + Fire(features, "LDist", m); + Fire(features, "RDist", m - num_words); } else { // not root - ostringstream os; - os << "HM:" << head_pos << '_' << mod_pos; - features->set_value(FD::Convert(os.str()), 1.0); - os << '_' << dir; - features->set_value(FD::Convert(os.str()), 1.0); - os << '_' << lenstr; - features->set_value(FD::Convert(os.str()), 1.0); - ostringstream os2; - os2 << "LexHM:" << head_word << '_' << mod_word; - features->set_value(FD::Convert(os2.str()), 1.0); - os2 << '_' << dir; - features->set_value(FD::Convert(os2.str()), 1.0); - os2 << '_' << lenstr; - features->set_value(FD::Convert(os2.str()), 1.0); + const string& head_pos_L = (h > 0 ? TD::Convert(sent.pos[h-1]) : kLEFT_POS); + const string& head_pos_R = (h < sent.pos.size() - 1 ? 
TD::Convert(sent.pos[h]) : kRIGHT_POS); + Fire(features, lenstr); + Fire(features, "H", head_pos); + Fire(features, "M", mod_pos); + Fire(features, "HM", head_pos, mod_pos); + Fire(features, "HM", head_pos, mod_pos, dir); + Fire(features, "HM", head_pos, mod_pos, lenstr); + Fire(features, "LexH", head_word); + Fire(features, "LexM", mod_word); + Fire(features, "LexHM", head_word, mod_word); + Fire(features, "LexHM", head_word, mod_word, dir); + Fire(features, "LexHM", head_word, mod_word, lenstr); + // surrounders + Fire(features, "posLL", head_pos, mod_pos, head_pos_L, mod_pos_L); + Fire(features, "posRR", head_pos, mod_pos, head_pos_R, mod_pos_R); + Fire(features, "posLR", head_pos, mod_pos, head_pos_L, mod_pos_R); + Fire(features, "posRL", head_pos, mod_pos, head_pos_R, mod_pos_L); + Fire(features, "lexRL", head_word, head_pos_L, mod_pos_L); } } }; -- cgit v1.2.3 From 390bfd5a0bf1701b6274acbc8f335a4b6c68a52b Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Wed, 18 Apr 2012 01:39:20 -0400 Subject: last of the mcdonald 05 features --- rst_parser/arc_ff.cc | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/rst_parser/arc_ff.cc b/rst_parser/arc_ff.cc index fba36e24..2992cde2 100644 --- a/rst_parser/arc_ff.cc +++ b/rst_parser/arc_ff.cc @@ -11,9 +11,21 @@ struct ArcFFImpl { const string kROOT; const string kLEFT_POS; const string kRIGHT_POS; + map > pcs; void PrepareForInput(const TaggedSentence& sent) { - (void) sent; + pcs.clear(); + for (int i = 0; i < sent.pos.size(); ++i) + pcs[sent.pos[i]].resize(1, 0); + pcs[sent.pos[0]][0] = 1; + for (int i = 1; i < sent.pos.size(); ++i) { + const WordID posi = sent.pos[i]; + for (map >::iterator j = pcs.begin(); j != pcs.end(); ++j) { + const WordID posj = j->first; + vector& cs = j->second; + cs.push_back(cs.back() + (posj == posi ? 
1 : 0)); + } + } } template @@ -102,6 +114,17 @@ struct ArcFFImpl { Fire(features, "posLR", head_pos, mod_pos, head_pos_L, mod_pos_R); Fire(features, "posRL", head_pos, mod_pos, head_pos_R, mod_pos_L); Fire(features, "lexRL", head_word, head_pos_L, mod_pos_L); + + // between features + int left = min(h,m); + int right = max(h,m); + if (right - left > 2) { + ++left; + for (map >::const_iterator it = pcs.begin(); it != pcs.end(); ++it) { + if (it->second[left] != it->second[right]) + Fire(features, "BT", head_pos, TD::Convert(it->first), mod_pos); + } + } } } }; -- cgit v1.2.3 From 206a30b912b2c425379ef23dbe0d8a61d3dea913 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Wed, 18 Apr 2012 03:21:39 -0400 Subject: arc ff fix --- rst_parser/arc_ff.cc | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/rst_parser/arc_ff.cc b/rst_parser/arc_ff.cc index 2992cde2..b3560b73 100644 --- a/rst_parser/arc_ff.cc +++ b/rst_parser/arc_ff.cc @@ -38,28 +38,28 @@ struct ArcFFImpl { template static void Fire(SparseVector* v, const A& a, const B& b) { ostringstream os; - os << a << '_' << b; + os << a << ':' << b; v->set_value(FD::Convert(os.str()), 1); } template static void Fire(SparseVector* v, const A& a, const B& b, const C& c) { ostringstream os; - os << a << '_' << b << '_' << c; + os << a << ':' << b << '_' << c; v->set_value(FD::Convert(os.str()), 1); } template static void Fire(SparseVector* v, const A& a, const B& b, const C& c, const D& d) { ostringstream os; - os << a << '_' << b << '_' << c << '_' << d; + os << a << ':' << b << '_' << c << '_' << d; v->set_value(FD::Convert(os.str()), 1); } template static void Fire(SparseVector* v, const A& a, const B& b, const C& c, const D& d, const E& e) { ostringstream os; - os << a << '_' << b << '_' << c << '_' << d << '_' << e; + os << a << ':' << b << '_' << c << '_' << d << '_' << e; v->set_value(FD::Convert(os.str()), 1); } @@ -75,7 +75,7 @@ struct ArcFFImpl { const string& mod_pos = 
TD::Convert(sent.pos[m]); const string& mod_pos_L = (m > 0 ? TD::Convert(sent.pos[m-1]) : kLEFT_POS); const string& mod_pos_R = (m < sent.pos.size() - 1 ? TD::Convert(sent.pos[m]) : kRIGHT_POS); - const bool dir = m < h; + const string dir = (m < h ? "MLeft" : "MRight"); int v = m - h; if (v < 0) { v= -1 - int(log(-v) / log(2)); @@ -118,11 +118,13 @@ struct ArcFFImpl { // between features int left = min(h,m); int right = max(h,m); - if (right - left > 2) { - ++left; + if (right - left >= 2) { + --right; for (map >::const_iterator it = pcs.begin(); it != pcs.end(); ++it) { - if (it->second[left] != it->second[right]) + if (it->second[left] != it->second[right]) { Fire(features, "BT", head_pos, TD::Convert(it->first), mod_pos); + Fire(features, "BT", head_pos, TD::Convert(it->first), mod_pos, dir); + } } } } -- cgit v1.2.3 From 467175c2f4aaf35720c25220dd45f8b8f2221682 Mon Sep 17 00:00:00 2001 From: Guest_account Guest_account prguest11 Date: Wed, 18 Apr 2012 18:08:10 +0100 Subject: fix mem alloc --- rst_parser/global_ff.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rst_parser/global_ff.cc b/rst_parser/global_ff.cc index bb715f18..a7db977d 100644 --- a/rst_parser/global_ff.cc +++ b/rst_parser/global_ff.cc @@ -28,7 +28,7 @@ struct GFFImpl { } }; -GlobalFeatureFunctions::GlobalFeatureFunctions() {} +GlobalFeatureFunctions::GlobalFeatureFunctions() : pimpl(new GFFImpl) {} GlobalFeatureFunctions::~GlobalFeatureFunctions() { delete pimpl; } void GlobalFeatureFunctions::PrepareForInput(const TaggedSentence& sentence) { -- cgit v1.2.3 From a4e62df53b4bbac77cb967997dae41fb8dc674d7 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Wed, 18 Apr 2012 22:34:57 -0400 Subject: damn off by one error --- rst_parser/arc_ff.cc | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/rst_parser/arc_ff.cc b/rst_parser/arc_ff.cc index b3560b73..673648d1 100644 --- a/rst_parser/arc_ff.cc +++ b/rst_parser/arc_ff.cc @@ -75,7 +75,8 @@ struct ArcFFImpl { const 
string& mod_pos = TD::Convert(sent.pos[m]); const string& mod_pos_L = (m > 0 ? TD::Convert(sent.pos[m-1]) : kLEFT_POS); const string& mod_pos_R = (m < sent.pos.size() - 1 ? TD::Convert(sent.pos[m]) : kRIGHT_POS); - const string dir = (m < h ? "MLeft" : "MRight"); + const bool bdir = m < h; + const string dir = (bdir ? "MLeft" : "MRight"); int v = m - h; if (v < 0) { v= -1 - int(log(-v) / log(2)); @@ -119,7 +120,7 @@ struct ArcFFImpl { int left = min(h,m); int right = max(h,m); if (right - left >= 2) { - --right; + if (bdir) --right; else ++left; for (map >::const_iterator it = pcs.begin(); it != pcs.end(); ++it) { if (it->second[left] != it->second[right]) { Fire(features, "BT", head_pos, TD::Convert(it->first), mod_pos); -- cgit v1.2.3 From f4570f262c10534b335568e1d69fb3a8dfbf38ed Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Wed, 18 Apr 2012 22:37:08 -0400 Subject: refactor --- rst_parser/dep_training.cc | 70 ++++++++++++++++++++++++++++------------------ rst_parser/dep_training.h | 4 ++- rst_parser/mst_train.cc | 2 +- rst_parser/rst_train.cc | 2 +- 4 files changed, 48 insertions(+), 30 deletions(-) diff --git a/rst_parser/dep_training.cc b/rst_parser/dep_training.cc index de431ebc..e26505ec 100644 --- a/rst_parser/dep_training.cc +++ b/rst_parser/dep_training.cc @@ -10,11 +10,51 @@ using namespace std; -void TrainingInstance::ReadTraining(const string& fname, vector* corpus, int rank, int size) { +static void ParseInstance(const string& line, int start, TrainingInstance* out, int lc = 0) { + picojson::value obj; + string err; + picojson::parse(obj, line.begin() + start, line.end(), &err); + if (err.size() > 0) { cerr << "JSON parse error in " << lc << ": " << err << endl; abort(); } + TrainingInstance& cur = *out; + TaggedSentence& ts = cur.ts; + EdgeSubset& tree = cur.tree; + assert(obj.is()); + const picojson::object& d = obj.get(); + const picojson::array& ta = d.find("tokens")->second.get(); + for (unsigned i = 0; i < ta.size(); ++i) { + 
ts.words.push_back(TD::Convert(ta[i].get()[0].get())); + ts.pos.push_back(TD::Convert(ta[i].get()[1].get())); + } + if (d.find("deps") != d.end()) { + const picojson::array& da = d.find("deps")->second.get(); + for (unsigned i = 0; i < da.size(); ++i) { + const picojson::array& thm = da[i].get(); + // get dep type here + short h = thm[2].get(); + short m = thm[1].get(); + if (h < 0) + tree.roots.push_back(m); + else + tree.h_m_pairs.push_back(make_pair(h,m)); + } + } + //cerr << TD::GetString(ts.words) << endl << TD::GetString(ts.pos) << endl << tree << endl; +} + +bool TrainingInstance::ReadInstance(std::istream* in, TrainingInstance* instance) { + string line; + if (!getline(*in, line)) return false; + size_t pos = line.rfind('\t'); + assert(pos != string::npos); + static int lc = 0; ++lc; + ParseInstance(line, pos + 1, instance, lc); + return true; +} + +void TrainingInstance::ReadTrainingCorpus(const string& fname, vector* corpus, int rank, int size) { ReadFile rf(fname); istream& in = *rf.stream(); string line; - string err; int lc = 0; bool flag = false; while(getline(in, line)) { @@ -24,32 +64,8 @@ void TrainingInstance::ReadTraining(const string& fname, vector 0) { cerr << "JSON parse error in " << lc << ": " << err << endl; abort(); } corpus->push_back(TrainingInstance()); - TrainingInstance& cur = corpus->back(); - TaggedSentence& ts = cur.ts; - EdgeSubset& tree = cur.tree; - assert(obj.is()); - const picojson::object& d = obj.get(); - const picojson::array& ta = d.find("tokens")->second.get(); - for (unsigned i = 0; i < ta.size(); ++i) { - ts.words.push_back(TD::Convert(ta[i].get()[0].get())); - ts.pos.push_back(TD::Convert(ta[i].get()[1].get())); - } - const picojson::array& da = d.find("deps")->second.get(); - for (unsigned i = 0; i < da.size(); ++i) { - const picojson::array& thm = da[i].get(); - // get dep type here - short h = thm[2].get(); - short m = thm[1].get(); - if (h < 0) - tree.roots.push_back(m); - else - 
tree.h_m_pairs.push_back(make_pair(h,m)); - } - //cerr << TD::GetString(ts.words) << endl << TD::GetString(ts.pos) << endl << tree << endl; + ParseInstance(line, pos + 1, &corpus->back(), lc); } if (flag) cerr << "\nRead " << lc << " training instances\n"; } diff --git a/rst_parser/dep_training.h b/rst_parser/dep_training.h index 73ffd298..3eeee22e 100644 --- a/rst_parser/dep_training.h +++ b/rst_parser/dep_training.h @@ -1,6 +1,7 @@ #ifndef _DEP_TRAINING_H_ #define _DEP_TRAINING_H_ +#include #include #include #include "arc_factored.h" @@ -11,7 +12,8 @@ struct TrainingInstance { EdgeSubset tree; SparseVector features; // reads a "Jsent" formatted dependency file - static void ReadTraining(const std::string& fname, std::vector* corpus, int rank = 0, int size = 1); + static bool ReadInstance(std::istream* in, TrainingInstance* instance); // returns false at EOF + static void ReadTrainingCorpus(const std::string& fname, std::vector* corpus, int rank = 0, int size = 1); }; #endif diff --git a/rst_parser/mst_train.cc b/rst_parser/mst_train.cc index 0709e7c9..e414f450 100644 --- a/rst_parser/mst_train.cc +++ b/rst_parser/mst_train.cc @@ -74,7 +74,7 @@ int main(int argc, char** argv) { InitCommandLine(argc, argv, &conf); ArcFeatureFunctions ffs; vector corpus; - TrainingInstance::ReadTraining(conf["training_data"].as(), &corpus, rank, size); + TrainingInstance::ReadTrainingCorpus(conf["training_data"].as(), &corpus, rank, size); vector forests(corpus.size()); SparseVector empirical; bool flag = false; diff --git a/rst_parser/rst_train.cc b/rst_parser/rst_train.cc index 16673cdc..9b730f3d 100644 --- a/rst_parser/rst_train.cc +++ b/rst_parser/rst_train.cc @@ -52,7 +52,7 @@ int main(int argc, char** argv) { vector corpus; ArcFeatureFunctions ffs; GlobalFeatureFunctions gff; - TrainingInstance::ReadTraining(conf["training_data"].as(), &corpus); + TrainingInstance::ReadTrainingCorpus(conf["training_data"].as(), &corpus); vector forests(corpus.size()); vector zs(corpus.size()); 
SparseVector empirical; -- cgit v1.2.3 From d016f7f28510f822b89c921da38006eae3877872 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Thu, 19 Apr 2012 02:45:27 -0400 Subject: compute f --- rst_parser/Makefile.am | 5 +- rst_parser/dep_training.cc | 4 ++ rst_parser/rst_parse.cc | 111 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 119 insertions(+), 1 deletion(-) create mode 100644 rst_parser/rst_parse.cc diff --git a/rst_parser/Makefile.am b/rst_parser/Makefile.am index 876c2237..4977f584 100644 --- a/rst_parser/Makefile.am +++ b/rst_parser/Makefile.am @@ -1,5 +1,5 @@ bin_PROGRAMS = \ - mst_train rst_train + mst_train rst_train rst_parse noinst_LIBRARIES = librst.a @@ -11,4 +11,7 @@ mst_train_LDADD = librst.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/ rst_train_SOURCES = rst_train.cc rst_train_LDADD = librst.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz +rst_parse_SOURCES = rst_parse.cc +rst_parse_LDADD = librst.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz + AM_CPPFLAGS = -W -Wall -Wno-sign-compare $(GTEST_CPPFLAGS) -I$(top_srcdir)/decoder -I$(top_srcdir)/training -I$(top_srcdir)/utils -I$(top_srcdir)/mteval -I../klm diff --git a/rst_parser/dep_training.cc b/rst_parser/dep_training.cc index e26505ec..ef97798b 100644 --- a/rst_parser/dep_training.cc +++ b/rst_parser/dep_training.cc @@ -18,6 +18,10 @@ static void ParseInstance(const string& line, int start, TrainingInstance* out, TrainingInstance& cur = *out; TaggedSentence& ts = cur.ts; EdgeSubset& tree = cur.tree; + ts.pos.clear(); + ts.words.clear(); + tree.roots.clear(); + tree.h_m_pairs.clear(); assert(obj.is()); const picojson::object& d = obj.get(); const picojson::array& ta = d.find("tokens")->second.get(); diff --git a/rst_parser/rst_parse.cc b/rst_parser/rst_parse.cc new file mode 
100644 index 00000000..9c42a8f4 --- /dev/null +++ b/rst_parser/rst_parse.cc @@ -0,0 +1,111 @@ +#include "arc_factored.h" + +#include +#include +#include +#include + +#include "timing_stats.h" +#include "arc_ff.h" +#include "dep_training.h" +#include "stringlib.h" +#include "filelib.h" +#include "tdict.h" +#include "weights.h" +#include "rst.h" +#include "global_ff.h" + +using namespace std; +namespace po = boost::program_options; + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { + po::options_description opts("Configuration options"); + string cfg_file; + opts.add_options() + ("input,i",po::value()->default_value("-"), "File containing test data (jsent format)") + ("q_weights,q",po::value(), "Arc-factored weights for proposal distribution (mandatory)") + ("p_weights,p",po::value(), "Weights for target distribution (optional)") + ("samples,n",po::value()->default_value(1000), "Number of samples"); + po::options_description clo("Command line options"); + clo.add_options() + ("config,c", po::value(&cfg_file), "Configuration file") + ("help,?", "Print this help message and exit"); + + po::options_description dconfig_options, dcmdline_options; + dconfig_options.add(opts); + dcmdline_options.add(dconfig_options).add(clo); + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + if (cfg_file.size() > 0) { + ReadFile rf(cfg_file); + po::store(po::parse_config_file(*rf.stream(), dconfig_options), *conf); + } + if (conf->count("help") || conf->count("q_weights") == 0) { + cerr << dcmdline_options << endl; + exit(1); + } +} + +int main(int argc, char** argv) { + po::variables_map conf; + InitCommandLine(argc, argv, &conf); + vector qweights, pweights; + Weights::InitFromFile(conf["q_weights"].as(), &qweights); + if (conf.count("p_weights")) + Weights::InitFromFile(conf["p_weights"].as(), &pweights); + const bool global = pweights.size() > 0; + ArcFeatureFunctions ffs; + GlobalFeatureFunctions gff; + ReadFile rf(conf["input"].as()); + 
istream* in = rf.stream(); + TrainingInstance sent; + MT19937 rng; + int samples = conf["samples"].as(); + int totroot = 0, root_right = 0, tot = 0, cor = 0; + while(TrainingInstance::ReadInstance(in, &sent)) { + ffs.PrepareForInput(sent.ts); + if (global) gff.PrepareForInput(sent.ts); + ArcFactoredForest forest(sent.ts.pos.size()); + forest.ExtractFeatures(sent.ts, ffs); + forest.Reweight(qweights); + TreeSampler ts(forest); + double best_score = -numeric_limits::infinity(); + EdgeSubset best_tree; + for (int n = 0; n < samples; ++n) { + EdgeSubset tree; + ts.SampleRandomSpanningTree(&tree, &rng); + SparseVector qfeats, gfeats; + tree.ExtractFeatures(sent.ts, ffs, &qfeats); + double score = 0; + if (global) { + gff.Features(sent.ts, tree, &gfeats); + score = (qfeats + gfeats).dot(pweights); + } else { + score = qfeats.dot(qweights); + } + if (score > best_score) { + best_tree = tree; + best_score = score; + } + } + cerr << "BEST SCORE: " << best_score << endl; + cout << best_tree << endl; + const bool sent_has_ref = sent.tree.h_m_pairs.size() > 0; + if (sent_has_ref) { + map, bool> ref; + for (int i = 0; i < sent.tree.h_m_pairs.size(); ++i) + ref[sent.tree.h_m_pairs[i]] = true; + int ref_root = sent.tree.roots.front(); + if (ref_root == best_tree.roots.front()) { ++root_right; } + ++totroot; + for (int i = 0; i < best_tree.h_m_pairs.size(); ++i) { + if (ref[best_tree.h_m_pairs[i]]) { + ++cor; + } + ++tot; + } + } + } + cerr << "F = " << (double(cor + root_right) / (tot + totroot)) << endl; + return 0; +} + -- cgit v1.2.3 From cf693c4334304360b5e718bf1e8485e59fc38a54 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Thu, 19 Apr 2012 11:34:16 -0400 Subject: fix typo --- gi/pf/base_distributions.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gi/pf/base_distributions.h b/gi/pf/base_distributions.h index 84dacdf2..41b513f8 100644 --- a/gi/pf/base_distributions.h +++ b/gi/pf/base_distributions.h @@ -89,7 +89,7 @@ struct TableLookupBase { 
TableLookupBase(const std::string& fname); prob_t operator()(const TRule& rule) const { - const std::tr1::unordered_map::const_iterator it = table.find(rule); + const std::tr1::unordered_map::const_iterator it = table.find(rule); if (it == table.end()) { std::cerr << rule << " not found\n"; abort(); -- cgit v1.2.3 From 7453227890a0e5239891ad602b36eb84a24a12f7 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Thu, 19 Apr 2012 13:30:23 -0400 Subject: fix for marginals --- rst_parser/arc_factored_marginals.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rst_parser/arc_factored_marginals.cc b/rst_parser/arc_factored_marginals.cc index acb8102a..3e8c9f86 100644 --- a/rst_parser/arc_factored_marginals.cc +++ b/rst_parser/arc_factored_marginals.cc @@ -49,7 +49,7 @@ void ArcFactoredForest::EdgeMarginals(prob_t *plog_z) { #else -void ArcFactoredForest::EdgeMarginals(double*) { +void ArcFactoredForest::EdgeMarginals(prob_t *) { cerr << "EdgeMarginals() requires --with-eigen!\n"; abort(); } -- cgit v1.2.3 From 1965a61937b35cac2cc8d93e5db1d43fb17369e1 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Thu, 19 Apr 2012 14:44:27 -0400 Subject: fix includes --- rst_parser/arc_ff.cc | 3 +++ 1 file changed, 3 insertions(+) diff --git a/rst_parser/arc_ff.cc b/rst_parser/arc_ff.cc index 673648d1..b2a87a7d 100644 --- a/rst_parser/arc_ff.cc +++ b/rst_parser/arc_ff.cc @@ -1,5 +1,8 @@ #include "arc_ff.h" +#include +#include + #include "tdict.h" #include "fdict.h" #include "sentence_metadata.h" -- cgit v1.2.3 From 4dfe96758c5190d2550434d3a0857853c4ef2612 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Thu, 19 Apr 2012 14:45:27 -0400 Subject: fix includes --- rst_parser/global_ff.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/rst_parser/global_ff.cc b/rst_parser/global_ff.cc index a7db977d..ae410875 100644 --- a/rst_parser/global_ff.cc +++ b/rst_parser/global_ff.cc @@ -1,6 +1,7 @@ #include "global_ff.h" #include +#include #include "tdict.h" -- cgit v1.2.3 From 
1e206220aa506ac0e8eabcfe0cbd0ab851dee262 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Fri, 20 Apr 2012 19:39:17 +0100 Subject: parallel gradient computation --- rst_parser/mst_train.cc | 98 +++++++++++++++++++++++++++++++++++++------------ 1 file changed, 74 insertions(+), 24 deletions(-) diff --git a/rst_parser/mst_train.cc b/rst_parser/mst_train.cc index e414f450..b3711aba 100644 --- a/rst_parser/mst_train.cc +++ b/rst_parser/mst_train.cc @@ -4,6 +4,10 @@ #include #include #include +// #define HAVE_THREAD 1 +#if HAVE_THREAD +#include +#endif #include "arc_ff.h" #include "stringlib.h" @@ -24,6 +28,9 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) { ("weights,w",po::value(), "Optional starting weights") ("output_every_i_iterations,I",po::value()->default_value(1), "Write weights every I iterations") ("regularization_strength,C",po::value()->default_value(1.0), "Regularization strength") +#if HAVE_THREAD + ("threads,T",po::value()->default_value(1), "Number of threads") +#endif ("correction_buffers,m", po::value()->default_value(10), "LBFGS correction buffers"); po::options_description clo("Command line options"); clo.add_options() @@ -67,6 +74,46 @@ double ApplyRegularizationTerms(const double C, return reg; } +struct GradientWorker { + GradientWorker(int f, + int t, + vector* w, + vector* c, + vector* fs) : obj(), weights(*w), from(f), to(t), corpus(*c), forests(*fs), g(w->size()) {} + void operator()() { + int every = (to - from) / 20; + if (!every) every++; + for (int i = from; i < to; ++i) { + if ((from == 0) && (i + 1) % every == 0) cerr << '.' 
<< flush; + const int num_words = corpus[i].ts.words.size(); + forests[i].Reweight(weights); + prob_t z; + forests[i].EdgeMarginals(&z); + obj -= log(z); + //cerr << " O = " << (-corpus[i].features.dot(weights)) << " D=" << -lz << " OO= " << (-corpus[i].features.dot(weights) - lz) << endl; + //cerr << " ZZ = " << zz << endl; + for (int h = -1; h < num_words; ++h) { + for (int m = 0; m < num_words; ++m) { + if (h == m) continue; + const ArcFactoredForest::Edge& edge = forests[i](h,m); + const SparseVector& fmap = edge.features; + double prob = edge.edge_prob.as_float(); + if (prob < -0.000001) { cerr << "Prob < 0: " << prob << endl; prob = 0; } + if (prob > 1.000001) { cerr << "Prob > 1: " << prob << endl; prob = 1; } + AddFeatures(prob, fmap, &g); + //mfm += fmap * prob; // DE + } + } + } + } + double obj; + vector& weights; + const int from, to; + vector& corpus; + vector& forests; + vector g; // local gradient +}; + int main(int argc, char** argv) { int rank = 0; int size = 1; @@ -108,8 +155,13 @@ int main(int argc, char** argv) { vector g(FD::NumFeats(), 0.0); cerr << "features initialized\noptimizing...\n"; boost::shared_ptr o; - int every = corpus.size() / 20; - if (!every) ++every; +#if HAVE_THREAD + unsigned threads = conf["threads"].as(); + if (threads > corpus.size()) threads = corpus.size(); +#else + const unsigned threads = 1; +#endif + int chunk = corpus.size() / threads; o.reset(new LBFGSOptimizer(g.size(), conf["correction_buffers"].as())); int iterations = 1000; for (int iter = 0; iter < iterations; ++iter) { @@ -118,29 +170,27 @@ int main(int argc, char** argv) { for (SparseVector::const_iterator it = empirical.begin(); it != empirical.end(); ++it) g[it->first] = -it->second; double obj = -empirical.dot(weights); - // SparseVector mfm; //DE - for (int i = 0; i < corpus.size(); ++i) { - if ((i + 1) % every == 0) cerr << '.' 
<< flush; - const int num_words = corpus[i].ts.words.size(); - forests[i].Reweight(weights); - prob_t z; - forests[i].EdgeMarginals(&z); - obj -= log(z); - //cerr << " O = " << (-corpus[i].features.dot(weights)) << " D=" << -lz << " OO= " << (-corpus[i].features.dot(weights) - lz) << endl; - //cerr << " ZZ = " << zz << endl; - for (int h = -1; h < num_words; ++h) { - for (int m = 0; m < num_words; ++m) { - if (h == m) continue; - const ArcFactoredForest::Edge& edge = forests[i](h,m); - const SparseVector& fmap = edge.features; - double prob = edge.edge_prob.as_float(); - if (prob < -0.000001) { cerr << "Prob < 0: " << prob << endl; prob = 0; } - if (prob > 1.000001) { cerr << "Prob > 1: " << prob << endl; prob = 1; } - AddFeatures(prob, fmap, &g); - //mfm += fmap * prob; // DE - } - } + vector > jobs; + for (int from = 0; from < corpus.size(); from += chunk) { + int to = from + chunk; + if (to > corpus.size()) to = corpus.size(); + jobs.push_back(boost::shared_ptr(new GradientWorker(from, to, &weights, &corpus, &forests))); } +#if HAVE_THREAD + boost::thread_group tg; + for (int i = 0; i < threads; ++i) + tg.create_thread(boost::ref(*jobs[i])); + tg.join_all(); +#else + (*jobs[0])(); +#endif + for (int i = 0; i < threads; ++i) { + obj += jobs[i]->obj; + vector& tg = jobs[i]->g; + for (unsigned j = 0; j < g.size(); ++j) + g[j] += tg[j]; + } + // SparseVector mfm; //DE //cerr << endl << "E: " << empirical << endl; // DE //cerr << "M: " << mfm << endl; // DE double r = ApplyRegularizationTerms(conf["regularization_strength"].as(), weights, &g); -- cgit v1.2.3 From 06718177056fe5262262e00d98dc89f67cefb193 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Sat, 21 Apr 2012 13:33:33 -0400 Subject: full feature set --- rst_parser/arc_ff.cc | 80 ++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 56 insertions(+), 24 deletions(-) diff --git a/rst_parser/arc_ff.cc b/rst_parser/arc_ff.cc index b2a87a7d..c4e5aa17 100644 --- a/rst_parser/arc_ff.cc +++ 
b/rst_parser/arc_ff.cc @@ -3,6 +3,7 @@ #include #include +#include "stringlib.h" #include "tdict.h" #include "fdict.h" #include "sentence_metadata.h" @@ -66,15 +67,30 @@ struct ArcFFImpl { v->set_value(FD::Convert(os.str()), 1); } + static void AddConjoin(const SparseVector& v, const string& feat, SparseVector* pf) { + for (SparseVector::const_iterator it = v.begin(); it != v.end(); ++it) + pf->set_value(FD::Convert(FD::Convert(it->first) + "_" + feat), it->second); + } + + static inline string Fixup(const string& str) { + string res = LowercaseString(str); + if (res.size() < 6) return res; + return res.substr(0, 5) + "*"; + } + + static inline string Suffix(const string& str) { + if (str.size() < 4) return ""; else return str.substr(str.size() - 3); + } + void EdgeFeatures(const TaggedSentence& sent, short h, short m, SparseVector* features) const { const bool is_root = (h == -1); + const string head_word = (is_root ? kROOT : Fixup(TD::Convert(sent.words[h]))); int num_words = sent.words.size(); - const string& head_word = (is_root ? kROOT : TD::Convert(sent.words[h])); const string& head_pos = (is_root ? kROOT : TD::Convert(sent.pos[h])); - const string& mod_word = TD::Convert(sent.words[m]); + const string mod_word = Fixup(TD::Convert(sent.words[m])); const string& mod_pos = TD::Convert(sent.pos[m]); const string& mod_pos_L = (m > 0 ? TD::Convert(sent.pos[m-1]) : kLEFT_POS); const string& mod_pos_R = (m < sent.pos.size() - 1 ? TD::Convert(sent.pos[m]) : kRIGHT_POS); @@ -82,42 +98,41 @@ struct ArcFFImpl { const string dir = (bdir ? 
"MLeft" : "MRight"); int v = m - h; if (v < 0) { - v= -1 - int(log(-v) / log(2)); + v= -1 - int(log(-v) / log(1.6)); } else { - v= int(log(v) / log(2)); + v= int(log(v) / log(1.6)) + 1; } ostringstream os; if (v < 0) os << "LenL" << -v; else os << "LenR" << v; const string lenstr = os.str(); + Fire(features, dir); + Fire(features, lenstr); + // dir, lenstr if (is_root) { - Fire(features, "ROOT", mod_pos); + Fire(features, "wROOT", mod_word); + Fire(features, "pROOT", mod_pos); + Fire(features, "wpROOT", mod_word, mod_pos); Fire(features, "DROOT", mod_pos, lenstr); Fire(features, "LROOT", mod_pos_L); Fire(features, "RROOT", mod_pos_R); Fire(features, "LROOT", mod_pos_L, mod_pos); Fire(features, "RROOT", mod_pos_R, mod_pos); Fire(features, "LDist", m); - Fire(features, "RDist", m - num_words); + Fire(features, "RDist", num_words - m); } else { // not root const string& head_pos_L = (h > 0 ? TD::Convert(sent.pos[h-1]) : kLEFT_POS); const string& head_pos_R = (h < sent.pos.size() - 1 ? TD::Convert(sent.pos[h]) : kRIGHT_POS); - Fire(features, lenstr); - Fire(features, "H", head_pos); - Fire(features, "M", mod_pos); - Fire(features, "HM", head_pos, mod_pos); - Fire(features, "HM", head_pos, mod_pos, dir); - Fire(features, "HM", head_pos, mod_pos, lenstr); - Fire(features, "LexH", head_word); - Fire(features, "LexM", mod_word); - Fire(features, "LexHM", head_word, mod_word); - Fire(features, "LexHM", head_word, mod_word, dir); - Fire(features, "LexHM", head_word, mod_word, lenstr); + SparseVector fv; + SparseVector* f = &fv; + Fire(f, "H", head_pos); + Fire(f, "M", mod_pos); + Fire(f, "HM", head_pos, mod_pos); + // surrounders - Fire(features, "posLL", head_pos, mod_pos, head_pos_L, mod_pos_L); - Fire(features, "posRR", head_pos, mod_pos, head_pos_R, mod_pos_R); - Fire(features, "posLR", head_pos, mod_pos, head_pos_L, mod_pos_R); - Fire(features, "posRL", head_pos, mod_pos, head_pos_R, mod_pos_L); - Fire(features, "lexRL", head_word, head_pos_L, mod_pos_L); + Fire(f, 
"posLL", head_pos, mod_pos, head_pos_L, mod_pos_L); + Fire(f, "posRR", head_pos, mod_pos, head_pos_R, mod_pos_R); + Fire(f, "posLR", head_pos, mod_pos, head_pos_L, mod_pos_R); + Fire(f, "posRL", head_pos, mod_pos, head_pos_R, mod_pos_L); // between features int left = min(h,m); @@ -126,11 +141,28 @@ struct ArcFFImpl { if (bdir) --right; else ++left; for (map >::const_iterator it = pcs.begin(); it != pcs.end(); ++it) { if (it->second[left] != it->second[right]) { - Fire(features, "BT", head_pos, TD::Convert(it->first), mod_pos); - Fire(features, "BT", head_pos, TD::Convert(it->first), mod_pos, dir); + Fire(f, "BT", head_pos, TD::Convert(it->first), mod_pos); } } } + + Fire(f, "wH", head_word); + Fire(f, "wM", mod_word); + Fire(f, "wpH", head_word, head_pos); + Fire(f, "wpM", mod_word, mod_pos); + Fire(f, "pHwM", head_pos, mod_word); + Fire(f, "wHpM", head_word, mod_pos); + + Fire(f, "wHM", head_word, mod_word); + Fire(f, "pHMwH", head_pos, mod_pos, head_word); + Fire(f, "pHMwM", head_pos, mod_pos, mod_word); + Fire(f, "wHMpH", head_word, mod_word, head_pos); + Fire(f, "wHMpM", head_word, mod_word, mod_pos); + Fire(f, "wHMpHM", head_word, mod_word, head_pos, mod_pos); + + AddConjoin(fv, dir, features); + AddConjoin(fv, lenstr, features); + (*features) += fv; } } }; -- cgit v1.2.3 From 217c4aaeba1c9f19b3420b526235bffd86c7a92b Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Mon, 23 Apr 2012 03:11:26 +0100 Subject: mst train --- rst_parser/mst_train.cc | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/rst_parser/mst_train.cc b/rst_parser/mst_train.cc index b3711aba..6332693e 100644 --- a/rst_parser/mst_train.cc +++ b/rst_parser/mst_train.cc @@ -28,6 +28,9 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) { ("weights,w",po::value(), "Optional starting weights") ("output_every_i_iterations,I",po::value()->default_value(1), "Write weights every I iterations") 
("regularization_strength,C",po::value()->default_value(1.0), "Regularization strength") +#ifdef HAVE_CMPH + ("cmph_perfect_feature_hash,h", po::value(), "Load perfect hash function for features") +#endif #if HAVE_THREAD ("threads,T",po::value()->default_value(1), "Number of threads") #endif @@ -119,11 +122,19 @@ int main(int argc, char** argv) { int size = 1; po::variables_map conf; InitCommandLine(argc, argv, &conf); + if (conf.count("cmph_perfect_feature_hash")) { + cerr << "Loading perfect hash function from " << conf["cmph_perfect_feature_hash"].as() << " ...\n"; + FD::EnableHash(conf["cmph_perfect_feature_hash"].as()); + cerr << " " << FD::NumFeats() << " features in map\n"; + } ArcFeatureFunctions ffs; vector corpus; TrainingInstance::ReadTrainingCorpus(conf["training_data"].as(), &corpus, rank, size); + vector weights; + Weights::InitFromFile(conf["weights"].as(), &weights); vector forests(corpus.size()); SparseVector empirical; + cerr << "Extracting features...\n"; bool flag = false; for (int i = 0; i < corpus.size(); ++i) { TrainingInstance& cur = corpus[i]; @@ -149,9 +160,7 @@ int main(int argc, char** argv) { } if (flag) cerr << endl; //cerr << "EMP: " << empirical << endl; //DE - vector weights(FD::NumFeats(), 0.0); - if (conf.count("weights")) - Weights::InitFromFile(conf["weights"].as(), &weights); + weights.resize(FD::NumFeats(), 0.0); vector g(FD::NumFeats(), 0.0); cerr << "features initialized\noptimizing...\n"; boost::shared_ptr o; -- cgit v1.2.3