From 19147c5f45b40eac1e0ae1bc8bc8ccf90d1ea56c Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Sat, 14 Apr 2012 01:52:14 -0400 Subject: matrix tree theorem stuff --- rst_parser/Makefile.am | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'rst_parser/Makefile.am') diff --git a/rst_parser/Makefile.am b/rst_parser/Makefile.am index e97ab5c5..b61a20dd 100644 --- a/rst_parser/Makefile.am +++ b/rst_parser/Makefile.am @@ -8,7 +8,7 @@ TESTS = rst_test noinst_LIBRARIES = librst.a -librst_a_SOURCES = arc_factored.cc rst.cc +librst_a_SOURCES = arc_factored.cc arc_factored_marginals.cc rst.cc mst_train_SOURCES = mst_train.cc mst_train_LDADD = librst.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz -- cgit v1.2.3 From 372835df257ddd084e47cab962e932615f92f09c Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Sun, 15 Apr 2012 17:28:08 -0400 Subject: crf training of arc-factored dep parser --- rst_parser/Makefile.am | 6 +- rst_parser/arc_factored.cc | 29 +++-- rst_parser/arc_factored.h | 53 ++++++---- rst_parser/arc_factored_marginals.cc | 10 +- rst_parser/arc_ff.cc | 64 +++++++++++ rst_parser/arc_ff.h | 43 ++++++++ rst_parser/mst_train.cc | 200 ++++++++++++++++++++++++++++++++++- rst_parser/rst_test.cc | 18 ++-- 8 files changed, 379 insertions(+), 44 deletions(-) create mode 100644 rst_parser/arc_ff.cc create mode 100644 rst_parser/arc_ff.h (limited to 'rst_parser/Makefile.am') diff --git a/rst_parser/Makefile.am b/rst_parser/Makefile.am index b61a20dd..2b64b43a 100644 --- a/rst_parser/Makefile.am +++ b/rst_parser/Makefile.am @@ -8,12 +8,12 @@ TESTS = rst_test noinst_LIBRARIES = librst.a -librst_a_SOURCES = arc_factored.cc arc_factored_marginals.cc rst.cc +librst_a_SOURCES = arc_factored.cc arc_factored_marginals.cc rst.cc arc_ff.cc mst_train_SOURCES = mst_train.cc -mst_train_LDADD = librst.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz +mst_train_LDADD = librst.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a ../training/optimize.o -lz rst_test_SOURCES = rst_test.cc rst_test_LDADD = librst.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz -AM_CPPFLAGS = -W -Wall -Wno-sign-compare $(GTEST_CPPFLAGS) -I$(top_srcdir)/decoder -I$(top_srcdir)/utils -I$(top_srcdir)/mteval -I../klm +AM_CPPFLAGS = -W -Wall -Wno-sign-compare $(GTEST_CPPFLAGS) -I$(top_srcdir)/decoder -I$(top_srcdir)/training -I$(top_srcdir)/utils -I$(top_srcdir)/mteval -I../klm diff --git a/rst_parser/arc_factored.cc b/rst_parser/arc_factored.cc index b2c2c427..44e769b8 100644 --- a/rst_parser/arc_factored.cc +++ b/rst_parser/arc_factored.cc @@ -6,23 +6,38 @@ #include #include +#include "arc_ff.h" + using namespace std; using namespace std::tr1; using namespace boost; +void ArcFactoredForest::ExtractFeatures(const TaggedSentence& sentence, + const std::vector >& ffs) { + for (int i = 0; i < ffs.size(); ++i) { + const ArcFeatureFunction& ff = *ffs[i]; + for (int m = 0; m < num_words_; ++m) { + for (int h = 0; h < num_words_; ++h) { + ff.EgdeFeatures(sentence, h, m, &edges_(h,m).features); + } + ff.EgdeFeatures(sentence, -1, m, &root_edges_[m].features); + } + } +} + void ArcFactoredForest::PickBestParentForEachWord(EdgeSubset* st) const { - for (int m = 1; m <= num_words_; ++m) { - int best_head = -1; + for (int m = 0; m < num_words_; ++m) { + int best_head = -2; prob_t best_score; - for (int h = 0; h <= num_words_; ++h) { + for (int h = -1; h < num_words_; ++h) { const Edge& edge = (*this)(h,m); - if (best_head < 0 || edge.edge_prob > best_score) { + if (best_head < -1 || edge.edge_prob > best_score) { best_score = edge.edge_prob; best_head = h; } } - assert(best_head >= 0); - if (best_head) + assert(best_head >= -1); + if (best_head >= 0) st->h_m_pairs.push_back(make_pair(best_head, m)); else st->roots.push_back(m); @@ -56,7 +71,7 @@ struct PriorityQueue { }; // based on Trajan 1977 -void ArcFactoredForest::MaximumEdgeSubset(EdgeSubset* st) const { +void ArcFactoredForest::MaximumSpanningTree(EdgeSubset* st) const { typedef disjoint_sets_with_storage DisjointSet; DisjointSet strongly(num_words_ + 1); diff --git a/rst_parser/arc_factored.h b/rst_parser/arc_factored.h index 3003a86e..a95f8230 100644 --- a/rst_parser/arc_factored.h +++ b/rst_parser/arc_factored.h @@ -5,37 +5,52 @@ #include #include #include +#include #include "array2d.h" #include "sparse_vector.h" #include "prob.h" #include "weights.h" +#include "wordid.h" + +struct TaggedSentence { + std::vector words; + std::vector pos; +}; struct EdgeSubset { EdgeSubset() {} std::vector roots; // unless multiroot trees are supported, this // will have a single member - std::vector > h_m_pairs; // h,m start at *1* + std::vector > h_m_pairs; // h,m start at 0 }; +struct ArcFeatureFunction; class ArcFactoredForest { public: - explicit ArcFactoredForest(short num_words) : - num_words_(num_words), - root_edges_(num_words), - edges_(num_words, num_words) { + ArcFactoredForest() : num_words_() {} + explicit ArcFactoredForest(short num_words) { + resize(num_words); + } + + void resize(unsigned num_words) { + num_words_ = num_words; + root_edges_.clear(); + edges_.clear(); + root_edges_.resize(num_words); + edges_.resize(num_words, num_words); for (int h = 0; h < num_words; ++h) { for (int m = 0; m < num_words; ++m) { - edges_(h, m).h = h + 1; - edges_(h, m).m = m + 1; + edges_(h, m).h = h; + edges_(h, m).m = m; } - root_edges_[h].h = 0; - root_edges_[h].m = h + 1; + root_edges_[h].h = -1; + root_edges_[h].m = h; } } // compute the maximum spanning tree based on the current weighting // using the O(n^2) CLE algorithm - void MaximumEdgeSubset(EdgeSubset* st) const; + void MaximumSpanningTree(EdgeSubset* st) const; // Reweight edges so that edge_prob is the edge's marginals // optionally returns log partition @@ -52,20 +67,16 @@ class ArcFactoredForest { prob_t edge_prob; }; + // set eges_[*].features + void ExtractFeatures(const TaggedSentence& sentence, + const std::vector >& ffs); + const Edge& operator()(short h, short m) const { - assert(m > 0); - assert(m <= num_words_); - assert(h >= 0); - assert(h <= num_words_); - return h ? edges_(h - 1, m - 1) : root_edges_[m - 1]; + return h >= 0 ? edges_(h, m) : root_edges_[m]; } Edge& operator()(short h, short m) { - assert(m > 0); - assert(m <= num_words_); - assert(h >= 0); - assert(h <= num_words_); - return h ? edges_(h - 1, m - 1) : root_edges_[m - 1]; + return h >= 0 ? edges_(h, m) : root_edges_[m]; } float Weight(short h, short m) const { @@ -87,7 +98,7 @@ class ArcFactoredForest { } private: - unsigned num_words_; + int num_words_; std::vector root_edges_; Array2D edges_; }; diff --git a/rst_parser/arc_factored_marginals.cc b/rst_parser/arc_factored_marginals.cc index 9851b59a..16360b0d 100644 --- a/rst_parser/arc_factored_marginals.cc +++ b/rst_parser/arc_factored_marginals.cc @@ -31,14 +31,18 @@ void ArcFactoredForest::EdgeMarginals(double *plog_z) { ArcMatrix Linv = L.inverse(); if (plog_z) *plog_z = log(Linv.determinant()); RootVector rootMarginals = r.cwiseProduct(Linv.col(0)); +// ArcMatrix T = Linv; for (int h = 0; h < num_words_; ++h) { for (int m = 0; m < num_words_; ++m) { - edges_(h,m).edge_prob = prob_t((m == 0 ? 0.0 : 1.0) * A(h,m) * Linv(m,m) - - (h == 0 ? 0.0 : 1.0) * A(h,m) * Linv(m,h)); + const double marginal = (m == 0 ? 0.0 : 1.0) * A(h,m) * Linv(m,m) - + (h == 0 ? 0.0 : 1.0) * A(h,m) * Linv(m,h); + edges_(h,m).edge_prob = prob_t(marginal); +// T(h,m) = marginal; } root_edges_[h].edge_prob = prob_t(rootMarginals(h)); } - // cerr << "ROOT MARGINALS: " << rootMarginals.transpose() << endl; +// cerr << "ROOT MARGINALS: " << rootMarginals.transpose() << endl; +// cerr << "M:\n" << T << endl; } #else diff --git a/rst_parser/arc_ff.cc b/rst_parser/arc_ff.cc new file mode 100644 index 00000000..f9effbda --- /dev/null +++ b/rst_parser/arc_ff.cc @@ -0,0 +1,64 @@ +#include "arc_ff.h" + +#include "tdict.h" +#include "fdict.h" +#include "sentence_metadata.h" + +using namespace std; + +ArcFeatureFunction::~ArcFeatureFunction() {} + +void ArcFeatureFunction::PrepareForInput(const TaggedSentence&) {} + +DistancePenalty::DistancePenalty(const string&) : fidw_(FD::Convert("Distance")), fidr_(FD::Convert("RootDistance")) {} + +void DistancePenalty::EdgeFeaturesImpl(const TaggedSentence& sent, + short h, + short m, + SparseVector* features) const { + const bool dir = m < h; + const bool is_root = (h == -1); + int v = m - h; + if (v < 0) { + v= -1 - int(log(-v) / log(2)); + } else { + v= int(log(v) / log(2)); + } + static map lenmap; + int& lenfid = lenmap[v]; + if (!lenfid) { + ostringstream os; + if (v < 0) os << "LenL" << -v; else os << "LenR" << v; + lenfid = FD::Convert(os.str()); + } + features->set_value(lenfid, 1.0); + const string& lenstr = FD::Convert(lenfid); + if (!is_root) { + static int modl = FD::Convert("ModLeft"); + static int modr = FD::Convert("ModRight"); + if (dir) features->set_value(modl, 1); + else features->set_value(modr, 1); + } + if (is_root) { + ostringstream os; + os << "ROOT:" << TD::Convert(sent.pos[m]); + features->set_value(FD::Convert(os.str()), 1.0); + os << "_" << lenstr; + features->set_value(FD::Convert(os.str()), 1.0); + } else { // not root + ostringstream os; + os << "HM:" << TD::Convert(sent.pos[h]) << '_' << TD::Convert(sent.pos[m]); + features->set_value(FD::Convert(os.str()), 1.0); + os << '_' << dir; + features->set_value(FD::Convert(os.str()), 1.0); + os << '_' << lenstr; + features->set_value(FD::Convert(os.str()), 1.0); + ostringstream os2; + os2 << "LexHM:" << TD::Convert(sent.words[h]) << '_' << TD::Convert(sent.words[m]); + features->set_value(FD::Convert(os2.str()), 1.0); + os2 << '_' << dir; + features->set_value(FD::Convert(os2.str()), 1.0); + os2 << '_' << lenstr; + features->set_value(FD::Convert(os2.str()), 1.0); + } +} diff --git a/rst_parser/arc_ff.h b/rst_parser/arc_ff.h new file mode 100644 index 00000000..bc51fef4 --- /dev/null +++ b/rst_parser/arc_ff.h @@ -0,0 +1,43 @@ +#ifndef _ARC_FF_H_ +#define _ARC_FF_H_ + +#include +#include "sparse_vector.h" +#include "weights.h" +#include "arc_factored.h" + +struct TaggedSentence; +class ArcFeatureFunction { + public: + virtual ~ArcFeatureFunction(); + + // called once, per input, before any calls to EdgeFeatures + // used to initialize sentence-specific data structures + virtual void PrepareForInput(const TaggedSentence& sentence); + + inline void EgdeFeatures(const TaggedSentence& sentence, + short h, + short m, + SparseVector* features) const { + EdgeFeaturesImpl(sentence, h, m, features); + } + protected: + virtual void EdgeFeaturesImpl(const TaggedSentence& sentence, + short h, + short m, + SparseVector* features) const = 0; +}; + +class DistancePenalty : public ArcFeatureFunction { + public: + DistancePenalty(const std::string& param); + protected: + virtual void EdgeFeaturesImpl(const TaggedSentence& sentence, + short h, + short m, + SparseVector* features) const; + private: + const int fidw_, fidr_; +}; + +#endif diff --git a/rst_parser/mst_train.cc b/rst_parser/mst_train.cc index 7b5af4c1..def23edb 100644 --- a/rst_parser/mst_train.cc +++ b/rst_parser/mst_train.cc @@ -1,12 +1,210 @@ #include "arc_factored.h" +#include #include +#include +#include + +#include "arc_ff.h" +#include "arc_ff_factory.h" +#include "stringlib.h" +#include "filelib.h" +#include "tdict.h" +#include "picojson.h" +#include "optimize.h" +#include "weights.h" using namespace std; +namespace po = boost::program_options; + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { + po::options_description opts("Configuration options"); + string cfg_file; + opts.add_options() + ("training_data,t",po::value()->default_value("-"), "File containing training data (jsent format)") + ("feature_function,F",po::value >()->composing(), "feature function") + ("regularization_strength,C",po::value()->default_value(1.0), "Regularization strength") + ("correction_buffers,m", po::value()->default_value(10), "LBFGS correction buffers"); + po::options_description clo("Command line options"); + clo.add_options() + ("config,c", po::value(&cfg_file), "Configuration file") + ("help,?", "Print this help message and exit"); + + po::options_description dconfig_options, dcmdline_options; + dconfig_options.add(opts); + dcmdline_options.add(dconfig_options).add(clo); + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + if (cfg_file.size() > 0) { + ReadFile rf(cfg_file); + po::store(po::parse_config_file(*rf.stream(), dconfig_options), *conf); + } + if (conf->count("help")) { + cerr << dcmdline_options << endl; + exit(1); + } +} + +struct TrainingInstance { + TaggedSentence ts; + EdgeSubset tree; + SparseVector features; +}; + +void ReadTraining(const string& fname, vector* corpus, int rank = 0, int size = 1) { + ReadFile rf(fname); + istream& in = *rf.stream(); + string line; + string err; + int lc = 0; + bool flag = false; + while(getline(in, line)) { + ++lc; + if ((lc-1) % size != rank) continue; + if (rank == 0 && lc % 10 == 0) { cerr << '.' << flush; flag = true; } + if (rank == 0 && lc % 400 == 0) { cerr << " [" << lc << "]\n"; flag = false; } + size_t pos = line.rfind('\t'); + assert(pos != string::npos); + picojson::value obj; + picojson::parse(obj, line.begin() + pos, line.end(), &err); + if (err.size() > 0) { cerr << "JSON parse error in " << lc << ": " << err << endl; abort(); } + corpus->push_back(TrainingInstance()); + TrainingInstance& cur = corpus->back(); + TaggedSentence& ts = cur.ts; + EdgeSubset& tree = cur.tree; + assert(obj.is()); + const picojson::object& d = obj.get(); + const picojson::array& ta = d.find("tokens")->second.get(); + for (unsigned i = 0; i < ta.size(); ++i) { + ts.words.push_back(TD::Convert(ta[i].get()[0].get())); + ts.pos.push_back(TD::Convert(ta[i].get()[1].get())); + } + const picojson::array& da = d.find("deps")->second.get(); + for (unsigned i = 0; i < da.size(); ++i) { + const picojson::array& thm = da[i].get(); + // get dep type here + short h = thm[2].get(); + short m = thm[1].get(); + if (h < 0) + tree.roots.push_back(m); + else + tree.h_m_pairs.push_back(make_pair(h,m)); + } + //cerr << TD::GetString(ts.words) << endl << TD::GetString(ts.pos) << endl << tree << endl; + } + if (flag) cerr << "\nRead " << lc << " training instances\n"; +} + +void AddFeatures(double prob, const SparseVector& fmap, vector* g) { + for (SparseVector::const_iterator it = fmap.begin(); it != fmap.end(); ++it) + (*g)[it->first] += it->second * prob; +} + +double ApplyRegularizationTerms(const double C, + const vector& weights, + vector* g) { + assert(weights.size() == g->size()); + double reg = 0; + for (size_t i = 0; i < weights.size(); ++i) { +// const double prev_w_i = (i < prev_weights.size() ? prev_weights[i] : 0.0); + const double& w_i = weights[i]; + double& g_i = (*g)[i]; + reg += C * w_i * w_i; + g_i += 2 * C * w_i; + +// reg += T * (w_i - prev_w_i) * (w_i - prev_w_i); +// g_i += 2 * T * (w_i - prev_w_i); + } + return reg; +} int main(int argc, char** argv) { + int rank = 0; + int size = 1; + po::variables_map conf; + InitCommandLine(argc, argv, &conf); ArcFactoredForest af(5); - cerr << af(0,3) << endl; + ArcFFRegistry reg; + reg.Register("DistancePenalty", new ArcFFFactory); + vector corpus; + vector > ffs; + ffs.push_back(boost::shared_ptr(new DistancePenalty(""))); + ReadTraining(conf["training_data"].as(), &corpus, rank, size); + vector forests(corpus.size()); + SparseVector empirical; + bool flag = false; + for (int i = 0; i < corpus.size(); ++i) { + TrainingInstance& cur = corpus[i]; + if (rank == 0 && (i+1) % 10 == 0) { cerr << '.' << flush; flag = true; } + if (rank == 0 && (i+1) % 400 == 0) { cerr << " [" << (i+1) << "]\n"; flag = false; } + for (int fi = 0; fi < ffs.size(); ++fi) { + ArcFeatureFunction& ff = *ffs[fi]; + ff.PrepareForInput(cur.ts); + SparseVector efmap; + for (int j = 0; j < cur.tree.h_m_pairs.size(); ++j) { + efmap.clear(); + ff.EgdeFeatures(cur.ts, cur.tree.h_m_pairs[j].first, + cur.tree.h_m_pairs[j].second, + &efmap); + cur.features += efmap; + } + for (int j = 0; j < cur.tree.roots.size(); ++j) { + efmap.clear(); + ff.EgdeFeatures(cur.ts, -1, cur.tree.roots[j], &efmap); + cur.features += efmap; + } + } + empirical += cur.features; + forests[i].resize(cur.ts.words.size()); + forests[i].ExtractFeatures(cur.ts, ffs); + } + if (flag) cerr << endl; + //cerr << "EMP: " << empirical << endl; //DE + vector weights(FD::NumFeats(), 0.0); + vector g(FD::NumFeats(), 0.0); + cerr << "features initialized\noptimizing...\n"; + boost::shared_ptr o; + o.reset(new LBFGSOptimizer(g.size(), conf["correction_buffers"].as())); + int iterations = 1000; + for (int iter = 0; iter < iterations; ++iter) { + cerr << "ITERATION " << iter << " " << flush; + fill(g.begin(), g.end(), 0.0); + for (SparseVector::const_iterator it = empirical.begin(); it != empirical.end(); ++it) + g[it->first] = -it->second; + double obj = -empirical.dot(weights); + // SparseVector mfm; //DE + for (int i = 0; i < corpus.size(); ++i) { + forests[i].Reweight(weights); + double logz; + forests[i].EdgeMarginals(&logz); + //cerr << " O = " << (-corpus[i].features.dot(weights)) << " D=" << -logz << " OO= " << (-corpus[i].features.dot(weights) - logz) << endl; + obj -= logz; + int num_words = corpus[i].ts.words.size(); + for (int h = -1; h < num_words; ++h) { + for (int m = 0; m < num_words; ++m) { + if (h == m) continue; + const ArcFactoredForest::Edge& edge = forests[i](h,m); + const SparseVector& fmap = edge.features; + double prob = edge.edge_prob.as_float(); + if (prob < -0.000001) { cerr << "Prob < 0: " << prob << endl; prob = 0; } + if (prob > 1.000001) { cerr << "Prob > 1: " << prob << endl; prob = 1; } + AddFeatures(prob, fmap, &g); + //mfm += fmap * prob; // DE + } + } + } + //cerr << endl << "E: " << empirical << endl; // DE + //cerr << "M: " << mfm << endl; // DE + double r = ApplyRegularizationTerms(conf["regularization_strength"].as(), weights, &g); + double gnorm = 0; + for (int i = 0; i < g.size(); ++i) + gnorm += g[i]*g[i]; + cerr << "OBJ=" << (obj+r) << "\t[F=" << obj << " R=" << r << "]\tGnorm=" << sqrt(gnorm) << endl; + obj += r; + assert(obj >= 0); + o->Optimize(obj, g, &weights); + Weights::ShowLargestFeatures(weights); + if (o->HasConverged()) { cerr << "CONVERGED\n"; break; } + } return 0; } diff --git a/rst_parser/rst_test.cc b/rst_parser/rst_test.cc index 8995515f..7e6fb2c1 100644 --- a/rst_parser/rst_test.cc +++ b/rst_parser/rst_test.cc @@ -17,15 +17,15 @@ int main(int argc, char** argv) { // (0, 1) 9 // (0, 3) 9 ArcFactoredForest af(3); - af(1,2).edge_prob.logeq(20); - af(1,3).edge_prob.logeq(3); - af(2,1).edge_prob.logeq(20); - af(2,3).edge_prob.logeq(30); - af(3,2).edge_prob.logeq(0); - af(3,1).edge_prob.logeq(11); - af(0,2).edge_prob.logeq(10); - af(0,1).edge_prob.logeq(9); - af(0,3).edge_prob.logeq(9); + af(0,1).edge_prob.logeq(20); + af(0,2).edge_prob.logeq(3); + af(1,0).edge_prob.logeq(20); + af(1,2).edge_prob.logeq(30); + af(2,1).edge_prob.logeq(0); + af(2,0).edge_prob.logeq(11); + af(-1,1).edge_prob.logeq(10); + af(-1,0).edge_prob.logeq(9); + af(-1,2).edge_prob.logeq(9); EdgeSubset tree; // af.MaximumEdgeSubset(&tree); double lz; -- cgit v1.2.3 From caf1a688db7b446642581c3f69a9aea720735a8f Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Mon, 16 Apr 2012 18:20:33 -0400 Subject: rst sampler --- rst_parser/Makefile.am | 7 ++- rst_parser/dep_training.cc | 56 ++++++++++++++++++++ rst_parser/dep_training.h | 17 ++++++ rst_parser/mst_train.cc | 58 +-------------------- rst_parser/rst.cc | 56 +++++++++++++++----- rst_parser/rst.h | 8 ++- rst_parser/rst_parse.cc | 126 +++++++++++++++++++++++++++++++++++++++++++++ utils/weights.cc | 4 +- 8 files changed, 260 insertions(+), 72 deletions(-) create mode 100644 rst_parser/dep_training.cc create mode 100644 rst_parser/dep_training.h create mode 100644 rst_parser/rst_parse.cc (limited to 'rst_parser/Makefile.am') diff --git a/rst_parser/Makefile.am b/rst_parser/Makefile.am index 2b64b43a..6e884f53 100644 --- a/rst_parser/Makefile.am +++ b/rst_parser/Makefile.am @@ -1,5 +1,5 @@ bin_PROGRAMS = \ - mst_train + mst_train rst_parse noinst_PROGRAMS = \ rst_test @@ -8,11 +8,14 @@ TESTS = rst_test noinst_LIBRARIES = librst.a -librst_a_SOURCES = arc_factored.cc arc_factored_marginals.cc rst.cc arc_ff.cc +librst_a_SOURCES = arc_factored.cc arc_factored_marginals.cc rst.cc arc_ff.cc dep_training.cc mst_train_SOURCES = mst_train.cc mst_train_LDADD = librst.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a ../training/optimize.o -lz +rst_parse_SOURCES = rst_parse.cc +rst_parse_LDADD = librst.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz + rst_test_SOURCES = rst_test.cc rst_test_LDADD = librst.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz diff --git a/rst_parser/dep_training.cc b/rst_parser/dep_training.cc new file mode 100644 index 00000000..de431ebc --- /dev/null +++ b/rst_parser/dep_training.cc @@ -0,0 +1,56 @@ +#include "dep_training.h" + +#include +#include + +#include "stringlib.h" +#include "filelib.h" +#include "tdict.h" +#include "picojson.h" + +using namespace std; + +void TrainingInstance::ReadTraining(const string& fname, vector* corpus, int rank, int size) { + ReadFile rf(fname); + istream& in = *rf.stream(); + string line; + string err; + int lc = 0; + bool flag = false; + while(getline(in, line)) { + ++lc; + if ((lc-1) % size != rank) continue; + if (rank == 0 && lc % 10 == 0) { cerr << '.' << flush; flag = true; } + if (rank == 0 && lc % 400 == 0) { cerr << " [" << lc << "]\n"; flag = false; } + size_t pos = line.rfind('\t'); + assert(pos != string::npos); + picojson::value obj; + picojson::parse(obj, line.begin() + pos, line.end(), &err); + if (err.size() > 0) { cerr << "JSON parse error in " << lc << ": " << err << endl; abort(); } + corpus->push_back(TrainingInstance()); + TrainingInstance& cur = corpus->back(); + TaggedSentence& ts = cur.ts; + EdgeSubset& tree = cur.tree; + assert(obj.is()); + const picojson::object& d = obj.get(); + const picojson::array& ta = d.find("tokens")->second.get(); + for (unsigned i = 0; i < ta.size(); ++i) { + ts.words.push_back(TD::Convert(ta[i].get()[0].get())); + ts.pos.push_back(TD::Convert(ta[i].get()[1].get())); + } + const picojson::array& da = d.find("deps")->second.get(); + for (unsigned i = 0; i < da.size(); ++i) { + const picojson::array& thm = da[i].get(); + // get dep type here + short h = thm[2].get(); + short m = thm[1].get(); + if (h < 0) + tree.roots.push_back(m); + else + tree.h_m_pairs.push_back(make_pair(h,m)); + } + //cerr << TD::GetString(ts.words) << endl << TD::GetString(ts.pos) << endl << tree << endl; + } + if (flag) cerr << "\nRead " << lc << " training instances\n"; +} + diff --git a/rst_parser/dep_training.h b/rst_parser/dep_training.h new file mode 100644 index 00000000..73ffd298 --- /dev/null +++ b/rst_parser/dep_training.h @@ -0,0 +1,17 @@ +#ifndef _DEP_TRAINING_H_ +#define _DEP_TRAINING_H_ + +#include +#include +#include "arc_factored.h" +#include "weights.h" + +struct TrainingInstance { + TaggedSentence ts; + EdgeSubset tree; + SparseVector features; + // reads a "Jsent" formatted dependency file + static void ReadTraining(const std::string& fname, std::vector* corpus, int rank = 0, int size = 1); +}; + +#endif diff --git a/rst_parser/mst_train.cc b/rst_parser/mst_train.cc index c5cab6ec..f0403d7e 100644 --- a/rst_parser/mst_train.cc +++ b/rst_parser/mst_train.cc @@ -10,10 +10,9 @@ #include "stringlib.h" #include "filelib.h" #include "tdict.h" -#include "picojson.h" +#include "dep_training.h" #include "optimize.h" #include "weights.h" -#include "rst.h" using namespace std; namespace po = boost::program_options; @@ -47,56 +46,6 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) { } } -struct TrainingInstance { - TaggedSentence ts; - EdgeSubset tree; - SparseVector features; -}; - -void ReadTraining(const string& fname, vector* corpus, int rank = 0, int size = 1) { - ReadFile rf(fname); - istream& in = *rf.stream(); - string line; - string err; - int lc = 0; - bool flag = false; - while(getline(in, line)) { - ++lc; - if ((lc-1) % size != rank) continue; - if (rank == 0 && lc % 10 == 0) { cerr << '.' << flush; flag = true; } - if (rank == 0 && lc % 400 == 0) { cerr << " [" << lc << "]\n"; flag = false; } - size_t pos = line.rfind('\t'); - assert(pos != string::npos); - picojson::value obj; - picojson::parse(obj, line.begin() + pos, line.end(), &err); - if (err.size() > 0) { cerr << "JSON parse error in " << lc << ": " << err << endl; abort(); } - corpus->push_back(TrainingInstance()); - TrainingInstance& cur = corpus->back(); - TaggedSentence& ts = cur.ts; - EdgeSubset& tree = cur.tree; - assert(obj.is()); - const picojson::object& d = obj.get(); - const picojson::array& ta = d.find("tokens")->second.get(); - for (unsigned i = 0; i < ta.size(); ++i) { - ts.words.push_back(TD::Convert(ta[i].get()[0].get())); - ts.pos.push_back(TD::Convert(ta[i].get()[1].get())); - } - const picojson::array& da = d.find("deps")->second.get(); - for (unsigned i = 0; i < da.size(); ++i) { - const picojson::array& thm = da[i].get(); - // get dep type here - short h = thm[2].get(); - short m = thm[1].get(); - if (h < 0) - tree.roots.push_back(m); - else - tree.h_m_pairs.push_back(make_pair(h,m)); - } - //cerr << TD::GetString(ts.words) << endl << TD::GetString(ts.pos) << endl << tree << endl; - } - if (flag) cerr << "\nRead " << lc << " training instances\n"; -} - void AddFeatures(double prob, const SparseVector& fmap, vector* g) { for (SparseVector::const_iterator it = fmap.begin(); it != fmap.end(); ++it) (*g)[it->first] += it->second * prob; @@ -131,7 +80,7 @@ int main(int argc, char** argv) { vector corpus; vector > ffs; ffs.push_back(boost::shared_ptr(new DistancePenalty(""))); - ReadTraining(conf["training_data"].as(), &corpus, rank, size); + TrainingInstance::ReadTraining(conf["training_data"].as(), &corpus, rank, size); vector forests(corpus.size()); SparseVector empirical; bool flag = false; @@ -224,9 +173,6 @@ int main(int argc, char** argv) { } if (converged) { cerr << "CONVERGED\n"; break; } } - forests[0].Reweight(weights); - TreeSampler ts(forests[0]); - EdgeSubset tt; ts.SampleRandomSpanningTree(&tt); return 0; } diff --git a/rst_parser/rst.cc b/rst_parser/rst.cc index c4ce898e..bc91330b 100644 --- a/rst_parser/rst.cc +++ b/rst_parser/rst.cc @@ -3,45 +3,77 @@ using namespace std; // David B. Wilson. Generating Random Spanning Trees More Quickly than the Cover Time. - +// this is an awesome algorithm TreeSampler::TreeSampler(const ArcFactoredForest& af) : forest(af), usucc(af.size() + 1) { - // edges are directed from modifiers to heads, to the root + // edges are directed from modifiers to heads, and finally to the root + vector p; for (int m = 1; m <= forest.size(); ++m) { +#if USE_ALIAS_SAMPLER + p.clear(); +#else SampleSet& ss = usucc[m]; - for (int h = 0; h <= forest.size(); ++h) - ss.add(forest(h-1,m-1).edge_prob.as_float()); +#endif + double z = 0; + for (int h = 0; h <= forest.size(); ++h) { + double u = forest(h-1,m-1).edge_prob.as_float(); + z += u; +#if USE_ALIAS_SAMPLER + p.push_back(u); +#else + ss.add(u); +#endif + } +#if USE_ALIAS_SAMPLER + for (int i = 0; i < p.size(); ++i) { p[i] /= z; } + usucc[m].Init(p); +#endif } } -void TreeSampler::SampleRandomSpanningTree(EdgeSubset* tree) { - MT19937 rng; +void TreeSampler::SampleRandomSpanningTree(EdgeSubset* tree, MT19937* prng) { + MT19937& rng = *prng; const int r = 0; bool success = false; while (!success) { int roots = 0; + tree->h_m_pairs.clear(); + tree->roots.clear(); vector next(forest.size() + 1, -1); vector in_tree(forest.size() + 1, 0); in_tree[r] = 1; - for (int i = 0; i < forest.size(); ++i) { + //cerr << "Forest size: " << forest.size() << endl; + for (int i = 0; i <= forest.size(); ++i) { + //cerr << "Sampling starting at u=" << i << endl; int u = i; if (in_tree[u]) continue; while(!in_tree[u]) { +#if USE_ALIAS_SAMPLER + next[u] = usucc[u].Draw(rng); +#else next[u] = rng.SelectSample(usucc[u]); +#endif u = next[u]; } u = i; - cerr << (u-1); + //cerr << (u-1); + int prev = u-1; while(!in_tree[u]) { in_tree[u] = true; u = next[u]; - cerr << " > " << (u-1); - if (u == r) { ++roots; } + //cerr << " > " << (u-1); + if (u == r) { + ++roots; + tree->roots.push_back(prev); + } else { + tree->h_m_pairs.push_back(make_pair(u-1,prev)); + } + prev = u-1; } - cerr << endl; + //cerr << endl; } assert(roots > 0); if (roots > 1) { - cerr << "FAILURE\n"; + //cerr << "FAILURE\n"; } else { success = true; } diff --git a/rst_parser/rst.h b/rst_parser/rst.h index a269ff9b..8bf389f7 100644 --- a/rst_parser/rst.h +++ b/rst_parser/rst.h @@ -4,12 +4,18 @@ #include #include "sampler.h" #include "arc_factored.h" +#include "alias_sampler.h" struct TreeSampler { explicit TreeSampler(const ArcFactoredForest& af); - void SampleRandomSpanningTree(EdgeSubset* tree); + void SampleRandomSpanningTree(EdgeSubset* tree, MT19937* rng); const ArcFactoredForest& forest; +#define USE_ALIAS_SAMPLER 1 +#if USE_ALIAS_SAMPLER + std::vector usucc; +#else std::vector > usucc; +#endif }; #endif diff --git a/rst_parser/rst_parse.cc b/rst_parser/rst_parse.cc new file mode 100644 index 00000000..9cc1359a --- /dev/null +++ b/rst_parser/rst_parse.cc @@ -0,0 +1,126 @@ +#include "arc_factored.h" + +#include +#include +#include +#include + +#include "timing_stats.h" +#include "arc_ff.h" +#include "arc_ff_factory.h" +#include "dep_training.h" +#include "stringlib.h" +#include "filelib.h" +#include "tdict.h" +#include "weights.h" +#include "rst.h" + +using namespace std; +namespace po = boost::program_options; + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { + po::options_description opts("Configuration options"); + string cfg_file; + opts.add_options() + ("training_data,t",po::value()->default_value("-"), "File containing training data (jsent format)") + ("feature_function,F",po::value >()->composing(), "feature function (multiple permitted)") + ("q_weights,q",po::value(), "Arc-factored weights for proposal distribution") + ("samples,n",po::value()->default_value(1000), "Number of samples"); + po::options_description clo("Command line options"); + clo.add_options() + ("config,c", po::value(&cfg_file), "Configuration file") + ("help,?", "Print this help message and exit"); + + po::options_description dconfig_options, dcmdline_options; + dconfig_options.add(opts); + dcmdline_options.add(dconfig_options).add(clo); + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + if (cfg_file.size() > 0) { + ReadFile rf(cfg_file); + po::store(po::parse_config_file(*rf.stream(), dconfig_options), *conf); + } + if (conf->count("help")) { + cerr << dcmdline_options << endl; + exit(1); + } +} + +int main(int argc, char** argv) { + po::variables_map conf; + InitCommandLine(argc, argv, &conf); + ArcFactoredForest af(5); + ArcFFRegistry reg; + reg.Register("DistancePenalty", new ArcFFFactory); + vector corpus; + vector > ffs; + ffs.push_back(boost::shared_ptr(new DistancePenalty(""))); + TrainingInstance::ReadTraining(conf["training_data"].as(), &corpus); + vector forests(corpus.size()); + SparseVector empirical; + bool flag = false; + for (int i = 0; i < corpus.size(); ++i) { + TrainingInstance& cur = corpus[i]; + if ((i+1) % 10 == 0) { cerr << '.' << flush; flag = true; } + if ((i+1) % 400 == 0) { cerr << " [" << (i+1) << "]\n"; flag = false; } + for (int fi = 0; fi < ffs.size(); ++fi) { + ArcFeatureFunction& ff = *ffs[fi]; + ff.PrepareForInput(cur.ts); + SparseVector efmap; + for (int j = 0; j < cur.tree.h_m_pairs.size(); ++j) { + efmap.clear(); + ff.EgdeFeatures(cur.ts, cur.tree.h_m_pairs[j].first, + cur.tree.h_m_pairs[j].second, + &efmap); + cur.features += efmap; + } + for (int j = 0; j < cur.tree.roots.size(); ++j) { + efmap.clear(); + ff.EgdeFeatures(cur.ts, -1, cur.tree.roots[j], &efmap); + cur.features += efmap; + } + } + empirical += cur.features; + forests[i].resize(cur.ts.words.size()); + forests[i].ExtractFeatures(cur.ts, ffs); + } + if (flag) cerr << endl; + vector weights(FD::NumFeats(), 0.0); + Weights::InitFromFile(conf["q_weights"].as(), &weights); + MT19937 rng; + SparseVector model_exp; + SparseVector sampled_exp; + int samples = conf["samples"].as(); + for (int i = 0; i < corpus.size(); ++i) { + const int num_words = corpus[i].ts.words.size(); + forests[i].Reweight(weights); + forests[i].EdgeMarginals(); + model_exp.clear(); + for (int h = -1; h < num_words; ++h) { + for (int m = 0; m < num_words; ++m) { + if (h == m) continue; + const ArcFactoredForest::Edge& edge = forests[i](h,m); + const SparseVector& fmap = edge.features; + double prob = edge.edge_prob.as_float(); + model_exp += fmap * prob; + } + } + //cerr << "TRUE EXP: " << model_exp << endl; + + forests[i].Reweight(weights); + TreeSampler ts(forests[i]); + sampled_exp.clear(); + //ostringstream os; os << "Samples_" << samples; + //Timer t(os.str()); + for (int n = 0; n < samples; ++n) { + EdgeSubset tree; + ts.SampleRandomSpanningTree(&tree, &rng); + SparseVector feats; + tree.ExtractFeatures(corpus[i].ts, ffs, &feats); + sampled_exp += feats; + } + sampled_exp /= samples; + cerr << "L2 norm of diff @ " << samples << " samples: " << (model_exp - sampled_exp).l2norm() << endl; + } + return 0; +} + diff --git a/utils/weights.cc b/utils/weights.cc index ac407dfb..39c18474 100644 --- a/utils/weights.cc +++ b/utils/weights.cc @@ -144,8 +144,10 @@ void Weights::ShowLargestFeatures(const vector& w) { vector fnums(w.size()); for (int i = 0; i < w.size(); ++i) fnums[i] = i; + int nf = FD::NumFeats(); + if (nf > 10) nf = 10; vector::iterator mid = fnums.begin(); - mid += (w.size() > 10 ? 10 : w.size()); + mid += nf; partial_sort(fnums.begin(), mid, fnums.end(), FComp(w)); cerr << "TOP FEATURES:"; for (vector::iterator i = fnums.begin(); i != mid; ++i) { -- cgit v1.2.3 From 23dcec445852d140291bba4a8c40bb0e47e9a266 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Mon, 16 Apr 2012 22:42:24 -0400 Subject: refactor some code, simplify, fix typos --- rst_parser/Makefile.am | 16 ++--- rst_parser/arc_factored.cc | 40 ++++++------ rst_parser/arc_factored.h | 7 ++- rst_parser/arc_ff.cc | 120 +++++++++++++++++++++--------------- rst_parser/arc_ff.h | 35 +++-------- rst_parser/arc_ff_factory.h | 42 ------------- rst_parser/mst_train.cc | 37 +++++------- rst_parser/rst_parse.cc | 126 -------------------------------------- rst_parser/rst_test.cc | 48 --------------- rst_parser/rst_train.cc | 144 ++++++++++++++++++++++++++++++++++++++++++++ 10 files changed, 264 insertions(+), 351 deletions(-) delete mode 100644 rst_parser/arc_ff_factory.h delete mode 100644 rst_parser/rst_parse.cc delete mode 100644 rst_parser/rst_test.cc create mode 100644 rst_parser/rst_train.cc (limited to 'rst_parser/Makefile.am') diff --git a/rst_parser/Makefile.am b/rst_parser/Makefile.am index 6e884f53..876c2237 100644 --- a/rst_parser/Makefile.am +++ b/rst_parser/Makefile.am @@ -1,22 +1,14 @@ bin_PROGRAMS = \ - mst_train rst_parse - -noinst_PROGRAMS = \ - rst_test - -TESTS = rst_test + mst_train rst_train noinst_LIBRARIES = librst.a -librst_a_SOURCES = arc_factored.cc arc_factored_marginals.cc rst.cc arc_ff.cc dep_training.cc +librst_a_SOURCES = arc_factored.cc arc_factored_marginals.cc rst.cc arc_ff.cc dep_training.cc global_ff.cc mst_train_SOURCES = mst_train.cc mst_train_LDADD = librst.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a ../training/optimize.o -lz -rst_parse_SOURCES = rst_parse.cc -rst_parse_LDADD = librst.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz - -rst_test_SOURCES = rst_test.cc -rst_test_LDADD = librst.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz +rst_train_SOURCES = rst_train.cc +rst_train_LDADD = librst.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz AM_CPPFLAGS = -W -Wall -Wno-sign-compare $(GTEST_CPPFLAGS) -I$(top_srcdir)/decoder -I$(top_srcdir)/training -I$(top_srcdir)/utils -I$(top_srcdir)/mteval -I../klm diff --git a/rst_parser/arc_factored.cc b/rst_parser/arc_factored.cc index 34c689f4..74bf7516 100644 --- a/rst_parser/arc_factored.cc +++ b/rst_parser/arc_factored.cc @@ -13,36 +13,30 @@ using namespace std::tr1; using namespace boost; void EdgeSubset::ExtractFeatures(const TaggedSentence& sentence, - const std::vector >& ffs, + const ArcFeatureFunctions& ffs, SparseVector* features) const { SparseVector efmap; - for (int i = 0; i < ffs.size(); ++i) { - const ArcFeatureFunction& ff= *ffs[i]; - for (int j = 0; j < h_m_pairs.size(); ++j) { - efmap.clear(); - ff.EgdeFeatures(sentence, h_m_pairs[j].first, - h_m_pairs[j].second, - &efmap); - (*features) += efmap; - } - for (int j = 0; j < roots.size(); ++j) { - efmap.clear(); - ff.EgdeFeatures(sentence, -1, roots[j], &efmap); - (*features) += efmap; - } + for (int j = 0; j < h_m_pairs.size(); ++j) { + efmap.clear(); + ffs.EdgeFeatures(sentence, h_m_pairs[j].first, + h_m_pairs[j].second, + &efmap); + (*features) += efmap; + } + for (int j = 0; j < roots.size(); ++j) { + efmap.clear(); + ffs.EdgeFeatures(sentence, -1, roots[j], &efmap); + (*features) += efmap; } } void ArcFactoredForest::ExtractFeatures(const TaggedSentence& sentence, - const std::vector >& ffs) { - for (int i = 0; i < ffs.size(); ++i) { - const ArcFeatureFunction& ff = *ffs[i]; - for (int m = 0; m < num_words_; ++m) { - for (int h = 0; h < num_words_; ++h) { - ff.EgdeFeatures(sentence, h, m, &edges_(h,m).features); - } - ff.EgdeFeatures(sentence, -1, m, &root_edges_[m].features); + const ArcFeatureFunctions& ffs) { + for (int m = 0; m < num_words_; ++m) { + for (int h = 0; h < num_words_; ++h) { + ffs.EdgeFeatures(sentence, h, m, &edges_(h,m).features); } + ffs.EdgeFeatures(sentence, -1, m, &root_edges_[m].features); } } diff --git a/rst_parser/arc_factored.h b/rst_parser/arc_factored.h index a271c8d4..c5481d80 100644 --- a/rst_parser/arc_factored.h +++ b/rst_parser/arc_factored.h @@ -17,14 +17,15 @@ struct TaggedSentence { std::vector pos; }; -struct ArcFeatureFunction; +struct ArcFeatureFunctions; struct EdgeSubset { EdgeSubset() {} std::vector roots; // unless multiroot trees are supported, this // will have a single member std::vector > h_m_pairs; // h,m start at 0 + // assumes ArcFeatureFunction::PrepareForInput has already been called void ExtractFeatures(const TaggedSentence& sentence, - const std::vector >& ffs, + const ArcFeatureFunctions& ffs, SparseVector* features) const; }; @@ -74,7 +75,7 @@ class ArcFactoredForest { // set eges_[*].features void ExtractFeatures(const TaggedSentence& sentence, - const std::vector >& ffs); + const ArcFeatureFunctions& ffs); const Edge& operator()(short h, short m) const { return h >= 0 ? edges_(h, m) : root_edges_[m]; diff --git a/rst_parser/arc_ff.cc b/rst_parser/arc_ff.cc index f9effbda..10885716 100644 --- a/rst_parser/arc_ff.cc +++ b/rst_parser/arc_ff.cc @@ -6,59 +6,81 @@ using namespace std; -ArcFeatureFunction::~ArcFeatureFunction() {} +struct ArcFFImpl { + ArcFFImpl() : kROOT("ROOT") {} + const string kROOT; -void ArcFeatureFunction::PrepareForInput(const TaggedSentence&) {} + void PrepareForInput(const TaggedSentence& sentence) { + (void) sentence; + } + + void EdgeFeatures(const TaggedSentence& sent, + short h, + short m, + SparseVector* features) const { + const bool is_root = (h == -1); + const string& head_word = (is_root ? kROOT : TD::Convert(sent.words[h])); + const string& head_pos = (is_root ? kROOT : TD::Convert(sent.pos[h])); + const string& mod_word = TD::Convert(sent.words[m]); + const string& mod_pos = TD::Convert(sent.pos[m]); + const bool dir = m < h; + int v = m - h; + if (v < 0) { + v= -1 - int(log(-v) / log(2)); + } else { + v= int(log(v) / log(2)); + } + static map lenmap; + int& lenfid = lenmap[v]; + if (!lenfid) { + ostringstream os; + if (v < 0) os << "LenL" << -v; else os << "LenR" << v; + lenfid = FD::Convert(os.str()); + } + features->set_value(lenfid, 1.0); + const string& lenstr = FD::Convert(lenfid); + if (!is_root) { + static int modl = FD::Convert("ModLeft"); + static int modr = FD::Convert("ModRight"); + if (dir) features->set_value(modl, 1); + else features->set_value(modr, 1); + } + if (is_root) { + ostringstream os; + os << "ROOT:" << mod_pos; + features->set_value(FD::Convert(os.str()), 1.0); + os << "_" << lenstr; + features->set_value(FD::Convert(os.str()), 1.0); + } else { // not root + ostringstream os; + os << "HM:" << head_pos << '_' << mod_pos; + features->set_value(FD::Convert(os.str()), 1.0); + os << '_' << dir; + features->set_value(FD::Convert(os.str()), 1.0); + os << '_' << lenstr; + features->set_value(FD::Convert(os.str()), 1.0); + ostringstream os2; + os2 << "LexHM:" << head_word << '_' << mod_word; + features->set_value(FD::Convert(os2.str()), 1.0); + os2 << '_' << dir; + features->set_value(FD::Convert(os2.str()), 1.0); + os2 << '_' << lenstr; + features->set_value(FD::Convert(os2.str()), 1.0); + } + } +}; -DistancePenalty::DistancePenalty(const string&) : fidw_(FD::Convert("Distance")), fidr_(FD::Convert("RootDistance")) {} +ArcFeatureFunctions::ArcFeatureFunctions() : pimpl(new ArcFFImpl) {} +ArcFeatureFunctions::~ArcFeatureFunctions() { delete pimpl; } + +void ArcFeatureFunctions::PrepareForInput(const TaggedSentence& sentence) { + pimpl->PrepareForInput(sentence); +} -void DistancePenalty::EdgeFeaturesImpl(const TaggedSentence& sent, +void ArcFeatureFunctions::EdgeFeatures(const TaggedSentence& sentence, short h, short m, SparseVector* features) const { - const bool dir = m < h; - const bool is_root = (h == -1); - int v = m - h; - if (v < 0) { - v= -1 - int(log(-v) / log(2)); - } else { - v= int(log(v) / log(2)); - } - static map lenmap; - int& lenfid = lenmap[v]; - if (!lenfid) { - ostringstream os; - if (v < 0) os << "LenL" << -v; else os << "LenR" << v; - lenfid = FD::Convert(os.str()); - } - features->set_value(lenfid, 1.0); - const string& lenstr = FD::Convert(lenfid); - if (!is_root) { - static int modl = FD::Convert("ModLeft"); - static int modr = FD::Convert("ModRight"); - if (dir) features->set_value(modl, 1); - else features->set_value(modr, 1); - } - if (is_root) { - ostringstream os; - os << "ROOT:" << TD::Convert(sent.pos[m]); - features->set_value(FD::Convert(os.str()), 1.0); - os << "_" << lenstr; - features->set_value(FD::Convert(os.str()), 1.0); - } else { // not root - ostringstream os; - os << "HM:" << TD::Convert(sent.pos[h]) << '_' << TD::Convert(sent.pos[m]); - features->set_value(FD::Convert(os.str()), 1.0); - os << '_' << dir; - features->set_value(FD::Convert(os.str()), 1.0); - os << '_' << lenstr; - features->set_value(FD::Convert(os.str()), 1.0); - ostringstream os2; - os2 << "LexHM:" << TD::Convert(sent.words[h]) << '_' << TD::Convert(sent.words[m]); - features->set_value(FD::Convert(os2.str()), 1.0); - os2 << '_' << dir; - features->set_value(FD::Convert(os2.str()), 1.0); - os2 << '_' << lenstr; - features->set_value(FD::Convert(os2.str()), 1.0); - } + pimpl->EdgeFeatures(sentence, h, m, features); } + diff --git a/rst_parser/arc_ff.h b/rst_parser/arc_ff.h index bc51fef4..52f311d2 100644 --- a/rst_parser/arc_ff.h +++ b/rst_parser/arc_ff.h @@ -7,37 +7,22 @@ #include "arc_factored.h" struct TaggedSentence; -class ArcFeatureFunction { +struct ArcFFImpl; +class ArcFeatureFunctions { public: - virtual ~ArcFeatureFunction(); + ArcFeatureFunctions(); + ~ArcFeatureFunctions(); // called once, per input, before any calls to EdgeFeatures // used to initialize sentence-specific data structures - virtual void PrepareForInput(const TaggedSentence& sentence); + void PrepareForInput(const TaggedSentence& sentence); - inline void EgdeFeatures(const TaggedSentence& sentence, - short h, - short m, - SparseVector* features) const { - EdgeFeaturesImpl(sentence, h, m, features); - } - protected: - virtual void EdgeFeaturesImpl(const TaggedSentence& sentence, - short h, - short m, - SparseVector* features) const = 0; -}; - -class DistancePenalty : public ArcFeatureFunction { - public: - DistancePenalty(const std::string& param); - protected: - virtual void EdgeFeaturesImpl(const TaggedSentence& sentence, - short h, - short m, - SparseVector* features) const; + void EdgeFeatures(const TaggedSentence& sentence, + short h, + short m, + SparseVector* features) const; private: - const int fidw_, fidr_; + ArcFFImpl* pimpl; }; #endif diff --git a/rst_parser/arc_ff_factory.h b/rst_parser/arc_ff_factory.h deleted file mode 100644 index 4237fd5d..00000000 --- a/rst_parser/arc_ff_factory.h +++ /dev/null @@ -1,42 +0,0 @@ -#ifndef _ARC_FF_FACTORY_H_ -#define _ARC_FF_FACTORY_H_ - -#include -#include -#include - -struct ArcFFFactoryBase { - virtual boost::shared_ptr Create(const std::string& param) const = 0; -}; - -template -struct ArcFFFactory : public ArcFFFactoryBase { - boost::shared_ptr Create(const std::string& param) const { - return boost::shared_ptr(new FF(param)); - } -}; - -struct ArcFFRegistry { - boost::shared_ptr Create(const std::string& name, const std::string& param) const { - std::map::const_iterator it = facts.find(name); - assert(it != facts.end()); - return it->second->Create(param); - } - - void Register(const std::string& name, ArcFFFactoryBase* fact) { - ArcFFFactoryBase*& f = facts[name]; - assert(f == NULL); - f = fact; - } - std::map facts; -}; - -std::ostream& operator<<(std::ostream& os, const ArcFFRegistry& reg) { - for (std::map::const_iterator it = reg.facts.begin(); - it != reg.facts.end(); ++it) { - os << " " << it->first << std::endl; - } - return os; -} - -#endif diff --git a/rst_parser/mst_train.cc b/rst_parser/mst_train.cc index f0403d7e..0709e7c9 100644 --- a/rst_parser/mst_train.cc +++ b/rst_parser/mst_train.cc @@ -6,7 +6,6 @@ #include #include "arc_ff.h" -#include "arc_ff_factory.h" #include "stringlib.h" #include "filelib.h" #include "tdict.h" @@ -22,7 +21,6 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) { string cfg_file; opts.add_options() ("training_data,t",po::value()->default_value("-"), "File containing training data (jsent format)") - ("feature_function,F",po::value >()->composing(), "feature function (multiple permitted)") ("weights,w",po::value(), "Optional starting weights") ("output_every_i_iterations,I",po::value()->default_value(1), "Write weights every I iterations") ("regularization_strength,C",po::value()->default_value(1.0), "Regularization strength") @@ -74,12 +72,8 @@ int main(int argc, char** argv) { int size = 1; po::variables_map conf; InitCommandLine(argc, argv, &conf); - ArcFactoredForest af(5); - ArcFFRegistry reg; - reg.Register("DistancePenalty", new ArcFFFactory); + ArcFeatureFunctions ffs; vector corpus; - vector > ffs; - ffs.push_back(boost::shared_ptr(new DistancePenalty(""))); TrainingInstance::ReadTraining(conf["training_data"].as(), &corpus, rank, size); vector forests(corpus.size()); SparseVector empirical; @@ -88,22 +82,19 @@ int main(int argc, char** argv) { TrainingInstance& cur = corpus[i]; if (rank == 0 && (i+1) % 10 == 0) { cerr << '.' << flush; flag = true; } if (rank == 0 && (i+1) % 400 == 0) { cerr << " [" << (i+1) << "]\n"; flag = false; } - for (int fi = 0; fi < ffs.size(); ++fi) { - ArcFeatureFunction& ff = *ffs[fi]; - ff.PrepareForInput(cur.ts); - SparseVector efmap; - for (int j = 0; j < cur.tree.h_m_pairs.size(); ++j) { - efmap.clear(); - ff.EgdeFeatures(cur.ts, cur.tree.h_m_pairs[j].first, - cur.tree.h_m_pairs[j].second, - &efmap); - cur.features += efmap; - } - for (int j = 0; j < cur.tree.roots.size(); ++j) { - efmap.clear(); - ff.EgdeFeatures(cur.ts, -1, cur.tree.roots[j], &efmap); - cur.features += efmap; - } + ffs.PrepareForInput(cur.ts); + SparseVector efmap; + for (int j = 0; j < cur.tree.h_m_pairs.size(); ++j) { + efmap.clear(); + ffs.EdgeFeatures(cur.ts, cur.tree.h_m_pairs[j].first, + cur.tree.h_m_pairs[j].second, + &efmap); + cur.features += efmap; + } + for (int j = 0; j < cur.tree.roots.size(); ++j) { + efmap.clear(); + ffs.EdgeFeatures(cur.ts, -1, cur.tree.roots[j], &efmap); + cur.features += efmap; } empirical += cur.features; forests[i].resize(cur.ts.words.size()); diff --git a/rst_parser/rst_parse.cc b/rst_parser/rst_parse.cc deleted file mode 100644 index 9cc1359a..00000000 --- a/rst_parser/rst_parse.cc +++ /dev/null @@ -1,126 +0,0 @@ -#include "arc_factored.h" - -#include -#include -#include -#include - -#include "timing_stats.h" -#include "arc_ff.h" -#include "arc_ff_factory.h" -#include "dep_training.h" -#include "stringlib.h" -#include "filelib.h" -#include "tdict.h" -#include "weights.h" -#include "rst.h" - -using namespace std; -namespace po = boost::program_options; - -void InitCommandLine(int argc, char** argv, po::variables_map* conf) { - po::options_description opts("Configuration options"); - string cfg_file; - opts.add_options() - ("training_data,t",po::value()->default_value("-"), "File containing training data (jsent format)") - ("feature_function,F",po::value >()->composing(), "feature function (multiple permitted)") - ("q_weights,q",po::value(), "Arc-factored weights for proposal distribution") - ("samples,n",po::value()->default_value(1000), "Number of samples"); - po::options_description clo("Command line options"); - clo.add_options() - ("config,c", po::value(&cfg_file), "Configuration file") - ("help,?", "Print this help message and exit"); - - po::options_description dconfig_options, dcmdline_options; - dconfig_options.add(opts); - dcmdline_options.add(dconfig_options).add(clo); - po::store(parse_command_line(argc, argv, dcmdline_options), *conf); - if (cfg_file.size() > 0) { - ReadFile rf(cfg_file); - po::store(po::parse_config_file(*rf.stream(), dconfig_options), *conf); - } - if (conf->count("help")) { - cerr << dcmdline_options << endl; - exit(1); - } -} - -int main(int argc, char** argv) { - po::variables_map conf; - InitCommandLine(argc, argv, &conf); - ArcFactoredForest af(5); - ArcFFRegistry reg; - reg.Register("DistancePenalty", new ArcFFFactory); - vector corpus; - vector > ffs; - ffs.push_back(boost::shared_ptr(new DistancePenalty(""))); - TrainingInstance::ReadTraining(conf["training_data"].as(), &corpus); - vector forests(corpus.size()); - SparseVector empirical; - bool flag = false; - for (int i = 0; i < corpus.size(); ++i) { - TrainingInstance& cur = corpus[i]; - if ((i+1) % 10 == 0) { cerr << '.' << flush; flag = true; } - if ((i+1) % 400 == 0) { cerr << " [" << (i+1) << "]\n"; flag = false; } - for (int fi = 0; fi < ffs.size(); ++fi) { - ArcFeatureFunction& ff = *ffs[fi]; - ff.PrepareForInput(cur.ts); - SparseVector efmap; - for (int j = 0; j < cur.tree.h_m_pairs.size(); ++j) { - efmap.clear(); - ff.EgdeFeatures(cur.ts, cur.tree.h_m_pairs[j].first, - cur.tree.h_m_pairs[j].second, - &efmap); - cur.features += efmap; - } - for (int j = 0; j < cur.tree.roots.size(); ++j) { - efmap.clear(); - ff.EgdeFeatures(cur.ts, -1, cur.tree.roots[j], &efmap); - cur.features += efmap; - } - } - empirical += cur.features; - forests[i].resize(cur.ts.words.size()); - forests[i].ExtractFeatures(cur.ts, ffs); - } - if (flag) cerr << endl; - vector weights(FD::NumFeats(), 0.0); - Weights::InitFromFile(conf["q_weights"].as(), &weights); - MT19937 rng; - SparseVector model_exp; - SparseVector sampled_exp; - int samples = conf["samples"].as(); - for (int i = 0; i < corpus.size(); ++i) { - const int num_words = corpus[i].ts.words.size(); - forests[i].Reweight(weights); - forests[i].EdgeMarginals(); - model_exp.clear(); - for (int h = -1; h < num_words; ++h) { - for (int m = 0; m < num_words; ++m) { - if (h == m) continue; - const ArcFactoredForest::Edge& edge = forests[i](h,m); - const SparseVector& fmap = edge.features; - double prob = edge.edge_prob.as_float(); - model_exp += fmap * prob; - } - } - //cerr << "TRUE EXP: " << model_exp << endl; - - forests[i].Reweight(weights); - TreeSampler ts(forests[i]); - sampled_exp.clear(); - //ostringstream os; os << "Samples_" << samples; - //Timer t(os.str()); - for (int n = 0; n < samples; ++n) { - EdgeSubset tree; - ts.SampleRandomSpanningTree(&tree, &rng); - SparseVector feats; - tree.ExtractFeatures(corpus[i].ts, ffs, &feats); - sampled_exp += feats; - } - sampled_exp /= samples; - cerr << "L2 norm of diff @ " << samples << " samples: " << (model_exp - sampled_exp).l2norm() << endl; - } - return 0; -} - diff --git a/rst_parser/rst_test.cc b/rst_parser/rst_test.cc deleted file mode 100644 index 3bb95759..00000000 --- a/rst_parser/rst_test.cc +++ /dev/null @@ -1,48 +0,0 @@ -#include "arc_factored.h" - -#include - -#include - -using namespace std; - -int main(int argc, char** argv) { - // John saw Mary - // (H -> M) - // (1 -> 2) 20 - // (1 -> 3) 3 - // (2 -> 1) 20 - // (2 -> 3) 30 - // (3 -> 2) 0 - // (3 -> 1) 11 - // (0, 2) 10 - // (0, 1) 9 - // (0, 3) 9 - ArcFactoredForest af(3); - af(0,1).edge_prob.logeq(20); - af(0,2).edge_prob.logeq(3); - af(1,0).edge_prob.logeq(20); - af(1,2).edge_prob.logeq(30); - af(2,1).edge_prob.logeq(0); - af(2,0).edge_prob.logeq(11); - af(-1,1).edge_prob.logeq(10); - af(-1,0).edge_prob.logeq(9); - af(-1,2).edge_prob.logeq(9); - EdgeSubset tree; -// af.MaximumEdgeSubset(&tree); - prob_t z; - af.EdgeMarginals(&z); - cerr << "Z = " << abs(z) << endl; - af.PickBestParentForEachWord(&tree); - cerr << tree << endl; - typedef Eigen::Matrix M3; - M3 A = M3::Zero(); - A(0,0) = prob_t(1); - A(1,0) = prob_t(3); - A(0,1) = prob_t(2); - A(1,1) = prob_t(4); - prob_t det = A.determinant(); - cerr << det.as_float() << endl; - return 0; -} - diff --git a/rst_parser/rst_train.cc b/rst_parser/rst_train.cc new file mode 100644 index 00000000..16673cdc --- /dev/null +++ b/rst_parser/rst_train.cc @@ -0,0 +1,144 @@ +#include "arc_factored.h" + +#include +#include +#include +#include + +#include "timing_stats.h" +#include "arc_ff.h" +#include "dep_training.h" +#include "stringlib.h" +#include "filelib.h" +#include "tdict.h" +#include "weights.h" +#include "rst.h" +#include "global_ff.h" + +using namespace std; +namespace po = boost::program_options; + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { + po::options_description opts("Configuration options"); + string cfg_file; + opts.add_options() + ("training_data,t",po::value()->default_value("-"), "File containing training data (jsent format)") + ("q_weights,q",po::value(), "Arc-factored weights for proposal distribution") + ("samples,n",po::value()->default_value(1000), "Number of samples"); + po::options_description clo("Command line options"); + clo.add_options() + ("config,c", po::value(&cfg_file), "Configuration file") + ("help,?", "Print this help message and exit"); + + po::options_description dconfig_options, dcmdline_options; + dconfig_options.add(opts); + dcmdline_options.add(dconfig_options).add(clo); + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + if (cfg_file.size() > 0) { + ReadFile rf(cfg_file); + po::store(po::parse_config_file(*rf.stream(), dconfig_options), *conf); + } + if (conf->count("help")) { + cerr << dcmdline_options << endl; + exit(1); + } +} + +int main(int argc, char** argv) { + po::variables_map conf; + InitCommandLine(argc, argv, &conf); + vector qweights(FD::NumFeats(), 0.0); + Weights::InitFromFile(conf["q_weights"].as(), &qweights); + vector corpus; + ArcFeatureFunctions ffs; + GlobalFeatureFunctions gff; + TrainingInstance::ReadTraining(conf["training_data"].as(), &corpus); + vector forests(corpus.size()); + vector zs(corpus.size()); + SparseVector empirical; + bool flag = false; + for (int i = 0; i < corpus.size(); ++i) { + TrainingInstance& cur = corpus[i]; + if ((i+1) % 10 == 0) { cerr << '.' << flush; flag = true; } + if ((i+1) % 400 == 0) { cerr << " [" << (i+1) << "]\n"; flag = false; } + SparseVector efmap; + ffs.PrepareForInput(cur.ts); + gff.PrepareForInput(cur.ts); + for (int j = 0; j < cur.tree.h_m_pairs.size(); ++j) { + efmap.clear(); + ffs.EdgeFeatures(cur.ts, cur.tree.h_m_pairs[j].first, + cur.tree.h_m_pairs[j].second, + &efmap); + cur.features += efmap; + } + for (int j = 0; j < cur.tree.roots.size(); ++j) { + efmap.clear(); + ffs.EdgeFeatures(cur.ts, -1, cur.tree.roots[j], &efmap); + cur.features += efmap; + } + efmap.clear(); + gff.Features(cur.ts, cur.tree, &efmap); + cur.features += efmap; + empirical += cur.features; + forests[i].resize(cur.ts.words.size()); + forests[i].ExtractFeatures(cur.ts, ffs); + forests[i].Reweight(qweights); + forests[i].EdgeMarginals(&zs[i]); + zs[i] = prob_t::One() / zs[i]; + // cerr << zs[i] << endl; + forests[i].Reweight(qweights); // EdgeMarginals overwrites edge_prob + } + if (flag) cerr << endl; + MT19937 rng; + SparseVector model_exp; + SparseVector weights; + Weights::InitSparseVector(qweights, &weights); + int samples = conf["samples"].as(); + for (int i = 0; i < corpus.size(); ++i) { +#if 0 + forests[i].EdgeMarginals(); + model_exp.clear(); + for (int h = -1; h < num_words; ++h) { + for (int m = 0; m < num_words; ++m) { + if (h == m) continue; + const ArcFactoredForest::Edge& edge = forests[i](h,m); + const SparseVector& fmap = edge.features; + double prob = edge.edge_prob.as_float(); + model_exp += fmap * prob; + } + } + cerr << "TRUE EXP: " << model_exp << endl; + forests[i].Reweight(weights); +#endif + + TreeSampler ts(forests[i]); + prob_t zhat = prob_t::Zero(); + SparseVector sampled_exp; + for (int n = 0; n < samples; ++n) { + EdgeSubset tree; + ts.SampleRandomSpanningTree(&tree, &rng); + SparseVector qfeats, gfeats; + tree.ExtractFeatures(corpus[i].ts, ffs, &qfeats); + prob_t u; u.logeq(qfeats.dot(qweights)); + const prob_t q = u / zs[i]; // proposal mass + gff.Features(corpus[i].ts, tree, &gfeats); + SparseVector tot_feats = qfeats + gfeats; + u.logeq(tot_feats.dot(weights)); + prob_t w = u / q; + zhat += w; + for (SparseVector::const_iterator it = tot_feats.begin(); it != tot_feats.end(); ++it) + sampled_exp.add_value(it->first, w * prob_t(it->second)); + } + sampled_exp /= zhat; + SparseVector tot_m; + for (SparseVector::const_iterator it = sampled_exp.begin(); it != sampled_exp.end(); ++it) + tot_m.add_value(it->first, it->second.as_float()); + //cerr << "DIFF: " << (tot_m - corpus[i].features) << endl; + const double eta = 0.03; + weights -= (tot_m - corpus[i].features) * eta; + } + cerr << "WEIGHTS.\n"; + cerr << weights << endl; + return 0; +} + -- cgit v1.2.3 From 35f9ffa85fb68e0624111a76e27955524649950a Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Thu, 19 Apr 2012 02:45:27 -0400 Subject: compute f --- rst_parser/Makefile.am | 5 +- rst_parser/dep_training.cc | 4 ++ rst_parser/rst_parse.cc | 111 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 119 insertions(+), 1 deletion(-) create mode 100644 rst_parser/rst_parse.cc (limited to 'rst_parser/Makefile.am') diff --git a/rst_parser/Makefile.am b/rst_parser/Makefile.am index 876c2237..4977f584 100644 --- a/rst_parser/Makefile.am +++ b/rst_parser/Makefile.am @@ -1,5 +1,5 @@ bin_PROGRAMS = \ - mst_train rst_train + mst_train rst_train rst_parse noinst_LIBRARIES = librst.a @@ -11,4 +11,7 @@ mst_train_LDADD = librst.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/ rst_train_SOURCES = rst_train.cc rst_train_LDADD = librst.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz +rst_parse_SOURCES = rst_parse.cc +rst_parse_LDADD = librst.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz + AM_CPPFLAGS = -W -Wall -Wno-sign-compare $(GTEST_CPPFLAGS) -I$(top_srcdir)/decoder -I$(top_srcdir)/training -I$(top_srcdir)/utils -I$(top_srcdir)/mteval -I../klm diff --git a/rst_parser/dep_training.cc b/rst_parser/dep_training.cc index e26505ec..ef97798b 100644 --- a/rst_parser/dep_training.cc +++ b/rst_parser/dep_training.cc @@ -18,6 +18,10 @@ static void ParseInstance(const string& line, int start, TrainingInstance* out, TrainingInstance& cur = *out; TaggedSentence& ts = cur.ts; EdgeSubset& tree = cur.tree; + ts.pos.clear(); + ts.words.clear(); + tree.roots.clear(); + tree.h_m_pairs.clear(); assert(obj.is()); const picojson::object& d = obj.get(); const picojson::array& ta = d.find("tokens")->second.get(); diff --git a/rst_parser/rst_parse.cc b/rst_parser/rst_parse.cc new file mode 100644 index 00000000..9c42a8f4 --- /dev/null +++ b/rst_parser/rst_parse.cc @@ -0,0 +1,111 @@ +#include "arc_factored.h" + +#include +#include +#include +#include + +#include "timing_stats.h" +#include "arc_ff.h" +#include "dep_training.h" +#include "stringlib.h" +#include "filelib.h" +#include "tdict.h" +#include "weights.h" +#include "rst.h" +#include "global_ff.h" + +using namespace std; +namespace po = boost::program_options; + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { + po::options_description opts("Configuration options"); + string cfg_file; + opts.add_options() + ("input,i",po::value()->default_value("-"), "File containing test data (jsent format)") + ("q_weights,q",po::value(), "Arc-factored weights for proposal distribution (mandatory)") + ("p_weights,p",po::value(), "Weights for target distribution (optional)") + ("samples,n",po::value()->default_value(1000), "Number of samples"); + po::options_description clo("Command line options"); + clo.add_options() + ("config,c", po::value(&cfg_file), "Configuration file") + ("help,?", "Print this help message and exit"); + + po::options_description dconfig_options, dcmdline_options; + dconfig_options.add(opts); + dcmdline_options.add(dconfig_options).add(clo); + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + if (cfg_file.size() > 0) { + ReadFile rf(cfg_file); + po::store(po::parse_config_file(*rf.stream(), dconfig_options), *conf); + } + if (conf->count("help") || conf->count("q_weights") == 0) { + cerr << dcmdline_options << endl; + exit(1); + } +} + +int main(int argc, char** argv) { + po::variables_map conf; + InitCommandLine(argc, argv, &conf); + vector qweights, pweights; + Weights::InitFromFile(conf["q_weights"].as(), &qweights); + if (conf.count("p_weights")) + Weights::InitFromFile(conf["p_weights"].as(), &pweights); + const bool global = pweights.size() > 0; + ArcFeatureFunctions ffs; + GlobalFeatureFunctions gff; + ReadFile rf(conf["input"].as()); + istream* in = rf.stream(); + TrainingInstance sent; + MT19937 rng; + int samples = conf["samples"].as(); + int totroot = 0, root_right = 0, tot = 0, cor = 0; + while(TrainingInstance::ReadInstance(in, &sent)) { + ffs.PrepareForInput(sent.ts); + if (global) gff.PrepareForInput(sent.ts); + ArcFactoredForest forest(sent.ts.pos.size()); + forest.ExtractFeatures(sent.ts, ffs); + forest.Reweight(qweights); + TreeSampler ts(forest); + double best_score = -numeric_limits::infinity(); + EdgeSubset best_tree; + for (int n = 0; n < samples; ++n) { + EdgeSubset tree; + ts.SampleRandomSpanningTree(&tree, &rng); + SparseVector qfeats, gfeats; + tree.ExtractFeatures(sent.ts, ffs, &qfeats); + double score = 0; + if (global) { + gff.Features(sent.ts, tree, &gfeats); + score = (qfeats + gfeats).dot(pweights); + } else { + score = qfeats.dot(qweights); + } + if (score > best_score) { + best_tree = tree; + best_score = score; + } + } + cerr << "BEST SCORE: " << best_score << endl; + cout << best_tree << endl; + const bool sent_has_ref = sent.tree.h_m_pairs.size() > 0; + if (sent_has_ref) { + map, bool> ref; + for (int i = 0; i < sent.tree.h_m_pairs.size(); ++i) + ref[sent.tree.h_m_pairs[i]] = true; + int ref_root = sent.tree.roots.front(); + if (ref_root == best_tree.roots.front()) { ++root_right; } + ++totroot; + for (int i = 0; i < best_tree.h_m_pairs.size(); ++i) { + if (ref[best_tree.h_m_pairs[i]]) { + ++cor; + } + ++tot; + } + } + } + cerr << "F = " << (double(cor + root_right) / (tot + totroot)) << endl; + return 0; +} + -- cgit v1.2.3