summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorChris Dyer <cdyer@cs.cmu.edu>2012-04-16 18:20:33 -0400
committerChris Dyer <cdyer@cs.cmu.edu>2012-04-16 18:20:33 -0400
commit4b38556c88c739de82b9c298261a262ec620280e (patch)
tree272124012fa38a359a7c0efb1da4499a1b4371b5
parentf2fcf9e8aa0e5dee75fd08ee915488ec1a741975 (diff)
rst sampler
-rw-r--r--rst_parser/Makefile.am7
-rw-r--r--rst_parser/dep_training.cc56
-rw-r--r--rst_parser/dep_training.h17
-rw-r--r--rst_parser/mst_train.cc58
-rw-r--r--rst_parser/rst.cc56
-rw-r--r--rst_parser/rst.h8
-rw-r--r--rst_parser/rst_parse.cc126
-rw-r--r--utils/weights.cc4
8 files changed, 260 insertions, 72 deletions
diff --git a/rst_parser/Makefile.am b/rst_parser/Makefile.am
index 2b64b43a..6e884f53 100644
--- a/rst_parser/Makefile.am
+++ b/rst_parser/Makefile.am
@@ -1,5 +1,5 @@
bin_PROGRAMS = \
- mst_train
+ mst_train rst_parse
noinst_PROGRAMS = \
rst_test
@@ -8,11 +8,14 @@ TESTS = rst_test
noinst_LIBRARIES = librst.a
-librst_a_SOURCES = arc_factored.cc arc_factored_marginals.cc rst.cc arc_ff.cc
+librst_a_SOURCES = arc_factored.cc arc_factored_marginals.cc rst.cc arc_ff.cc dep_training.cc
mst_train_SOURCES = mst_train.cc
mst_train_LDADD = librst.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a ../training/optimize.o -lz
+rst_parse_SOURCES = rst_parse.cc
+rst_parse_LDADD = librst.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz
+
rst_test_SOURCES = rst_test.cc
rst_test_LDADD = librst.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz
diff --git a/rst_parser/dep_training.cc b/rst_parser/dep_training.cc
new file mode 100644
index 00000000..de431ebc
--- /dev/null
+++ b/rst_parser/dep_training.cc
@@ -0,0 +1,56 @@
+#include "dep_training.h"
+
+#include <vector>
+#include <iostream>
+
+#include "stringlib.h"
+#include "filelib.h"
+#include "tdict.h"
+#include "picojson.h"
+
+using namespace std;
+
+void TrainingInstance::ReadTraining(const string& fname, vector<TrainingInstance>* corpus, int rank, int size) {
+ ReadFile rf(fname);
+ istream& in = *rf.stream();
+ string line;
+ string err;
+ int lc = 0;
+ bool flag = false;
+ while(getline(in, line)) {
+ ++lc;
+ if ((lc-1) % size != rank) continue;
+ if (rank == 0 && lc % 10 == 0) { cerr << '.' << flush; flag = true; }
+ if (rank == 0 && lc % 400 == 0) { cerr << " [" << lc << "]\n"; flag = false; }
+ size_t pos = line.rfind('\t');
+ assert(pos != string::npos);
+ picojson::value obj;
+ picojson::parse(obj, line.begin() + pos, line.end(), &err);
+ if (err.size() > 0) { cerr << "JSON parse error in " << lc << ": " << err << endl; abort(); }
+ corpus->push_back(TrainingInstance());
+ TrainingInstance& cur = corpus->back();
+ TaggedSentence& ts = cur.ts;
+ EdgeSubset& tree = cur.tree;
+ assert(obj.is<picojson::object>());
+ const picojson::object& d = obj.get<picojson::object>();
+ const picojson::array& ta = d.find("tokens")->second.get<picojson::array>();
+ for (unsigned i = 0; i < ta.size(); ++i) {
+ ts.words.push_back(TD::Convert(ta[i].get<picojson::array>()[0].get<string>()));
+ ts.pos.push_back(TD::Convert(ta[i].get<picojson::array>()[1].get<string>()));
+ }
+ const picojson::array& da = d.find("deps")->second.get<picojson::array>();
+ for (unsigned i = 0; i < da.size(); ++i) {
+ const picojson::array& thm = da[i].get<picojson::array>();
+ // get dep type here
+ short h = thm[2].get<double>();
+ short m = thm[1].get<double>();
+ if (h < 0)
+ tree.roots.push_back(m);
+ else
+ tree.h_m_pairs.push_back(make_pair(h,m));
+ }
+ //cerr << TD::GetString(ts.words) << endl << TD::GetString(ts.pos) << endl << tree << endl;
+ }
+ if (flag) cerr << "\nRead " << lc << " training instances\n";
+}
+
diff --git a/rst_parser/dep_training.h b/rst_parser/dep_training.h
new file mode 100644
index 00000000..73ffd298
--- /dev/null
+++ b/rst_parser/dep_training.h
@@ -0,0 +1,17 @@
+#ifndef _DEP_TRAINING_H_
+#define _DEP_TRAINING_H_
+
+#include <string>
+#include <vector>
+#include "arc_factored.h"
+#include "weights.h"
+
+struct TrainingInstance {
+ TaggedSentence ts;
+ EdgeSubset tree;
+ SparseVector<weight_t> features;
+ // reads a "Jsent" formatted dependency file
+ static void ReadTraining(const std::string& fname, std::vector<TrainingInstance>* corpus, int rank = 0, int size = 1);
+};
+
+#endif
diff --git a/rst_parser/mst_train.cc b/rst_parser/mst_train.cc
index c5cab6ec..f0403d7e 100644
--- a/rst_parser/mst_train.cc
+++ b/rst_parser/mst_train.cc
@@ -10,10 +10,9 @@
#include "stringlib.h"
#include "filelib.h"
#include "tdict.h"
-#include "picojson.h"
+#include "dep_training.h"
#include "optimize.h"
#include "weights.h"
-#include "rst.h"
using namespace std;
namespace po = boost::program_options;
@@ -47,56 +46,6 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
}
}
-struct TrainingInstance {
- TaggedSentence ts;
- EdgeSubset tree;
- SparseVector<weight_t> features;
-};
-
-void ReadTraining(const string& fname, vector<TrainingInstance>* corpus, int rank = 0, int size = 1) {
- ReadFile rf(fname);
- istream& in = *rf.stream();
- string line;
- string err;
- int lc = 0;
- bool flag = false;
- while(getline(in, line)) {
- ++lc;
- if ((lc-1) % size != rank) continue;
- if (rank == 0 && lc % 10 == 0) { cerr << '.' << flush; flag = true; }
- if (rank == 0 && lc % 400 == 0) { cerr << " [" << lc << "]\n"; flag = false; }
- size_t pos = line.rfind('\t');
- assert(pos != string::npos);
- picojson::value obj;
- picojson::parse(obj, line.begin() + pos, line.end(), &err);
- if (err.size() > 0) { cerr << "JSON parse error in " << lc << ": " << err << endl; abort(); }
- corpus->push_back(TrainingInstance());
- TrainingInstance& cur = corpus->back();
- TaggedSentence& ts = cur.ts;
- EdgeSubset& tree = cur.tree;
- assert(obj.is<picojson::object>());
- const picojson::object& d = obj.get<picojson::object>();
- const picojson::array& ta = d.find("tokens")->second.get<picojson::array>();
- for (unsigned i = 0; i < ta.size(); ++i) {
- ts.words.push_back(TD::Convert(ta[i].get<picojson::array>()[0].get<string>()));
- ts.pos.push_back(TD::Convert(ta[i].get<picojson::array>()[1].get<string>()));
- }
- const picojson::array& da = d.find("deps")->second.get<picojson::array>();
- for (unsigned i = 0; i < da.size(); ++i) {
- const picojson::array& thm = da[i].get<picojson::array>();
- // get dep type here
- short h = thm[2].get<double>();
- short m = thm[1].get<double>();
- if (h < 0)
- tree.roots.push_back(m);
- else
- tree.h_m_pairs.push_back(make_pair(h,m));
- }
- //cerr << TD::GetString(ts.words) << endl << TD::GetString(ts.pos) << endl << tree << endl;
- }
- if (flag) cerr << "\nRead " << lc << " training instances\n";
-}
-
void AddFeatures(double prob, const SparseVector<double>& fmap, vector<double>* g) {
for (SparseVector<double>::const_iterator it = fmap.begin(); it != fmap.end(); ++it)
(*g)[it->first] += it->second * prob;
@@ -131,7 +80,7 @@ int main(int argc, char** argv) {
vector<TrainingInstance> corpus;
vector<boost::shared_ptr<ArcFeatureFunction> > ffs;
ffs.push_back(boost::shared_ptr<ArcFeatureFunction>(new DistancePenalty("")));
- ReadTraining(conf["training_data"].as<string>(), &corpus, rank, size);
+ TrainingInstance::ReadTraining(conf["training_data"].as<string>(), &corpus, rank, size);
vector<ArcFactoredForest> forests(corpus.size());
SparseVector<double> empirical;
bool flag = false;
@@ -224,9 +173,6 @@ int main(int argc, char** argv) {
}
if (converged) { cerr << "CONVERGED\n"; break; }
}
- forests[0].Reweight(weights);
- TreeSampler ts(forests[0]);
- EdgeSubset tt; ts.SampleRandomSpanningTree(&tt);
return 0;
}
diff --git a/rst_parser/rst.cc b/rst_parser/rst.cc
index c4ce898e..bc91330b 100644
--- a/rst_parser/rst.cc
+++ b/rst_parser/rst.cc
@@ -3,45 +3,77 @@
using namespace std;
// David B. Wilson. Generating Random Spanning Trees More Quickly than the Cover Time.
-
+// this is an awesome algorithm
TreeSampler::TreeSampler(const ArcFactoredForest& af) : forest(af), usucc(af.size() + 1) {
- // edges are directed from modifiers to heads, to the root
+ // edges are directed from modifiers to heads, and finally to the root
+ vector<double> p;
for (int m = 1; m <= forest.size(); ++m) {
+#if USE_ALIAS_SAMPLER
+ p.clear();
+#else
SampleSet<double>& ss = usucc[m];
- for (int h = 0; h <= forest.size(); ++h)
- ss.add(forest(h-1,m-1).edge_prob.as_float());
+#endif
+ double z = 0;
+ for (int h = 0; h <= forest.size(); ++h) {
+ double u = forest(h-1,m-1).edge_prob.as_float();
+ z += u;
+#if USE_ALIAS_SAMPLER
+ p.push_back(u);
+#else
+ ss.add(u);
+#endif
+ }
+#if USE_ALIAS_SAMPLER
+ for (int i = 0; i < p.size(); ++i) { p[i] /= z; }
+ usucc[m].Init(p);
+#endif
}
}
-void TreeSampler::SampleRandomSpanningTree(EdgeSubset* tree) {
- MT19937 rng;
+void TreeSampler::SampleRandomSpanningTree(EdgeSubset* tree, MT19937* prng) {
+ MT19937& rng = *prng;
const int r = 0;
bool success = false;
while (!success) {
int roots = 0;
+ tree->h_m_pairs.clear();
+ tree->roots.clear();
vector<int> next(forest.size() + 1, -1);
vector<char> in_tree(forest.size() + 1, 0);
in_tree[r] = 1;
- for (int i = 0; i < forest.size(); ++i) {
+ //cerr << "Forest size: " << forest.size() << endl;
+ for (int i = 0; i <= forest.size(); ++i) {
+ //cerr << "Sampling starting at u=" << i << endl;
int u = i;
if (in_tree[u]) continue;
while(!in_tree[u]) {
+#if USE_ALIAS_SAMPLER
+ next[u] = usucc[u].Draw(rng);
+#else
next[u] = rng.SelectSample(usucc[u]);
+#endif
u = next[u];
}
u = i;
- cerr << (u-1);
+ //cerr << (u-1);
+ int prev = u-1;
while(!in_tree[u]) {
in_tree[u] = true;
u = next[u];
- cerr << " > " << (u-1);
- if (u == r) { ++roots; }
+ //cerr << " > " << (u-1);
+ if (u == r) {
+ ++roots;
+ tree->roots.push_back(prev);
+ } else {
+ tree->h_m_pairs.push_back(make_pair<short,short>(u-1,prev));
+ }
+ prev = u-1;
}
- cerr << endl;
+ //cerr << endl;
}
assert(roots > 0);
if (roots > 1) {
- cerr << "FAILURE\n";
+ //cerr << "FAILURE\n";
} else {
success = true;
}
diff --git a/rst_parser/rst.h b/rst_parser/rst.h
index a269ff9b..8bf389f7 100644
--- a/rst_parser/rst.h
+++ b/rst_parser/rst.h
@@ -4,12 +4,18 @@
#include <vector>
#include "sampler.h"
#include "arc_factored.h"
+#include "alias_sampler.h"
struct TreeSampler {
explicit TreeSampler(const ArcFactoredForest& af);
- void SampleRandomSpanningTree(EdgeSubset* tree);
+ void SampleRandomSpanningTree(EdgeSubset* tree, MT19937* rng);
const ArcFactoredForest& forest;
+#define USE_ALIAS_SAMPLER 1
+#if USE_ALIAS_SAMPLER
+ std::vector<AliasSampler> usucc;
+#else
std::vector<SampleSet<double> > usucc;
+#endif
};
#endif
diff --git a/rst_parser/rst_parse.cc b/rst_parser/rst_parse.cc
new file mode 100644
index 00000000..9cc1359a
--- /dev/null
+++ b/rst_parser/rst_parse.cc
@@ -0,0 +1,126 @@
+#include "arc_factored.h"
+
+#include <vector>
+#include <iostream>
+#include <boost/program_options.hpp>
+#include <boost/program_options/variables_map.hpp>
+
+#include "timing_stats.h"
+#include "arc_ff.h"
+#include "arc_ff_factory.h"
+#include "dep_training.h"
+#include "stringlib.h"
+#include "filelib.h"
+#include "tdict.h"
+#include "weights.h"
+#include "rst.h"
+
+using namespace std;
+namespace po = boost::program_options;
+
+void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
+ po::options_description opts("Configuration options");
+ string cfg_file;
+ opts.add_options()
+ ("training_data,t",po::value<string>()->default_value("-"), "File containing training data (jsent format)")
+ ("feature_function,F",po::value<vector<string> >()->composing(), "feature function (multiple permitted)")
+ ("q_weights,q",po::value<string>(), "Arc-factored weights for proposal distribution")
+ ("samples,n",po::value<unsigned>()->default_value(1000), "Number of samples");
+ po::options_description clo("Command line options");
+ clo.add_options()
+ ("config,c", po::value<string>(&cfg_file), "Configuration file")
+ ("help,?", "Print this help message and exit");
+
+ po::options_description dconfig_options, dcmdline_options;
+ dconfig_options.add(opts);
+ dcmdline_options.add(dconfig_options).add(clo);
+ po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
+ if (cfg_file.size() > 0) {
+ ReadFile rf(cfg_file);
+ po::store(po::parse_config_file(*rf.stream(), dconfig_options), *conf);
+ }
+ if (conf->count("help")) {
+ cerr << dcmdline_options << endl;
+ exit(1);
+ }
+}
+
+int main(int argc, char** argv) {
+ po::variables_map conf;
+ InitCommandLine(argc, argv, &conf);
+ ArcFactoredForest af(5);
+ ArcFFRegistry reg;
+ reg.Register("DistancePenalty", new ArcFFFactory<DistancePenalty>);
+ vector<TrainingInstance> corpus;
+ vector<boost::shared_ptr<ArcFeatureFunction> > ffs;
+ ffs.push_back(boost::shared_ptr<ArcFeatureFunction>(new DistancePenalty("")));
+ TrainingInstance::ReadTraining(conf["training_data"].as<string>(), &corpus);
+ vector<ArcFactoredForest> forests(corpus.size());
+ SparseVector<double> empirical;
+ bool flag = false;
+ for (int i = 0; i < corpus.size(); ++i) {
+ TrainingInstance& cur = corpus[i];
+ if ((i+1) % 10 == 0) { cerr << '.' << flush; flag = true; }
+ if ((i+1) % 400 == 0) { cerr << " [" << (i+1) << "]\n"; flag = false; }
+ for (int fi = 0; fi < ffs.size(); ++fi) {
+ ArcFeatureFunction& ff = *ffs[fi];
+ ff.PrepareForInput(cur.ts);
+ SparseVector<weight_t> efmap;
+ for (int j = 0; j < cur.tree.h_m_pairs.size(); ++j) {
+ efmap.clear();
+ ff.EgdeFeatures(cur.ts, cur.tree.h_m_pairs[j].first,
+ cur.tree.h_m_pairs[j].second,
+ &efmap);
+ cur.features += efmap;
+ }
+ for (int j = 0; j < cur.tree.roots.size(); ++j) {
+ efmap.clear();
+ ff.EgdeFeatures(cur.ts, -1, cur.tree.roots[j], &efmap);
+ cur.features += efmap;
+ }
+ }
+ empirical += cur.features;
+ forests[i].resize(cur.ts.words.size());
+ forests[i].ExtractFeatures(cur.ts, ffs);
+ }
+ if (flag) cerr << endl;
+ vector<weight_t> weights(FD::NumFeats(), 0.0);
+ Weights::InitFromFile(conf["q_weights"].as<string>(), &weights);
+ MT19937 rng;
+ SparseVector<double> model_exp;
+ SparseVector<double> sampled_exp;
+ int samples = conf["samples"].as<unsigned>();
+ for (int i = 0; i < corpus.size(); ++i) {
+ const int num_words = corpus[i].ts.words.size();
+ forests[i].Reweight(weights);
+ forests[i].EdgeMarginals();
+ model_exp.clear();
+ for (int h = -1; h < num_words; ++h) {
+ for (int m = 0; m < num_words; ++m) {
+ if (h == m) continue;
+ const ArcFactoredForest::Edge& edge = forests[i](h,m);
+ const SparseVector<weight_t>& fmap = edge.features;
+ double prob = edge.edge_prob.as_float();
+ model_exp += fmap * prob;
+ }
+ }
+ //cerr << "TRUE EXP: " << model_exp << endl;
+
+ forests[i].Reweight(weights);
+ TreeSampler ts(forests[i]);
+ sampled_exp.clear();
+ //ostringstream os; os << "Samples_" << samples;
+ //Timer t(os.str());
+ for (int n = 0; n < samples; ++n) {
+ EdgeSubset tree;
+ ts.SampleRandomSpanningTree(&tree, &rng);
+ SparseVector<double> feats;
+ tree.ExtractFeatures(corpus[i].ts, ffs, &feats);
+ sampled_exp += feats;
+ }
+ sampled_exp /= samples;
+ cerr << "L2 norm of diff @ " << samples << " samples: " << (model_exp - sampled_exp).l2norm() << endl;
+ }
+ return 0;
+}
+
diff --git a/utils/weights.cc b/utils/weights.cc
index ac407dfb..39c18474 100644
--- a/utils/weights.cc
+++ b/utils/weights.cc
@@ -144,8 +144,10 @@ void Weights::ShowLargestFeatures(const vector<weight_t>& w) {
vector<int> fnums(w.size());
for (int i = 0; i < w.size(); ++i)
fnums[i] = i;
+ int nf = FD::NumFeats();
+ if (nf > 10) nf = 10;
vector<int>::iterator mid = fnums.begin();
- mid += (w.size() > 10 ? 10 : w.size());
+ mid += nf;
partial_sort(fnums.begin(), mid, fnums.end(), FComp(w));
cerr << "TOP FEATURES:";
for (vector<int>::iterator i = fnums.begin(); i != mid; ++i) {