From 437ee131bc4c2574768a187bf29d62dbb715abf4 Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Mon, 24 Oct 2011 18:17:24 +0100 Subject: KenLM update. EnumerateVocab moved up a namespace. Fix trie building when bigrams are pruned. Make Chris feel better about MurmurHashNative. --- decoder/ff_csplit.cc | 2 +- decoder/ff_klm.cc | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'decoder') diff --git a/decoder/ff_csplit.cc b/decoder/ff_csplit.cc index dee6f4f9..3991d38f 100644 --- a/decoder/ff_csplit.cc +++ b/decoder/ff_csplit.cc @@ -155,7 +155,7 @@ void BasicCSplitFeatures::TraversalFeaturesImpl( } namespace { -struct CSVMapper : public lm::ngram::EnumerateVocab { +struct CSVMapper : public lm::EnumerateVocab { CSVMapper(vector* out) : out_(out), kLM_UNKNOWN_TOKEN(0) { out_->clear(); } void Add(lm::WordIndex index, const StringPiece &str) { const WordID cdec_id = TD::Convert(str.as_string()); diff --git a/decoder/ff_klm.cc b/decoder/ff_klm.cc index ed6f731e..a4b26f7c 100644 --- a/decoder/ff_klm.cc +++ b/decoder/ff_klm.cc @@ -70,7 +70,7 @@ string KLanguageModel::usage(bool /*param*/,bool /*verbose*/) { namespace { -struct VMapper : public lm::ngram::EnumerateVocab { +struct VMapper : public lm::EnumerateVocab { VMapper(vector* out) : out_(out), kLM_UNKNOWN_TOKEN(0) { out_->clear(); } void Add(lm::WordIndex index, const StringPiece &str) { const WordID cdec_id = TD::Convert(str.as_string()); -- cgit v1.2.3 From a5592c9ab0266dbf4993e42e82e5a113316990ad Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Fri, 11 Nov 2011 17:12:39 -0500 Subject: optionally sample from forest to get training instances, rather than k-best it --- decoder/Makefile.am | 1 + decoder/hg_sampler.cc | 73 +++++++++++++++++++++++++++++++++++++++++++++++ decoder/hg_sampler.h | 27 ++++++++++++++++++ mira/kbest_mira.cc | 79 ++++++++++++++++++++++++++++++++++++++------------- 4 files changed, 161 insertions(+), 19 deletions(-) create mode 100644 decoder/hg_sampler.cc create mode 100644 decoder/hg_sampler.h (limited to 'decoder') diff --git a/decoder/Makefile.am b/decoder/Makefile.am index 6b9360d8..30eaf04d 100644 --- a/decoder/Makefile.am +++ b/decoder/Makefile.am @@ -51,6 +51,7 @@ libcdec_a_SOURCES = \ hg_io.cc \ decoder.cc \ hg_intersect.cc \ + hg_sampler.cc \ factored_lexicon_helper.cc \ viterbi.cc \ lattice.cc \ diff --git a/decoder/hg_sampler.cc b/decoder/hg_sampler.cc new file mode 100644 index 00000000..cdf0ec3c --- /dev/null +++ b/decoder/hg_sampler.cc @@ -0,0 +1,73 @@ +#include "hg_sampler.h" + +#include + +#include "viterbi.h" +#include "inside_outside.h" + +using namespace std; + +struct SampledDerivationWeightFunction { + typedef double Weight; + explicit SampledDerivationWeightFunction(const vector& sampled) : sampled_edges(sampled) {} + double operator()(const Hypergraph::Edge& e) const { + return static_cast(sampled_edges[e.id_]); + } + const vector& sampled_edges; +}; + +void HypergraphSampler::sample_hypotheses(const Hypergraph& hg, + unsigned n, + MT19937* rng, + vector* hypos) { + hypos->clear(); + hypos->resize(n); + + // compute inside probabilities + vector node_probs; + Inside(hg, &node_probs, EdgeProb()); + + vector sampled_edges(hg.edges_.size()); + queue q; + SampleSet ss; + for (unsigned i = 0; i < n; ++i) { + fill(sampled_edges.begin(), sampled_edges.end(), false); + // sample derivation top down + assert(q.empty()); + Hypothesis& hyp = (*hypos)[i]; + SparseVector& deriv_features = hyp.fmap; + q.push(hg.nodes_.size() - 1); + prob_t& model_score = hyp.model_score; + model_score = prob_t::One(); + while(!q.empty()) { + unsigned cur_node_id = q.front(); + q.pop(); + const Hypergraph::Node& node = hg.nodes_[cur_node_id]; + const unsigned num_in_edges = node.in_edges_.size(); + unsigned sampled_edge_idx = 0; + if (num_in_edges == 1) { + sampled_edge_idx = node.in_edges_[0]; + } else { + assert(num_in_edges > 1); + ss.clear(); + for (unsigned j = 0; j < num_in_edges; ++j) { + const Hypergraph::Edge& edge = hg.edges_[node.in_edges_[j]]; + prob_t p = edge.edge_prob_; // edge weight + for (unsigned k = 0; k < edge.tail_nodes_.size(); ++k) + p *= node_probs[edge.tail_nodes_[k]]; // tail node inside weight + ss.add(p); + } + sampled_edge_idx = node.in_edges_[rng->SelectSample(ss)]; + } + sampled_edges[sampled_edge_idx] = true; + const Hypergraph::Edge& sampled_edge = hg.edges_[sampled_edge_idx]; + deriv_features += sampled_edge.feature_values_; + model_score *= sampled_edge.edge_prob_; + //sampled_deriv->push_back(sampled_edge_idx); + for (unsigned j = 0; j < sampled_edge.tail_nodes_.size(); ++j) { + q.push(sampled_edge.tail_nodes_[j]); + } + } + Viterbi(hg, &hyp.words, ESentenceTraversal(), SampledDerivationWeightFunction(sampled_edges)); + } +} diff --git a/decoder/hg_sampler.h b/decoder/hg_sampler.h new file mode 100644 index 00000000..bf4e1eb0 --- /dev/null +++ b/decoder/hg_sampler.h @@ -0,0 +1,27 @@ +#ifndef _HG_SAMPLER_H_ +#define _HG_SAMPLER_H_ + + +#include +#include "sparse_vector.h" +#include "sampler.h" +#include "wordid.h" + +class Hypergraph; + +struct HypergraphSampler { + + struct Hypothesis { + std::vector words; + SparseVector fmap; + prob_t model_score; // log unnormalized probability + }; + + static void + sample_hypotheses(const Hypergraph& hg, + unsigned n, // how many samples to draw + MT19937* rng, + std::vector* hypos); +}; + +#endif diff --git a/mira/kbest_mira.cc b/mira/kbest_mira.cc index 459a5e6f..904eba74 100644 --- a/mira/kbest_mira.cc +++ b/mira/kbest_mira.cc @@ -10,6 +10,7 @@ #include #include +#include "hg_sampler.h" #include "sentence_metadata.h" #include "scorer.h" #include "verbose.h" @@ -54,6 +55,8 @@ bool InitCommandLine(int argc, char** argv, po::variables_map* conf) { ("max_step_size,C", po::value()->default_value(0.01), "regularization strength (C)") ("mt_metric_scale,s", po::value()->default_value(1.0), "Amount to scale MT loss function by") ("k_best_size,k", po::value()->default_value(250), "Size of hypothesis list to search for oracles") + ("sample_forest,f", "Instead of a k-best list, sample k hypotheses from the decoder's forest") + ("sample_forest_unit_weight_vector,x", "Before sampling (must use -f option), rescale the weight vector used so it has unit length; this may improve the quality of the samples") ("random_seed,S", po::value(), "Random seed (if not specified, /dev/random will be used)") ("decoder_config,c",po::value(),"Decoder configuration file"); po::options_description clo("Command line options"); @@ -91,11 +94,12 @@ struct GoodBadOracle { }; struct TrainingObserver : public DecoderObserver { - TrainingObserver(const int k, const DocScorer& d, vector* o) : ds(d), oracles(*o), kbest_size(k) {} + TrainingObserver(const int k, const DocScorer& d, bool sf, vector* o) : ds(d), oracles(*o), kbest_size(k), sample_forest(sf) {} const DocScorer& ds; vector& oracles; shared_ptr cur_best; const int kbest_size; + const bool sample_forest; const HypothesisInfo& GetCurrentBestHypothesis() const { return *cur_best; @@ -116,24 +120,43 @@ struct TrainingObserver : public DecoderObserver { shared_ptr& cur_good = oracles[sent_id].good; shared_ptr& cur_bad = oracles[sent_id].bad; cur_bad.reset(); // TODO get rid of?? - KBest::KBestDerivations, ESentenceTraversal> kbest(forest, kbest_size); - for (int i = 0; i < kbest_size; ++i) { - const KBest::KBestDerivations, ESentenceTraversal>::Derivation* d = - kbest.LazyKthBest(forest.nodes_.size() - 1, i); - if (!d) break; - float sentscore = ds[sent_id]->ScoreCandidate(d->yield)->ComputeScore(); - if (invert_score) sentscore *= -1.0; - // cerr << TD::GetString(d->yield) << " ||| " << d->score << " ||| " << sentscore << endl; - if (i == 0) - cur_best = MakeHypothesisInfo(d->feature_values, sentscore); - if (!cur_good || sentscore > cur_good->mt_metric) - cur_good = MakeHypothesisInfo(d->feature_values, sentscore); - if (!cur_bad || sentscore < cur_bad->mt_metric) - cur_bad = MakeHypothesisInfo(d->feature_values, sentscore); + + if (sample_forest) { + vector cur_prediction; + ViterbiESentence(forest, &cur_prediction); + float sentscore = ds[sent_id]->ScoreCandidate(cur_prediction)->ComputeScore(); + cur_best = MakeHypothesisInfo(ViterbiFeatures(forest), sentscore); + + vector samples; + HypergraphSampler::sample_hypotheses(forest, kbest_size, &*rng, &samples); + for (unsigned i = 0; i < samples.size(); ++i) { + sentscore = ds[sent_id]->ScoreCandidate(samples[i].words)->ComputeScore(); + if (invert_score) sentscore *= -1.0; + if (!cur_good || sentscore > cur_good->mt_metric) + cur_good = MakeHypothesisInfo(samples[i].fmap, sentscore); + if (!cur_bad || sentscore < cur_bad->mt_metric) + cur_bad = MakeHypothesisInfo(samples[i].fmap, sentscore); + } + } else { + KBest::KBestDerivations, ESentenceTraversal> kbest(forest, kbest_size); + for (int i = 0; i < kbest_size; ++i) { + const KBest::KBestDerivations, ESentenceTraversal>::Derivation* d = + kbest.LazyKthBest(forest.nodes_.size() - 1, i); + if (!d) break; + float sentscore = ds[sent_id]->ScoreCandidate(d->yield)->ComputeScore(); + if (invert_score) sentscore *= -1.0; + // cerr << TD::GetString(d->yield) << " ||| " << d->score << " ||| " << sentscore << endl; + if (i == 0) + cur_best = MakeHypothesisInfo(d->feature_values, sentscore); + if (!cur_good || sentscore > cur_good->mt_metric) + cur_good = MakeHypothesisInfo(d->feature_values, sentscore); + if (!cur_bad || sentscore < cur_bad->mt_metric) + cur_bad = MakeHypothesisInfo(d->feature_values, sentscore); + } + //cerr << "GOOD: " << cur_good->mt_metric << endl; + //cerr << " CUR: " << cur_best->mt_metric << endl; + //cerr << " BAD: " << cur_bad->mt_metric << endl; } - //cerr << "GOOD: " << cur_good->mt_metric << endl; - //cerr << " CUR: " << cur_best->mt_metric << endl; - //cerr << " BAD: " << cur_bad->mt_metric << endl; } }; @@ -164,6 +187,12 @@ int main(int argc, char** argv) { rng.reset(new MT19937(conf["random_seed"].as())); else rng.reset(new MT19937); + const bool sample_forest = conf.count("sample_forest") > 0; + const bool sample_forest_unit_weight_vector = conf.count("sample_forest_unit_weight_vector") > 0; + if (sample_forest_unit_weight_vector && !sample_forest) { + cerr << "Cannot --sample_forest_unit_weight_vector without --sample_forest" << endl; + return 1; + } vector corpus; ReadTrainingCorpus(conf["source"].as(), &corpus); const string metric_name = conf["mt_metric"].as(); @@ -195,7 +224,7 @@ int main(int argc, char** argv) { assert(corpus.size() > 0); vector oracles(corpus.size()); - TrainingObserver observer(conf["k_best_size"].as(), ds, &oracles); + TrainingObserver observer(conf["k_best_size"].as(), ds, sample_forest, &oracles); int cur_sent = 0; int lcount = 0; int normalizer = 0; @@ -234,7 +263,19 @@ int main(int argc, char** argv) { cerr << "PASS " << (lcount / corpus.size() + 1) << endl; } decoder.SetId(order[cur_sent]); + double sc = 1.0; + if (sample_forest_unit_weight_vector) { + sc = lambdas.l2norm(); + if (sc > 0) { + for (unsigned i = 0; i < dense_weights.size(); ++i) + dense_weights[i] /= sc; + } + } decoder.Decode(corpus[order[cur_sent]], &observer); // update oracles + if (sc && sc != 1.0) { + for (unsigned i = 0; i < dense_weights.size(); ++i) + dense_weights[i] *= sc; + } const HypothesisInfo& cur_hyp = observer.GetCurrentBestHypothesis(); const HypothesisInfo& cur_good = *oracles[order[cur_sent]].good; const HypothesisInfo& cur_bad = *oracles[order[cur_sent]].bad; -- cgit v1.2.3