diff options
author | Patrick Simianer <p@simianer.de> | 2011-11-13 12:26:23 +0100 |
---|---|---|
committer | Patrick Simianer <p@simianer.de> | 2011-11-13 12:26:23 +0100 |
commit | effc9bfc40a0559ce36a155daa15e0dc53e93b75 (patch) | |
tree | 768a29ebad48089e3445c515d47f49c942f09124 /decoder | |
parent | ed8ca37550910a540e755ada119e814f13eeef03 (diff) | |
parent | a5592c9ab0266dbf4993e42e82e5a113316990ad (diff) |
merge upstream/master
Diffstat (limited to 'decoder')
-rw-r--r-- | decoder/Makefile.am | 1 | ||||
-rw-r--r-- | decoder/ff_csplit.cc | 2 | ||||
-rw-r--r-- | decoder/ff_klm.cc | 2 | ||||
-rw-r--r-- | decoder/hg_sampler.cc | 73 | ||||
-rw-r--r-- | decoder/hg_sampler.h | 27 |
5 files changed, 103 insertions, 2 deletions
diff --git a/decoder/Makefile.am b/decoder/Makefile.am index 6b9360d8..30eaf04d 100644 --- a/decoder/Makefile.am +++ b/decoder/Makefile.am @@ -51,6 +51,7 @@ libcdec_a_SOURCES = \ hg_io.cc \ decoder.cc \ hg_intersect.cc \ + hg_sampler.cc \ factored_lexicon_helper.cc \ viterbi.cc \ lattice.cc \ diff --git a/decoder/ff_csplit.cc b/decoder/ff_csplit.cc index dee6f4f9..3991d38f 100644 --- a/decoder/ff_csplit.cc +++ b/decoder/ff_csplit.cc @@ -155,7 +155,7 @@ void BasicCSplitFeatures::TraversalFeaturesImpl( } namespace { -struct CSVMapper : public lm::ngram::EnumerateVocab { +struct CSVMapper : public lm::EnumerateVocab { CSVMapper(vector<lm::WordIndex>* out) : out_(out), kLM_UNKNOWN_TOKEN(0) { out_->clear(); } void Add(lm::WordIndex index, const StringPiece &str) { const WordID cdec_id = TD::Convert(str.as_string()); diff --git a/decoder/ff_klm.cc b/decoder/ff_klm.cc index ed6f731e..a4b26f7c 100644 --- a/decoder/ff_klm.cc +++ b/decoder/ff_klm.cc @@ -70,7 +70,7 @@ string KLanguageModel<Model>::usage(bool /*param*/,bool /*verbose*/) { namespace { -struct VMapper : public lm::ngram::EnumerateVocab { +struct VMapper : public lm::EnumerateVocab { VMapper(vector<lm::WordIndex>* out) : out_(out), kLM_UNKNOWN_TOKEN(0) { out_->clear(); } void Add(lm::WordIndex index, const StringPiece &str) { const WordID cdec_id = TD::Convert(str.as_string()); diff --git a/decoder/hg_sampler.cc b/decoder/hg_sampler.cc new file mode 100644 index 00000000..cdf0ec3c --- /dev/null +++ b/decoder/hg_sampler.cc @@ -0,0 +1,73 @@ +#include "hg_sampler.h" + +#include <queue> + +#include "viterbi.h" +#include "inside_outside.h" + +using namespace std; + +struct SampledDerivationWeightFunction { + typedef double Weight; + explicit SampledDerivationWeightFunction(const vector<bool>& sampled) : sampled_edges(sampled) {} + double operator()(const Hypergraph::Edge& e) const { + return static_cast<double>(sampled_edges[e.id_]); + } + const vector<bool>& sampled_edges; +}; + +void HypergraphSampler::sample_hypotheses(const Hypergraph& hg, + unsigned n, + MT19937* rng, + vector<Hypothesis>* hypos) { + hypos->clear(); + hypos->resize(n); + + // compute inside probabilities + vector<prob_t> node_probs; + Inside<prob_t, EdgeProb>(hg, &node_probs, EdgeProb()); + + vector<bool> sampled_edges(hg.edges_.size()); + queue<unsigned> q; + SampleSet<prob_t> ss; + for (unsigned i = 0; i < n; ++i) { + fill(sampled_edges.begin(), sampled_edges.end(), false); + // sample derivation top down + assert(q.empty()); + Hypothesis& hyp = (*hypos)[i]; + SparseVector<double>& deriv_features = hyp.fmap; + q.push(hg.nodes_.size() - 1); + prob_t& model_score = hyp.model_score; + model_score = prob_t::One(); + while(!q.empty()) { + unsigned cur_node_id = q.front(); + q.pop(); + const Hypergraph::Node& node = hg.nodes_[cur_node_id]; + const unsigned num_in_edges = node.in_edges_.size(); + unsigned sampled_edge_idx = 0; + if (num_in_edges == 1) { + sampled_edge_idx = node.in_edges_[0]; + } else { + assert(num_in_edges > 1); + ss.clear(); + for (unsigned j = 0; j < num_in_edges; ++j) { + const Hypergraph::Edge& edge = hg.edges_[node.in_edges_[j]]; + prob_t p = edge.edge_prob_; // edge weight + for (unsigned k = 0; k < edge.tail_nodes_.size(); ++k) + p *= node_probs[edge.tail_nodes_[k]]; // tail node inside weight + ss.add(p); + } + sampled_edge_idx = node.in_edges_[rng->SelectSample(ss)]; + } + sampled_edges[sampled_edge_idx] = true; + const Hypergraph::Edge& sampled_edge = hg.edges_[sampled_edge_idx]; + deriv_features += sampled_edge.feature_values_; + model_score *= sampled_edge.edge_prob_; + //sampled_deriv->push_back(sampled_edge_idx); + for (unsigned j = 0; j < sampled_edge.tail_nodes_.size(); ++j) { + q.push(sampled_edge.tail_nodes_[j]); + } + } + Viterbi(hg, &hyp.words, ESentenceTraversal(), SampledDerivationWeightFunction(sampled_edges)); + } +} diff --git a/decoder/hg_sampler.h b/decoder/hg_sampler.h new file mode 100644 index 00000000..bf4e1eb0 --- /dev/null +++ b/decoder/hg_sampler.h @@ -0,0 +1,27 @@ +#ifndef _HG_SAMPLER_H_ +#define _HG_SAMPLER_H_ + + +#include <vector> +#include "sparse_vector.h" +#include "sampler.h" +#include "wordid.h" + +class Hypergraph; + +struct HypergraphSampler { + + struct Hypothesis { + std::vector<WordID> words; + SparseVector<double> fmap; + prob_t model_score; // log unnormalized probability + }; + + static void + sample_hypotheses(const Hypergraph& hg, + unsigned n, // how many samples to draw + MT19937* rng, + std::vector<Hypothesis>* hypos); +}; + +#endif |