From 0a47dda0e980d1fb6222a1f548649914427555b2 Mon Sep 17 00:00:00 2001
From: Patrick Simianer
Date: Fri, 23 Sep 2011 22:02:45 +0200
Subject: more renaming, random pair sampler uses boost rng
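
sample_hg.{cc,h} is renamed to hgsampler.{cc,h}, sample_all() to
sample_all_pairs(), and sample_rand() to sample_rand_pairs(). The
random pair sampler now takes a caller-owned MT19937 (cdec's wrapper
around the boost mersenne twister) and draws its coin flip from it
instead of from rand(). A sketch of the new test — next() returns a
uniform double in [0,1), so this replaces the old rand() % 2:

    if (prng->next() < .5) {
      // keep the pair (i, j)
    }

approx_bleu() now applies the brevity penalty and the 0.9 factor from
Chiang et al. '08 itself, and the scorer comments in score.cc carry
the paper references.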
---
dtrain/Makefile.am | 2 +-
dtrain/dtrain.cc | 10 +--
dtrain/hgsampler.cc | 74 ++++++++++++++++++++++
dtrain/hgsampler.h | 29 +++++++++
dtrain/ksampler.h | 2 +-
dtrain/pairsampling.h | 35 ++++++-----
dtrain/sample_hg.cc | 74 ----------------------
dtrain/sample_hg.h | 24 --------
dtrain/score.cc | 165 ++++++++++++++++++++++----------------------------
dtrain/score.h | 53 +++++++---------
10 files changed, 224 insertions(+), 244 deletions(-)
create mode 100644 dtrain/hgsampler.cc
create mode 100644 dtrain/hgsampler.h
delete mode 100644 dtrain/sample_hg.cc
delete mode 100644 dtrain/sample_hg.h
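A minimal usage sketch of the renamed sampler API (dump_samples is a
hypothetical helper, not part of this commit), assuming a
decoder-produced Hypergraph and a caller-owned MT19937 as in
dtrain.cc:

    #include <iostream>
    #include <vector>

    #include "hgsampler.h"
    #include "sampler.h"
    #include "tdict.h"

    void dump_samples(const Hypergraph& hg, unsigned k, MT19937* rng) {
      std::vector<HypergraphSampler::Hypothesis> samples;
      // draws k derivations top down, weighted by inside scores
      HypergraphSampler::sample_hypotheses(hg, k, rng, &samples);
      for (unsigned i = 0; i < samples.size(); ++i)
        std::cerr << samples[i].model_score << "\t"
                  << TD::GetString(samples[i].words) << std::endl;
    }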
diff --git a/dtrain/Makefile.am b/dtrain/Makefile.am
index 9b5df8bf..12084a70 100644
--- a/dtrain/Makefile.am
+++ b/dtrain/Makefile.am
@@ -1,7 +1,7 @@
# TODO I'm sure I can leave something out.
bin_PROGRAMS = dtrain
-dtrain_SOURCES = dtrain.cc score.cc sample_hg.cc
+dtrain_SOURCES = dtrain.cc score.cc hgsampler.cc
dtrain_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz -lboost_filesystem -lboost_iostreams
AM_CPPFLAGS = -W -Wall -Wno-sign-compare -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval
diff --git a/dtrain/dtrain.cc b/dtrain/dtrain.cc
index 01821b30..01119997 100644
--- a/dtrain/dtrain.cc
+++ b/dtrain/dtrain.cc
@@ -347,10 +347,9 @@ main( int argc, char** argv )
cand_len = kb->sents[i].size();
}
NgramCounts counts_tmp = global_counts + counts;
- // TODO as param
- score = 0.9 * scorer( counts_tmp,
- global_ref_len,
- global_hyp_len + cand_len, N, bleu_weights );
+ score = .9*scorer( counts_tmp,
+ global_ref_len,
+ global_hyp_len + cand_len, N, bleu_weights );
} else {
// other scorers
cand_len = kb->sents[i].size();
@@ -381,7 +380,8 @@ main( int argc, char** argv )
if ( !noup ) {
TrainingInstances pairs;
- sample_all( kb, pairs );
+ sample_all_pairs(kb, pairs);
+ //sample_rand_pairs( kb, pairs, &rng );
for ( TrainingInstances::iterator ti = pairs.begin();
ti != pairs.end(); ti++ ) {
diff --git a/dtrain/hgsampler.cc b/dtrain/hgsampler.cc
new file mode 100644
index 00000000..7a00a3d3
--- /dev/null
+++ b/dtrain/hgsampler.cc
@@ -0,0 +1,74 @@
+#include "hgsampler.h"
+
+#include <queue>
+
+#include "viterbi.h"
+#include "inside_outside.h"
+
+using namespace std;
+
+struct SampledDerivationWeightFunction {
+ typedef double Weight;
+ explicit SampledDerivationWeightFunction(const vector<bool>& sampled) : sampled_edges(sampled) {}
+ double operator()(const Hypergraph::Edge& e) const {
+ return static_cast<double>(sampled_edges[e.id_]);
+ }
+ const vector<bool>& sampled_edges;
+};
+
+void HypergraphSampler::sample_hypotheses(const Hypergraph& hg,
+ unsigned n,
+ MT19937* rng,
+ vector<Hypothesis>* hypos) {
+ hypos->clear();
+ hypos->resize(n);
+
+ // compute inside probabilities
+ vector<prob_t> node_probs;
+ Inside(hg, &node_probs, EdgeProb());
+
+ vector<bool> sampled_edges(hg.edges_.size());
+ queue<unsigned> q;
+ SampleSet<prob_t> ss;
+ for (unsigned i = 0; i < n; ++i) {
+ fill(sampled_edges.begin(), sampled_edges.end(), false);
+ // sample derivation top down
+ assert(q.empty());
+ Hypothesis& hyp = (*hypos)[i];
+ SparseVector<double>& deriv_features = hyp.fmap;
+ q.push(hg.nodes_.size() - 1);
+ prob_t& model_score = hyp.model_score;
+ model_score = prob_t::One();
+ while(!q.empty()) {
+ unsigned cur_node_id = q.front();
+ q.pop();
+ const Hypergraph::Node& node = hg.nodes_[cur_node_id];
+ const unsigned num_in_edges = node.in_edges_.size();
+ unsigned sampled_edge_idx = 0;
+ if (num_in_edges == 1) {
+ sampled_edge_idx = node.in_edges_[0];
+ } else {
+ assert(num_in_edges > 1);
+ ss.clear();
+ for (unsigned j = 0; j < num_in_edges; ++j) {
+ const Hypergraph::Edge& edge = hg.edges_[node.in_edges_[j]];
+ prob_t p = edge.edge_prob_; // edge weight
+ for (unsigned k = 0; k < edge.tail_nodes_.size(); ++k)
+ p *= node_probs[edge.tail_nodes_[k]]; // tail node inside weight
+ ss.add(p);
+ }
+ sampled_edge_idx = node.in_edges_[rng->SelectSample(ss)];
+ }
+ sampled_edges[sampled_edge_idx] = true;
+ const Hypergraph::Edge& sampled_edge = hg.edges_[sampled_edge_idx];
+ deriv_features += sampled_edge.feature_values_;
+ model_score *= sampled_edge.edge_prob_;
+ //sampled_deriv->push_back(sampled_edge_idx);
+ for (unsigned j = 0; j < sampled_edge.tail_nodes_.size(); ++j) {
+ q.push(sampled_edge.tail_nodes_[j]);
+ }
+ }
+ Viterbi(hg, &hyp.words, ESentenceTraversal(), SampledDerivationWeightFunction(sampled_edges));
+ }
+}
+
diff --git a/dtrain/hgsampler.h b/dtrain/hgsampler.h
new file mode 100644
index 00000000..b840c07f
--- /dev/null
+++ b/dtrain/hgsampler.h
@@ -0,0 +1,29 @@
+#ifndef _DTRAIN_HGSAMPLER_H_
+#define _DTRAIN_HGSAMPLER_H_
+
+
+#include <vector>
+#include "sparse_vector.h"
+#include "sampler.h"
+#include "wordid.h"
+
+class Hypergraph;
+
+struct HypergraphSampler {
+
+ struct Hypothesis {
+ std::vector<WordID> words;
+ SparseVector<double> fmap;
+ prob_t model_score;
+ };
+
+ static void
+ sample_hypotheses(const Hypergraph& hg,
+ unsigned n,
+ MT19937* rng,
+ std::vector<Hypothesis>* hypos);
+};
+
+
+#endif
+
diff --git a/dtrain/ksampler.h b/dtrain/ksampler.h
index a28b69c9..914e9723 100644
--- a/dtrain/ksampler.h
+++ b/dtrain/ksampler.h
@@ -2,7 +2,7 @@
#define _DTRAIN_KSAMPLER_H_
#include "kbest.h"
-#include "sample_hg.h"
+#include "hgsampler.h"
#include "sampler.h"
namespace dtrain
diff --git a/dtrain/pairsampling.h b/dtrain/pairsampling.h
index 502901af..9774ba4a 100644
--- a/dtrain/pairsampling.h
+++ b/dtrain/pairsampling.h
@@ -1,9 +1,8 @@
-#ifndef _DTRAIN_SAMPLE_H_
-#define _DTRAIN_SAMPLE_H_
-
+#ifndef _DTRAIN_PAIRSAMPLING_H_
+#define _DTRAIN_PAIRSAMPLING_H_
#include "kbestget.h"
-
+#include "sampler.h" // cdec MT19937
namespace dtrain
{
@@ -11,19 +10,18 @@ namespace dtrain
struct TPair
{
- SparseVector<double> first, second;
- size_t first_rank, second_rank;
- double first_score, second_score;
+ SparseVector<double> first, second;
+ size_t first_rank, second_rank;
+ double first_score, second_score;
};
typedef vector<TPair> TrainingInstances;
-
void
-sample_all( KBestList* kb, TrainingInstances &training )
+sample_all_pairs(KBestList* kb, TrainingInstances &training)
{
- for ( size_t i = 0; i < kb->GetSize()-1; i++ ) {
- for ( size_t j = i+1; j < kb->GetSize(); j++ ) {
+ for (size_t i = 0; i < kb->GetSize()-1; i++) {
+ for (size_t j = i+1; j < kb->GetSize(); j++) {
TPair p;
p.first = kb->feats[i];
p.second = kb->feats[j];
@@ -31,18 +29,18 @@ sample_all( KBestList* kb, TrainingInstances &training )
p.second_rank = j;
p.first_score = kb->scores[i];
p.second_score = kb->scores[j];
- training.push_back( p );
+ training.push_back(p);
}
}
}
void
-sample_rand( KBestList* kb, TrainingInstances &training )
+sample_rand_pairs(KBestList* kb, TrainingInstances &training, MT19937* prng)
{
- srand( time(NULL) );
- for ( size_t i = 0; i < kb->GetSize()-1; i++ ) {
- for ( size_t j = i+1; j < kb->GetSize(); j++ ) {
- if ( rand() % 2 ) {
+ srand(time(NULL));
+ for (size_t i = 0; i < kb->GetSize()-1; i++) {
+ for (size_t j = i+1; j < kb->GetSize(); j++) {
+ if (prng->next() < .5) {
TPair p;
p.first = kb->feats[i];
p.second = kb->feats[j];
@@ -50,10 +48,11 @@ sample_rand( KBestList* kb, TrainingInstances &training )
p.second_rank = j;
p.first_score = kb->scores[i];
p.second_score = kb->scores[j];
- training.push_back( p );
+ training.push_back(p);
}
}
}
+ cout << training.size() << " sampled" << endl;
}
diff --git a/dtrain/sample_hg.cc b/dtrain/sample_hg.cc
deleted file mode 100644
index 33872fb8..00000000
--- a/dtrain/sample_hg.cc
+++ /dev/null
@@ -1,74 +0,0 @@
-#include "sample_hg.h"
-
-#include <queue>
-
-#include "viterbi.h"
-#include "inside_outside.h"
-
-using namespace std;
-
-struct SampledDerivationWeightFunction {
- typedef double Weight;
- explicit SampledDerivationWeightFunction(const vector<bool>& sampled) : sampled_edges(sampled) {}
- double operator()(const Hypergraph::Edge& e) const {
- return static_cast<double>(sampled_edges[e.id_]);
- }
- const vector<bool>& sampled_edges;
-};
-
-void HypergraphSampler::sample_hypotheses(const Hypergraph& hg,
- unsigned n,
- MT19937* rng,
- vector<Hypothesis>* hypos) {
- hypos->clear();
- hypos->resize(n);
-
- // compute inside probabilities
- vector<prob_t> node_probs;
- Inside(hg, &node_probs, EdgeProb());
-
- vector<bool> sampled_edges(hg.edges_.size());
- queue<unsigned> q;
- SampleSet<prob_t> ss;
- for (unsigned i = 0; i < n; ++i) {
- fill(sampled_edges.begin(), sampled_edges.end(), false);
- // sample derivation top down
- assert(q.empty());
- Hypothesis& hyp = (*hypos)[i];
- SparseVector<double>& deriv_features = hyp.fmap;
- q.push(hg.nodes_.size() - 1);
- prob_t& model_score = hyp.model_score;
- model_score = prob_t::One();
- while(!q.empty()) {
- unsigned cur_node_id = q.front();
- q.pop();
- const Hypergraph::Node& node = hg.nodes_[cur_node_id];
- const unsigned num_in_edges = node.in_edges_.size();
- unsigned sampled_edge_idx = 0;
- if (num_in_edges == 1) {
- sampled_edge_idx = node.in_edges_[0];
- } else {
- assert(num_in_edges > 1);
- ss.clear();
- for (unsigned j = 0; j < num_in_edges; ++j) {
- const Hypergraph::Edge& edge = hg.edges_[node.in_edges_[j]];
- prob_t p = edge.edge_prob_; // edge weight
- for (unsigned k = 0; k < edge.tail_nodes_.size(); ++k)
- p *= node_probs[edge.tail_nodes_[k]]; // tail node inside weight
- ss.add(p);
- }
- sampled_edge_idx = node.in_edges_[rng->SelectSample(ss)];
- }
- sampled_edges[sampled_edge_idx] = true;
- const Hypergraph::Edge& sampled_edge = hg.edges_[sampled_edge_idx];
- deriv_features += sampled_edge.feature_values_;
- model_score *= sampled_edge.edge_prob_;
- //sampled_deriv->push_back(sampled_edge_idx);
- for (unsigned j = 0; j < sampled_edge.tail_nodes_.size(); ++j) {
- q.push(sampled_edge.tail_nodes_[j]);
- }
- }
- Viterbi(hg, &hyp.words, ESentenceTraversal(), SampledDerivationWeightFunction(sampled_edges));
- }
-}
-
diff --git a/dtrain/sample_hg.h b/dtrain/sample_hg.h
deleted file mode 100644
index 932fd369..00000000
--- a/dtrain/sample_hg.h
+++ /dev/null
@@ -1,24 +0,0 @@
-#ifndef _SAMPLE_HG_H_
-#define _SAMPLE_HG_H_
-
-#include <vector>
-#include "sparse_vector.h"
-#include "sampler.h"
-#include "wordid.h"
-
-class Hypergraph;
-
-struct HypergraphSampler {
- struct Hypothesis {
- std::vector<WordID> words;
- SparseVector<double> fmap;
- prob_t model_score;
- };
-
- static void sample_hypotheses(const Hypergraph& hg,
- unsigned n,
- MT19937* rng,
- std::vector<Hypothesis>* hypos);
-};
-
-#endif
diff --git a/dtrain/score.cc b/dtrain/score.cc
index 1e98c11d..d08e87f3 100644
--- a/dtrain/score.cc
+++ b/dtrain/score.cc
@@ -1,166 +1,149 @@
#include "score.h"
-
namespace dtrain
{
-/******************************************************************************
- * NGRAMS
- *
- *
- * make_ngrams
- *
- */
typedef map<vector<WordID>, size_t> Ngrams;
Ngrams
-make_ngrams( vector<WordID>& s, size_t N )
+make_ngrams(vector<WordID>& s, size_t N)
{
Ngrams ngrams;
vector<WordID> ng;
- for ( size_t i = 0; i < s.size(); i++ ) {
+ for (size_t i = 0; i < s.size(); i++) {
ng.clear();
- for ( size_t j = i; j < min( i+N, s.size() ); j++ ) {
- ng.push_back( s[j] );
+ for (size_t j = i; j < min(i+N, s.size()); j++) {
+ ng.push_back(s[j]);
ngrams[ng]++;
}
}
return ngrams;
}
-
-/*
- * ngram_matches
- *
- */
NgramCounts
-make_ngram_counts( vector<WordID> hyp, vector<WordID> ref, size_t N )
+make_ngram_counts(vector<WordID> hyp, vector<WordID> ref, size_t N)
{
- Ngrams hyp_ngrams = make_ngrams( hyp, N );
- Ngrams ref_ngrams = make_ngrams( ref, N );
- NgramCounts counts( N );
+ Ngrams hyp_ngrams = make_ngrams(hyp, N);
+ Ngrams ref_ngrams = make_ngrams(ref, N);
+ NgramCounts counts(N);
Ngrams::iterator it;
Ngrams::iterator ti;
- for ( it = hyp_ngrams.begin(); it != hyp_ngrams.end(); it++ ) {
- ti = ref_ngrams.find( it->first );
- if ( ti != ref_ngrams.end() ) {
- counts.add( it->second, ti->second, it->first.size() - 1 );
+ for (it = hyp_ngrams.begin(); it != hyp_ngrams.end(); it++) {
+ ti = ref_ngrams.find(it->first);
+ if (ti != ref_ngrams.end()) {
+ counts.add(it->second, ti->second, it->first.size() - 1);
} else {
- counts.add( it->second, 0, it->first.size() - 1 );
+ counts.add(it->second, 0, it->first.size() - 1);
}
}
return counts;
}
-
-/******************************************************************************
- * SCORERS
- *
+/*
+ * bleu
*
- * brevity_penaly
+ * as in "BLEU: a Method for Automatic Evaluation
+ * of Machine Translation"
+ * (Papineni et al. '02)
*
+ * NOTE: 0 if one n in {1..N} has 0 count
*/
double
-brevity_penaly( const size_t hyp_len, const size_t ref_len )
+brevity_penaly(const size_t hyp_len, const size_t ref_len)
{
- if ( hyp_len > ref_len ) return 1;
- return exp( 1 - (double)ref_len/(double)hyp_len );
+ if (hyp_len > ref_len) return 1;
+ return exp(1 - (double)ref_len/(double)hyp_len);
}
-
-
-/*
- * bleu
- * as in "BLEU: a Method for Automatic Evaluation of Machine Translation" (Papineni et al. '02)
- * page TODO
- * 0 if for N one of the counts = 0
- */
double
-bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len,
- size_t N, vector<float> weights )
+bleu(NgramCounts& counts, const size_t hyp_len, const size_t ref_len,
+ size_t N, vector<float> weights )
{
- if ( hyp_len == 0 || ref_len == 0 ) return 0;
- if ( ref_len < N ) N = ref_len;
+ if (hyp_len == 0 || ref_len == 0) return 0;
+ if (ref_len < N) N = ref_len;
float N_ = (float)N;
- if ( weights.empty() )
+ if (weights.empty())
{
- for ( size_t i = 0; i < N; i++ ) weights.push_back( 1/N_ );
+ for (size_t i = 0; i < N; i++) weights.push_back(1/N_);
}
double sum = 0;
- for ( size_t i = 0; i < N; i++ ) {
- if ( counts.clipped[i] == 0 || counts.sum[i] == 0 ) return 0;
- sum += weights[i] * log( (double)counts.clipped[i] / (double)counts.sum[i] );
+ for (size_t i = 0; i < N; i++) {
+ if (counts.clipped[i] == 0 || counts.sum[i] == 0) return 0;
+ sum += weights[i] * log((double)counts.clipped[i] / (double)counts.sum[i]);
}
- return brevity_penaly( hyp_len, ref_len ) * exp( sum );
+ return brevity_penaly(hyp_len, ref_len) * exp(sum);
}
-
/*
- * stupid_bleu
- * as in "ORANGE: a Method for Evaluating Automatic Evaluation Metrics for Machine Translation (Lin & Och '04)
- * page TODO
- * 0 iff no 1gram match
+ * 'stupid' bleu
+ *
+ * as in "ORANGE: a Method for Evaluating
+ * Automatic Evaluation Metrics
+ * for Machine Translation"
+ * (Lin & Och '04)
+ *
+ * NOTE: 0 iff no 1gram match
*/
double
-stupid_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len,
- size_t N, vector<float> weights )
+stupid_bleu(NgramCounts& counts, const size_t hyp_len, const size_t ref_len,
+ size_t N, vector<float> weights )
{
- if ( hyp_len == 0 || ref_len == 0 ) return 0;
- if ( ref_len < N ) N = ref_len;
+ if (hyp_len == 0 || ref_len == 0) return 0;
+ if (ref_len < N) N = ref_len;
float N_ = (float)N;
- if ( weights.empty() )
+ if (weights.empty())
{
- for ( size_t i = 0; i < N; i++ ) weights.push_back( 1/N_ );
+ for (size_t i = 0; i < N; i++) weights.push_back(1/N_);
}
double sum = 0;
float add = 0;
- for ( size_t i = 0; i < N; i++ ) {
- if ( i == 1 ) add = 1;
- sum += weights[i] * log( ((double)counts.clipped[i] + add) / ((double)counts.sum[i] + add) );
+ for (size_t i = 0; i < N; i++) {
+ if (i == 1) add = 1;
+ sum += weights[i] * log(((double)counts.clipped[i] + add) / ((double)counts.sum[i] + add));
}
- return brevity_penaly( hyp_len, ref_len ) * exp( sum );
+ return brevity_penaly(hyp_len, ref_len) * exp(sum);
}
-
/*
- * smooth_bleu
- * as in "An End-to-End Discriminative Approach to Machine Translation" (Liang et al. '06)
- * page TODO
- * max. 0.9375
+ * smooth bleu
+ *
+ * as in "An End-to-End Discriminative Approach
+ * to Machine Translation"
+ * (Liang et al. '06)
+ *
+ * NOTE: max is 0.9375
*/
double
-smooth_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len,
- const size_t N, vector<float> weights )
+smooth_bleu(NgramCounts& counts, const size_t hyp_len, const size_t ref_len,
+ const size_t N, vector<float> weights )
{
- if ( hyp_len == 0 || ref_len == 0 ) return 0;
+ if (hyp_len == 0 || ref_len == 0) return 0;
float N_ = (float)N;
- if ( weights.empty() )
+ if (weights.empty())
{
- for ( size_t i = 0; i < N; i++ ) weights.push_back( 1/N_ );
+ for (size_t i = 0; i < N; i++) weights.push_back(1/N_);
}
double sum = 0;
float j = 1;
- for ( size_t i = 0; i < N; i++ ) {
- if ( counts.clipped[i] == 0 || counts.sum[i] == 0) continue;
- sum += exp((weights[i] * log((double)counts.clipped[i]/(double)counts.sum[i]))) / pow( 2, N_-j+1 );
+ for (size_t i = 0; i < N; i++) {
+ if (counts.clipped[i] == 0 || counts.sum[i] == 0) continue;
+ sum += exp((weights[i] * log((double)counts.clipped[i]/(double)counts.sum[i]))) / pow(2, N_-j+1);
j++;
}
- return brevity_penaly( hyp_len, ref_len ) * sum;
+ return brevity_penaly(hyp_len, ref_len) * sum;
}
-
/*
- * approx_bleu
- * as in "Online Large-Margin Training for Statistical Machine Translation" (Watanabe et al. '07)
- * CHIANG, RESNIK, synt struct features
- * .9*
- * page TODO
+ * approx. bleu
*
+ * as in "Online Large-Margin Training of Syntactic
+ * and Structural Translation Features"
+ * (Chiang et al. '08)
*/
double
-approx_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len,
- const size_t N, vector<float> weights )
+approx_bleu(NgramCounts& counts, const size_t hyp_len, const size_t ref_len,
+ const size_t N, vector<float> weights)
{
- return bleu( counts, hyp_len, ref_len, N, weights );
+ return brevity_penaly(hyp_len, ref_len)
+ * 0.9 * bleu(counts, hyp_len, ref_len, N, weights);
}
diff --git a/dtrain/score.h b/dtrain/score.h
index e88387c5..0afb6237 100644
--- a/dtrain/score.h
+++ b/dtrain/score.h
@@ -1,29 +1,23 @@
#ifndef _DTRAIN_SCORE_H_
#define _DTRAIN_SCORE_H_
-
#include <iostream>
#include <vector>
#include <map>