author     Patrick Simianer <p@simianer.de>  2011-09-25 22:59:24 +0200
committer  Patrick Simianer <p@simianer.de>  2011-09-25 22:59:24 +0200
commit     899a30eb4e53d539ee0b846f38d7524fec811864 (patch)
tree       77d02d7c6746b8d5249bca9900fdaef4c15ccf77 /dtrain
parent     ec8e1b92b0a898754eb11d72741c8af39854c706 (diff)
size_t -> unsigned
Diffstat (limited to 'dtrain')
-rw-r--r--  dtrain/dtrain.cc                61
-rw-r--r--  dtrain/dtrain.h                 12
-rw-r--r--  dtrain/kbestget.h               14
-rw-r--r--  dtrain/ksampler.h                6
-rw-r--r--  dtrain/pairsampling.h            8
-rw-r--r--  dtrain/score.cc                 63
-rw-r--r--  dtrain/score.h                  32
-rw-r--r--  dtrain/test/example/dtrain.ini   4
-rw-r--r--  dtrain/test/example/weights.gz   bin 255 -> 248 bytes
9 files changed, 95 insertions, 105 deletions
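
Throughout the patch the rule is mechanical: every size_t touched by boost::program_options, a loop counter, or an n-gram map becomes unsigned. The commit message gives no motivation, but one concrete constraint forces the lockstep change: boost::any stores each option under the type it was declared with, so reading an option declared as unsigned via as<size_t>() throws boost::bad_any_cast. A minimal sketch of that constraint (the option name "k" mirrors the diff; everything else is illustrative):

  // sketch: an option declared as unsigned must also be read back as
  // unsigned; boost::any stores the declared type, so a mismatched
  // as<size_t>() throws boost::bad_any_cast
  #include <boost/program_options.hpp>
  #include <iostream>
  namespace po = boost::program_options;

  int main(int argc, char** argv)
  {
    po::options_description ini("opts");
    ini.add_options()
      ("k", po::value<unsigned>()->default_value(100), "size of kbest");
    po::variables_map cfg;
    po::store(po::parse_command_line(argc, argv, ini), cfg);
    po::notify(cfg);
    // cfg["k"].as<size_t>() would throw here, which is why the
    // declaration and every as<>() call flip to unsigned together
    std::cout << cfg["k"].as<unsigned>() << std::endl;
    return 0;
  }
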
diff --git a/dtrain/dtrain.cc b/dtrain/dtrain.cc
index 0481cf96..44090242 100644
--- a/dtrain/dtrain.cc
+++ b/dtrain/dtrain.cc
@@ -6,23 +6,24 @@ dtrain_init(int argc, char** argv, po::variables_map* cfg)
{
po::options_description ini("Configuration File Options");
ini.add_options()
- ("input", po::value<string>()->default_value("-"), "input file")
- ("output", po::value<string>()->default_value("-"), "output weights file (or VOID)")
- ("input_weights", po::value<string>(), "input weights file (e.g. from previous iteration)")
- ("decoder_config", po::value<string>(), "configuration file for cdec")
- ("k", po::value<size_t>()->default_value(100), "size of kbest or sample from forest")
- ("sample_from", po::value<string>()->default_value("kbest"), "where to get translations from")
- ("filter", po::value<string>()->default_value("unique"), "filter kbest list")
- ("pair_sampling", po::value<string>()->default_value("all"), "how to sample pairs: all, rand")
- ("N", po::value<size_t>()->default_value(3), "N for Ngrams")
- ("epochs", po::value<size_t>()->default_value(2), "# of iterations T")
- ("scorer", po::value<string>()->default_value("stupid_bleu"), "scoring metric")
- ("stop_after", po::value<size_t>()->default_value(0), "stop after X input sentences")
- ("print_weights", po::value<string>(), "weights to print on each iteration")
- ("hstreaming", po::value<bool>()->zero_tokens(), "run in hadoop streaming mode")
- ("learning_rate", po::value<double>()->default_value(0.0005), "learning rate")
- ("gamma", po::value<double>()->default_value(0.), "gamma for SVM (0 for perceptron)")
- ("noup", po::value<bool>()->zero_tokens(), "do not update weights");
+ ("input", po::value<string>()->default_value("-"), "input file")
+ ("output", po::value<string>()->default_value("-"), "output weights file (or VOID)")
+ ("input_weights", po::value<string>(), "input weights file (e.g. from previous iteration)")
+ ("decoder_config", po::value<string>(), "configuration file for cdec")
+ ("k", po::value<unsigned>()->default_value(100), "size of kbest or sample from forest")
+ ("sample_from", po::value<string>()->default_value("kbest"), "where to get translations from")
+ ("filter", po::value<string>()->default_value("unique"), "filter kbest list")
+ ("pair_sampling", po::value<string>()->default_value("all"), "how to sample pairs: all, rand")
+ ("N", po::value<unsigned>()->default_value(3), "N for Ngrams")
+ ("epochs", po::value<unsigned>()->default_value(2), "# of iterations T")
+ ("scorer", po::value<string>()->default_value("stupid_bleu"), "scoring metric")
+ ("stop_after", po::value<unsigned>()->default_value(0), "stop after X input sentences")
+ ("print_weights", po::value<string>(), "weights to print on each iteration")
+ ("hstreaming", po::value<bool>()->zero_tokens(), "run in hadoop streaming mode")
+ ("learning_rate", po::value<double>()->default_value(0.0005), "learning rate")
+ ("gamma", po::value<double>()->default_value(0.), "gamma for SVM (0 for perceptron)")
+ ("tmp", po::value<string>()->default_value("/tmp"), "temp dir to use") // FIXME
+ ("noup", po::value<bool>()->zero_tokens(), "do not update weights");
po::options_description cl("Command Line Options");
cl.add_options()
("config,c", po::value<string>(), "dtrain config file")
@@ -75,10 +76,10 @@ main(int argc, char** argv)
hstreaming = true;
quiet = true;
}
- const size_t k = cfg["k"].as<size_t>();
- const size_t N = cfg["N"].as<size_t>();
- const size_t T = cfg["epochs"].as<size_t>();
- const size_t stop_after = cfg["stop_after"].as<size_t>();
+ const unsigned k = cfg["k"].as<unsigned>();
+ const unsigned N = cfg["N"].as<unsigned>();
+ const unsigned T = cfg["epochs"].as<unsigned>();
+ const unsigned stop_after = cfg["stop_after"].as<unsigned>();
const string filter_type = cfg["filter"].as<string>();
const string sample_from = cfg["sample_from"].as<string>();
const string pair_sampling = cfg["pair_sampling"].as<string>();
@@ -105,7 +106,7 @@ main(int argc, char** argv)
// scoring metric/scorer
string scorer_str = cfg["scorer"].as<string>();
- score_t (*scorer)(NgramCounts&, const size_t, const size_t, size_t, vector<score_t>);
+ score_t (*scorer)(NgramCounts&, const unsigned, const unsigned, unsigned, vector<score_t>);
if (scorer_str == "bleu") {
scorer = &bleu;
} else if (scorer_str == "stupid_bleu") {
@@ -119,8 +120,8 @@ main(int argc, char** argv)
exit(1);
}
NgramCounts global_counts(N); // counts for 1 best translations
- size_t global_hyp_len = 0; // sum hypothesis lengths
- size_t global_ref_len = 0; // sum reference lengths
+ unsigned global_hyp_len = 0; // sum hypothesis lengths
+ unsigned global_ref_len = 0; // sum reference lengths
// ^^^ global_* for approx_bleu
vector<score_t> bleu_weights; // we leave this empty -> 1/N
if (!quiet) cerr << setw(26) << "scorer '" << scorer_str << "'" << endl << endl;
@@ -149,10 +150,10 @@ main(int argc, char** argv)
ogzstream grammar_buf_out;
grammar_buf_out.open(grammar_buf_fn);
- size_t in_sz = 999999999; // input index, input size
+ unsigned in_sz = 999999999; // input index, input size
vector<pair<score_t,score_t> > all_scores;
score_t max_score = 0.;
- size_t best_it = 0;
+ unsigned best_it = 0;
float overall_time = 0.;
// output cfg
@@ -178,7 +179,7 @@ main(int argc, char** argv)
}
- for (size_t t = 0; t < T; t++) // T epochs
+ for (unsigned t = 0; t < T; t++) // T epochs
{
time_t start, end;
@@ -186,7 +187,7 @@ main(int argc, char** argv)
igzstream grammar_buf_in;
if (t > 0) grammar_buf_in.open(grammar_buf_fn);
score_t score_sum = 0., model_sum = 0.;
- size_t ii = 0;
+ unsigned ii = 0;
if (!quiet) cerr << "Iteration #" << t+1 << " of " << T << "." << endl;
while(true)
@@ -279,10 +280,10 @@ main(int argc, char** argv)
// (local) scoring
if (t > 0) ref_ids = ref_ids_buf[ii];
score_t score = 0.;
- for (size_t i = 0; i < samples->size(); i++) {
+ for (unsigned i = 0; i < samples->size(); i++) {
NgramCounts counts = make_ngram_counts(ref_ids, (*samples)[i].w, N);
if (scorer_str == "approx_bleu") {
- size_t hyp_len = 0;
+ unsigned hyp_len = 0;
if (i == 0) { // 'context of 1best translations'
global_counts += counts;
global_hyp_len += (*samples)[i].w.size();
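
dtrain.cc dispatches the scoring metric through a plain function pointer, so its signature and every scorer definition in score.cc must change types together, which is exactly the lockstep visible in the hunks above and below. A self-contained sketch of the pattern, with stub scorers standing in for the real ones:

  // sketch of the scorer dispatch above; Counts and the stub scorers are
  // placeholders, only the function-pointer shape mirrors the diff
  #include <iostream>
  #include <string>
  #include <vector>
  typedef double score_t;
  struct Counts {};

  score_t bleu_stub(Counts&, const unsigned, const unsigned, unsigned,
                    std::vector<score_t>) { return 0.5; }
  score_t stupid_bleu_stub(Counts&, const unsigned, const unsigned, unsigned,
                           std::vector<score_t>) { return 0.4; }

  int main()
  {
    std::string scorer_str = "stupid_bleu";
    // every scorer must match this signature exactly, hence the lockstep
    // size_t -> unsigned change in score.cc and score.h
    score_t (*scorer)(Counts&, const unsigned, const unsigned, unsigned,
                      std::vector<score_t>);
    if (scorer_str == "bleu") scorer = &bleu_stub;
    else                      scorer = &stupid_bleu_stub;
    Counts counts;
    std::cout << scorer(counts, 10, 12, 4, std::vector<score_t>()) << std::endl;
    return 0;
  }
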
diff --git a/dtrain/dtrain.h b/dtrain/dtrain.h
index 9bc5be93..ed75a297 100644
--- a/dtrain/dtrain.h
+++ b/dtrain/dtrain.h
@@ -18,8 +18,8 @@
#include "ksampler.h"
#include "pairsampling.h"
-#define DTRAIN_DOTS 100 // when to display a '.'
-#define DTRAIN_TMP_DIR "/var/hadoop/mapred/local" // put this on a SSD?
+#define DTRAIN_DOTS 100 // when to display a '.'
+#define DTRAIN_TMP_DIR "/tmp"
#define DTRAIN_GRAMMAR_DELIM "########EOS########"
using namespace std;
@@ -36,20 +36,20 @@ inline ostream& _p(ostream& out) { return out << setiosflags(ios::showpos); }
inline ostream& _p2(ostream& out) { return out << setprecision(2); }
inline ostream& _p5(ostream& out) { return out << setprecision(5); }
inline ostream& _p9(ostream& out) { return out << setprecision(9); }
-inline void strsplit(string &s, vector<string>& v, char d = '\t', size_t parts = 0) {
+inline void strsplit(string &s, vector<string>& v, char d = '\t', unsigned parts = 0) {
stringstream ss(s);
string t;
- size_t c = 0;
+ unsigned i = 0;
while(true)
{
- if (parts > 0 && c == parts-1) {
+ if (parts > 0 && i == parts-1) {
getline(ss, t);
v.push_back(t);
break;
}
if (!getline(ss, t, d)) break;
v.push_back(t);
- c++;
+ i++;
}
}
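
The renamed counter (c to i) aside, strsplit's behavior is unchanged: with parts > 0 it emits at most parts fields and stuffs the rest of the line, delimiters included, into the last one. A usage sketch of the function exactly as shown above:

  // sketch: splitting a tab-separated line with the strsplit shown above;
  // with parts = 2 the second field keeps the remaining tabs intact
  #include <iostream>
  #include <sstream>
  #include <string>
  #include <vector>
  using namespace std;

  inline void strsplit(string& s, vector<string>& v, char d = '\t', unsigned parts = 0) {
    stringstream ss(s);
    string t;
    unsigned i = 0;
    while (true) {
      if (parts > 0 && i == parts - 1) { getline(ss, t); v.push_back(t); break; }
      if (!getline(ss, t, d)) break;
      v.push_back(t);
      i++;
    }
  }

  int main()
  {
    string line = "id\tsource sentence\tgrammar data";
    vector<string> fields;
    strsplit(line, fields, '\t', 2);
    // prints: id | source sentence<TAB>grammar data
    cout << fields[0] << " | " << fields[1] << endl;
    return 0;
  }
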
diff --git a/dtrain/kbestget.h b/dtrain/kbestget.h
index 403384de..935998a0 100644
--- a/dtrain/kbestget.h
+++ b/dtrain/kbestget.h
@@ -22,11 +22,11 @@ struct HypSampler : public DecoderObserver
struct KBestGetter : public HypSampler
{
- const size_t k_;
+ const unsigned k_;
const string filter_type_;
vector<ScoredHyp> s_;
- KBestGetter(const size_t k, const string filter_type) :
+ KBestGetter(const unsigned k, const string filter_type) :
k_(k), filter_type_(filter_type) {}
virtual void
@@ -51,9 +51,11 @@ struct KBestGetter : public HypSampler
KBestUnique(const Hypergraph& forest)
{
s_.clear();
- KBest::KBestDerivations<vector<WordID>, ESentenceTraversal, KBest::FilterUnique, prob_t, EdgeProb> kbest(forest, k_);
- for (size_t i = 0; i < k_; ++i) {
- const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal, KBest::FilterUnique, prob_t, EdgeProb>::Derivation* d =
+ KBest::KBestDerivations<vector<WordID>, ESentenceTraversal,
+ KBest::FilterUnique, prob_t, EdgeProb> kbest(forest, k_);
+ for (unsigned i = 0; i < k_; ++i) {
+ const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal, KBest::FilterUnique,
+ prob_t, EdgeProb>::Derivation* d =
kbest.LazyKthBest(forest.nodes_.size() - 1, i);
if (!d) break;
ScoredHyp h;
@@ -69,7 +71,7 @@ struct KBestGetter : public HypSampler
{
s_.clear();
KBest::KBestDerivations<vector<WordID>, ESentenceTraversal> kbest(forest, k_);
- for (size_t i = 0; i < k_; ++i) {
+ for (unsigned i = 0; i < k_; ++i) {
const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal>::Derivation* d =
kbest.LazyKthBest(forest.nodes_.size() - 1, i);
if (!d) break;
diff --git a/dtrain/ksampler.h b/dtrain/ksampler.h
index 08bf1498..17b0ba56 100644
--- a/dtrain/ksampler.h
+++ b/dtrain/ksampler.h
@@ -11,11 +11,11 @@ namespace dtrain
struct KSampler : public HypSampler
{
- const size_t k_;
+ const unsigned k_;
vector<ScoredHyp> s_;
MT19937* prng_;
- explicit KSampler(const size_t k, MT19937* prng) :
+ explicit KSampler(const unsigned k, MT19937* prng) :
k_(k), prng_(prng) {}
virtual void
@@ -30,7 +30,7 @@ struct KSampler : public HypSampler
s_.clear();
std::vector<HypergraphSampler::Hypothesis> samples;
HypergraphSampler::sample_hypotheses(forest, k_, prng_, &samples);
- for (size_t i = 0; i < k_; ++i) {
+ for (unsigned i = 0; i < k_; ++i) {
ScoredHyp h;
h.w = samples[i].words;
h.f = samples[i].fmap;
diff --git a/dtrain/pairsampling.h b/dtrain/pairsampling.h
index 2e4ab155..9546a945 100644
--- a/dtrain/pairsampling.h
+++ b/dtrain/pairsampling.h
@@ -11,8 +11,8 @@ namespace dtrain
inline void
sample_all_pairs(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> > &training)
{
- for (size_t i = 0; i < s->size()-1; i++) {
- for (size_t j = i+1; j < s->size(); j++) {
+ for (unsigned i = 0; i < s->size()-1; i++) {
+ for (unsigned j = i+1; j < s->size(); j++) {
pair<ScoredHyp,ScoredHyp> p;
p.first = (*s)[i];
p.second = (*s)[j];
@@ -25,8 +25,8 @@ inline void
sample_rand_pairs(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> > &training,
MT19937* prng)
{
- for (size_t i = 0; i < s->size()-1; i++) {
- for (size_t j = i+1; j < s->size(); j++) {
+ for (unsigned i = 0; i < s->size()-1; i++) {
+ for (unsigned j = i+1; j < s->size(); j++) {
if (prng->next() < .5) {
pair<ScoredHyp,ScoredHyp> p;
p.first = (*s)[i];
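
Both samplers walk the same O(k²) double loop over the k-best list; sample_all_pairs keeps every pair, sample_rand_pairs keeps each with probability .5 (so about half in expectation). Note that s->size()-1 still underflows for an empty list in both the old and new version, since size() remains a size_t. A quick arithmetic check of the 'all' case:

  // sketch: sample_all_pairs over a k-best list of size k produces
  // k*(k-1)/2 training pairs (every unordered pair i < j)
  #include <iostream>

  int main()
  {
    unsigned k = 100; // the default k in the options above
    unsigned pairs = 0;
    for (unsigned i = 0; i < k - 1; i++)
      for (unsigned j = i + 1; j < k; j++)
        pairs++;
    std::cout << pairs << " == " << k * (k - 1) / 2 << std::endl; // 4950 == 4950
    return 0;
  }
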
diff --git a/dtrain/score.cc b/dtrain/score.cc
index c6d3a05f..52644250 100644
--- a/dtrain/score.cc
+++ b/dtrain/score.cc
@@ -5,13 +5,13 @@ namespace dtrain
Ngrams
-make_ngrams(vector<WordID>& s, size_t N)
+make_ngrams(vector<WordID>& s, unsigned N)
{
Ngrams ngrams;
vector<WordID> ng;
for (size_t i = 0; i < s.size(); i++) {
ng.clear();
- for (size_t j = i; j < min(i+N, s.size()); j++) {
+ for (unsigned j = i; j < min(i+N, s.size()); j++) {
ng.push_back(s[j]);
ngrams[ng]++;
}
@@ -20,7 +20,7 @@ make_ngrams(vector<WordID>& s, size_t N)
}
NgramCounts
-make_ngram_counts(vector<WordID> hyp, vector<WordID> ref, size_t N)
+make_ngram_counts(vector<WordID> hyp, vector<WordID> ref, unsigned N)
{
Ngrams hyp_ngrams = make_ngrams(hyp, N);
Ngrams ref_ngrams = make_ngrams(ref, N);
@@ -48,26 +48,22 @@ make_ngram_counts(vector<WordID> hyp, vector<WordID> ref, size_t N)
* NOTE: 0 if one n in {1..N} has 0 count
*/
score_t
-brevity_penaly(const size_t hyp_len, const size_t ref_len)
+brevity_penaly(const unsigned hyp_len, const unsigned ref_len)
{
if (hyp_len > ref_len) return 1;
- return exp(1 - (score_t)ref_len/(score_t)hyp_len);
+ return exp(1 - (score_t)ref_len/hyp_len);
}
score_t
-bleu(NgramCounts& counts, const size_t hyp_len, const size_t ref_len,
- size_t N, vector<score_t> weights )
+bleu(NgramCounts& counts, const unsigned hyp_len, const unsigned ref_len,
+ unsigned N, vector<score_t> weights )
{
if (hyp_len == 0 || ref_len == 0) return 0;
if (ref_len < N) N = ref_len;
- score_t N_ = (score_t)N;
- if (weights.empty())
- {
- for (size_t i = 0; i < N; i++) weights.push_back(1/N_);
- }
+ if (weights.empty()) for (unsigned i = 0; i < N; i++) weights.push_back(1./N);
score_t sum = 0;
- for (size_t i = 0; i < N; i++) {
+ for (unsigned i = 0; i < N; i++) {
if (counts.clipped[i] == 0 || counts.sum[i] == 0) return 0;
- sum += weights[i] * log((score_t)counts.clipped[i] / (score_t)counts.sum[i]);
+ sum += weights[i] * log((score_t)counts.clipped[i] / counts.sum[i]);
}
return brevity_penaly(hyp_len, ref_len) * exp(sum);
}
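
In LaTeX notation, with h = hyp_len, r = ref_len, p_n = clipped[n]/sum[n], and uniform weights w_n = 1/N, the bleu() above computes:

  % BLEU as computed by bleu() above
  \[
    \mathrm{BP}(h,r) =
      \begin{cases}
        1 & h > r \\
        e^{\,1 - r/h} & \text{otherwise}
      \end{cases}
    \qquad
    \mathrm{BLEU} = \mathrm{BP}(h,r)\,\exp\Big(\sum_{n=1}^{N} w_n \log p_n\Big)
  \]
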
@@ -83,21 +79,16 @@ bleu(NgramCounts& counts, const size_t hyp_len, const size_t ref_len,
* NOTE: 0 iff no 1gram match
*/
score_t
-stupid_bleu(NgramCounts& counts, const size_t hyp_len, const size_t ref_len,
- size_t N, vector<score_t> weights )
+stupid_bleu(NgramCounts& counts, const unsigned hyp_len, const unsigned ref_len,
+ unsigned N, vector<score_t> weights )
{
if (hyp_len == 0 || ref_len == 0) return 0;
if (ref_len < N) N = ref_len;
- score_t N_ = (score_t)N;
- if (weights.empty())
- {
- for (size_t i = 0; i < N; i++) weights.push_back(1/N_);
- }
- score_t sum = 0;
- score_t add = 0;
- for (size_t i = 0; i < N; i++) {
+ if (weights.empty()) for (unsigned i = 0; i < N; i++) weights.push_back(1./N);
+ score_t sum = 0, add = 0;
+ for (unsigned i = 0; i < N; i++) {
if (i == 1) add = 1;
- sum += weights[i] * log(((score_t)counts.clipped[i] + add) / ((score_t)counts.sum[i] + add));
+ sum += weights[i] * log(((score_t)counts.clipped[i] + add) / ((counts.sum[i] + add)));
}
return brevity_penaly(hyp_len, ref_len) * exp(sum);
}
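
stupid_bleu differs only in the add-one smoothing switched on from bigrams up (i == 1 in the 0-based loop), which is why the NOTE says the score is 0 iff there is no 1-gram match:

  % stupid BLEU as computed above: add-one smoothing from bigrams up
  \[
    p_n = \frac{\mathrm{clipped}[n] + \delta_n}{\mathrm{sum}[n] + \delta_n},
    \qquad
    \delta_n = \begin{cases} 0 & n = 1 \\ 1 & n \ge 2 \end{cases},
    \qquad
    \mathrm{BLEU}_{\mathrm{stupid}} = \mathrm{BP}(h,r)\,\exp\Big(\sum_{n=1}^{N} w_n \log p_n\Big)
  \]
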
@@ -112,20 +103,16 @@ stupid_bleu(NgramCounts& counts, const size_t hyp_len, const size_t ref_len,
* NOTE: max is 0.9375
*/
score_t
-smooth_bleu(NgramCounts& counts, const size_t hyp_len, const size_t ref_len,
- const size_t N, vector<score_t> weights )
+smooth_bleu(NgramCounts& counts, const unsigned hyp_len, const unsigned ref_len,
+ const unsigned N, vector<score_t> weights )
{
if (hyp_len == 0 || ref_len == 0) return 0;
- score_t N_ = (score_t)N;
- if (weights.empty())
- {
- for (size_t i = 0; i < N; i++) weights.push_back(1/N_);
- }
+ if (weights.empty()) for (unsigned i = 0; i < N; i++) weights.push_back(1./N);
score_t sum = 0;
- score_t j = 1;
- for (size_t i = 0; i < N; i++) {
+ unsigned j = 1;
+ for (unsigned i = 0; i < N; i++) {
if (counts.clipped[i] == 0 || counts.sum[i] == 0) continue;
- sum += exp((weights[i] * log((score_t)counts.clipped[i]/(score_t)counts.sum[i]))) / pow(2, N_-j+1);
+ sum += exp((weights[i] * log((score_t)counts.clipped[i]/counts.sum[i]))) / pow(2, N-j+1);
j++;
}
return brevity_penaly(hyp_len, ref_len) * sum;
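
The "max is 0.9375" NOTE is the N = 4 case of the geometric weighting: with every clipped precision at its ceiling of 1, each term reduces to exp(w_j log 1) / 2^(N-j+1) = 2^-(N-j+1), so:

  % upper bound of smooth_bleu for N = 4
  \[
    \sum_{j=1}^{4} 2^{-(4-j+1)}
    = \tfrac{1}{16} + \tfrac{1}{8} + \tfrac{1}{4} + \tfrac{1}{2}
    = \tfrac{15}{16} = 0.9375
  \]
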
@@ -139,11 +126,11 @@ smooth_bleu(NgramCounts& counts, const size_t hyp_len, const size_t ref_len,
* (Chiang et al. '08)
*/
score_t
-approx_bleu(NgramCounts& counts, const size_t hyp_len, const size_t ref_len,
- const size_t N, vector<score_t> weights)
+approx_bleu(NgramCounts& counts, const unsigned hyp_len, const unsigned ref_len,
+ const unsigned N, vector<score_t> weights)
{
return brevity_penaly(hyp_len, ref_len)
- * 0.9 * bleu(counts, hyp_len, ref_len, N, weights);
+ * 0.9 * bleu(counts, hyp_len, ref_len, N, weights);
}
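
As written, approx_bleu multiplies by the brevity penalty a second time, since bleu() already includes it, and scales by the 0.9 discount from Chiang et al. '08:

  % approx_bleu as computed above; bleu() already contains BP, so the
  % brevity penalty enters squared alongside the 0.9 discount
  \[
    \mathrm{BLEU}_{\approx}
    = 0.9 \cdot \mathrm{BP}(h,r) \cdot \mathrm{BLEU}
    = 0.9 \cdot \mathrm{BP}(h,r)^{2}\,\exp\Big(\sum_{n=1}^{N} w_n \log p_n\Big)
  \]
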
diff --git a/dtrain/score.h b/dtrain/score.h
index bff0b10c..3e5d82a9 100644
--- a/dtrain/score.h
+++ b/dtrain/score.h
@@ -19,17 +19,17 @@ typedef double score_t; // float
struct NgramCounts
{
- size_t N_;
- map<size_t, size_t> clipped;
- map<size_t, size_t> sum;
+ unsigned N_;
+ map<unsigned, unsigned> clipped;
+ map<unsigned, unsigned> sum;
- NgramCounts(const size_t N) : N_(N) { reset(); }
+ NgramCounts(const unsigned N) : N_(N) { reset(); }
void
operator+=(const NgramCounts& rhs)
{
assert(N_ == rhs.N_);
- for (size_t i = 0; i < N_; i++) {
+ for (unsigned i = 0; i < N_; i++) {
this->clipped[i] += rhs.clipped.find(i)->second;
this->sum[i] += rhs.sum.find(i)->second;
}
@@ -44,7 +44,7 @@ struct NgramCounts
}
void
- add(size_t count, size_t ref_count, size_t i)
+ add(unsigned count, unsigned ref_count, unsigned i)
{
assert(i < N_);
if (count > ref_count) {
@@ -59,7 +59,7 @@ struct NgramCounts
void
reset()
{
- size_t i;
+ unsigned i;
for (i = 0; i < N_; i++) {
clipped[i] = 0;
sum[i] = 0;
@@ -69,26 +69,26 @@ struct NgramCounts
void
print()
{
- for (size_t i = 0; i < N_; i++) {
+ for (unsigned i = 0; i < N_; i++) {
cout << i+1 << "grams (clipped):\t" << clipped[i] << endl;
cout << i+1 << "grams:\t\t\t" << sum[i] << endl;
}
}
};
-typedef map<vector<WordID>, size_t> Ngrams;
+typedef map<vector<WordID>, unsigned> Ngrams;
-Ngrams make_ngrams(vector<WordID>& s, size_t N);
-NgramCounts make_ngram_counts(vector<WordID> hyp, vector<WordID> ref, size_t N);
+Ngrams make_ngrams(vector<WordID>& s, unsigned N);
+NgramCounts make_ngram_counts(vector<WordID> hyp, vector<WordID> ref, unsigned N);
-score_t brevity_penaly(const size_t hyp_len, const size_t ref_len);
-score_t bleu(NgramCounts& counts, const size_t hyp_len, const size_t ref_len, const size_t N,
+score_t brevity_penaly(const unsigned hyp_len, const unsigned ref_len);
+score_t bleu(NgramCounts& counts, const unsigned hyp_len, const unsigned ref_len, const unsigned N,
vector<score_t> weights = vector<score_t>());
-score_t stupid_bleu(NgramCounts& counts, const size_t hyp_len, const size_t ref_len, size_t N,
+score_t stupid_bleu(NgramCounts& counts, const unsigned hyp_len, const unsigned ref_len, unsigned N,
vector<score_t> weights = vector<score_t>());
-score_t smooth_bleu(NgramCounts& counts, const size_t hyp_len, const size_t ref_len, const size_t N,
+score_t smooth_bleu(NgramCounts& counts, const unsigned hyp_len, const unsigned ref_len, const unsigned N,
vector<score_t> weights = vector<score_t>());
-score_t approx_bleu(NgramCounts& counts, const size_t hyp_len, const size_t ref_len, const size_t N,
+score_t approx_bleu(NgramCounts& counts, const unsigned hyp_len, const unsigned ref_len, const unsigned N,
vector<score_t> weights = vector<score_t>());
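
NgramCounts::add clips the hypothesis count against the reference count per n-gram order; only the counter types change in this file. A sketch of the clipping (the body is assumed from the visible "count > ref_count" test, since only the signature appears in full in the hunk):

  // sketch of NgramCounts::add; the clipping body below is an assumption,
  // the hunk shows only the signature and the count > ref_count test.
  // clipped credit is min(count, ref_count), sum takes the raw count
  #include <cassert>
  #include <iostream>
  #include <map>

  struct NgramCountsSketch {
    unsigned N_;
    std::map<unsigned, unsigned> clipped, sum;
    NgramCountsSketch(unsigned N) : N_(N) {
      for (unsigned i = 0; i < N_; i++) { clipped[i] = 0; sum[i] = 0; }
    }
    void add(unsigned count, unsigned ref_count, unsigned i) {
      assert(i < N_);
      clipped[i] += (count > ref_count) ? ref_count : count; // assumed body
      sum[i] += count;
    }
  };

  int main()
  {
    NgramCountsSketch c(4);
    c.add(5, 3, 0); // 5 hypothesis unigrams, only 3 in the reference
    std::cout << c.clipped[0] << "/" << c.sum[0] << std::endl; // 3/5
    return 0;
  }
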
diff --git a/dtrain/test/example/dtrain.ini b/dtrain/test/example/dtrain.ini
index 00ba72f9..fbddb915 100644
--- a/dtrain/test/example/dtrain.ini
+++ b/dtrain/test/example/dtrain.ini
@@ -5,7 +5,7 @@ epochs=1000
input=test/example/nc-1k.gz
scorer=stupid_bleu
output=test/example/weights.gz
-stop_after=100
-sample_from=forest
+stop_after=10
+sample_from=kbest
pair_sampling=all
print_weights=Glue WordPenalty LanguageModel LanguageModel_OOV PhraseModel_0 PhraseModel_1 PhraseModel_2 PhraseModel_3 PhraseModel_4 PassThrough
diff --git a/dtrain/test/example/weights.gz b/dtrain/test/example/weights.gz
index 21157427..e2e1ecce 100644
--- a/dtrain/test/example/weights.gz
+++ b/dtrain/test/example/weights.gz
Binary files differ