diff options
-rw-r--r-- | dtrain/dtrain.cc | 61 | ||||
-rw-r--r-- | dtrain/dtrain.h | 12 | ||||
-rw-r--r-- | dtrain/kbestget.h | 14 | ||||
-rw-r--r-- | dtrain/ksampler.h | 6 | ||||
-rw-r--r-- | dtrain/pairsampling.h | 8 | ||||
-rw-r--r-- | dtrain/score.cc | 63 | ||||
-rw-r--r-- | dtrain/score.h | 32 | ||||
-rw-r--r-- | dtrain/test/example/dtrain.ini | 4 | ||||
-rw-r--r-- | dtrain/test/example/weights.gz | bin | 255 -> 248 bytes |
9 files changed, 95 insertions, 105 deletions
diff --git a/dtrain/dtrain.cc b/dtrain/dtrain.cc index 0481cf96..44090242 100644 --- a/dtrain/dtrain.cc +++ b/dtrain/dtrain.cc @@ -6,23 +6,24 @@ dtrain_init(int argc, char** argv, po::variables_map* cfg) { po::options_description ini("Configuration File Options"); ini.add_options() - ("input", po::value<string>()->default_value("-"), "input file") - ("output", po::value<string>()->default_value("-"), "output weights file (or VOID)") - ("input_weights", po::value<string>(), "input weights file (e.g. from previous iteration)") - ("decoder_config", po::value<string>(), "configuration file for cdec") - ("k", po::value<size_t>()->default_value(100), "size of kbest or sample from forest") - ("sample_from", po::value<string>()->default_value("kbest"), "where to get translations from") - ("filter", po::value<string>()->default_value("unique"), "filter kbest list") - ("pair_sampling", po::value<string>()->default_value("all"), "how to sample pairs: all, rand") - ("N", po::value<size_t>()->default_value(3), "N for Ngrams") - ("epochs", po::value<size_t>()->default_value(2), "# of iterations T") - ("scorer", po::value<string>()->default_value("stupid_bleu"), "scoring metric") - ("stop_after", po::value<size_t>()->default_value(0), "stop after X input sentences") - ("print_weights", po::value<string>(), "weights to print on each iteration") - ("hstreaming", po::value<bool>()->zero_tokens(), "run in hadoop streaming mode") - ("learning_rate", po::value<double>()->default_value(0.0005), "learning rate") - ("gamma", po::value<double>()->default_value(0.), "gamma for SVM (0 for perceptron)") - ("noup", po::value<bool>()->zero_tokens(), "do not update weights"); + ("input", po::value<string>()->default_value("-"), "input file") + ("output", po::value<string>()->default_value("-"), "output weights file (or VOID)") + ("input_weights", po::value<string>(), "input weights file (e.g. from previous iteration)") + ("decoder_config", po::value<string>(), "configuration file for cdec") + ("k", po::value<unsigned>()->default_value(100), "size of kbest or sample from forest") + ("sample_from", po::value<string>()->default_value("kbest"), "where to get translations from") + ("filter", po::value<string>()->default_value("unique"), "filter kbest list") + ("pair_sampling", po::value<string>()->default_value("all"), "how to sample pairs: all, rand") + ("N", po::value<unsigned>()->default_value(3), "N for Ngrams") + ("epochs", po::value<unsigned>()->default_value(2), "# of iterations T") + ("scorer", po::value<string>()->default_value("stupid_bleu"), "scoring metric") + ("stop_after", po::value<unsigned>()->default_value(0), "stop after X input sentences") + ("print_weights", po::value<string>(), "weights to print on each iteration") + ("hstreaming", po::value<bool>()->zero_tokens(), "run in hadoop streaming mode") + ("learning_rate", po::value<double>()->default_value(0.0005), "learning rate") + ("gamma", po::value<double>()->default_value(0.), "gamma for SVM (0 for perceptron)") + ("tmp", po::value<string>()->default_value("/tmp"), "temp dir to use") // FIXME + ("noup", po::value<bool>()->zero_tokens(), "do not update weights"); po::options_description cl("Command Line Options"); cl.add_options() ("config,c", po::value<string>(), "dtrain config file") @@ -75,10 +76,10 @@ main(int argc, char** argv) hstreaming = true; quiet = true; } - const size_t k = cfg["k"].as<size_t>(); - const size_t N = cfg["N"].as<size_t>(); - const size_t T = cfg["epochs"].as<size_t>(); - const size_t stop_after = cfg["stop_after"].as<size_t>(); + const unsigned k = cfg["k"].as<unsigned>(); + const unsigned N = cfg["N"].as<unsigned>(); + const unsigned T = cfg["epochs"].as<unsigned>(); + const unsigned stop_after = cfg["stop_after"].as<unsigned>(); const string filter_type = cfg["filter"].as<string>(); const string sample_from = cfg["sample_from"].as<string>(); const string pair_sampling = cfg["pair_sampling"].as<string>(); @@ -105,7 +106,7 @@ main(int argc, char** argv) // scoring metric/scorer string scorer_str = cfg["scorer"].as<string>(); - score_t (*scorer)(NgramCounts&, const size_t, const size_t, size_t, vector<score_t>); + score_t (*scorer)(NgramCounts&, const unsigned, const unsigned, unsigned, vector<score_t>); if (scorer_str == "bleu") { scorer = &bleu; } else if (scorer_str == "stupid_bleu") { @@ -119,8 +120,8 @@ main(int argc, char** argv) exit(1); } NgramCounts global_counts(N); // counts for 1 best translations - size_t global_hyp_len = 0; // sum hypothesis lengths - size_t global_ref_len = 0; // sum reference lengths + unsigned global_hyp_len = 0; // sum hypothesis lengths + unsigned global_ref_len = 0; // sum reference lengths // ^^^ global_* for approx_bleu vector<score_t> bleu_weights; // we leave this empty -> 1/N if (!quiet) cerr << setw(26) << "scorer '" << scorer_str << "'" << endl << endl; @@ -149,10 +150,10 @@ main(int argc, char** argv) ogzstream grammar_buf_out; grammar_buf_out.open(grammar_buf_fn); - size_t in_sz = 999999999; // input index, input size + unsigned in_sz = 999999999; // input index, input size vector<pair<score_t,score_t> > all_scores; score_t max_score = 0.; - size_t best_it = 0; + unsigned best_it = 0; float overall_time = 0.; // output cfg @@ -178,7 +179,7 @@ main(int argc, char** argv) } - for (size_t t = 0; t < T; t++) // T epochs + for (unsigned t = 0; t < T; t++) // T epochs { time_t start, end; @@ -186,7 +187,7 @@ main(int argc, char** argv) igzstream grammar_buf_in; if (t > 0) grammar_buf_in.open(grammar_buf_fn); score_t score_sum = 0., model_sum = 0.; - size_t ii = 0; + unsigned ii = 0; if (!quiet) cerr << "Iteration #" << t+1 << " of " << T << "." << endl; while(true) @@ -279,10 +280,10 @@ main(int argc, char** argv) // (local) scoring if (t > 0) ref_ids = ref_ids_buf[ii]; score_t score = 0.; - for (size_t i = 0; i < samples->size(); i++) { + for (unsigned i = 0; i < samples->size(); i++) { NgramCounts counts = make_ngram_counts(ref_ids, (*samples)[i].w, N); if (scorer_str == "approx_bleu") { - size_t hyp_len = 0; + unsigned hyp_len = 0; if (i == 0) { // 'context of 1best translations' global_counts += counts; global_hyp_len += (*samples)[i].w.size(); diff --git a/dtrain/dtrain.h b/dtrain/dtrain.h index 9bc5be93..ed75a297 100644 --- a/dtrain/dtrain.h +++ b/dtrain/dtrain.h @@ -18,8 +18,8 @@ #include "ksampler.h" #include "pairsampling.h" -#define DTRAIN_DOTS 100 // when to display a '.' -#define DTRAIN_TMP_DIR "/var/hadoop/mapred/local" // put this on a SSD? +#define DTRAIN_DOTS 100 // when to display a '.' +#define DTRAIN_TMP_DIR "/tmp" #define DTRAIN_GRAMMAR_DELIM "########EOS########" using namespace std; @@ -36,20 +36,20 @@ inline ostream& _p(ostream& out) { return out << setiosflags(ios::showpos); } inline ostream& _p2(ostream& out) { return out << setprecision(2); } inline ostream& _p5(ostream& out) { return out << setprecision(5); } inline ostream& _p9(ostream& out) { return out << setprecision(9); } -inline void strsplit(string &s, vector<string>& v, char d = '\t', size_t parts = 0) { +inline void strsplit(string &s, vector<string>& v, char d = '\t', unsigned parts = 0) { stringstream ss(s); string t; - size_t c = 0; + unsigned i = 0; while(true) { - if (parts > 0 && c == parts-1) { + if (parts > 0 && i == parts-1) { getline(ss, t); v.push_back(t); break; } if (!getline(ss, t, d)) break; v.push_back(t); - c++; + i++; } } diff --git a/dtrain/kbestget.h b/dtrain/kbestget.h index 403384de..935998a0 100644 --- a/dtrain/kbestget.h +++ b/dtrain/kbestget.h @@ -22,11 +22,11 @@ struct HypSampler : public DecoderObserver struct KBestGetter : public HypSampler { - const size_t k_; + const unsigned k_; const string filter_type_; vector<ScoredHyp> s_; - KBestGetter(const size_t k, const string filter_type) : + KBestGetter(const unsigned k, const string filter_type) : k_(k), filter_type_(filter_type) {} virtual void @@ -51,9 +51,11 @@ struct KBestGetter : public HypSampler KBestUnique(const Hypergraph& forest) { s_.clear(); - KBest::KBestDerivations<vector<WordID>, ESentenceTraversal, KBest::FilterUnique, prob_t, EdgeProb> kbest(forest, k_); - for (size_t i = 0; i < k_; ++i) { - const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal, KBest::FilterUnique, prob_t, EdgeProb>::Derivation* d = + KBest::KBestDerivations<vector<WordID>, ESentenceTraversal, + KBest::FilterUnique, prob_t, EdgeProb> kbest(forest, k_); + for (unsigned i = 0; i < k_; ++i) { + const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal, KBest::FilterUnique, + prob_t, EdgeProb>::Derivation* d = kbest.LazyKthBest(forest.nodes_.size() - 1, i); if (!d) break; ScoredHyp h; @@ -69,7 +71,7 @@ struct KBestGetter : public HypSampler { s_.clear(); KBest::KBestDerivations<vector<WordID>, ESentenceTraversal> kbest(forest, k_); - for (size_t i = 0; i < k_; ++i) { + for (unsigned i = 0; i < k_; ++i) { const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal>::Derivation* d = kbest.LazyKthBest(forest.nodes_.size() - 1, i); if (!d) break; diff --git a/dtrain/ksampler.h b/dtrain/ksampler.h index 08bf1498..17b0ba56 100644 --- a/dtrain/ksampler.h +++ b/dtrain/ksampler.h @@ -11,11 +11,11 @@ namespace dtrain struct KSampler : public HypSampler { - const size_t k_; + const unsigned k_; vector<ScoredHyp> s_; MT19937* prng_; - explicit KSampler(const size_t k, MT19937* prng) : + explicit KSampler(const unsigned k, MT19937* prng) : k_(k), prng_(prng) {} virtual void @@ -30,7 +30,7 @@ struct KSampler : public HypSampler s_.clear(); std::vector<HypergraphSampler::Hypothesis> samples; HypergraphSampler::sample_hypotheses(forest, k_, prng_, &samples); - for (size_t i = 0; i < k_; ++i) { + for (unsigned i = 0; i < k_; ++i) { ScoredHyp h; h.w = samples[i].words; h.f = samples[i].fmap; diff --git a/dtrain/pairsampling.h b/dtrain/pairsampling.h index 2e4ab155..9546a945 100644 --- a/dtrain/pairsampling.h +++ b/dtrain/pairsampling.h @@ -11,8 +11,8 @@ namespace dtrain inline void sample_all_pairs(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> > &training) { - for (size_t i = 0; i < s->size()-1; i++) { - for (size_t j = i+1; j < s->size(); j++) { + for (unsigned i = 0; i < s->size()-1; i++) { + for (unsigned j = i+1; j < s->size(); j++) { pair<ScoredHyp,ScoredHyp> p; p.first = (*s)[i]; p.second = (*s)[j]; @@ -25,8 +25,8 @@ inline void sample_rand_pairs(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> > &training, MT19937* prng) { - for (size_t i = 0; i < s->size()-1; i++) { - for (size_t j = i+1; j < s->size(); j++) { + for (unsigned i = 0; i < s->size()-1; i++) { + for (unsigned j = i+1; j < s->size(); j++) { if (prng->next() < .5) { pair<ScoredHyp,ScoredHyp> p; p.first = (*s)[i]; diff --git a/dtrain/score.cc b/dtrain/score.cc index c6d3a05f..52644250 100644 --- a/dtrain/score.cc +++ b/dtrain/score.cc @@ -5,13 +5,13 @@ namespace dtrain Ngrams -make_ngrams(vector<WordID>& s, size_t N) +make_ngrams(vector<WordID>& s, unsigned N) { Ngrams ngrams; vector<WordID> ng; for (size_t i = 0; i < s.size(); i++) { ng.clear(); - for (size_t j = i; j < min(i+N, s.size()); j++) { + for (unsigned j = i; j < min(i+N, s.size()); j++) { ng.push_back(s[j]); ngrams[ng]++; } @@ -20,7 +20,7 @@ make_ngrams(vector<WordID>& s, size_t N) } NgramCounts -make_ngram_counts(vector<WordID> hyp, vector<WordID> ref, size_t N) +make_ngram_counts(vector<WordID> hyp, vector<WordID> ref, unsigned N) { Ngrams hyp_ngrams = make_ngrams(hyp, N); Ngrams ref_ngrams = make_ngrams(ref, N); @@ -48,26 +48,22 @@ make_ngram_counts(vector<WordID> hyp, vector<WordID> ref, size_t N) * NOTE: 0 if one n in {1..N} has 0 count */ score_t -brevity_penaly(const size_t hyp_len, const size_t ref_len) +brevity_penaly(const unsigned hyp_len, const unsigned ref_len) { if (hyp_len > ref_len) return 1; - return exp(1 - (score_t)ref_len/(score_t)hyp_len); + return exp(1 - (score_t)ref_len/hyp_len); } score_t -bleu(NgramCounts& counts, const size_t hyp_len, const size_t ref_len, - size_t N, vector<score_t> weights ) +bleu(NgramCounts& counts, const unsigned hyp_len, const unsigned ref_len, + unsigned N, vector<score_t> weights ) { if (hyp_len == 0 || ref_len == 0) return 0; if (ref_len < N) N = ref_len; - score_t N_ = (score_t)N; - if (weights.empty()) - { - for (size_t i = 0; i < N; i++) weights.push_back(1/N_); - } + if (weights.empty()) for (unsigned i = 0; i < N; i++) weights.push_back(1./N); score_t sum = 0; - for (size_t i = 0; i < N; i++) { + for (unsigned i = 0; i < N; i++) { if (counts.clipped[i] == 0 || counts.sum[i] == 0) return 0; - sum += weights[i] * log((score_t)counts.clipped[i] / (score_t)counts.sum[i]); + sum += weights[i] * log((score_t)counts.clipped[i] / counts.sum[i]); } return brevity_penaly(hyp_len, ref_len) * exp(sum); } @@ -83,21 +79,16 @@ bleu(NgramCounts& counts, const size_t hyp_len, const size_t ref_len, * NOTE: 0 iff no 1gram match */ score_t -stupid_bleu(NgramCounts& counts, const size_t hyp_len, const size_t ref_len, - size_t N, vector<score_t> weights ) +stupid_bleu(NgramCounts& counts, const unsigned hyp_len, const unsigned ref_len, + unsigned N, vector<score_t> weights ) { if (hyp_len == 0 || ref_len == 0) return 0; if (ref_len < N) N = ref_len; - score_t N_ = (score_t)N; - if (weights.empty()) - { - for (size_t i = 0; i < N; i++) weights.push_back(1/N_); - } - score_t sum = 0; - score_t add = 0; - for (size_t i = 0; i < N; i++) { + if (weights.empty()) for (unsigned i = 0; i < N; i++) weights.push_back(1./N); + score_t sum = 0, add = 0; + for (unsigned i = 0; i < N; i++) { if (i == 1) add = 1; - sum += weights[i] * log(((score_t)counts.clipped[i] + add) / ((score_t)counts.sum[i] + add)); + sum += weights[i] * log(((score_t)counts.clipped[i] + add) / ((counts.sum[i] + add))); } return brevity_penaly(hyp_len, ref_len) * exp(sum); } @@ -112,20 +103,16 @@ stupid_bleu(NgramCounts& counts, const size_t hyp_len, const size_t ref_len, * NOTE: max is 0.9375 */ score_t -smooth_bleu(NgramCounts& counts, const size_t hyp_len, const size_t ref_len, - const size_t N, vector<score_t> weights ) +smooth_bleu(NgramCounts& counts, const unsigned hyp_len, const unsigned ref_len, + const unsigned N, vector<score_t> weights ) { if (hyp_len == 0 || ref_len == 0) return 0; - score_t N_ = (score_t)N; - if (weights.empty()) - { - for (size_t i = 0; i < N; i++) weights.push_back(1/N_); - } + if (weights.empty()) for (unsigned i = 0; i < N; i++) weights.push_back(1./N); score_t sum = 0; - score_t j = 1; - for (size_t i = 0; i < N; i++) { + unsigned j = 1; + for (unsigned i = 0; i < N; i++) { if (counts.clipped[i] == 0 || counts.sum[i] == 0) continue; - sum += exp((weights[i] * log((score_t)counts.clipped[i]/(score_t)counts.sum[i]))) / pow(2, N_-j+1); + sum += exp((weights[i] * log((score_t)counts.clipped[i]/counts.sum[i]))) / pow(2, N-j+1); j++; } return brevity_penaly(hyp_len, ref_len) * sum; @@ -139,11 +126,11 @@ smooth_bleu(NgramCounts& counts, const size_t hyp_len, const size_t ref_len, * (Chiang et al. '08) */ score_t -approx_bleu(NgramCounts& counts, const size_t hyp_len, const size_t ref_len, - const size_t N, vector<score_t> weights) +approx_bleu(NgramCounts& counts, const unsigned hyp_len, const unsigned ref_len, + const unsigned N, vector<score_t> weights) { return brevity_penaly(hyp_len, ref_len) - * 0.9 * bleu(counts, hyp_len, ref_len, N, weights); + * 0.9 * bleu(counts, hyp_len, ref_len, N, weights); } diff --git a/dtrain/score.h b/dtrain/score.h index bff0b10c..3e5d82a9 100644 --- a/dtrain/score.h +++ b/dtrain/score.h @@ -19,17 +19,17 @@ typedef double score_t; // float struct NgramCounts { - size_t N_; - map<size_t, size_t> clipped; - map<size_t, size_t> sum; + unsigned N_; + map<unsigned, unsigned> clipped; + map<unsigned, unsigned> sum; - NgramCounts(const size_t N) : N_(N) { reset(); } + NgramCounts(const unsigned N) : N_(N) { reset(); } void operator+=(const NgramCounts& rhs) { assert(N_ == rhs.N_); - for (size_t i = 0; i < N_; i++) { + for (unsigned i = 0; i < N_; i++) { this->clipped[i] += rhs.clipped.find(i)->second; this->sum[i] += rhs.sum.find(i)->second; } @@ -44,7 +44,7 @@ struct NgramCounts } void - add(size_t count, size_t ref_count, size_t i) + add(unsigned count, unsigned ref_count, unsigned i) { assert(i < N_); if (count > ref_count) { @@ -59,7 +59,7 @@ struct NgramCounts void reset() { - size_t i; + unsigned i; for (i = 0; i < N_; i++) { clipped[i] = 0; sum[i] = 0; @@ -69,26 +69,26 @@ struct NgramCounts void print() { - for (size_t i = 0; i < N_; i++) { + for (unsigned i = 0; i < N_; i++) { cout << i+1 << "grams (clipped):\t" << clipped[i] << endl; cout << i+1 << "grams:\t\t\t" << sum[i] << endl; } } }; -typedef map<vector<WordID>, size_t> Ngrams; +typedef map<vector<WordID>, unsigned> Ngrams; -Ngrams make_ngrams(vector<WordID>& s, size_t N); -NgramCounts make_ngram_counts(vector<WordID> hyp, vector<WordID> ref, size_t N); +Ngrams make_ngrams(vector<WordID>& s, unsigned N); +NgramCounts make_ngram_counts(vector<WordID> hyp, vector<WordID> ref, unsigned N); -score_t brevity_penaly(const size_t hyp_len, const size_t ref_len); -score_t bleu(NgramCounts& counts, const size_t hyp_len, const size_t ref_len, const size_t N, +score_t brevity_penaly(const unsigned hyp_len, const unsigned ref_len); +score_t bleu(NgramCounts& counts, const unsigned hyp_len, const unsigned ref_len, const unsigned N, vector<score_t> weights = vector<score_t>()); -score_t stupid_bleu(NgramCounts& counts, const size_t hyp_len, const size_t ref_len, size_t N, +score_t stupid_bleu(NgramCounts& counts, const unsigned hyp_len, const unsigned ref_len, unsigned N, vector<score_t> weights = vector<score_t>()); -score_t smooth_bleu(NgramCounts& counts, const size_t hyp_len, const size_t ref_len, const size_t N, +score_t smooth_bleu(NgramCounts& counts, const unsigned hyp_len, const unsigned ref_len, const unsigned N, vector<score_t> weights = vector<score_t>()); -score_t approx_bleu(NgramCounts& counts, const size_t hyp_len, const size_t ref_len, const size_t N, +score_t approx_bleu(NgramCounts& counts, const unsigned hyp_len, const unsigned ref_len, const unsigned N, vector<score_t> weights = vector<score_t>()); diff --git a/dtrain/test/example/dtrain.ini b/dtrain/test/example/dtrain.ini index 00ba72f9..fbddb915 100644 --- a/dtrain/test/example/dtrain.ini +++ b/dtrain/test/example/dtrain.ini @@ -5,7 +5,7 @@ epochs=1000 input=test/example/nc-1k.gz scorer=stupid_bleu output=test/example/weights.gz -stop_after=100 -sample_from=forest +stop_after=10 +sample_from=kbest pair_sampling=all print_weights=Glue WordPenalty LanguageModel LanguageModel_OOV PhraseModel_0 PhraseModel_1 PhraseModel_2 PhraseModel_3 PhraseModel_4 PassThrough diff --git a/dtrain/test/example/weights.gz b/dtrain/test/example/weights.gz Binary files differindex 21157427..e2e1ecce 100644 --- a/dtrain/test/example/weights.gz +++ b/dtrain/test/example/weights.gz |