diff options
-rw-r--r-- | dtrain/README.md | 6 | ||||
-rw-r--r-- | dtrain/dtrain.cc | 29 | ||||
-rw-r--r-- | dtrain/dtrain.h | 3 | ||||
-rw-r--r-- | dtrain/kbestget.h | 11 | ||||
-rw-r--r-- | dtrain/ksampler.h | 4 | ||||
-rw-r--r-- | dtrain/pairsampling.h | 36 | ||||
-rw-r--r-- | dtrain/score.cc | 4 | ||||
-rw-r--r-- | dtrain/test/example/dtrain.ini | 2 |
8 files changed, 59 insertions, 36 deletions
diff --git a/dtrain/README.md b/dtrain/README.md index 8cf99800..077fbc58 100644 --- a/dtrain/README.md +++ b/dtrain/README.md @@ -41,13 +41,13 @@ DTRAIN_LOCAL. Next ---- -+ (dtrain|decoder) meta-parameters testing -+ target side rule ngrams ++ (dtrain|decoder) more meta-parameters testing ++ feature selection directly in dtrain ++ feature template: target side rule ngrams + sa-extract -> leave-one-out for grammar of training set? + make svm doable; no subgradient? + reranking while sgd? + try PRO, mira emulations -+ avg feature count Legal ----- diff --git a/dtrain/dtrain.cc b/dtrain/dtrain.cc index e817e7ab..b662cd26 100644 --- a/dtrain/dtrain.cc +++ b/dtrain/dtrain.cc @@ -21,17 +21,18 @@ dtrain_init(int argc, char** argv, po::variables_map* cfg) ("filter", po::value<string>()->default_value("uniq"), "filter kbest list: 'not', 'uniq'") ("pair_sampling", po::value<string>()->default_value("XYX"), "how to sample pairs: 'all', 'XYX' or 'PRO'") ("hi_lo", po::value<float>()->default_value(0.1), "hi and lo (X) for XYX (default 0.1), <= 0.5") - ("pair_threshold", po::value<score_t>()->default_value(0), "bleu [0,1] threshold to filter pairs") + ("pair_threshold", po::value<score_t>()->default_value(0.), "bleu [0,1] threshold to filter pairs") ("N", po::value<unsigned>()->default_value(4), "N for Ngrams (BLEU)") ("scorer", po::value<string>()->default_value("stupid_bleu"), "scoring: bleu, stupid_, smooth_, approx_") ("learning_rate", po::value<weight_t>()->default_value(0.0001), "learning rate") - ("gamma", po::value<weight_t>()->default_value(0), "gamma for SVM (0 for perceptron)") + ("gamma", po::value<weight_t>()->default_value(0.), "gamma for SVM (0 for perceptron)") ("select_weights", po::value<string>()->default_value("last"), "output best, last, avg weights ('VOID' to throw away)") ("rescale", po::value<bool>()->zero_tokens(), "rescale weight vector after each input") ("l1_reg", po::value<string>()->default_value("none"), "apply l1 regularization as in 'Tsuroka et al' (2010)") ("l1_reg_strength", po::value<weight_t>(), "l1 regularization strength") - ("fselect", po::value<weight_t>()->default_value(-1), "TODO select top x percent of features after each epoch") + ("fselect", po::value<weight_t>()->default_value(-1), "TODO select top x percent (or by threshold) of features after each epoch") ("approx_bleu_d", po::value<score_t>()->default_value(0.9), "discount for approx. BLEU") + ("scale_bleu_diff", po::value<bool>()->zero_tokens(), "learning rate <- bleu diff of a misranked pair") #ifdef DTRAIN_LOCAL ("refs,r", po::value<string>(), "references in local mode") #endif @@ -133,6 +134,8 @@ main(int argc, char** argv) const string select_weights = cfg["select_weights"].as<string>(); const float hi_lo = cfg["hi_lo"].as<float>(); const score_t approx_bleu_d = cfg["approx_bleu_d"].as<score_t>(); + bool scale_bleu_diff = false; + if (cfg.count("scale_bleu_diff")) scale_bleu_diff = true; bool average = false; if (select_weights == "avg") average = true; @@ -236,7 +239,8 @@ main(int argc, char** argv) cerr << setw(25) << "sample from " << "'" << sample_from << "'" << endl; if (sample_from == "kbest") cerr << setw(25) << "filter " << "'" << filter_type << "'" << endl; - cerr << setw(25) << "learning rate " << eta << endl; + if (!scale_bleu_diff) cerr << setw(25) << "learning rate " << eta << endl; + else cerr << setw(25) << "learning rate " << "bleu diff" << endl; cerr << setw(25) << "gamma " << gamma << endl; cerr << setw(25) << "pairs " << "'" << pair_sampling << "'" << endl; if (pair_sampling == "XYX") @@ -255,7 +259,7 @@ main(int argc, char** argv) cerr << setw(25) << "output " << "'" << output_fn << "'" << endl; if (cfg.count("input_weights")) cerr << setw(25) << "weights in " << "'" << cfg["input_weights"].as<string>() << "'" << endl; - if (cfg.count("stop-after")) + if (stop_after > 0) cerr << setw(25) << "stop_after " << stop_after << endl; if (!verbose) cerr << "(a dot represents " << DTRAIN_DOTS << " inputs)" << endl; } @@ -274,7 +278,7 @@ main(int argc, char** argv) #endif score_t score_sum = 0.; score_t model_sum(0); - unsigned ii = 0, rank_errors = 0, margin_violations = 0, npairs = 0; + unsigned ii = 0, rank_errors = 0, margin_violations = 0, npairs = 0, f_count = 0, list_sz = 0; if (!quiet) cerr << "Iteration #" << t+1 << " of " << T << "." << endl; while(true) @@ -392,7 +396,7 @@ main(int argc, char** argv) else printWordIDVec(ref_ids); cerr << endl; for (unsigned u = 0; u < samples->size(); u++) { - cerr << _p5 << _np << "[" << u << ". '"; + cerr << _p2 << _np << "[" << u << ". '"; printWordIDVec((*samples)[u].w); cerr << "'" << endl; cerr << "SCORE=" << (*samples)[u].score << ",model="<< (*samples)[u].model << endl; @@ -403,8 +407,12 @@ main(int argc, char** argv) score_sum += (*samples)[0].score; // stats for 1best model_sum += (*samples)[0].model; + f_count += observer->get_f_count(); + list_sz += observer->get_sz(); + // weight updates if (!noup) { + // get pairs vector<pair<ScoredHyp,ScoredHyp> > pairs; if (pair_sampling == "all") all_pairs(samples, pairs, pair_threshold); @@ -420,6 +428,7 @@ main(int argc, char** argv) if (rank_error) rank_errors++; score_t margin = fabs(it->first.model - it->second.model); if (!rank_error && margin < 1) margin_violations++; + if (scale_bleu_diff) eta = it->first.score - it->second.score; if (rank_error || (gamma && margin<1)) { SparseVector<weight_t> diff_vec = it->first.f - it->second.f; lambdas.plus_eq_v_times_s(diff_vec, eta); @@ -512,7 +521,7 @@ main(int argc, char** argv) if (!quiet || hstreaming) nonz = (unsigned)lambdas.size_nonzero(); if (!quiet) { - cerr << _p9 << _p << "WEIGHTS" << endl; + cerr << _p5 << _p << "WEIGHTS" << endl; for (vector<string>::iterator it = print_weights.begin(); it != print_weights.end(); it++) { cerr << setw(18) << *it << " = " << lambdas.get(FD::Convert(*it)) << endl; } @@ -528,6 +537,8 @@ main(int argc, char** argv) cerr << " avg # margin viol: "; cerr << margin_violations/(float)in_sz << endl; cerr << " non0 feature count: " << nonz << endl; + cerr << " avg list sz: " << list_sz/(float)in_sz << endl; + cerr << " avg f count: " << f_count/(float)list_sz << endl; } if (hstreaming) { @@ -617,7 +628,7 @@ main(int argc, char** argv) if (!quiet) { cerr << _p5 << _np << endl << "---" << endl << "Best iteration: "; cerr << best_it+1 << " [SCORE '" << scorer_str << "'=" << max_score << "]." << endl; - cerr << _p2 << "This took " << overall_time/60. << " min." << endl; + cerr << "This took " << overall_time/60. << " min." << endl; } } diff --git a/dtrain/dtrain.h b/dtrain/dtrain.h index 15d32e36..94d149ce 100644 --- a/dtrain/dtrain.h +++ b/dtrain/dtrain.h @@ -13,7 +13,7 @@ #include "filelib.h" -#define DTRAIN_LOCAL +#undef DTRAIN_LOCAL #define DTRAIN_DOTS 10 // after how many inputs to display a '.' #define DTRAIN_GRAMMAR_DELIM "########EOS########" @@ -74,7 +74,6 @@ inline ostream& _np(ostream& out) { return out << resetiosflags(ios::showpos); } inline ostream& _p(ostream& out) { return out << setiosflags(ios::showpos); } inline ostream& _p2(ostream& out) { return out << setprecision(2); } inline ostream& _p5(ostream& out) { return out << setprecision(5); } -inline ostream& _p9(ostream& out) { return out << setprecision(9); } inline void printWordIDVec(vector<WordID>& v) { diff --git a/dtrain/kbestget.h b/dtrain/kbestget.h index 77d4a139..dd8882e1 100644 --- a/dtrain/kbestget.h +++ b/dtrain/kbestget.h @@ -59,9 +59,12 @@ struct HypSampler : public DecoderObserver { LocalScorer* scorer_; vector<WordID>* ref_; + unsigned f_count_, sz_; virtual vector<ScoredHyp>* GetSamples()=0; inline void SetScorer(LocalScorer* scorer) { scorer_ = scorer; } inline void SetRef(vector<WordID>& ref) { ref_ = &ref; } + inline unsigned get_f_count() { return f_count_; } + inline unsigned get_sz() { return sz_; } }; //////////////////////////////////////////////////////////////////////////////// @@ -100,7 +103,7 @@ struct KBestGetter : public HypSampler void KBestUnique(const Hypergraph& forest) { - s_.clear(); + s_.clear(); sz_ = f_count_ = 0; KBest::KBestDerivations<vector<WordID>, ESentenceTraversal, KBest::FilterUnique, prob_t, EdgeProb> kbest(forest, k_); for (unsigned i = 0; i < k_; ++i) { @@ -115,13 +118,15 @@ struct KBestGetter : public HypSampler h.rank = i; h.score = scorer_->Score(h.w, *ref_, i, src_len_); s_.push_back(h); + sz_++; + f_count_ += h.f.size(); } } void KBestNoFilter(const Hypergraph& forest) { - s_.clear(); + s_.clear(); sz_ = f_count_ = 0; KBest::KBestDerivations<vector<WordID>, ESentenceTraversal> kbest(forest, k_); for (unsigned i = 0; i < k_; ++i) { const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal>::Derivation* d = @@ -134,6 +139,8 @@ struct KBestGetter : public HypSampler h.rank = i; h.score = scorer_->Score(h.w, *ref_, i, src_len_); s_.push_back(h); + sz_++; + f_count_ += h.f.size(); } } }; diff --git a/dtrain/ksampler.h b/dtrain/ksampler.h index 0783f98b..f52fb649 100644 --- a/dtrain/ksampler.h +++ b/dtrain/ksampler.h @@ -30,7 +30,7 @@ struct KSampler : public HypSampler vector<ScoredHyp>* GetSamples() { return &s_; } void ScoredSamples(const Hypergraph& forest) { - s_.clear(); + s_.clear(); sz_ = f_count_ = 0; std::vector<HypergraphSampler::Hypothesis> samples; HypergraphSampler::sample_hypotheses(forest, k_, prng_, &samples); for (unsigned i = 0; i < k_; ++i) { @@ -41,6 +41,8 @@ struct KSampler : public HypSampler h.rank = i; h.score = scorer_->Score(h.w, *ref_, i, src_len_); s_.push_back(h); + sz_++; + f_count_ += h.f.size(); } } }; diff --git a/dtrain/pairsampling.h b/dtrain/pairsampling.h index bb01cf4f..bac132c6 100644 --- a/dtrain/pairsampling.h +++ b/dtrain/pairsampling.h @@ -12,9 +12,16 @@ accept_pair(score_t a, score_t b, score_t threshold) return true; } +bool +cmp_hyp_by_score_d(ScoredHyp a, ScoredHyp b) +{ + return a.score > b.score; +} + inline void -all_pairs(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training, score_t threshold, float _unused = 1) +all_pairs(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training, score_t threshold, float _unused=1) { + sort(s->begin(), s->end(), cmp_hyp_by_score_d); unsigned sz = s->size(); for (unsigned i = 0; i < sz-1; i++) { for (unsigned j = i+1; j < sz; j++) { @@ -22,7 +29,8 @@ all_pairs(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training, sc if (accept_pair((*s)[i].score, (*s)[j].score, threshold)) training.push_back(make_pair((*s)[i], (*s)[j])); } else { - training.push_back(make_pair((*s)[i], (*s)[j])); + if ((*s)[i].score != (*s)[j].score) + training.push_back(make_pair((*s)[i], (*s)[j])); } } } @@ -34,15 +42,11 @@ all_pairs(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training, sc * compare top X to middle Y and low X * cmp middle Y to low X */ -bool -_XYX_cmp_hyp_by_score(ScoredHyp a, ScoredHyp b) -{ - return a.score > b.score; -} + inline void partXYX(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training, score_t threshold, float hi_lo) { - sort(s->begin(), s->end(), _XYX_cmp_hyp_by_score); + sort(s->begin(), s->end(), cmp_hyp_by_score_d); unsigned sz = s->size(); unsigned sep = round(sz*hi_lo); for (unsigned i = 0; i < sep; i++) { @@ -51,7 +55,7 @@ partXYX(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training, scor if (accept_pair((*s)[i].score, (*s)[j].score, threshold)) training.push_back(make_pair((*s)[i], (*s)[j])); } else { - if((*s)[i].score != (*s)[j].score) + if ((*s)[i].score != (*s)[j].score) training.push_back(make_pair((*s)[i], (*s)[j])); } } @@ -62,7 +66,7 @@ partXYX(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training, scor if (accept_pair((*s)[i].score, (*s)[j].score, threshold)) training.push_back(make_pair((*s)[i], (*s)[j])); } else { - if((*s)[i].score != (*s)[j].score) + if ((*s)[i].score != (*s)[j].score) training.push_back(make_pair((*s)[i], (*s)[j])); } } @@ -77,17 +81,17 @@ partXYX(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training, scor * cut = top 50 */ bool -_PRO_cmp_pair_by_diff(pair<ScoredHyp,ScoredHyp> a, pair<ScoredHyp,ScoredHyp> b) +_PRO_cmp_pair_by_diff_d(pair<ScoredHyp,ScoredHyp> a, pair<ScoredHyp,ScoredHyp> b) { return (fabs(a.first.score - a.second.score)) > (fabs(b.first.score - b.second.score)); } inline void -PROsampling(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training, score_t threshold, float _unused = 1) +PROsampling(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training, score_t threshold, float _unused=1) { - unsigned max_count = 5000, count = 0; + unsigned max_count = 5000, count = 0, sz = s->size(); bool b = false; - for (unsigned i = 0; i < s->size()-1; i++) { - for (unsigned j = i+1; j < s->size(); j++) { + for (unsigned i = 0; i < sz-1; i++) { + for (unsigned j = i+1; j < sz; j++) { if (accept_pair((*s)[i].score, (*s)[j].score, threshold)) { training.push_back(make_pair((*s)[i], (*s)[j])); if (++count == max_count) { @@ -99,7 +103,7 @@ PROsampling(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training, if (b) break; } if (training.size() > 50) { - sort(training.begin(), training.end(), _PRO_cmp_pair_by_diff); + sort(training.begin(), training.end(), _PRO_cmp_pair_by_diff_d); training.erase(training.begin()+50, training.end()); } return; diff --git a/dtrain/score.cc b/dtrain/score.cc index b09d32ba..71d3e5de 100644 --- a/dtrain/score.cc +++ b/dtrain/score.cc @@ -21,7 +21,7 @@ BleuScorer::Bleu(NgramCounts& counts, const unsigned hyp_len, const unsigned ref vector<score_t> v = w_; if (ref_len < N_) { M = ref_len; - for (unsigned i = 0; i < M; i++) v[i] = 1./((score_t)M); + for (unsigned i = 0; i < M; i++) v[i] = 1/((score_t)M); } score_t sum = 0; for (unsigned i = 0; i < M; i++) { @@ -62,7 +62,7 @@ StupidBleuScorer::Score(vector<WordID>& hyp, vector<WordID>& ref, vector<score_t> v = w_; if (ref_len < N_) { M = ref_len; - for (unsigned i = 0; i < M; i++) v[i] = 1./((score_t)M); + for (unsigned i = 0; i < M; i++) v[i] = 1/((score_t)M); } score_t sum = 0, add = 0; for (unsigned i = 0; i < M; i++) { diff --git a/dtrain/test/example/dtrain.ini b/dtrain/test/example/dtrain.ini index 2ad44688..e43d6b34 100644 --- a/dtrain/test/example/dtrain.ini +++ b/dtrain/test/example/dtrain.ini @@ -1,5 +1,6 @@ input=test/example/nc-wmt11.1k.gz # use '-' for STDIN output=- # a weights file (add .gz for gzip compression) or STDOUT '-' +select_weights=VOID # don't output weights decoder_config=test/example/cdec.ini # config for cdec # weights for these features will be printed on each iteration print_weights=Glue WordPenalty LanguageModel LanguageModel_OOV PhraseModel_0 PhraseModel_1 PhraseModel_2 PhraseModel_3 PhraseModel_4 PhraseModel_5 PhraseModel_6 PassThrough @@ -18,4 +19,3 @@ filter=uniq # only unique entries in kbest (surface form) pair_sampling=XYX hi_lo=0.1 # 10 vs 80 vs 10 and 80 vs 10 here pair_threshold=0 # minimum distance in BLEU (this will still only use pairs with diff > 0) -select_weights=VOID # don't output weights |