-rw-r--r-- | training/dtrain/Makefile.am                  |  2
-rw-r--r-- | training/dtrain/dtrain.cc                    | 99
-rw-r--r-- | training/dtrain/dtrain.h                     | 48
-rw-r--r-- | training/dtrain/examples/standard/cdec.ini   |  2
-rw-r--r-- | training/dtrain/examples/standard/dtrain.ini |  4
-rwxr-xr-x | training/dtrain/lplp.rb                      |  7
-rwxr-xr-x | training/dtrain/parallelize.rb               | 45
-rw-r--r-- | training/dtrain/sample.h                     | 31
-rw-r--r-- | training/dtrain/score.h                      | 51
-rw-r--r-- | training/dtrain/update.h                     | 34
10 files changed, 155 insertions, 168 deletions
diff --git a/training/dtrain/Makefile.am b/training/dtrain/Makefile.am
index aadd376d..a6c65b1e 100644
--- a/training/dtrain/Makefile.am
+++ b/training/dtrain/Makefile.am
@@ -1,6 +1,6 @@
 bin_PROGRAMS = dtrain
 
-dtrain_SOURCES = dtrain.cc dtrain.h sample.h update.h score.h
+dtrain_SOURCES = dtrain.cc dtrain.h sample.h score.h update.h
 dtrain_LDADD = ../../decoder/libcdec.a ../../klm/search/libksearch.a ../../mteval/libmteval.a ../../utils/libutils.a ../../klm/lm/libklm.a ../../klm/util/libklm_util.a ../../klm/util/double-conversion/libklm_util_double.a
 
 AM_CPPFLAGS = -W -Wall -Wno-sign-compare -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval
diff --git a/training/dtrain/dtrain.cc b/training/dtrain/dtrain.cc
index 1b7047b0..63b154b4 100644
--- a/training/dtrain/dtrain.cc
+++ b/training/dtrain/dtrain.cc
@@ -1,6 +1,6 @@
 #include "dtrain.h"
-#include "score.h"
 #include "sample.h"
+#include "score.h"
 #include "update.h"
 
 using namespace dtrain;
@@ -16,21 +16,20 @@ main(int argc, char** argv)
   const size_t N = conf["N"].as<size_t>();
   const size_t T = conf["iterations"].as<size_t>();
   const weight_t eta = conf["learning_rate"].as<weight_t>();
-  const weight_t error_margin = conf["error_margin"].as<weight_t>();
+  const weight_t margin = conf["margin"].as<weight_t>();
   const bool average = conf["average"].as<bool>();
-  const bool keep = conf["keep"].as<bool>();
   const weight_t l1_reg = conf["l1_reg"].as<weight_t>();
+  const bool keep = conf["keep"].as<bool>();
   const string output_fn = conf["output"].as<string>();
   vector<string> print_weights;
-  boost::split(print_weights, conf["print_weights"].as<string>(), boost::is_any_of(" "));
+  boost::split(print_weights, conf["print_weights"].as<string>(),
+               boost::is_any_of(" "));
 
   // setup decoder
   register_feature_functions();
   SetSilent(true);
-  ReadFile f(conf["decoder_config"].as<string>());
+  ReadFile f(conf["decoder_conf"].as<string>());
   Decoder decoder(f.stream());
-
-  // setup decoder observer
   ScoredKbest* observer = new ScoredKbest(k, new PerSentenceBleuScorer(N));
 
   // weights
@@ -44,25 +43,29 @@ main(int argc, char** argv)
   // input
   string input_fn = conf["bitext"].as<string>();
   ReadFile input(input_fn);
-  vector<string> buf;              // source strings (decoder takes only strings)
-  vector<vector<Ngrams> > buf_ngs; // compute ngrams and lengths of references
-  vector<vector<size_t> > buf_ls;  // just once
+  vector<string> buf;              // decoder only accepts strings as input
+  vector<vector<Ngrams> > buf_ngs; // compute ngrams and lengths of references
+  vector<vector<size_t> > buf_ls;  // just once
   size_t input_sz = 0;
 
+  cerr << _p4;
   // output configuration
-  cerr << _p5 << "dtrain" << endl << "Parameters:" << endl;
+  cerr << "dtrain" << endl << "Parameters:" << endl;
   cerr << setw(25) << "k " << k << endl;
   cerr << setw(25) << "N " << N << endl;
   cerr << setw(25) << "T " << T << endl;
   cerr << setw(25) << "learning rate " << eta << endl;
-  cerr << setw(25) << "error margin " << error_margin << endl;
+  cerr << setw(25) << "margin " << margin << endl;
   cerr << setw(25) << "l1 reg " << l1_reg << endl;
-  cerr << setw(25) << "decoder conf " << "'" << conf["decoder_config"].as<string>() << "'" << endl;
+  cerr << setw(25) << "decoder conf " << "'"
+       << conf["decoder_conf"].as<string>() << "'" << endl;
   cerr << setw(25) << "input " << "'" << input_fn << "'" << endl;
   cerr << setw(25) << "output " << "'" << output_fn << "'" << endl;
-  if (conf.count("input_weights"))
-    cerr << setw(25) << "weights in " << "'" << conf["input_weights"].as<string>() << "'" << endl;
-  cerr << "(a dot per input)" << endl;
+  if (conf.count("input_weights")) {
+    cerr << setw(25) << "weights in " << "'"
+         << conf["input_weights"].as<string>() << "'" << endl;
+  }
+  cerr << "(1 dot per processed input)" << endl;
 
   // meta
   weight_t best=0., gold_prev=0.;
@@ -75,7 +78,7 @@ main(int argc, char** argv)
     time_t start, end;
     time(&start);
     weight_t gold_sum=0., model_sum=0.;
-    size_t i = 0, num_pairs = 0, feature_count = 0, list_sz = 0;
+    size_t i=0, num_up=0, feature_count=0, list_sz=0;
 
     cerr << "Iteration #" << t+1 << " of " << T << "." << endl;
@@ -97,9 +100,10 @@ main(int argc, char** argv)
         buf_ls.push_back({});
         for (auto s: parts) {
          vector<WordID> r;
-          vector<string> tok;
-          boost::split(tok, s, boost::is_any_of(" "));
-          RegisterAndConvert(tok, r);
+          vector<string> toks;
+          boost::split(toks, s, boost::is_any_of(" "));
+          for (auto tok: toks)
+            r.push_back(TD::Convert(tok));
           buf_ngs.back().emplace_back(MakeNgrams(r, N));
           buf_ls.back().push_back(r.size());
         }
@@ -109,12 +113,16 @@ main(int argc, char** argv)
       }
 
       // produce some pretty output
-      if (i == 0 || (i+1)%20==0)
-        cerr << " ";
-      cerr << ".";
+      if (next) {
+        if (i%20==0)
+          cerr << " ";
+        cerr << ".";
+        if ((i+1)%20==0)
+          cerr << " " << i+1 << endl;
+      } else {
+        cerr << " " << i << endl;
+      }
       cerr.flush();
-      if (!next)
-        if (i%20 != 0) cerr << " " << i << endl;
 
       // stop iterating
      if (!next) break;
@@ -133,9 +141,8 @@ main(int argc, char** argv)
      list_sz += observer->GetSize();
 
       // get pairs and update
-      vector<pair<ScoredHyp,ScoredHyp> > pairs;
       SparseVector<weight_t> updates;
-      num_pairs += CollectUpdates(samples, updates, error_margin);
+      num_up += CollectUpdates(samples, updates, margin);
       SparseVector<weight_t> lambdas_copy;
       if (l1_reg)
         lambdas_copy = lambdas;
@@ -147,11 +154,12 @@ main(int argc, char** argv)
      if (l1_reg) {
         SparseVector<weight_t>::iterator it = lambdas.begin();
         for (; it != lambdas.end(); ++it) {
-          if (it->second == 0) continue;
-          if (!lambdas_copy.get(it->first)                // new or..
-              || lambdas_copy.get(it->first)!=it->second) // updated feature
+          weight_t v = it->second;
+          if (!v)
+            continue;
+          if (!lambdas_copy.get(it->first)   // new or..
+              || lambdas_copy.get(it->first)!=v) // updated feature
           {
-            weight_t v = it->second;
            if (v > 0) {
               it->second = max(0., v - l1_reg);
             } else {
@@ -174,19 +182,19 @@ main(int argc, char** argv)
     // stats
     weight_t gold_avg = gold_sum/(weight_t)input_sz;
-    size_t non_zero = (size_t)lambdas.num_nonzero();
-    cerr << _p5 << _p << "WEIGHTS" << endl;
+    cerr << _p << "WEIGHTS" << endl;
     for (auto name: print_weights)
       cerr << setw(18) << name << " = " << lambdas.get(FD::Convert(name)) << endl;
     cerr << " ---" << endl;
-    cerr << _np << " 1best avg score: " << gold_avg;
-    cerr << _p << " (" << gold_avg-gold_prev << ")" << endl;
-    cerr << _np << " 1best avg model score: " << model_sum/(weight_t)input_sz << endl;
-    cerr << " avg # pairs: ";
-    cerr << _np << num_pairs/(float)input_sz << endl;
-    cerr << " non-0 feature count: " << non_zero << endl;
-    cerr << " avg list sz: " << list_sz/(float)input_sz << endl;
+    cerr << _np << " 1best avg score: " << gold_avg*100;
+    cerr << _p << " (" << (gold_avg-gold_prev)*100 << ")" << endl;
+    cerr << " 1best avg model score: "
+         << model_sum/(weight_t)input_sz << endl;
+    cerr << " avg # updates: ";
+    cerr << _np << num_up/(float)input_sz << endl;
+    cerr << " non-0 feature count: " << lambdas.num_nonzero() << endl;
     cerr << " avg f count: " << feature_count/(float)list_sz << endl;
+    cerr << " avg list sz: " << list_sz/(float)input_sz << endl;
 
     if (gold_avg > best) {
       best = gold_avg;
@@ -197,7 +205,7 @@ main(int argc, char** argv)
     time (&end);
     time_t time_diff = difftime(end, start);
     total_time += time_diff;
-    cerr << _p2 << _np << "(time " << time_diff/60. << " min, ";
+    cerr << "(time " << time_diff/60. << " min, ";
     cerr << time_diff/input_sz << " s/S)" << endl;
     if (t+1 != T) cerr << endl;
@@ -211,15 +219,16 @@ main(int argc, char** argv)
   // final weights
   if (average) {
-    w_average /= (weight_t)T;
+    w_average /= T;
     w_average.init_vector(decoder_weights);
   } else if (!keep) {
     lambdas.init_vector(decoder_weights);
   }
-  Weights::WriteToFile(output_fn, decoder_weights, true);
+  if (average || !keep)
+    Weights::WriteToFile(output_fn, decoder_weights, true);
 
-  cerr << _p5 << _np << endl << "---" << endl << "Best iteration: ";
-  cerr << best_iteration+1 << " [GOLD = " << best << "]." << endl;
+  cerr << endl << "---" << endl << "Best iteration: ";
+  cerr << best_iteration+1 << " [GOLD = " << best*100 << "]." << endl;
   cerr << "This took " << total_time/60. << " min." << endl;
 
   return 0;
diff --git a/training/dtrain/dtrain.h b/training/dtrain/dtrain.h
index 728b0698..8b1a00eb 100644
--- a/training/dtrain/dtrain.h
+++ b/training/dtrain/dtrain.h
@@ -27,17 +27,10 @@ struct ScoredHyp
   vector<WordID> w;
   SparseVector<weight_t> f;
   weight_t model, gold;
-  size_t rank;
+  size_t rank;
 };
 
 inline void
-RegisterAndConvert(const vector<string>& strs, vector<WordID>& ids)
-{
-  for (auto s: strs)
-    ids.push_back(TD::Convert(s));
-}
-
-inline void
 PrintWordIDVec(vector<WordID>& v, ostream& os=cerr)
 {
   for (size_t i = 0; i < v.size(); i++) {
@@ -48,44 +41,43 @@ PrintWordIDVec(vector<WordID>& v, ostream& os=cerr)
 inline ostream& _np(ostream& out) { return out << resetiosflags(ios::showpos); }
 inline ostream& _p(ostream& out) { return out << setiosflags(ios::showpos); }
-inline ostream& _p2(ostream& out) { return out << setprecision(2); }
-inline ostream& _p5(ostream& out) { return out << setprecision(5); }
+inline ostream& _p4(ostream& out) { return out << setprecision(4); }
 
 bool
 dtrain_init(int argc, char** argv, po::variables_map* conf)
 {
   po::options_description ini("Configuration File Options");
   ini.add_options()
-    ("bitext,b", po::value<string>(), "bitext")
-    ("decoder_config,C", po::value<string>(), "configuration file for decoder")
-    ("iterations,T", po::value<size_t>()->default_value(10), "number of iterations T (per shard)")
-    ("k", po::value<size_t>()->default_value(100), "size of kbest list")
-    ("learning_rate,l", po::value<weight_t>()->default_value(1.0), "learning rate")
-    ("l1_reg,r", po::value<weight_t>()->default_value(0.), "l1 regularization strength")
-    ("error_margin,m", po::value<weight_t>()->default_value(0.), "margin for margin perceptron")
-    ("N", po::value<size_t>()->default_value(4), "N for BLEU approximation")
-    ("input_weights,w", po::value<string>(), "input weights file")
-    ("average,a", po::value<bool>()->default_value(false), "output average weights")
-    ("keep,K", po::value<bool>()->default_value(false), "output a weight file per iteration")
-    ("output,o", po::value<string>()->default_value("-"), "output weights file, '-' for STDOUT")
+    ("bitext,b", po::value<string>(), "bitext")
+    ("decoder_conf,C", po::value<string>(), "configuration file for decoder")
+    ("iterations,T", po::value<size_t>()->default_value(10), "number of iterations T (per shard)")
+    ("k", po::value<size_t>()->default_value(100), "size of kbest list")
+    ("learning_rate,l", po::value<weight_t>()->default_value(1.0), "learning rate")
+    ("l1_reg,r", po::value<weight_t>()->default_value(0.), "l1 regularization strength")
+    ("margin,m", po::value<weight_t>()->default_value(0.), "margin for margin perceptron")
+    ("N", po::value<size_t>()->default_value(4), "N for BLEU approximation")
+    ("input_weights,w", po::value<string>(), "input weights file")
+    ("average,a", po::value<bool>()->default_value(false), "output average weights")
+    ("keep,K", po::value<bool>()->default_value(false), "output a weight file per iteration")
+    ("output,o", po::value<string>()->default_value("-"), "output weights file, '-' for STDOUT")
     ("print_weights,P", po::value<string>()->default_value("EgivenFCoherent SampleCountF CountEF MaxLexFgivenE MaxLexEgivenF IsSingletonF IsSingletonFE Glue WordPenalty PassThrough LanguageModel LanguageModel_OOV"),
-     "list of weights to print after each iteration");
+     "list of weights to print after each iteration");
   po::options_description cl("Command Line Options");
   cl.add_options()
-    ("config,c", po::value<string>(), "dtrain config file");
+    ("conf,c", po::value<string>(), "dtrain configuration file");
   cl.add(ini);
   po::store(parse_command_line(argc, argv, cl), *conf);
-  if (conf->count("config")) {
-    ifstream f((*conf)["config"].as<string>().c_str());
+  if (conf->count("conf")) {
+    ifstream f((*conf)["conf"].as<string>().c_str());
     po::store(po::parse_config_file(f, ini), *conf);
   }
   po::notify(*conf);
-  if (!conf->count("decoder_config")) {
+  if (!conf->count("decoder_conf")) {
     cerr << "Missing decoder configuration." << endl;
     return false;
   }
   if (!conf->count("bitext")) {
-    cerr << "No training data given." << endl;
+    cerr << "No input given." << endl;
     return false;
   }
diff --git a/training/dtrain/examples/standard/cdec.ini b/training/dtrain/examples/standard/cdec.ini
index 3330dd71..36368d44 100644
--- a/training/dtrain/examples/standard/cdec.ini
+++ b/training/dtrain/examples/standard/cdec.ini
@@ -21,7 +21,7 @@ feature_function=RuleIdentityFeatures
 feature_function=RuleSourceBigramFeatures
 feature_function=RuleTargetBigramFeatures
 feature_function=RuleShape
-feature_function=LexicalFeatures 1 1 1
+#feature_function=LexicalFeatures 1 1 1
 #feature_function=SourceSpanSizeFeatures
 #feature_function=SourceWordPenalty
 #feature_function=SpanFeatures
diff --git a/training/dtrain/examples/standard/dtrain.ini b/training/dtrain/examples/standard/dtrain.ini
index f2698007..610d41d7 100644
--- a/training/dtrain/examples/standard/dtrain.ini
+++ b/training/dtrain/examples/standard/dtrain.ini
@@ -1,6 +1,6 @@
-bitext=./nc-wmt11.gz      # input bitext
+bitext=nc-wmt11.100.gz    # input bitext
 output=-                  # a weights file (add .gz for gzip compression) or STDOUT '-'
-decoder_config=./cdec.ini # config for cdec
+decoder_conf=./cdec.ini   # config for cdec
 iterations=3              # run over input 3 times
 k=100                     # use 100best lists
 N=4                       # optimize (approx.) BLEU4
diff --git a/training/dtrain/lplp.rb b/training/dtrain/lplp.rb
index a1fcd1a3..62c80489 100755
--- a/training/dtrain/lplp.rb
+++ b/training/dtrain/lplp.rb
@@ -1,4 +1,4 @@
-# lplp.rb
+#!/usr/bin/env ruby
 
 # norms
 def l0(feature_column, n)
@@ -19,7 +19,7 @@ end
 
 # stats
 def median(feature_column, n)
-  return feature_column.concat(0.step(n-feature_column.size-1).map{|i|0})\
+  return feature_column.concat(0.step(n-feature_column.size-1).map{|i|0})
     .sort[feature_column.size/2]
 end
@@ -85,7 +85,6 @@ def _test()
 end
 #_test()
 
-
 def usage()
   puts "lplp.rb <l0,l1,l2,linfty,mean,median> <cut|select_k> <k|threshold> <#shards> < <input>"
   puts " l0...: norms for selection"
@@ -95,7 +94,7 @@ def usage()
   exit 1
 end
 
-if ARGV.size < 4 then usage end
+usage if ARGV.size<4
 norm_fun = method(ARGV[0].to_sym)
 type = ARGV[1]
 x = ARGV[2].to_f
diff --git a/training/dtrain/parallelize.rb b/training/dtrain/parallelize.rb
index 29f3e609..563145b6 100755
--- a/training/dtrain/parallelize.rb
+++ b/training/dtrain/parallelize.rb
@@ -4,19 +4,19 @@ require 'trollop'
 require 'zipf'
 
 conf = Trollop::options do
-  opt :config, "dtrain configuration", :type => :string
-  opt :input, "input as bitext (f ||| e)", :type => :string
-  opt :epochs, "number of epochs", :type => :int, :default => 10
-  opt :lplp_args, "arguments for lplp.rb", :type => :string, :default => "l2 select_k 100000"
-  opt :randomize, "randomize shards once", :type => :bool, :default => false, :short => '-z'
-  opt :reshard, "randomize after each epoch", :type => :bool, :default => false, :short => '-y'
-  opt :shards, "number of shards", :type => :int
-  opt :weights, "input weights for first epoch", :type => :string, :default => ''
-  opt :per_shard_decoder_configs, "give custom decoder config per shard", :type => :string, :short => '-o'
-  opt :processes_at_once, "jobs to run at once", :type => :int, :default => 9999
-  opt :qsub, "use qsub", :type => :bool, :default => false
-  opt :qsub_args, "extra args for qsub", :type => :string, :default => "-l h_vmem=5G"
-  opt :dtrain_binary, "path to dtrain binary", :type => :string
+  opt :conf, "dtrain configuration", :type => :string, :short => '-c'
+  opt :input, "input as bitext (f ||| e)", :type => :string, :short => '-i'
+  opt :epochs, "number of epochs", :type => :int, :default => 10
+  opt :randomize, "randomize shards once", :type => :bool, :default => false, :short => '-z'
+  opt :reshard, "randomize after each epoch", :type => :bool, :default => false, :short => '-y'
+  opt :shards, "number of shards", :type => :int, :short => '-s'
+  opt :weights, "input weights for first epoch", :type => :string, :default => '', :short => '-w'
+  opt :lplp_args, "arguments for lplp.rb", :type => :string, :default => "l2 select_k 100000", :short => '-l'
+  opt :per_shard_decoder_configs, "give custom decoder config per shard", :type => :string, :short => '-o'
+  opt :processes_at_once, "jobs to run at once", :type => :int, :default => 9999, :short => '-p'
+  opt :qsub, "use qsub", :type => :bool, :default => false, :short => '-q'
+  opt :qsub_args, "extra args for qsub", :type => :string, :default => "-l h_vmem=5G", :short => '-r'
+  opt :dtrain_binary, "path to dtrain binary", :type => :string, :short => '-d'
 end
 
 dtrain_dir = File.expand_path File.dirname(__FILE__)
@@ -55,16 +55,16 @@ def make_shards input, num_shards, epoch, rand
   index.shuffle! if rand
   shard_sz = (lc / num_shards.to_f).round 0
   leftover = lc - (num_shards*shard_sz)
-  leftover = 0 if leftover < 0
+  leftover = [0, leftover].max
   in_f = File.new input, 'r'
   in_lines = in_f.readlines
   shard_in_files = []
   in_fns = []
-  new_num_shards = 0
+  real_num_shards = 0
   0.upto(num_shards-1) { |shard|
     break if index.size==0
-    new_num_shards += 1
-    in_fn = "work/shard.#{shard}.#{epoch}.in"
+    real_num_shards += 1
+    in_fn = "work/shard.#{shard}.#{epoch}"
     shard_in = File.new in_fn, 'w+'
     in_fns << in_fn
     0.upto(shard_sz-1) { |i|
@@ -81,12 +81,12 @@ end
   shard_in_files.each do |f| f.close end
   in_f.close
-  return in_fns, new_num_shards
+  return in_fns, real_num_shards
 end
 
 input_files = []
 if predefined_shards
-  input_files = File.new(input).readlines.map {|i| i.strip }
+  input_files = File.new(input).readlines.map { |i| i.strip }
   if per_shard_decoder_configs
     decoder_configs = ReadFile.readlines_strip(conf[:per_shard_decoder_configs]
                                               ).map { |i| i.strip }
@@ -100,15 +100,14 @@ end
   puts "epoch #{epoch+1}"
   pids = []
   input_weights = ''
-  if epoch > 0 then input_weights = "--input_weights work/weights.#{epoch-1}" end
+  input_weights = "--input_weights work/weights.#{epoch-1}" if epoch>0
   weights_files = []
   shard = 0
   remaining_shards = num_shards
   while remaining_shards > 0
     shards_at_once.times {
       break if remaining_shards==0
-      qsub_str_start = qsub_str_end = ''
-      local_end = ''
+      qsub_str_start = qsub_str_end = local_end = ''
       if use_qsub
         qsub_str_start = "qsub #{conf[:qsub_args]} -cwd -sync y -b y -j y\
           -o work/out.#{shard}.#{epoch}\
@@ -123,7 +122,7 @@ end
       else
        cdec_conf = ""
       end
-      if first_input_weights!='' && epoch == 0
+      if first_input_weights != '' && epoch == 0
        input_weights = "--input_weights #{first_input_weights}"
       end
       pids << Kernel.fork {
diff --git a/training/dtrain/sample.h b/training/dtrain/sample.h
index c3586c58..03cc82c3 100644
--- a/training/dtrain/sample.h
+++ b/training/dtrain/sample.h
@@ -3,20 +3,19 @@
 
 #include "kbest.h"
 
+#include "score.h"
+
 namespace dtrain
 {
 
-
 struct ScoredKbest : public DecoderObserver
 {
   const size_t k_;
-  vector<ScoredHyp> s_;
-  size_t src_len_;
+  size_t feature_count_, effective_sz_;
+  vector<ScoredHyp> samples_;
   PerSentenceBleuScorer* scorer_;
-  vector<vector<WordID> >* refs_;
   vector<Ngrams>* ref_ngs_;
   vector<size_t>* ref_ls_;
-  size_t f_count_, sz_;
 
   ScoredKbest(const size_t k, PerSentenceBleuScorer* scorer) :
     k_(k), scorer_(scorer) {}
@@ -24,14 +23,13 @@ struct ScoredKbest : public DecoderObserver
   virtual void
   NotifyTranslationForest(const SentenceMetadata& smeta, Hypergraph* hg)
   {
-    src_len_ = smeta.GetSourceLength();
-    s_.clear(); sz_ = f_count_ = 0;
+    samples_.clear(); effective_sz_ = feature_count_ = 0;
     KBest::KBestDerivations<vector<WordID>, ESentenceTraversal,
       KBest::FilterUnique, prob_t, EdgeProb> kbest(*hg, k_);
     for (size_t i = 0; i < k_; ++i) {
-      const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal, KBest::FilterUnique,
-        prob_t, EdgeProb>::Derivation* d =
-          kbest.LazyKthBest(hg->nodes_.size() - 1, i);
+      const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal,
+        KBest::FilterUnique, prob_t, EdgeProb>::Derivation* d =
+          kbest.LazyKthBest(hg->nodes_.size() - 1, i);
       if (!d) break;
       ScoredHyp h;
       h.w = d->yield;
@@ -39,23 +37,22 @@ struct ScoredKbest : public DecoderObserver
       h.model = log(d->score);
       h.rank = i;
       h.gold = scorer_->Score(h.w, *ref_ngs_, *ref_ls_);
-      s_.push_back(h);
-      sz_++;
-      f_count_ += h.f.size();
+      samples_.push_back(h);
+      effective_sz_++;
+      feature_count_ += h.f.size();
     }
   }
 
-  vector<ScoredHyp>* GetSamples() { return &s_; }
+  vector<ScoredHyp>* GetSamples() { return &samples_; }
 
   inline void SetReference(vector<Ngrams>& ngs, vector<size_t>& ls)
   {
     ref_ngs_ = &ngs;
     ref_ls_ = &ls;
   }
-  inline size_t GetFeatureCount() { return f_count_; }
-  inline size_t GetSize() { return sz_; }
+  inline size_t GetFeatureCount() { return feature_count_; }
+  inline size_t GetSize() { return effective_sz_; }
 };
-
 } // namespace
 
 #endif
diff --git a/training/dtrain/score.h b/training/dtrain/score.h
index d51aef82..06dbc5a4 100644
--- a/training/dtrain/score.h
+++ b/training/dtrain/score.h
@@ -34,15 +34,6 @@ struct NgramCounts
   }
 
   inline void
-  operator*=(const weight_t rhs)
-  {
-    for (size_t i = 0; i < N_; i++) {
-      this->clipped_[i] *= rhs;
-      this->sum_[i] *= rhs;
-    }
-  }
-
-  inline void
   Add(const size_t count, const size_t ref_count, const size_t i)
   {
     assert(i < N_);
@@ -64,15 +55,7 @@ struct NgramCounts
   }
 
   inline void
-  Print(ostream& os=cerr)
-  {
-    for (size_t i = 0; i < N_; i++) {
-      os << i+1 << "grams (clipped):\t" << clipped_[i] << endl;
-      os << i+1 << "grams:\t\t\t" << sum_[i] << endl;
-    }
-  }
-
-  inline void Resize(size_t N)
+  Resize(size_t N)
   {
     if (N == N_) return;
     else if (N > N_) {
@@ -158,16 +141,13 @@ struct PerSentenceBleuScorer
     return exp(1 - (weight_t)rl/hl);
   }
 
-  weight_t
-  Score(const vector<WordID>& hyp,
-        const vector<Ngrams>& ref_ngs,
-        const vector<size_t>& ref_ls)
+  inline size_t
+  BestMatchLength(const size_t hl,
+                  const vector<size_t>& ref_ls)
   {
-    size_t hl = hyp.size(), rl = 0;
-    if (hl == 0) return 0.;
-    // best match reference length
+    size_t m;
     if (ref_ls.size() == 1) {
-      rl = ref_ls.front();
+      m = ref_ls.front();
     } else {
       size_t i = 0, best_idx = 0;
       size_t best = numeric_limits<size_t>::max();
@@ -179,8 +159,20 @@ struct PerSentenceBleuScorer
       }
       i += 1;
     }
-      rl = ref_ls[best_idx];
+      m = ref_ls[best_idx];
     }
+
+    return m;
+  }
+
+  weight_t
+  Score(const vector<WordID>& hyp,
+        const vector<Ngrams>& ref_ngs,
+        const vector<size_t>& ref_ls)
+  {
+    size_t hl = hyp.size(), rl = 0;
+    if (hl == 0) return 0.;
+    rl = BestMatchLength(hl, ref_ls);
     if (rl == 0) return 0.;
     NgramCounts counts = MakeNgramCounts(hyp, ref_ngs, N_);
     size_t M = N_;
@@ -192,8 +184,9 @@ struct PerSentenceBleuScorer
     weight_t sum = 0, add = 0;
     for (size_t i = 0; i < M; i++) {
       if (i == 0 && (counts.sum_[i] == 0 || counts.clipped_[i] == 0)) return 0.;
-      if (i == 1) add = 1;
-      sum += v[i] * log(((weight_t)counts.clipped_[i] + add)/((counts.sum_[i] + add)));
+      if (i > 0) add = 1;
+      sum += v[i] * log(((weight_t)counts.clipped_[i] + add)
+                        / ((counts.sum_[i] + add)));
     }
 
     return BrevityPenalty(hl, rl+1) * exp(sum);
diff --git a/training/dtrain/update.h b/training/dtrain/update.h
index 57671ce1..72d369c4 100644
--- a/training/dtrain/update.h
+++ b/training/dtrain/update.h
@@ -5,7 +5,7 @@ namespace dtrain
 {
 
 bool
-CmpHypsByGold(ScoredHyp a, ScoredHyp b)
+_cmp(ScoredHyp a, ScoredHyp b)
 {
   return a.gold > b.gold;
 }
@@ -19,44 +19,42 @@ CmpHypsByGold(ScoredHyp a, ScoredHyp b)
 inline size_t
 CollectUpdates(vector<ScoredHyp>* s,
                SparseVector<weight_t>& updates,
-               float margin=1.0)
+               float margin=0.)
 {
-  size_t num_pairs = 0;
+  size_t num_up = 0;
   size_t sz = s->size();
   if (sz < 2) return 0;
-  sort(s->begin(), s->end(), CmpHypsByGold);
+  sort(s->begin(), s->end(), _cmp);
   size_t sep = round(sz*0.1);
   size_t sep_hi = sep;
   if (sz > 4) {
-    while
-      (sep_hi < sz && (*s)[sep_hi-1].gold == (*s)[sep_hi].gold) ++sep_hi;
+    while (sep_hi<sz && (*s)[sep_hi-1].gold==(*s)[sep_hi].gold)
+      ++sep_hi;
   } else sep_hi = 1;
   for (size_t i = 0; i < sep_hi; i++) {
     for (size_t j = sep_hi; j < sz; j++) {
-      if (((*s)[i].model-(*s)[j].model) > margin)
+      if (((*s)[i].model-(*s)[j].model) > margin
+          || (*s)[i].gold == (*s)[j].gold)
         continue;
-      if ((*s)[i].gold != (*s)[j].gold) {
-        updates += (*s)[i].f-(*s)[j].f;
-        num_pairs++;
-      }
+      updates += (*s)[i].f-(*s)[j].f;
+      num_up++;
     }
   }
   size_t sep_lo = sz-sep;
-  while (sep_lo > 0 && (*s)[sep_lo-1].gold == (*s)[sep_lo].gold)
+  while (sep_lo>=sep_hi && (*s)[sep_lo].gold==(*s)[sep_lo+1].gold)
     --sep_lo;
   for (size_t i = sep_hi; i < sep_lo; i++) {
     for (size_t j = sep_lo; j < sz; j++) {
-      if (((*s)[i].model-(*s)[j].model) > margin)
+      if (((*s)[i].model-(*s)[j].model) > margin
+          || (*s)[i].gold == (*s)[j].gold)
        continue;
-      if ((*s)[i].gold != (*s)[j].gold) {
-        updates += (*s)[i].f-(*s)[j].f;
-        num_pairs++;
-      }
+      updates += (*s)[i].f-(*s)[j].f;
+      num_up++;
     }
   }
 
-  return num_pairs;
+  return num_up;
 }
 
 } // namespace