Diffstat (limited to 'training')
 -rw-r--r--  training/dtrain/Makefile.am                   |  2
 -rw-r--r--  training/dtrain/dtrain.cc                     | 99
 -rw-r--r--  training/dtrain/dtrain.h                      | 48
 -rw-r--r--  training/dtrain/examples/standard/cdec.ini    |  2
 -rw-r--r--  training/dtrain/examples/standard/dtrain.ini  |  4
 -rwxr-xr-x  training/dtrain/lplp.rb                       |  7
 -rwxr-xr-x  training/dtrain/parallelize.rb                | 45
 -rw-r--r--  training/dtrain/sample.h                      | 31
 -rw-r--r--  training/dtrain/score.h                       | 51
 -rw-r--r--  training/dtrain/update.h                      | 34
10 files changed, 155 insertions, 168 deletions
diff --git a/training/dtrain/Makefile.am b/training/dtrain/Makefile.am
index aadd376d..a6c65b1e 100644
--- a/training/dtrain/Makefile.am
+++ b/training/dtrain/Makefile.am
@@ -1,6 +1,6 @@
bin_PROGRAMS = dtrain
-dtrain_SOURCES = dtrain.cc dtrain.h sample.h update.h score.h
+dtrain_SOURCES = dtrain.cc dtrain.h sample.h score.h update.h
dtrain_LDADD = ../../decoder/libcdec.a ../../klm/search/libksearch.a ../../mteval/libmteval.a ../../utils/libutils.a ../../klm/lm/libklm.a ../../klm/util/libklm_util.a ../../klm/util/double-conversion/libklm_util_double.a
AM_CPPFLAGS = -W -Wall -Wno-sign-compare -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval
diff --git a/training/dtrain/dtrain.cc b/training/dtrain/dtrain.cc
index 1b7047b0..63b154b4 100644
--- a/training/dtrain/dtrain.cc
+++ b/training/dtrain/dtrain.cc
@@ -1,6 +1,6 @@
#include "dtrain.h"
-#include "score.h"
#include "sample.h"
+#include "score.h"
#include "update.h"
using namespace dtrain;
@@ -16,21 +16,20 @@ main(int argc, char** argv)
const size_t N = conf["N"].as<size_t>();
const size_t T = conf["iterations"].as<size_t>();
const weight_t eta = conf["learning_rate"].as<weight_t>();
- const weight_t error_margin = conf["error_margin"].as<weight_t>();
+ const weight_t margin = conf["margin"].as<weight_t>();
const bool average = conf["average"].as<bool>();
- const bool keep = conf["keep"].as<bool>();
const weight_t l1_reg = conf["l1_reg"].as<weight_t>();
+ const bool keep = conf["keep"].as<bool>();
const string output_fn = conf["output"].as<string>();
vector<string> print_weights;
- boost::split(print_weights, conf["print_weights"].as<string>(), boost::is_any_of(" "));
+ boost::split(print_weights, conf["print_weights"].as<string>(),
+ boost::is_any_of(" "));
// setup decoder
register_feature_functions();
SetSilent(true);
- ReadFile f(conf["decoder_config"].as<string>());
+ ReadFile f(conf["decoder_conf"].as<string>());
Decoder decoder(f.stream());
-
- // setup decoder observer
ScoredKbest* observer = new ScoredKbest(k, new PerSentenceBleuScorer(N));
// weights
@@ -44,25 +43,29 @@ main(int argc, char** argv)
// input
string input_fn = conf["bitext"].as<string>();
ReadFile input(input_fn);
- vector<string> buf; // source strings (decoder takes only strings)
- vector<vector<Ngrams> > buf_ngs; // compute ngrams and lengths of references
- vector<vector<size_t> > buf_ls; // just once
+ vector<string> buf; // decoder only accepts strings as input
+ vector<vector<Ngrams> > buf_ngs; // compute ngrams and lengths of references
+ vector<vector<size_t> > buf_ls; // just once
size_t input_sz = 0;
+ cerr << _p4;
// output configuration
- cerr << _p5 << "dtrain" << endl << "Parameters:" << endl;
+ cerr << "dtrain" << endl << "Parameters:" << endl;
cerr << setw(25) << "k " << k << endl;
cerr << setw(25) << "N " << N << endl;
cerr << setw(25) << "T " << T << endl;
cerr << setw(25) << "learning rate " << eta << endl;
- cerr << setw(25) << "error margin " << error_margin << endl;
+ cerr << setw(25) << "margin " << margin << endl;
cerr << setw(25) << "l1 reg " << l1_reg << endl;
- cerr << setw(25) << "decoder conf " << "'" << conf["decoder_config"].as<string>() << "'" << endl;
+ cerr << setw(25) << "decoder conf " << "'"
+ << conf["decoder_conf"].as<string>() << "'" << endl;
cerr << setw(25) << "input " << "'" << input_fn << "'" << endl;
cerr << setw(25) << "output " << "'" << output_fn << "'" << endl;
- if (conf.count("input_weights"))
- cerr << setw(25) << "weights in " << "'" << conf["input_weights"].as<string>() << "'" << endl;
- cerr << "(a dot per input)" << endl;
+ if (conf.count("input_weights")) {
+ cerr << setw(25) << "weights in " << "'"
+ << conf["input_weights"].as<string>() << "'" << endl;
+ }
+ cerr << "(1 dot per processed input)" << endl;
// meta
weight_t best=0., gold_prev=0.;
@@ -75,7 +78,7 @@ main(int argc, char** argv)
time_t start, end;
time(&start);
weight_t gold_sum=0., model_sum=0.;
- size_t i = 0, num_pairs = 0, feature_count = 0, list_sz = 0;
+ size_t i=0, num_up=0, feature_count=0, list_sz=0;
cerr << "Iteration #" << t+1 << " of " << T << "." << endl;
@@ -97,9 +100,10 @@ main(int argc, char** argv)
buf_ls.push_back({});
for (auto s: parts) {
vector<WordID> r;
- vector<string> tok;
- boost::split(tok, s, boost::is_any_of(" "));
- RegisterAndConvert(tok, r);
+ vector<string> toks;
+ boost::split(toks, s, boost::is_any_of(" "));
+ for (auto tok: toks)
+ r.push_back(TD::Convert(tok));
buf_ngs.back().emplace_back(MakeNgrams(r, N));
buf_ls.back().push_back(r.size());
}
@@ -109,12 +113,16 @@ main(int argc, char** argv)
}
// produce some pretty output
- if (i == 0 || (i+1)%20==0)
- cerr << " ";
- cerr << ".";
+ if (next) {
+ if (i%20==0)
+ cerr << " ";
+ cerr << ".";
+ if ((i+1)%20==0)
+ cerr << " " << i+1 << endl;
+ } else {
+ cerr << " " << i << endl;
+ }
cerr.flush();
- if (!next)
- if (i%20 != 0) cerr << " " << i << endl;
// stop iterating
if (!next) break;
@@ -133,9 +141,8 @@ main(int argc, char** argv)
list_sz += observer->GetSize();
// get pairs and update
- vector<pair<ScoredHyp,ScoredHyp> > pairs;
SparseVector<weight_t> updates;
- num_pairs += CollectUpdates(samples, updates, error_margin);
+ num_up += CollectUpdates(samples, updates, margin);
SparseVector<weight_t> lambdas_copy;
if (l1_reg)
lambdas_copy = lambdas;
@@ -147,11 +154,12 @@ main(int argc, char** argv)
if (l1_reg) {
SparseVector<weight_t>::iterator it = lambdas.begin();
for (; it != lambdas.end(); ++it) {
- if (it->second == 0) continue;
- if (!lambdas_copy.get(it->first) // new or..
- || lambdas_copy.get(it->first)!=it->second) // updated feature
+ weight_t v = it->second;
+ if (!v)
+ continue;
+ if (!lambdas_copy.get(it->first) // new or..
+ || lambdas_copy.get(it->first)!=v) // updated feature
{
- weight_t v = it->second;
if (v > 0) {
it->second = max(0., v - l1_reg);
} else {
@@ -174,19 +182,19 @@ main(int argc, char** argv)
// stats
weight_t gold_avg = gold_sum/(weight_t)input_sz;
- size_t non_zero = (size_t)lambdas.num_nonzero();
- cerr << _p5 << _p << "WEIGHTS" << endl;
+ cerr << _p << "WEIGHTS" << endl;
for (auto name: print_weights)
cerr << setw(18) << name << " = " << lambdas.get(FD::Convert(name)) << endl;
cerr << " ---" << endl;
- cerr << _np << " 1best avg score: " << gold_avg;
- cerr << _p << " (" << gold_avg-gold_prev << ")" << endl;
- cerr << _np << " 1best avg model score: " << model_sum/(weight_t)input_sz << endl;
- cerr << " avg # pairs: ";
- cerr << _np << num_pairs/(float)input_sz << endl;
- cerr << " non-0 feature count: " << non_zero << endl;
- cerr << " avg list sz: " << list_sz/(float)input_sz << endl;
+ cerr << _np << " 1best avg score: " << gold_avg*100;
+ cerr << _p << " (" << (gold_avg-gold_prev)*100 << ")" << endl;
+ cerr << " 1best avg model score: "
+ << model_sum/(weight_t)input_sz << endl;
+ cerr << " avg # updates: ";
+ cerr << _np << num_up/(float)input_sz << endl;
+ cerr << " non-0 feature count: " << lambdas.num_nonzero() << endl;
cerr << " avg f count: " << feature_count/(float)list_sz << endl;
+ cerr << " avg list sz: " << list_sz/(float)input_sz << endl;
if (gold_avg > best) {
best = gold_avg;
@@ -197,7 +205,7 @@ main(int argc, char** argv)
time (&end);
time_t time_diff = difftime(end, start);
total_time += time_diff;
- cerr << _p2 << _np << "(time " << time_diff/60. << " min, ";
+ cerr << "(time " << time_diff/60. << " min, ";
cerr << time_diff/input_sz << " s/S)" << endl;
if (t+1 != T) cerr << endl;
@@ -211,15 +219,16 @@ main(int argc, char** argv)
// final weights
if (average) {
- w_average /= (weight_t)T;
+ w_average /= T;
w_average.init_vector(decoder_weights);
} else if (!keep) {
lambdas.init_vector(decoder_weights);
}
- Weights::WriteToFile(output_fn, decoder_weights, true);
+ if (average || !keep)
+ Weights::WriteToFile(output_fn, decoder_weights, true);
- cerr << _p5 << _np << endl << "---" << endl << "Best iteration: ";
- cerr << best_iteration+1 << " [GOLD = " << best << "]." << endl;
+ cerr << endl << "---" << endl << "Best iteration: ";
+ cerr << best_iteration+1 << " [GOLD = " << best*100 << "]." << endl;
cerr << "This took " << total_time/60. << " min." << endl;
return 0;
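
The training loop reorganized above is a margin perceptron step followed by optional L1 clipping. A minimal self-contained sketch of one such update, with toy std::map features standing in for cdec's SparseVector<weight_t> (simplified: the real loop clips only weights that are new or were changed by the current step):

    #include <algorithm>
    #include <map>
    #include <string>

    typedef double weight_t;
    typedef std::map<std::string, weight_t> FeatVec;

    // One update: lambdas += eta * updates, then pull each weight
    // towards zero by l1_reg.
    void ApplyUpdate(FeatVec& lambdas, const FeatVec& updates,
                     weight_t eta, weight_t l1_reg)
    {
      for (const auto& u : updates)
        lambdas[u.first] += eta * u.second;
      if (!l1_reg) return;
      for (auto& w : lambdas) {
        if (w.second > 0)      w.second = std::max(0., w.second - l1_reg);
        else if (w.second < 0) w.second = std::min(0., w.second + l1_reg);
      }
    }
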
diff --git a/training/dtrain/dtrain.h b/training/dtrain/dtrain.h
index 728b0698..8b1a00eb 100644
--- a/training/dtrain/dtrain.h
+++ b/training/dtrain/dtrain.h
@@ -27,17 +27,10 @@ struct ScoredHyp
vector<WordID> w;
SparseVector<weight_t> f;
weight_t model, gold;
- size_t rank;
+ size_t rank;
};
inline void
-RegisterAndConvert(const vector<string>& strs, vector<WordID>& ids)
-{
- for (auto s: strs)
- ids.push_back(TD::Convert(s));
-}
-
-inline void
PrintWordIDVec(vector<WordID>& v, ostream& os=cerr)
{
for (size_t i = 0; i < v.size(); i++) {
@@ -48,44 +41,43 @@ PrintWordIDVec(vector<WordID>& v, ostream& os=cerr)
inline ostream& _np(ostream& out) { return out << resetiosflags(ios::showpos); }
inline ostream& _p(ostream& out) { return out << setiosflags(ios::showpos); }
-inline ostream& _p2(ostream& out) { return out << setprecision(2); }
-inline ostream& _p5(ostream& out) { return out << setprecision(5); }
+inline ostream& _p4(ostream& out) { return out << setprecision(4); }
bool
dtrain_init(int argc, char** argv, po::variables_map* conf)
{
po::options_description ini("Configuration File Options");
ini.add_options()
- ("bitext,b", po::value<string>(), "bitext")
- ("decoder_config,C", po::value<string>(), "configuration file for decoder")
- ("iterations,T", po::value<size_t>()->default_value(10), "number of iterations T (per shard)")
- ("k", po::value<size_t>()->default_value(100), "size of kbest list")
- ("learning_rate,l", po::value<weight_t>()->default_value(1.0), "learning rate")
- ("l1_reg,r", po::value<weight_t>()->default_value(0.), "l1 regularization strength")
- ("error_margin,m", po::value<weight_t>()->default_value(0.), "margin for margin perceptron")
- ("N", po::value<size_t>()->default_value(4), "N for BLEU approximation")
- ("input_weights,w", po::value<string>(), "input weights file")
- ("average,a", po::value<bool>()->default_value(false), "output average weights")
- ("keep,K", po::value<bool>()->default_value(false), "output a weight file per iteration")
- ("output,o", po::value<string>()->default_value("-"), "output weights file, '-' for STDOUT")
+ ("bitext,b", po::value<string>(), "bitext")
+ ("decoder_conf,C", po::value<string>(), "configuration file for decoder")
+ ("iterations,T", po::value<size_t>()->default_value(10), "number of iterations T (per shard)")
+ ("k", po::value<size_t>()->default_value(100), "size of kbest list")
+ ("learning_rate,l", po::value<weight_t>()->default_value(1.0), "learning rate")
+ ("l1_reg,r", po::value<weight_t>()->default_value(0.), "l1 regularization strength")
+ ("error_margin,m", po::value<weight_t>()->default_value(0.), "margin for margin perceptron")
+ ("N", po::value<size_t>()->default_value(4), "N for BLEU approximation")
+ ("input_weights,w", po::value<string>(), "input weights file")
+ ("average,a", po::value<bool>()->default_value(false), "output average weights")
+ ("keep,K", po::value<bool>()->default_value(false), "output a weight file per iteration")
+ ("output,o", po::value<string>()->default_value("-"), "output weights file, '-' for STDOUT")
("print_weights,P", po::value<string>()->default_value("EgivenFCoherent SampleCountF CountEF MaxLexFgivenE MaxLexEgivenF IsSingletonF IsSingletonFE Glue WordPenalty PassThrough LanguageModel LanguageModel_OOV"),
- "list of weights to print after each iteration");
+ "list of weights to print after each iteration");
po::options_description cl("Command Line Options");
cl.add_options()
- ("config,c", po::value<string>(), "dtrain config file");
+ ("conf,c", po::value<string>(), "dtrain configuration file");
cl.add(ini);
po::store(parse_command_line(argc, argv, cl), *conf);
- if (conf->count("config")) {
- ifstream f((*conf)["config"].as<string>().c_str());
+ if (conf->count("conf")) {
+ ifstream f((*conf)["conf"].as<string>().c_str());
po::store(po::parse_config_file(f, ini), *conf);
}
po::notify(*conf);
- if (!conf->count("decoder_config")) {
+ if (!conf->count("decoder_conf")) {
cerr << "Missing decoder configuration." << endl;
return false;
}
if (!conf->count("bitext")) {
- cerr << "No training data given." << endl;
+ cerr << "No input given." << endl;
return false;
}
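
The option handling renamed in these hunks (config -> conf, decoder_config -> decoder_conf) layers the command line over an ini file via boost::program_options; since po::store keeps the first explicitly stored value, command-line settings win over the configuration file. A stripped-down sketch of that layering, registering only a hypothetical margin option:

    #include <boost/program_options.hpp>
    #include <fstream>
    #include <iostream>

    namespace po = boost::program_options;

    int main(int argc, char** argv)
    {
      po::options_description ini("Configuration File Options");
      ini.add_options()
        ("margin,m", po::value<double>()->default_value(0.), "margin");
      po::options_description cl("Command Line Options");
      cl.add_options()
        ("conf,c", po::value<std::string>(), "configuration file");
      cl.add(ini);                        // CLI also accepts ini options
      po::variables_map conf;
      po::store(po::parse_command_line(argc, argv, cl), conf);
      if (conf.count("conf")) {           // then merge the file
        std::ifstream f(conf["conf"].as<std::string>().c_str());
        po::store(po::parse_config_file(f, ini), conf);
      }
      po::notify(conf);
      std::cout << "margin = " << conf["margin"].as<double>() << std::endl;
      return 0;
    }
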
diff --git a/training/dtrain/examples/standard/cdec.ini b/training/dtrain/examples/standard/cdec.ini
index 3330dd71..36368d44 100644
--- a/training/dtrain/examples/standard/cdec.ini
+++ b/training/dtrain/examples/standard/cdec.ini
@@ -21,7 +21,7 @@ feature_function=RuleIdentityFeatures
feature_function=RuleSourceBigramFeatures
feature_function=RuleTargetBigramFeatures
feature_function=RuleShape
-feature_function=LexicalFeatures 1 1 1
+#feature_function=LexicalFeatures 1 1 1
#feature_function=SourceSpanSizeFeatures
#feature_function=SourceWordPenalty
#feature_function=SpanFeatures
diff --git a/training/dtrain/examples/standard/dtrain.ini b/training/dtrain/examples/standard/dtrain.ini
index f2698007..610d41d7 100644
--- a/training/dtrain/examples/standard/dtrain.ini
+++ b/training/dtrain/examples/standard/dtrain.ini
@@ -1,6 +1,6 @@
-bitext=./nc-wmt11.gz # input bitext
+bitext=nc-wmt11.100.gz # input bitext
output=- # a weights file (add .gz for gzip compression) or STDOUT '-'
-decoder_config=./cdec.ini # config for cdec
+decoder_conf=./cdec.ini # config for cdec
iterations=3 # run over input 3 times
k=100 # use 100best lists
N=4 # optimize (approx.) BLEU4
diff --git a/training/dtrain/lplp.rb b/training/dtrain/lplp.rb
index a1fcd1a3..62c80489 100755
--- a/training/dtrain/lplp.rb
+++ b/training/dtrain/lplp.rb
@@ -1,4 +1,4 @@
-# lplp.rb
+#!/usr/bin/env ruby
# norms
def l0(feature_column, n)
@@ -19,7 +19,7 @@ end
# stats
def median(feature_column, n)
- return feature_column.concat(0.step(n-feature_column.size-1).map{|i|0})\
+ return feature_column.concat(0.step(n-feature_column.size-1).map{|i|0})
.sort[feature_column.size/2]
end
@@ -85,7 +85,6 @@ def _test()
end
#_test()
-
def usage()
puts "lplp.rb <l0,l1,l2,linfty,mean,median> <cut|select_k> <k|threshold> <#shards> < <input>"
puts " l0...: norms for selection"
@@ -95,7 +94,7 @@ def usage()
exit 1
end
-if ARGV.size < 4 then usage end
+usage if ARGV.size<4
norm_fun = method(ARGV[0].to_sym)
type = ARGV[1]
x = ARGV[2].to_f
diff --git a/training/dtrain/parallelize.rb b/training/dtrain/parallelize.rb
index 29f3e609..563145b6 100755
--- a/training/dtrain/parallelize.rb
+++ b/training/dtrain/parallelize.rb
@@ -4,19 +4,19 @@ require 'trollop'
require 'zipf'
conf = Trollop::options do
- opt :config, "dtrain configuration", :type => :string
- opt :input, "input as bitext (f ||| e)", :type => :string
- opt :epochs, "number of epochs", :type => :int, :default => 10
- opt :lplp_args, "arguments for lplp.rb", :type => :string, :default => "l2 select_k 100000"
- opt :randomize, "randomize shards once", :type => :bool, :default => false, :short => '-z'
- opt :reshard, "randomize after each epoch", :type => :bool, :default => false, :short => '-y'
- opt :shards, "number of shards", :type => :int
- opt :weights, "input weights for first epoch", :type => :string, :default => ''
- opt :per_shard_decoder_configs, "give custom decoder config per shard", :type => :string, :short => '-o'
- opt :processes_at_once, "jobs to run at once", :type => :int, :default => 9999
- opt :qsub, "use qsub", :type => :bool, :default => false
- opt :qsub_args, "extra args for qsub", :type => :string, :default => "-l h_vmem=5G"
- opt :dtrain_binary, "path to dtrain binary", :type => :string
+ opt :conf, "dtrain configuration", :type => :string, :short => '-c'
+ opt :input, "input as bitext (f ||| e)", :type => :string, :short => '-i'
+ opt :epochs, "number of epochs", :type => :int, :default => 10
+ opt :randomize, "randomize shards once", :type => :bool, :default => false, :short => '-z'
+ opt :reshard, "randomize after each epoch", :type => :bool, :default => false, :short => '-y'
+ opt :shards, "number of shards", :type => :int, :short => '-s'
+ opt :weights, "input weights for first epoch", :type => :string, :default => '', :short => '-w'
+ opt :lplp_args, "arguments for lplp.rb", :type => :string, :default => "l2 select_k 100000", :short => '-l'
+ opt :per_shard_decoder_configs, "give custom decoder config per shard", :type => :string, :short => '-o'
+ opt :processes_at_once, "jobs to run at once", :type => :int, :default => 9999, :short => '-p'
+ opt :qsub, "use qsub", :type => :bool, :default => false, :short => '-q'
+ opt :qsub_args, "extra args for qsub", :type => :string, :default => "-l h_vmem=5G", :short => '-r'
+ opt :dtrain_binary, "path to dtrain binary", :type => :string, :short => '-d'
end
dtrain_dir = File.expand_path File.dirname(__FILE__)
@@ -55,16 +55,16 @@ def make_shards input, num_shards, epoch, rand
index.shuffle! if rand
shard_sz = (lc / num_shards.to_f).round 0
leftover = lc - (num_shards*shard_sz)
- leftover = 0 if leftover < 0
+ leftover = [0, leftover].max
in_f = File.new input, 'r'
in_lines = in_f.readlines
shard_in_files = []
in_fns = []
- new_num_shards = 0
+ real_num_shards = 0
0.upto(num_shards-1) { |shard|
break if index.size==0
- new_num_shards += 1
- in_fn = "work/shard.#{shard}.#{epoch}.in"
+ real_num_shards += 1
+ in_fn = "work/shard.#{shard}.#{epoch}"
shard_in = File.new in_fn, 'w+'
in_fns << in_fn
0.upto(shard_sz-1) { |i|
@@ -81,12 +81,12 @@ def make_shards input, num_shards, epoch, rand
end
shard_in_files.each do |f| f.close end
in_f.close
- return in_fns, new_num_shards
+ return in_fns, real_num_shards
end
input_files = []
if predefined_shards
- input_files = File.new(input).readlines.map {|i| i.strip }
+ input_files = File.new(input).readlines.map { |i| i.strip }
if per_shard_decoder_configs
decoder_configs = ReadFile.readlines_strip(conf[:per_shard_decoder_configs]
).map { |i| i.strip }
@@ -100,15 +100,14 @@ end
puts "epoch #{epoch+1}"
pids = []
input_weights = ''
- if epoch > 0 then input_weights = "--input_weights work/weights.#{epoch-1}" end
+ input_weights = "--input_weights work/weights.#{epoch-1}" if epoch>0
weights_files = []
shard = 0
remaining_shards = num_shards
while remaining_shards > 0
shards_at_once.times {
break if remaining_shards==0
- qsub_str_start = qsub_str_end = ''
- local_end = ''
+ qsub_str_start = qsub_str_end = local_end = ''
if use_qsub
qsub_str_start = "qsub #{conf[:qsub_args]} -cwd -sync y -b y -j y\
-o work/out.#{shard}.#{epoch}\
@@ -123,7 +122,7 @@ end
else
cdec_conf = ""
end
- if first_input_weights!='' && epoch == 0
+ if first_input_weights != '' && epoch == 0
input_weights = "--input_weights #{first_input_weights}"
end
pids << Kernel.fork {
diff --git a/training/dtrain/sample.h b/training/dtrain/sample.h
index c3586c58..03cc82c3 100644
--- a/training/dtrain/sample.h
+++ b/training/dtrain/sample.h
@@ -3,20 +3,19 @@
#include "kbest.h"
+#include "score.h"
+
namespace dtrain
{
-
struct ScoredKbest : public DecoderObserver
{
const size_t k_;
- vector<ScoredHyp> s_;
- size_t src_len_;
+ size_t feature_count_, effective_sz_;
+ vector<ScoredHyp> samples_;
PerSentenceBleuScorer* scorer_;
- vector<vector<WordID> >* refs_;
vector<Ngrams>* ref_ngs_;
vector<size_t>* ref_ls_;
- size_t f_count_, sz_;
ScoredKbest(const size_t k, PerSentenceBleuScorer* scorer) :
k_(k), scorer_(scorer) {}
@@ -24,14 +23,13 @@ struct ScoredKbest : public DecoderObserver
virtual void
NotifyTranslationForest(const SentenceMetadata& smeta, Hypergraph* hg)
{
- src_len_ = smeta.GetSourceLength();
- s_.clear(); sz_ = f_count_ = 0;
+ samples_.clear(); effective_sz_ = feature_count_ = 0;
KBest::KBestDerivations<vector<WordID>, ESentenceTraversal,
KBest::FilterUnique, prob_t, EdgeProb> kbest(*hg, k_);
for (size_t i = 0; i < k_; ++i) {
- const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal, KBest::FilterUnique,
- prob_t, EdgeProb>::Derivation* d =
- kbest.LazyKthBest(hg->nodes_.size() - 1, i);
+ const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal,
+ KBest::FilterUnique, prob_t, EdgeProb>::Derivation* d =
+ kbest.LazyKthBest(hg->nodes_.size() - 1, i);
if (!d) break;
ScoredHyp h;
h.w = d->yield;
@@ -39,23 +37,22 @@ struct ScoredKbest : public DecoderObserver
h.model = log(d->score);
h.rank = i;
h.gold = scorer_->Score(h.w, *ref_ngs_, *ref_ls_);
- s_.push_back(h);
- sz_++;
- f_count_ += h.f.size();
+ samples_.push_back(h);
+ effective_sz_++;
+ feature_count_ += h.f.size();
}
}
- vector<ScoredHyp>* GetSamples() { return &s_; }
+ vector<ScoredHyp>* GetSamples() { return &samples_; }
inline void SetReference(vector<Ngrams>& ngs, vector<size_t>& ls)
{
ref_ngs_ = &ngs;
ref_ls_ = &ls;
}
- inline size_t GetFeatureCount() { return f_count_; }
- inline size_t GetSize() { return sz_; }
+ inline size_t GetFeatureCount() { return feature_count_; }
+ inline size_t GetSize() { return effective_sz_; }
};
-
} // namespace
#endif
diff --git a/training/dtrain/score.h b/training/dtrain/score.h
index d51aef82..06dbc5a4 100644
--- a/training/dtrain/score.h
+++ b/training/dtrain/score.h
@@ -34,15 +34,6 @@ struct NgramCounts
}
inline void
- operator*=(const weight_t rhs)
- {
- for (size_t i = 0; i < N_; i++) {
- this->clipped_[i] *= rhs;
- this->sum_[i] *= rhs;
- }
- }
-
- inline void
Add(const size_t count, const size_t ref_count, const size_t i)
{
assert(i < N_);
@@ -64,15 +55,7 @@ struct NgramCounts
}
inline void
- Print(ostream& os=cerr)
- {
- for (size_t i = 0; i < N_; i++) {
- os << i+1 << "grams (clipped):\t" << clipped_[i] << endl;
- os << i+1 << "grams:\t\t\t" << sum_[i] << endl;
- }
- }
-
- inline void Resize(size_t N)
+ Resize(size_t N)
{
if (N == N_) return;
else if (N > N_) {
@@ -158,16 +141,13 @@ struct PerSentenceBleuScorer
return exp(1 - (weight_t)rl/hl);
}
- weight_t
- Score(const vector<WordID>& hyp,
- const vector<Ngrams>& ref_ngs,
- const vector<size_t>& ref_ls)
+ inline size_t
+ BestMatchLength(const size_t hl,
+ const vector<size_t>& ref_ls)
{
- size_t hl = hyp.size(), rl = 0;
- if (hl == 0) return 0.;
- // best match reference length
+ size_t m;
if (ref_ls.size() == 1) {
- rl = ref_ls.front();
+ m = ref_ls.front();
} else {
size_t i = 0, best_idx = 0;
size_t best = numeric_limits<size_t>::max();
@@ -179,8 +159,20 @@ struct PerSentenceBleuScorer
}
i += 1;
}
- rl = ref_ls[best_idx];
+ m = ref_ls[best_idx];
}
+
+ return m;
+ }
+
+ weight_t
+ Score(const vector<WordID>& hyp,
+ const vector<Ngrams>& ref_ngs,
+ const vector<size_t>& ref_ls)
+ {
+ size_t hl = hyp.size(), rl = 0;
+ if (hl == 0) return 0.;
+ rl = BestMatchLength(hl, ref_ls);
if (rl == 0) return 0.;
NgramCounts counts = MakeNgramCounts(hyp, ref_ngs, N_);
size_t M = N_;
@@ -192,8 +184,9 @@ struct PerSentenceBleuScorer
weight_t sum = 0, add = 0;
for (size_t i = 0; i < M; i++) {
if (i == 0 && (counts.sum_[i] == 0 || counts.clipped_[i] == 0)) return 0.;
- if (i == 1) add = 1;
- sum += v[i] * log(((weight_t)counts.clipped_[i] + add)/((counts.sum_[i] + add)));
+ if (i > 0) add = 1;
+ sum += v[i] * log(((weight_t)counts.clipped_[i] + add)
+ / ((counts.sum_[i] + add)));
}
return BrevityPenalty(hl, rl+1) * exp(sum);
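
The scoring change above switches add-one smoothing from bigrams only (i == 1) to every order above unigrams (i > 0), keeps the grounded variant that returns 0 without a unigram match, and feeds rl+1 to the brevity penalty. A toy end-to-end computation with made-up counts, assuming BrevityPenalty returns 1 when the hypothesis is longer than the (adjusted) reference length:

    #include <cmath>
    #include <cstdio>

    int main()
    {
      const int N = 4;
      // made-up clipped / total n-gram counts of a 10-word hypothesis
      double clipped[N] = {7, 5, 3, 1}, total[N] = {10, 9, 8, 7};
      double hl = 10, rl = 9;               // hypothesis/reference length
      double sum = 0, add = 0;
      for (int i = 0; i < N; i++) {
        if (i == 0 && (total[i] == 0 || clipped[i] == 0)) return 0;
        if (i > 0) add = 1;                 // smooth orders 2..N only
        sum += (1.0 / N) * log((clipped[i] + add) / (total[i] + add));
      }
      double bp = (hl > rl + 1) ? 1. : exp(1. - (rl + 1) / hl);
      printf("per-sentence BLEU = %.4f\n", bp * exp(sum));
      return 0;
    }
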
diff --git a/training/dtrain/update.h b/training/dtrain/update.h
index 57671ce1..72d369c4 100644
--- a/training/dtrain/update.h
+++ b/training/dtrain/update.h
@@ -5,7 +5,7 @@ namespace dtrain
{
bool
-CmpHypsByGold(ScoredHyp a, ScoredHyp b)
+_cmp(ScoredHyp a, ScoredHyp b)
{
return a.gold > b.gold;
}
@@ -19,44 +19,42 @@ CmpHypsByGold(ScoredHyp a, ScoredHyp b)
inline size_t
CollectUpdates(vector<ScoredHyp>* s,
SparseVector<weight_t>& updates,
- float margin=1.0)
+ float margin=0.)
{
- size_t num_pairs = 0;
+ size_t num_up = 0;
size_t sz = s->size();
if (sz < 2) return 0;
- sort(s->begin(), s->end(), CmpHypsByGold);
+ sort(s->begin(), s->end(), _cmp);
size_t sep = round(sz*0.1);
size_t sep_hi = sep;
if (sz > 4) {
- while
- (sep_hi < sz && (*s)[sep_hi-1].gold == (*s)[sep_hi].gold) ++sep_hi;
+ while (sep_hi<sz && (*s)[sep_hi-1].gold==(*s)[sep_hi].gold)
+ ++sep_hi;
}
else sep_hi = 1;
for (size_t i = 0; i < sep_hi; i++) {
for (size_t j = sep_hi; j < sz; j++) {
- if (((*s)[i].model-(*s)[j].model) > margin)
+ if (((*s)[i].model-(*s)[j].model) > margin
+ || (*s)[i].gold == (*s)[j].gold)
continue;
- if ((*s)[i].gold != (*s)[j].gold) {
- updates += (*s)[i].f-(*s)[j].f;
- num_pairs++;
- }
+ updates += (*s)[i].f-(*s)[j].f;
+ num_up++;
}
}
size_t sep_lo = sz-sep;
- while (sep_lo > 0 && (*s)[sep_lo-1].gold == (*s)[sep_lo].gold)
+ while (sep_lo>=sep_hi && (*s)[sep_lo-1].gold==(*s)[sep_lo].gold)
--sep_lo;
for (size_t i = sep_hi; i < sep_lo; i++) {
for (size_t j = sep_lo; j < sz; j++) {
- if (((*s)[i].model-(*s)[j].model) > margin)
+ if (((*s)[i].model-(*s)[j].model) > margin
+ || (*s)[i].gold == (*s)[j].gold)
continue;
- if ((*s)[i].gold != (*s)[j].gold) {
- updates += (*s)[i].f-(*s)[j].f;
- num_pairs++;
- }
+ updates += (*s)[i].f-(*s)[j].f;
+ num_up++;
}
}
- return num_pairs;
+ return num_up;
}
} // namespace
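
In sum, CollectUpdates now folds the gold-tie test into the skip condition and counts updates rather than pairs; the selection scheme itself is unchanged: sort the k-best list by gold score, pair the top ~10% against the rest, and keep a pair only if the model fails to separate it by more than the margin. A compact sketch under toy types (it omits the tie-extension of the separator and the second, low-end pass):

    #include <algorithm>
    #include <cstddef>
    #include <map>
    #include <string>
    #include <vector>

    struct Hyp { double model, gold; std::map<std::string, double> f; };

    size_t CollectUpdatesSketch(std::vector<Hyp>& s,
                                std::map<std::string, double>& updates,
                                double margin = 0.)
    {
      if (s.size() < 2) return 0;
      std::sort(s.begin(), s.end(),
                [](const Hyp& a, const Hyp& b) { return a.gold > b.gold; });
      size_t sep = std::max<size_t>(1, s.size() / 10), num_up = 0;
      for (size_t i = 0; i < sep; i++) {
        for (size_t j = sep; j < s.size(); j++) {
          if (s[i].model - s[j].model > margin || s[i].gold == s[j].gold)
            continue;               // already separated, or a gold tie
          for (const auto& kv : s[i].f) updates[kv.first] += kv.second;
          for (const auto& kv : s[j].f) updates[kv.first] -= kv.second;
          num_up++;
        }
      }
      return num_up;
    }

With the new default margin=0., every mis-ordered pair with distinct gold scores contributes an update.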