author     Patrick Simianer <p@simianer.de>  2015-01-23 15:50:27 +0100
committer  Patrick Simianer <p@simianer.de>  2015-01-23 15:50:27 +0100
commit     32dea3f24e56ac7c17343457c48f750f16838742 (patch)
tree       79177b58cbff08c14991a0da8e851912b1c06309 /training
parent     556dc935c7a2d8df78a35447d20d71b4bf6e391a (diff)
dtrain: multi-reference BLEU
Diffstat (limited to 'training')
-rw-r--r--  training/dtrain/dtrain.cc                              67
-rw-r--r--  training/dtrain/dtrain.h                                6
-rw-r--r--  training/dtrain/examples/standard/expected-output     123
-rw-r--r--  training/dtrain/examples/standard/expected-output.gz  bin 0 -> 625304 bytes
-rw-r--r--  training/dtrain/examples/standard/nc-wmt11.de.gz      bin 58324 -> 0 bytes
-rw-r--r--  training/dtrain/examples/standard/nc-wmt11.en.gz      bin 49600 -> 0 bytes
-rw-r--r--  training/dtrain/examples/toy/dtrain.ini                 3
-rw-r--r--  training/dtrain/examples/toy/expected-output           31
-rw-r--r--  training/dtrain/examples/toy/in                         2
-rw-r--r--  training/dtrain/examples/toy/src                        2
-rw-r--r--  training/dtrain/examples/toy/tgt                        2
-rw-r--r--  training/dtrain/kbestget.h                              4
-rw-r--r--  training/dtrain/ksampler.h                              2
-rwxr-xr-x  training/dtrain/parallelize.rb                         35
-rw-r--r--  training/dtrain/score.cc                               63
-rw-r--r--  training/dtrain/score.h                                37
16 files changed, 123 insertions, 254 deletions
diff --git a/training/dtrain/dtrain.cc b/training/dtrain/dtrain.cc
index 823a50de..737326f8 100644
--- a/training/dtrain/dtrain.cc
+++ b/training/dtrain/dtrain.cc
@@ -12,9 +12,7 @@ dtrain_init(int argc, char** argv, po::variables_map* cfg)
{
po::options_description ini("Configuration File Options");
ini.add_options()
- ("input", po::value<string>(), "input file (src)")
- ("refs,r", po::value<string>(), "references")
- ("bitext,b", po::value<string>(), "bitext: 'src ||| tgt'")
+ ("bitext,b", po::value<string>(), "bitext: 'src ||| tgt ||| tgt ||| ...'")
("output", po::value<string>()->default_value("-"), "output weights file, '-' for STDOUT")
("input_weights", po::value<string>(), "input weights file (e.g. from previous iteration)")
("decoder_config", po::value<string>(), "configuration file for cdec")
@@ -84,8 +82,8 @@ dtrain_init(int argc, char** argv, po::variables_map* cfg)
cerr << "hi_lo must lie in [0.01, 0.5]" << endl;
return false;
}
- if ((cfg->count("input")>0 || cfg->count("refs")>0) && cfg->count("bitext")>0) {
- cerr << "Provide 'input' and 'refs' or 'bitext', not both." << endl;
+ if (!cfg->count("bitext")) {
+ cerr << "No training data given." << endl;
return false;
}
if ((*cfg)["pair_threshold"].as<score_t>() < 0) {
@@ -221,24 +219,11 @@ main(int argc, char** argv)
// output
string output_fn = cfg["output"].as<string>();
// input
- bool read_bitext = false;
string input_fn;
- if (cfg.count("bitext")) {
- read_bitext = true;
- input_fn = cfg["bitext"].as<string>();
- } else {
- input_fn = cfg["input"].as<string>();
- }
- ReadFile input(input_fn);
+ ReadFile input(cfg["bitext"].as<string>());
// buffer input for t > 0
vector<string> src_str_buf; // source strings (decoder takes only strings)
- vector<vector<WordID> > ref_ids_buf; // references as WordID vecs
- ReadFile refs;
- string refs_fn;
- if (!read_bitext) {
- refs_fn = cfg["refs"].as<string>();
- refs.Init(refs_fn);
- }
+ vector<vector<vector<WordID> > > refs_as_ids_buf; // references as WordID vecs
unsigned in_sz = std::numeric_limits<unsigned>::max(); // input index, input size
vector<pair<score_t, score_t> > all_scores;
@@ -280,8 +265,6 @@ main(int argc, char** argv)
//cerr << setw(25) << "test k-best " << test_k_best << endl;
cerr << setw(25) << "cdec cfg " << "'" << cfg["decoder_config"].as<string>() << "'" << endl;
cerr << setw(25) << "input " << "'" << input_fn << "'" << endl;
- if (!read_bitext)
- cerr << setw(25) << "refs " << "'" << refs_fn << "'" << endl;
cerr << setw(25) << "output " << "'" << output_fn << "'" << endl;
if (cfg.count("input_weights"))
cerr << setw(25) << "weights in " << "'" << cfg["input_weights"].as<string>() << "'" << endl;
@@ -311,17 +294,13 @@ main(int argc, char** argv)
{
string in;
- vector<string> ref;
+ vector<string> refs;
bool next = false, stop = false; // next iteration or premature stop
if (t == 0) {
if(!getline(*input, in)) next = true;
- if(read_bitext) {
- vector<string> strs;
- boost::algorithm::split_regex(strs, in, boost::regex(" \\|\\|\\| "));
- in = strs[0];
- strs.erase(strs.begin());
- ref = strs;
- }
+ boost::algorithm::split_regex(refs, in, boost::regex(" \\|\\|\\| "));
+ in = refs[0];
+ refs.erase(refs.begin());
} else {
if (ii == in_sz) next = true; // stop if we reach the end of our input
}
@@ -356,20 +335,19 @@ main(int argc, char** argv)
lambdas.init_vector(&decoder_weights);
// getting input
- vector<WordID> ref_ids; // reference as vector<WordID>
if (t == 0) {
- if (!read_bitext) {
- getline(*refs, ref);
+ vector<vector<WordID> > cur_refs;
+ for (auto r: refs) {
+ vector<WordID> cur_ref;
+ vector<string> tok;
+ boost::split(tok, r, boost::is_any_of(" "));
+ register_and_convert(tok, cur_ref);
+ cur_refs.push_back(cur_ref);
}
- vector<string> ref_tok;
- boost::split(ref_tok, ref, boost::is_any_of(" "));
- register_and_convert(ref_tok, ref_ids);
- ref_ids_buf.push_back(ref_ids);
+ refs_as_ids_buf.push_back(cur_refs);
src_str_buf.push_back(in);
- } else {
- ref_ids = ref_ids_buf[ii];
}
- observer->SetRef(ref_ids);
+ observer->SetRef(refs_as_ids_buf[ii]);
if (t == 0)
decoder.Decode(in, observer);
else
@@ -379,10 +357,11 @@ main(int argc, char** argv)
vector<ScoredHyp>* samples = observer->GetSamples();
if (verbose) {
- cerr << "--- ref for " << ii << ": ";
- if (t > 0) printWordIDVec(ref_ids_buf[ii]);
- else printWordIDVec(ref_ids);
- cerr << endl;
+ cerr << "--- refs for " << ii << ": ";
+ for (auto r: refs_as_ids_buf[ii]) {
+ printWordIDVec(r);
+ cerr << endl;
+ }
for (unsigned u = 0; u < samples->size(); u++) {
cerr << _p2 << _np << "[" << u << ". '";
printWordIDVec((*samples)[u].w);
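The net effect of the dtrain.cc changes is that training data now always arrives as one bitext line per segment, 'src ||| tgt ||| tgt ||| ...', with any number of references after the source. A minimal standalone sketch of that parsing step (illustrative only: the main() harness is not part of the commit; Boost.StringAlgo and Boost.Regex are assumed available, as in the file above):

// Sketch of the new bitext parsing: one line "src ||| tgt ||| tgt ||| ..."
// is split into the source sentence and a list of reference translations.
#include <iostream>
#include <string>
#include <vector>
#include <boost/algorithm/string/regex.hpp>
#include <boost/regex.hpp>

int main()
{
  std::string in =
    "ich sah ein kleines haus ||| i saw a little house ||| i saw a small house";
  std::vector<std::string> refs;
  boost::algorithm::split_regex(refs, in, boost::regex(" \\|\\|\\| "));
  std::string src = refs[0];  // first field is the source
  refs.erase(refs.begin());   // remaining fields are the references
  std::cout << "src: " << src << "\n";
  for (const auto& r : refs)
    std::cout << "ref: " << r << "\n";
  return 0;
}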
diff --git a/training/dtrain/dtrain.h b/training/dtrain/dtrain.h
index 07bd9b65..d7980688 100644
--- a/training/dtrain/dtrain.h
+++ b/training/dtrain/dtrain.h
@@ -64,7 +64,7 @@ struct LocalScorer
vector<score_t> w_;
virtual score_t
- Score(const vector<WordID>& hyp, const vector<WordID>& ref, const unsigned rank, const unsigned src_len)=0;
+ Score(const vector<WordID>& hyp, const vector<vector<WordID> >& ref, const unsigned rank, const unsigned src_len)=0;
virtual void Reset() {} // only for ApproxBleuScorer, LinearBleuScorer
@@ -88,11 +88,11 @@ struct LocalScorer
struct HypSampler : public DecoderObserver
{
LocalScorer* scorer_;
- vector<WordID>* ref_;
+ vector<vector<WordID> >* refs_;
unsigned f_count_, sz_;
virtual vector<ScoredHyp>* GetSamples()=0;
inline void SetScorer(LocalScorer* scorer) { scorer_ = scorer; }
- inline void SetRef(vector<WordID>& ref) { ref_ = &ref; }
+ inline void SetRef(vector<vector<WordID> >& refs) { refs_ = &refs; }
inline unsigned get_f_count() { return f_count_; }
inline unsigned get_sz() { return sz_; }
};
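For downstream code the only contract change in dtrain.h is the Score() signature: a scorer now receives all references for a segment instead of a single one. A hypothetical stub written against the interface above (ZeroOneScorer is illustrative and not part of the commit; it assumes the LocalScorer, score_t and WordID declarations from dtrain.h):

#include "dtrain.h" // for LocalScorer, score_t, WordID (see above)

// Hypothetical scorer against the new multi-reference interface:
// a hypothesis scores 1 if it exactly matches any reference, else 0.
struct ZeroOneScorer : public LocalScorer
{
  score_t Score(const vector<WordID>& hyp, const vector<vector<WordID> >& refs,
                const unsigned /*rank*/, const unsigned /*src_len*/)
  {
    for (auto& r: refs)
      if (hyp == r) return 1.;
    return 0.;
  }
  void Reset() {}
};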
diff --git a/training/dtrain/examples/standard/expected-output b/training/dtrain/examples/standard/expected-output
deleted file mode 100644
index 2460cfbb..00000000
--- a/training/dtrain/examples/standard/expected-output
+++ /dev/null
@@ -1,123 +0,0 @@
- cdec cfg './cdec.ini'
-Loading the LM will be faster if you build a binary file.
-Reading ./nc-wmt11.en.srilm.gz
-----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100
-****************************************************************************************************
- Example feature: Shape_S00000_T00000
-T=1 I=1 D=1
-Seeding random number sequence to 2327685089
-
-dtrain
-Parameters:
- k 100
- N 4
- T 3
- batch 0
- scorer 'fixed_stupid_bleu'
- sample from 'kbest'
- filter 'uniq'
- learning rate 0.1
- gamma 0
- loss margin 0
- faster perceptron 1
- pairs 'XYX'
- hi lo 0.1
- pair threshold 0
- select weights 'avg'
- l1 reg 0 'none'
- pclr no
- max pairs 4294967295
- repeat 1
- cdec cfg './cdec.ini'
- input './nc-wmt11.gz'
- output '-'
- stop_after 10
-(a dot represents 10 inputs)
-Iteration #1 of 3.
- . 10
-Stopping after 10 input sentences.
-WEIGHTS
- Glue = +6.9
- WordPenalty = -46.426
- LanguageModel = +535.12
- LanguageModel_OOV = -123.5
- PhraseModel_0 = -160.73
- PhraseModel_1 = -350.13
- PhraseModel_2 = -187.81
- PhraseModel_3 = +172.04
- PhraseModel_4 = +0.90108
- PhraseModel_5 = +21.6
- PhraseModel_6 = +67.2
- PassThrough = -149.7
- ---
- 1best avg score: 0.23327 (+0.23327)
- 1best avg model score: -9084.9 (-9084.9)
- avg # pairs: 780.7
- avg # rank err: 0 (meaningless)
- avg # margin viol: 0
- k-best loss imp: 100%
- non0 feature count: 1389
- avg list sz: 91.3
- avg f count: 146.2
-(time 0.37 min, 2.2 s/S)
-
-Iteration #2 of 3.
- . 10
-WEIGHTS
- Glue = -43
- WordPenalty = -22.019
- LanguageModel = +591.53
- LanguageModel_OOV = -252.1
- PhraseModel_0 = -120.21
- PhraseModel_1 = -43.589
- PhraseModel_2 = +73.53
- PhraseModel_3 = +113.7
- PhraseModel_4 = -223.81
- PhraseModel_5 = +64
- PhraseModel_6 = +54.8
- PassThrough = -331.1
- ---
- 1best avg score: 0.29568 (+0.062413)
- 1best avg model score: -15879 (-6794.1)
- avg # pairs: 566.1
- avg # rank err: 0 (meaningless)
- avg # margin viol: 0
- k-best loss imp: 100%
- non0 feature count: 1931
- avg list sz: 91.3
- avg f count: 139.89
-(time 0.33 min, 2 s/S)
-
-Iteration #3 of 3.
- . 10
-WEIGHTS
- Glue = -44.3
- WordPenalty = -131.85
- LanguageModel = +230.91
- LanguageModel_OOV = -285.4
- PhraseModel_0 = -194.27
- PhraseModel_1 = -294.83
- PhraseModel_2 = -92.043
- PhraseModel_3 = -140.24
- PhraseModel_4 = +85.613
- PhraseModel_5 = +238.1
- PhraseModel_6 = +158.7
- PassThrough = -359.6
- ---
- 1best avg score: 0.37375 (+0.078067)
- 1best avg model score: -14519 (+1359.7)
- avg # pairs: 545.4
- avg # rank err: 0 (meaningless)
- avg # margin viol: 0
- k-best loss imp: 100%
- non0 feature count: 2218
- avg list sz: 91.3
- avg f count: 137.77
-(time 0.35 min, 2.1 s/S)
-
-Writing weights file to '-' ...
-done
-
----
-Best iteration: 3 [SCORE 'fixed_stupid_bleu'=0.37375].
-This took 1.05 min.
diff --git a/training/dtrain/examples/standard/expected-output.gz b/training/dtrain/examples/standard/expected-output.gz
new file mode 100644
index 00000000..f93a253e
--- /dev/null
+++ b/training/dtrain/examples/standard/expected-output.gz
Binary files differ
diff --git a/training/dtrain/examples/standard/nc-wmt11.de.gz b/training/dtrain/examples/standard/nc-wmt11.de.gz
deleted file mode 100644
index 0741fd92..00000000
--- a/training/dtrain/examples/standard/nc-wmt11.de.gz
+++ /dev/null
Binary files differ
diff --git a/training/dtrain/examples/standard/nc-wmt11.en.gz b/training/dtrain/examples/standard/nc-wmt11.en.gz
deleted file mode 100644
index 1c0bd401..00000000
--- a/training/dtrain/examples/standard/nc-wmt11.en.gz
+++ /dev/null
Binary files differ
diff --git a/training/dtrain/examples/toy/dtrain.ini b/training/dtrain/examples/toy/dtrain.ini
index ef956df7..70c7331c 100644
--- a/training/dtrain/examples/toy/dtrain.ini
+++ b/training/dtrain/examples/toy/dtrain.ini
@@ -1,6 +1,5 @@
decoder_config=cdec.ini
-input=src
-refs=tgt
+bitext=in
output=-
print_weights=logp shell_rule house_rule small_rule little_rule PassThrough PassThrough_1 PassThrough_2 PassThrough_3 PassThrough_4 PassThrough_5 PassThrough_6
k=4
diff --git a/training/dtrain/examples/toy/expected-output b/training/dtrain/examples/toy/expected-output
index 1da2aadd..fbee24e3 100644
--- a/training/dtrain/examples/toy/expected-output
+++ b/training/dtrain/examples/toy/expected-output
@@ -1,26 +1,29 @@
Warning: hi_lo only works with pair_sampling XYX.
cdec cfg 'cdec.ini'
-Seeding random number sequence to 1664825829
+Seeding random number sequence to 3626026233
dtrain
Parameters:
k 4
N 4
T 2
+ batch 0
scorer 'bleu'
sample from 'kbest'
filter 'uniq'
learning rate 1
gamma 0
loss margin 0
+ faster perceptron 1
pairs 'all'
pair threshold 0
select weights 'last'
l1 reg 0 'none'
+ pclr no
max pairs 4294967295
+ repeat 1
cdec cfg 'cdec.ini'
- input 'src'
- refs 'tgt'
+ input ''
output '-'
(a dot represents 10 inputs)
Iteration #1 of 2.
@@ -32,12 +35,19 @@ WEIGHTS
small_rule = -2
little_rule = +3
PassThrough = -5
+ PassThrough_1 = +0
+ PassThrough_2 = +0
+ PassThrough_3 = +0
+ PassThrough_4 = +0
+ PassThrough_5 = +0
+ PassThrough_6 = +0
---
1best avg score: 0.5 (+0.5)
1best avg model score: 2.5 (+2.5)
- avg # pairs: 4
- avg # rank err: 1.5
+ avg # pairs: 1.5
+ avg # rank err: 1.5 (meaningless)
avg # margin viol: 0
+ k-best loss imp: 100%
non0 feature count: 6
avg list sz: 4
avg f count: 2.875
@@ -52,12 +62,19 @@ WEIGHTS
small_rule = -2
little_rule = +3
PassThrough = -5
+ PassThrough_1 = +0
+ PassThrough_2 = +0
+ PassThrough_3 = +0
+ PassThrough_4 = +0
+ PassThrough_5 = +0
+ PassThrough_6 = +0
---
1best avg score: 1 (+0.5)
1best avg model score: 5 (+2.5)
- avg # pairs: 5
- avg # rank err: 0
+ avg # pairs: 0
+ avg # rank err: 0 (meaningless)
avg # margin viol: 0
+ k-best loss imp: 100%
non0 feature count: 6
avg list sz: 4
avg f count: 3
diff --git a/training/dtrain/examples/toy/in b/training/dtrain/examples/toy/in
new file mode 100644
index 00000000..5d70795d
--- /dev/null
+++ b/training/dtrain/examples/toy/in
@@ -0,0 +1,2 @@
+ich sah ein kleines haus ||| i saw a little house
+ich fand ein kleines haus ||| i found a little house
diff --git a/training/dtrain/examples/toy/src b/training/dtrain/examples/toy/src
deleted file mode 100644
index 87e39ef2..00000000
--- a/training/dtrain/examples/toy/src
+++ /dev/null
@@ -1,2 +0,0 @@
-ich sah ein kleines haus
-ich fand ein kleines haus
diff --git a/training/dtrain/examples/toy/tgt b/training/dtrain/examples/toy/tgt
deleted file mode 100644
index 174926b3..00000000
--- a/training/dtrain/examples/toy/tgt
+++ /dev/null
@@ -1,2 +0,0 @@
-i saw a little house
-i found a little house
diff --git a/training/dtrain/kbestget.h b/training/dtrain/kbestget.h
index 85252db3..25f02273 100644
--- a/training/dtrain/kbestget.h
+++ b/training/dtrain/kbestget.h
@@ -52,7 +52,7 @@ struct KBestGetter : public HypSampler
h.f = d->feature_values;
h.model = log(d->score);
h.rank = i;
- h.score = scorer_->Score(h.w, *ref_, i, src_len_);
+ h.score = scorer_->Score(h.w, *refs_, i, src_len_);
s_.push_back(h);
sz_++;
f_count_ += h.f.size();
@@ -73,7 +73,7 @@ struct KBestGetter : public HypSampler
h.f = d->feature_values;
h.model = log(d->score);
h.rank = i;
- h.score = scorer_->Score(h.w, *ref_, i, src_len_);
+ h.score = scorer_->Score(h.w, *refs_, i, src_len_);
s_.push_back(h);
sz_++;
f_count_ += h.f.size();
diff --git a/training/dtrain/ksampler.h b/training/dtrain/ksampler.h
index 29dab667..9eedc74f 100644
--- a/training/dtrain/ksampler.h
+++ b/training/dtrain/ksampler.h
@@ -43,7 +43,7 @@ struct KSampler : public HypSampler
h.f = samples[i].fmap;
h.model = log(samples[i].model_score);
h.rank = i;
- h.score = scorer_->Score(h.w, *ref_, i, src_len_);
+ h.score = scorer_->Score(h.w, *refs_, i, src_len_);
s_.push_back(h);
sz_++;
f_count_ += h.f.size();
diff --git a/training/dtrain/parallelize.rb b/training/dtrain/parallelize.rb
index 5fc8b04e..fe3a6cf5 100755
--- a/training/dtrain/parallelize.rb
+++ b/training/dtrain/parallelize.rb
@@ -1,10 +1,11 @@
#!/usr/bin/env ruby
require 'trollop'
+require 'zipf'
def usage
STDERR.write "Usage: "
- STDERR.write "ruby parallelize.rb -c <dtrain.ini> [-e <epochs=10>] [--randomize/-z] [--reshard/-y] -s <#shards|0> [-p <at once=9999>] -i <input> -r <refs> [--qsub/-q] [--dtrain_binary <path to dtrain binary>] [-l \"l2 select_k 100000\"] [--extra_qsub \"-l mem_free=24G\"]\n"
+ STDERR.write "ruby parallelize.rb -c <dtrain.ini> [-e <epochs=10>] [--randomize/-z] [--reshard/-y] -s <#shards|0> [-p <at once=9999>] -i <input> [--qsub/-q] [--dtrain_binary <path to dtrain binary>] [-l \"l2 select_k 100000\"] [--extra_qsub \"-l mem_free=24G\"]\n"
exit 1
end
@@ -16,15 +17,14 @@ opts = Trollop::options do
opt :reshard, "reshard after each epoch", :type => :bool, :short => '-y', :default => false
opt :shards, "number of shards", :type => :int
opt :processes_at_once, "have this number (max) running at the same time", :type => :int, :default => 9999
- opt :input, "input", :type => :string
- opt :references, "references", :type => :string
+ opt :input, "input (bitext f ||| e ||| ...)", :type => :string
opt :qsub, "use qsub", :type => :bool, :default => false
opt :dtrain_binary, "path to dtrain binary", :type => :string
opt :extra_qsub, "extra qsub args", :type => :string, :default => ""
opt :per_shard_decoder_configs, "give special decoder config per shard", :type => :string, :short => '-o'
opt :first_input_weights, "input weights for first iter", :type => :string, :default => '', :short => '-w'
end
-usage if not opts[:config]&&opts[:shards]&&opts[:input]&&opts[:references]
+usage if not opts[:config]&&opts[:shards]&&opts[:input]
dtrain_dir = File.expand_path File.dirname(__FILE__)
if not opts[:dtrain_binary]
@@ -51,7 +51,6 @@ else
num_shards = opts[:shards]
end
input = opts[:input]
-refs = opts[:references]
use_qsub = opts[:qsub]
shards_at_once = opts[:processes_at_once]
first_input_weights = opts[:first_input_weights]
@@ -59,7 +58,7 @@ opts[:extra_qsub] = "-l #{opts[:extra_qsub]}" if opts[:extra_qsub]!=""
`mkdir work`
-def make_shards(input, refs, num_shards, epoch, rand)
+def make_shards(input, num_shards, epoch, rand)
lc = `wc -l #{input}`.split.first.to_i
index = (0..lc-1).to_a
index.reverse!
@@ -69,12 +68,8 @@ def make_shards(input, refs, num_shards, epoch, rand)
leftover = 0 if leftover < 0
in_f = File.new input, 'r'
in_lines = in_f.readlines
- refs_f = File.new refs, 'r'
- refs_lines = refs_f.readlines
shard_in_files = []
- shard_refs_files = []
in_fns = []
- refs_fns = []
new_num_shards = 0
0.upto(num_shards-1) { |shard|
break if index.size==0
@@ -82,41 +77,32 @@ def make_shards(input, refs, num_shards, epoch, rand)
in_fn = "work/shard.#{shard}.#{epoch}.in"
shard_in = File.new in_fn, 'w+'
in_fns << in_fn
- refs_fn = "work/shard.#{shard}.#{epoch}.refs"
- shard_refs = File.new refs_fn, 'w+'
- refs_fns << refs_fn
0.upto(shard_sz-1) { |i|
j = index.pop
break if !j
shard_in.write in_lines[j]
- shard_refs.write refs_lines[j]
}
shard_in_files << shard_in
- shard_refs_files << shard_refs
}
while leftover > 0
j = index.pop
shard_in_files[-1].write in_lines[j]
- shard_refs_files[-1].write refs_lines[j]
leftover -= 1
end
- (shard_in_files + shard_refs_files).each do |f| f.close end
+ shard_in_files.each do |f| f.close end
in_f.close
- refs_f.close
- return in_fns, refs_fns, new_num_shards
+ return in_fns, new_num_shards
end
input_files = []
-refs_files = []
if predefined_shards
input_files = File.new(input).readlines.map {|i| i.strip }
- refs_files = File.new(refs).readlines.map {|i| i.strip }
if per_shard_decoder_configs
decoder_configs = File.new(opts[:per_shard_decoder_configs]).readlines.map {|i| i.strip}
end
num_shards = input_files.size
else
- input_files, refs_files, num_shards = make_shards input, refs, num_shards, 0, rand
+ input_files, num_shards = make_shards input, num_shards, 0, rand
end
0.upto(epochs-1) { |epoch|
@@ -149,8 +135,7 @@ end
end
pids << Kernel.fork {
`#{qsub_str_start}#{dtrain_bin} -c #{ini} #{cdec_cfg} #{input_weights}\
- --input #{input_files[shard]}\
- --refs #{refs_files[shard]}\
+ --bitext #{input_files[shard]}\
--output work/weights.#{shard}.#{epoch}#{qsub_str_end} #{local_end}`
}
weights_files << "work/weights.#{shard}.#{epoch}"
@@ -163,7 +148,7 @@ end
`#{cat} work/weights.*.#{epoch} > work/weights_cat`
`#{ruby} #{lplp_rb} #{lplp_args} #{num_shards} < work/weights_cat > work/weights.#{epoch}`
if rand and reshard and epoch+1!=epochs
- input_files, refs_files, num_shards = make_shards input, refs, num_shards, epoch+1, rand
+ input_files, num_shards = make_shards input, num_shards, epoch+1, rand
end
}
diff --git a/training/dtrain/score.cc b/training/dtrain/score.cc
index 127f34d2..d81eafcb 100644
--- a/training/dtrain/score.cc
+++ b/training/dtrain/score.cc
@@ -31,13 +31,22 @@ BleuScorer::Bleu(NgramCounts& counts, const unsigned hyp_len, const unsigned ref
return brevity_penalty(hyp_len, ref_len) * exp(sum);
}
+size_t
+RefLen(vector<vector<WordID> > refs)
+{
+ size_t ref_len = 0;
+ for (auto r: refs)
+ ref_len = max(ref_len, r.size());
+ return ref_len;
+}
+
score_t
-BleuScorer::Score(const vector<WordID>& hyp, const vector<WordID>& ref,
+BleuScorer::Score(const vector<WordID>& hyp, const vector<vector<WordID> >& refs,
const unsigned /*rank*/, const unsigned /*src_len*/)
{
- unsigned hyp_len = hyp.size(), ref_len = ref.size();
+ unsigned hyp_len = hyp.size(), ref_len = RefLen(refs);
if (hyp_len == 0 || ref_len == 0) return 0.;
- NgramCounts counts = make_ngram_counts(hyp, ref, N_);
+ NgramCounts counts = make_ngram_counts(hyp, refs, N_);
return Bleu(counts, hyp_len, ref_len);
}
@@ -52,12 +61,12 @@ BleuScorer::Score(const vector<WordID>& hyp, const vector<WordID>& ref,
* NOTE: 0 iff no 1gram match ('grounded')
*/
score_t
-StupidBleuScorer::Score(const vector<WordID>& hyp, const vector<WordID>& ref,
+StupidBleuScorer::Score(const vector<WordID>& hyp, const vector<vector<WordID> >& refs,
const unsigned /*rank*/, const unsigned /*src_len*/)
{
- unsigned hyp_len = hyp.size(), ref_len = ref.size();
+ unsigned hyp_len = hyp.size(), ref_len = RefLen(refs);
if (hyp_len == 0 || ref_len == 0) return 0.;
- NgramCounts counts = make_ngram_counts(hyp, ref, N_);
+ NgramCounts counts = make_ngram_counts(hyp, refs, N_);
unsigned M = N_;
vector<score_t> v = w_;
if (ref_len < N_) {
@@ -81,12 +90,12 @@ StupidBleuScorer::Score(const vector<WordID>& hyp, const vector<WordID>& ref,
* (Nakov et al. '12)
*/
score_t
-FixedStupidBleuScorer::Score(const vector<WordID>& hyp, const vector<WordID>& ref,
+FixedStupidBleuScorer::Score(const vector<WordID>& hyp, const vector<vector<WordID> >& refs,
const unsigned /*rank*/, const unsigned /*src_len*/)
{
- unsigned hyp_len = hyp.size(), ref_len = ref.size();
+ unsigned hyp_len = hyp.size(), ref_len = RefLen(refs);
if (hyp_len == 0 || ref_len == 0) return 0.;
- NgramCounts counts = make_ngram_counts(hyp, ref, N_);
+ NgramCounts counts = make_ngram_counts(hyp, refs, N_);
unsigned M = N_;
vector<score_t> v = w_;
if (ref_len < N_) {
@@ -112,12 +121,12 @@ FixedStupidBleuScorer::Score(const vector<WordID>& hyp, const vector<WordID>& re
* NOTE: max is 0.9375 (with N=4)
*/
score_t
-SmoothBleuScorer::Score(const vector<WordID>& hyp, const vector<WordID>& ref,
+SmoothBleuScorer::Score(const vector<WordID>& hyp, const vector<vector<WordID> >& refs,
const unsigned /*rank*/, const unsigned /*src_len*/)
{
- unsigned hyp_len = hyp.size(), ref_len = ref.size();
+ unsigned hyp_len = hyp.size(), ref_len = RefLen(refs);
if (hyp_len == 0 || ref_len == 0) return 0.;
- NgramCounts counts = make_ngram_counts(hyp, ref, N_);
+ NgramCounts counts = make_ngram_counts(hyp, refs, N_);
unsigned M = N_;
if (ref_len < N_) M = ref_len;
score_t sum = 0.;
@@ -143,12 +152,12 @@ SmoothBleuScorer::Score(const vector<WordID>& hyp, const vector<WordID>& ref,
* sum up Ngram precisions
*/
score_t
-SumBleuScorer::Score(const vector<WordID>& hyp, const vector<WordID>& ref,
+SumBleuScorer::Score(const vector<WordID>& hyp, const vector<vector<WordID> >& refs,
const unsigned /*rank*/, const unsigned /*src_len*/)
{
- unsigned hyp_len = hyp.size(), ref_len = ref.size();
+ unsigned hyp_len = hyp.size(), ref_len = RefLen(refs);
if (hyp_len == 0 || ref_len == 0) return 0.;
- NgramCounts counts = make_ngram_counts(hyp, ref, N_);
+ NgramCounts counts = make_ngram_counts(hyp, refs, N_);
unsigned M = N_;
if (ref_len < N_) M = ref_len;
score_t sum = 0.;
@@ -167,12 +176,12 @@ SumBleuScorer::Score(const vector<WordID>& hyp, const vector<WordID>& ref,
* sum up exp(Ngram precisions)
*/
score_t
-SumExpBleuScorer::Score(const vector<WordID>& hyp, const vector<WordID>& ref,
+SumExpBleuScorer::Score(const vector<WordID>& hyp, const vector<vector<WordID> >& refs,
const unsigned /*rank*/, const unsigned /*src_len*/)
{
- unsigned hyp_len = hyp.size(), ref_len = ref.size();
+ unsigned hyp_len = hyp.size(), ref_len = RefLen(refs);
if (hyp_len == 0 || ref_len == 0) return 0.;
- NgramCounts counts = make_ngram_counts(hyp, ref, N_);
+ NgramCounts counts = make_ngram_counts(hyp, refs, N_);
unsigned M = N_;
if (ref_len < N_) M = ref_len;
score_t sum = 0.;
@@ -191,12 +200,12 @@ SumExpBleuScorer::Score(const vector<WordID>& hyp, const vector<WordID>& ref,
* sum up exp(weight * log(Ngram precisions))
*/
score_t
-SumWhateverBleuScorer::Score(const vector<WordID>& hyp, const vector<WordID>& ref,
+SumWhateverBleuScorer::Score(const vector<WordID>& hyp, const vector<vector<WordID> >& refs,
const unsigned /*rank*/, const unsigned /*src_len*/)
{
- unsigned hyp_len = hyp.size(), ref_len = ref.size();
+ unsigned hyp_len = hyp.size(), ref_len = RefLen(refs);
if (hyp_len == 0 || ref_len == 0) return 0.;
- NgramCounts counts = make_ngram_counts(hyp, ref, N_);
+ NgramCounts counts = make_ngram_counts(hyp, refs, N_);
unsigned M = N_;
vector<score_t> v = w_;
if (ref_len < N_) {
@@ -224,15 +233,15 @@ SumWhateverBleuScorer::Score(const vector<WordID>& hyp, const vector<WordID>& re
* No scaling by src len.
*/
score_t
-ApproxBleuScorer::Score(const vector<WordID>& hyp, const vector<WordID>& ref,
+ApproxBleuScorer::Score(const vector<WordID>& hyp, const vector<vector<WordID> >& refs,
const unsigned rank, const unsigned src_len)
{
- unsigned hyp_len = hyp.size(), ref_len = ref.size();
+ unsigned hyp_len = hyp.size(), ref_len = RefLen(refs);
if (ref_len == 0) return 0.;
score_t score = 0.;
NgramCounts counts(N_);
if (hyp_len > 0) {
- counts = make_ngram_counts(hyp, ref, N_);
+ counts = make_ngram_counts(hyp, refs, N_);
NgramCounts tmp = glob_onebest_counts_ + counts;
score = Bleu(tmp, hyp_len, ref_len);
}
@@ -255,16 +264,16 @@ ApproxBleuScorer::Score(const vector<WordID>& hyp, const vector<WordID>& ref,
*
*/
score_t
-LinearBleuScorer::Score(const vector<WordID>& hyp, const vector<WordID>& ref,
+LinearBleuScorer::Score(const vector<WordID>& hyp, const vector<vector<WordID> >& refs,
const unsigned rank, const unsigned /*src_len*/)
{
- unsigned hyp_len = hyp.size(), ref_len = ref.size();
+ unsigned hyp_len = hyp.size(), ref_len = RefLen(refs);
if (ref_len == 0) return 0.;
unsigned M = N_;
if (ref_len < N_) M = ref_len;
NgramCounts counts(M);
if (hyp_len > 0)
- counts = make_ngram_counts(hyp, ref, M);
+ counts = make_ngram_counts(hyp, refs, M);
score_t ret = 0.;
for (unsigned i = 0; i < M; i++) {
if (counts.sum_[i] == 0 || onebest_counts_.sum_[i] == 0) break;
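One design point worth noting in score.cc: RefLen() defines the effective reference length as the maximum over all references, whereas classic multi-reference BLEU variants often use the shortest or the closest reference length. A self-contained illustration of that convention (the const-reference parameter and the typedef are local simplifications; the committed version takes refs by value and uses cdec's WordID):

#include <algorithm>
#include <cstddef>
#include <iostream>
#include <vector>

typedef int WordID; // stand-in for cdec's WordID

// Effective reference length = maximum length over all references.
size_t RefLen(const std::vector<std::vector<WordID> >& refs)
{
  size_t ref_len = 0;
  for (const auto& r : refs)
    ref_len = std::max(ref_len, r.size());
  return ref_len;
}

int main()
{
  std::vector<std::vector<WordID> > refs = {
    {1, 2, 3, 4, 5},    // 5 tokens
    {1, 2, 3, 4, 5, 6}  // 6 tokens
  };
  std::cout << RefLen(refs) << "\n"; // prints 6
  return 0;
}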
diff --git a/training/dtrain/score.h b/training/dtrain/score.h
index 1cdd3fa9..7d88cb61 100644
--- a/training/dtrain/score.h
+++ b/training/dtrain/score.h
@@ -117,20 +117,25 @@ make_ngrams(const vector<WordID>& s, const unsigned N)
}
inline NgramCounts
-make_ngram_counts(const vector<WordID>& hyp, const vector<WordID>& ref, const unsigned N)
+make_ngram_counts(const vector<WordID>& hyp, const vector<vector<WordID> >& refs, const unsigned N)
{
Ngrams hyp_ngrams = make_ngrams(hyp, N);
- Ngrams ref_ngrams = make_ngrams(ref, N);
+ vector<Ngrams> refs_ngrams;
+ for (auto r: refs) {
+ Ngrams r_ng = make_ngrams(r, N);
+ refs_ngrams.push_back(r_ng);
+ }
NgramCounts counts(N);
Ngrams::iterator it;
Ngrams::iterator ti;
for (it = hyp_ngrams.begin(); it != hyp_ngrams.end(); it++) {
- ti = ref_ngrams.find(it->first);
- if (ti != ref_ngrams.end()) {
- counts.Add(it->second, ti->second, it->first.size() - 1);
- } else {
- counts.Add(it->second, 0, it->first.size() - 1);
+ unsigned max_ref_count = 0;
+ for (auto ref_ngrams: refs_ngrams) {
+ ti = ref_ngrams.find(it->first);
+ if (ti != ref_ngrams.end())
+ max_ref_count = max(max_ref_count, ti->second);
}
+ counts.Add(it->second, max_ref_count, it->first.size() - 1);
}
return counts;
}
@@ -138,43 +143,43 @@ make_ngram_counts(const vector<WordID>& hyp, const vector<WordID>& ref, const un
struct BleuScorer : public LocalScorer
{
score_t Bleu(NgramCounts& counts, const unsigned hyp_len, const unsigned ref_len);
- score_t Score(const vector<WordID>& hyp, const vector<WordID>& ref, const unsigned /*rank*/, const unsigned /*src_len*/);
+ score_t Score(const vector<WordID>& hyp, const vector<vector<WordID> >& refs, const unsigned /*rank*/, const unsigned /*src_len*/);
void Reset() {}
};
struct StupidBleuScorer : public LocalScorer
{
- score_t Score(const vector<WordID>& hyp, const vector<WordID>& ref, const unsigned /*rank*/, const unsigned /*src_len*/);
+ score_t Score(const vector<WordID>& hyp, const vector<vector<WordID> >& refs, const unsigned /*rank*/, const unsigned /*src_len*/);
void Reset() {}
};
struct FixedStupidBleuScorer : public LocalScorer
{
- score_t Score(const vector<WordID>& hyp, const vector<WordID>& ref, const unsigned /*rank*/, const unsigned /*src_len*/);
+ score_t Score(const vector<WordID>& hyp, const vector<vector<WordID> >& refs, const unsigned /*rank*/, const unsigned /*src_len*/);
void Reset() {}
};
struct SmoothBleuScorer : public LocalScorer
{
- score_t Score(const vector<WordID>& hyp, const vector<WordID>& ref, const unsigned /*rank*/, const unsigned /*src_len*/);
+ score_t Score(const vector<WordID>& hyp, const vector<vector<WordID> >& refs, const unsigned /*rank*/, const unsigned /*src_len*/);
void Reset() {}
};
struct SumBleuScorer : public LocalScorer
{
- score_t Score(const vector<WordID>& hyp, const vector<WordID>& ref, const unsigned /*rank*/, const unsigned /*src_len*/);
+ score_t Score(const vector<WordID>& hyp, const vector<vector<WordID> >& refs, const unsigned /*rank*/, const unsigned /*src_len*/);
void Reset() {}
};
struct SumExpBleuScorer : public LocalScorer
{
- score_t Score(const vector<WordID>& hyp, const vector<WordID>& ref, const unsigned /*rank*/, const unsigned /*src_len*/);
+ score_t Score(const vector<WordID>& hyp, const vector<vector<WordID> >& refs, const unsigned /*rank*/, const unsigned /*src_len*/);
void Reset() {}
};
struct SumWhateverBleuScorer : public LocalScorer
{
- score_t Score(const vector<WordID>& hyp, const vector<WordID>& ref, const unsigned /*rank*/, const unsigned /*src_len*/);
+ score_t Score(const vector<WordID>& hyp, const vector<vector<WordID> >& refs, const unsigned /*rank*/, const unsigned /*src_len*/);
void Reset() {};
};
@@ -194,7 +199,7 @@ struct ApproxBleuScorer : public BleuScorer
glob_hyp_len_ = glob_ref_len_ = glob_src_len_ = 0.;
}
- score_t Score(const vector<WordID>& hyp, const vector<WordID>& ref, const unsigned rank, const unsigned src_len);
+ score_t Score(const vector<WordID>& hyp, const vector<vector<WordID> >& refs, const unsigned rank, const unsigned src_len);
};
struct LinearBleuScorer : public BleuScorer
@@ -207,7 +212,7 @@ struct LinearBleuScorer : public BleuScorer
onebest_counts_.One();
}
- score_t Score(const vector<WordID>& hyp, const vector<WordID>& ref, const unsigned rank, const unsigned /*src_len*/);
+ score_t Score(const vector<WordID>& hyp, const vector<vector<WordID> >& refs, const unsigned rank, const unsigned /*src_len*/);
inline void Reset() {
onebest_len_ = 1;
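The make_ngram_counts() change above is the heart of the multi-reference support: each hypothesis n-gram is matched against its maximum count over all references, i.e. clipped multi-reference counting in the spirit of BLEU (Papineni et al., 2002). A self-contained sketch with simplified types (string tokens instead of WordID, unigrams only; names and the min() clipping shown explicitly here happen inside NgramCounts::Add in the real code):

#include <algorithm>
#include <iostream>
#include <map>
#include <string>
#include <vector>

typedef std::map<std::vector<std::string>, unsigned> Ngrams;

// Count unigrams of a token sequence.
Ngrams unigrams(const std::vector<std::string>& s)
{
  Ngrams ng;
  for (const auto& w : s) ng[{w}]++;
  return ng;
}

int main()
{
  std::vector<std::string> hyp = {"the", "the", "cat"};
  std::vector<std::vector<std::string> > refs = {
    {"the", "cat"}, {"the", "the", "mat"}
  };
  Ngrams hyp_ng = unigrams(hyp);
  std::vector<Ngrams> refs_ng;
  for (const auto& r : refs) refs_ng.push_back(unigrams(r));
  unsigned matched = 0, total = 0;
  for (const auto& it : hyp_ng) {
    unsigned max_ref_count = 0; // max count of this n-gram over all refs
    for (const auto& rn : refs_ng) {
      auto ti = rn.find(it.first);
      if (ti != rn.end())
        max_ref_count = std::max(max_ref_count, ti->second);
    }
    matched += std::min(it.second, max_ref_count); // clipped match count
    total   += it.second;
  }
  std::cout << matched << "/" << total << "\n"; // prints "3/3"
  return 0;
}

With a single reference {"the", "cat"} the doubled "the" would be clipped to one match (2/3); the second reference raises its allowed count to two, which is exactly what the max-over-references loop buys.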