author     Patrick Simianer <p@simianer.de>    2015-01-23 15:50:27 +0100
committer  Patrick Simianer <p@simianer.de>    2015-01-23 15:50:27 +0100
commit     32dea3f24e56ac7c17343457c48f750f16838742 (patch)
tree       79177b58cbff08c14991a0da8e851912b1c06309 /training/dtrain
parent     556dc935c7a2d8df78a35447d20d71b4bf6e391a (diff)
dtrain: multi-reference BLEU
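The training data is now a single bitext file with one tuple per line: the source segment followed by one or more references, all separated by ' ||| '. A minimal standalone sketch of how such a line decomposes (dtrain itself splits with boost::algorithm::split_regex, as the dtrain.cc hunk further down shows; the helper and the example sentences here are illustrative only, not part of this commit):

// split_fields: illustrative helper, not dtrain code. Decomposes a line
// of the form "src ||| tgt ||| tgt ||| ..." into source + references.
#include <iostream>
#include <string>
#include <vector>

std::vector<std::string> split_fields(const std::string& line)
{
  std::vector<std::string> fields;
  const std::string sep = " ||| ";
  size_t start = 0, end;
  while ((end = line.find(sep, start)) != std::string::npos) {
    fields.push_back(line.substr(start, end - start));
    start = end + sep.size();
  }
  fields.push_back(line.substr(start)); // last field has no trailing separator
  return fields;
}

int main()
{
  std::string line =
    "ich sah ein kleines haus ||| i saw a little house ||| i saw a small house";
  std::vector<std::string> fields = split_fields(line);
  std::string src = fields[0];                                     // source side goes to the decoder
  std::vector<std::string> refs(fields.begin() + 1, fields.end()); // one or more references
  std::cout << "src: " << src << "\n";
  for (const auto& r : refs)
    std::cout << "ref: " << r << "\n";
}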
Diffstat (limited to 'training/dtrain')
-rw-r--r--  training/dtrain/dtrain.cc                             |  67
-rw-r--r--  training/dtrain/dtrain.h                              |   6
-rw-r--r--  training/dtrain/examples/standard/expected-output     | 123
-rw-r--r--  training/dtrain/examples/standard/expected-output.gz  | bin 0 -> 625304 bytes
-rw-r--r--  training/dtrain/examples/standard/nc-wmt11.de.gz      | bin 58324 -> 0 bytes
-rw-r--r--  training/dtrain/examples/standard/nc-wmt11.en.gz      | bin 49600 -> 0 bytes
-rw-r--r--  training/dtrain/examples/toy/dtrain.ini               |   3
-rw-r--r--  training/dtrain/examples/toy/expected-output          |  31
-rw-r--r--  training/dtrain/examples/toy/in                       |   2
-rw-r--r--  training/dtrain/examples/toy/src                      |   2
-rw-r--r--  training/dtrain/examples/toy/tgt                      |   2
-rw-r--r--  training/dtrain/kbestget.h                            |   4
-rw-r--r--  training/dtrain/ksampler.h                            |   2
-rwxr-xr-x  training/dtrain/parallelize.rb                        |  35
-rw-r--r--  training/dtrain/score.cc                              |  63
-rw-r--r--  training/dtrain/score.h                               |  37
16 files changed, 123 insertions, 254 deletions
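The substance of the change is in make_ngram_counts (score.h, at the bottom of this diff): with several references, a hypothesis n-gram is now credited with the maximum count it attains in any single reference, which is the standard clipping rule for multi-reference BLEU. A simplified, self-contained sketch of that rule (std::string tokens instead of WordIDs and a plain std::map instead of dtrain's Ngrams typedef; both simplifications are mine):

#include <algorithm>
#include <iostream>
#include <map>
#include <string>
#include <vector>

using Ngrams = std::map<std::vector<std::string>, unsigned>;

// Collect counts of all n-grams of s for orders 1..N.
Ngrams make_ngrams(const std::vector<std::string>& s, unsigned N)
{
  Ngrams ngrams;
  for (size_t i = 0; i < s.size(); i++)
    for (unsigned n = 1; n <= N && i + n <= s.size(); n++)
      ngrams[std::vector<std::string>(s.begin() + i, s.begin() + i + n)]++;
  return ngrams;
}

// For each hypothesis n-gram, credit min(hyp count, max count in any
// single reference) -- the clipping rule adopted by make_ngram_counts.
unsigned clipped_matches(const Ngrams& hyp_ngrams,
                         const std::vector<Ngrams>& refs_ngrams)
{
  unsigned matches = 0;
  for (const auto& it : hyp_ngrams) {
    unsigned max_ref_count = 0;
    for (const auto& ref_ngrams : refs_ngrams) {    // max over all references
      auto ti = ref_ngrams.find(it.first);
      if (ti != ref_ngrams.end())
        max_ref_count = std::max(max_ref_count, ti->second);
    }
    matches += std::min(it.second, max_ref_count);  // clip by that maximum
  }
  return matches;
}

int main()
{
  std::vector<std::string> hyp = {"i", "saw", "a", "little", "house"};
  std::vector<std::vector<std::string>> refs = {
    {"i", "saw", "a", "small", "house"},
    {"i", "saw", "a", "little", "house"}};
  std::vector<Ngrams> refs_ngrams;
  for (const auto& r : refs) refs_ngrams.push_back(make_ngrams(r, 4));
  std::cout << clipped_matches(make_ngrams(hyp, 4), refs_ngrams) << "\n";
}

On this toy pair all 14 hypothesis n-grams occur in the second reference, so nothing is clipped and the program prints 14.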
diff --git a/training/dtrain/dtrain.cc b/training/dtrain/dtrain.cc
index 823a50de..737326f8 100644
--- a/training/dtrain/dtrain.cc
+++ b/training/dtrain/dtrain.cc
@@ -12,9 +12,7 @@ dtrain_init(int argc, char** argv, po::variables_map* cfg)
 {
   po::options_description ini("Configuration File Options");
   ini.add_options()
-    ("input",          po::value<string>(), "input file (src)")
-    ("refs,r",         po::value<string>(), "references")
-    ("bitext,b",       po::value<string>(), "bitext: 'src ||| tgt'")
+    ("bitext,b",       po::value<string>(), "bitext: 'src ||| tgt ||| tgt ||| ...'")
     ("output",         po::value<string>()->default_value("-"), "output weights file, '-' for STDOUT")
     ("input_weights",  po::value<string>(), "input weights file (e.g. from previous iteration)")
     ("decoder_config", po::value<string>(), "configuration file for cdec")
@@ -84,8 +82,8 @@ dtrain_init(int argc, char** argv, po::variables_map* cfg)
     cerr << "hi_lo must lie in [0.01, 0.5]" << endl;
     return false;
   }
-  if ((cfg->count("input")>0 || cfg->count("refs")>0) && cfg->count("bitext")>0) {
-    cerr << "Provide 'input' and 'refs' or 'bitext', not both." << endl;
+  if (!cfg->count("bitext")) {
+    cerr << "No training data given." << endl;
     return false;
   }
   if ((*cfg)["pair_threshold"].as<score_t>() < 0) {
@@ -221,24 +219,11 @@ main(int argc, char** argv)
   // output
   string output_fn = cfg["output"].as<string>();
   // input
-  bool read_bitext = false;
   string input_fn;
-  if (cfg.count("bitext")) {
-    read_bitext = true;
-    input_fn = cfg["bitext"].as<string>();
-  } else {
-    input_fn = cfg["input"].as<string>();
-  }
-  ReadFile input(input_fn);
+  ReadFile input(cfg["bitext"].as<string>());
   // buffer input for t > 0
   vector<string> src_str_buf; // source strings (decoder takes only strings)
-  vector<vector<WordID> > ref_ids_buf; // references as WordID vecs
-  ReadFile refs;
-  string refs_fn;
-  if (!read_bitext) {
-    refs_fn = cfg["refs"].as<string>();
-    refs.Init(refs_fn);
-  }
+  vector<vector<vector<WordID> > > refs_as_ids_buf; // references as WordID vecs
   unsigned in_sz = std::numeric_limits<unsigned>::max(); // input index, input size
   vector<pair<score_t, score_t> > all_scores;
@@ -280,8 +265,6 @@ main(int argc, char** argv)
   //cerr << setw(25) << "test k-best " << test_k_best << endl;
   cerr << setw(25) << "cdec cfg " << "'" << cfg["decoder_config"].as<string>() << "'" << endl;
   cerr << setw(25) << "input " << "'" << input_fn << "'" << endl;
-  if (!read_bitext)
-    cerr << setw(25) << "refs " << "'" << refs_fn << "'" << endl;
   cerr << setw(25) << "output " << "'" << output_fn << "'" << endl;
   if (cfg.count("input_weights"))
     cerr << setw(25) << "weights in " << "'" << cfg["input_weights"].as<string>() << "'" << endl;
@@ -311,17 +294,13 @@ main(int argc, char** argv)
   {
     string in;
-    vector<string> ref;
+    vector<string> refs;
     bool next = false, stop = false; // next iteration or premature stop
     if (t == 0) {
       if(!getline(*input, in)) next = true;
-      if(read_bitext) {
-        vector<string> strs;
-        boost::algorithm::split_regex(strs, in, boost::regex(" \\|\\|\\| "));
-        in = strs[0];
-        strs.erase(strs.begin());
-        ref = strs;
-      }
+      boost::algorithm::split_regex(refs, in, boost::regex(" \\|\\|\\| "));
+      in = refs[0];
+      refs.erase(refs.begin());
     } else {
       if (ii == in_sz) next = true; // stop if we reach the end of our input
     }
@@ -356,20 +335,19 @@ main(int argc, char** argv)
     lambdas.init_vector(&decoder_weights);

     // getting input
-    vector<WordID> ref_ids; // reference as vector<WordID>
     if (t == 0) {
-      if (!read_bitext) {
-        getline(*refs, ref);
+      vector<vector<WordID> > cur_refs;
+      for (auto r: refs) {
+        vector<WordID> cur_ref;
+        vector<string> tok;
+        boost::split(tok, r, boost::is_any_of(" "));
+        register_and_convert(tok, cur_ref);
+        cur_refs.push_back(cur_ref);
       }
-      vector<string> ref_tok;
-      boost::split(ref_tok, ref, boost::is_any_of(" "));
-      register_and_convert(ref_tok, ref_ids);
-      ref_ids_buf.push_back(ref_ids);
+      refs_as_ids_buf.push_back(cur_refs);
       src_str_buf.push_back(in);
-    } else {
-      ref_ids = ref_ids_buf[ii];
     }
-    observer->SetRef(ref_ids);
+    observer->SetRef(refs_as_ids_buf[ii]);
     if (t == 0)
       decoder.Decode(in, observer);
     else
@@ -379,10 +357,11 @@ main(int argc, char** argv)
     vector<ScoredHyp>* samples = observer->GetSamples();

     if (verbose) {
-      cerr << "--- ref for " << ii << ": ";
-      if (t > 0) printWordIDVec(ref_ids_buf[ii]);
-      else printWordIDVec(ref_ids);
-      cerr << endl;
+      cerr << "--- refs for " << ii << ": ";
+      for (auto r: refs_as_ids_buf[ii]) {
+        printWordIDVec(r);
+        cerr << endl;
+      }
       for (unsigned u = 0; u < samples->size(); u++) {
         cerr << _p2 << _np << "[" << u << ". '";
         printWordIDVec((*samples)[u].w);
diff --git a/training/dtrain/dtrain.h b/training/dtrain/dtrain.h
index 07bd9b65..d7980688 100644
--- a/training/dtrain/dtrain.h
+++ b/training/dtrain/dtrain.h
@@ -64,7 +64,7 @@ struct LocalScorer
   vector<score_t> w_;

   virtual score_t
-  Score(const vector<WordID>& hyp, const vector<WordID>& ref, const unsigned rank, const unsigned src_len)=0;
+  Score(const vector<WordID>& hyp, const vector<vector<WordID> >& ref, const unsigned rank, const unsigned src_len)=0;

   virtual void
   Reset() {} // only for ApproxBleuScorer, LinearBleuScorer
@@ -88,11 +88,11 @@ struct LocalScorer
 struct HypSampler : public DecoderObserver
 {
   LocalScorer* scorer_;
-  vector<WordID>* ref_;
+  vector<vector<WordID> >* refs_;
   unsigned f_count_, sz_;
   virtual vector<ScoredHyp>* GetSamples()=0;
   inline void SetScorer(LocalScorer* scorer) { scorer_ = scorer; }
-  inline void SetRef(vector<WordID>& ref) { ref_ = &ref; }
+  inline void SetRef(vector<vector<WordID> >& refs) { refs_ = &refs; }
   inline unsigned get_f_count() { return f_count_; }
   inline unsigned get_sz() { return sz_; }
 };
diff --git a/training/dtrain/examples/standard/expected-output b/training/dtrain/examples/standard/expected-output
deleted file mode 100644
index 2460cfbb..00000000
--- a/training/dtrain/examples/standard/expected-output
+++ /dev/null
@@ -1,123 +0,0 @@
-  cdec cfg './cdec.ini'
-Loading the LM will be faster if you build a binary file.
-Reading ./nc-wmt11.en.srilm.gz
-----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100
-****************************************************************************************************
-  Example feature: Shape_S00000_T00000
-T=1 I=1 D=1
-Seeding random number sequence to 2327685089
-
-dtrain
-Parameters:
-                 k 100
-                 N 4
-                 T 3
-             batch 0
-            scorer 'fixed_stupid_bleu'
-       sample from 'kbest'
-            filter 'uniq'
-     learning rate 0.1
-             gamma 0
-       loss margin 0
- faster perceptron 1
-             pairs 'XYX'
-             hi lo 0.1
-    pair threshold 0
-    select weights 'avg'
-            l1 reg 0 'none'
-              pclr no
-         max pairs 4294967295
-            repeat 1
-          cdec cfg './cdec.ini'
-             input './nc-wmt11.gz'
-            output '-'
-        stop_after 10
-(a dot represents 10 inputs)
-Iteration #1 of 3.
- . 10
-Stopping after 10 input sentences.
-WEIGHTS
-              Glue = +6.9
-       WordPenalty = -46.426
-     LanguageModel = +535.12
- LanguageModel_OOV = -123.5
-     PhraseModel_0 = -160.73
-     PhraseModel_1 = -350.13
-     PhraseModel_2 = -187.81
-     PhraseModel_3 = +172.04
-     PhraseModel_4 = +0.90108
-     PhraseModel_5 = +21.6
-     PhraseModel_6 = +67.2
-       PassThrough = -149.7
-        ---
-       1best avg score: 0.23327 (+0.23327)
- 1best avg model score: -9084.9 (-9084.9)
-           avg # pairs: 780.7
-        avg # rank err: 0 (meaningless)
-     avg # margin viol: 0
-       k-best loss imp: 100%
-    non0 feature count: 1389
-           avg list sz: 91.3
-           avg f count: 146.2
-(time 0.37 min, 2.2 s/S)
-
-Iteration #2 of 3.
- . 10
-WEIGHTS
-              Glue = -43
-       WordPenalty = -22.019
-     LanguageModel = +591.53
- LanguageModel_OOV = -252.1
-     PhraseModel_0 = -120.21
-     PhraseModel_1 = -43.589
-     PhraseModel_2 = +73.53
-     PhraseModel_3 = +113.7
-     PhraseModel_4 = -223.81
-     PhraseModel_5 = +64
-     PhraseModel_6 = +54.8
-       PassThrough = -331.1
-        ---
-       1best avg score: 0.29568 (+0.062413)
- 1best avg model score: -15879 (-6794.1)
-           avg # pairs: 566.1
-        avg # rank err: 0 (meaningless)
-     avg # margin viol: 0
-       k-best loss imp: 100%
-    non0 feature count: 1931
-           avg list sz: 91.3
-           avg f count: 139.89
-(time 0.33 min, 2 s/S)
-
-Iteration #3 of 3.
- . 10
-WEIGHTS
-              Glue = -44.3
-       WordPenalty = -131.85
-     LanguageModel = +230.91
- LanguageModel_OOV = -285.4
-     PhraseModel_0 = -194.27
-     PhraseModel_1 = -294.83
-     PhraseModel_2 = -92.043
-     PhraseModel_3 = -140.24
-     PhraseModel_4 = +85.613
-     PhraseModel_5 = +238.1
-     PhraseModel_6 = +158.7
-       PassThrough = -359.6
-        ---
-       1best avg score: 0.37375 (+0.078067)
- 1best avg model score: -14519 (+1359.7)
-           avg # pairs: 545.4
-        avg # rank err: 0 (meaningless)
-     avg # margin viol: 0
-       k-best loss imp: 100%
-    non0 feature count: 2218
-           avg list sz: 91.3
-           avg f count: 137.77
-(time 0.35 min, 2.1 s/S)
-
-Writing weights file to '-' ...
-done
-
----
-Best iteration: 3 [SCORE 'fixed_stupid_bleu'=0.37375].
-This took 1.05 min.
diff --git a/training/dtrain/examples/standard/expected-output.gz b/training/dtrain/examples/standard/expected-output.gz
new file mode 100644
index 00000000..f93a253e
--- /dev/null
+++ b/training/dtrain/examples/standard/expected-output.gz
Binary files differ
diff --git a/training/dtrain/examples/standard/nc-wmt11.de.gz b/training/dtrain/examples/standard/nc-wmt11.de.gz
deleted file mode 100644
index 0741fd92..00000000
--- a/training/dtrain/examples/standard/nc-wmt11.de.gz
+++ /dev/null
Binary files differ
diff --git a/training/dtrain/examples/standard/nc-wmt11.en.gz b/training/dtrain/examples/standard/nc-wmt11.en.gz
deleted file mode 100644
index 1c0bd401..00000000
--- a/training/dtrain/examples/standard/nc-wmt11.en.gz
+++ /dev/null
Binary files differ
diff --git a/training/dtrain/examples/toy/dtrain.ini b/training/dtrain/examples/toy/dtrain.ini
index ef956df7..70c7331c 100644
--- a/training/dtrain/examples/toy/dtrain.ini
+++ b/training/dtrain/examples/toy/dtrain.ini
@@ -1,6 +1,5 @@
 decoder_config=cdec.ini
-input=src
-refs=tgt
+bitext=in
 output=-
 print_weights=logp shell_rule house_rule small_rule little_rule PassThrough PassThrough_1 PassThrough_2 PassThrough_3 PassThrough_4 PassThrough_5 PassThrough_6
 k=4
diff --git a/training/dtrain/examples/toy/expected-output b/training/dtrain/examples/toy/expected-output
index 1da2aadd..fbee24e3 100644
--- a/training/dtrain/examples/toy/expected-output
+++ b/training/dtrain/examples/toy/expected-output
@@ -1,26 +1,29 @@
 Warning: hi_lo only works with pair_sampling XYX.
   cdec cfg 'cdec.ini'
-Seeding random number sequence to 1664825829
+Seeding random number sequence to 3626026233

 dtrain
 Parameters:
                  k 4
                  N 4
                  T 2
+             batch 0
             scorer 'bleu'
        sample from 'kbest'
             filter 'uniq'
      learning rate 1
              gamma 0
        loss margin 0
+ faster perceptron 1
              pairs 'all'
     pair threshold 0
     select weights 'last'
             l1 reg 0 'none'
+              pclr no
          max pairs 4294967295
+            repeat 1
           cdec cfg 'cdec.ini'
-             input 'src'
-              refs 'tgt'
+             input ''
             output '-'
 (a dot represents 10 inputs)
 Iteration #1 of 2.
@@ -32,12 +35,19 @@ WEIGHTS
       small_rule = -2
      little_rule = +3
      PassThrough = -5
+   PassThrough_1 = +0
+   PassThrough_2 = +0
+   PassThrough_3 = +0
+   PassThrough_4 = +0
+   PassThrough_5 = +0
+   PassThrough_6 = +0
 ---
       1best avg score: 0.5 (+0.5)
 1best avg model score: 2.5 (+2.5)
-          avg # pairs: 4
-       avg # rank err: 1.5
+          avg # pairs: 1.5
+       avg # rank err: 1.5 (meaningless)
     avg # margin viol: 0
+      k-best loss imp: 100%
    non0 feature count: 6
           avg list sz: 4
           avg f count: 2.875
@@ -52,12 +62,19 @@ WEIGHTS
       small_rule = -2
      little_rule = +3
      PassThrough = -5
+   PassThrough_1 = +0
+   PassThrough_2 = +0
+   PassThrough_3 = +0
+   PassThrough_4 = +0
+   PassThrough_5 = +0
+   PassThrough_6 = +0
 ---
       1best avg score: 1 (+0.5)
 1best avg model score: 5 (+2.5)
-          avg # pairs: 5
-       avg # rank err: 0
+          avg # pairs: 0
+       avg # rank err: 0 (meaningless)
     avg # margin viol: 0
+      k-best loss imp: 100%
    non0 feature count: 6
           avg list sz: 4
           avg f count: 3
diff --git a/training/dtrain/examples/toy/in b/training/dtrain/examples/toy/in
new file mode 100644
index 00000000..5d70795d
--- /dev/null
+++ b/training/dtrain/examples/toy/in
@@ -0,0 +1,2 @@
+ich sah ein kleines haus ||| i saw a little house
+ich fand ein kleines haus ||| i found a little house
diff --git a/training/dtrain/examples/toy/src b/training/dtrain/examples/toy/src
deleted file mode 100644
index 87e39ef2..00000000
--- a/training/dtrain/examples/toy/src
+++ /dev/null
@@ -1,2 +0,0 @@
-ich sah ein kleines haus
-ich fand ein kleines haus
diff --git a/training/dtrain/examples/toy/tgt b/training/dtrain/examples/toy/tgt
deleted file mode 100644
index 174926b3..00000000
--- a/training/dtrain/examples/toy/tgt
+++ /dev/null
@@ -1,2 +0,0 @@
-i saw a little house
-i found a little house
diff --git a/training/dtrain/kbestget.h b/training/dtrain/kbestget.h
index 85252db3..25f02273 100644
--- a/training/dtrain/kbestget.h
+++ b/training/dtrain/kbestget.h
@@ -52,7 +52,7 @@ struct KBestGetter : public HypSampler
       h.f = d->feature_values;
       h.model = log(d->score);
       h.rank = i;
-      h.score = scorer_->Score(h.w, *ref_, i, src_len_);
+      h.score = scorer_->Score(h.w, *refs_, i, src_len_);
       s_.push_back(h);
       sz_++;
       f_count_ += h.f.size();
@@ -73,7 +73,7 @@ struct KBestGetter : public HypSampler
       h.f = d->feature_values;
       h.model = log(d->score);
       h.rank = i;
-      h.score = scorer_->Score(h.w, *ref_, i, src_len_);
+      h.score = scorer_->Score(h.w, *refs_, i, src_len_);
       s_.push_back(h);
       sz_++;
       f_count_ += h.f.size();
diff --git a/training/dtrain/ksampler.h b/training/dtrain/ksampler.h
index 29dab667..9eedc74f 100644
--- a/training/dtrain/ksampler.h
+++ b/training/dtrain/ksampler.h
@@ -43,7 +43,7 @@ struct KSampler : public HypSampler
       h.f = samples[i].fmap;
       h.model = log(samples[i].model_score);
       h.rank = i;
-      h.score = scorer_->Score(h.w, *ref_, i, src_len_);
+      h.score = scorer_->Score(h.w, *refs_, i, src_len_);
       s_.push_back(h);
       sz_++;
       f_count_ += h.f.size();
diff --git a/training/dtrain/parallelize.rb b/training/dtrain/parallelize.rb
index 5fc8b04e..fe3a6cf5 100755
--- a/training/dtrain/parallelize.rb
+++ b/training/dtrain/parallelize.rb
@@ -1,10 +1,11 @@
 #!/usr/bin/env ruby

 require 'trollop'
+require 'zipf'

 def usage
   STDERR.write "Usage: "
-  STDERR.write "ruby parallelize.rb -c <dtrain.ini> [-e <epochs=10>] [--randomize/-z] [--reshard/-y] -s <#shards|0> [-p <at once=9999>] -i <input> -r <refs> [--qsub/-q] [--dtrain_binary <path to dtrain binary>] [-l \"l2 select_k 100000\"] [--extra_qsub \"-l mem_free=24G\"]\n"
+  STDERR.write "ruby parallelize.rb -c <dtrain.ini> [-e <epochs=10>] [--randomize/-z] [--reshard/-y] -s <#shards|0> [-p <at once=9999>] -i <input> [--qsub/-q] [--dtrain_binary <path to dtrain binary>] [-l \"l2 select_k 100000\"] [--extra_qsub \"-l mem_free=24G\"]\n"
   exit 1
 end

@@ -16,15 +17,14 @@ opts = Trollop::options do
   opt :reshard, "reshard after each epoch", :type => :bool, :short => '-y', :default => false
   opt :shards, "number of shards", :type => :int
   opt :processes_at_once, "have this number (max) running at the same time", :type => :int, :default => 9999
-  opt :input, "input", :type => :string
-  opt :references, "references", :type => :string
+  opt :input, "input (bitext f ||| e ||| ...)", :type => :string
   opt :qsub, "use qsub", :type => :bool, :default => false
   opt :dtrain_binary, "path to dtrain binary", :type => :string
   opt :extra_qsub, "extra qsub args", :type => :string, :default => ""
   opt :per_shard_decoder_configs, "give special decoder config per shard", :type => :string, :short => '-o'
   opt :first_input_weights, "input weights for first iter", :type => :string, :default => '', :short => '-w'
 end
-usage if not opts[:config]&&opts[:shards]&&opts[:input]&&opts[:references]
+usage if not opts[:config]&&opts[:shards]&&opts[:input]

 dtrain_dir = File.expand_path File.dirname(__FILE__)
 if not opts[:dtrain_binary]
@@ -51,7 +51,6 @@ else
   num_shards = opts[:shards]
 end
 input = opts[:input]
-refs = opts[:references]
 use_qsub = opts[:qsub]
 shards_at_once = opts[:processes_at_once]
 first_input_weights = opts[:first_input_weights]
@@ -59,7 +58,7 @@ opts[:extra_qsub] = "-l #{opts[:extra_qsub]}" if opts[:extra_qsub]!=""

 `mkdir work`

-def make_shards(input, refs, num_shards, epoch, rand)
+def make_shards(input, num_shards, epoch, rand)
   lc = `wc -l #{input}`.split.first.to_i
   index = (0..lc-1).to_a
   index.reverse!
@@ -69,12 +68,8 @@ def make_shards(input, num_shards, epoch, rand)
   leftover = 0 if leftover < 0
   in_f = File.new input, 'r'
   in_lines = in_f.readlines
-  refs_f = File.new refs, 'r'
-  refs_lines = refs_f.readlines
   shard_in_files = []
-  shard_refs_files = []
   in_fns = []
-  refs_fns = []
   new_num_shards = 0
   0.upto(num_shards-1) { |shard|
     break if index.size==0
@@ -82,41 +77,32 @@ def make_shards(input, num_shards, epoch, rand)
     in_fn = "work/shard.#{shard}.#{epoch}.in"
     shard_in = File.new in_fn, 'w+'
     in_fns << in_fn
-    refs_fn = "work/shard.#{shard}.#{epoch}.refs"
-    shard_refs = File.new refs_fn, 'w+'
-    refs_fns << refs_fn
     0.upto(shard_sz-1) { |i|
       j = index.pop
       break if !j
       shard_in.write in_lines[j]
-      shard_refs.write refs_lines[j]
     }
     shard_in_files << shard_in
-    shard_refs_files << shard_refs
   }
   while leftover > 0
     j = index.pop
     shard_in_files[-1].write in_lines[j]
-    shard_refs_files[-1].write refs_lines[j]
     leftover -= 1
   end
-  (shard_in_files + shard_refs_files).each do |f| f.close end
+  shard_in_files.each do |f| f.close end
   in_f.close
-  refs_f.close
-  return in_fns, refs_fns, new_num_shards
+  return in_fns, new_num_shards
 end

 input_files = []
-refs_files = []
 if predefined_shards
   input_files = File.new(input).readlines.map {|i| i.strip }
-  refs_files = File.new(refs).readlines.map {|i| i.strip }
   if per_shard_decoder_configs
     decoder_configs = File.new(opts[:per_shard_decoder_configs]).readlines.map {|i| i.strip}
   end
   num_shards = input_files.size
 else
-  input_files, refs_files, num_shards = make_shards input, refs, num_shards, 0, rand
+  input_files, num_shards = make_shards input, num_shards, 0, rand
 end

 0.upto(epochs-1) { |epoch|
@@ -149,8 +135,7 @@ end
       pids << Kernel.fork {
         `#{qsub_str_start}#{dtrain_bin} -c #{ini} #{cdec_cfg} #{input_weights}\
-        --input #{input_files[shard]}\
-        --refs #{refs_files[shard]}\
+        --bitext #{input_files[shard]}\
         --output work/weights.#{shard}.#{epoch}#{qsub_str_end} #{local_end}`
       }
       weights_files << "work/weights.#{shard}.#{epoch}"
@@ -163,7 +148,7 @@ end
   `#{cat} work/weights.*.#{epoch} > work/weights_cat`
   `#{ruby} #{lplp_rb} #{lplp_args} #{num_shards} < work/weights_cat > work/weights.#{epoch}`
   if rand and reshard and epoch+1!=epochs
-    input_files, refs_files, num_shards = make_shards input, refs, num_shards, epoch+1, rand
+    input_files, num_shards = make_shards input, num_shards, epoch+1, rand
   end
 }
diff --git a/training/dtrain/score.cc b/training/dtrain/score.cc
index 127f34d2..d81eafcb 100644
--- a/training/dtrain/score.cc
+++ b/training/dtrain/score.cc
@@ -31,13 +31,22 @@ BleuScorer::Bleu(NgramCounts& counts, const unsigned hyp_len, const unsigned ref
   return brevity_penalty(hyp_len, ref_len) * exp(sum);
 }

+size_t
+RefLen(vector<vector<WordID> > refs)
+{
+  size_t ref_len = 0;
+  for (auto r: refs)
+    ref_len = max(ref_len, r.size());
+  return ref_len;
+}
+
 score_t
-BleuScorer::Score(const vector<WordID>& hyp, const vector<WordID>& ref,
+BleuScorer::Score(const vector<WordID>& hyp, const vector<vector<WordID> >& refs,
                   const unsigned /*rank*/, const unsigned /*src_len*/)
 {
-  unsigned hyp_len = hyp.size(), ref_len = ref.size();
+  unsigned hyp_len = hyp.size(), ref_len = RefLen(refs);
   if (hyp_len == 0 || ref_len == 0) return 0.;
-  NgramCounts counts = make_ngram_counts(hyp, ref, N_);
+  NgramCounts counts = make_ngram_counts(hyp, refs, N_);
   return Bleu(counts, hyp_len, ref_len);
 }

@@ -52,12 +61,12 @@ BleuScorer::Score(const vector<WordID>& hyp, const vector<WordID>& ref,
  * NOTE: 0 iff no 1gram match ('grounded')
  */
 score_t
-StupidBleuScorer::Score(const vector<WordID>& hyp, const vector<WordID>& ref,
+StupidBleuScorer::Score(const vector<WordID>& hyp, const vector<vector<WordID> >& refs,
                         const unsigned /*rank*/, const unsigned /*src_len*/)
 {
-  unsigned hyp_len = hyp.size(), ref_len = ref.size();
+  unsigned hyp_len = hyp.size(), ref_len = RefLen(refs);
   if (hyp_len == 0 || ref_len == 0) return 0.;
-  NgramCounts counts = make_ngram_counts(hyp, ref, N_);
+  NgramCounts counts = make_ngram_counts(hyp, refs, N_);
   unsigned M = N_;
   vector<score_t> v = w_;
   if (ref_len < N_) {
@@ -81,12 +90,12 @@ StupidBleuScorer::Score(const vector<WordID>& hyp, const vector<WordID>& ref,
  * (Nakov et al. '12)
  */
 score_t
-FixedStupidBleuScorer::Score(const vector<WordID>& hyp, const vector<WordID>& ref,
+FixedStupidBleuScorer::Score(const vector<WordID>& hyp, const vector<vector<WordID> >& refs,
                              const unsigned /*rank*/, const unsigned /*src_len*/)
 {
-  unsigned hyp_len = hyp.size(), ref_len = ref.size();
+  unsigned hyp_len = hyp.size(), ref_len = RefLen(refs);
   if (hyp_len == 0 || ref_len == 0) return 0.;
-  NgramCounts counts = make_ngram_counts(hyp, ref, N_);
+  NgramCounts counts = make_ngram_counts(hyp, refs, N_);
   unsigned M = N_;
   vector<score_t> v = w_;
   if (ref_len < N_) {
@@ -112,12 +121,12 @@ FixedStupidBleuScorer::Score(const vector<WordID>& hyp, const vector<WordID>& re
  * NOTE: max is 0.9375 (with N=4)
  */
 score_t
-SmoothBleuScorer::Score(const vector<WordID>& hyp, const vector<WordID>& ref,
+SmoothBleuScorer::Score(const vector<WordID>& hyp, const vector<vector<WordID> >& refs,
                         const unsigned /*rank*/, const unsigned /*src_len*/)
 {
-  unsigned hyp_len = hyp.size(), ref_len = ref.size();
+  unsigned hyp_len = hyp.size(), ref_len = RefLen(refs);
   if (hyp_len == 0 || ref_len == 0) return 0.;
-  NgramCounts counts = make_ngram_counts(hyp, ref, N_);
+  NgramCounts counts = make_ngram_counts(hyp, refs, N_);
   unsigned M = N_;
   if (ref_len < N_) M = ref_len;
   score_t sum = 0.;
@@ -143,12 +152,12 @@ SmoothBleuScorer::Score(const vector<WordID>& hyp, const vector<WordID>& ref,
  * sum up Ngram precisions
  */
 score_t
-SumBleuScorer::Score(const vector<WordID>& hyp, const vector<WordID>& ref,
+SumBleuScorer::Score(const vector<WordID>& hyp, const vector<vector<WordID> >& refs,
                      const unsigned /*rank*/, const unsigned /*src_len*/)
 {
-  unsigned hyp_len = hyp.size(), ref_len = ref.size();
+  unsigned hyp_len = hyp.size(), ref_len = RefLen(refs);
   if (hyp_len == 0 || ref_len == 0) return 0.;
-  NgramCounts counts = make_ngram_counts(hyp, ref, N_);
+  NgramCounts counts = make_ngram_counts(hyp, refs, N_);
   unsigned M = N_;
   if (ref_len < N_) M = ref_len;
   score_t sum = 0.;
@@ -167,12 +176,12 @@ SumBleuScorer::Score(const vector<WordID>& hyp, const vector<WordID>& ref,
  * sum up exp(Ngram precisions)
  */
 score_t
-SumExpBleuScorer::Score(const vector<WordID>& hyp, const vector<WordID>& ref,
+SumExpBleuScorer::Score(const vector<WordID>& hyp, const vector<vector<WordID> >& refs,
                         const unsigned /*rank*/, const unsigned /*src_len*/)
 {
-  unsigned hyp_len = hyp.size(), ref_len = ref.size();
+  unsigned hyp_len = hyp.size(), ref_len = RefLen(refs);
   if (hyp_len == 0 || ref_len == 0) return 0.;
-  NgramCounts counts = make_ngram_counts(hyp, ref, N_);
+  NgramCounts counts = make_ngram_counts(hyp, refs, N_);
   unsigned M = N_;
   if (ref_len < N_) M = ref_len;
   score_t sum = 0.;
@@ -191,12 +200,12 @@ SumExpBleuScorer::Score(const vector<WordID>& hyp, const vector<WordID>& ref,
  * sum up exp(weight * log(Ngram precisions))
  */
 score_t
-SumWhateverBleuScorer::Score(const vector<WordID>& hyp, const vector<WordID>& ref,
+SumWhateverBleuScorer::Score(const vector<WordID>& hyp, const vector<vector<WordID> >& refs,
                              const unsigned /*rank*/, const unsigned /*src_len*/)
 {
-  unsigned hyp_len = hyp.size(), ref_len = ref.size();
+  unsigned hyp_len = hyp.size(), ref_len = RefLen(refs);
   if (hyp_len == 0 || ref_len == 0) return 0.;
-  NgramCounts counts = make_ngram_counts(hyp, ref, N_);
+  NgramCounts counts = make_ngram_counts(hyp, refs, N_);
   unsigned M = N_;
   vector<score_t> v = w_;
   if (ref_len < N_) {
@@ -224,15 +233,15 @@ SumWhateverBleuScorer::Score(const vector<WordID>& hyp, const vector<WordID>& re
  * No scaling by src len.
  */
 score_t
-ApproxBleuScorer::Score(const vector<WordID>& hyp, const vector<WordID>& ref,
+ApproxBleuScorer::Score(const vector<WordID>& hyp, const vector<vector<WordID> >& refs,
                         const unsigned rank, const unsigned src_len)
 {
-  unsigned hyp_len = hyp.size(), ref_len = ref.size();
+  unsigned hyp_len = hyp.size(), ref_len = RefLen(refs);
   if (ref_len == 0) return 0.;
   score_t score = 0.;
   NgramCounts counts(N_);
   if (hyp_len > 0) {
-    counts = make_ngram_counts(hyp, ref, N_);
+    counts = make_ngram_counts(hyp, refs, N_);
     NgramCounts tmp = glob_onebest_counts_ + counts;
     score = Bleu(tmp, hyp_len, ref_len);
   }
@@ -255,16 +264,16 @@ ApproxBleuScorer::Score(const vector<WordID>& hyp, const vector<WordID>& ref,
  *
 */
 score_t
-LinearBleuScorer::Score(const vector<WordID>& hyp, const vector<WordID>& ref,
+LinearBleuScorer::Score(const vector<WordID>& hyp, const vector<vector<WordID> >& refs,
                         const unsigned rank, const unsigned /*src_len*/)
 {
-  unsigned hyp_len = hyp.size(), ref_len = ref.size();
+  unsigned hyp_len = hyp.size(), ref_len = RefLen(refs);
   if (ref_len == 0) return 0.;
   unsigned M = N_;
   if (ref_len < N_) M = ref_len;
   NgramCounts counts(M);
   if (hyp_len > 0)
-    counts = make_ngram_counts(hyp, ref, M);
+    counts = make_ngram_counts(hyp, refs, M);
   score_t ret = 0.;
   for (unsigned i = 0; i < M; i++) {
     if (counts.sum_[i] == 0 || onebest_counts_.sum_[i] == 0) break;
diff --git a/training/dtrain/score.h b/training/dtrain/score.h
index 1cdd3fa9..7d88cb61 100644
--- a/training/dtrain/score.h
+++ b/training/dtrain/score.h
@@ -117,20 +117,25 @@ make_ngrams(const vector<WordID>& s, const unsigned N)
 }

 inline NgramCounts
-make_ngram_counts(const vector<WordID>& hyp, const vector<WordID>& ref, const unsigned N)
+make_ngram_counts(const vector<WordID>& hyp, const vector<vector<WordID> >& refs, const unsigned N)
 {
   Ngrams hyp_ngrams = make_ngrams(hyp, N);
-  Ngrams ref_ngrams = make_ngrams(ref, N);
+  vector<Ngrams> refs_ngrams;
+  for (auto r: refs) {
+    Ngrams r_ng = make_ngrams(r, N);
+    refs_ngrams.push_back(r_ng);
+  }
   NgramCounts counts(N);
   Ngrams::iterator it;
   Ngrams::iterator ti;
   for (it = hyp_ngrams.begin(); it != hyp_ngrams.end(); it++) {
-    ti = ref_ngrams.find(it->first);
-    if (ti != ref_ngrams.end()) {
-      counts.Add(it->second, ti->second, it->first.size() - 1);
-    } else {
-      counts.Add(it->second, 0, it->first.size() - 1);
+    unsigned max_ref_count = 0;
+    for (auto ref_ngrams: refs_ngrams) {
+      ti = ref_ngrams.find(it->first);
+      if (ti != ref_ngrams.end())
+        max_ref_count = max(max_ref_count, ti->second);
     }
+    counts.Add(it->second, max_ref_count, it->first.size() - 1);
   }
   return counts;
 }

@@ -138,43 +143,43 @@ make_ngram_counts(const vector<WordID>& hyp, const vector<WordID>& ref, const un
 struct BleuScorer : public LocalScorer
 {
   score_t Bleu(NgramCounts& counts, const unsigned hyp_len, const unsigned ref_len);
-  score_t Score(const vector<WordID>& hyp, const vector<WordID>& ref, const unsigned /*rank*/, const unsigned /*src_len*/);
+  score_t Score(const vector<WordID>& hyp, const vector<vector<WordID> >& refs, const unsigned /*rank*/, const unsigned /*src_len*/);
   void Reset() {}
 };

 struct StupidBleuScorer : public LocalScorer
 {
-  score_t Score(const vector<WordID>& hyp, const vector<WordID>& ref, const unsigned /*rank*/, const unsigned /*src_len*/);
+  score_t Score(const vector<WordID>& hyp, const vector<vector<WordID> >& refs, const unsigned /*rank*/, const unsigned /*src_len*/);
   void Reset() {}
 };

 struct FixedStupidBleuScorer : public LocalScorer
 {
-  score_t Score(const vector<WordID>& hyp, const vector<WordID>& ref, const unsigned /*rank*/, const unsigned /*src_len*/);
+  score_t Score(const vector<WordID>& hyp, const vector<vector<WordID> >& refs, const unsigned /*rank*/, const unsigned /*src_len*/);
   void Reset() {}
 };

 struct SmoothBleuScorer : public LocalScorer
 {
-  score_t Score(const vector<WordID>& hyp, const vector<WordID>& ref, const unsigned /*rank*/, const unsigned /*src_len*/);
+  score_t Score(const vector<WordID>& hyp, const vector<vector<WordID> >& refs, const unsigned /*rank*/, const unsigned /*src_len*/);
   void Reset() {}
 };

 struct SumBleuScorer : public LocalScorer
 {
-  score_t Score(const vector<WordID>& hyp, const vector<WordID>& ref, const unsigned /*rank*/, const unsigned /*src_len*/);
+  score_t Score(const vector<WordID>& hyp, const vector<vector<WordID> >& refs, const unsigned /*rank*/, const unsigned /*src_len*/);
   void Reset() {}
 };

 struct SumExpBleuScorer : public LocalScorer
 {
-  score_t Score(const vector<WordID>& hyp, const vector<WordID>& ref, const unsigned /*rank*/, const unsigned /*src_len*/);
+  score_t Score(const vector<WordID>& hyp, const vector<vector<WordID> >& refs, const unsigned /*rank*/, const unsigned /*src_len*/);
   void Reset() {}
 };

 struct SumWhateverBleuScorer : public LocalScorer
 {
-  score_t Score(const vector<WordID>& hyp, const vector<WordID>& ref, const unsigned /*rank*/, const unsigned /*src_len*/);
+  score_t Score(const vector<WordID>& hyp, const vector<vector<WordID> >& refs, const unsigned /*rank*/, const unsigned /*src_len*/);
   void Reset() {};
 };

@@ -194,7 +199,7 @@ struct ApproxBleuScorer : public BleuScorer
     glob_hyp_len_ = glob_ref_len_ = glob_src_len_ = 0.;
   }

-  score_t Score(const vector<WordID>& hyp, const vector<WordID>& ref, const unsigned rank, const unsigned src_len);
+  score_t Score(const vector<WordID>& hyp, const vector<vector<WordID> >& refs, const unsigned rank, const unsigned src_len);
 };

 struct LinearBleuScorer : public BleuScorer
@@ -207,7 +212,7 @@ struct LinearBleuScorer : public BleuScorer
     onebest_counts_.One();
   }

-  score_t Score(const vector<WordID>& hyp, const vector<WordID>& ref, const unsigned rank, const unsigned /*src_len*/);
+  score_t Score(const vector<WordID>& hyp, const vector<vector<WordID> >& refs, const unsigned rank, const unsigned /*src_len*/);

   inline void Reset() {
     onebest_len_ = 1;
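A design choice worth flagging: RefLen (score.cc) takes the length of the longest reference as the reference length for the brevity penalty, whereas multi-reference BLEU implementations more commonly use the shortest or the closest reference length, so short hypotheses are penalized somewhat harder here. A sketch of how the two pieces interact (simplified types, with int standing in for WordID; brevity_penalty below is the textbook formula, since cdec's own implementation is not shown in this diff):

#include <algorithm>
#include <cmath>
#include <iostream>
#include <vector>

// Reference length = longest reference, mirroring the new RefLen in score.cc.
size_t ref_len_max(const std::vector<std::vector<int> >& refs)
{
  size_t ref_len = 0;
  for (const auto& r : refs)
    ref_len = std::max(ref_len, r.size());
  return ref_len;
}

// Textbook BLEU brevity penalty: 1 if the hypothesis is long enough,
// exp(1 - ref_len/hyp_len) otherwise.
double brevity_penalty(size_t hyp_len, size_t ref_len)
{
  if (hyp_len >= ref_len) return 1.0;
  return std::exp(1.0 - static_cast<double>(ref_len) / hyp_len);
}

int main()
{
  // Two references of lengths 5 and 7 (token IDs are dummies).
  std::vector<std::vector<int> > refs = {{1, 2, 3, 4, 5},
                                         {1, 2, 3, 4, 5, 6, 7}};
  size_t hyp_len = 5;
  // With the longest reference (7) the penalty bites (exp(-0.4) ~ 0.67);
  // with the closest reference (5) it would be 1.0.
  std::cout << brevity_penalty(hyp_len, ref_len_max(refs)) << "\n";
}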