From c398cef915ea7037c91066b6bfc19d915cac498b Mon Sep 17 00:00:00 2001
From: Patrick Simianer
Date: Tue, 10 Sep 2013 18:20:16 +0200
Subject: simple pclr

---
 training/dtrain/dtrain.cc | 25 ++++++++++++++++++++-----
 1 file changed, 20 insertions(+), 5 deletions(-)

diff --git a/training/dtrain/dtrain.cc b/training/dtrain/dtrain.cc
index 0ee2f124..34c0a54a 100644
--- a/training/dtrain/dtrain.cc
+++ b/training/dtrain/dtrain.cc
@@ -40,6 +40,7 @@ dtrain_init(int argc, char** argv, po::variables_map* cfg)
     ("scale_bleu_diff", po::value()->zero_tokens(), "learning rate <- bleu diff of a misranked pair")
     ("loss_margin", po::value()->default_value(0.), "update if no error in pref pair but model scores this near")
     ("max_pairs", po::value()->default_value(std::numeric_limits::max()), "max. # of pairs per Sent.")
+    ("pclr", po::value()->zero_tokens(), "use a (simple) per-coordinate learning rate")
     ("noup", po::value()->zero_tokens(), "do not update weights");
   po::options_description cl("Command Line Options");
   cl.add_options()
@@ -124,6 +125,8 @@ main(int argc, char** argv)
   if (loss_margin > 9998.)
     loss_margin = std::numeric_limits::max();
   bool scale_bleu_diff = false;
   if (cfg.count("scale_bleu_diff")) scale_bleu_diff = true;
+  bool pclr = false;
+  if (cfg.count("pclr")) pclr = true;
   bool average = false;
   if (select_weights == "avg") average = true;
@@ -131,7 +134,6 @@ main(int argc, char** argv)
   if (cfg.count("print_weights"))
     boost::split(print_weights, cfg["print_weights"].as(), boost::is_any_of(" "));

-  // setup decoder
   register_feature_functions();
   SetSilent(true);

@@ -249,6 +251,8 @@ main(int argc, char** argv)
     cerr << setw(25) << "l1 reg " << l1_reg << " '" << cfg["l1_reg"].as() << "'" << endl;
     if (rescale)
       cerr << setw(25) << "rescale " << rescale << endl;
+    if (pclr)
+      cerr << setw(25) << "pclr " << pclr << endl;
     cerr << setw(25) << "max pairs " << max_pairs << endl;
     cerr << setw(25) << "cdec cfg " << "'" << cfg["decoder_config"].as() << "'" << endl;
     cerr << setw(25) << "input " << "'" << input_fn << "'" << endl;
@@ -261,6 +265,8 @@ main(int argc, char** argv)
     if (!verbose)
       cerr << "(a dot represents " << DTRAIN_DOTS << " inputs)" << endl;
   }
+  // pclr
+  SparseVector learning_rates;

   for (unsigned t = 0; t < T; t++) // T epochs
   {
@@ -385,7 +391,16 @@ main(int argc, char** argv)
         if (scale_bleu_diff) eta = it->first.score - it->second.score;
         if (rank_error || margin < loss_margin) {
           SparseVector diff_vec = it->first.f - it->second.f;
-          lambdas.plus_eq_v_times_s(diff_vec, eta);
+          if (pclr) {
+            SparseVector::iterator jt = diff_vec.begin();
+            for (; jt != diff_vec.end(); ++it) {
+              jt->second *= max(0.0000001, eta/(eta+learning_rates[jt->first])); // FIXME
+              learning_rates[jt->first]++;
+            }
+            lambdas += diff_vec;
+          } else {
+            lambdas.plus_eq_v_times_s(diff_vec, eta);
+          }
           if (gamma)
             lambdas.plus_eq_v_times_s(lambdas, -2*gamma*eta*(1./npairs));
         }
@@ -395,14 +410,14 @@ main(int argc, char** argv)
       // please note that this regularizations happen
       // after a _sentence_ -- not after each example/pair!
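      // The three branches below differ only in how the penalty is applied;
      // each is restricted (via the lambdas_copy comparison) to features this
      // sentence's update actually touched: 'naive' subtracts sign(w) * l1_reg
      // outright; 'clip' takes the same step but clips at zero, so the penalty
      // alone cannot flip a weight's sign; 'cumul' follows the
      // cumulative-penalty scheme of Tsuruoka et al. (2009), charging each
      // feature the accumulated penalty (ii+1) * l1_reg less what it has
      // already paid.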
if (l1naive) { - FastSparseVector::iterator it = lambdas.begin(); + SparseVector::iterator it = lambdas.begin(); for (; it != lambdas.end(); ++it) { if (!lambdas_copy.get(it->first) || lambdas_copy.get(it->first)!=it->second) { it->second -= sign(it->second) * l1_reg; } } } else if (l1clip) { - FastSparseVector::iterator it = lambdas.begin(); + SparseVector::iterator it = lambdas.begin(); for (; it != lambdas.end(); ++it) { if (!lambdas_copy.get(it->first) || lambdas_copy.get(it->first)!=it->second) { if (it->second != 0) { @@ -417,7 +432,7 @@ main(int argc, char** argv) } } else if (l1cumul) { weight_t acc_penalty = (ii+1) * l1_reg; // ii is the index of the current input - FastSparseVector::iterator it = lambdas.begin(); + SparseVector::iterator it = lambdas.begin(); for (; it != lambdas.end(); ++it) { if (!lambdas_copy.get(it->first) || lambdas_copy.get(it->first)!=it->second) { if (it->second != 0) { -- cgit v1.2.3 From 451d0c7e865cdea9da6a0fb747782886b49eeeef Mon Sep 17 00:00:00 2001 From: Patrick Simianer Date: Tue, 10 Sep 2013 19:54:40 +0200 Subject: do pclr after sentences.. --- training/dtrain/dtrain.cc | 36 ++++++++++++++++++++-------- training/dtrain/examples/standard/dtrain.ini | 3 ++- 2 files changed, 28 insertions(+), 11 deletions(-) diff --git a/training/dtrain/dtrain.cc b/training/dtrain/dtrain.cc index 34c0a54a..2d090666 100644 --- a/training/dtrain/dtrain.cc +++ b/training/dtrain/dtrain.cc @@ -372,7 +372,8 @@ main(int argc, char** argv) PROsampling(samples, pairs, pair_threshold, max_pairs); npairs += pairs.size(); - SparseVector lambdas_copy; + SparseVector lambdas_copy; // for l1 regularization + SparseVector sum_up; // for pclr if (l1naive||l1clip||l1cumul) lambdas_copy = lambdas; for (vector >::iterator it = pairs.begin(); @@ -392,20 +393,24 @@ main(int argc, char** argv) if (rank_error || margin < loss_margin) { SparseVector diff_vec = it->first.f - it->second.f; if (pclr) { - SparseVector::iterator jt = diff_vec.begin(); - for (; jt != diff_vec.end(); ++it) { - jt->second *= max(0.0000001, eta/(eta+learning_rates[jt->first])); // FIXME - learning_rates[jt->first]++; - } - lambdas += diff_vec; - } else { - lambdas.plus_eq_v_times_s(diff_vec, eta); - } + sum_up += diff_vec; + } else { + lambdas.plus_eq_v_times_s(diff_vec, eta); + } if (gamma) lambdas.plus_eq_v_times_s(lambdas, -2*gamma*eta*(1./npairs)); } } + // per-coordinate learning rate + if (pclr) { + SparseVector::iterator it = sum_up.begin(); + for (; it != lambdas.end(); ++it) { + lambdas[it->first] += it->second * max(0.00000001, eta/(eta+learning_rates[it->first])); + learning_rates[it->first]++; + } + } + // l1 regularization // please note that this regularizations happen // after a _sentence_ -- not after each example/pair! 
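Taken together, the two patches above implement a per-coordinate learning rate: feature f is updated with step size eta / (eta + n_f), where n_f counts how often f has been touched, and the second patch merely defers the scaled update until the sentence's pairs have been summed into sum_up. A minimal standalone sketch of that update, with std::map standing in for cdec's SparseVector<weight_t> and all names illustrative:

#include <algorithm>
#include <map>

// One pclr step: apply the summed pair differences of a sentence (sum_up)
// with per-feature rate eta / (eta + n_f), then bump each n_f.
void pclr_update(std::map<int, double>& lambdas,
                 std::map<int, double>& update_counts,  // n_f per feature id
                 const std::map<int, double>& sum_up,
                 double eta) {
  for (std::map<int, double>::const_iterator it = sum_up.begin();
       it != sum_up.end(); ++it) {
    const double rate = std::max(1e-7, eta / (eta + update_counts[it->first]));
    lambdas[it->first] += it->second * rate;
    update_counts[it->first] += 1.0;
  }
}

Note the loop bounds in the hunks above: the first version advances the enclosing pair iterator (++it) instead of jt, and the per-sentence version iterates sum_up while testing against lambdas.end(); the "pclr variants" patch later in this series corrects the bound to sum_up.end() and adds an "adagrad" option keyed on accumulated squared gradients.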
@@ -413,6 +418,8 @@ main(int argc, char** argv) SparseVector::iterator it = lambdas.begin(); for (; it != lambdas.end(); ++it) { if (!lambdas_copy.get(it->first) || lambdas_copy.get(it->first)!=it->second) { + it->second *= max(0.0000001, eta/(eta+learning_rates[it->first])); // FIXME + learning_rates[it->first]++; it->second -= sign(it->second) * l1_reg; } } @@ -530,6 +537,15 @@ main(int argc, char** argv) Weights::WriteToFile(w_fn, dense_weights, true); } + WriteFile of("-"); + ostream& o = *of.stream(); + o << "<<<<<<<<<<<<<<<<<<<<<<<<\n"; + for (SparseVector::iterator it = learning_rates.begin(); it != learning_rates.end(); ++it) { + if (it->second == 0) continue; + o << FD::Convert(it->first) << '\t' << it->second << endl; + } + o << ">>>>>>>>>>>>>>>>>>>>>>>>>\n"; + } // outer loop if (average) w_average /= (weight_t)T; diff --git a/training/dtrain/examples/standard/dtrain.ini b/training/dtrain/examples/standard/dtrain.ini index 23e94285..07350a0b 100644 --- a/training/dtrain/examples/standard/dtrain.ini +++ b/training/dtrain/examples/standard/dtrain.ini @@ -1,6 +1,6 @@ input=./nc-wmt11.de.gz refs=./nc-wmt11.en.gz -output=- # a weights file (add .gz for gzip compression) or STDOUT '-' +output=asdf # a weights file (add .gz for gzip compression) or STDOUT '-' select_weights=VOID # output average (over epochs) weight vector decoder_config=./cdec.ini # config for cdec # weights for these features will be printed on each iteration @@ -22,3 +22,4 @@ pair_sampling=XYX # hi_lo=0.1 # 10 vs 80 vs 10 and 80 vs 10 here pair_threshold=0 # minimum distance in BLEU (here: > 0) loss_margin=0 # update if correctly ranked, but within this margin +pclr=1 -- cgit v1.2.3 From 04784fd692c58e9bb45708b07f33831c9888903b Mon Sep 17 00:00:00 2001 From: Patrick Simianer Date: Tue, 10 Sep 2013 20:03:22 +0200 Subject: rm debug stuff --- training/dtrain/dtrain.cc | 9 --------- training/dtrain/examples/standard/dtrain.ini | 1 - 2 files changed, 10 deletions(-) diff --git a/training/dtrain/dtrain.cc b/training/dtrain/dtrain.cc index 2d090666..5dfd6286 100644 --- a/training/dtrain/dtrain.cc +++ b/training/dtrain/dtrain.cc @@ -537,15 +537,6 @@ main(int argc, char** argv) Weights::WriteToFile(w_fn, dense_weights, true); } - WriteFile of("-"); - ostream& o = *of.stream(); - o << "<<<<<<<<<<<<<<<<<<<<<<<<\n"; - for (SparseVector::iterator it = learning_rates.begin(); it != learning_rates.end(); ++it) { - if (it->second == 0) continue; - o << FD::Convert(it->first) << '\t' << it->second << endl; - } - o << ">>>>>>>>>>>>>>>>>>>>>>>>>\n"; - } // outer loop if (average) w_average /= (weight_t)T; diff --git a/training/dtrain/examples/standard/dtrain.ini b/training/dtrain/examples/standard/dtrain.ini index 07350a0b..c0912a62 100644 --- a/training/dtrain/examples/standard/dtrain.ini +++ b/training/dtrain/examples/standard/dtrain.ini @@ -22,4 +22,3 @@ pair_sampling=XYX # hi_lo=0.1 # 10 vs 80 vs 10 and 80 vs 10 here pair_threshold=0 # minimum distance in BLEU (here: > 0) loss_margin=0 # update if correctly ranked, but within this margin -pclr=1 -- cgit v1.2.3 From b2891df4bc4429fbeec503279fd19e7fafe04a24 Mon Sep 17 00:00:00 2001 From: Patrick Simianer Date: Tue, 17 Sep 2013 21:18:27 +0200 Subject: separate inis for shards --- training/dtrain/dtrain.cc | 2 +- training/dtrain/parallelize.rb | 13 ++++++++++++- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/training/dtrain/dtrain.cc b/training/dtrain/dtrain.cc index 2d090666..4521e794 100644 --- a/training/dtrain/dtrain.cc +++ b/training/dtrain/dtrain.cc @@ -406,8 
+406,8 @@ main(int argc, char** argv) if (pclr) { SparseVector::iterator it = sum_up.begin(); for (; it != lambdas.end(); ++it) { - lambdas[it->first] += it->second * max(0.00000001, eta/(eta+learning_rates[it->first])); learning_rates[it->first]++; + lambdas[it->first] += it->second / learning_rates[it->first]; //* max(0.00000001, eta/(eta+learning_rates[it->first])); } } diff --git a/training/dtrain/parallelize.rb b/training/dtrain/parallelize.rb index 285f3c9b..66a61b3d 100755 --- a/training/dtrain/parallelize.rb +++ b/training/dtrain/parallelize.rb @@ -21,6 +21,7 @@ opts = Trollop::options do opt :qsub, "use qsub", :type => :bool, :default => false opt :dtrain_binary, "path to dtrain binary", :type => :string opt :extra_qsub, "extra qsub args", :type => :string, :default => "" + opt :per_shard_decoder_configs, "give special decoder config per shard", :type => string end usage if not opts[:config]&&opts[:shards]&&opts[:input]&&opts[:references] @@ -41,9 +42,11 @@ epochs = opts[:epochs] rand = opts[:randomize] reshard = opts[:reshard] predefined_shards = false +per_shard_decoder_configs = false if opts[:shards] == 0 predefined_shards = true num_shards = 0 + per_shard_decoder_configs = true if opts[:per_shard_decoder_configs] else num_shards = opts[:shards] end @@ -101,6 +104,9 @@ refs_files = [] if predefined_shards input_files = File.new(input).readlines.map {|i| i.strip } refs_files = File.new(refs).readlines.map {|i| i.strip } + if per_shard_decoder_configs + decoder_configs = File.new(opts[:per_shard_decoder_configs]).readlines.map {|i| i.strip} + end num_shards = input_files.size else input_files, refs_files = make_shards input, refs, num_shards, 0, rand @@ -126,8 +132,13 @@ end else local_end = "2>work/out.#{shard}.#{epoch}" end + if per_shard_decoder_configs + cdec_cfg = "--decoder_config #{decoder_configs[shard]}" + else + cdec_cfg = "" + end pids << Kernel.fork { - `#{qsub_str_start}#{dtrain_bin} -c #{ini}\ + `#{qsub_str_start}#{dtrain_bin} -c #{ini} #{cdec_cfg}\ --input #{input_files[shard]}\ --refs #{refs_files[shard]} #{input_weights}\ --output work/weights.#{shard}.#{epoch}#{qsub_str_end} #{local_end}` -- cgit v1.2.3 From 2e746d6ad25aaf4d85f9c8f277ff109e45bfd93e Mon Sep 17 00:00:00 2001 From: Patrick Simianer Date: Fri, 20 Sep 2013 20:01:03 +0200 Subject: loo --- extractor/grammar_extractor.cc | 6 ++++-- extractor/grammar_extractor.h | 3 ++- extractor/rule_factory.cc | 5 +++-- extractor/rule_factory.h | 3 ++- extractor/run_extractor.cc | 13 +++++++++++-- extractor/sample_alignment.txt | 3 +++ extractor/sample_bitext.txt | 3 +++ extractor/sampler.cc | 35 ++++++++++++++++++++++++++++++----- extractor/sampler.h | 5 ++++- 9 files changed, 62 insertions(+), 14 deletions(-) diff --git a/extractor/grammar_extractor.cc b/extractor/grammar_extractor.cc index 8050ce7b..1fbdee5b 100644 --- a/extractor/grammar_extractor.cc +++ b/extractor/grammar_extractor.cc @@ -3,11 +3,13 @@ #include #include #include +#include #include "grammar.h" #include "rule.h" #include "rule_factory.h" #include "vocabulary.h" +#include "data_array.h" using namespace std; @@ -32,10 +34,10 @@ GrammarExtractor::GrammarExtractor( vocabulary(vocabulary), rule_factory(rule_factory) {} -Grammar GrammarExtractor::GetGrammar(const string& sentence) { +Grammar GrammarExtractor::GetGrammar(const string& sentence, const unordered_set blacklisted_sentence_ids, const shared_ptr source_data_array) { vector words = TokenizeSentence(sentence); vector word_ids = AnnotateWords(words); - return rule_factory->GetGrammar(word_ids); + 
return rule_factory->GetGrammar(word_ids, blacklisted_sentence_ids, source_data_array); } vector GrammarExtractor::TokenizeSentence(const string& sentence) { diff --git a/extractor/grammar_extractor.h b/extractor/grammar_extractor.h index b36ceeb9..6c0aafbf 100644 --- a/extractor/grammar_extractor.h +++ b/extractor/grammar_extractor.h @@ -4,6 +4,7 @@ #include #include #include +#include using namespace std; @@ -44,7 +45,7 @@ class GrammarExtractor { // Converts the sentence to a vector of word ids and uses the RuleFactory to // extract the SCFG rules which may be used to decode the sentence. - Grammar GetGrammar(const string& sentence); + Grammar GetGrammar(const string& sentence, const unordered_set blacklisted_sentence_ids, const shared_ptr source_data_array); private: // Splits the sentence in a vector of words. diff --git a/extractor/rule_factory.cc b/extractor/rule_factory.cc index 8c30fb9e..e52019ae 100644 --- a/extractor/rule_factory.cc +++ b/extractor/rule_factory.cc @@ -17,6 +17,7 @@ #include "suffix_array.h" #include "time_util.h" #include "vocabulary.h" +#include "data_array.h" using namespace std; using namespace chrono; @@ -100,7 +101,7 @@ HieroCachingRuleFactory::HieroCachingRuleFactory() {} HieroCachingRuleFactory::~HieroCachingRuleFactory() {} -Grammar HieroCachingRuleFactory::GetGrammar(const vector& word_ids) { +Grammar HieroCachingRuleFactory::GetGrammar(const vector& word_ids, const unordered_set blacklisted_sentence_ids, const shared_ptr source_data_array) { Clock::time_point start_time = Clock::now(); double total_extract_time = 0; double total_intersect_time = 0; @@ -192,7 +193,7 @@ Grammar HieroCachingRuleFactory::GetGrammar(const vector& word_ids) { Clock::time_point extract_start = Clock::now(); if (!state.starts_with_x) { // Extract rules for the sampled set of occurrences. - PhraseLocation sample = sampler->Sample(next_node->matchings); + PhraseLocation sample = sampler->Sample(next_node->matchings, blacklisted_sentence_ids, source_data_array); vector new_rules = rule_extractor->ExtractRules(next_phrase, sample); rules.insert(rules.end(), new_rules.begin(), new_rules.end()); diff --git a/extractor/rule_factory.h b/extractor/rule_factory.h index 52e8712a..c7332720 100644 --- a/extractor/rule_factory.h +++ b/extractor/rule_factory.h @@ -3,6 +3,7 @@ #include #include +#include #include "matchings_trie.h" @@ -71,7 +72,7 @@ class HieroCachingRuleFactory { // Constructs SCFG rules for a given sentence. // (See class description for more details.) - virtual Grammar GetGrammar(const vector& word_ids); + virtual Grammar GetGrammar(const vector& word_ids, const unordered_set blacklisted_sentence_ids, const shared_ptr source_data_array); protected: HieroCachingRuleFactory(); diff --git a/extractor/run_extractor.cc b/extractor/run_extractor.cc index 8a9ca89d..6eb55073 100644 --- a/extractor/run_extractor.cc +++ b/extractor/run_extractor.cc @@ -75,7 +75,9 @@ int main(int argc, char** argv) { ("max_samples", po::value()->default_value(300), "Maximum number of samples") ("tight_phrases", po::value()->default_value(true), - "False if phrases may be loose (better, but slower)"); + "False if phrases may be loose (better, but slower)") + ("leave_one_out", po::value()->zero_tokens(), + "do leave-one-out estimation of grammars (e.g. 
for extracting grammars for the training set"); po::variables_map vm; po::store(po::parse_command_line(argc, argv, desc), vm); @@ -96,6 +98,11 @@ int main(int argc, char** argv) { return 1; } + bool leave_one_out = false; + if (vm.count("leave_one_out")) { + leave_one_out = true; + } + int num_threads = vm["threads"].as(); cerr << "Grammar extraction will use " << num_threads << " threads." << endl; @@ -223,7 +230,9 @@ int main(int argc, char** argv) { } suffixes[i] = suffix; - Grammar grammar = extractor.GetGrammar(sentences[i]); + unordered_set blacklisted_sentence_ids; + if (leave_one_out) blacklisted_sentence_ids.insert(i); + Grammar grammar = extractor.GetGrammar(sentences[i], blacklisted_sentence_ids, source_data_array); ofstream output(GetGrammarFilePath(grammar_path, i).c_str()); output << grammar; } diff --git a/extractor/sample_alignment.txt b/extractor/sample_alignment.txt index 80b446a4..f0292b01 100644 --- a/extractor/sample_alignment.txt +++ b/extractor/sample_alignment.txt @@ -1,2 +1,5 @@ 0-0 1-1 2-2 1-0 2-1 +0-0 +0-0 1-1 +0-0 1-1 diff --git a/extractor/sample_bitext.txt b/extractor/sample_bitext.txt index 93d6b39d..2b7c8e40 100644 --- a/extractor/sample_bitext.txt +++ b/extractor/sample_bitext.txt @@ -1,2 +1,5 @@ +asdf ||| dontseeme +qqq asdf ||| zzz fdsa +asdf qqq ||| fdsa zzz ana are mere . ||| anna has apples . ana bea mult lapte . ||| anna drinks a lot of milk . diff --git a/extractor/sampler.cc b/extractor/sampler.cc index d81956b5..2f7738db 100644 --- a/extractor/sampler.cc +++ b/extractor/sampler.cc @@ -12,18 +12,43 @@ Sampler::Sampler() {} Sampler::~Sampler() {} -PhraseLocation Sampler::Sample(const PhraseLocation& location) const { +PhraseLocation Sampler::Sample(const PhraseLocation& location, unordered_set blacklisted_sentence_ids, const shared_ptr source_data_array) const { vector sample; int num_subpatterns; if (location.matchings == NULL) { // Sample suffix array range. 
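    // Strategy: walk [sa_low, sa_high] with stride ~(high - low) / max_samples;
    // whenever a probe lands in a blacklisted (leave-one-out) sentence, scan
    // outward from it (i-1, i+1, i-2, ...) for the nearest occurrence that is
    // neither blacklisted nor already sampled before giving up on this probe.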
num_subpatterns = 1; int low = location.sa_low, high = location.sa_high; - double step = max(1.0, (double) (high - low) / max_samples); - for (double i = low; i < high && sample.size() < max_samples; i += step) { - sample.push_back(suffix_array->GetSuffix(Round(i))); + double step = Round(max(1.0, (double) (high - low) / max_samples)); + int i = location.sa_low; + bool found = false; + while (sample.size() < max_samples && i <= location.sa_high) { + int x = suffix_array->GetSuffix(i); + int id = source_data_array->GetSentenceId(x); + if (find(blacklisted_sentence_ids.begin(), blacklisted_sentence_ids.end(), id) != blacklisted_sentence_ids.end()) { + int backoff_step = 1; + while (true) { + int j = i - backoff_step; + x = suffix_array->GetSuffix(j); + id = source_data_array->GetSentenceId(x); + if ((j >= location.sa_low) && (find(blacklisted_sentence_ids.begin(), blacklisted_sentence_ids.end(), id) == blacklisted_sentence_ids.end()) + && (find(sample.begin(), sample.end(), x) == sample.end())) { found = true; break; } + int k = i + backoff_step; + x = suffix_array->GetSuffix(k); + id = source_data_array->GetSentenceId(x); + if ((k <= location.sa_high) && (find(blacklisted_sentence_ids.begin(), blacklisted_sentence_ids.end(), id) == blacklisted_sentence_ids.end()) + && (find(sample.begin(), sample.end(), x) == sample.end())) { found = true; break; } + if (j <= location.sa_low && k >= location.sa_high) break; + backoff_step++; + } + } else { + found = true; + } + if (found && (find(sample.begin(), sample.end(), x) == sample.end())) sample.push_back(x); + i += step; + found = false; } - } else { + } else { // when do we get here? // Sample vector of occurrences. num_subpatterns = location.num_subpatterns; int num_matchings = location.matchings->size() / num_subpatterns; diff --git a/extractor/sampler.h b/extractor/sampler.h index be4aa1bb..30e747fd 100644 --- a/extractor/sampler.h +++ b/extractor/sampler.h @@ -2,6 +2,9 @@ #define _SAMPLER_H_ #include +#include + +#include "data_array.h" using namespace std; @@ -20,7 +23,7 @@ class Sampler { virtual ~Sampler(); // Samples uniformly at most max_samples phrase occurrences. - virtual PhraseLocation Sample(const PhraseLocation& location) const; + virtual PhraseLocation Sample(const PhraseLocation& location, const unordered_set blacklisted_sentence_ids, const shared_ptr source_data_array) const; protected: Sampler(); -- cgit v1.2.3 From 0087a3d427cbe0cfb20548a496124ce7d857da8f Mon Sep 17 00:00:00 2001 From: Patrick Simianer Date: Fri, 20 Sep 2013 20:10:19 +0200 Subject: example file --- extractor/sample_source.txt | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 extractor/sample_source.txt diff --git a/extractor/sample_source.txt b/extractor/sample_source.txt new file mode 100644 index 00000000..9b46dd6a --- /dev/null +++ b/extractor/sample_source.txt @@ -0,0 +1,5 @@ +asdf +qqq asdf +asdf qqq +ana are mere . +ana bea mult lapte . 
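The thread through this change: GrammarExtractor::GetGrammar and HieroCachingRuleFactory::GetGrammar now hand a set of blacklisted sentence ids down to Sampler::Sample, and run_extractor puts sentence i into that set under --leave_one_out, so grammars extracted for the training data itself never draw occurrences from the sentence being processed. A compact sketch of the filtering idea, with STL callables standing in for the SuffixArray/DataArray lookups (an illustrative simplification: the real code backs off to neighboring positions instead of skipping):

#include <algorithm>
#include <functional>
#include <unordered_set>
#include <vector>

// Uniformly sample up to max_samples positions from the suffix-array
// range [low, high), skipping blacklisted (held-out) sentences.
std::vector<int> sample_leave_one_out(
    int low, int high, unsigned max_samples,
    const std::unordered_set<int>& blacklist,
    std::function<int(int)> suffix_at,      // SA index -> corpus position
    std::function<int(int)> sentence_of) {  // corpus position -> sentence id
  std::vector<int> sample;
  const double step = std::max(1.0, double(high - low) / max_samples);
  for (double i = low; i < high && sample.size() < max_samples; i += step) {
    const int pos = suffix_at(static_cast<int>(i));
    if (blacklist.count(sentence_of(pos))) continue;
    sample.push_back(pos);
  }
  return sample;
}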
-- cgit v1.2.3 From a08465b90027cc6f1d17daae2992e67368eeedee Mon Sep 17 00:00:00 2001 From: Patrick Simianer Date: Tue, 24 Sep 2013 15:50:03 +0200 Subject: loo #2 --- extractor/sampler.cc | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/extractor/sampler.cc b/extractor/sampler.cc index 2f7738db..cb470962 100644 --- a/extractor/sampler.cc +++ b/extractor/sampler.cc @@ -20,35 +20,39 @@ PhraseLocation Sampler::Sample(const PhraseLocation& location, unordered_setGetSuffix(i); int id = source_data_array->GetSentenceId(x); if (find(blacklisted_sentence_ids.begin(), blacklisted_sentence_ids.end(), id) != blacklisted_sentence_ids.end()) { + found = false; int backoff_step = 1; while (true) { + if ((double)backoff_step >= step) break; int j = i - backoff_step; x = suffix_array->GetSuffix(j); id = source_data_array->GetSentenceId(x); - if ((j >= location.sa_low) && (find(blacklisted_sentence_ids.begin(), blacklisted_sentence_ids.end(), id) == blacklisted_sentence_ids.end()) - && (find(sample.begin(), sample.end(), x) == sample.end())) { found = true; break; } + if (j > last && find(blacklisted_sentence_ids.begin(), blacklisted_sentence_ids.end(), id) == blacklisted_sentence_ids.end()) { + found = true; last = i; break; + } int k = i + backoff_step; x = suffix_array->GetSuffix(k); id = source_data_array->GetSentenceId(x); - if ((k <= location.sa_high) && (find(blacklisted_sentence_ids.begin(), blacklisted_sentence_ids.end(), id) == blacklisted_sentence_ids.end()) - && (find(sample.begin(), sample.end(), x) == sample.end())) { found = true; break; } - if (j <= location.sa_low && k >= location.sa_high) break; + if (k < min(i+step, (double)high) && find(blacklisted_sentence_ids.begin(), blacklisted_sentence_ids.end(), id) == blacklisted_sentence_ids.end()) { + found = true; last = k; break; + } + if (j <= last && k >= high) break; backoff_step++; } } else { found = true; + last = i; } - if (found && (find(sample.begin(), sample.end(), x) == sample.end())) sample.push_back(x); + if (found) sample.push_back(x); i += step; - found = false; } - } else { // when do we get here? + } else { // Sample vector of occurrences. 
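    // This branch serves phrases whose occurrences were precomputed and stored
    // explicitly in location.matchings (multi-subpattern phrases built by
    // intersection), so whole matchings of num_subpatterns positions each are
    // sampled rather than raw suffix-array entries.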
num_subpatterns = location.num_subpatterns; int num_matchings = location.matchings->size() / num_subpatterns; -- cgit v1.2.3 From e11bbf4e4afb3e90710e45eb5cc7dff89bb559bc Mon Sep 17 00:00:00 2001 From: Patrick Simianer Date: Wed, 25 Sep 2013 19:27:44 +0200 Subject: fix --- training/dtrain/parallelize.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/training/dtrain/parallelize.rb b/training/dtrain/parallelize.rb index 66a61b3d..2fc66cab 100755 --- a/training/dtrain/parallelize.rb +++ b/training/dtrain/parallelize.rb @@ -21,7 +21,7 @@ opts = Trollop::options do opt :qsub, "use qsub", :type => :bool, :default => false opt :dtrain_binary, "path to dtrain binary", :type => :string opt :extra_qsub, "extra qsub args", :type => :string, :default => "" - opt :per_shard_decoder_configs, "give special decoder config per shard", :type => string + opt :per_shard_decoder_configs, "give special decoder config per shard", :type => :string, :short => :o end usage if not opts[:config]&&opts[:shards]&&opts[:input]&&opts[:references] -- cgit v1.2.3 From 5fc77937dde48dddde264261cb773b07fe7cd560 Mon Sep 17 00:00:00 2001 From: Patrick Simianer Date: Tue, 8 Oct 2013 13:57:45 +0200 Subject: dtrain: added pclr variants and new expected-output; fixed bug in soft syntax features --- decoder/ff_soft_syntax.cc | 2 +- training/dtrain/dtrain.cc | 31 +++--- training/dtrain/examples/standard/dtrain.ini | 6 +- training/dtrain/examples/standard/expected-output | 115 +++++++++++++--------- training/dtrain/parallelize.rb | 11 ++- 5 files changed, 101 insertions(+), 64 deletions(-) diff --git a/decoder/ff_soft_syntax.cc b/decoder/ff_soft_syntax.cc index 9981fa45..d84f2e6d 100644 --- a/decoder/ff_soft_syntax.cc +++ b/decoder/ff_soft_syntax.cc @@ -21,7 +21,7 @@ using namespace std; struct SoftSyntacticFeaturesImpl { SoftSyntacticFeaturesImpl(const string& param) { vector labels = SplitOnWhitespace(param); - for (unsigned int i = 0; i < labels.size(); i++) + //for (unsigned int i = 0; i < labels.size(); i++) //cerr << "Labels: " << labels.at(i) << endl; for (unsigned int i = 0; i < labels.size(); i++) { string label = labels.at(i); diff --git a/training/dtrain/dtrain.cc b/training/dtrain/dtrain.cc index 9d60a903..38a9b69a 100644 --- a/training/dtrain/dtrain.cc +++ b/training/dtrain/dtrain.cc @@ -40,7 +40,7 @@ dtrain_init(int argc, char** argv, po::variables_map* cfg) ("scale_bleu_diff", po::value()->zero_tokens(), "learning rate <- bleu diff of a misranked pair") ("loss_margin", po::value()->default_value(0.), "update if no error in pref pair but model scores this near") ("max_pairs", po::value()->default_value(std::numeric_limits::max()), "max. # of pairs per Sent.") - ("pclr", po::value()->zero_tokens(), "use a (simple) per-coordinate learning rate") + ("pclr", po::value()->default_value("no"), "use a (simple|adagrad) per-coordinate learning rate") ("noup", po::value()->zero_tokens(), "do not update weights"); po::options_description cl("Command Line Options"); cl.add_options() @@ -125,8 +125,7 @@ main(int argc, char** argv) if (loss_margin > 9998.) 
loss_margin = std::numeric_limits::max(); bool scale_bleu_diff = false; if (cfg.count("scale_bleu_diff")) scale_bleu_diff = true; - bool pclr = false; - if (cfg.count("pclr")) pclr = true; + const string pclr = cfg["pclr"].as(); bool average = false; if (select_weights == "avg") average = true; @@ -190,7 +189,6 @@ main(int argc, char** argv) weight_t gamma = cfg["gamma"].as(); // faster perceptron: consider only misranked pairs, see - // DO NOT ENABLE WITH SVM (gamma > 0) OR loss_margin! bool faster_perceptron = false; if (gamma==0 && loss_margin==0) faster_perceptron = true; @@ -251,8 +249,7 @@ main(int argc, char** argv) cerr << setw(25) << "l1 reg " << l1_reg << " '" << cfg["l1_reg"].as() << "'" << endl; if (rescale) cerr << setw(25) << "rescale " << rescale << endl; - if (pclr) - cerr << setw(25) << "pclr " << pclr << endl; + cerr << setw(25) << "pclr " << pclr << endl; cerr << setw(25) << "max pairs " << max_pairs << endl; cerr << setw(25) << "cdec cfg " << "'" << cfg["decoder_config"].as() << "'" << endl; cerr << setw(25) << "input " << "'" << input_fn << "'" << endl; @@ -392,22 +389,30 @@ main(int argc, char** argv) if (scale_bleu_diff) eta = it->first.score - it->second.score; if (rank_error || margin < loss_margin) { SparseVector diff_vec = it->first.f - it->second.f; - if (pclr) { + if (pclr != "no") { sum_up += diff_vec; } else { lambdas.plus_eq_v_times_s(diff_vec, eta); + if (gamma) lambdas.plus_eq_v_times_s(lambdas, -2*gamma*eta*(1./npairs)); // FIXME } - if (gamma) - lambdas.plus_eq_v_times_s(lambdas, -2*gamma*eta*(1./npairs)); } } // per-coordinate learning rate - if (pclr) { + if (pclr != "no") { SparseVector::iterator it = sum_up.begin(); - for (; it != lambdas.end(); ++it) { - learning_rates[it->first]++; - lambdas[it->first] += it->second / learning_rates[it->first]; //* max(0.00000001, eta/(eta+learning_rates[it->first])); + for (; it != sum_up.end(); ++it) { + if (pclr == "simple") { + lambdas[it->first] += it->second / max(1.0, learning_rates[it->first]); + learning_rates[it->first]++; + } else if (pclr == "adagrad") { + if (learning_rates[it->first] == 0) { + lambdas[it->first] += it->second * eta; + } else { + lambdas[it->first] += it->second * eta * learning_rates[it->first]; + } + learning_rates[it->first] += pow(it->second, 2.0); + } } } diff --git a/training/dtrain/examples/standard/dtrain.ini b/training/dtrain/examples/standard/dtrain.ini index c0912a62..e6d6382e 100644 --- a/training/dtrain/examples/standard/dtrain.ini +++ b/training/dtrain/examples/standard/dtrain.ini @@ -1,6 +1,6 @@ input=./nc-wmt11.de.gz refs=./nc-wmt11.en.gz -output=asdf # a weights file (add .gz for gzip compression) or STDOUT '-' +output=- # a weights file (add .gz for gzip compression) or STDOUT '-' select_weights=VOID # output average (over epochs) weight vector decoder_config=./cdec.ini # config for cdec # weights for these features will be printed on each iteration @@ -10,11 +10,11 @@ print_weights=Glue WordPenalty LanguageModel LanguageModel_OOV PhraseModel_0 Phr stop_after=10 # stop epoch after 10 inputs # interesting stuff -epochs=2 # run over input 2 times +epochs=3 # run over input 3 times k=100 # use 100best lists N=4 # optimize (approx) BLEU4 scorer=fixed_stupid_bleu # use 'stupid' BLEU+1 -learning_rate=1.0 # learning rate, don't care if gamma=0 (perceptron) +learning_rate=1.0 # learning rate, don't care if gamma=0 (perceptron) and loss_margin=0 (not margin perceptron) gamma=0 # use SVM reg sample_from=kbest # use kbest lists (as opposed to forest) filter=uniq # only unique 
entries in kbest (surface form) diff --git a/training/dtrain/examples/standard/expected-output b/training/dtrain/examples/standard/expected-output index 21f91244..a35bbe6f 100644 --- a/training/dtrain/examples/standard/expected-output +++ b/training/dtrain/examples/standard/expected-output @@ -4,13 +4,13 @@ Reading ./nc-wmt11.en.srilm.gz ----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100 **************************************************************************************************** Example feature: Shape_S00000_T00000 -Seeding random number sequence to 970626287 +Seeding random number sequence to 4049211323 dtrain Parameters: k 100 N 4 - T 2 + T 3 scorer 'fixed_stupid_bleu' sample from 'kbest' filter 'uniq' @@ -23,6 +23,7 @@ Parameters: pair threshold 0 select weights 'VOID' l1 reg 0 'none' + pclr no max pairs 4294967295 cdec cfg './cdec.ini' input './nc-wmt11.de.gz' @@ -30,62 +31,88 @@ Parameters: output '-' stop_after 10 (a dot represents 10 inputs) -Iteration #1 of 2. +Iteration #1 of 3. . 10 Stopping after 10 input sentences. WEIGHTS - Glue = -614 - WordPenalty = +1256.8 - LanguageModel = +5610.5 - LanguageModel_OOV = -1449 - PhraseModel_0 = -2107 - PhraseModel_1 = -4666.1 - PhraseModel_2 = -2713.5 - PhraseModel_3 = +4204.3 - PhraseModel_4 = -1435.8 - PhraseModel_5 = +916 - PhraseModel_6 = +190 - PassThrough = -2527 + Glue = -1100 + WordPenalty = -82.082 + LanguageModel = -3199.1 + LanguageModel_OOV = -192 + PhraseModel_0 = +3128.2 + PhraseModel_1 = -1610.2 + PhraseModel_2 = -4336.5 + PhraseModel_3 = +2910.3 + PhraseModel_4 = +2523.2 + PhraseModel_5 = +506 + PhraseModel_6 = +1467 + PassThrough = -387 --- - 1best avg score: 0.17874 (+0.17874) - 1best avg model score: 88399 (+88399) - avg # pairs: 798.2 (meaningless) - avg # rank err: 798.2 + 1best avg score: 0.16966 (+0.16966) + 1best avg model score: 2.9874e+05 (+2.9874e+05) + avg # pairs: 906.3 (meaningless) + avg # rank err: 906.3 avg # margin viol: 0 - non0 feature count: 887 + non0 feature count: 825 avg list sz: 91.3 - avg f count: 126.85 -(time 0.33 min, 2 s/S) + avg f count: 139.77 +(time 0.35 min, 2.1 s/S) -Iteration #2 of 2. +Iteration #2 of 3. . 10 WEIGHTS - Glue = -1025 - WordPenalty = +1751.5 - LanguageModel = +10059 - LanguageModel_OOV = -4490 - PhraseModel_0 = -2640.7 - PhraseModel_1 = -3757.4 - PhraseModel_2 = -1133.1 - PhraseModel_3 = +1837.3 - PhraseModel_4 = -3534.3 - PhraseModel_5 = +2308 - PhraseModel_6 = +1677 - PassThrough = -6222 + Glue = -1221 + WordPenalty = +836.89 + LanguageModel = +2332.3 + LanguageModel_OOV = -1451 + PhraseModel_0 = +1507.2 + PhraseModel_1 = -2728.4 + PhraseModel_2 = -4183.6 + PhraseModel_3 = +1816.3 + PhraseModel_4 = -2894.7 + PhraseModel_5 = +1403 + PhraseModel_6 = +35 + PassThrough = -1097 --- - 1best avg score: 0.30764 (+0.12891) - 1best avg model score: -2.5042e+05 (-3.3882e+05) - avg # pairs: 725.9 (meaningless) - avg # rank err: 725.9 + 1best avg score: 0.17399 (+0.004325) + 1best avg model score: 49369 (-2.4937e+05) + avg # pairs: 662.4 (meaningless) + avg # rank err: 662.4 avg # margin viol: 0 - non0 feature count: 1499 + non0 feature count: 1235 avg list sz: 91.3 - avg f count: 114.34 -(time 0.32 min, 1.9 s/S) + avg f count: 125.11 +(time 0.27 min, 1.6 s/S) + +Iteration #3 of 3. + . 
10 +WEIGHTS + Glue = -1574 + WordPenalty = -17.372 + LanguageModel = +6861.8 + LanguageModel_OOV = -3997 + PhraseModel_0 = -398.76 + PhraseModel_1 = -3419.6 + PhraseModel_2 = -3186.7 + PhraseModel_3 = +1050.8 + PhraseModel_4 = -2902.7 + PhraseModel_5 = -486 + PhraseModel_6 = -436 + PassThrough = -2985 + --- + 1best avg score: 0.30742 (+0.13343) + 1best avg model score: -1.5393e+05 (-2.0329e+05) + avg # pairs: 623.8 (meaningless) + avg # rank err: 623.8 + avg # margin viol: 0 + non0 feature count: 1770 + avg list sz: 91.3 + avg f count: 118.58 +(time 0.25 min, 1.5 s/S) Writing weights file to '-' ... done --- -Best iteration: 2 [SCORE 'fixed_stupid_bleu'=0.30764]. -This took 0.65 min. +Best iteration: 3 [SCORE 'fixed_stupid_bleu'=0.30742]. +This took 0.86667 min. diff --git a/training/dtrain/parallelize.rb b/training/dtrain/parallelize.rb index 2fc66cab..60ca9422 100755 --- a/training/dtrain/parallelize.rb +++ b/training/dtrain/parallelize.rb @@ -21,7 +21,8 @@ opts = Trollop::options do opt :qsub, "use qsub", :type => :bool, :default => false opt :dtrain_binary, "path to dtrain binary", :type => :string opt :extra_qsub, "extra qsub args", :type => :string, :default => "" - opt :per_shard_decoder_configs, "give special decoder config per shard", :type => :string, :short => :o + opt :per_shard_decoder_configs, "give special decoder config per shard", :type => :string, :short => '-o' + opt :first_input_weights, "input weights for first iter", :type => :string, :default => '', :short => '-w' end usage if not opts[:config]&&opts[:shards]&&opts[:input]&&opts[:references] @@ -54,6 +55,7 @@ input = opts[:input] refs = opts[:references] use_qsub = opts[:qsub] shards_at_once = opts[:processes_at_once] +first_input_weights = opts[:first_input_weights] `mkdir work` @@ -137,10 +139,13 @@ end else cdec_cfg = "" end + if first_input_weights!='' && epoch == 0 + input_weights = "--input_weights #{first_input_weights}" + end pids << Kernel.fork { - `#{qsub_str_start}#{dtrain_bin} -c #{ini} #{cdec_cfg}\ + `#{qsub_str_start}#{dtrain_bin} -c #{ini} #{cdec_cfg} #{input_weights}\ --input #{input_files[shard]}\ - --refs #{refs_files[shard]} #{input_weights}\ + --refs #{refs_files[shard]}\ --output work/weights.#{shard}.#{epoch}#{qsub_str_end} #{local_end}` } weights_files << "work/weights.#{shard}.#{epoch}" -- cgit v1.2.3 From 12577135f7504a3909111479c9053410bfed8354 Mon Sep 17 00:00:00 2001 From: Patrick Simianer Date: Sun, 3 Nov 2013 21:24:51 +0100 Subject: bitext input for dtrain --- training/dtrain/Makefile.am | 2 +- training/dtrain/dtrain.cc | 45 ++++++++++++++++++++------ training/dtrain/dtrain.h | 2 ++ training/dtrain/examples/standard/dtrain.ini | 5 +-- training/dtrain/examples/standard/nc-wmt11.gz | Bin 0 -> 113504 bytes 5 files changed, 41 insertions(+), 13 deletions(-) create mode 100644 training/dtrain/examples/standard/nc-wmt11.gz diff --git a/training/dtrain/Makefile.am b/training/dtrain/Makefile.am index 844c790d..ecb6c128 100644 --- a/training/dtrain/Makefile.am +++ b/training/dtrain/Makefile.am @@ -1,7 +1,7 @@ bin_PROGRAMS = dtrain dtrain_SOURCES = dtrain.cc score.cc dtrain.h kbestget.h ksampler.h pairsampling.h score.h -dtrain_LDADD = ../../decoder/libcdec.a ../../klm/search/libksearch.a ../../mteval/libmteval.a ../../utils/libutils.a ../../klm/lm/libklm.a ../../klm/util/libklm_util.a ../../klm/util/double-conversion/libklm_util_double.a +dtrain_LDADD = ../../decoder/libcdec.a ../../klm/search/libksearch.a ../../mteval/libmteval.a ../../utils/libutils.a ../../klm/lm/libklm.a 
../../klm/util/libklm_util.a ../../klm/util/double-conversion/libklm_util_double.a -lboost_regex AM_CPPFLAGS = -W -Wall -Wno-sign-compare -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval diff --git a/training/dtrain/dtrain.cc b/training/dtrain/dtrain.cc index 38a9b69a..a496f08a 100644 --- a/training/dtrain/dtrain.cc +++ b/training/dtrain/dtrain.cc @@ -12,8 +12,9 @@ dtrain_init(int argc, char** argv, po::variables_map* cfg) { po::options_description ini("Configuration File Options"); ini.add_options() - ("input", po::value()->default_value("-"), "input file (src)") + ("input", po::value(), "input file (src)") ("refs,r", po::value(), "references") + ("bitext,b", po::value(), "bitext: 'src ||| tgt'") ("output", po::value()->default_value("-"), "output weights file, '-' for STDOUT") ("input_weights", po::value(), "input weights file (e.g. from previous iteration)") ("decoder_config", po::value(), "configuration file for cdec") @@ -73,13 +74,17 @@ dtrain_init(int argc, char** argv, po::variables_map* cfg) cerr << "Wrong 'pair_sampling' param: '" << (*cfg)["pair_sampling"].as() << "'." << endl; return false; } - if(cfg->count("hi_lo") && (*cfg)["pair_sampling"].as() != "XYX") { + if (cfg->count("hi_lo") && (*cfg)["pair_sampling"].as() != "XYX") { cerr << "Warning: hi_lo only works with pair_sampling XYX." << endl; } - if((*cfg)["hi_lo"].as() > 0.5 || (*cfg)["hi_lo"].as() < 0.01) { + if ((*cfg)["hi_lo"].as() > 0.5 || (*cfg)["hi_lo"].as() < 0.01) { cerr << "hi_lo must lie in [0.01, 0.5]" << endl; return false; } + if ((cfg->count("input")>0 || cfg->count("refs")>0) && cfg->count("bitext")>0) { + cerr << "Provide 'input' and 'refs' or 'bitext', not both." << endl; + return false; + } if ((*cfg)["pair_threshold"].as() < 0) { cerr << "The threshold must be >= 0!" 
<< endl; return false; @@ -208,13 +213,24 @@ main(int argc, char** argv) // output string output_fn = cfg["output"].as(); // input - string input_fn = cfg["input"].as(); + bool read_bitext = false; + string input_fn; + if (cfg.count("bitext")) { + read_bitext = true; + input_fn = cfg["bitext"].as(); + } else { + input_fn = cfg["input"].as(); + } ReadFile input(input_fn); // buffer input for t > 0 vector src_str_buf; // source strings (decoder takes only strings) vector > ref_ids_buf; // references as WordID vecs - string refs_fn = cfg["refs"].as(); - ReadFile refs(refs_fn); + ReadFile refs; + string refs_fn; + if (!read_bitext) { + refs_fn = cfg["refs"].as(); + refs.Init(refs_fn); + } unsigned in_sz = std::numeric_limits::max(); // input index, input size vector > all_scores; @@ -253,7 +269,8 @@ main(int argc, char** argv) cerr << setw(25) << "max pairs " << max_pairs << endl; cerr << setw(25) << "cdec cfg " << "'" << cfg["decoder_config"].as() << "'" << endl; cerr << setw(25) << "input " << "'" << input_fn << "'" << endl; - cerr << setw(25) << "refs " << "'" << refs_fn << "'" << endl; + if (!read_bitext) + cerr << setw(25) << "refs " << "'" << refs_fn << "'" << endl; cerr << setw(25) << "output " << "'" << output_fn << "'" << endl; if (cfg.count("input_weights")) cerr << setw(25) << "weights in " << "'" << cfg["input_weights"].as() << "'" << endl; @@ -279,9 +296,16 @@ main(int argc, char** argv) { string in; + string ref; bool next = false, stop = false; // next iteration or premature stop if (t == 0) { if(!getline(*input, in)) next = true; + if(read_bitext) { + vector strs; + boost::algorithm::split_regex(strs, in, boost::regex(" \\|\\|\\| ")); + in = strs[0]; + ref = strs[1]; + } } else { if (ii == in_sz) next = true; // stop if we reach the end of our input } @@ -318,10 +342,11 @@ main(int argc, char** argv) // getting input vector ref_ids; // reference as vector if (t == 0) { - string r_; - getline(*refs, r_); + if (!read_bitext) { + getline(*refs, ref); + } vector ref_tok; - boost::split(ref_tok, r_, boost::is_any_of(" ")); + boost::split(ref_tok, ref, boost::is_any_of(" ")); register_and_convert(ref_tok, ref_ids); ref_ids_buf.push_back(ref_ids); src_str_buf.push_back(in); diff --git a/training/dtrain/dtrain.h b/training/dtrain/dtrain.h index 3981fb39..ccb5ad4d 100644 --- a/training/dtrain/dtrain.h +++ b/training/dtrain/dtrain.h @@ -9,6 +9,8 @@ #include #include +#include +#include #include #include "decoder.h" diff --git a/training/dtrain/examples/standard/dtrain.ini b/training/dtrain/examples/standard/dtrain.ini index e6d6382e..7dbb4ff0 100644 --- a/training/dtrain/examples/standard/dtrain.ini +++ b/training/dtrain/examples/standard/dtrain.ini @@ -1,5 +1,6 @@ -input=./nc-wmt11.de.gz -refs=./nc-wmt11.en.gz +#input=./nc-wmt11.de.gz +#refs=./nc-wmt11.en.gz +bitext=./nc-wmt11.gz output=- # a weights file (add .gz for gzip compression) or STDOUT '-' select_weights=VOID # output average (over epochs) weight vector decoder_config=./cdec.ini # config for cdec diff --git a/training/dtrain/examples/standard/nc-wmt11.gz b/training/dtrain/examples/standard/nc-wmt11.gz new file mode 100644 index 00000000..c39c5aef Binary files /dev/null and b/training/dtrain/examples/standard/nc-wmt11.gz differ -- cgit v1.2.3 From a9171fa0aa0ad6d7611fe079ecee464bc5f78231 Mon Sep 17 00:00:00 2001 From: Patrick Simianer Date: Sun, 3 Nov 2013 21:56:06 +0100 Subject: cleaned up parsematch features --- decoder/ff_parse_match.cc | 17 ++++++++++------- decoder/ff_parse_match.h | 1 + 2 files changed, 11 insertions(+), 
7 deletions(-) diff --git a/decoder/ff_parse_match.cc b/decoder/ff_parse_match.cc index ed556b91..94634b27 100644 --- a/decoder/ff_parse_match.cc +++ b/decoder/ff_parse_match.cc @@ -13,6 +13,10 @@ using namespace std; // implements the parse match features as described in Vilar et al. (2008) // source trees must be represented in Penn Treebank format, e.g. // (S (NP John) (VP (V left))) +// +// Annotate source sentences with ..." +// Note: You need to escape quite a lot of stuff in all your models! +// struct ParseMatchFeaturesImpl { ParseMatchFeaturesImpl(const string& param) { @@ -42,10 +46,8 @@ struct ParseMatchFeaturesImpl { void InitializeGrids(const string& tree, unsigned src_len) { assert(tree.size() > 0); - //fids_cat.clear(); fids_ef.clear(); src_tree.clear(); - //fids_cat.resize(src_len, src_len + 1); fids_ef.resize(src_len, src_len + 1); src_tree.resize(src_len, src_len + 1, TD::Convert("XX")); ParseTreeString(tree, src_len); @@ -112,7 +114,7 @@ struct ParseMatchFeaturesImpl { int fid_ef = FD::Convert("PM"); int min_dist; // minimal distance to next syntactic constituent of this rule's LHS int summed_min_dists; // minimal distances of LHS and NTs summed up - if (TD::Convert(lhs).compare("XX") != 0) + if (TD::Convert(lhs).compare("XX") != 0) min_dist= 0; // compute the distance to the next syntactical constituent else { @@ -131,7 +133,7 @@ struct ParseMatchFeaturesImpl { ok = 1; break; } - // check if removing k words from the rule span will + // check if removing k words from the rule span will // lead to a syntactical constituent else { //cerr << "Hilfe...!" << endl; @@ -144,7 +146,7 @@ struct ParseMatchFeaturesImpl { ok = 1; break; } - } + } } if (ok) break; } @@ -183,9 +185,9 @@ struct ParseMatchFeaturesImpl { return min_dist; } - Array2D src_tree; // src_tree(i,j) NT = type + Array2D src_tree; // src_tree(i,j) NT = type unsigned int src_sent_len; - mutable Array2D > fids_ef; // fires for fully lexicalized + mutable Array2D > fids_ef; // fires for fully lexicalized int scoring_method; }; @@ -216,3 +218,4 @@ void ParseMatchFeatures::TraversalFeaturesImpl(const SentenceMetadata& smeta, void ParseMatchFeatures::PrepareForInput(const SentenceMetadata& smeta) { impl->InitializeGrids(smeta.GetSGMLValue("src_tree"), smeta.GetSourceLength()); } + diff --git a/decoder/ff_parse_match.h b/decoder/ff_parse_match.h index fa73481a..7820b418 100644 --- a/decoder/ff_parse_match.h +++ b/decoder/ff_parse_match.h @@ -23,3 +23,4 @@ class ParseMatchFeatures : public FeatureFunction { }; #endif + -- cgit v1.2.3 From 5cba65baf55b821cbc22b0ee0e3ae8dc9946ca0f Mon Sep 17 00:00:00 2001 From: Patrick Simianer Date: Tue, 5 Nov 2013 16:29:03 +0100 Subject: cleaning up syntax features --- decoder/Makefile.am | 5 +- decoder/cdec_ff.cc | 25 +--- decoder/ff_parse_match.cc | 4 - decoder/ff_soft_syntax.cc | 34 +++--- decoder/ff_soft_syntax.h | 16 +-- decoder/ff_soft_syntax2.cc | 234 ------------------------------------ decoder/ff_soft_syntax2.h | 27 ----- decoder/ff_soft_syntax_mindist.cc | 235 ++++++++++++++++++++++++++++++++++++ decoder/ff_soft_syntax_mindist.h | 27 +++++ decoder/ff_source_syntax.cc | 37 ++++-- decoder/ff_source_syntax.h | 10 +- decoder/ff_source_syntax2.cc | 25 ++-- decoder/ff_source_syntax2.h | 5 +- decoder/ff_source_syntax2_p.cc | 166 -------------------------- decoder/ff_source_syntax2_p.h | 25 ---- decoder/ff_source_syntax_p.cc | 245 -------------------------------------- decoder/ff_source_syntax_p.h | 42 ------- 17 files changed, 342 insertions(+), 820 deletions(-) delete mode 100644 
decoder/ff_soft_syntax2.cc delete mode 100644 decoder/ff_soft_syntax2.h create mode 100644 decoder/ff_soft_syntax_mindist.cc create mode 100644 decoder/ff_soft_syntax_mindist.h delete mode 100644 decoder/ff_source_syntax2_p.cc delete mode 100644 decoder/ff_source_syntax2_p.h delete mode 100644 decoder/ff_source_syntax_p.cc delete mode 100644 decoder/ff_source_syntax_p.h diff --git a/decoder/Makefile.am b/decoder/Makefile.am index 914faaea..e7ebe840 100644 --- a/decoder/Makefile.am +++ b/decoder/Makefile.am @@ -62,7 +62,6 @@ libcdec_a_SOURCES = \ ff_ruleshape.h \ ff_sample_fsa.h \ ff_source_path.h \ - ff_source_syntax.h \ ff_spans.h \ ff_tagger.h \ ff_wordalign.h \ @@ -145,11 +144,9 @@ libcdec_a_SOURCES = \ ff_source_path.cc \ ff_parse_match.cc \ ff_soft_syntax.cc \ - ff_soft_syntax2.cc \ + ff_soft_syntax_mindist.cc \ ff_source_syntax.cc \ - ff_source_syntax_p.cc \ ff_source_syntax2.cc \ - ff_source_syntax2_p.cc \ ff_bleu.cc \ ff_factory.cc \ incremental.cc \ diff --git a/decoder/cdec_ff.cc b/decoder/cdec_ff.cc index e7b31f50..a36a0f5f 100644 --- a/decoder/cdec_ff.cc +++ b/decoder/cdec_ff.cc @@ -15,17 +15,11 @@ #include "ff_ruleshape.h" #include "ff_bleu.h" #include "ff_soft_syntax.h" -#include "ff_soft_syntax2.h" +#include "ff_soft_syntax_mindist.h" #include "ff_source_path.h" - - #include "ff_parse_match.h" #include "ff_source_syntax.h" -#include "ff_source_syntax_p.h" #include "ff_source_syntax2.h" -#include "ff_source_syntax2_p.h" - - #include "ff_register.h" #include "ff_charset.h" #include "ff_wordset.h" @@ -58,23 +52,12 @@ void register_feature_functions() { ff_registry.Register("NgramFeatures", new FFFactory()); ff_registry.Register("RuleContextFeatures", new FFFactory()); ff_registry.Register("RuleIdentityFeatures", new FFFactory()); - - ff_registry.Register("ParseMatchFeatures", new FFFactory); - - ff_registry.Register("SoftSyntacticFeatures", new FFFactory); - ff_registry.Register("SoftSyntacticFeatures2", new FFFactory); - + ff_registry.Register("SoftSyntaxFeatures", new FFFactory); + ff_registry.Register("SoftSyntaxFeaturesMindist", new FFFactory); ff_registry.Register("SourceSyntaxFeatures", new FFFactory); - ff_registry.Register("SourceSyntaxFeatures2", new FFFactory); - ff_registry.Register("SourceSpanSizeFeatures", new FFFactory); - - //ff_registry.Register("PSourceSyntaxFeatures", new FFFactory); - //ff_registry.Register("PSourceSpanSizeFeatures", new FFFactory); - //ff_registry.Register("PSourceSyntaxFeatures2", new FFFactory); - - + ff_registry.Register("SourceSyntaxFeatures2", new FFFactory); ff_registry.Register("CMR2008ReorderingFeatures", new FFFactory()); ff_registry.Register("RuleSourceBigramFeatures", new FFFactory()); ff_registry.Register("RuleTargetBigramFeatures", new FFFactory()); diff --git a/decoder/ff_parse_match.cc b/decoder/ff_parse_match.cc index 94634b27..7c79302b 100644 --- a/decoder/ff_parse_match.cc +++ b/decoder/ff_parse_match.cc @@ -13,10 +13,6 @@ using namespace std; // implements the parse match features as described in Vilar et al. (2008) // source trees must be represented in Penn Treebank format, e.g. // (S (NP John) (VP (V left))) -// -// Annotate source sentences with ..." -// Note: You need to escape quite a lot of stuff in all your models! 
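For reference, the removed note concerned input annotation: PrepareForInput reads the parse via smeta.GetSGMLValue("src_tree"), so each decoder input line carries its tree as a seg attribute, along these lines (attribute name from the code; a plausible example, escaping questions left aside):

<seg id="0" src_tree="(S (NP John) (VP (V left)))"> john left </seg>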
-// struct ParseMatchFeaturesImpl { ParseMatchFeaturesImpl(const string& param) { diff --git a/decoder/ff_soft_syntax.cc b/decoder/ff_soft_syntax.cc index d84f2e6d..a3d26135 100644 --- a/decoder/ff_soft_syntax.cc +++ b/decoder/ff_soft_syntax.cc @@ -13,16 +13,15 @@ using namespace std; -// Implements the soft syntactic features described in +// Implements the soft syntactic features described in // Marton and Resnik (2008): "Soft Syntacitc Constraints for Hierarchical Phrase-Based Translation". // Source trees must be represented in Penn Treebank format, // e.g. (S (NP John) (VP (V left))). -struct SoftSyntacticFeaturesImpl { - SoftSyntacticFeaturesImpl(const string& param) { +struct SoftSyntaxFeaturesImpl { + SoftSyntaxFeaturesImpl(const string& param) { vector labels = SplitOnWhitespace(param); - //for (unsigned int i = 0; i < labels.size(); i++) - //cerr << "Labels: " << labels.at(i) << endl; + //for (unsigned int i = 0; i < labels.size(); i++) { cerr << "Labels: " << labels.at(i) << endl; } for (unsigned int i = 0; i < labels.size(); i++) { string label = labels.at(i); pair feat_label; @@ -34,10 +33,8 @@ struct SoftSyntacticFeaturesImpl { void InitializeGrids(const string& tree, unsigned src_len) { assert(tree.size() > 0); - //fids_cat.clear(); fids_ef.clear(); src_tree.clear(); - //fids_cat.resize(src_len, src_len + 1); fids_ef.resize(src_len, src_len + 1); src_tree.resize(src_len, src_len + 1, TD::Convert("XX")); ParseTreeString(tree, src_len); @@ -99,7 +96,7 @@ struct SoftSyntacticFeaturesImpl { const WordID lhs = src_tree(i,j); string lhs_str = TD::Convert(lhs); //cerr << "LHS: " << lhs_str << " from " << i << " to " << j << endl; - //cerr << "RULE :"<< rule << endl; + //cerr << "RULE :"<< rule << endl; int& fid_ef = fids_ef(i,j)[&rule]; for (unsigned int i = 0; i < feat_labels.size(); i++) { ostringstream os; @@ -126,7 +123,7 @@ struct SoftSyntacticFeaturesImpl { fid_ef = FD::Convert(os.str()); if (lhs_str.compare(label) == 0) { if (fid_ef > 0) { - //cerr << "Feature: " << os.str() << endl; + //cerr << "Feature: " << os.str() << endl; feats->set_value(fid_ef, 1.0); } } @@ -147,8 +144,8 @@ struct SoftSyntacticFeaturesImpl { } } break; - case '-': - //cerr << "-" << endl; + case '-': + //cerr << "-" << endl; if (lhs_str.compare(label) != 0) { os << "SYN:" << label << "_cross"; fid_ef = FD::Convert(os.str()); @@ -167,22 +164,22 @@ struct SoftSyntacticFeaturesImpl { return lhs; } - Array2D src_tree; // src_tree(i,j) NT = type - mutable Array2D > fids_ef; // fires for fully lexicalized + Array2D src_tree; // src_tree(i,j) NT = type + mutable Array2D > fids_ef; // fires for fully lexicalized vector > feat_labels; }; -SoftSyntacticFeatures::SoftSyntacticFeatures(const string& param) : +SoftSyntaxFeatures::SoftSyntaxFeatures(const string& param) : FeatureFunction(sizeof(WordID)) { - impl = new SoftSyntacticFeaturesImpl(param); + impl = new SoftSyntaxFeaturesImpl(param); } -SoftSyntacticFeatures::~SoftSyntacticFeatures() { +SoftSyntaxFeatures::~SoftSyntaxFeatures() { delete impl; impl = NULL; } -void SoftSyntacticFeatures::TraversalFeaturesImpl(const SentenceMetadata& smeta, +void SoftSyntaxFeatures::TraversalFeaturesImpl(const SentenceMetadata& smeta, const Hypergraph::Edge& edge, const vector& ant_contexts, SparseVector* features, @@ -196,6 +193,7 @@ void SoftSyntacticFeatures::TraversalFeaturesImpl(const SentenceMetadata& smeta, impl->FireFeatures(*edge.rule_, edge.i_, edge.j_, ants, features); } -void SoftSyntacticFeatures::PrepareForInput(const SentenceMetadata& smeta) { +void 
SoftSyntaxFeatures::PrepareForInput(const SentenceMetadata& smeta) { impl->InitializeGrids(smeta.GetSGMLValue("src_tree"), smeta.GetSourceLength()); } + diff --git a/decoder/ff_soft_syntax.h b/decoder/ff_soft_syntax.h index 79352f49..e71825d5 100644 --- a/decoder/ff_soft_syntax.h +++ b/decoder/ff_soft_syntax.h @@ -1,15 +1,15 @@ -#ifndef _FF_SOFTSYNTAX_H_ -#define _FF_SOFTSYNTAX_H_ +#ifndef _FF_SOFT_SYNTAX_H_ +#define _FF_SOFT_SYNTAX_H_ #include "ff.h" #include "hg.h" -struct SoftSyntacticFeaturesImpl; +struct SoftSyntaxFeaturesImpl; -class SoftSyntacticFeatures : public FeatureFunction { +class SoftSyntaxFeatures : public FeatureFunction { public: - SoftSyntacticFeatures(const std::string& param); - ~SoftSyntacticFeatures(); + SoftSyntaxFeatures(const std::string& param); + ~SoftSyntaxFeatures(); protected: virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta, const Hypergraph::Edge& edge, @@ -19,9 +19,9 @@ class SoftSyntacticFeatures : public FeatureFunction { void* context) const; virtual void PrepareForInput(const SentenceMetadata& smeta); private: - SoftSyntacticFeaturesImpl* impl; + SoftSyntaxFeaturesImpl* impl; }; - #endif + diff --git a/decoder/ff_soft_syntax2.cc b/decoder/ff_soft_syntax2.cc deleted file mode 100644 index 121bc39b..00000000 --- a/decoder/ff_soft_syntax2.cc +++ /dev/null @@ -1,234 +0,0 @@ -#include "ff_soft_syntax2.h" - -#include -#include -#include -#include -#include - -#include "sentence_metadata.h" -#include "stringlib.h" -#include "array2d.h" -#include "filelib.h" - -using namespace std; - -// Implements the soft syntactic features described in -// Marton and Resnik (2008): "Soft Syntacitc Constraints for Hierarchical Phrase-Based Translation". -// Source trees must be represented in Penn Treebank format, -// e.g. (S (NP John) (VP (V left))). 
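Reading the FireFeatures cases in ff_soft_syntax.cc above: the parameter string carries one mode per label in its trailing character. 'NP+' fires SYN:NP_conform only when a rule span exactly matches an NP constituent, 'NP-' fires SYN:NP_cross when it does not, the '_' mode fires a single signed SYN:<label> feature (+1 on a match, -1 otherwise), and '2' covers both the conform and cross directions under one switch; the mindist variant below applies the same modes to the nearest constituent rather than requiring an exact span match. A plausible cdec configuration line, with illustrative label choices:

feature_function=SoftSyntaxFeatures NP+ VP- S_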
- -struct SoftSyntacticFeatures2Impl { - SoftSyntacticFeatures2Impl(const string& param) { - vector labels = SplitOnWhitespace(param); - //for (unsigned int i = 0; i < labels.size(); i++) - //cerr << "Labels: " << labels.at(i) << endl; - for (unsigned int i = 0; i < labels.size(); i++) { - string label = labels.at(i); - pair feat_label; - feat_label.first = label.substr(0, label.size() - 1); - feat_label.second = label.at(label.size() - 1); - feat_labels.push_back(feat_label); - } - } - - void InitializeGrids(const string& tree, unsigned src_len) { - assert(tree.size() > 0); - //fids_cat.clear(); - fids_ef.clear(); - src_tree.clear(); - //fids_cat.resize(src_len, src_len + 1); - fids_ef.resize(src_len, src_len + 1); - src_tree.resize(src_len, src_len + 1, TD::Convert("XX")); - ParseTreeString(tree, src_len); - } - - void ParseTreeString(const string& tree, unsigned src_len) { - stack > stk; // first = i, second = category - pair cur_cat; cur_cat.first = -1; - unsigned i = 0; - unsigned p = 0; - //cerr << "String " << tree << endl; - while(p < tree.size()) { - const char cur = tree[p]; - if (cur == '(') { - stk.push(cur_cat); - ++p; - unsigned k = p + 1; - while (k < tree.size() && tree[k] != ' ') { ++k; } - cur_cat.first = i; - cur_cat.second = TD::Convert(tree.substr(p, k - p)); - //cerr << "NT: '" << tree.substr(p, k-p) << "' (i=" << i << ")\n"; - p = k + 1; - } else if (cur == ')') { - unsigned k = p; - while (k < tree.size() && tree[k] == ')') { ++k; } - const unsigned num_closes = k - p; - for (unsigned ci = 0; ci < num_closes; ++ci) { - // cur_cat.second spans from cur_cat.first to i - //cerr << TD::Convert(cur_cat.second) << " from " << cur_cat.first << " to " << i << endl; - // NOTE: unary rule chains end up being labeled with the top-most category - src_tree(cur_cat.first, i) = cur_cat.second; - cur_cat = stk.top(); - stk.pop(); - } - p = k; - while (p < tree.size() && (tree[p] == ' ' || tree[p] == '\t')) { ++p; } - } else if (cur == ' ' || cur == '\t') { - cerr << "Unexpected whitespace in: " << tree << endl; - abort(); - } else { // terminal symbol - unsigned k = p + 1; - do { - while (k < tree.size() && tree[k] != ')' && tree[k] != ' ') { ++k; } - // cerr << "TERM: '" << tree.substr(p, k-p) << "' (i=" << i << ")\n"; - ++i; - assert(i <= src_len); - while (k < tree.size() && tree[k] == ' ') { ++k; } - p = k; - } while (p < tree.size() && tree[p] != ')'); - } - } - //cerr << "i=" << i << " src_len=" << src_len << endl; - assert(i == src_len); // make sure tree specified in src_tree is - // the same length as the source sentence - } - - WordID FireFeatures(const TRule& rule, const int i, const int j, const WordID* ants, SparseVector* feats) { - //cerr << "fire features: " << rule.AsString() << " for " << i << "," << j << endl; - const WordID lhs = src_tree(i,j); - string lhs_str = TD::Convert(lhs); - //cerr << "LHS: " << lhs_str << " from " << i << " to " << j << endl; - //cerr << "RULE :"<< rule << endl; - int& fid_ef = fids_ef(i,j)[&rule]; - string lhs_to_str = TD::Convert(lhs); - int min_dist; - string min_dist_label; - if (lhs_to_str.compare("XX") != 0) { - min_dist = 0; - min_dist_label = lhs_to_str; - } - else { - int ok = 0; - for (unsigned int k = 1; k < (j - i); k++) { - min_dist = k; - for (unsigned int l = 0; l <= k; l++) { - int l_add = i-l; - int r_add = j+(k-l); - if ((l_add < src_tree.width() && r_add < src_tree.height()) && (TD::Convert(src_tree(l_add, r_add)).compare("XX") != 0)) { - ok = 1; - min_dist_label = (TD::Convert(src_tree(l_add, r_add))); - break; - } - 
else { - int l_rem= i+l; - int r_rem = j-(k-l); - if ((l_rem < src_tree.width() && r_rem < src_tree.height()) && TD::Convert(src_tree(l_rem, r_rem)).compare("XX") != 0) { - ok = 1; - min_dist_label = (TD::Convert(src_tree(l_rem, r_rem))); - break; - } - } - } - if (ok) break; - } - } - //cerr << "SPAN: " << i << " " << j << endl; - //cerr << "MINDIST: " << min_dist << endl; - //cerr << "MINDISTLABEL: " << min_dist_label << endl; - for (unsigned int i = 0; i < feat_labels.size(); i++) { - ostringstream os; - string label = feat_labels.at(i).first; - //cerr << "This Label: " << label << endl; - char feat_type = (char) feat_labels.at(i).second.c_str()[0]; - //cerr << "feat_type: " << feat_type << endl; - switch(feat_type) { - case '2': - if (min_dist_label.compare(label) == 0) { - if (min_dist == 0) { - os << "SYN:" << label << "_conform"; - } - else { - os << "SYN:" << label << "_cross"; - } - fid_ef = FD::Convert(os.str()); - //cerr << "Feature :" << os.str() << endl; - feats->set_value(fid_ef, 1.0); - } - break; - case '_': - os << "SYN:" << label; - fid_ef = FD::Convert(os.str()); - if (min_dist_label.compare(label) == 0) { - //cerr << "Feature: " << os.str() << endl; - if (min_dist == 0) { - feats->set_value(fid_ef, 1.0); - } - else { - //cerr << "Feature: " << os.str() << endl; - feats->set_value(fid_ef, -1.0); - } - } - break; - case '+': - if (min_dist_label.compare(label) == 0) { - os << "SYN:" << label << "_conform"; - fid_ef = FD::Convert(os.str()); - if (min_dist == 0) { - //cerr << "Feature: " << os.str() << endl; - feats->set_value(fid_ef, 1.0); - } - } - break; - case '-': - //cerr << "-" << endl; - if (min_dist_label.compare(label) != 0) { - os << "SYN:" << label << "_cross"; - fid_ef = FD::Convert(os.str()); - if (min_dist > 0) { - //cerr << "Feature :" << os.str() << endl; - feats->set_value(fid_ef, 1.0); - } - } - break; - os.clear(); - os.str(""); - } - //cerr << "FEATURE: " << os.str() << endl; - //cerr << endl; - } - return lhs; - } - - Array2D src_tree; // src_tree(i,j) NT = type - mutable Array2D > fids_ef; // fires for fully lexicalized - vector > feat_labels; -}; - -SoftSyntacticFeatures2::SoftSyntacticFeatures2(const string& param) : - FeatureFunction(sizeof(WordID)) { - impl = new SoftSyntacticFeatures2Impl(param); -} - -SoftSyntacticFeatures2::~SoftSyntacticFeatures2() { - delete impl; - impl = NULL; -} - -void SoftSyntacticFeatures2::TraversalFeaturesImpl(const SentenceMetadata& smeta, - const Hypergraph::Edge& edge, - const vector& ant_contexts, - SparseVector* features, - SparseVector* estimated_features, - void* context) const { - WordID ants[8]; - for (unsigned i = 0; i < ant_contexts.size(); ++i) - ants[i] = *static_cast(ant_contexts[i]); - - *static_cast(context) = - impl->FireFeatures(*edge.rule_, edge.i_, edge.j_, ants, features); -} - -void SoftSyntacticFeatures2::PrepareForInput(const SentenceMetadata& smeta) { - impl->InitializeGrids(smeta.GetSGMLValue("src_tree"), smeta.GetSourceLength()); -} diff --git a/decoder/ff_soft_syntax2.h b/decoder/ff_soft_syntax2.h deleted file mode 100644 index 4de91d86..00000000 --- a/decoder/ff_soft_syntax2.h +++ /dev/null @@ -1,27 +0,0 @@ -#ifndef _FF_SOFTSYNTAX2_H_ -#define _FF_SOFTSYNTAX2_H_ - -#include "ff.h" -#include "hg.h" - -struct SoftSyntacticFeatures2Impl; - -class SoftSyntacticFeatures2 : public FeatureFunction { - public: - SoftSyntacticFeatures2(const std::string& param); - ~SoftSyntacticFeatures2(); - protected: - virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta, - const Hypergraph::Edge& 
edge,
-                                     const std::vector<const void*>& ant_contexts,
-                                     SparseVector<double>* features,
-                                     SparseVector<double>* estimated_features,
-                                     void* context) const;
-  virtual void PrepareForInput(const SentenceMetadata& smeta);
- private:
-  SoftSyntacticFeatures2Impl* impl;
-};
-
-
-
-#endif
diff --git a/decoder/ff_soft_syntax_mindist.cc b/decoder/ff_soft_syntax_mindist.cc
new file mode 100644
index 00000000..3f531986
--- /dev/null
+++ b/decoder/ff_soft_syntax_mindist.cc
@@ -0,0 +1,235 @@
+#include "ff_soft_syntax_mindist.h"
+
+#include
+#include
+#include
+#include
+#include
+
+#include "sentence_metadata.h"
+#include "stringlib.h"
+#include "array2d.h"
+#include "filelib.h"
+
+using namespace std;
+
+// Implements the soft syntactic features described in
+// Marton and Resnik (2008): "Soft Syntactic Constraints for Hierarchical Phrase-Based Translation".
+// Source trees must be represented in Penn Treebank format,
+// e.g. (S (NP John) (VP (V left))).
+//
+// This variant accepts fuzzy matches, choosing the constituent with
+// minimum distance.
+
+struct SoftSyntaxFeaturesMindistImpl {
+  SoftSyntaxFeaturesMindistImpl(const string& param) {
+    vector<string> labels = SplitOnWhitespace(param);
+    //for (unsigned int i = 0; i < labels.size(); i++) { cerr << "Labels: " << labels.at(i) << endl; }
+    for (unsigned int i = 0; i < labels.size(); i++) {
+      string label = labels.at(i);
+      pair<string,string> feat_label;
+      feat_label.first = label.substr(0, label.size() - 1);
+      feat_label.second = label.at(label.size() - 1);
+      feat_labels.push_back(feat_label);
+    }
+  }
+
+  void InitializeGrids(const string& tree, unsigned src_len) {
+    assert(tree.size() > 0);
+    fids_ef.clear();
+    src_tree.clear();
+    fids_ef.resize(src_len, src_len + 1);
+    src_tree.resize(src_len, src_len + 1, TD::Convert("XX"));
+    ParseTreeString(tree, src_len);
+  }
+
+  void ParseTreeString(const string& tree, unsigned src_len) {
+    stack<pair<int,WordID> > stk;  // first = i, second = category
+    pair<int,WordID> cur_cat; cur_cat.first = -1;
+    unsigned i = 0;
+    unsigned p = 0;
+    //cerr << "String " << tree << endl;
+    while(p < tree.size()) {
+      const char cur = tree[p];
+      if (cur == '(') {
+        stk.push(cur_cat);
+        ++p;
+        unsigned k = p + 1;
+        while (k < tree.size() && tree[k] != ' ') { ++k; }
+        cur_cat.first = i;
+        cur_cat.second = TD::Convert(tree.substr(p, k - p));
+        //cerr << "NT: '" << tree.substr(p, k-p) << "' (i=" << i << ")\n";
+        p = k + 1;
+      } else if (cur == ')') {
+        unsigned k = p;
+        while (k < tree.size() && tree[k] == ')') { ++k; }
+        const unsigned num_closes = k - p;
+        for (unsigned ci = 0; ci < num_closes; ++ci) {
+          // cur_cat.second spans from cur_cat.first to i
+          //cerr << TD::Convert(cur_cat.second) << " from " << cur_cat.first << " to " << i << endl;
+          // NOTE: unary rule chains end up being labeled with the top-most category
+          src_tree(cur_cat.first, i) = cur_cat.second;
+          cur_cat = stk.top();
+          stk.pop();
+        }
+        p = k;
+        while (p < tree.size() && (tree[p] == ' ' || tree[p] == '\t')) { ++p; }
+      } else if (cur == ' ' || cur == '\t') {
+        cerr << "Unexpected whitespace in: " << tree << endl;
+        abort();
+      } else { // terminal symbol
+        unsigned k = p + 1;
+        do {
+          while (k < tree.size() && tree[k] != ')' && tree[k] != ' ') { ++k; }
+          // cerr << "TERM: '" << tree.substr(p, k-p) << "' (i=" << i << ")\n";
+          ++i;
+          assert(i <= src_len);
+          while (k < tree.size() && tree[k] == ' ') { ++k; }
+          p = k;
+        } while (p < tree.size() && tree[p] != ')');
+      }
+    }
+    //cerr << "i=" << i << " src_len=" << src_len << endl;
+    assert(i == src_len); // make sure tree specified in src_tree is
+                          // the same length as the source sentence
+ 
} + + WordID FireFeatures(const TRule& rule, const int i, const int j, const WordID* ants, SparseVector* feats) { + //cerr << "fire features: " << rule.AsString() << " for " << i << "," << j << endl; + const WordID lhs = src_tree(i,j); + string lhs_str = TD::Convert(lhs); + //cerr << "LHS: " << lhs_str << " from " << i << " to " << j << endl; + //cerr << "RULE :"<< rule << endl; + int& fid_ef = fids_ef(i,j)[&rule]; + string lhs_to_str = TD::Convert(lhs); + int min_dist; + string min_dist_label; + if (lhs_to_str.compare("XX") != 0) { + min_dist = 0; + min_dist_label = lhs_to_str; + } + else { + int ok = 0; + for (unsigned int k = 1; k < (j - i); k++) { + min_dist = k; + for (unsigned int l = 0; l <= k; l++) { + int l_add = i-l; + int r_add = j+(k-l); + if ((l_add < src_tree.width() && r_add < src_tree.height()) && (TD::Convert(src_tree(l_add, r_add)).compare("XX") != 0)) { + ok = 1; + min_dist_label = (TD::Convert(src_tree(l_add, r_add))); + break; + } + else { + int l_rem= i+l; + int r_rem = j-(k-l); + if ((l_rem < src_tree.width() && r_rem < src_tree.height()) && TD::Convert(src_tree(l_rem, r_rem)).compare("XX") != 0) { + ok = 1; + min_dist_label = (TD::Convert(src_tree(l_rem, r_rem))); + break; + } + } + } + if (ok) break; + } + } + //cerr << "SPAN: " << i << " " << j << endl; + //cerr << "MINDIST: " << min_dist << endl; + //cerr << "MINDISTLABEL: " << min_dist_label << endl; + for (unsigned int i = 0; i < feat_labels.size(); i++) { + ostringstream os; + string label = feat_labels.at(i).first; + //cerr << "This Label: " << label << endl; + char feat_type = (char) feat_labels.at(i).second.c_str()[0]; + //cerr << "feat_type: " << feat_type << endl; + switch(feat_type) { + case '2': + if (min_dist_label.compare(label) == 0) { + if (min_dist == 0) { + os << "SYN:" << label << "_conform"; + } + else { + os << "SYN:" << label << "_cross"; + } + fid_ef = FD::Convert(os.str()); + //cerr << "Feature :" << os.str() << endl; + feats->set_value(fid_ef, 1.0); + } + break; + case '_': + os << "SYN:" << label; + fid_ef = FD::Convert(os.str()); + if (min_dist_label.compare(label) == 0) { + //cerr << "Feature: " << os.str() << endl; + if (min_dist == 0) { + feats->set_value(fid_ef, 1.0); + } + else { + //cerr << "Feature: " << os.str() << endl; + feats->set_value(fid_ef, -1.0); + } + } + break; + case '+': + if (min_dist_label.compare(label) == 0) { + os << "SYN:" << label << "_conform"; + fid_ef = FD::Convert(os.str()); + if (min_dist == 0) { + //cerr << "Feature: " << os.str() << endl; + feats->set_value(fid_ef, 1.0); + } + } + break; + case '-': + //cerr << "-" << endl; + if (min_dist_label.compare(label) != 0) { + os << "SYN:" << label << "_cross"; + fid_ef = FD::Convert(os.str()); + if (min_dist > 0) { + //cerr << "Feature :" << os.str() << endl; + feats->set_value(fid_ef, 1.0); + } + } + break; + os.clear(); + os.str(""); + } + //cerr << "FEATURE: " << os.str() << endl; + //cerr << endl; + } + return lhs; + } + + Array2D src_tree; // src_tree(i,j) NT = type + mutable Array2D > fids_ef; // fires for fully lexicalized + vector > feat_labels; +}; + +SoftSyntaxFeaturesMindist::SoftSyntaxFeaturesMindist(const string& param) : + FeatureFunction(sizeof(WordID)) { + impl = new SoftSyntaxFeaturesMindistImpl(param); +} + +SoftSyntaxFeaturesMindist::~SoftSyntaxFeaturesMindist() { + delete impl; + impl = NULL; +} + +void SoftSyntaxFeaturesMindist::TraversalFeaturesImpl(const SentenceMetadata& smeta, + const Hypergraph::Edge& edge, + const vector& ant_contexts, + SparseVector* features, + SparseVector* 
estimated_features, + void* context) const { + WordID ants[8]; + for (unsigned i = 0; i < ant_contexts.size(); ++i) + ants[i] = *static_cast(ant_contexts[i]); + + *static_cast(context) = + impl->FireFeatures(*edge.rule_, edge.i_, edge.j_, ants, features); +} + +void SoftSyntaxFeaturesMindist::PrepareForInput(const SentenceMetadata& smeta) { + impl->InitializeGrids(smeta.GetSGMLValue("src_tree"), smeta.GetSourceLength()); +} + diff --git a/decoder/ff_soft_syntax_mindist.h b/decoder/ff_soft_syntax_mindist.h new file mode 100644 index 00000000..bf938b38 --- /dev/null +++ b/decoder/ff_soft_syntax_mindist.h @@ -0,0 +1,27 @@ +#ifndef _FF_SOFT_SYNTAX_MINDIST_H_ +#define _FF_SOFT_SYNTAX_MINDIST_H_ + +#include "ff.h" +#include "hg.h" + +struct SoftSyntaxFeaturesMindistImpl; + +class SoftSyntaxFeaturesMindist : public FeatureFunction { + public: + SoftSyntaxFeaturesMindist(const std::string& param); + ~SoftSyntaxFeaturesMindist(); + protected: + virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta, + const Hypergraph::Edge& edge, + const std::vector& ant_contexts, + SparseVector* features, + SparseVector* estimated_features, + void* context) const; + virtual void PrepareForInput(const SentenceMetadata& smeta); + private: + SoftSyntaxFeaturesMindistImpl* impl; +}; + + +#endif + diff --git a/decoder/ff_source_syntax.cc b/decoder/ff_source_syntax.cc index a1997695..34e7ab69 100644 --- a/decoder/ff_source_syntax.cc +++ b/decoder/ff_source_syntax.cc @@ -2,8 +2,8 @@ #include #include +#include -#include "hg.h" #include "sentence_metadata.h" #include "array2d.h" #include "filelib.h" @@ -24,6 +24,17 @@ inline int SpanSizeTransform(unsigned span_size) { struct SourceSyntaxFeaturesImpl { SourceSyntaxFeaturesImpl() {} + SourceSyntaxFeaturesImpl(const string& param) { + if (!(param.compare("") == 0)) { + string triggered_features_fn = param; + ReadFile triggered_features(triggered_features_fn); + string in; + while(getline(*triggered_features, in)) { + feature_filter.insert(FD::Convert(in)); + } + } + } + void InitializeGrids(const string& tree, unsigned src_len) { assert(tree.size() > 0); //fids_cat.clear(); @@ -118,21 +129,28 @@ struct SourceSyntaxFeaturesImpl { } fid_ef = FD::Convert(os.str()); } - //if (fid_cat > 0) - // feats->set_value(fid_cat, 1.0); - if (fid_ef > 0) - feats->set_value(fid_ef, 1.0); + if (fid_ef > 0) { + if (feature_filter.size()>0) { + if (feature_filter.find(fid_ef) != feature_filter.end()) { + feats->set_value(fid_ef, 1.0); + } + } else { + feats->set_value(fid_ef, 1.0); + } + } + cerr << FD::Convert(fid_ef) << endl; return lhs; } - Array2D src_tree; // src_tree(i,j) NT = type - // mutable Array2D fids_cat; // this tends to overfit baddly - mutable Array2D > fids_ef; // fires for fully lexicalized + Array2D src_tree; // src_tree(i,j) NT = type + // mutable Array2D fids_cat; // this tends to overfit baddly + mutable Array2D > fids_ef; // fires for fully lexicalized + tr1::unordered_set feature_filter; }; SourceSyntaxFeatures::SourceSyntaxFeatures(const string& param) : FeatureFunction(sizeof(WordID)) { - impl = new SourceSyntaxFeaturesImpl; + impl = new SourceSyntaxFeaturesImpl(param); } SourceSyntaxFeatures::~SourceSyntaxFeatures() { @@ -230,4 +248,3 @@ void SourceSpanSizeFeatures::PrepareForInput(const SentenceMetadata& smeta) { impl->InitializeGrids(smeta.GetSourceLength()); } - diff --git a/decoder/ff_source_syntax.h b/decoder/ff_source_syntax.h index a8c7150a..bdd638c1 100644 --- a/decoder/ff_source_syntax.h +++ b/decoder/ff_source_syntax.h @@ -1,7 +1,8 @@ -#ifndef 
_FF_SOURCE_TOOLS_H_ -#define _FF_SOURCE_TOOLS_H_ +#ifndef _FF_SOURCE_SYNTAX_H_ +#define _FF_SOURCE_SYNTAX_H_ #include "ff.h" +#include "hg.h" struct SourceSyntaxFeaturesImpl; @@ -11,7 +12,7 @@ class SourceSyntaxFeatures : public FeatureFunction { ~SourceSyntaxFeatures(); protected: virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta, - const HG::Edge& edge, + const Hypergraph::Edge& edge, const std::vector& ant_contexts, SparseVector* features, SparseVector* estimated_features, @@ -28,7 +29,7 @@ class SourceSpanSizeFeatures : public FeatureFunction { ~SourceSpanSizeFeatures(); protected: virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta, - const HG::Edge& edge, + const Hypergraph::Edge& edge, const std::vector& ant_contexts, SparseVector* features, SparseVector* estimated_features, @@ -39,3 +40,4 @@ class SourceSpanSizeFeatures : public FeatureFunction { }; #endif + diff --git a/decoder/ff_source_syntax2.cc b/decoder/ff_source_syntax2.cc index 08ece917..63736342 100644 --- a/decoder/ff_source_syntax2.cc +++ b/decoder/ff_source_syntax2.cc @@ -17,7 +17,7 @@ using namespace std; struct SourceSyntaxFeatures2Impl { SourceSyntaxFeatures2Impl(const string& param) { - if (!(param.compare("") == 0)) { + if (param.compare("") != 0) { string triggered_features_fn = param; ReadFile triggered_features(triggered_features_fn); string in; @@ -29,10 +29,8 @@ struct SourceSyntaxFeatures2Impl { void InitializeGrids(const string& tree, unsigned src_len) { assert(tree.size() > 0); - //fids_cat.clear(); fids_ef.clear(); src_tree.clear(); - //fids_cat.resize(src_len, src_len + 1); fids_ef.resize(src_len, src_len + 1); src_tree.resize(src_len, src_len + 1, TD::Convert("XX")); ParseTreeString(tree, src_len); @@ -40,7 +38,7 @@ struct SourceSyntaxFeatures2Impl { void ParseTreeString(const string& tree, unsigned src_len) { //cerr << "TREE: " << tree << endl; - stack > stk; // first = i, second = category + stack > stk; // first = i, second = category pair cur_cat; cur_cat.first = -1; unsigned i = 0; unsigned p = 0; @@ -100,7 +98,7 @@ struct SourceSyntaxFeatures2Impl { if (k > 0 && fj <= 0) os << '_'; if (fj <= 0) { os << '[' << TD::Convert(ants[ntc++]) << ']'; - } /*else { + }/*else { os << TD::Convert(fj); }*/ } @@ -116,16 +114,22 @@ struct SourceSyntaxFeatures2Impl { fid_ef = FD::Convert(os.str()); //cerr << "FEATURE: " << os.str() << endl; //cerr << "FID_EF: " << fid_ef << endl; - if (feature_filter.find(fid_ef) != feature_filter.end()) { - cerr << "SYN-Feature was trigger more than once on training set." << endl; + if (feature_filter.size() > 0) { + if (feature_filter.find(fid_ef) != feature_filter.end()) { + //cerr << "SYN-Feature was trigger more than once on training set." << endl; + feats->set_value(fid_ef, 1.0); + } + //else cerr << "SYN-Feature was triggered less than once on training set." << endli; + } + else { feats->set_value(fid_ef, 1.0); } - else cerr << "SYN-Feature was triggered less than once on training set." 
<< endl; + cerr << FD::Convert(fid_ef) << endl; return lhs; } - Array2D src_tree; // src_tree(i,j) NT = type - mutable Array2D > fids_ef; // fires for fully lexicalized + Array2D src_tree; // src_tree(i,j) NT = type + mutable Array2D > fids_ef; // fires for fully lexicalized tr1::unordered_set feature_filter; }; @@ -157,3 +161,4 @@ void SourceSyntaxFeatures2::TraversalFeaturesImpl(const SentenceMetadata& smeta, void SourceSyntaxFeatures2::PrepareForInput(const SentenceMetadata& smeta) { impl->InitializeGrids(smeta.GetSGMLValue("src_tree"), smeta.GetSourceLength()); } + diff --git a/decoder/ff_source_syntax2.h b/decoder/ff_source_syntax2.h index b6b7dc3d..f606c2bf 100644 --- a/decoder/ff_source_syntax2.h +++ b/decoder/ff_source_syntax2.h @@ -1,5 +1,5 @@ -#ifndef _FF_SOURCE_TOOLS2_H_ -#define _FF_SOURCE_TOOLS2_H_ +#ifndef _FF_SOURCE_SYNTAX2_H_ +#define _FF_SOURCE_SYNTAX2_H_ #include "ff.h" #include "hg.h" @@ -23,3 +23,4 @@ class SourceSyntaxFeatures2 : public FeatureFunction { }; #endif + diff --git a/decoder/ff_source_syntax2_p.cc b/decoder/ff_source_syntax2_p.cc deleted file mode 100644 index dfa791ea..00000000 --- a/decoder/ff_source_syntax2_p.cc +++ /dev/null @@ -1,166 +0,0 @@ -#include "ff_source_syntax2_p.h" - -#include -#include -#include -#include - -#include "sentence_metadata.h" -#include "array2d.h" -#include "filelib.h" - -using namespace std; - -// implements the source side syntax features described in Blunsom et al. (EMNLP 2008) -// source trees must be represented in Penn Treebank format, e.g. -// (S (NP John) (VP (V left))) - -struct PSourceSyntaxFeatures2Impl { - PSourceSyntaxFeatures2Impl(const string& param) { - if (param.compare("") != 0) { - string triggered_features_fn = param; - ReadFile triggered_features(triggered_features_fn); - string in; - while(getline(*triggered_features, in)) { - feature_filter.insert(FD::Convert(in)); - } - } - /*cerr << "find(\"One\") == " << boolalpha << (table.find("One") != table.end()) << endl; - cerr << "find(\"Three\") == " << boolalpha << (table.find("Three") != table.end()) << endl;*/ - } - - void InitializeGrids(const string& tree, unsigned src_len) { - assert(tree.size() > 0); - //fids_cat.clear(); - fids_ef.clear(); - src_tree.clear(); - //fids_cat.resize(src_len, src_len + 1); - fids_ef.resize(src_len, src_len + 1); - src_tree.resize(src_len, src_len + 1, TD::Convert("XX")); - ParseTreeString(tree, src_len); - } - - void ParseTreeString(const string& tree, unsigned src_len) { - //cerr << "TREE: " << tree << endl; - stack > stk; // first = i, second = category - pair cur_cat; cur_cat.first = -1; - unsigned i = 0; - unsigned p = 0; - while(p < tree.size()) { - const char cur = tree[p]; - if (cur == '(') { - stk.push(cur_cat); - ++p; - unsigned k = p + 1; - while (k < tree.size() && tree[k] != ' ') { ++k; } - cur_cat.first = i; - cur_cat.second = TD::Convert(tree.substr(p, k - p)); - // cerr << "NT: '" << tree.substr(p, k-p) << "' (i=" << i << ")\n"; - p = k + 1; - } else if (cur == ')') { - unsigned k = p; - while (k < tree.size() && tree[k] == ')') { ++k; } - const unsigned num_closes = k - p; - for (unsigned ci = 0; ci < num_closes; ++ci) { - src_tree(cur_cat.first, i) = cur_cat.second; - cur_cat = stk.top(); - stk.pop(); - } - p = k; - while (p < tree.size() && (tree[p] == ' ' || tree[p] == '\t')) { ++p; } - } else if (cur == ' ' || cur == '\t') { - cerr << "Unexpected whitespace in: " << tree << endl; - abort(); - } else { // terminal symbol - unsigned k = p + 1; - do { - while (k < tree.size() && tree[k] != ')' && tree[k] != ' 
') { ++k; } - // cerr << "TERM: '" << tree.substr(p, k-p) << "' (i=" << i << ")\n"; - ++i; - assert(i <= src_len); - while (k < tree.size() && tree[k] == ' ') { ++k; } - p = k; - } while (p < tree.size() && tree[p] != ')'); - } - //cerr << "i=" << i << " src_len=" << src_len << endl; - } - //cerr << "i=" << i << " src_len=" << src_len << endl; - assert(i == src_len); // make sure tree specified in src_tree is - // the same length as the source sentence - } - - WordID FireFeatures(const TRule& rule, const int i, const int j, const WordID* ants, SparseVector* feats) { - //cerr << "fire features: " << rule.AsString() << " for " << i << "," << j << endl; - const WordID lhs = src_tree(i,j); - int& fid_ef = fids_ef(i,j)[&rule]; - ostringstream os; - os << "SYN:" << TD::Convert(lhs); - os << ':'; - unsigned ntc = 0; - for (unsigned k = 0; k < rule.f_.size(); ++k) { - int fj = rule.f_[k]; - if (k > 0 && fj <= 0) os << '_'; - if (fj <= 0) { - os << '[' << TD::Convert(ants[ntc++]) << ']'; - } /*else { - os << TD::Convert(fj); - }*/ - } - os << ':'; - for (unsigned k = 0; k < rule.e_.size(); ++k) { - const int ei = rule.e_[k]; - if (k > 0) os << '_'; - if (ei <= 0) - os << '[' << (1-ei) << ']'; - else - os << TD::Convert(ei); - } - fid_ef = FD::Convert(os.str()); - //cerr << "FEATURE: " << os.str() << endl; - //cerr << "FID_EF: " << fid_ef << endl; - if (feature_filter.size() > 0) { - if (feature_filter.find(fid_ef) != feature_filter.end()) { - //cerr << "SYN-Feature was trigger more than once on training set." << endl; - feats->set_value(fid_ef, 1.0); - } - //else cerr << "SYN-Feature was triggered less than once on training set." << endli; - } - else { - feats->set_value(fid_ef, 1.0); - } - return lhs; - } - - Array2D src_tree; // src_tree(i,j) NT = type - mutable Array2D > fids_ef; // fires for fully lexicalized - tr1::unordered_set feature_filter; - -}; - -PSourceSyntaxFeatures2::PSourceSyntaxFeatures2(const string& param) : - FeatureFunction(sizeof(WordID)) { - impl = new PSourceSyntaxFeatures2Impl(param); -} - -PSourceSyntaxFeatures2::~PSourceSyntaxFeatures2() { - delete impl; - impl = NULL; -} - -void PSourceSyntaxFeatures2::TraversalFeaturesImpl(const SentenceMetadata& smeta, - const Hypergraph::Edge& edge, - const vector& ant_contexts, - SparseVector* features, - SparseVector* estimated_features, - void* context) const { - WordID ants[8]; - for (unsigned i = 0; i < ant_contexts.size(); ++i) - ants[i] = *static_cast(ant_contexts[i]); - - *static_cast(context) = - impl->FireFeatures(*edge.rule_, edge.i_, edge.j_, ants, features); -} - -void PSourceSyntaxFeatures2::PrepareForInput(const SentenceMetadata& smeta) { - impl->InitializeGrids(smeta.GetSGMLValue("src_tree"), smeta.GetSourceLength()); -} diff --git a/decoder/ff_source_syntax2_p.h b/decoder/ff_source_syntax2_p.h deleted file mode 100644 index d56ecab0..00000000 --- a/decoder/ff_source_syntax2_p.h +++ /dev/null @@ -1,25 +0,0 @@ -#ifndef _FF_SOURCE_TOOLS2_H_ -#define _FF_SOURCE_TOOLS2_H_ - -#include "ff.h" -#include "hg.h" - -struct PSourceSyntaxFeatures2Impl; - -class PSourceSyntaxFeatures2 : public FeatureFunction { - public: - PSourceSyntaxFeatures2(const std::string& param); - ~PSourceSyntaxFeatures2(); - protected: - virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta, - const Hypergraph::Edge& edge, - const std::vector& ant_contexts, - SparseVector* features, - SparseVector* estimated_features, - void* context) const; - virtual void PrepareForInput(const SentenceMetadata& smeta); - private: - PSourceSyntaxFeatures2Impl* 
impl; -}; - -#endif diff --git a/decoder/ff_source_syntax_p.cc b/decoder/ff_source_syntax_p.cc deleted file mode 100644 index cd081544..00000000 --- a/decoder/ff_source_syntax_p.cc +++ /dev/null @@ -1,245 +0,0 @@ -#include "ff_source_syntax_p.h" - -#include -#include -#include - -#include "sentence_metadata.h" -#include "array2d.h" -#include "filelib.h" - -using namespace std; - -// implements the source side syntax features described in Blunsom et al. (EMNLP 2008) -// source trees must be represented in Penn Treebank format, e.g. -// (S (NP John) (VP (V left))) - -// log transform to make long spans cluster together -// but preserve differences -inline int SpanSizeTransform(unsigned span_size) { - if (!span_size) return 0; - return static_cast(log(span_size+1) / log(1.39)) - 1; -} - -struct PSourceSyntaxFeaturesImpl { - PSourceSyntaxFeaturesImpl() {} - - PSourceSyntaxFeaturesImpl(const string& param) { - if (!(param.compare("") == 0)) { - string triggered_features_fn = param; - ReadFile triggered_features(triggered_features_fn); - string in; - while(getline(*triggered_features, in)) { - feature_filter.insert(FD::Convert(in)); - } - } - } - - void InitializeGrids(const string& tree, unsigned src_len) { - assert(tree.size() > 0); - //fids_cat.clear(); - fids_ef.clear(); - src_tree.clear(); - //fids_cat.resize(src_len, src_len + 1); - fids_ef.resize(src_len, src_len + 1); - src_tree.resize(src_len, src_len + 1, TD::Convert("XX")); - ParseTreeString(tree, src_len); - } - - void ParseTreeString(const string& tree, unsigned src_len) { - stack > stk; // first = i, second = category - pair cur_cat; cur_cat.first = -1; - unsigned i = 0; - unsigned p = 0; - while(p < tree.size()) { - const char cur = tree[p]; - if (cur == '(') { - stk.push(cur_cat); - ++p; - unsigned k = p + 1; - while (k < tree.size() && tree[k] != ' ') { ++k; } - cur_cat.first = i; - cur_cat.second = TD::Convert(tree.substr(p, k - p)); - // cerr << "NT: '" << tree.substr(p, k-p) << "' (i=" << i << ")\n"; - p = k + 1; - } else if (cur == ')') { - unsigned k = p; - while (k < tree.size() && tree[k] == ')') { ++k; } - const unsigned num_closes = k - p; - for (unsigned ci = 0; ci < num_closes; ++ci) { - // cur_cat.second spans from cur_cat.first to i - // cerr << TD::Convert(cur_cat.second) << " from " << cur_cat.first << " to " << i << endl; - // NOTE: unary rule chains end up being labeled with the top-most category - src_tree(cur_cat.first, i) = cur_cat.second; - cur_cat = stk.top(); - stk.pop(); - } - p = k; - while (p < tree.size() && (tree[p] == ' ' || tree[p] == '\t')) { ++p; } - } else if (cur == ' ' || cur == '\t') { - cerr << "Unexpected whitespace in: " << tree << endl; - abort(); - } else { // terminal symbol - unsigned k = p + 1; - do { - while (k < tree.size() && tree[k] != ')' && tree[k] != ' ') { ++k; } - // cerr << "TERM: '" << tree.substr(p, k-p) << "' (i=" << i << ")\n"; - ++i; - assert(i <= src_len); - while (k < tree.size() && tree[k] == ' ') { ++k; } - p = k; - } while (p < tree.size() && tree[p] != ')'); - } - } - // cerr << "i=" << i << " src_len=" << src_len << endl; - assert(i == src_len); // make sure tree specified in src_tree is - // the same length as the source sentence - } - - WordID FireFeatures(const TRule& rule, const int i, const int j, const WordID* ants, SparseVector* feats) { - //cerr << "fire features: " << rule.AsString() << " for " << i << "," << j << endl; - const WordID lhs = src_tree(i,j); - //int& fid_cat = fids_cat(i,j); - int& fid_ef = fids_ef(i,j)[&rule]; - if (fid_ef <= 0) { - 
ostringstream os; - //ostringstream os2; - os << "SYN:" << TD::Convert(lhs); - //os2 << "SYN:" << TD::Convert(lhs) << '_' << SpanSizeTransform(j - i); - //fid_cat = FD::Convert(os2.str()); - os << ':'; - unsigned ntc = 0; - for (unsigned k = 0; k < rule.f_.size(); ++k) { - if (k > 0) os << '_'; - int fj = rule.f_[k]; - if (fj <= 0) { - os << '[' << TD::Convert(ants[ntc++]) << ']'; - } else { - os << TD::Convert(fj); - } - } - os << ':'; - for (unsigned k = 0; k < rule.e_.size(); ++k) { - const int ei = rule.e_[k]; - if (k > 0) os << '_'; - if (ei <= 0) - os << '[' << (1-ei) << ']'; - else - os << TD::Convert(ei); - } - fid_ef = FD::Convert(os.str()); - } - //if (fid_cat > 0) - // feats->set_value(fid_cat, 1.0); - if (fid_ef > 0 && (feature_filter.find(fid_ef) != feature_filter.end())) - feats->set_value(fid_ef, 1.0); - return lhs; - } - - Array2D src_tree; // src_tree(i,j) NT = type - // mutable Array2D fids_cat; // this tends to overfit baddly - mutable Array2D > fids_ef; // fires for fully lexicalized - tr1::unordered_set feature_filter; -}; - -PSourceSyntaxFeatures::PSourceSyntaxFeatures(const string& param) : - FeatureFunction(sizeof(WordID)) { - impl = new PSourceSyntaxFeaturesImpl(param); -} - -PSourceSyntaxFeatures::~PSourceSyntaxFeatures() { - delete impl; - impl = NULL; -} - -void PSourceSyntaxFeatures::TraversalFeaturesImpl(const SentenceMetadata& smeta, - const Hypergraph::Edge& edge, - const vector& ant_contexts, - SparseVector* features, - SparseVector* estimated_features, - void* context) const { - WordID ants[8]; - for (unsigned i = 0; i < ant_contexts.size(); ++i) - ants[i] = *static_cast(ant_contexts[i]); - - *static_cast(context) = - impl->FireFeatures(*edge.rule_, edge.i_, edge.j_, ants, features); -} - -void PSourceSyntaxFeatures::PrepareForInput(const SentenceMetadata& smeta) { - impl->InitializeGrids(smeta.GetSGMLValue("src_tree"), smeta.GetSourceLength()); -} - -struct PSourceSpanSizeFeaturesImpl { - PSourceSpanSizeFeaturesImpl() {} - - void InitializeGrids(unsigned src_len) { - fids.clear(); - fids.resize(src_len, src_len + 1); - } - - int FireFeatures(const TRule& rule, const int i, const int j, const WordID* ants, SparseVector* feats) { - if (rule.Arity() > 0) { - int& fid = fids(i,j)[&rule]; - if (fid <= 0) { - ostringstream os; - os << "SSS:"; - unsigned ntc = 0; - for (unsigned k = 0; k < rule.f_.size(); ++k) { - if (k > 0) os << '_'; - int fj = rule.f_[k]; - if (fj <= 0) { - os << '[' << TD::Convert(-fj) << ants[ntc++] << ']'; - } else { - os << TD::Convert(fj); - } - } - os << ':'; - for (unsigned k = 0; k < rule.e_.size(); ++k) { - const int ei = rule.e_[k]; - if (k > 0) os << '_'; - if (ei <= 0) - os << '[' << (1-ei) << ']'; - else - os << TD::Convert(ei); - } - fid = FD::Convert(os.str()); - } - if (fid > 0) - feats->set_value(fid, 1.0); - } - return SpanSizeTransform(j - i); - } - - mutable Array2D > fids; -}; - -PSourceSpanSizeFeatures::PSourceSpanSizeFeatures(const string& param) : - FeatureFunction(sizeof(char)) { - impl = new PSourceSpanSizeFeaturesImpl; -} - -PSourceSpanSizeFeatures::~PSourceSpanSizeFeatures() { - delete impl; - impl = NULL; -} - -void PSourceSpanSizeFeatures::TraversalFeaturesImpl(const SentenceMetadata& smeta, - const Hypergraph::Edge& edge, - const vector& ant_contexts, - SparseVector* features, - SparseVector* estimated_features, - void* context) const { - int ants[8]; - for (unsigned i = 0; i < ant_contexts.size(); ++i) - ants[i] = *static_cast(ant_contexts[i]); - - *static_cast(context) = - impl->FireFeatures(*edge.rule_, 
edge.i_, edge.j_, ants, features); -} - -void PSourceSpanSizeFeatures::PrepareForInput(const SentenceMetadata& smeta) { - impl->InitializeGrids(smeta.GetSourceLength()); -} - - diff --git a/decoder/ff_source_syntax_p.h b/decoder/ff_source_syntax_p.h deleted file mode 100644 index 2dd9094a..00000000 --- a/decoder/ff_source_syntax_p.h +++ /dev/null @@ -1,42 +0,0 @@ -#ifndef _FF_SOURCE_TOOLS_H_ -#define _FF_SOURCE_TOOLS_H_ - -#include "ff.h" -#include "hg.h" - -struct PSourceSyntaxFeaturesImpl; - -class PSourceSyntaxFeatures : public FeatureFunction { - public: - PSourceSyntaxFeatures(const std::string& param); - ~PSourceSyntaxFeatures(); - protected: - virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta, - const Hypergraph::Edge& edge, - const std::vector& ant_contexts, - SparseVector* features, - SparseVector* estimated_features, - void* context) const; - virtual void PrepareForInput(const SentenceMetadata& smeta); - private: - PSourceSyntaxFeaturesImpl* impl; -}; - -struct PSourceSpanSizeFeaturesImpl; -class PSourceSpanSizeFeatures : public FeatureFunction { - public: - PSourceSpanSizeFeatures(const std::string& param); - ~PSourceSpanSizeFeatures(); - protected: - virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta, - const Hypergraph::Edge& edge, - const std::vector& ant_contexts, - SparseVector* features, - SparseVector* estimated_features, - void* context) const; - virtual void PrepareForInput(const SentenceMetadata& smeta); - private: - PSourceSpanSizeFeaturesImpl* impl; -}; - -#endif -- cgit v1.2.3 From decd2c4b1d4fb42a73a3217f347ea8f317e50869 Mon Sep 17 00:00:00 2001 From: Patrick Simianer Date: Tue, 5 Nov 2013 18:15:18 +0100 Subject: syntax features now read trees from files -- no more escaping! --- decoder/ff_parse_match.cc | 5 ++++- decoder/ff_soft_syntax.cc | 15 +++++++++------ decoder/ff_soft_syntax_mindist.cc | 15 +++++++++------ decoder/ff_source_syntax.cc | 7 +++++-- decoder/ff_source_syntax2.cc | 7 +++++-- utils/filelib.h | 5 ++++- 6 files changed, 36 insertions(+), 18 deletions(-) diff --git a/decoder/ff_parse_match.cc b/decoder/ff_parse_match.cc index 7c79302b..58026975 100644 --- a/decoder/ff_parse_match.cc +++ b/decoder/ff_parse_match.cc @@ -212,6 +212,9 @@ void ParseMatchFeatures::TraversalFeaturesImpl(const SentenceMetadata& smeta, } void ParseMatchFeatures::PrepareForInput(const SentenceMetadata& smeta) { - impl->InitializeGrids(smeta.GetSGMLValue("src_tree"), smeta.GetSourceLength()); + ReadFile f = ReadFile(smeta.GetSGMLValue("src_tree")); + string tree; + f.ReadAll(tree); + impl->InitializeGrids(tree, smeta.GetSourceLength()); } diff --git a/decoder/ff_soft_syntax.cc b/decoder/ff_soft_syntax.cc index a3d26135..23fe87bd 100644 --- a/decoder/ff_soft_syntax.cc +++ b/decoder/ff_soft_syntax.cc @@ -107,10 +107,10 @@ struct SoftSyntaxFeaturesImpl { switch(feat_type) { case '2': if (lhs_str.compare(label) == 0) { - os << "SYN:" << label << "_conform"; + os << "SOFT:" << label << "_conform"; } else { - os << "SYN:" << label << "_cross"; + os << "SOFT:" << label << "_cross"; } fid_ef = FD::Convert(os.str()); if (fid_ef > 0) { @@ -119,7 +119,7 @@ struct SoftSyntaxFeaturesImpl { } break; case '_': - os << "SYN:" << label; + os << "SOFT:" << label; fid_ef = FD::Convert(os.str()); if (lhs_str.compare(label) == 0) { if (fid_ef > 0) { @@ -136,7 +136,7 @@ struct SoftSyntaxFeaturesImpl { break; case '+': if (lhs_str.compare(label) == 0) { - os << "SYN:" << label << "_conform"; + os << "SOFT:" << label << "_conform"; fid_ef = FD::Convert(os.str()); if 
(fid_ef > 0) { //cerr << "Feature: " << os.str() << endl; @@ -147,7 +147,7 @@ struct SoftSyntaxFeaturesImpl { case '-': //cerr << "-" << endl; if (lhs_str.compare(label) != 0) { - os << "SYN:" << label << "_cross"; + os << "SOFT:" << label << "_cross"; fid_ef = FD::Convert(os.str()); if (fid_ef > 0) { //cerr << "Feature :" << os.str() << endl; @@ -194,6 +194,9 @@ void SoftSyntaxFeatures::TraversalFeaturesImpl(const SentenceMetadata& smeta, } void SoftSyntaxFeatures::PrepareForInput(const SentenceMetadata& smeta) { - impl->InitializeGrids(smeta.GetSGMLValue("src_tree"), smeta.GetSourceLength()); + ReadFile f = ReadFile(smeta.GetSGMLValue("src_tree")); + string tree; + f.ReadAll(tree); + impl->InitializeGrids(tree, smeta.GetSourceLength()); } diff --git a/decoder/ff_soft_syntax_mindist.cc b/decoder/ff_soft_syntax_mindist.cc index 3f531986..a23f70f8 100644 --- a/decoder/ff_soft_syntax_mindist.cc +++ b/decoder/ff_soft_syntax_mindist.cc @@ -146,10 +146,10 @@ struct SoftSyntaxFeaturesMindistImpl { case '2': if (min_dist_label.compare(label) == 0) { if (min_dist == 0) { - os << "SYN:" << label << "_conform"; + os << "SOFTM:" << label << "_conform"; } else { - os << "SYN:" << label << "_cross"; + os << "SOFTM:" << label << "_cross"; } fid_ef = FD::Convert(os.str()); //cerr << "Feature :" << os.str() << endl; @@ -157,7 +157,7 @@ struct SoftSyntaxFeaturesMindistImpl { } break; case '_': - os << "SYN:" << label; + os << "SOFTM:" << label; fid_ef = FD::Convert(os.str()); if (min_dist_label.compare(label) == 0) { //cerr << "Feature: " << os.str() << endl; @@ -172,7 +172,7 @@ struct SoftSyntaxFeaturesMindistImpl { break; case '+': if (min_dist_label.compare(label) == 0) { - os << "SYN:" << label << "_conform"; + os << "SOFTM:" << label << "_conform"; fid_ef = FD::Convert(os.str()); if (min_dist == 0) { //cerr << "Feature: " << os.str() << endl; @@ -183,7 +183,7 @@ struct SoftSyntaxFeaturesMindistImpl { case '-': //cerr << "-" << endl; if (min_dist_label.compare(label) != 0) { - os << "SYN:" << label << "_cross"; + os << "SOFTM:" << label << "_cross"; fid_ef = FD::Convert(os.str()); if (min_dist > 0) { //cerr << "Feature :" << os.str() << endl; @@ -230,6 +230,9 @@ void SoftSyntaxFeaturesMindist::TraversalFeaturesImpl(const SentenceMetadata& sm } void SoftSyntaxFeaturesMindist::PrepareForInput(const SentenceMetadata& smeta) { - impl->InitializeGrids(smeta.GetSGMLValue("src_tree"), smeta.GetSourceLength()); + ReadFile f = ReadFile(smeta.GetSGMLValue("src_tree")); + string tree; + f.ReadAll(tree); + impl->InitializeGrids(tree, smeta.GetSourceLength()); } diff --git a/decoder/ff_source_syntax.cc b/decoder/ff_source_syntax.cc index 34e7ab69..4879ca1d 100644 --- a/decoder/ff_source_syntax.cc +++ b/decoder/ff_source_syntax.cc @@ -104,7 +104,7 @@ struct SourceSyntaxFeaturesImpl { if (fid_ef <= 0) { ostringstream os; //ostringstream os2; - os << "SYN:" << TD::Convert(lhs); + os << "SSYN:" << TD::Convert(lhs); //os2 << "SYN:" << TD::Convert(lhs) << '_' << SpanSizeTransform(j - i); //fid_cat = FD::Convert(os2.str()); os << ':'; @@ -173,7 +173,10 @@ void SourceSyntaxFeatures::TraversalFeaturesImpl(const SentenceMetadata& smeta, } void SourceSyntaxFeatures::PrepareForInput(const SentenceMetadata& smeta) { - impl->InitializeGrids(smeta.GetSGMLValue("src_tree"), smeta.GetSourceLength()); + ReadFile f = ReadFile(smeta.GetSGMLValue("src_tree")); + string tree; + f.ReadAll(tree); + impl->InitializeGrids(tree, smeta.GetSourceLength()); } struct SourceSpanSizeFeaturesImpl { diff --git a/decoder/ff_source_syntax2.cc 
b/decoder/ff_source_syntax2.cc index 63736342..9d0bc33f 100644 --- a/decoder/ff_source_syntax2.cc +++ b/decoder/ff_source_syntax2.cc @@ -90,7 +90,7 @@ struct SourceSyntaxFeatures2Impl { const WordID lhs = src_tree(i,j); int& fid_ef = fids_ef(i,j)[&rule]; ostringstream os; - os << "SYN:" << TD::Convert(lhs); + os << "SSYN2:" << TD::Convert(lhs); os << ':'; unsigned ntc = 0; for (unsigned k = 0; k < rule.f_.size(); ++k) { @@ -159,6 +159,9 @@ void SourceSyntaxFeatures2::TraversalFeaturesImpl(const SentenceMetadata& smeta, } void SourceSyntaxFeatures2::PrepareForInput(const SentenceMetadata& smeta) { - impl->InitializeGrids(smeta.GetSGMLValue("src_tree"), smeta.GetSourceLength()); + ReadFile f = ReadFile(smeta.GetSGMLValue("src_tree")); + string tree; + f.ReadAll(tree); + impl->InitializeGrids(tree, smeta.GetSourceLength()); } diff --git a/utils/filelib.h b/utils/filelib.h index b9ea3940..4fa69760 100644 --- a/utils/filelib.h +++ b/utils/filelib.h @@ -75,7 +75,10 @@ class ReadFile : public BaseFile { } } } - + void ReadAll(std::string& s) { + getline(*stream(), s, (char) EOF); + if (s.size() > 0) s.resize(s.size()-1); + } }; class WriteFile : public BaseFile { -- cgit v1.2.3 From a8ea0a66b798326061bc9f0da153b96b730130f1 Mon Sep 17 00:00:00 2001 From: Patrick Simianer Date: Tue, 12 Nov 2013 18:36:03 +0100 Subject: implemented batch tuning --- training/dtrain/dtrain.cc | 81 ++++++++++++++++++++++------ training/dtrain/examples/standard/dtrain.ini | 4 +- 2 files changed, 67 insertions(+), 18 deletions(-) diff --git a/training/dtrain/dtrain.cc b/training/dtrain/dtrain.cc index a496f08a..23131810 100644 --- a/training/dtrain/dtrain.cc +++ b/training/dtrain/dtrain.cc @@ -42,6 +42,9 @@ dtrain_init(int argc, char** argv, po::variables_map* cfg) ("loss_margin", po::value()->default_value(0.), "update if no error in pref pair but model scores this near") ("max_pairs", po::value()->default_value(std::numeric_limits::max()), "max. # of pairs per Sent.") ("pclr", po::value()->default_value("no"), "use a (simple|adagrad) per-coordinate learning rate") + ("batch", po::value()->zero_tokens(), "do batch optimization") + //("repeat", po::value()->default_value(1), "repeat optimization over kbest list this number of times") + //("test-k-best", po::value()->zero_tokens(), "check if optimization works (use repeat >= 2)") ("noup", po::value()->zero_tokens(), "do not update weights"); po::options_description cl("Command Line Options"); cl.add_options() @@ -126,7 +129,12 @@ main(int argc, char** argv) const float hi_lo = cfg["hi_lo"].as(); const score_t approx_bleu_d = cfg["approx_bleu_d"].as(); const unsigned max_pairs = cfg["max_pairs"].as(); + //int repeat = cfg["repeat"].as(); + //bool test_k_best = false; + //if (cfg.count("test-k-best")) test_k_best = true; weight_t loss_margin = cfg["loss_margin"].as(); + bool batch = false; + if (cfg.count("batch")) batch = true; if (loss_margin > 9998.) 
loss_margin = std::numeric_limits::max(); bool scale_bleu_diff = false; if (cfg.count("scale_bleu_diff")) scale_bleu_diff = true; @@ -184,10 +192,10 @@ main(int argc, char** argv) observer->SetScorer(scorer); // init weights - vector& dense_weights = decoder.CurrentWeightVector(); + vector& decoder_weights = decoder.CurrentWeightVector(); SparseVector lambdas, cumulative_penalties, w_average; - if (cfg.count("input_weights")) Weights::InitFromFile(cfg["input_weights"].as(), &dense_weights); - Weights::InitSparseVector(dense_weights, &lambdas); + if (cfg.count("input_weights")) Weights::InitFromFile(cfg["input_weights"].as(), &decoder_weights); + Weights::InitSparseVector(decoder_weights, &lambdas); // meta params for perceptron, SVM weight_t eta = cfg["learning_rate"].as(); @@ -245,6 +253,7 @@ main(int argc, char** argv) cerr << setw(25) << "k " << k << endl; cerr << setw(25) << "N " << N << endl; cerr << setw(25) << "T " << T << endl; + cerr << setw(25) << "batch " << batch << endl; cerr << setw(26) << "scorer '" << scorer_str << "'" << endl; if (scorer_str == "approx_bleu") cerr << setw(25) << "approx. B discount " << approx_bleu_d << endl; @@ -267,6 +276,8 @@ main(int argc, char** argv) cerr << setw(25) << "rescale " << rescale << endl; cerr << setw(25) << "pclr " << pclr << endl; cerr << setw(25) << "max pairs " << max_pairs << endl; + //cerr << setw(25) << "repeat " << repeat << endl; + //cerr << setw(25) << "test k-best " << test_k_best << endl; cerr << setw(25) << "cdec cfg " << "'" << cfg["decoder_config"].as() << "'" << endl; cerr << setw(25) << "input " << "'" << input_fn << "'" << endl; if (!read_bitext) @@ -281,17 +292,25 @@ main(int argc, char** argv) // pclr SparseVector learning_rates; + // batch + SparseVector batch_updates; + weight_t batch_loss; + + //int did_improve; // FIXME for test-k-best for (unsigned t = 0; t < T; t++) // T epochs { - + time_t start, end; time(&start); score_t score_sum = 0.; score_t model_sum(0); unsigned ii = 0, rank_errors = 0, margin_violations = 0, npairs = 0, f_count = 0, list_sz = 0; + batch_loss = 0.; if (!quiet) cerr << "Iteration #" << t+1 << " of " << T << "." 
<< endl; + //did_improve = 0; + while(true) { @@ -337,7 +356,7 @@ main(int argc, char** argv) if (next || stop) break; // weights - lambdas.init_vector(&dense_weights); + lambdas.init_vector(&decoder_weights); // getting input vector ref_ids; // reference as vector @@ -392,33 +411,51 @@ main(int argc, char** argv) partXYX(samples, pairs, pair_threshold, max_pairs, faster_perceptron, hi_lo); if (pair_sampling == "PRO") PROsampling(samples, pairs, pair_threshold, max_pairs); - npairs += pairs.size(); + int cur_npairs = pairs.size(); + npairs += cur_npairs; + + weight_t kbest_loss_first, kbest_loss_last = 0.0; +//for (int q=0; q < repeat; q++) { // repeat + + weight_t kbest_loss = 0.0; // test-k-best SparseVector lambdas_copy; // for l1 regularization SparseVector sum_up; // for pclr if (l1naive||l1clip||l1cumul) lambdas_copy = lambdas; for (vector >::iterator it = pairs.begin(); it != pairs.end(); it++) { - bool rank_error; + + /*if (repeat > 1) { + double x = max(0.0, -1.0 * (lambdas.dot(it->first.f) - lambdas.dot(it->second.f))); + kbest_loss += x; + }*/ + + score_t model_diff = it->first.model - it->second.model; + bool rank_error = false; score_t margin; if (faster_perceptron) { // we only have considering misranked pairs rank_error = true; // pair sampling already did this for us margin = std::numeric_limits::max(); } else { - rank_error = it->first.model <= it->second.model; - margin = fabs(it->first.model - it->second.model); + rank_error = model_diff<=0.0; + margin = fabs(model_diff); if (!rank_error && margin < loss_margin) margin_violations++; } if (rank_error) rank_errors++; if (scale_bleu_diff) eta = it->first.score - it->second.score; if (rank_error || margin < loss_margin) { SparseVector diff_vec = it->first.f - it->second.f; + if (batch) { + batch_loss += max(0., -1.0*model_diff); + batch_updates += diff_vec; + continue; + } if (pclr != "no") { sum_up += diff_vec; } else { lambdas.plus_eq_v_times_s(diff_vec, eta); - if (gamma) lambdas.plus_eq_v_times_s(lambdas, -2*gamma*eta*(1./npairs)); // FIXME + if (gamma) lambdas.plus_eq_v_times_s(lambdas, -2*gamma*eta*(1./cur_npairs)); } } } @@ -487,6 +524,11 @@ main(int argc, char** argv) } } + //if (q==0) { kbest_loss_first = kbest_loss; } + //if (q==repeat-1) { kbest_loss_last = kbest_loss; } +//}//repeat +//if((kbest_loss_first - kbest_loss_last) > 0) did_improve++; + } if (rescale) lambdas /= lambdas.l2norm(); @@ -495,14 +537,20 @@ main(int argc, char** argv) } // input loop - if (average) w_average += lambdas; + if (t == 0) in_sz = ii; // remember size of input (# lines) - if (scorer_str == "approx_bleu" || scorer_str == "lc_bleu") scorer->Reset(); + //if (repeat > 1) cout << "did improve? 
" << did_improve << " out of " << in_sz << endl; - if (t == 0) { - in_sz = ii; // remember size of input (# lines) + if (batch) { + lambdas.plus_eq_v_times_s(batch_updates, eta); + if (gamma) lambdas.plus_eq_v_times_s(lambdas, -2*gamma*eta*(1./npairs)); + batch_updates.clear(); } + if (average) w_average += lambdas; + + if (scorer_str == "approx_bleu" || scorer_str == "lc_bleu") scorer->Reset(); + // print some stats score_t score_avg = score_sum/(score_t)in_sz; score_t model_avg = model_sum/(score_t)in_sz; @@ -534,6 +582,7 @@ main(int argc, char** argv) cerr << endl; cerr << " avg # rank err: "; cerr << rank_errors/(float)in_sz << endl; + if (batch) cerr << " batch loss: " << batch_loss << endl; cerr << " avg # margin viol: "; cerr << margin_violations/(float)in_sz << endl; cerr << " non0 feature count: " << nonz << endl; @@ -562,9 +611,9 @@ main(int argc, char** argv) // write weights to file if (select_weights == "best" || keep) { - lambdas.init_vector(&dense_weights); + lambdas.init_vector(&decoder_weights); string w_fn = "weights." + boost::lexical_cast(t) + ".gz"; - Weights::WriteToFile(w_fn, dense_weights, true); + Weights::WriteToFile(w_fn, decoder_weights, true); } } // outer loop diff --git a/training/dtrain/examples/standard/dtrain.ini b/training/dtrain/examples/standard/dtrain.ini index 7dbb4ff0..4d096dfb 100644 --- a/training/dtrain/examples/standard/dtrain.ini +++ b/training/dtrain/examples/standard/dtrain.ini @@ -11,11 +11,11 @@ print_weights=Glue WordPenalty LanguageModel LanguageModel_OOV PhraseModel_0 Phr stop_after=10 # stop epoch after 10 inputs # interesting stuff -epochs=3 # run over input 3 times +epochs=100 # run over input 3 times k=100 # use 100best lists N=4 # optimize (approx) BLEU4 scorer=fixed_stupid_bleu # use 'stupid' BLEU+1 -learning_rate=1.0 # learning rate, don't care if gamma=0 (perceptron) and loss_margin=0 (not margin perceptron) +learning_rate=0.0001 # learning rate, don't care if gamma=0 (perceptron) and loss_margin=0 (not margin perceptron) gamma=0 # use SVM reg sample_from=kbest # use kbest lists (as opposed to forest) filter=uniq # only unique entries in kbest (surface form) -- cgit v1.2.3 From 68b0969b1f41eacb4b336a66625894995b2f1e74 Mon Sep 17 00:00:00 2001 From: Patrick Simianer Date: Tue, 12 Nov 2013 20:07:47 +0100 Subject: impl repeat param --- training/dtrain/dtrain.cc | 78 ++++++++++++++++------------ training/dtrain/examples/standard/dtrain.ini | 6 ++- 2 files changed, 49 insertions(+), 35 deletions(-) diff --git a/training/dtrain/dtrain.cc b/training/dtrain/dtrain.cc index 23131810..441e2cd7 100644 --- a/training/dtrain/dtrain.cc +++ b/training/dtrain/dtrain.cc @@ -43,7 +43,7 @@ dtrain_init(int argc, char** argv, po::variables_map* cfg) ("max_pairs", po::value()->default_value(std::numeric_limits::max()), "max. 
# of pairs per Sent.") ("pclr", po::value()->default_value("no"), "use a (simple|adagrad) per-coordinate learning rate") ("batch", po::value()->zero_tokens(), "do batch optimization") - //("repeat", po::value()->default_value(1), "repeat optimization over kbest list this number of times") + ("repeat", po::value()->default_value(1), "repeat optimization over kbest list this number of times") //("test-k-best", po::value()->zero_tokens(), "check if optimization works (use repeat >= 2)") ("noup", po::value()->zero_tokens(), "do not update weights"); po::options_description cl("Command Line Options"); @@ -129,7 +129,7 @@ main(int argc, char** argv) const float hi_lo = cfg["hi_lo"].as(); const score_t approx_bleu_d = cfg["approx_bleu_d"].as(); const unsigned max_pairs = cfg["max_pairs"].as(); - //int repeat = cfg["repeat"].as(); + int repeat = cfg["repeat"].as(); //bool test_k_best = false; //if (cfg.count("test-k-best")) test_k_best = true; weight_t loss_margin = cfg["loss_margin"].as(); @@ -276,7 +276,7 @@ main(int argc, char** argv) cerr << setw(25) << "rescale " << rescale << endl; cerr << setw(25) << "pclr " << pclr << endl; cerr << setw(25) << "max pairs " << max_pairs << endl; - //cerr << setw(25) << "repeat " << repeat << endl; + cerr << setw(25) << "repeat " << repeat << endl; //cerr << setw(25) << "test k-best " << test_k_best << endl; cerr << setw(25) << "cdec cfg " << "'" << cfg["decoder_config"].as() << "'" << endl; cerr << setw(25) << "input " << "'" << input_fn << "'" << endl; @@ -294,23 +294,19 @@ main(int argc, char** argv) SparseVector learning_rates; // batch SparseVector batch_updates; - weight_t batch_loss; - - //int did_improve; // FIXME for test-k-best + score_t batch_loss; for (unsigned t = 0; t < T; t++) // T epochs { - + time_t start, end; time(&start); score_t score_sum = 0.; score_t model_sum(0); - unsigned ii = 0, rank_errors = 0, margin_violations = 0, npairs = 0, f_count = 0, list_sz = 0; + unsigned ii = 0, rank_errors = 0, margin_violations = 0, npairs = 0, f_count = 0, list_sz = 0, kbest_loss_improve = 0; batch_loss = 0.; if (!quiet) cerr << "Iteration #" << t+1 << " of " << T << "." 
<< endl; - //did_improve = 0; - while(true) { @@ -395,8 +391,10 @@ main(int argc, char** argv) } } - score_sum += (*samples)[0].score; // stats for 1best - model_sum += (*samples)[0].model; + if (repeat == 1) { + score_sum += (*samples)[0].score; // stats for 1best + model_sum += (*samples)[0].model; + } f_count += observer->get_f_count(); list_sz += observer->get_sz(); @@ -414,24 +412,22 @@ main(int argc, char** argv) int cur_npairs = pairs.size(); npairs += cur_npairs; - weight_t kbest_loss_first, kbest_loss_last = 0.0; + score_t kbest_loss_first, kbest_loss_last = 0.0; -//for (int q=0; q < repeat; q++) { // repeat + for (int ki=0; ki < repeat; ki++) { - weight_t kbest_loss = 0.0; // test-k-best + score_t kbest_loss = 0.0; // test-k-best SparseVector lambdas_copy; // for l1 regularization SparseVector sum_up; // for pclr if (l1naive||l1clip||l1cumul) lambdas_copy = lambdas; for (vector >::iterator it = pairs.begin(); it != pairs.end(); it++) { - - /*if (repeat > 1) { - double x = max(0.0, -1.0 * (lambdas.dot(it->first.f) - lambdas.dot(it->second.f))); - kbest_loss += x; - }*/ - score_t model_diff = it->first.model - it->second.model; + if (repeat > 1) { + model_diff = lambdas.dot(it->first.f) - lambdas.dot(it->second.f); + kbest_loss += max(0.0, -1.0 * model_diff); + } bool rank_error = false; score_t margin; if (faster_perceptron) { // we only have considering misranked pairs @@ -442,7 +438,7 @@ main(int argc, char** argv) margin = fabs(model_diff); if (!rank_error && margin < loss_margin) margin_violations++; } - if (rank_error) rank_errors++; + if (rank_error && ki==1) rank_errors++; if (scale_bleu_diff) eta = it->first.score - it->second.score; if (rank_error || margin < loss_margin) { SparseVector diff_vec = it->first.f - it->second.f; @@ -524,12 +520,27 @@ main(int argc, char** argv) } } - //if (q==0) { kbest_loss_first = kbest_loss; } - //if (q==repeat-1) { kbest_loss_last = kbest_loss; } -//}//repeat -//if((kbest_loss_first - kbest_loss_last) > 0) did_improve++; + if (ki==0) kbest_loss_first = kbest_loss; + if (ki==repeat-1) { // done + kbest_loss_last = kbest_loss; + score_t best_score = -1.; + score_t best_model = -std::numeric_limits::max(); + unsigned best_idx; + for (unsigned i=0; i < samples->size(); i++) { + score_t s = lambdas.dot((*samples)[i].f); + if (s > best_model) { + best_idx = i; + best_model = s; + } + } + score_sum += (*samples)[best_idx].score; + model_sum += best_model; + } + } // repeat - } + if ((kbest_loss_first - kbest_loss_last) >= 0) kbest_loss_improve++; + + } // noup if (rescale) lambdas /= lambdas.l2norm(); @@ -539,7 +550,6 @@ main(int argc, char** argv) if (t == 0) in_sz = ii; // remember size of input (# lines) - //if (repeat > 1) cout << "did improve? 
" << did_improve << " out of " << in_sz << endl; if (batch) { lambdas.plus_eq_v_times_s(batch_updates, eta); @@ -577,14 +587,16 @@ main(int argc, char** argv) cerr << _np << " 1best avg model score: " << model_avg; cerr << _p << " (" << model_diff << ")" << endl; cerr << " avg # pairs: "; - cerr << _np << npairs/(float)in_sz; + cerr << _np << npairs/(float)in_sz << endl; + cerr << " avg # margin viol: "; + cerr << margin_violations/(float)in_sz << endl; + cerr << " avg # rank err: "; + cerr << rank_errors/(float)in_sz; if (faster_perceptron) cerr << " (meaningless)"; cerr << endl; - cerr << " avg # rank err: "; - cerr << rank_errors/(float)in_sz << endl; if (batch) cerr << " batch loss: " << batch_loss << endl; - cerr << " avg # margin viol: "; - cerr << margin_violations/(float)in_sz << endl; + if (repeat > 1) cerr << " k-best loss imp: " << ((float)kbest_loss_improve/in_sz)*100 << "%" << endl; + cerr << " non0 feature count: " << nonz << endl; cerr << " avg list sz: " << list_sz/(float)in_sz << endl; cerr << " avg f count: " << f_count/(float)list_sz << endl; diff --git a/training/dtrain/examples/standard/dtrain.ini b/training/dtrain/examples/standard/dtrain.ini index 4d096dfb..ef022469 100644 --- a/training/dtrain/examples/standard/dtrain.ini +++ b/training/dtrain/examples/standard/dtrain.ini @@ -11,11 +11,11 @@ print_weights=Glue WordPenalty LanguageModel LanguageModel_OOV PhraseModel_0 Phr stop_after=10 # stop epoch after 10 inputs # interesting stuff -epochs=100 # run over input 3 times +epochs=3 # run over input 3 times k=100 # use 100best lists N=4 # optimize (approx) BLEU4 scorer=fixed_stupid_bleu # use 'stupid' BLEU+1 -learning_rate=0.0001 # learning rate, don't care if gamma=0 (perceptron) and loss_margin=0 (not margin perceptron) +learning_rate=0.0001 # learning rate, don't care if gamma=0 (perceptron) and loss_margin=0 (not margin perceptron) gamma=0 # use SVM reg sample_from=kbest # use kbest lists (as opposed to forest) filter=uniq # only unique entries in kbest (surface form) @@ -23,3 +23,5 @@ pair_sampling=XYX # hi_lo=0.1 # 10 vs 80 vs 10 and 80 vs 10 here pair_threshold=0 # minimum distance in BLEU (here: > 0) loss_margin=0 # update if correctly ranked, but within this margin +repeat=1 # repeat training on a kbest list 1 times +#batch=true # batch tuning, update after accumulating over all sentences and all kbest lists -- cgit v1.2.3 From 864a25ebf0c6b9ff0e127f310930834326afbfa0 Mon Sep 17 00:00:00 2001 From: Patrick Simianer Date: Tue, 12 Nov 2013 20:39:59 +0100 Subject: fix --- training/dtrain/dtrain.cc | 36 ++++--- training/dtrain/examples/standard/dtrain.ini | 2 +- training/dtrain/examples/standard/expected-output | 112 +++++++++++----------- 3 files changed, 80 insertions(+), 70 deletions(-) diff --git a/training/dtrain/dtrain.cc b/training/dtrain/dtrain.cc index 441e2cd7..0a27a068 100644 --- a/training/dtrain/dtrain.cc +++ b/training/dtrain/dtrain.cc @@ -414,6 +414,12 @@ main(int argc, char** argv) score_t kbest_loss_first, kbest_loss_last = 0.0; + for (vector >::iterator it = pairs.begin(); + it != pairs.end(); it++) { + score_t model_diff = it->first.model - it->second.model; + kbest_loss_first += max(0.0, -1.0 * model_diff); + } + for (int ki=0; ki < repeat; ki++) { score_t kbest_loss = 0.0; // test-k-best @@ -520,21 +526,22 @@ main(int argc, char** argv) } } - if (ki==0) kbest_loss_first = kbest_loss; if (ki==repeat-1) { // done kbest_loss_last = kbest_loss; - score_t best_score = -1.; - score_t best_model = -std::numeric_limits::max(); - unsigned best_idx; - 
        for (unsigned i=0; i < samples->size(); i++) {
-          score_t s = lambdas.dot((*samples)[i].f);
-          if (s > best_model) {
-            best_idx = i;
-            best_model = s;
+        if (repeat > 1) {
+          score_t best_score = -1.;
+          score_t best_model = -std::numeric_limits<score_t>::max();
+          unsigned best_idx;
+          for (unsigned i=0; i < samples->size(); i++) {
+            score_t s = lambdas.dot((*samples)[i].f);
+            if (s > best_model) {
+              best_idx = i;
+              best_model = s;
+            }
           }
+          score_sum += (*samples)[best_idx].score;
+          model_sum += best_model;
         }
-        score_sum += (*samples)[best_idx].score;
-        model_sum += best_model;
       }
       } // repeat

@@ -588,15 +595,14 @@ main(int argc, char** argv)
       cerr << _p << " (" << model_diff << ")" << endl;
       cerr << "           avg # pairs: ";
       cerr << _np << npairs/(float)in_sz << endl;
-      cerr << "        avg # margin viol: ";
-      cerr << margin_violations/(float)in_sz << endl;
       cerr << "       avg # rank err: ";
       cerr << rank_errors/(float)in_sz;
       if (faster_perceptron) cerr << " (meaningless)";
       cerr << endl;
+      cerr << "        avg # margin viol: ";
+      cerr << margin_violations/(float)in_sz << endl;
       if (batch) cerr << "   batch loss: " << batch_loss << endl;
-      if (repeat > 1) cerr << "     k-best loss imp: " << ((float)kbest_loss_improve/in_sz)*100 << "%" << endl;
-
+      cerr << "     k-best loss imp: " << ((float)kbest_loss_improve/in_sz)*100 << "%" << endl;
       cerr << "   non0 feature count: " << nonz << endl;
       cerr << "        avg list sz: " << list_sz/(float)in_sz << endl;
       cerr << "        avg f count: " << f_count/(float)list_sz << endl;
diff --git a/training/dtrain/examples/standard/dtrain.ini b/training/dtrain/examples/standard/dtrain.ini
index ef022469..fc83f08e 100644
--- a/training/dtrain/examples/standard/dtrain.ini
+++ b/training/dtrain/examples/standard/dtrain.ini
@@ -15,7 +15,7 @@ epochs=3 # run over input 3 times
 k=100 # use 100best lists
 N=4 # optimize (approx) BLEU4
 scorer=fixed_stupid_bleu # use 'stupid' BLEU+1
-learning_rate=0.0001 # learning rate, don't care if gamma=0 (perceptron) and loss_margin=0 (not margin perceptron)
+learning_rate=0.1 # learning rate, don't care if gamma=0 (perceptron) and loss_margin=0 (not margin perceptron)
 gamma=0 # use SVM reg
 sample_from=kbest # use kbest lists (as opposed to forest)
 filter=uniq # only unique entries in kbest (surface form)
diff --git a/training/dtrain/examples/standard/expected-output b/training/dtrain/examples/standard/expected-output
index a35bbe6f..75f47337 100644
--- a/training/dtrain/examples/standard/expected-output
+++ b/training/dtrain/examples/standard/expected-output
@@ -4,17 +4,18 @@ Reading ./nc-wmt11.en.srilm.gz
 ----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100
 ****************************************************************************************************
   Example feature: Shape_S00000_T00000
-Seeding random number sequence to 4049211323
+Seeding random number sequence to 3751911392

 dtrain
 Parameters:
                        k 100
                        N 4
                        T 3
+                   batch 0
                   scorer 'fixed_stupid_bleu'
              sample from 'kbest'
                   filter 'uniq'
-           learning rate 1
+           learning rate 0.1
                    gamma 0
              loss margin 0
        faster perceptron 1
@@ -25,9 +26,9 @@ Parameters:
                   l1 reg 0 'none'
                     pclr no
                max pairs 4294967295
+                  repeat 1
                 cdec cfg './cdec.ini'
-                   input './nc-wmt11.de.gz'
-                    refs './nc-wmt11.en.gz'
+                   input './nc-wmt11.gz'
                   output '-'
              stop_after 10
 (a dot represents 10 inputs)
Iteration #1 of 3.
 . 10
Stopping after 10 input sentences.
WEIGHTS - Glue = -1100 - WordPenalty = -82.082 - LanguageModel = -3199.1 - LanguageModel_OOV = -192 - PhraseModel_0 = +3128.2 - PhraseModel_1 = -1610.2 - PhraseModel_2 = -4336.5 - PhraseModel_3 = +2910.3 - PhraseModel_4 = +2523.2 - PhraseModel_5 = +506 - PhraseModel_6 = +1467 - PassThrough = -387 + Glue = -110 + WordPenalty = -8.2082 + LanguageModel = -319.91 + LanguageModel_OOV = -19.2 + PhraseModel_0 = +312.82 + PhraseModel_1 = -161.02 + PhraseModel_2 = -433.65 + PhraseModel_3 = +291.03 + PhraseModel_4 = +252.32 + PhraseModel_5 = +50.6 + PhraseModel_6 = +146.7 + PassThrough = -38.7 --- 1best avg score: 0.16966 (+0.16966) - 1best avg model score: 2.9874e+05 (+2.9874e+05) - avg # pairs: 906.3 (meaningless) - avg # rank err: 906.3 + 1best avg model score: 29874 (+29874) + avg # pairs: 906.3 + avg # rank err: 0 (meaningless) avg # margin viol: 0 - non0 feature count: 825 + k-best loss imp: 100% + non0 feature count: 832 avg list sz: 91.3 avg f count: 139.77 (time 0.35 min, 2.1 s/S) @@ -61,25 +63,26 @@ WEIGHTS Iteration #2 of 3. . 10 WEIGHTS - Glue = -1221 - WordPenalty = +836.89 - LanguageModel = +2332.3 - LanguageModel_OOV = -1451 - PhraseModel_0 = +1507.2 - PhraseModel_1 = -2728.4 - PhraseModel_2 = -4183.6 - PhraseModel_3 = +1816.3 - PhraseModel_4 = -2894.7 - PhraseModel_5 = +1403 - PhraseModel_6 = +35 - PassThrough = -1097 + Glue = -122.1 + WordPenalty = +83.689 + LanguageModel = +233.23 + LanguageModel_OOV = -145.1 + PhraseModel_0 = +150.72 + PhraseModel_1 = -272.84 + PhraseModel_2 = -418.36 + PhraseModel_3 = +181.63 + PhraseModel_4 = -289.47 + PhraseModel_5 = +140.3 + PhraseModel_6 = +3.5 + PassThrough = -109.7 --- 1best avg score: 0.17399 (+0.004325) - 1best avg model score: 49369 (-2.4937e+05) - avg # pairs: 662.4 (meaningless) - avg # rank err: 662.4 + 1best avg model score: 4936.9 (-24937) + avg # pairs: 662.4 + avg # rank err: 0 (meaningless) avg # margin viol: 0 - non0 feature count: 1235 + k-best loss imp: 100% + non0 feature count: 1240 avg list sz: 91.3 avg f count: 125.11 (time 0.27 min, 1.6 s/S) @@ -87,32 +90,33 @@ WEIGHTS Iteration #3 of 3. . 10 WEIGHTS - Glue = -1574 - WordPenalty = -17.372 - LanguageModel = +6861.8 - LanguageModel_OOV = -3997 - PhraseModel_0 = -398.76 - PhraseModel_1 = -3419.6 - PhraseModel_2 = -3186.7 - PhraseModel_3 = +1050.8 - PhraseModel_4 = -2902.7 - PhraseModel_5 = -486 - PhraseModel_6 = -436 - PassThrough = -2985 + Glue = -157.4 + WordPenalty = -1.7372 + LanguageModel = +686.18 + LanguageModel_OOV = -399.7 + PhraseModel_0 = -39.876 + PhraseModel_1 = -341.96 + PhraseModel_2 = -318.67 + PhraseModel_3 = +105.08 + PhraseModel_4 = -290.27 + PhraseModel_5 = -48.6 + PhraseModel_6 = -43.6 + PassThrough = -298.5 --- 1best avg score: 0.30742 (+0.13343) - 1best avg model score: -1.5393e+05 (-2.0329e+05) - avg # pairs: 623.8 (meaningless) - avg # rank err: 623.8 + 1best avg model score: -15393 (-20329) + avg # pairs: 623.8 + avg # rank err: 0 (meaningless) avg # margin viol: 0 - non0 feature count: 1770 + k-best loss imp: 100% + non0 feature count: 1776 avg list sz: 91.3 avg f count: 118.58 -(time 0.25 min, 1.5 s/S) +(time 0.28 min, 1.7 s/S) Writing weights file to '-' ... done --- Best iteration: 3 [SCORE 'fixed_stupid_bleu'=0.30742]. -This took 0.86667 min. +This took 0.9 min. 
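Aside, for readers skimming these two commits: per input sentence, the k-best loss of the current weights is now measured once before any updates, the pair list is then traversed `repeat` times with perceptron updates, and the loss after the last pass is compared against the initial one; that comparison is what the "k-best loss imp" line in the output reports. What follows is a minimal, self-contained sketch of that control flow, not the patched dtrain.cc itself: the typedefs, variable names, and the hinge-style loss mirror the diffs above, while the toy pair data, the `Sparse` map stand-in for `SparseVector<weight_t>`, and the `main` driver are illustrative assumptions only.

```cpp
// Condensed sketch of the repeat / k-best-loss bookkeeping (toy data).
#include <algorithm>
#include <iostream>
#include <map>
#include <utility>
#include <vector>

typedef double score_t;   // as in dtrain
typedef double weight_t;
typedef std::map<int, weight_t> Sparse;  // stand-in for SparseVector<weight_t>

score_t Dot(const Sparse& w, const Sparse& f) {
  score_t s = 0.0;
  for (Sparse::const_iterator it = f.begin(); it != f.end(); ++it) {
    Sparse::const_iterator jt = w.find(it->first);
    if (jt != w.end()) s += jt->second * it->second;
  }
  return s;
}

int main() {
  // one sentence's pair list: (better, worse) feature vectors
  std::vector<std::pair<Sparse, Sparse> > pairs(1);
  pairs[0].first[0] = 1.0;   // feature firing on the better hypothesis
  pairs[0].second[1] = 1.0;  // feature firing on the worse hypothesis

  Sparse lambdas;
  const weight_t eta = 0.1;
  const int repeat = 3;

  // loss on this k-best list *before* any updates (as the fix commit does)
  score_t kbest_loss_first = 0.0, kbest_loss_last = 0.0;
  for (size_t p = 0; p < pairs.size(); ++p)
    kbest_loss_first += std::max(0.0,
        -1.0 * (Dot(lambdas, pairs[p].first) - Dot(lambdas, pairs[p].second)));

  for (int ki = 0; ki < repeat; ++ki) {  // re-traverse the same k-best list
    score_t kbest_loss = 0.0;
    for (size_t p = 0; p < pairs.size(); ++p) {
      score_t model_diff =
          Dot(lambdas, pairs[p].first) - Dot(lambdas, pairs[p].second);
      kbest_loss += std::max(0.0, -1.0 * model_diff);
      if (model_diff <= 0) {  // misranked: perceptron step on the difference
        for (Sparse::const_iterator it = pairs[p].first.begin();
             it != pairs[p].first.end(); ++it)
          lambdas[it->first] += eta * it->second;
        for (Sparse::const_iterator it = pairs[p].second.begin();
             it != pairs[p].second.end(); ++it)
          lambdas[it->first] -= eta * it->second;
      }
    }
    if (ki == repeat - 1) kbest_loss_last = kbest_loss;
  }

  // the per-epoch statistic reported as "k-best loss imp"
  bool improved = (kbest_loss_first - kbest_loss_last) >= 0;
  std::cout << "k-best loss: " << kbest_loss_first << " -> " << kbest_loss_last
            << (improved ? " (no worse)" : "") << std::endl;
  return 0;
}
```

With `repeat=1`, the shipped default, the inner loop runs exactly once and the statistic is trivially 100%, which matches the expected-output below.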
-- cgit v1.2.3


From b8bf706976720527b455eb665fe94f907e372b65 Mon Sep 17 00:00:00 2001
From: Patrick Simianer 
Date: Wed, 13 Nov 2013 18:00:10 +0100
Subject: unit tests for extractor loo sampling

---
 extractor/grammar_extractor_test.cc |   7 ++-
 extractor/mocks/mock_rule_factory.h |   2 +-
 extractor/rule_factory_test.cc      |   8 ++-
 extractor/sampler.cc                |  18 +++---
 extractor/sampler_test.cc           |  24 +++++---
 extractor/sampler_test_blacklist.cc | 102 ++++++++++++++++++++++++++++++++++++
 6 files changed, 138 insertions(+), 23 deletions(-)
 create mode 100644 extractor/sampler_test_blacklist.cc

diff --git a/extractor/grammar_extractor_test.cc b/extractor/grammar_extractor_test.cc
index 823bb8b4..f32a9599 100644
--- a/extractor/grammar_extractor_test.cc
+++ b/extractor/grammar_extractor_test.cc
@@ -39,12 +39,15 @@ TEST(GrammarExtractorTest, TestAnnotatingWords) {
   vector<Rule> rules;
   vector<string> feature_names;
   Grammar grammar(rules, feature_names);
-  EXPECT_CALL(*factory, GetGrammar(word_ids))
+  unordered_set<int> blacklisted_sentence_ids;
+  shared_ptr<DataArray> source_data_array;
+  EXPECT_CALL(*factory, GetGrammar(word_ids, blacklisted_sentence_ids, source_data_array))
       .WillOnce(Return(grammar));

   GrammarExtractor extractor(vocabulary, factory);
   string sentence = "Anna has many many apples .";
-  extractor.GetGrammar(sentence);
+
+  extractor.GetGrammar(sentence, blacklisted_sentence_ids, source_data_array);
 }

 } // namespace
diff --git a/extractor/mocks/mock_rule_factory.h b/extractor/mocks/mock_rule_factory.h
index 7389b396..86a084b5 100644
--- a/extractor/mocks/mock_rule_factory.h
+++ b/extractor/mocks/mock_rule_factory.h
@@ -7,7 +7,7 @@ namespace extractor {
 class MockHieroCachingRuleFactory : public HieroCachingRuleFactory {
  public:
-  MOCK_METHOD1(GetGrammar, Grammar(const vector<int>& word_ids));
+  MOCK_METHOD3(GetGrammar, Grammar(const vector<int>& word_ids, const unordered_set<int> blacklisted_sentence_ids, const shared_ptr<DataArray> source_data_array));
 };

 } // namespace extractor
diff --git a/extractor/rule_factory_test.cc b/extractor/rule_factory_test.cc
index 08af3dcd..f26cc567 100644
--- a/extractor/rule_factory_test.cc
+++ b/extractor/rule_factory_test.cc
@@ -76,7 +76,9 @@ TEST_F(RuleFactoryTest, TestGetGrammarDifferentWords) {
       .WillRepeatedly(Return(PhraseLocation(0, 1)));

   vector<int> word_ids = {2, 3, 4};
-  Grammar grammar = factory->GetGrammar(word_ids);
+  unordered_set<int> blacklisted_sentence_ids;
+  shared_ptr<DataArray> source_data_array;
+  Grammar grammar = factory->GetGrammar(word_ids, blacklisted_sentence_ids, source_data_array);
   EXPECT_EQ(feature_names, grammar.GetFeatureNames());
   EXPECT_EQ(7, grammar.GetRules().size());
 }
@@ -94,7 +96,9 @@ TEST_F(RuleFactoryTest, TestGetGrammarRepeatingWords) {
       .WillRepeatedly(Return(PhraseLocation(0, 1)));

   vector<int> word_ids = {2, 3, 4, 2, 3};
-  Grammar grammar = factory->GetGrammar(word_ids);
+  unordered_set<int> blacklisted_sentence_ids;
+  shared_ptr<DataArray> source_data_array;
+  Grammar grammar = factory->GetGrammar(word_ids, blacklisted_sentence_ids, source_data_array);
   EXPECT_EQ(feature_names, grammar.GetFeatureNames());
   EXPECT_EQ(28, grammar.GetRules().size());
 }
diff --git a/extractor/sampler.cc b/extractor/sampler.cc
index cb470962..d332dd90 100644
--- a/extractor/sampler.cc
+++ b/extractor/sampler.cc
@@ -19,25 +19,25 @@ PhraseLocation Sampler::Sample(const PhraseLocation& location, unordered_set<int> blacklisted_sentence_ids,
-    int x = suffix_array->GetSuffix(i);
+    int x = suffix_array->GetSuffix(Round(i));
     int id = source_data_array->GetSentenceId(x);
     if (find(blacklisted_sentence_ids.begin(), blacklisted_sentence_ids.end(), id) != blacklisted_sentence_ids.end()) {
       found = false;
-
      int backoff_step = 1;
+      double backoff_step = 1;
       while (true) {
         if ((double)backoff_step >= step) break;
-        double j = i - backoff_step;
-        x = suffix_array->GetSuffix(j);
+        double j = i - backoff_step;
+        x = suffix_array->GetSuffix(Round(j));
         id = source_data_array->GetSentenceId(x);
-        if (j > last && find(blacklisted_sentence_ids.begin(), blacklisted_sentence_ids.end(), id) == blacklisted_sentence_ids.end()) {
+        if (x >= 0 && j > last && find(blacklisted_sentence_ids.begin(), blacklisted_sentence_ids.end(), id) == blacklisted_sentence_ids.end()) {
           found = true;
           last = i;
           break;
         }
-        double k = i + backoff_step;
-        x = suffix_array->GetSuffix(k);
+        double k = i + backoff_step;
+        x = suffix_array->GetSuffix(Round(k));
         id = source_data_array->GetSentenceId(x);
         if (k < min(i+step, (double)high) && find(blacklisted_sentence_ids.begin(), blacklisted_sentence_ids.end(), id) == blacklisted_sentence_ids.end()) {
           found = true;
           last = k;
           break;
         }
diff --git a/extractor/sampler_test.cc b/extractor/sampler_test.cc
index e9abebfa..965567ba 100644
--- a/extractor/sampler_test.cc
+++ b/extractor/sampler_test.cc
@@ -3,6 +3,7 @@
 #include <memory>

 #include "mocks/mock_suffix_array.h"
+#include "mocks/mock_data_array.h"
 #include "phrase_location.h"
 #include "sampler.h"

@@ -15,6 +16,8 @@ namespace {
 class SamplerTest : public Test {
  protected:
   virtual void SetUp() {
+    source_data_array = make_shared<MockDataArray>();
+    EXPECT_CALL(*source_data_array, GetSentenceId(_)).WillRepeatedly(Return(9999));
     suffix_array = make_shared<MockSuffixArray>();
     for (int i = 0; i < 10; ++i) {
       EXPECT_CALL(*suffix_array, GetSuffix(i)).WillRepeatedly(Return(i));
@@ -23,51 +26,54 @@ class SamplerTest : public Test {

   shared_ptr<MockSuffixArray> suffix_array;
   shared_ptr<Sampler> sampler;
+  shared_ptr<MockDataArray> source_data_array;
 };

 TEST_F(SamplerTest, TestSuffixArrayRange) {
   PhraseLocation location(0, 10);
+  unordered_set<int> blacklist;

   sampler = make_shared<Sampler>(suffix_array, 1);
   vector<int> expected_locations = {0};
-  EXPECT_EQ(PhraseLocation(expected_locations, 1), sampler->Sample(location));
+  EXPECT_EQ(PhraseLocation(expected_locations, 1), sampler->Sample(location, blacklist, source_data_array));

   sampler = make_shared<Sampler>(suffix_array, 2);
   expected_locations = {0, 5};
-  EXPECT_EQ(PhraseLocation(expected_locations, 1), sampler->Sample(location));
+  EXPECT_EQ(PhraseLocation(expected_locations, 1), sampler->Sample(location, blacklist, source_data_array));

   sampler = make_shared<Sampler>(suffix_array, 3);
   expected_locations = {0, 3, 7};
-  EXPECT_EQ(PhraseLocation(expected_locations, 1), sampler->Sample(location));
+  EXPECT_EQ(PhraseLocation(expected_locations, 1), sampler->Sample(location, blacklist, source_data_array));

   sampler = make_shared<Sampler>(suffix_array, 4);
   expected_locations = {0, 3, 5, 8};
-  EXPECT_EQ(PhraseLocation(expected_locations, 1), sampler->Sample(location));
+  EXPECT_EQ(PhraseLocation(expected_locations, 1), sampler->Sample(location, blacklist, source_data_array));

   sampler = make_shared<Sampler>(suffix_array, 100);
   expected_locations = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
-  EXPECT_EQ(PhraseLocation(expected_locations, 1), sampler->Sample(location));
+  EXPECT_EQ(PhraseLocation(expected_locations, 1), sampler->Sample(location, blacklist, source_data_array));
 }

 TEST_F(SamplerTest, TestSubstringsSample) {
   vector<int> locations = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
+  unordered_set<int> blacklist;
   PhraseLocation location(locations, 2);

   sampler = make_shared<Sampler>(suffix_array, 1);
   vector<int> expected_locations = {0, 1};
-  EXPECT_EQ(PhraseLocation(expected_locations, 2), sampler->Sample(location));
+  EXPECT_EQ(PhraseLocation(expected_locations, 2), sampler->Sample(location, 
blacklist, source_data_array));

   sampler = make_shared<Sampler>(suffix_array, 2);
   expected_locations = {0, 1, 6, 7};
-  EXPECT_EQ(PhraseLocation(expected_locations, 2), sampler->Sample(location));
+  EXPECT_EQ(PhraseLocation(expected_locations, 2), sampler->Sample(location, blacklist, source_data_array));

   sampler = make_shared<Sampler>(suffix_array, 3);
   expected_locations = {0, 1, 4, 5, 6, 7};
-  EXPECT_EQ(PhraseLocation(expected_locations, 2), sampler->Sample(location));
+  EXPECT_EQ(PhraseLocation(expected_locations, 2), sampler->Sample(location, blacklist, source_data_array));

   sampler = make_shared<Sampler>(suffix_array, 7);
   expected_locations = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
-  EXPECT_EQ(PhraseLocation(expected_locations, 2), sampler->Sample(location));
+  EXPECT_EQ(PhraseLocation(expected_locations, 2), sampler->Sample(location, blacklist, source_data_array));
 }

 } // namespace
diff --git a/extractor/sampler_test_blacklist.cc b/extractor/sampler_test_blacklist.cc
new file mode 100644
index 00000000..3305b990
--- /dev/null
+++ b/extractor/sampler_test_blacklist.cc
@@ -0,0 +1,102 @@
+#include <gtest/gtest.h>
+
+#include <memory>
+
+#include "mocks/mock_suffix_array.h"
+#include "mocks/mock_data_array.h"
+#include "phrase_location.h"
+#include "sampler.h"
+
+using namespace std;
+using namespace ::testing;
+
+namespace extractor {
+namespace {
+
+class SamplerTestBlacklist : public Test {
+ protected:
+  virtual void SetUp() {
+    source_data_array = make_shared<MockDataArray>();
+    for (int i = 0; i < 10; ++i) {
+      EXPECT_CALL(*source_data_array, GetSentenceId(i)).WillRepeatedly(Return(i));
+    }
+    for (int i = -10; i < 0; ++i) {
+      EXPECT_CALL(*source_data_array, GetSentenceId(i)).WillRepeatedly(Return(0));
+    }
+    suffix_array = make_shared<MockSuffixArray>();
+    for (int i = -10; i < 10; ++i) {
+      EXPECT_CALL(*suffix_array, GetSuffix(i)).WillRepeatedly(Return(i));
+    }
+  }
+
+  shared_ptr<MockSuffixArray> suffix_array;
+  shared_ptr<Sampler> sampler;
+  shared_ptr<MockDataArray> source_data_array;
+};
+
+TEST_F(SamplerTestBlacklist, TestSuffixArrayRange) {
+  PhraseLocation location(0, 10);
+  unordered_set<int> blacklist;
+  vector<int> expected_locations;
+
+  blacklist.insert(0);
+  sampler = make_shared<Sampler>(suffix_array, 1);
+  expected_locations = {1};
+  EXPECT_EQ(PhraseLocation(expected_locations, 1), sampler->Sample(location, blacklist, source_data_array));
+  blacklist.clear();
+
+  for (int i = 0; i < 9; i++) {
+    blacklist.insert(i);
+  }
+  sampler = make_shared<Sampler>(suffix_array, 1);
+  expected_locations = {9};
+  EXPECT_EQ(PhraseLocation(expected_locations, 1), sampler->Sample(location, blacklist, source_data_array));
+  blacklist.clear();
+
+  blacklist.insert(0);
+  blacklist.insert(5);
+  sampler = make_shared<Sampler>(suffix_array, 2);
+  expected_locations = {1, 4};
+  EXPECT_EQ(PhraseLocation(expected_locations, 1), sampler->Sample(location, blacklist, source_data_array));
+  blacklist.clear();
+
+  blacklist.insert(0);
+  blacklist.insert(1);
+  blacklist.insert(2);
+  blacklist.insert(3);
+  sampler = make_shared<Sampler>(suffix_array, 2);
+  expected_locations = {4, 5};
+  EXPECT_EQ(PhraseLocation(expected_locations, 1), sampler->Sample(location, blacklist, source_data_array));
+  blacklist.clear();
+
+  blacklist.insert(0);
+  blacklist.insert(3);
+  blacklist.insert(7);
+  sampler = make_shared<Sampler>(suffix_array, 3);
+  expected_locations = {1, 2, 6};
+  EXPECT_EQ(PhraseLocation(expected_locations, 1), sampler->Sample(location, blacklist, source_data_array));
+  blacklist.clear();
+
+  blacklist.insert(0);
+  blacklist.insert(3);
+  blacklist.insert(5);
+  blacklist.insert(8);
+  sampler = make_shared<Sampler>(suffix_array, 4);
+  expected_locations = {1, 2, 4, 7};
+  
EXPECT_EQ(PhraseLocation(expected_locations, 1), sampler->Sample(location, blacklist, source_data_array));
+  blacklist.clear();
+
+  blacklist.insert(0);
+  sampler = make_shared<Sampler>(suffix_array, 100);
+  expected_locations = {1, 2, 3, 4, 5, 6, 7, 8, 9};
+  EXPECT_EQ(PhraseLocation(expected_locations, 1), sampler->Sample(location, blacklist, source_data_array));
+  blacklist.clear();
+
+  blacklist.insert(9);
+  sampler = make_shared<Sampler>(suffix_array, 100);
+  expected_locations = {0, 1, 2, 3, 4, 5, 6, 7, 8};
+  EXPECT_EQ(PhraseLocation(expected_locations, 1), sampler->Sample(location, blacklist, source_data_array));
+}
+
+} // namespace
+} // namespace extractor
-- cgit v1.2.3


From a5909686af9fef41ca05abde519a06c5b5382225 Mon Sep 17 00:00:00 2001
From: Patrick Simianer 
Date: Wed, 13 Nov 2013 18:17:54 +0100
Subject: remove crap

---
 extractor/sample_alignment.txt | 3 ---
 extractor/sample_bitext.txt    | 3 ---
 extractor/sample_source.txt    | 3 ---
 3 files changed, 9 deletions(-)

diff --git a/extractor/sample_alignment.txt b/extractor/sample_alignment.txt
index f0292b01..80b446a4 100644
--- a/extractor/sample_alignment.txt
+++ b/extractor/sample_alignment.txt
@@ -1,5 +1,2 @@
 0-0 1-1 2-2
 1-0 2-1
-0-0
-0-0 1-1
-0-0 1-1
diff --git a/extractor/sample_bitext.txt b/extractor/sample_bitext.txt
index 2b7c8e40..93d6b39d 100644
--- a/extractor/sample_bitext.txt
+++ b/extractor/sample_bitext.txt
@@ -1,5 +1,2 @@
-asdf ||| dontseeme
-qqq asdf ||| zzz fdsa
-asdf qqq ||| fdsa zzz
 ana are mere . ||| anna has apples .
 ana bea mult lapte . ||| anna drinks a lot of milk .
diff --git a/extractor/sample_source.txt b/extractor/sample_source.txt
index 9b46dd6a..971baf6d 100644
--- a/extractor/sample_source.txt
+++ b/extractor/sample_source.txt
@@ -1,5 +1,2 @@
-asdf
-qqq asdf
-asdf qqq
 ana are mere .
 ana bea mult lapte .
-- cgit v1.2.3


From 4c7d24c9357f500839f04c7c8a8cfa0472801e18 Mon Sep 17 00:00:00 2001
From: Patrick Simianer 
Date: Wed, 13 Nov 2013 18:28:42 +0100
Subject: README

---
 training/dtrain/README.md | 30 ++++++++++++------------------
 1 file changed, 12 insertions(+), 18 deletions(-)

diff --git a/training/dtrain/README.md b/training/dtrain/README.md
index 2bae6b48..aa1ab3e7 100644
--- a/training/dtrain/README.md
+++ b/training/dtrain/README.md
@@ -1,10 +1,15 @@
 This is a simple (and parallelizable) tuning method for cdec
-which is able to train the weights of very many (sparse) features.
-It was used here:
-  "Joint Feature Selection in Distributed Stochastic
-   Learning for Large-Scale Discriminative Training in
-   SMT"
-(Simianer, Riezler, Dyer; ACL 2012)
+which is able to train the weights of very many (sparse) features
+on the training set.
+
+It was used in these papers:
+> "Joint Feature Selection in Distributed Stochastic
+>  Learning for Large-Scale Discriminative Training in
+>  SMT" (Simianer, Riezler, Dyer; ACL 2012)
+>
+> "Multi-Task Learning for Improved Discriminative
+>  Training in SMT" (Simianer, Riezler; WMT 2013)
+>
 
 
 Building
@@ -17,20 +22,9 @@ To build only parts needed for dtrain do
 ```
 cd training/dtrain/; make
 ```
 
-Ideas
------
- * get approx_bleu to work?
- * implement minibatches (Minibatch and Parallelization for Online Large Margin Structured Learning)
- * learning rate 1/T?
- * use an oracle? mira-like (model vs. BLEU), feature repr. of reference!?
- * implement lc_bleu properly
- * merge kbest lists of previous epochs (as MERT does)
- * ``walk entire regularization path''
- * rerank after each update?
-
 Running
 -------
-See directories under test/ .
+See directories under examples/ .
Legal ----- -- cgit v1.2.3
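For readers who want the leave-one-out ("loo") sampling scheme from the unit-test commit in one place: the sampler walks the suffix-array range with stride `step = (high - low) / max_samples`, and whenever the ideal position falls in a blacklisted (held-out) sentence it probes alternately one position to the left and right, widening by one each time, accepting the first position inside the current stride window whose sentence is not blacklisted. Below is a hedged, self-contained sketch of that backoff, not the extractor's API: `SentenceIds` and the `Sample` function here are illustrative stand-ins for `suffix_array->GetSuffix()` plus `source_data_array->GetSentenceId()`, and the `last` bookkeeping is simplified relative to sampler.cc. It does, however, reproduce the expectations of the blacklist tests above (build with -std=c++11).

```cpp
// Sketch of blacklist-aware sampling: pick up to max_samples positions from
// [low, high), skipping blacklisted sentences by probing around each ideal
// sample position without leaving its stride window.
#include <algorithm>
#include <cmath>
#include <iostream>
#include <unordered_set>
#include <vector>

typedef std::vector<int> SentenceIds;  // sentence id for each suffix position

std::vector<int> Sample(const SentenceIds& ids, int low, int high,
                        int max_samples,
                        const std::unordered_set<int>& blacklist) {
  std::vector<int> locations;
  double step = std::max(1.0, (double)(high - low) / max_samples);
  int last = -1;  // last accepted position, so left probes never go backwards
  for (double i = low; i < high && (int)locations.size() < max_samples;
       i += step) {
    int pos = (int)std::round(i);
    if (!blacklist.count(ids[pos])) {
      locations.push_back(pos);
      last = pos;
      continue;
    }
    // Backoff: probe left, then right, one step further each iteration,
    // but only within the current stride window.
    for (int b = 1; (double)b < step; ++b) {
      int l = pos - b;
      if (l >= low && l > last && !blacklist.count(ids[l])) {
        locations.push_back(l); last = l; break;
      }
      int r = pos + b;
      if ((double)r < std::min((double)high, i + step) &&
          !blacklist.count(ids[r])) {
        locations.push_back(r); last = r; break;
      }
    }
  }
  return locations;
}

int main() {
  SentenceIds ids = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};  // suffix i in sentence i
  std::unordered_set<int> blacklist = {0, 5};
  std::vector<int> s = Sample(ids, 0, 10, 2, blacklist);
  for (size_t i = 0; i < s.size(); ++i) std::cout << s[i] << " ";
  std::cout << std::endl;  // prints "1 4", as in the blacklist test
  return 0;
}
```

Note the window bound: probes never leave the current stride, so a stride whose sentences are all blacklisted simply contributes no sample, which is why blacklisting positions 0 through 8 with a stride of 10 still yields the single sample {9}.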