diff options
author | Patrick Simianer <p@simianer.de> | 2015-09-19 11:05:23 +0200 |
---|---|---|
committer | Patrick Simianer <p@simianer.de> | 2015-09-19 11:05:23 +0200 |
commit | 92a376c3479a18e2a6f674ecc6955a7eb29f217a (patch) | |
tree | 3ac35108b8ca53299db638bf7f7695c93f4806f5 /training/dtrain | |
parent | 86ea4ed498d96c1d988f2287afa580dcf558ddb0 (diff) |
dtrain: current version
Diffstat (limited to 'training/dtrain')
-rw-r--r-- | training/dtrain/README.md | 25 | ||||
-rw-r--r-- | training/dtrain/dtrain.cc | 840 | ||||
-rw-r--r-- | training/dtrain/dtrain.h | 162 | ||||
-rwxr-xr-x | training/dtrain/lplp.rb | 12 | ||||
-rwxr-xr-x | training/dtrain/parallelize.rb | 134 | ||||
-rw-r--r-- | training/dtrain/sample.h | 59 | ||||
-rw-r--r-- | training/dtrain/score.h | 411 | ||||
-rw-r--r-- | training/dtrain/update.h | 150 |
8 files changed, 869 insertions, 924 deletions
diff --git a/training/dtrain/README.md b/training/dtrain/README.md index aa1ab3e7..73a6a5a5 100644 --- a/training/dtrain/README.md +++ b/training/dtrain/README.md @@ -1,35 +1,28 @@ This is a simple (and parallelizable) tuning method for cdec -which is able to train the weights of very many (sparse) features -on the training set. +which enables training weights of very many (sparse) features +on the full training set. -It was used in these papers: +Please cite as: > "Joint Feature Selection in Distributed Stochastic > Learning for Large-Scale Discriminative Training in > SMT" (Simianer, Riezler, Dyer; ACL 2012) > -> "Multi-Task Learning for Improved Discriminative -> Training in SMT" (Simianer, Riezler; WMT 2013) -> - Building -------- -Builds when building cdec, see ../BUILDING . -To build only parts needed for dtrain do -``` - autoreconf -ifv - ./configure - cd training/dtrain/; make -``` +Builds when building cdec, see ../../BUILDING . Running ------- -See directories under examples/ . +Download runnable examples for all use cases from [1] and extract here. Legal ----- -Copyright (c) 2012-2013 by Patrick Simianer <p@simianer.de> +Copyright (c) 2012-2015 by Patrick Simianer <p@simianer.de> See the file LICENSE.txt in the root folder for the licensing terms that this software is released under. + +[1] http://simianer.de/dtrain-example.tar.gz + diff --git a/training/dtrain/dtrain.cc b/training/dtrain/dtrain.cc index ccb50af2..b39fff3e 100644 --- a/training/dtrain/dtrain.cc +++ b/training/dtrain/dtrain.cc @@ -1,698 +1,288 @@ #include "dtrain.h" +#include "sample.h" #include "score.h" -#include "kbestget.h" -#include "ksampler.h" -#include "pairsampling.h" +#include "update.h" using namespace dtrain; - -bool -dtrain_init(int argc, char** argv, po::variables_map* cfg) -{ - po::options_description ini("Configuration File Options"); - ini.add_options() - ("input", po::value<string>(), "input file (src)") - ("refs,r", po::value<string>(), "references") - ("bitext,b", po::value<string>(), "bitext: 'src ||| tgt'") - ("output", po::value<string>()->default_value("-"), "output weights file, '-' for STDOUT") - ("input_weights", po::value<string>(), "input weights file (e.g. from previous iteration)") - ("decoder_config", po::value<string>(), "configuration file for cdec") - ("print_weights", po::value<string>(), "weights to print on each iteration") - ("stop_after", po::value<unsigned>()->default_value(0), "stop after X input sentences") - ("keep", po::value<bool>()->zero_tokens(), "keep weights files for each iteration") - ("epochs", po::value<unsigned>()->default_value(10), "# of iterations T (per shard)") - ("k", po::value<unsigned>()->default_value(100), "how many translations to sample") - ("sample_from", po::value<string>()->default_value("kbest"), "where to sample translations from: 'kbest', 'forest'") - ("filter", po::value<string>()->default_value("uniq"), "filter kbest list: 'not', 'uniq'") - ("pair_sampling", po::value<string>()->default_value("XYX"), "how to sample pairs: 'all', 'XYX' or 'PRO'") - ("hi_lo", po::value<float>()->default_value(0.1), "hi and lo (X) for XYX (default 0.1), <= 0.5") - ("pair_threshold", po::value<score_t>()->default_value(0.), "bleu [0,1] threshold to filter pairs") - ("N", po::value<unsigned>()->default_value(4), "N for Ngrams (BLEU)") - ("scorer", po::value<string>()->default_value("stupid_bleu"), "scoring: bleu, stupid_, smooth_, approx_, lc_") - ("learning_rate", po::value<weight_t>()->default_value(1.0), "learning rate") - ("gamma", po::value<weight_t>()->default_value(0.), "gamma for SVM (0 for perceptron)") - ("select_weights", po::value<string>()->default_value("last"), "output best, last, avg weights ('VOID' to throw away)") - ("rescale", po::value<bool>()->zero_tokens(), "rescale weight vector after each input") - ("l1_reg", po::value<string>()->default_value("none"), "apply l1 regularization as in 'Tsuroka et al' (2010) UNTESTED") - ("l1_reg_strength", po::value<weight_t>(), "l1 regularization strength") - ("fselect", po::value<weight_t>()->default_value(-1), "select top x percent (or by threshold) of features after each epoch NOT IMPLEMENTED") // TODO - ("approx_bleu_d", po::value<score_t>()->default_value(0.9), "discount for approx. BLEU") - ("scale_bleu_diff", po::value<bool>()->zero_tokens(), "learning rate <- bleu diff of a misranked pair") - ("loss_margin", po::value<weight_t>()->default_value(0.), "update if no error in pref pair but model scores this near") - ("max_pairs", po::value<unsigned>()->default_value(std::numeric_limits<unsigned>::max()), "max. # of pairs per Sent.") - ("pclr", po::value<string>()->default_value("no"), "use a (simple|adagrad) per-coordinate learning rate") - ("batch", po::value<bool>()->zero_tokens(), "do batch optimization") - ("repeat", po::value<unsigned>()->default_value(1), "repeat optimization over kbest list this number of times") - ("check", po::value<bool>()->zero_tokens(), "produce list of loss differentials") - ("noup", po::value<bool>()->zero_tokens(), "do not update weights"); - po::options_description cl("Command Line Options"); - cl.add_options() - ("config,c", po::value<string>(), "dtrain config file") - ("quiet,q", po::value<bool>()->zero_tokens(), "be quiet") - ("verbose,v", po::value<bool>()->zero_tokens(), "be verbose"); - cl.add(ini); - po::store(parse_command_line(argc, argv, cl), *cfg); - if (cfg->count("config")) { - ifstream ini_f((*cfg)["config"].as<string>().c_str()); - po::store(po::parse_config_file(ini_f, ini), *cfg); - } - po::notify(*cfg); - if (!cfg->count("decoder_config")) { - cerr << cl << endl; - return false; - } - if ((*cfg)["sample_from"].as<string>() != "kbest" - && (*cfg)["sample_from"].as<string>() != "forest") { - cerr << "Wrong 'sample_from' param: '" << (*cfg)["sample_from"].as<string>() << "', use 'kbest' or 'forest'." << endl; - return false; - } - if ((*cfg)["sample_from"].as<string>() == "kbest" && (*cfg)["filter"].as<string>() != "uniq" && - (*cfg)["filter"].as<string>() != "not") { - cerr << "Wrong 'filter' param: '" << (*cfg)["filter"].as<string>() << "', use 'uniq' or 'not'." << endl; - return false; - } - if ((*cfg)["pair_sampling"].as<string>() != "all" && (*cfg)["pair_sampling"].as<string>() != "XYX" && - (*cfg)["pair_sampling"].as<string>() != "PRO") { - cerr << "Wrong 'pair_sampling' param: '" << (*cfg)["pair_sampling"].as<string>() << "'." << endl; - return false; - } - if (cfg->count("hi_lo") && (*cfg)["pair_sampling"].as<string>() != "XYX") { - cerr << "Warning: hi_lo only works with pair_sampling XYX." << endl; - } - if ((*cfg)["hi_lo"].as<float>() > 0.5 || (*cfg)["hi_lo"].as<float>() < 0.01) { - cerr << "hi_lo must lie in [0.01, 0.5]" << endl; - return false; - } - if ((cfg->count("input")>0 || cfg->count("refs")>0) && cfg->count("bitext")>0) { - cerr << "Provide 'input' and 'refs' or 'bitext', not both." << endl; - return false; - } - if ((*cfg)["pair_threshold"].as<score_t>() < 0) { - cerr << "The threshold must be >= 0!" << endl; - return false; - } - if ((*cfg)["select_weights"].as<string>() != "last" && (*cfg)["select_weights"].as<string>() != "best" && - (*cfg)["select_weights"].as<string>() != "avg" && (*cfg)["select_weights"].as<string>() != "VOID") { - cerr << "Wrong 'select_weights' param: '" << (*cfg)["select_weights"].as<string>() << "', use 'last' or 'best'." << endl; - return false; - } - return true; -} - int main(int argc, char** argv) { - // handle most parameters - po::variables_map cfg; - if (!dtrain_init(argc, argv, &cfg)) exit(1); // something is wrong - bool quiet = false; - if (cfg.count("quiet")) quiet = true; - bool verbose = false; - if (cfg.count("verbose")) verbose = true; - bool noup = false; - if (cfg.count("noup")) noup = true; - bool rescale = false; - if (cfg.count("rescale")) rescale = true; - bool keep = false; - if (cfg.count("keep")) keep = true; - - const unsigned k = cfg["k"].as<unsigned>(); - const unsigned N = cfg["N"].as<unsigned>(); - const unsigned T = cfg["epochs"].as<unsigned>(); - const unsigned stop_after = cfg["stop_after"].as<unsigned>(); - const string filter_type = cfg["filter"].as<string>(); - const string sample_from = cfg["sample_from"].as<string>(); - const string pair_sampling = cfg["pair_sampling"].as<string>(); - const score_t pair_threshold = cfg["pair_threshold"].as<score_t>(); - const string select_weights = cfg["select_weights"].as<string>(); - const float hi_lo = cfg["hi_lo"].as<float>(); - const score_t approx_bleu_d = cfg["approx_bleu_d"].as<score_t>(); - const unsigned max_pairs = cfg["max_pairs"].as<unsigned>(); - int repeat = cfg["repeat"].as<unsigned>(); - bool check = false; - if (cfg.count("check")) check = true; - weight_t loss_margin = cfg["loss_margin"].as<weight_t>(); - bool batch = false; - if (cfg.count("batch")) batch = true; - if (loss_margin > 9998.) loss_margin = std::numeric_limits<float>::max(); - bool scale_bleu_diff = false; - if (cfg.count("scale_bleu_diff")) scale_bleu_diff = true; - const string pclr = cfg["pclr"].as<string>(); - bool average = false; - if (select_weights == "avg") - average = true; + // get configuration + po::variables_map conf; + if (!dtrain_init(argc, argv, &conf)) + return 1; + const size_t k = conf["k"].as<size_t>(); + const string score_name = conf["score"].as<string>(); + const size_t N = conf["N"].as<size_t>(); + const size_t T = conf["iterations"].as<size_t>(); + const weight_t eta = conf["learning_rate"].as<weight_t>(); + const weight_t margin = conf["margin"].as<weight_t>(); + const bool average = conf["average"].as<bool>(); + const bool structured = conf["struct"].as<bool>(); + const weight_t l1_reg = conf["l1_reg"].as<weight_t>(); + const bool keep = conf["keep"].as<bool>(); + const bool noup = conf["disable_learning"].as<bool>(); + const string output_fn = conf["output"].as<string>(); + const string output_data_which = conf["output_data"].as<string>(); + const bool output_data = output_data_which!=""; vector<string> print_weights; - if (cfg.count("print_weights")) - boost::split(print_weights, cfg["print_weights"].as<string>(), boost::is_any_of(" ")); + boost::split(print_weights, conf["print_weights"].as<string>(), + boost::is_any_of(" ")); - // setup decoder + // setup decoder and scorer register_feature_functions(); SetSilent(true); - ReadFile ini_rf(cfg["decoder_config"].as<string>()); - if (!quiet) - cerr << setw(25) << "cdec cfg " << "'" << cfg["decoder_config"].as<string>() << "'" << endl; - Decoder decoder(ini_rf.stream()); - - // scoring metric/scorer - string scorer_str = cfg["scorer"].as<string>(); - LocalScorer* scorer; - if (scorer_str == "bleu") { - scorer = static_cast<BleuScorer*>(new BleuScorer); - } else if (scorer_str == "stupid_bleu") { - scorer = static_cast<StupidBleuScorer*>(new StupidBleuScorer); - } else if (scorer_str == "fixed_stupid_bleu") { - scorer = static_cast<FixedStupidBleuScorer*>(new FixedStupidBleuScorer); - } else if (scorer_str == "smooth_bleu") { - scorer = static_cast<SmoothBleuScorer*>(new SmoothBleuScorer); - } else if (scorer_str == "sum_bleu") { - scorer = static_cast<SumBleuScorer*>(new SumBleuScorer); - } else if (scorer_str == "sumexp_bleu") { - scorer = static_cast<SumExpBleuScorer*>(new SumExpBleuScorer); - } else if (scorer_str == "sumwhatever_bleu") { - scorer = static_cast<SumWhateverBleuScorer*>(new SumWhateverBleuScorer); - } else if (scorer_str == "approx_bleu") { - scorer = static_cast<ApproxBleuScorer*>(new ApproxBleuScorer(N, approx_bleu_d)); - } else if (scorer_str == "lc_bleu") { - scorer = static_cast<LinearBleuScorer*>(new LinearBleuScorer(N)); + ReadFile f(conf["decoder_conf"].as<string>()); + Decoder decoder(f.stream()); + Scorer* scorer; + if (score_name == "nakov") { + scorer = static_cast<PerSentenceBleuScorer*>(new PerSentenceBleuScorer(N)); + } else if (score_name == "papineni") { + scorer = static_cast<BleuScorer*>(new BleuScorer(N)); + } else if (score_name == "lin") { + scorer = static_cast<OriginalPerSentenceBleuScorer*>\ + (new OriginalPerSentenceBleuScorer(N)); + } else if (score_name == "liang") { + scorer = static_cast<SmoothPerSentenceBleuScorer*>\ + (new SmoothPerSentenceBleuScorer(N)); + } else if (score_name == "chiang") { + scorer = static_cast<ApproxBleuScorer*>(new ApproxBleuScorer(N)); } else { - cerr << "Don't know scoring metric: '" << scorer_str << "', exiting." << endl; - exit(1); + assert(false); } - vector<score_t> bleu_weights; - scorer->Init(N, bleu_weights); - - // setup decoder observer - MT19937 rng; // random number generator, only for forest sampling - HypSampler* observer; - if (sample_from == "kbest") - observer = static_cast<KBestGetter*>(new KBestGetter(k, filter_type)); - else - observer = static_cast<KSampler*>(new KSampler(k, &rng)); - observer->SetScorer(scorer); - - // init weights + ScoredKbest* observer = new ScoredKbest(k, scorer); + + // weights vector<weight_t>& decoder_weights = decoder.CurrentWeightVector(); - SparseVector<weight_t> lambdas, cumulative_penalties, w_average; - if (cfg.count("input_weights")) Weights::InitFromFile(cfg["input_weights"].as<string>(), &decoder_weights); - Weights::InitSparseVector(decoder_weights, &lambdas); - - // meta params for perceptron, SVM - weight_t eta = cfg["learning_rate"].as<weight_t>(); - weight_t gamma = cfg["gamma"].as<weight_t>(); - - // faster perceptron: consider only misranked pairs, see - bool faster_perceptron = false; - if (gamma==0 && loss_margin==0) faster_perceptron = true; - - // l1 regularization - bool l1naive = false; - bool l1clip = false; - bool l1cumul = false; - weight_t l1_reg = 0; - if (cfg["l1_reg"].as<string>() != "none") { - string s = cfg["l1_reg"].as<string>(); - if (s == "naive") l1naive = true; - else if (s == "clip") l1clip = true; - else if (s == "cumul") l1cumul = true; - l1_reg = cfg["l1_reg_strength"].as<weight_t>(); + SparseVector<weight_t> lambdas, w_average; + if (conf.count("input_weights")) { + Weights::InitFromFile(conf["input_weights"].as<string>(), &decoder_weights); + Weights::InitSparseVector(decoder_weights, &lambdas); } - // output - string output_fn = cfg["output"].as<string>(); // input - bool read_bitext = false; - string input_fn; - if (cfg.count("bitext")) { - read_bitext = true; - input_fn = cfg["bitext"].as<string>(); - } else { - input_fn = cfg["input"].as<string>(); - } + string input_fn = conf["bitext"].as<string>(); ReadFile input(input_fn); - // buffer input for t > 0 - vector<string> src_str_buf; // source strings (decoder takes only strings) - vector<vector<WordID> > ref_ids_buf; // references as WordID vecs - ReadFile refs; - string refs_fn; - if (!read_bitext) { - refs_fn = cfg["refs"].as<string>(); - refs.Init(refs_fn); - } - - unsigned in_sz = std::numeric_limits<unsigned>::max(); // input index, input size - vector<pair<score_t, score_t> > all_scores; - score_t max_score = 0.; - unsigned best_it = 0; - float overall_time = 0.; - - // output cfg - if (!quiet) { - cerr << _p5; - cerr << endl << "dtrain" << endl << "Parameters:" << endl; - cerr << setw(25) << "k " << k << endl; - cerr << setw(25) << "N " << N << endl; - cerr << setw(25) << "T " << T << endl; - cerr << setw(25) << "batch " << batch << endl; - cerr << setw(26) << "scorer '" << scorer_str << "'" << endl; - if (scorer_str == "approx_bleu") - cerr << setw(25) << "approx. B discount " << approx_bleu_d << endl; - cerr << setw(25) << "sample from " << "'" << sample_from << "'" << endl; - if (sample_from == "kbest") - cerr << setw(25) << "filter " << "'" << filter_type << "'" << endl; - if (!scale_bleu_diff) cerr << setw(25) << "learning rate " << eta << endl; - else cerr << setw(25) << "learning rate " << "bleu diff" << endl; - cerr << setw(25) << "gamma " << gamma << endl; - cerr << setw(25) << "loss margin " << loss_margin << endl; - cerr << setw(25) << "faster perceptron " << faster_perceptron << endl; - cerr << setw(25) << "pairs " << "'" << pair_sampling << "'" << endl; - if (pair_sampling == "XYX") - cerr << setw(25) << "hi lo " << hi_lo << endl; - cerr << setw(25) << "pair threshold " << pair_threshold << endl; - cerr << setw(25) << "select weights " << "'" << select_weights << "'" << endl; - if (cfg.count("l1_reg")) - cerr << setw(25) << "l1 reg " << l1_reg << " '" << cfg["l1_reg"].as<string>() << "'" << endl; - if (rescale) - cerr << setw(25) << "rescale " << rescale << endl; - cerr << setw(25) << "pclr " << pclr << endl; - cerr << setw(25) << "max pairs " << max_pairs << endl; - cerr << setw(25) << "repeat " << repeat << endl; - //cerr << setw(25) << "test k-best " << test_k_best << endl; - cerr << setw(25) << "cdec cfg " << "'" << cfg["decoder_config"].as<string>() << "'" << endl; - cerr << setw(25) << "input " << "'" << input_fn << "'" << endl; - if (!read_bitext) - cerr << setw(25) << "refs " << "'" << refs_fn << "'" << endl; - cerr << setw(25) << "output " << "'" << output_fn << "'" << endl; - if (cfg.count("input_weights")) - cerr << setw(25) << "weights in " << "'" << cfg["input_weights"].as<string>() << "'" << endl; - if (stop_after > 0) - cerr << setw(25) << "stop_after " << stop_after << endl; - if (!verbose) cerr << "(a dot represents " << DTRAIN_DOTS << " inputs)" << endl; + vector<string> buf; // decoder only accepts strings as input + vector<vector<Ngrams> > buf_ngs; // compute ngrams and lengths of references + vector<vector<size_t> > buf_ls; // just once + size_t input_sz = 0; + + cerr << _p4; + // output configuration + cerr << "Parameters:" << endl; + cerr << setw(25) << "bitext " << "'" << input_fn << "'" << endl; + cerr << setw(25) << "k " << k << endl; + cerr << setw(25) << "score " << "'" << score_name << "'" << endl; + cerr << setw(25) << "N " << N << endl; + cerr << setw(25) << "T " << T << endl; + cerr << setw(25) << "learning rate " << eta << endl; + cerr << setw(25) << "margin " << margin << endl; + cerr << setw(25) << "average " << average << endl; + cerr << setw(25) << "l1 reg " << l1_reg << endl; + cerr << setw(25) << "decoder conf " << "'" + << conf["decoder_conf"].as<string>() << "'" << endl; + cerr << setw(25) << "input " << "'" << input_fn << "'" << endl; + cerr << setw(25) << "output " << "'" << output_fn << "'" << endl; + if (conf.count("input_weights")) { + cerr << setw(25) << "weights in " << "'" + << conf["input_weights"].as<string>() << "'" << endl; } + cerr << "(1 dot per processed input)" << endl; - // pclr - SparseVector<weight_t> learning_rates; - // batch - SparseVector<weight_t> batch_updates; - score_t batch_loss; + // meta + weight_t best=0., gold_prev=0.; + size_t best_iteration = 0; + time_t total_time = 0.; - for (unsigned t = 0; t < T; t++) // T epochs + for (size_t t = 0; t < T; t++) // T iterations { time_t start, end; time(&start); - score_t score_sum = 0.; - score_t model_sum(0); - unsigned ii = 0, rank_errors = 0, margin_violations = 0, npairs = 0, f_count = 0, list_sz = 0, kbest_loss_improve = 0; - batch_loss = 0.; - if (!quiet) cerr << "Iteration #" << t+1 << " of " << T << "." << endl; + weight_t gold_sum=0., model_sum=0.; + size_t i=0, num_up=0, feature_count=0, list_sz=0; + + cerr << "Iteration #" << t+1 << " of " << T << "." << endl; while(true) { + bool next = true; - string in; - string ref; - bool next = false, stop = false; // next iteration or premature stop + // getting input if (t == 0) { - if(!getline(*input, in)) next = true; - if(read_bitext) { - vector<string> strs; - boost::algorithm::split_regex(strs, in, boost::regex(" \\|\\|\\| ")); - in = strs[0]; - ref = strs[1]; - } - } else { - if (ii == in_sz) next = true; // stop if we reach the end of our input - } - // stop after X sentences (but still go on for those) - if (stop_after > 0 && stop_after == ii && !next) stop = true; - - // produce some pretty output - if (!quiet && !verbose) { - if (ii == 0) cerr << " "; - if ((ii+1) % (DTRAIN_DOTS) == 0) { - cerr << "."; - cerr.flush(); - } - if ((ii+1) % (20*DTRAIN_DOTS) == 0) { - cerr << " " << ii+1 << endl; - if (!next && !stop) cerr << " "; - } - if (stop) { - if (ii % (20*DTRAIN_DOTS) != 0) cerr << " " << ii << endl; - cerr << "Stopping after " << stop_after << " input sentences." << endl; + string in; + if(!getline(*input, in)) { + next = false; } else { - if (next) { - if (ii % (20*DTRAIN_DOTS) != 0) cerr << " " << ii << endl; + vector<string> parts; + boost::algorithm::split_regex(parts, in, boost::regex(" \\|\\|\\| ")); + buf.push_back(parts[0]); + parts.erase(parts.begin()); + buf_ngs.push_back({}); + buf_ls.push_back({}); + for (auto s: parts) { + vector<WordID> r; + vector<string> toks; + boost::split(toks, s, boost::is_any_of(" ")); + for (auto tok: toks) + r.push_back(TD::Convert(tok)); + buf_ngs.back().emplace_back(MakeNgrams(r, N)); + buf_ls.back().push_back(r.size()); } } + } else { + next = i<input_sz; } - // next iteration - if (next || stop) break; - - // weights - lambdas.init_vector(&decoder_weights); - - // getting input - vector<WordID> ref_ids; // reference as vector<WordID> - if (t == 0) { - if (!read_bitext) { - getline(*refs, ref); - } - vector<string> ref_tok; - boost::split(ref_tok, ref, boost::is_any_of(" ")); - register_and_convert(ref_tok, ref_ids); - ref_ids_buf.push_back(ref_ids); - src_str_buf.push_back(in); + // produce some pretty output + if (next) { + if (i%20 == 0) + cerr << " "; + cerr << "."; + if ((i+1)%20==0) + cerr << " " << i+1 << endl; } else { - ref_ids = ref_ids_buf[ii]; + if (i%20 != 0) + cerr << " " << i << endl; } - observer->SetRef(ref_ids); - if (t == 0) - decoder.Decode(in, observer); - else - decoder.Decode(src_str_buf[ii], observer); + cerr.flush(); + + // stop iterating + if (!next) break; - // get (scored) samples + // decode + if (t > 0 || i > 0) + lambdas.init_vector(&decoder_weights); + observer->SetReference(buf_ngs[i], buf_ls[i]); + decoder.Decode(buf[i], observer); vector<ScoredHyp>* samples = observer->GetSamples(); - if (verbose) { - cerr << "--- ref for " << ii << ": "; - if (t > 0) printWordIDVec(ref_ids_buf[ii]); - else printWordIDVec(ref_ids); - cerr << endl; - for (unsigned u = 0; u < samples->size(); u++) { - cerr << _p2 << _np << "[" << u << ". '"; - printWordIDVec((*samples)[u].w); - cerr << "'" << endl; - cerr << "SCORE=" << (*samples)[u].score << ",model="<< (*samples)[u].model << endl; - cerr << "F{" << (*samples)[u].f << "} ]" << endl << endl; + // stats for 1best + gold_sum += samples->front().gold; + model_sum += samples->front().model; + feature_count += observer->GetFeatureCount(); + list_sz += observer->GetSize(); + + if (output_data) { + if (output_data_which == "kbest") { + OutputKbest(samples); + } else if (output_data_which == "default") { + OutputMultipartitePairs(samples, margin); + } else if (output_data_which == "all") { + OutputAllPairs(samples); } } - if (repeat == 1) { - score_sum += (*samples)[0].score; // stats for 1best - model_sum += (*samples)[0].model; - } - - f_count += observer->get_f_count(); - list_sz += observer->get_sz(); - - // weight updates + // get pairs and update if (!noup) { - // get pairs - vector<pair<ScoredHyp,ScoredHyp> > pairs; - if (pair_sampling == "all") - all_pairs(samples, pairs, pair_threshold, max_pairs, faster_perceptron); - if (pair_sampling == "XYX") - partXYX(samples, pairs, pair_threshold, max_pairs, faster_perceptron, hi_lo); - if (pair_sampling == "PRO") - PROsampling(samples, pairs, pair_threshold, max_pairs); - int cur_npairs = pairs.size(); - npairs += cur_npairs; - - score_t kbest_loss_first = 0.0, kbest_loss_last = 0.0; - - if (check) repeat = 2; - vector<float> losses; // for check - - for (vector<pair<ScoredHyp,ScoredHyp> >::iterator it = pairs.begin(); - it != pairs.end(); it++) { - score_t model_diff = it->first.model - it->second.model; - score_t loss = max(0.0, -1.0 * model_diff); - losses.push_back(loss); - kbest_loss_first += loss; - } - score_t kbest_loss = 0.0; - for (int ki=0; ki < repeat; ki++) { - - SparseVector<weight_t> lambdas_copy; // for l1 regularization - SparseVector<weight_t> sum_up; // for pclr - if (l1naive||l1clip||l1cumul) lambdas_copy = lambdas; - - unsigned pair_idx = 0; // for check - for (vector<pair<ScoredHyp,ScoredHyp> >::iterator it = pairs.begin(); - it != pairs.end(); it++) { - score_t model_diff = it->first.model - it->second.model; - score_t loss = max(0.0, -1.0 * model_diff); - - if (check && ki==repeat-1) cout << losses[pair_idx] - loss << endl; - pair_idx++; - - if (repeat > 1) { - model_diff = lambdas.dot(it->first.f) - lambdas.dot(it->second.f); - kbest_loss += loss; - } - bool rank_error = false; - score_t margin; - if (faster_perceptron) { // we only have considering misranked pairs - rank_error = true; // pair sampling already did this for us - margin = std::numeric_limits<float>::max(); - } else { - rank_error = model_diff<=0.0; - margin = fabs(model_diff); - if (!rank_error && margin < loss_margin) margin_violations++; - } - if (rank_error && ki==0) rank_errors++; - if (scale_bleu_diff) eta = it->first.score - it->second.score; - if (rank_error || margin < loss_margin) { - SparseVector<weight_t> diff_vec = it->first.f - it->second.f; - if (batch) { - batch_loss += max(0., -1.0 * model_diff); - batch_updates += diff_vec; - continue; - } - if (pclr != "no") { - sum_up += diff_vec; - } else { - lambdas.plus_eq_v_times_s(diff_vec, eta); - if (gamma) lambdas.plus_eq_v_times_s(lambdas, -2*gamma*eta*(1./cur_npairs)); - } - } - } - - // per-coordinate learning rate - if (pclr != "no") { - SparseVector<weight_t>::iterator it = sum_up.begin(); - for (; it != sum_up.end(); ++it) { - if (pclr == "simple") { - lambdas[it->first] += it->second / max(1.0, learning_rates[it->first]); - learning_rates[it->first]++; - } else if (pclr == "adagrad") { - if (learning_rates[it->first] == 0) { - lambdas[it->first] += it->second * eta; - } else { - lambdas[it->first] += it->second * eta * learning_rates[it->first]; - } - learning_rates[it->first] += pow(it->second, 2.0); - } - } - } - - // l1 regularization - // please note that this regularizations happen - // after a _sentence_ -- not after each example/pair! - if (l1naive) { - SparseVector<weight_t>::iterator it = lambdas.begin(); - for (; it != lambdas.end(); ++it) { - if (!lambdas_copy.get(it->first) || lambdas_copy.get(it->first)!=it->second) { - it->second *= max(0.0000001, eta/(eta+learning_rates[it->first])); // FIXME - learning_rates[it->first]++; - it->second -= sign(it->second) * l1_reg; - } - } - } else if (l1clip) { - SparseVector<weight_t>::iterator it = lambdas.begin(); - for (; it != lambdas.end(); ++it) { - if (!lambdas_copy.get(it->first) || lambdas_copy.get(it->first)!=it->second) { - if (it->second != 0) { - weight_t v = it->second; - if (v > 0) { - it->second = max(0., v - l1_reg); - } else { - it->second = min(0., v + l1_reg); - } - } - } - } - } else if (l1cumul) { - weight_t acc_penalty = (ii+1) * l1_reg; // ii is the index of the current input - SparseVector<weight_t>::iterator it = lambdas.begin(); - for (; it != lambdas.end(); ++it) { - if (!lambdas_copy.get(it->first) || lambdas_copy.get(it->first)!=it->second) { - if (it->second != 0) { - weight_t v = it->second; - weight_t penalized = 0.; - if (v > 0) { - penalized = max(0., v-(acc_penalty + cumulative_penalties.get(it->first))); - } else { - penalized = min(0., v+(acc_penalty - cumulative_penalties.get(it->first))); - } - it->second = penalized; - cumulative_penalties.set_value(it->first, cumulative_penalties.get(it->first)+penalized); - } - } + SparseVector<weight_t> updates; + if (structured) + num_up += CollectUpdatesStruct(samples, updates); + else + num_up += CollectUpdates(samples, updates, margin); + SparseVector<weight_t> lambdas_copy; + if (l1_reg) + lambdas_copy = lambdas; + lambdas.plus_eq_v_times_s(updates, eta); + + // update context for approx. BLEU + if (score_name == "chiang") { + for (auto it: *samples) { + if (it.rank == 0) { + scorer->UpdateContext(it.w, buf_ngs[i], buf_ls[i], 0.9); + break; } } + } - if (ki==repeat-1) { // done - kbest_loss_last = kbest_loss; - if (repeat > 1) { - score_t best_model = -std::numeric_limits<score_t>::max(); - unsigned best_idx = 0; - for (unsigned i=0; i < samples->size(); i++) { - score_t s = lambdas.dot((*samples)[i].f); - if (s > best_model) { - best_idx = i; - best_model = s; - } + // l1 regularization + // NB: regularization is done after each sentence, + // not after every single pair! + if (l1_reg) { + SparseVector<weight_t>::iterator it = lambdas.begin(); + for (; it != lambdas.end(); ++it) { + weight_t v = it->second; + if (!v) + continue; + if (!lambdas_copy.get(it->first) // new or.. + || lambdas_copy.get(it->first)!=v) // updated feature + { + if (v > 0) { + it->second = max(0., v - l1_reg); + } else { + it->second = min(0., v + l1_reg); } - score_sum += (*samples)[best_idx].score; - model_sum += best_model; } } - } // repeat - - if ((kbest_loss_first - kbest_loss_last) >= 0) kbest_loss_improve++; + } } // noup - if (rescale) lambdas /= lambdas.l2norm(); - - ++ii; + i++; } // input loop - if (t == 0) in_sz = ii; // remember size of input (# lines) - - - if (batch) { - lambdas.plus_eq_v_times_s(batch_updates, eta); - if (gamma) lambdas.plus_eq_v_times_s(lambdas, -2*gamma*eta*(1./npairs)); - batch_updates.clear(); + if (t == 0) + input_sz = i; // remember size of input (# lines) + + // update average + if (average) + w_average += lambdas; + + // stats + weight_t gold_avg = gold_sum/(weight_t)input_sz; + cerr << _p << "WEIGHTS" << endl; + for (auto name: print_weights) + cerr << setw(18) << name << " = " << lambdas.get(FD::Convert(name)) << endl; + cerr << " ---" << endl; + cerr << _np << " 1best avg score: " << gold_avg*100; + cerr << _p << " (" << (gold_avg-gold_prev)*100 << ")" << endl; + cerr << " 1best avg model score: " + << model_sum/(weight_t)input_sz << endl; + cerr << " avg # updates: "; + cerr << _np << num_up/(float)input_sz << endl; + cerr << " non-0 feature count: " << lambdas.num_nonzero() << endl; + cerr << " avg f count: " << feature_count/(float)list_sz << endl; + cerr << " avg list sz: " << list_sz/(float)input_sz << endl; + + if (gold_avg > best) { + best = gold_avg; + best_iteration = t; } + gold_prev = gold_avg; - if (average) w_average += lambdas; - - if (scorer_str == "approx_bleu" || scorer_str == "lc_bleu") scorer->Reset(); - - // print some stats - score_t score_avg = score_sum/(score_t)in_sz; - score_t model_avg = model_sum/(score_t)in_sz; - score_t score_diff, model_diff; - if (t > 0) { - score_diff = score_avg - all_scores[t-1].first; - model_diff = model_avg - all_scores[t-1].second; - } else { - score_diff = score_avg; - model_diff = model_avg; - } - - unsigned nonz = 0; - if (!quiet) nonz = (unsigned)lambdas.num_nonzero(); - - if (!quiet) { - cerr << _p5 << _p << "WEIGHTS" << endl; - for (vector<string>::iterator it = print_weights.begin(); it != print_weights.end(); it++) { - cerr << setw(18) << *it << " = " << lambdas.get(FD::Convert(*it)) << endl; - } - cerr << " ---" << endl; - cerr << _np << " 1best avg score: " << score_avg; - cerr << _p << " (" << score_diff << ")" << endl; - cerr << _np << " 1best avg model score: " << model_avg; - cerr << _p << " (" << model_diff << ")" << endl; - cerr << " avg # pairs: "; - cerr << _np << npairs/(float)in_sz << endl; - cerr << " avg # rank err: "; - cerr << rank_errors/(float)in_sz; - if (faster_perceptron) cerr << " (meaningless)"; - cerr << endl; - cerr << " avg # margin viol: "; - cerr << margin_violations/(float)in_sz << endl; - if (batch) cerr << " batch loss: " << batch_loss << endl; - cerr << " k-best loss imp: " << ((float)kbest_loss_improve/in_sz)*100 << "%" << endl; - cerr << " non0 feature count: " << nonz << endl; - cerr << " avg list sz: " << list_sz/(float)in_sz << endl; - cerr << " avg f count: " << f_count/(float)list_sz << endl; - } - - pair<score_t,score_t> remember; - remember.first = score_avg; - remember.second = model_avg; - all_scores.push_back(remember); - if (score_avg > max_score) { - max_score = score_avg; - best_it = t; - } time (&end); - float time_diff = difftime(end, start); - overall_time += time_diff; - if (!quiet) { - cerr << _p2 << _np << "(time " << time_diff/60. << " min, "; - cerr << time_diff/in_sz << " s/S)" << endl; - } - if (t+1 != T && !quiet) cerr << endl; - - if (noup) break; + time_t time_diff = difftime(end, start); + total_time += time_diff; + cerr << "(time " << time_diff/60. << " min, "; + cerr << time_diff/input_sz << " s/S)" << endl; + if (t+1 != T) cerr << endl; - // write weights to file - if (select_weights == "best" || keep) { + if (keep) { // keep intermediate weights lambdas.init_vector(&decoder_weights); string w_fn = "weights." + boost::lexical_cast<string>(t) + ".gz"; Weights::WriteToFile(w_fn, decoder_weights, true); } - if (check) cout << "---" << endl; - } // outer loop - if (average) w_average /= (weight_t)T; - - if (!noup) { - if (!quiet) cerr << endl << "Writing weights file to '" << output_fn << "' ..." << endl; - if (select_weights == "last" || average) { // last, average - WriteFile of(output_fn); // works with '-' - ostream& o = *of.stream(); - o.precision(17); - o << _np; - if (average) { - for (SparseVector<weight_t>::iterator it = w_average.begin(); it != w_average.end(); ++it) { - if (it->second == 0) continue; - o << FD::Convert(it->first) << '\t' << it->second << endl; - } - } else { - for (SparseVector<weight_t>::iterator it = lambdas.begin(); it != lambdas.end(); ++it) { - if (it->second == 0) continue; - o << FD::Convert(it->first) << '\t' << it->second << endl; - } - } - } else if (select_weights == "VOID") { // do nothing with the weights - } else { // best - if (output_fn != "-") { - CopyFile("weights."+boost::lexical_cast<string>(best_it)+".gz", output_fn); - } else { - ReadFile bestw("weights."+boost::lexical_cast<string>(best_it)+".gz"); - string o; - cout.precision(17); - cout << _np; - while(getline(*bestw, o)) cout << o << endl; - } - if (!keep) { - for (unsigned i = 0; i < T; i++) { - string s = "weights." + boost::lexical_cast<string>(i) + ".gz"; - unlink(s.c_str()); - } - } - } - if (!quiet) cerr << "done" << endl; + // final weights + if (average) { + w_average /= T; + w_average.init_vector(decoder_weights); + } else if (!keep) { + lambdas.init_vector(decoder_weights); } + if (average || !keep) + Weights::WriteToFile(output_fn, decoder_weights, true); - if (!quiet) { - cerr << _p5 << _np << endl << "---" << endl << "Best iteration: "; - cerr << best_it+1 << " [SCORE '" << scorer_str << "'=" << max_score << "]." << endl; - cerr << "This took " << overall_time/60. << " min." << endl; - } + cerr << endl << "---" << endl << "Best iteration: "; + cerr << best_iteration+1 << " [GOLD = " << best*100 << "]." << endl; + cerr << "This took " << total_time/60. << " min." << endl; + + return 0; } diff --git a/training/dtrain/dtrain.h b/training/dtrain/dtrain.h index 07bd9b65..0bbb5c9b 100644 --- a/training/dtrain/dtrain.h +++ b/training/dtrain/dtrain.h @@ -1,9 +1,6 @@ #ifndef _DTRAIN_H_ #define _DTRAIN_H_ -#define DTRAIN_DOTS 10 // after how many inputs to display a '.' -#define DTRAIN_SCALE 100000 - #include <iomanip> #include <climits> #include <string.h> @@ -25,113 +22,92 @@ namespace po = boost::program_options; namespace dtrain { - -inline void register_and_convert(const vector<string>& strs, vector<WordID>& ids) -{ - vector<string>::const_iterator it; - for (it = strs.begin(); it < strs.end(); it++) - ids.push_back(TD::Convert(*it)); -} - -inline string gettmpf(const string path, const string infix) -{ - char fn[path.size() + infix.size() + 8]; - strcpy(fn, path.c_str()); - strcat(fn, "/"); - strcat(fn, infix.c_str()); - strcat(fn, "-XXXXXX"); - if (!mkstemp(fn)) { - cerr << "Cannot make temp file in" << path << " , exiting." << endl; - exit(1); - } - return string(fn); -} - -typedef double score_t; - struct ScoredHyp { - vector<WordID> w; - SparseVector<double> f; - score_t model; - score_t score; - unsigned rank; + vector<WordID> w; + SparseVector<weight_t> f; + weight_t model, gold; + size_t rank; }; -struct LocalScorer +inline void +PrintWordIDVec(vector<WordID>& v, ostream& os=cerr) { - unsigned N_; - vector<score_t> w_; - - virtual score_t - Score(const vector<WordID>& hyp, const vector<WordID>& ref, const unsigned rank, const unsigned src_len)=0; - - virtual void Reset() {} // only for ApproxBleuScorer, LinearBleuScorer - - inline void - Init(unsigned N, vector<score_t> weights) - { - assert(N > 0); - N_ = N; - if (weights.empty()) for (unsigned i = 0; i < N_; i++) w_.push_back(1./N_); - else w_ = weights; - } - - inline score_t - brevity_penalty(const unsigned hyp_len, const unsigned ref_len) - { - if (hyp_len > ref_len) return 1; - return exp(1 - (score_t)ref_len/hyp_len); + for (size_t i = 0; i < v.size(); i++) { + os << TD::Convert(v[i]); + if (i < v.size()-1) os << " "; } -}; +} -struct HypSampler : public DecoderObserver -{ - LocalScorer* scorer_; - vector<WordID>* ref_; - unsigned f_count_, sz_; - virtual vector<ScoredHyp>* GetSamples()=0; - inline void SetScorer(LocalScorer* scorer) { scorer_ = scorer; } - inline void SetRef(vector<WordID>& ref) { ref_ = &ref; } - inline unsigned get_f_count() { return f_count_; } - inline unsigned get_sz() { return sz_; } -}; +inline ostream& _np(ostream& out) { return out << resetiosflags(ios::showpos); } +inline ostream& _p(ostream& out) { return out << setiosflags(ios::showpos); } +inline ostream& _p4(ostream& out) { return out << setprecision(4); } -struct HSReporter +bool +dtrain_init(int argc, char** argv, po::variables_map* conf) { - string task_id_; - - HSReporter(string task_id) : task_id_(task_id) {} - - inline void update_counter(string name, unsigned amount) { - cerr << "reporter:counter:" << task_id_ << "," << name << "," << amount << endl; + po::options_description opts("Configuration File Options"); + opts.add_options() + ("bitext,b", po::value<string>(), "bitext") + ("decoder_conf,C", po::value<string>(), "configuration file for decoder") + ("iterations,T", po::value<size_t>()->default_value(15), "number of iterations T (per shard)") + ("k", po::value<size_t>()->default_value(100), "size of kbest list") + ("learning_rate,l", po::value<weight_t>()->default_value(0.00001), "learning rate") + ("l1_reg,r", po::value<weight_t>()->default_value(0.), "l1 regularization strength") + ("margin,m", po::value<weight_t>()->default_value(1.0), "margin for margin perceptron") + ("score,s", po::value<string>()->default_value("chiang"), "per-sentence BLEU approx.") + ("N", po::value<size_t>()->default_value(4), "N for BLEU approximation") + ("input_weights,w", po::value<string>(), "input weights file") + ("average,a", po::bool_switch()->default_value(true), "output average weights") + ("keep,K", po::bool_switch()->default_value(false), "output a weight file per iteration") + ("struct,S", po::bool_switch()->default_value(false), "structured SGD with hope/fear") + ("output,o", po::value<string>()->default_value("-"), "output weights file, '-' for STDOUT") + ("disable_learning,X", po::bool_switch()->default_value(false), "disable learning") + ("output_data,D", po::value<string>()->default_value(""), "output data to STDOUT; arg. is 'kbest', 'default' or 'all'") + ("print_weights,P", po::value<string>()->default_value("EgivenFCoherent SampleCountF CountEF MaxLexFgivenE MaxLexEgivenF IsSingletonF IsSingletonFE Glue WordPenalty PassThrough LanguageModel LanguageModel_OOV"), + "list of weights to print after each iteration"); + po::options_description clopts("Command Line Options"); + clopts.add_options() + ("conf,c", po::value<string>(), "dtrain configuration file") + ("help,h", po::bool_switch(), "display options"); + opts.add(clopts); + po::store(parse_command_line(argc, argv, opts), *conf); + cerr << "dtrain" << endl << endl; + if ((*conf)["help"].as<bool>()) { + cerr << opts << endl; + + return false; } - inline void update_gcounter(string name, unsigned amount) { - cerr << "reporter:counter:Global," << name << "," << amount << endl; + if (conf->count("conf")) { + ifstream f((*conf)["conf"].as<string>().c_str()); + po::store(po::parse_config_file(f, opts), *conf); } -}; + po::notify(*conf); + if (!conf->count("decoder_conf")) { + cerr << "Missing decoder configuration." << endl; + cerr << opts << endl; -inline ostream& _np(ostream& out) { return out << resetiosflags(ios::showpos); } -inline ostream& _p(ostream& out) { return out << setiosflags(ios::showpos); } -inline ostream& _p2(ostream& out) { return out << setprecision(2); } -inline ostream& _p5(ostream& out) { return out << setprecision(5); } + return false; + } + if (!conf->count("bitext")) { + cerr << "No input given." << endl; + cerr << opts << endl; -inline void printWordIDVec(vector<WordID>& v, ostream& os=cerr) -{ - for (unsigned i = 0; i < v.size(); i++) { - os << TD::Convert(v[i]); - if (i < v.size()-1) os << " "; + return false; + } + if ((*conf)["output_data"].as<string>() != "") { + if ((*conf)["output_data"].as<string>() != "kbest" && + (*conf)["output_data"].as<string>() != "default" && + (*conf)["output_data"].as<string>() != "all") { + cerr << "Wrong 'output_data' argument: "; + cerr << (*conf)["output_data"].as<string>() << endl; + return false; + } } -} -template<typename T> -inline T sign(T z) -{ - if (z == 0) return 0; - return z < 0 ? -1 : +1; + return true; } - } // namespace #endif diff --git a/training/dtrain/lplp.rb b/training/dtrain/lplp.rb index 86e835e8..cf28b477 100755 --- a/training/dtrain/lplp.rb +++ b/training/dtrain/lplp.rb @@ -1,4 +1,4 @@ -# lplp.rb +#!/usr/bin/env ruby # norms def l0(feature_column, n) @@ -19,7 +19,8 @@ end # stats def median(feature_column, n) - return feature_column.concat(0.step(n-feature_column.size-1).map{|i|0}).sort[feature_column.size/2] + return feature_column.concat(0.step(n-feature_column.size-1).map{|i|0}) + .sort[feature_column.size/2] end def mean(feature_column, n) @@ -28,7 +29,7 @@ end # selection def select_k(weights, norm_fun, n, k=10000) - weights.sort{|a,b| norm_fun.call(b[1], n) <=> norm_fun.call(a[1], n)}.each { |p| + weights.sort{|a,b| norm_fun.call(b[1], n)<=>norm_fun.call(a[1], n)}.each { |p| puts "#{p[0]}\t#{mean(p[1], n)}" k -= 1 if k == 0 then break end @@ -84,17 +85,16 @@ def _test() end #_test() - def usage() puts "lplp.rb <l0,l1,l2,linfty,mean,median> <cut|select_k> <k|threshold> <#shards> < <input>" puts " l0...: norms for selection" puts "select_k: only output top k (according to the norm of their column vector) features" puts " cut: output features with weight >= threshold" - puts " n: if we do not have a shard count use this number for averaging" + puts " n: number of shards for averaging" exit 1 end -if ARGV.size < 4 then usage end +usage if ARGV.size<4 norm_fun = method(ARGV[0].to_sym) type = ARGV[1] x = ARGV[2].to_f diff --git a/training/dtrain/parallelize.rb b/training/dtrain/parallelize.rb index 82600009..128f3f98 100755 --- a/training/dtrain/parallelize.rb +++ b/training/dtrain/parallelize.rb @@ -1,155 +1,134 @@ #!/usr/bin/env ruby require 'trollop' +require 'zipf' -def usage - STDERR.write "Usage: " - STDERR.write "ruby parallelize.rb -c <dtrain.ini> [-e <epochs=10>] [--randomize/-z] [--reshard/-y] -s <#shards|0> [-p <at once=9999>] -i <input> -r <refs> [--qsub/-q] [--dtrain_binary <path to dtrain binary>] [-l \"l2 select_k 100000\"] [--extra_qsub \"-l virtual_free=24G\"]\n" - exit 1 +conf = Trollop::options do + opt :conf, "dtrain configuration", :type => :string, :short => '-c' + opt :input, "input as bitext (f ||| e)", :type => :string, :short => '-i' + opt :epochs, "number of epochs", :type => :int, :default => 10 + opt :randomize, "randomize shards once", :type => :bool, :default => false, :short => '-z' + opt :reshard, "randomize after each epoch", :type => :bool, :default => false, :short => '-y' + opt :shards, "number of shards", :type => :int, :short => '-s' + opt :weights, "input weights for first epoch", :type => :string, :default => '', :short => '-w' + opt :lplp_args, "arguments for lplp.rb", :type => :string, :default => "l2 select_k 100000", :short => '-l' + opt :per_shard_decoder_configs, "give custom decoder config per shard", :type => :string, :short => '-o' + opt :processes_at_once, "jobs to run at oce", :type => :int, :default => 9999, :short => '-p' + opt :qsub, "use qsub", :type => :bool, :default => false, :short => '-q' + opt :qsub_args, "extra args for qsub", :type => :string, :default => "h_vmem=5G", :short => 'r' + opt :dtrain_binary, "path to dtrain binary", :type => :string, :short => '-d' end -opts = Trollop::options do - opt :config, "dtrain config file", :type => :string - opt :epochs, "number of epochs", :type => :int, :default => 10 - opt :lplp_args, "arguments for lplp.rb", :type => :string, :default => "l2 select_k 100000" - opt :randomize, "randomize shards before each epoch", :type => :bool, :short => '-z', :default => false - opt :reshard, "reshard after each epoch", :type => :bool, :short => '-y', :default => false - opt :shards, "number of shards", :type => :int - opt :processes_at_once, "have this number (max) running at the same time", :type => :int, :default => 9999 - opt :input, "input", :type => :string - opt :references, "references", :type => :string - opt :qsub, "use qsub", :type => :bool, :default => false - opt :dtrain_binary, "path to dtrain binary", :type => :string - opt :extra_qsub, "extra qsub args", :type => :string, :default => "" - opt :per_shard_decoder_configs, "give special decoder config per shard", :type => :string, :short => '-o' - opt :first_input_weights, "input weights for first iter", :type => :string, :default => '', :short => '-w' -end -usage if not opts[:config]&&opts[:shards]&&opts[:input]&&opts[:references] - dtrain_dir = File.expand_path File.dirname(__FILE__) -if not opts[:dtrain_binary] +if not conf[:dtrain_binary] dtrain_bin = "#{dtrain_dir}/dtrain" else - dtrain_bin = opts[:dtrain_binary] + dtrain_bin = conf[:dtrain_binary] end -ruby = '/usr/bin/ruby' lplp_rb = "#{dtrain_dir}/lplp.rb" -lplp_args = opts[:lplp_args] -cat = '/bin/cat' +lplp_args = conf[:lplp_args] -ini = opts[:config] -epochs = opts[:epochs] -rand = opts[:randomize] -reshard = opts[:reshard] -predefined_shards = false +dtrain_conf = conf[:conf] +epochs = conf[:epochs] +rand = conf[:randomize] +reshard = conf[:reshard] +predefined_shards = false per_shard_decoder_configs = false -if opts[:shards] == 0 +if conf[:shards] == 0 predefined_shards = true num_shards = 0 - per_shard_decoder_configs = true if opts[:per_shard_decoder_configs] + per_shard_decoder_configs = true if conf[:per_shard_decoder_configs] else - num_shards = opts[:shards] + num_shards = conf[:shards] end -input = opts[:input] -refs = opts[:references] -use_qsub = opts[:qsub] -shards_at_once = opts[:processes_at_once] -first_input_weights = opts[:first_input_weights] -opts[:extra_qsub] = "-l #{opts[:extra_qsub]}" if opts[:extra_qsub]!="" +input = conf[:input] +use_qsub = conf[:qsub] +shards_at_once = conf[:processes_at_once] +first_input_weights = conf[:weights] `mkdir work` -def make_shards(input, refs, num_shards, epoch, rand) +def make_shards input, num_shards, epoch, rand lc = `wc -l #{input}`.split.first.to_i index = (0..lc-1).to_a index.reverse! index.shuffle! if rand shard_sz = (lc / num_shards.to_f).round 0 leftover = lc - (num_shards*shard_sz) - leftover = 0 if leftover < 0 + leftover = [0, leftover].max in_f = File.new input, 'r' in_lines = in_f.readlines - refs_f = File.new refs, 'r' - refs_lines = refs_f.readlines shard_in_files = [] - shard_refs_files = [] in_fns = [] - refs_fns = [] - new_num_shards = 0 + real_num_shards = 0 0.upto(num_shards-1) { |shard| break if index.size==0 - new_num_shards += 1 - in_fn = "work/shard.#{shard}.#{epoch}.in" + real_num_shards += 1 + in_fn = "work/shard.#{shard}.#{epoch}" shard_in = File.new in_fn, 'w+' in_fns << in_fn - refs_fn = "work/shard.#{shard}.#{epoch}.refs" - shard_refs = File.new refs_fn, 'w+' - refs_fns << refs_fn 0.upto(shard_sz-1) { |i| j = index.pop + break if !j shard_in.write in_lines[j] - shard_refs.write refs_lines[j] } shard_in_files << shard_in - shard_refs_files << shard_refs } while leftover > 0 j = index.pop shard_in_files[-1].write in_lines[j] - shard_refs_files[-1].write refs_lines[j] leftover -= 1 end - (shard_in_files + shard_refs_files).each do |f| f.close end + shard_in_files.each do |f| f.close end in_f.close - refs_f.close - return in_fns, refs_fns, new_num_shards + return in_fns, real_num_shards end input_files = [] -refs_files = [] if predefined_shards - input_files = File.new(input).readlines.map {|i| i.strip } - refs_files = File.new(refs).readlines.map {|i| i.strip } + input_files = File.new(input).readlines.map { |i| i.strip } if per_shard_decoder_configs - decoder_configs = File.new(opts[:per_shard_decoder_configs]).readlines.map {|i| i.strip} + decoder_configs = ReadFile.readlines_strip(conf[:per_shard_decoder_configs] + ).map { |i| i.strip } end num_shards = input_files.size else - input_files, refs_files, num_shards = make_shards input, refs, num_shards, 0, rand + input_files, num_shards = make_shards input, num_shards, 0, rand end 0.upto(epochs-1) { |epoch| puts "epoch #{epoch+1}" pids = [] input_weights = '' - if epoch > 0 then input_weights = "--input_weights work/weights.#{epoch-1}" end + input_weights = "--input_weights work/weights.#{epoch-1}" if epoch>0 weights_files = [] shard = 0 remaining_shards = num_shards while remaining_shards > 0 shards_at_once.times { break if remaining_shards==0 - qsub_str_start = qsub_str_end = '' - local_end = '' + qsub_str_start = qsub_str_end = local_end = '' if use_qsub - qsub_str_start = "qsub #{opts[:extra_qsub]} -cwd -sync y -b y -j y -o work/out.#{shard}.#{epoch} -N dtrain.#{shard}.#{epoch} \"" + qsub_str_start = "qsub -l #{conf[:qsub_args]} -cwd -sync y -b y -j y\ + -o work/out.#{shard}.#{epoch}\ + -N dtrain.#{shard}.#{epoch} \"" qsub_str_end = "\"" local_end = '' else local_end = "2>work/out.#{shard}.#{epoch}" end if per_shard_decoder_configs - cdec_cfg = "--decoder_config #{decoder_configs[shard]}" + cdec_conf = "--decoder_conf #{decoder_configs[shard]}" else - cdec_cfg = "" + cdec_conf = "" end - if first_input_weights!='' && epoch == 0 + if first_input_weights != '' && epoch == 0 input_weights = "--input_weights #{first_input_weights}" end pids << Kernel.fork { - `#{qsub_str_start}#{dtrain_bin} -c #{ini} #{cdec_cfg} #{input_weights}\ - --input #{input_files[shard]}\ - --refs #{refs_files[shard]}\ + `#{qsub_str_start}#{dtrain_bin} -c #{dtrain_conf} #{cdec_conf}\ + #{input_weights}\ + --bitext #{input_files[shard]}\ --output work/weights.#{shard}.#{epoch}#{qsub_str_end} #{local_end}` } weights_files << "work/weights.#{shard}.#{epoch}" @@ -159,10 +138,11 @@ end pids.each { |pid| Process.wait(pid) } pids.clear end - `#{cat} work/weights.*.#{epoch} > work/weights_cat` - `#{ruby} #{lplp_rb} #{lplp_args} #{num_shards} < work/weights_cat > work/weights.#{epoch}` + `cat work/weights.*.#{epoch} > work/weights_cat` + `ruby #{lplp_rb} #{lplp_args} #{num_shards} < work/weights_cat\ + > work/weights.#{epoch}` if rand and reshard and epoch+1!=epochs - input_files, refs_files, num_shards = make_shards input, refs, num_shards, epoch+1, rand + input_files, num_shards = make_shards input, num_shards, epoch+1, rand end } diff --git a/training/dtrain/sample.h b/training/dtrain/sample.h new file mode 100644 index 00000000..1249e372 --- /dev/null +++ b/training/dtrain/sample.h @@ -0,0 +1,59 @@ +#ifndef _DTRAIN_SAMPLE_H_ +#define _DTRAIN_SAMPLE_H_ + +#include "kbest.h" + +#include "score.h" + +namespace dtrain +{ + +struct ScoredKbest : public DecoderObserver +{ + const size_t k_; + size_t feature_count_, effective_sz_; + vector<ScoredHyp> samples_; + Scorer* scorer_; + vector<Ngrams>* ref_ngs_; + vector<size_t>* ref_ls_; + + ScoredKbest(const size_t k, Scorer* scorer) : + k_(k), scorer_(scorer) {} + + virtual void + NotifyTranslationForest(const SentenceMetadata& /*smeta*/, Hypergraph* hg) + { + samples_.clear(); effective_sz_ = feature_count_ = 0; + KBest::KBestDerivations<vector<WordID>, ESentenceTraversal, + KBest::FilterUnique, prob_t, EdgeProb> kbest(*hg, k_); + for (size_t i = 0; i < k_; ++i) { + const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal, + KBest::FilterUnique, prob_t, EdgeProb>::Derivation* d = + kbest.LazyKthBest(hg->nodes_.size() - 1, i); + if (!d) break; + ScoredHyp h; + h.w = d->yield; + h.f = d->feature_values; + h.model = log(d->score); + h.rank = i; + h.gold = scorer_->Score(h.w, *ref_ngs_, *ref_ls_); + samples_.push_back(h); + effective_sz_++; + feature_count_ += h.f.size(); + } + } + + vector<ScoredHyp>* GetSamples() { return &samples_; } + inline void SetReference(vector<Ngrams>& ngs, vector<size_t>& ls) + { + ref_ngs_ = &ngs; + ref_ls_ = &ls; + } + inline size_t GetFeatureCount() { return feature_count_; } + inline size_t GetSize() { return effective_sz_; } +}; + +} // namespace + +#endif + diff --git a/training/dtrain/score.h b/training/dtrain/score.h index 1cdd3fa9..ca3da39b 100644 --- a/training/dtrain/score.h +++ b/training/dtrain/score.h @@ -6,35 +6,28 @@ namespace dtrain { - struct NgramCounts { - unsigned N_; - map<unsigned, score_t> clipped_; - map<unsigned, score_t> sum_; + size_t N_; + map<size_t, weight_t> clipped_; + map<size_t, weight_t> sum_; + + NgramCounts() {} - NgramCounts(const unsigned N) : N_(N) { Zero(); } + NgramCounts(const size_t N) : N_(N) { Zero(); } inline void operator+=(const NgramCounts& rhs) { if (rhs.N_ > N_) Resize(rhs.N_); - for (unsigned i = 0; i < N_; i++) { + for (size_t i = 0; i < N_; i++) { this->clipped_[i] += rhs.clipped_.find(i)->second; this->sum_[i] += rhs.sum_.find(i)->second; } } - inline const NgramCounts - operator+(const NgramCounts &other) const - { - NgramCounts result = *this; - result += other; - return result; - } - inline void - operator*=(const score_t rhs) + operator*=(const weight_t rhs) { for (unsigned i = 0; i < N_; i++) { this->clipped_[i] *= rhs; @@ -43,7 +36,7 @@ struct NgramCounts } inline void - Add(const unsigned count, const unsigned ref_count, const unsigned i) + Add(const size_t count, const size_t ref_count, const size_t i) { assert(i < N_); if (count > ref_count) { @@ -57,40 +50,23 @@ struct NgramCounts inline void Zero() { - for (unsigned i = 0; i < N_; i++) { + for (size_t i = 0; i < N_; i++) { clipped_[i] = 0.; sum_[i] = 0.; } } inline void - One() - { - for (unsigned i = 0; i < N_; i++) { - clipped_[i] = 1.; - sum_[i] = 1.; - } - } - - inline void - Print() - { - for (unsigned i = 0; i < N_; i++) { - cout << i+1 << "grams (clipped):\t" << clipped_[i] << endl; - cout << i+1 << "grams:\t\t\t" << sum_[i] << endl; - } - } - - inline void Resize(unsigned N) + Resize(size_t N) { if (N == N_) return; else if (N > N_) { - for (unsigned i = N_; i < N; i++) { + for (size_t i = N_; i < N; i++) { clipped_[i] = 0.; sum_[i] = 0.; } } else { // N < N_ - for (unsigned i = N_-1; i > N-1; i--) { + for (size_t i = N_-1; i > N-1; i--) { clipped_.erase(i); sum_.erase(i); } @@ -99,123 +75,344 @@ struct NgramCounts } }; -typedef map<vector<WordID>, unsigned> Ngrams; +typedef map<vector<WordID>, size_t> Ngrams; inline Ngrams -make_ngrams(const vector<WordID>& s, const unsigned N) +MakeNgrams(const vector<WordID>& s, const size_t N) { Ngrams ngrams; vector<WordID> ng; for (size_t i = 0; i < s.size(); i++) { ng.clear(); - for (unsigned j = i; j < min(i+N, s.size()); j++) { + for (size_t j = i; j < min(i+N, s.size()); j++) { ng.push_back(s[j]); ngrams[ng]++; } } + return ngrams; } inline NgramCounts -make_ngram_counts(const vector<WordID>& hyp, const vector<WordID>& ref, const unsigned N) +MakeNgramCounts(const vector<WordID>& hyp, + const vector<Ngrams>& ref, + const size_t N) { - Ngrams hyp_ngrams = make_ngrams(hyp, N); - Ngrams ref_ngrams = make_ngrams(ref, N); + Ngrams hyp_ngrams = MakeNgrams(hyp, N); NgramCounts counts(N); - Ngrams::iterator it; - Ngrams::iterator ti; + Ngrams::iterator it, ti; for (it = hyp_ngrams.begin(); it != hyp_ngrams.end(); it++) { - ti = ref_ngrams.find(it->first); - if (ti != ref_ngrams.end()) { - counts.Add(it->second, ti->second, it->first.size() - 1); - } else { - counts.Add(it->second, 0, it->first.size() - 1); + size_t max_ref_count = 0; + for (auto r: ref) { + ti = r.find(it->first); + if (ti != r.end()) + max_ref_count = max(max_ref_count, ti->second); } + counts.Add(it->second, min(it->second, max_ref_count), it->first.size()-1); } + return counts; } -struct BleuScorer : public LocalScorer +class Scorer { - score_t Bleu(NgramCounts& counts, const unsigned hyp_len, const unsigned ref_len); - score_t Score(const vector<WordID>& hyp, const vector<WordID>& ref, const unsigned /*rank*/, const unsigned /*src_len*/); - void Reset() {} -}; + protected: + const size_t N_; + vector<weight_t> w_; + + public: + Scorer(size_t n): N_(n) + { + for (size_t i = 1; i <= N_; i++) + w_.push_back(1.0/N_); + } -struct StupidBleuScorer : public LocalScorer -{ - score_t Score(const vector<WordID>& hyp, const vector<WordID>& ref, const unsigned /*rank*/, const unsigned /*src_len*/); - void Reset() {} -}; + inline bool + Init(const vector<WordID>& hyp, + const vector<Ngrams>& ref_ngs, + const vector<size_t>& ref_ls, + size_t& hl, + size_t& rl, + size_t& M, + vector<weight_t>& v, + NgramCounts& counts) + { + hl = hyp.size(); + if (hl == 0) return false; + rl = BestMatchLength(hl, ref_ls); + if (rl == 0) return false; + counts = MakeNgramCounts(hyp, ref_ngs, N_); + if (rl < N_) { + M = rl; + for (size_t i = 0; i < M; i++) v.push_back(1/((weight_t)M)); + } else { + M = N_; + v = w_; + } -struct FixedStupidBleuScorer : public LocalScorer -{ - score_t Score(const vector<WordID>& hyp, const vector<WordID>& ref, const unsigned /*rank*/, const unsigned /*src_len*/); - void Reset() {} -}; + return true; + } -struct SmoothBleuScorer : public LocalScorer -{ - score_t Score(const vector<WordID>& hyp, const vector<WordID>& ref, const unsigned /*rank*/, const unsigned /*src_len*/); - void Reset() {} -}; + inline weight_t + BrevityPenalty(const size_t hl, const size_t rl) + { + if (hl > rl) + return 1; -struct SumBleuScorer : public LocalScorer -{ - score_t Score(const vector<WordID>& hyp, const vector<WordID>& ref, const unsigned /*rank*/, const unsigned /*src_len*/); - void Reset() {} + return exp(1 - (weight_t)rl/hl); + } + + inline size_t + BestMatchLength(const size_t hl, + const vector<size_t>& ref_ls) + { + size_t m; + if (ref_ls.size() == 1) { + m = ref_ls.front(); + } else { + size_t i = 0, best_idx = 0; + size_t best = numeric_limits<size_t>::max(); + for (auto l: ref_ls) { + size_t d = abs(hl-l); + if (d < best) { + best_idx = i; + best = d; + } + i += 1; + } + m = ref_ls[best_idx]; + } + + return m; + } + + virtual weight_t + Score(const vector<WordID>&, + const vector<Ngrams>&, + const vector<size_t>&) = 0; + + void + UpdateContext(const vector<WordID>& /*hyp*/, + const vector<Ngrams>& /*ref_ngs*/, + const vector<size_t>& /*ref_ls*/, + weight_t /*decay*/) {} }; -struct SumExpBleuScorer : public LocalScorer +/* + * 'fixed' per-sentence BLEU + * simply add 1 to reference length for calculation of BP + * + * as in "Optimizing for Sentence-Level BLEU+1 + * Yields Short Translations" + * (Nakov et al. '12) + * + */ +class PerSentenceBleuScorer : public Scorer { - score_t Score(const vector<WordID>& hyp, const vector<WordID>& ref, const unsigned /*rank*/, const unsigned /*src_len*/); - void Reset() {} + public: + PerSentenceBleuScorer(size_t n) : Scorer(n) {} + + weight_t + Score(const vector<WordID>& hyp, + const vector<Ngrams>& ref_ngs, + const vector<size_t>& ref_ls) + { + size_t hl, rl, M; + vector<weight_t> v; + NgramCounts counts; + if (!Init(hyp, ref_ngs, ref_ls, hl, rl, M, v, counts)) + return 0.; + weight_t sum=0, add=0; + for (size_t i = 0; i < M; i++) { + if (i == 0 && (counts.sum_[i] == 0 || counts.clipped_[i] == 0)) return 0.; + if (i > 0) add = 1; + sum += v[i] * log(((weight_t)counts.clipped_[i] + add) + / ((counts.sum_[i] + add))); + } + + return BrevityPenalty(hl, rl+1) * exp(sum); + } }; -struct SumWhateverBleuScorer : public LocalScorer + +/* + * BLEU + * 0 if for one n \in {1..N} count is 0 + * + * as in "BLEU: a Method for Automatic Evaluation + * of Machine Translation" + * (Papineni et al. '02) + * + */ + +class BleuScorer : public Scorer { - score_t Score(const vector<WordID>& hyp, const vector<WordID>& ref, const unsigned /*rank*/, const unsigned /*src_len*/); - void Reset() {}; + public: + BleuScorer(size_t n) : Scorer(n) {} + + weight_t + Score(const vector<WordID>& hyp, + const vector<Ngrams>& ref_ngs, + const vector<size_t>& ref_ls) + { + size_t hl, rl, M; + vector<weight_t> v; + NgramCounts counts; + if (!Init(hyp, ref_ngs, ref_ls, hl, rl, M, v, counts)) + return 0.; + weight_t sum = 0; + for (size_t i = 0; i < M; i++) { + if (counts.sum_[i] == 0 || counts.clipped_[i] == 0) return 0.; + sum += v[i] * log((weight_t)counts.clipped_[i]/counts.sum_[i]); + } + + return BrevityPenalty(hl, rl) * exp(sum); + } }; -struct ApproxBleuScorer : public BleuScorer +/* + * original BLEU+1 + * 0 iff no 1gram match ('grounded') + * + * as in "ORANGE: a Method for Evaluating + * Automatic Evaluation Metrics + * for Machine Translation" + * (Lin & Och '04) + * + */ +class OriginalPerSentenceBleuScorer : public Scorer { - NgramCounts glob_onebest_counts_; - unsigned glob_hyp_len_, glob_ref_len_, glob_src_len_; - score_t discount_; + public: + OriginalPerSentenceBleuScorer(size_t n) : Scorer(n) {} + + weight_t + Score(const vector<WordID>& hyp, + const vector<Ngrams>& ref_ngs, + const vector<size_t>& ref_ls) + { + size_t hl, rl, M; + vector<weight_t> v; + NgramCounts counts; + if (!Init(hyp, ref_ngs, ref_ls, hl, rl, M, v, counts)) + return 0.; + weight_t sum=0, add=0; + for (size_t i = 0; i < M; i++) { + if (i == 0 && (counts.sum_[i] == 0 || counts.clipped_[i] == 0)) return 0.; + if (i == 1) add = 1; + sum += v[i] * log(((weight_t)counts.clipped_[i] + add)/((counts.sum_[i] + add))); + } - ApproxBleuScorer(unsigned N, score_t d) : glob_onebest_counts_(NgramCounts(N)), discount_(d) - { - glob_hyp_len_ = glob_ref_len_ = glob_src_len_ = 0; - } + return BrevityPenalty(hl, rl) * exp(sum); + } +}; - inline void Reset() { - glob_onebest_counts_.Zero(); - glob_hyp_len_ = glob_ref_len_ = glob_src_len_ = 0.; - } +/* + * smooth BLEU + * max is 0.9375 (with N=4) + * + * as in "An End-to-End Discriminative Approach + * to Machine Translation" + * (Liang et al. '06) + * + */ +class SmoothPerSentenceBleuScorer : public Scorer +{ + public: + SmoothPerSentenceBleuScorer(size_t n) : Scorer(n) {} + + weight_t + Score(const vector<WordID>& hyp, + const vector<Ngrams>& ref_ngs, + const vector<size_t>& ref_ls) + { + size_t hl=hyp.size(), rl=BestMatchLength(hl, ref_ls); + if (hl == 0 || rl == 0) return 0.; + NgramCounts counts = MakeNgramCounts(hyp, ref_ngs, N_); + size_t M = N_; + if (rl < N_) M = rl; + weight_t sum = 0.; + vector<weight_t> i_bleu; + for (size_t i=0; i < M; i++) + i_bleu.push_back(0.); + for (size_t i=0; i < M; i++) { + if (counts.sum_[i] == 0 || counts.clipped_[i] == 0) { + break; + } else { + weight_t i_score = log((weight_t)counts.clipped_[i]/counts.sum_[i]); + for (size_t j=i; j < M; j++) { + i_bleu[j] += (1/((weight_t)j+1)) * i_score; + } + } + sum += exp(i_bleu[i])/pow(2.0, (double)(N_-i+2)); + } - score_t Score(const vector<WordID>& hyp, const vector<WordID>& ref, const unsigned rank, const unsigned src_len); + return BrevityPenalty(hl, hl) * sum; + } }; -struct LinearBleuScorer : public BleuScorer +/* + * approx. bleu + * Needs some more code in dtrain.cc . + * We do not scaling by source lengths, as hypotheses are compared only + * within an kbest list, not globally. + * + * as in "Online Large-Margin Training of Syntactic + * and Structural Translation Features" + * (Chiang et al. '08) + * + + */ +class ApproxBleuScorer : public Scorer { - unsigned onebest_len_; - NgramCounts onebest_counts_; - - LinearBleuScorer(unsigned N) : onebest_len_(1), onebest_counts_(N) - { - onebest_counts_.One(); - } + private: + NgramCounts context; + weight_t hyp_sz_sum; + weight_t ref_sz_sum; + + public: + ApproxBleuScorer(size_t n) : + Scorer(n), context(n), hyp_sz_sum(0), ref_sz_sum(0) {} + + weight_t + Score(const vector<WordID>& hyp, + const vector<Ngrams>& ref_ngs, + const vector<size_t>& ref_ls) + { + size_t hl, rl, M; + vector<weight_t> v; + NgramCounts counts; + if (!Init(hyp, ref_ngs, ref_ls, hl, rl, M, v, counts)) + return 0.; + counts += context; + weight_t sum = 0; + for (size_t i = 0; i < M; i++) { + if (counts.sum_[i] == 0 || counts.clipped_[i] == 0) return 0.; + sum += v[i] * log((weight_t)counts.clipped_[i]/counts.sum_[i]); + } - score_t Score(const vector<WordID>& hyp, const vector<WordID>& ref, const unsigned rank, const unsigned /*src_len*/); + return BrevityPenalty(hyp_sz_sum+hl, ref_sz_sum+rl) * exp(sum); + } - inline void Reset() { - onebest_len_ = 1; - onebest_counts_.One(); - } + void + UpdateContext(const vector<WordID>& hyp, + const vector<Ngrams>& ref_ngs, + const vector<size_t>& ref_ls, + weight_t decay=0.9) + { + size_t hl, rl, M; + vector<weight_t> v; + NgramCounts counts; + Init(hyp, ref_ngs, ref_ls, hl, rl, M, v, counts); + + context += counts; + context *= decay; + hyp_sz_sum += hl; + hyp_sz_sum *= decay; + ref_sz_sum += rl; + ref_sz_sum *= decay; + } }; - } // namespace #endif diff --git a/training/dtrain/update.h b/training/dtrain/update.h new file mode 100644 index 00000000..83dc3186 --- /dev/null +++ b/training/dtrain/update.h @@ -0,0 +1,150 @@ +#ifndef _DTRAIN_UPDATE_H_ +#define _DTRAIN_UPDATE_H_ + +namespace dtrain +{ + +bool +_cmp(ScoredHyp a, ScoredHyp b) +{ + return a.gold > b.gold; +} + +bool +_cmpHope(ScoredHyp a, ScoredHyp b) +{ + return (a.model+a.gold) > (b.model+b.gold); +} + +bool +_cmpFear(ScoredHyp a, ScoredHyp b) +{ + return (a.model-a.gold) > (b.model-b.gold); +} + +inline bool +_good(ScoredHyp& a, ScoredHyp& b, weight_t margin) +{ + if ((a.model-b.model)>margin + || a.gold==b.gold) + return true; + + return false; +} + +inline bool +_goodS(ScoredHyp& a, ScoredHyp& b) +{ + if (a.gold==b.gold) + return true; + + return false; +} + +/* + * multipartite ranking + * sort (descending) by bleu + * compare top X (hi) to middle Y (med) and low X (lo) + * cmp middle Y to low X + */ +inline size_t +CollectUpdates(vector<ScoredHyp>* s, + SparseVector<weight_t>& updates, + weight_t margin=0.) +{ + size_t num_up = 0; + size_t sz = s->size(); + sort(s->begin(), s->end(), _cmp); + size_t sep = round(sz*0.1); + for (size_t i = 0; i < sep; i++) { + for (size_t j = sep; j < sz; j++) { + if (_good((*s)[i], (*s)[j], margin)) + continue; + updates += (*s)[i].f-(*s)[j].f; + num_up++; + } + } + size_t sep_lo = sz-sep; + for (size_t i = sep; i < sep_lo; i++) { + for (size_t j = sep_lo; j < sz; j++) { + if (_good((*s)[i], (*s)[j], margin)) + continue; + updates += (*s)[i].f-(*s)[j].f; + num_up++; + } + } + + return num_up; +} + +inline size_t +CollectUpdatesStruct(vector<ScoredHyp>* s, + SparseVector<weight_t>& updates, + weight_t unused=-1) +{ + // hope + sort(s->begin(), s->end(), _cmpHope); + ScoredHyp hope = (*s)[0]; + // fear + sort(s->begin(), s->end(), _cmpFear); + ScoredHyp fear = (*s)[0]; + if (!_goodS(hope, fear)) + updates += hope.f - fear.f; + + return updates.size(); +} + +inline void +OutputKbest(vector<ScoredHyp>* s) +{ + sort(s->begin(), s->end(), _cmp); + size_t i = 0; + for (auto k: *s) { + cout << i << "\t" << k.gold << "\t" << k.model << " \t" << k.f << endl; + i++; + } +} + +inline void +OutputMultipartitePairs(vector<ScoredHyp>* s, + weight_t margin=0., + bool all=true) +{ + size_t sz = s->size(); + sort(s->begin(), s->end(), _cmp); + size_t sep = round(sz*0.1); + for (size_t i = 0; i < sep; i++) { + for (size_t j = sep; j < sz; j++) { + if (!all && _good((*s)[i], (*s)[j], margin)) + continue; + cout << (*s)[i].f-(*s)[j].f << endl; + } + } + size_t sep_lo = sz-sep; + for (size_t i = sep; i < sep_lo; i++) { + for (size_t j = sep_lo; j < sz; j++) { + if (!all && _good((*s)[i], (*s)[j], margin)) + continue; + cout << (*s)[i].f-(*s)[j].f << endl; + } + } +} + +inline void +OutputAllPairs(vector<ScoredHyp>* s) +{ + size_t sz = s->size(); + sort(s->begin(), s->end(), _cmp); + for (size_t i = 0; i < sz-1; i++) { + for (size_t j = i+1; j < sz; j++) { + if ((*s)[i].gold == (*s)[j].gold) + continue; + cout << (*s)[i].f-(*s)[j].f << endl; + } + } +} + +} // namespace + +#endif + |