diff options
Diffstat (limited to 'dtrain')
35 files changed, 2615 insertions, 0 deletions
diff --git a/dtrain/Makefile.am b/dtrain/Makefile.am new file mode 100644 index 00000000..471977e1 --- /dev/null +++ b/dtrain/Makefile.am @@ -0,0 +1,7 @@ +bin_PROGRAMS = dtrain + +dtrain_SOURCES = dtrain.cc score.cc +dtrain_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz + +AM_CPPFLAGS = -W -Wall -Wno-sign-compare -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval -O3 + diff --git a/dtrain/README.md b/dtrain/README.md new file mode 100644 index 00000000..c39d94d2 --- /dev/null +++ b/dtrain/README.md @@ -0,0 +1,40 @@ +This is a really fast (parallelizable) tuning method for cdec as used here: + "Joint Feature Selection in Distributed Stochastic + Learning for Large-Scale Discriminative Training in + SMT" Simianer, Riezler, Dyer + ACL 2012 + + +Building +-------- +builds when building cdec, see ../BUILDING + +Running +------- +To run this on a dev set locally (default): +<code> +#define DTRAIN_LOCAL +</code> +otherwise remove that line or undef. You need a single grammar file +or per-sentence-grammars (psg) as you would use with cdec. +Additionally you need to give dtrain a file with +references (--refs). + +The input for use with hadoop streaming looks like this: +<code> +<id>\t<source>\t<ref>\t<grammar rules separated by tab> +</code> +To convert a psg to this format you need to replace all "\n" +by "\t". Make sure there are no tabs in your data. + +For an example of local usage (with 'distributed' format) +the see test/example/ . This expects dtrain to be built without +DTRAIN_LOCAL param. + +Legal stuff +----------- +Copyright (c) 2012 by Patrick Simianer <p@simianer.de> + +See the file ../LICENSE.txt for the licensing terms that this software is +released under. + diff --git a/dtrain/dtrain.cc b/dtrain/dtrain.cc new file mode 100644 index 00000000..fb6c6880 --- /dev/null +++ b/dtrain/dtrain.cc @@ -0,0 +1,634 @@ +#include "dtrain.h" + + +bool +dtrain_init(int argc, char** argv, po::variables_map* cfg) +{ + po::options_description ini("Configuration File Options"); + ini.add_options() + ("input", po::value<string>()->default_value("-"), "input file") + ("output", po::value<string>()->default_value("-"), "output weights file, '-' for STDOUT") + ("input_weights", po::value<string>(), "input weights file (e.g. from previous iteration)") + ("decoder_config", po::value<string>(), "configuration file for cdec") + ("print_weights", po::value<string>(), "weights to print on each iteration") + ("stop_after", po::value<unsigned>()->default_value(0), "stop after X input sentences") + ("tmp", po::value<string>()->default_value("/tmp"), "temp dir to use") + ("keep", po::value<bool>()->zero_tokens(), "keep weights files for each iteration") + ("hstreaming", po::value<string>(), "run in hadoop streaming mode, arg is a task id") + ("epochs", po::value<unsigned>()->default_value(10), "# of iterations T (per shard)") + ("k", po::value<unsigned>()->default_value(100), "how many translations to sample") + ("sample_from", po::value<string>()->default_value("kbest"), "where to sample translations from: 'kbest', 'forest'") + ("filter", po::value<string>()->default_value("uniq"), "filter kbest list: 'not', 'uniq'") + ("pair_sampling", po::value<string>()->default_value("108010"), "how to sample pairs: 'all', '108010' or 'PRO'") + ("pair_threshold", po::value<score_t>()->default_value(0), "bleu [0,1] threshold to filter pairs") + ("N", po::value<unsigned>()->default_value(4), "N for Ngrams (BLEU)") + ("scorer", po::value<string>()->default_value("stupid_bleu"), "scoring: bleu, stupid_, smooth_, approx_") + ("learning_rate", po::value<weight_t>()->default_value(0.0001), "learning rate") + ("gamma", po::value<weight_t>()->default_value(0), "gamma for SVM (0 for perceptron)") + ("select_weights", po::value<string>()->default_value("last"), "output best, last, avg weights ('VOID' to throw away)") + ("rescale", po::value<bool>()->zero_tokens(), "rescale weight vector after each input") + ("l1_reg", po::value<string>()->default_value("none"), "apply l1 regularization as in 'Tsuroka et al' (2010)") + ("l1_reg_strength", po::value<weight_t>(), "l1 regularization strength") + ("funny", po::value<bool>()->zero_tokens(), "include correctly ranked pairs into updates") + ("fselect", po::value<weight_t>()->default_value(-1), "select top x percent of features after each epoch") +#ifdef DTRAIN_LOCAL + ("refs,r", po::value<string>(), "references in local mode") +#endif + ("noup", po::value<bool>()->zero_tokens(), "do not update weights"); + po::options_description cl("Command Line Options"); + cl.add_options() + ("config,c", po::value<string>(), "dtrain config file") + ("quiet,q", po::value<bool>()->zero_tokens(), "be quiet") + ("verbose,v", po::value<bool>()->zero_tokens(), "be verbose"); + cl.add(ini); + po::store(parse_command_line(argc, argv, cl), *cfg); + if (cfg->count("config")) { + ifstream ini_f((*cfg)["config"].as<string>().c_str()); + po::store(po::parse_config_file(ini_f, ini), *cfg); + } + po::notify(*cfg); + if (!cfg->count("decoder_config")) { + cerr << cl << endl; + return false; + } + if (cfg->count("hstreaming") && (*cfg)["output"].as<string>() != "-") { + cerr << "When using 'hstreaming' the 'output' param should be '-'." << endl; + return false; + } +#ifdef DTRAIN_LOCAL + if ((*cfg)["input"].as<string>() == "-") { + cerr << "Can't use stdin as input with this binary. Recompile without DTRAIN_LOCAL" << endl; + return false; + } +#endif + if ((*cfg)["sample_from"].as<string>() != "kbest" + && (*cfg)["sample_from"].as<string>() != "forest") { + cerr << "Wrong 'sample_from' param: '" << (*cfg)["sample_from"].as<string>() << "', use 'kbest' or 'forest'." << endl; + return false; + } + if ((*cfg)["sample_from"].as<string>() == "kbest" && (*cfg)["filter"].as<string>() != "uniq" && + (*cfg)["filter"].as<string>() != "not") { + cerr << "Wrong 'filter' param: '" << (*cfg)["filter"].as<string>() << "', use 'uniq' or 'not'." << endl; + return false; + } + if ((*cfg)["pair_sampling"].as<string>() != "all" && (*cfg)["pair_sampling"].as<string>() != "108010" && + (*cfg)["pair_sampling"].as<string>() != "PRO") { + cerr << "Wrong 'pair_sampling' param: '" << (*cfg)["pair_sampling"].as<string>() << "'." << endl; + return false; + } + if ((*cfg)["pair_threshold"].as<score_t>() < 0) { + cerr << "The threshold must be >= 0!" << endl; + return false; + } + if ((*cfg)["select_weights"].as<string>() != "last" && (*cfg)["select_weights"].as<string>() != "best" && + (*cfg)["select_weights"].as<string>() != "avg" && (*cfg)["select_weights"].as<string>() != "VOID") { + cerr << "Wrong 'select_weights' param: '" << (*cfg)["select_weights"].as<string>() << "', use 'last' or 'best'." << endl; + return false; + } + return true; +} + +int +main(int argc, char** argv) +{ + // handle most parameters + po::variables_map cfg; + if (!dtrain_init(argc, argv, &cfg)) exit(1); // something is wrong + bool quiet = false; + if (cfg.count("quiet")) quiet = true; + bool verbose = false; + if (cfg.count("verbose")) verbose = true; + bool noup = false; + if (cfg.count("noup")) noup = true; + bool hstreaming = false; + string task_id; + if (cfg.count("hstreaming")) { + hstreaming = true; + quiet = true; + task_id = cfg["hstreaming"].as<string>(); + cerr.precision(17); + } + bool rescale = false; + if (cfg.count("rescale")) rescale = true; + HSReporter rep(task_id); + bool keep = false; + if (cfg.count("keep")) keep = true; + bool funny = false; + if (cfg.count("funny")) + funny = true; + + const unsigned k = cfg["k"].as<unsigned>(); + const unsigned N = cfg["N"].as<unsigned>(); + const unsigned T = cfg["epochs"].as<unsigned>(); + const unsigned stop_after = cfg["stop_after"].as<unsigned>(); + const string filter_type = cfg["filter"].as<string>(); + const string sample_from = cfg["sample_from"].as<string>(); + const string pair_sampling = cfg["pair_sampling"].as<string>(); + const score_t pair_threshold = cfg["pair_threshold"].as<score_t>(); + const string select_weights = cfg["select_weights"].as<string>(); + bool average = false; + if (select_weights == "avg") + average = true; + vector<string> print_weights; + if (cfg.count("print_weights")) + boost::split(print_weights, cfg["print_weights"].as<string>(), boost::is_any_of(" ")); + + // setup decoder + register_feature_functions(); + SetSilent(true); + ReadFile ini_rf(cfg["decoder_config"].as<string>()); + if (!quiet) + cerr << setw(25) << "cdec cfg " << "'" << cfg["decoder_config"].as<string>() << "'" << endl; + Decoder decoder(ini_rf.stream()); + + // scoring metric/scorer + string scorer_str = cfg["scorer"].as<string>(); + LocalScorer* scorer; + if (scorer_str == "bleu") { + scorer = dynamic_cast<BleuScorer*>(new BleuScorer); + } else if (scorer_str == "stupid_bleu") { + scorer = dynamic_cast<StupidBleuScorer*>(new StupidBleuScorer); + } else if (scorer_str == "smooth_bleu") { + scorer = dynamic_cast<SmoothBleuScorer*>(new SmoothBleuScorer); + } else if (scorer_str == "approx_bleu") { + scorer = dynamic_cast<ApproxBleuScorer*>(new ApproxBleuScorer(N)); + } else { + cerr << "Don't know scoring metric: '" << scorer_str << "', exiting." << endl; + exit(1); + } + vector<score_t> bleu_weights; + scorer->Init(N, bleu_weights); + if (!quiet) cerr << setw(26) << "scorer '" << scorer_str << "'" << endl << endl; + + // setup decoder observer + MT19937 rng; // random number generator + HypSampler* observer; + if (sample_from == "kbest") + observer = dynamic_cast<KBestGetter*>(new KBestGetter(k, filter_type)); + else + observer = dynamic_cast<KSampler*>(new KSampler(k, &rng)); + observer->SetScorer(scorer); + + // init weights + vector<weight_t>& dense_weights = decoder.CurrentWeightVector(); + SparseVector<weight_t> lambdas, cumulative_penalties, w_average; + if (cfg.count("input_weights")) Weights::InitFromFile(cfg["input_weights"].as<string>(), &dense_weights); + Weights::InitSparseVector(dense_weights, &lambdas); + + // meta params for perceptron, SVM + weight_t eta = cfg["learning_rate"].as<weight_t>(); + weight_t gamma = cfg["gamma"].as<weight_t>(); + + // l1 regularization + bool l1naive = false; + bool l1clip = false; + bool l1cumul = false; + weight_t l1_reg = 0; + if (cfg["l1_reg"].as<string>() != "none") { + string s = cfg["l1_reg"].as<string>(); + if (s == "naive") l1naive = true; + else if (s == "clip") l1clip = true; + else if (s == "cumul") l1cumul = true; + l1_reg = cfg["l1_reg_strength"].as<weight_t>(); + } + + // output + string output_fn = cfg["output"].as<string>(); + // input + string input_fn = cfg["input"].as<string>(); + ReadFile input(input_fn); + // buffer input for t > 0 + vector<string> src_str_buf; // source strings (decoder takes only strings) + vector<vector<WordID> > ref_ids_buf; // references as WordID vecs + // where temp files go + string tmp_path = cfg["tmp"].as<string>(); +#ifdef DTRAIN_LOCAL + string refs_fn = cfg["refs"].as<string>(); + ReadFile refs(refs_fn); +#else + string grammar_buf_fn = gettmpf(tmp_path, "dtrain-grammars"); + ogzstream grammar_buf_out; + grammar_buf_out.open(grammar_buf_fn.c_str()); +#endif + + unsigned in_sz = UINT_MAX; // input index, input size + vector<pair<score_t, score_t> > all_scores; + score_t max_score = 0.; + unsigned best_it = 0; + float overall_time = 0.; + unsigned pair_count = 0, feature_count = 0; + + // output cfg + if (!quiet) { + cerr << _p5; + cerr << endl << "dtrain" << endl << "Parameters:" << endl; + cerr << setw(25) << "k " << k << endl; + cerr << setw(25) << "N " << N << endl; + cerr << setw(25) << "T " << T << endl; + cerr << setw(25) << "sample from " << "'" << sample_from << "'" << endl; + if (sample_from == "kbest") + cerr << setw(25) << "filter " << "'" << filter_type << "'" << endl; + cerr << setw(25) << "learning rate " << eta << endl; + cerr << setw(25) << "gamma " << gamma << endl; + cerr << setw(25) << "pairs " << "'" << pair_sampling << "'" << endl; + cerr << setw(25) << "pair threshold " << pair_threshold << endl; + cerr << setw(25) << "select weights " << "'" << select_weights << "'" << endl; + if (cfg.count("l1_reg")) + cerr << setw(25) << "l1 reg " << l1_reg << " '" << cfg["l1_reg"].as<string>() << "'" << endl; + if (funny) + cerr << setw(25) << "funny " << funny << endl; + if (rescale) + cerr << setw(25) << "rescale " << rescale << endl; + cerr << setw(25) << "cdec cfg " << "'" << cfg["decoder_config"].as<string>() << "'" << endl; + cerr << setw(25) << "input " << "'" << input_fn << "'" << endl; +#ifdef DTRAIN_LOCAL + cerr << setw(25) << "refs " << "'" << refs_fn << "'" << endl; +#endif + cerr << setw(25) << "output " << "'" << output_fn << "'" << endl; + if (cfg.count("input_weights")) + cerr << setw(25) << "weights in" << cfg["input_weights"].as<string>() << endl; + if (cfg.count("stop-after")) + cerr << setw(25) << "stop_after " << stop_after << endl; + if (!verbose) cerr << "(a dot represents " << DTRAIN_DOTS << " lines of input)" << endl; + } + + + for (unsigned t = 0; t < T; t++) // T epochs + { + + if (hstreaming) cerr << "reporter:status:Iteration #" << t+1 << " of " << T << endl; + + time_t start, end; + time(&start); +#ifndef DTRAIN_LOCAL + igzstream grammar_buf_in; + if (t > 0) grammar_buf_in.open(grammar_buf_fn.c_str()); +#endif + score_t score_sum = 0.; + score_t model_sum(0); + unsigned ii = 0, rank_errors = 0, margin_violations = 0, npairs = 0; + if (!quiet) cerr << "Iteration #" << t+1 << " of " << T << "." << endl; + + while(true) + { + + string in; + bool next = false, stop = false; // next iteration or premature stop + if (t == 0) { + if(!getline(*input, in)) next = true; + } else { + if (ii == in_sz) next = true; // stop if we reach the end of our input + } + // stop after X sentences (but still iterate for those) + if (stop_after > 0 && stop_after == ii && !next) stop = true; + + // produce some pretty output + if (!quiet && !verbose) { + if (ii == 0) cerr << " "; + if ((ii+1) % (DTRAIN_DOTS) == 0) { + cerr << "."; + cerr.flush(); + } + if ((ii+1) % (20*DTRAIN_DOTS) == 0) { + cerr << " " << ii+1 << endl; + if (!next && !stop) cerr << " "; + } + if (stop) { + if (ii % (20*DTRAIN_DOTS) != 0) cerr << " " << ii << endl; + cerr << "Stopping after " << stop_after << " input sentences." << endl; + } else { + if (next) { + if (ii % (20*DTRAIN_DOTS) != 0) cerr << " " << ii << endl; + } + } + } + + // next iteration + if (next || stop) break; + + // weights + lambdas.init_vector(&dense_weights); + + // getting input + vector<WordID> ref_ids; // reference as vector<WordID> +#ifndef DTRAIN_LOCAL + vector<string> in_split; // input: sid\tsrc\tref\tpsg + if (t == 0) { + // handling input + split_in(in, in_split); + if (hstreaming && ii == 0) cerr << "reporter:counter:" << task_id << ",First ID," << in_split[0] << endl; + // getting reference + vector<string> ref_tok; + boost::split(ref_tok, in_split[2], boost::is_any_of(" ")); + register_and_convert(ref_tok, ref_ids); + ref_ids_buf.push_back(ref_ids); + // process and set grammar + bool broken_grammar = true; + for (string::iterator it = in.begin(); it != in.end(); it++) { + if (!isspace(*it)) { + broken_grammar = false; + break; + } + } + if (broken_grammar) continue; + boost::replace_all(in, "\t", "\n"); + in += "\n"; + grammar_buf_out << in << DTRAIN_GRAMMAR_DELIM << " " << in_split[0] << endl; + decoder.SetSentenceGrammarFromString(in); + src_str_buf.push_back(in_split[1]); + // decode + observer->SetRef(ref_ids); + decoder.Decode(in_split[1], observer); + } else { + // get buffered grammar + string grammar_str; + while (true) { + string rule; + getline(grammar_buf_in, rule); + if (boost::starts_with(rule, DTRAIN_GRAMMAR_DELIM)) break; + grammar_str += rule + "\n"; + } + decoder.SetSentenceGrammarFromString(grammar_str); + // decode + observer->SetRef(ref_ids_buf[ii]); + decoder.Decode(src_str_buf[ii], observer); + } +#else + if (t == 0) { + string r_; + getline(*refs, r_); + vector<string> ref_tok; + boost::split(ref_tok, r_, boost::is_any_of(" ")); + register_and_convert(ref_tok, ref_ids); + ref_ids_buf.push_back(ref_ids); + src_str_buf.push_back(in); + } else { + ref_ids = ref_ids_buf[ii]; + } + observer->SetRef(ref_ids); + if (t == 0) + decoder.Decode(in, observer); + else + decoder.Decode(src_str_buf[ii], observer); +#endif + + // get (scored) samples + vector<ScoredHyp>* samples = observer->GetSamples(); + + if (verbose) { + cerr << "--- ref for " << ii << ": "; + if (t > 0) printWordIDVec(ref_ids_buf[ii]); + else printWordIDVec(ref_ids); + cerr << endl; + for (unsigned u = 0; u < samples->size(); u++) { + cerr << _p5 << _np << "[" << u << ". '"; + printWordIDVec((*samples)[u].w); + cerr << "'" << endl; + cerr << "SCORE=" << (*samples)[u].score << ",model="<< (*samples)[u].model << endl; + cerr << "F{" << (*samples)[u].f << "} ]" << endl << endl; + } + } + + score_sum += (*samples)[0].score; + model_sum += (*samples)[0].model; + + // weight updates + if (!noup) { + vector<pair<ScoredHyp,ScoredHyp> > pairs; + if (pair_sampling == "all") + all_pairs(samples, pairs, pair_threshold); + if (pair_sampling == "108010") + part108010(samples, pairs, pair_threshold); + if (pair_sampling == "PRO") + PROsampling(samples, pairs); + npairs += pairs.size(); + pair_count += 2*pairs.size(); + + for (vector<pair<ScoredHyp,ScoredHyp> >::iterator it = pairs.begin(); + it != pairs.end(); it++) { + score_t rank_error = it->second.score - it->first.score; + feature_count += it->first.f.size() + it->second.f.size(); + if (!gamma) { + // perceptron + if (rank_error > 0) { + SparseVector<weight_t> diff_vec = it->second.f - it->first.f; + lambdas.plus_eq_v_times_s(diff_vec, eta); + rank_errors++; + } else { + if (funny) { + SparseVector<weight_t> diff_vec = it->first.f - it->second.f; + lambdas.plus_eq_v_times_s(diff_vec, eta); + } + } + if (it->first.model - it->second.model < 1) margin_violations++; + } else { + // SVM + score_t margin = it->first.model - it->second.model; + if (rank_error > 0 || margin < 1) { + SparseVector<weight_t> diff_vec = it->second.f - it->first.f; + lambdas.plus_eq_v_times_s(diff_vec, eta); + if (rank_error > 0) rank_errors++; + if (margin < 1) margin_violations++; + } + // regularization + lambdas.plus_eq_v_times_s(lambdas, -2*gamma*eta*(1./npairs)); + } + } + + // l1 regularization + if (l1naive) { + for (unsigned d = 0; d < lambdas.size(); d++) { + weight_t v = lambdas.get(d); + lambdas.set_value(d, v - sign(v) * l1_reg); + } + } else if (l1clip) { + for (unsigned d = 0; d < lambdas.size(); d++) { + if (lambdas.nonzero(d)) { + weight_t v = lambdas.get(d); + if (v > 0) { + lambdas.set_value(d, max(0., v - l1_reg)); + } else { + lambdas.set_value(d, min(0., v + l1_reg)); + } + } + } + } else if (l1cumul) { + weight_t acc_penalty = (ii+1) * l1_reg; // ii is the index of the current input + for (unsigned d = 0; d < lambdas.size(); d++) { + if (lambdas.nonzero(d)) { + weight_t v = lambdas.get(d); + weight_t penalty = 0; + if (v > 0) { + penalty = max(0., v-(acc_penalty + cumulative_penalties.get(d))); + } else { + penalty = min(0., v+(acc_penalty - cumulative_penalties.get(d))); + } + lambdas.set_value(d, penalty); + cumulative_penalties.set_value(d, cumulative_penalties.get(d)+penalty); + } + } + } + + } + + if (rescale) lambdas /= lambdas.l2norm(); + + ++ii; + + if (hstreaming) { + rep.update_counter("Seen #"+boost::lexical_cast<string>(t+1), 1u); + rep.update_counter("Seen", 1u); + } + + } // input loop + + if (average) w_average += lambdas; + + if (scorer_str == "approx_bleu") scorer->Reset(); + + if (t == 0) { + in_sz = ii; // remember size of input (# lines) + if (hstreaming) { + rep.update_counter("|Input|", ii); + rep.update_gcounter("|Input|", ii); + rep.update_gcounter("Shards", 1u); + } + } + +#ifndef DTRAIN_LOCAL + if (t == 0) { + grammar_buf_out.close(); + } else { + grammar_buf_in.close(); + } +#endif + + // print some stats + score_t score_avg = score_sum/(score_t)in_sz; + score_t model_avg = model_sum/(score_t)in_sz; + score_t score_diff, model_diff; + if (t > 0) { + score_diff = score_avg - all_scores[t-1].first; + model_diff = model_avg - all_scores[t-1].second; + } else { + score_diff = score_avg; + model_diff = model_avg; + } + + unsigned nonz; + if (!quiet || hstreaming) nonz = (unsigned)lambdas.size_nonzero(); + + if (!quiet) { + cerr << _p9 << _p << "WEIGHTS" << endl; + for (vector<string>::iterator it = print_weights.begin(); it != print_weights.end(); it++) { + cerr << setw(18) << *it << " = " << lambdas.get(FD::Convert(*it)) << endl; + } + cerr << " ---" << endl; + cerr << _np << " 1best avg score: " << score_avg; + cerr << _p << " (" << score_diff << ")" << endl; + cerr << _np << "1best avg model score: " << model_avg; + cerr << _p << " (" << model_diff << ")" << endl; + cerr << " avg #pairs: "; + cerr << _np << npairs/(float)in_sz << endl; + cerr << " avg #rank err: "; + cerr << rank_errors/(float)in_sz << endl; + cerr << " avg #margin viol: "; + cerr << margin_violations/(float)in_sz << endl; + cerr << " non0 feature count: " << nonz << endl; + cerr << " avg f count: "; + cerr << feature_count/(float)pair_count << endl; + } + + if (hstreaming) { + rep.update_counter("Score 1best avg #"+boost::lexical_cast<string>(t+1), (unsigned)(score_avg*DTRAIN_SCALE)); + rep.update_counter("Model 1best avg #"+boost::lexical_cast<string>(t+1), (unsigned)(model_avg*DTRAIN_SCALE)); + rep.update_counter("Pairs avg #"+boost::lexical_cast<string>(t+1), (unsigned)((npairs/(weight_t)in_sz)*DTRAIN_SCALE)); + rep.update_counter("Rank errors avg #"+boost::lexical_cast<string>(t+1), (unsigned)((rank_errors/(weight_t)in_sz)*DTRAIN_SCALE)); + rep.update_counter("Margin violations avg #"+boost::lexical_cast<string>(t+1), (unsigned)((margin_violations/(weight_t)in_sz)*DTRAIN_SCALE)); + rep.update_counter("Non zero feature count #"+boost::lexical_cast<string>(t+1), nonz); + rep.update_gcounter("Non zero feature count #"+boost::lexical_cast<string>(t+1), nonz); + } + + pair<score_t,score_t> remember; + remember.first = score_avg; + remember.second = model_avg; + all_scores.push_back(remember); + if (score_avg > max_score) { + max_score = score_avg; + best_it = t; + } + time (&end); + float time_diff = difftime(end, start); + overall_time += time_diff; + if (!quiet) { + cerr << _p2 << _np << "(time " << time_diff/60. << " min, "; + cerr << time_diff/(float)in_sz<< " s/S)" << endl; + } + if (t+1 != T && !quiet) cerr << endl; + + if (noup) break; + + // write weights to file + if (select_weights == "best" || keep) { + lambdas.init_vector(&dense_weights); + string w_fn = "weights." + boost::lexical_cast<string>(t) + ".gz"; + Weights::WriteToFile(w_fn, dense_weights, true); + } + + } // outer loop + + if (average) w_average /= (weight_t)T; + +#ifndef DTRAIN_LOCAL + unlink(grammar_buf_fn.c_str()); +#endif + + if (!noup) { + if (!quiet) cerr << endl << "Writing weights file to '" << output_fn << "' ..." << endl; + if (select_weights == "last" || average) { // last, average + WriteFile of(output_fn); // works with '-' + ostream& o = *of.stream(); + o.precision(17); + o << _np; + if (average) { + for (SparseVector<weight_t>::const_iterator it = w_average.begin(); it != w_average.end(); ++it) { + if (it->second == 0) continue; + o << FD::Convert(it->first) << '\t' << it->second << endl; + } + } else { + for (SparseVector<weight_t>::const_iterator it = lambdas.begin(); it != lambdas.end(); ++it) { + if (it->second == 0) continue; + o << FD::Convert(it->first) << '\t' << it->second << endl; + } + } + } else if (select_weights == "VOID") { // do nothing with the weights + } else { // best + if (output_fn != "-") { + CopyFile("weights."+boost::lexical_cast<string>(best_it)+".gz", output_fn); + } else { + ReadFile bestw("weights."+boost::lexical_cast<string>(best_it)+".gz"); + string o; + cout.precision(17); + cout << _np; + while(getline(*bestw, o)) cout << o << endl; + } + if (!keep) { + for (unsigned i = 0; i < T; i++) { + string s = "weights." + boost::lexical_cast<string>(i) + ".gz"; + unlink(s.c_str()); + } + } + } + if (output_fn == "-" && hstreaming) cout << "__SHARD_COUNT__\t1" << endl; + if (!quiet) cerr << "done" << endl; + } + + if (!quiet) { + cerr << _p5 << _np << endl << "---" << endl << "Best iteration: "; + cerr << best_it+1 << " [SCORE '" << scorer_str << "'=" << max_score << "]." << endl; + cerr << _p2 << "This took " << overall_time/60. << " min." << endl; + } + + return 0; +} + diff --git a/dtrain/dtrain.h b/dtrain/dtrain.h new file mode 100644 index 00000000..59ceb6f6 --- /dev/null +++ b/dtrain/dtrain.h @@ -0,0 +1,89 @@ +#ifndef _DTRAIN_COMMON_H_ +#define _DTRAIN_COMMON_H_ + +#include <iomanip> +#include <climits> +#include <string.h> + +#include <boost/algorithm/string.hpp> +#include <boost/program_options.hpp> + +#include "ksampler.h" +#include "pairsampling.h" + +#include "filelib.h" + +//#define DTRAIN_LOCAL + +#define DTRAIN_DOTS 10 // when to display a '.' +#define DTRAIN_GRAMMAR_DELIM "########EOS########" +#define DTRAIN_SCALE 100000 + +using namespace std; +using namespace dtrain; +namespace po = boost::program_options; + +inline void register_and_convert(const vector<string>& strs, vector<WordID>& ids) { + vector<string>::const_iterator it; + for (it = strs.begin(); it < strs.end(); it++) + ids.push_back(TD::Convert(*it)); +} + +inline string gettmpf(const string path, const string infix) { + char fn[1024]; + strcpy(fn, path.c_str()); + strcat(fn, "/"); + strcat(fn, infix.c_str()); + strcat(fn, "-XXXXXX"); + mkstemp(fn); + return string(fn); +} + +inline void split_in(string& s, vector<string>& parts) +{ + unsigned f = 0; + for(unsigned i = 0; i < 3; i++) { + unsigned e = f; + f = s.find("\t", f+1); + if (e != 0) parts.push_back(s.substr(e+1, f-e-1)); + else parts.push_back(s.substr(0, f)); + } + s.erase(0, f+1); +} + +struct HSReporter +{ + string task_id_; + + HSReporter(string task_id) : task_id_(task_id) {} + + inline void update_counter(string name, unsigned amount) { + cerr << "reporter:counter:" << task_id_ << "," << name << "," << amount << endl; + } + inline void update_gcounter(string name, unsigned amount) { + cerr << "reporter:counter:Global," << name << "," << amount << endl; + } +}; + +inline ostream& _np(ostream& out) { return out << resetiosflags(ios::showpos); } +inline ostream& _p(ostream& out) { return out << setiosflags(ios::showpos); } +inline ostream& _p2(ostream& out) { return out << setprecision(2); } +inline ostream& _p5(ostream& out) { return out << setprecision(5); } +inline ostream& _p9(ostream& out) { return out << setprecision(9); } + +inline void printWordIDVec(vector<WordID>& v) +{ + for (unsigned i = 0; i < v.size(); i++) { + cerr << TD::Convert(v[i]); + if (i < v.size()-1) cerr << " "; + } +} + +template<typename T> +inline T sign(T z) { + if (z == 0) return 0; + return z < 0 ? -1 : +1; +} + +#endif + diff --git a/dtrain/hstreaming/avg.rb b/dtrain/hstreaming/avg.rb new file mode 100755 index 00000000..91d4e29a --- /dev/null +++ b/dtrain/hstreaming/avg.rb @@ -0,0 +1,31 @@ +#!/usr/bin/env ruby + +shard_count_key = "__SHARD_COUNT__" + +STDIN.set_encoding 'utf-8' +STDOUT.set_encoding 'utf-8' + +w = {} +c = {} +w.default = 0 +c.default = 0 +while line = STDIN.gets + key, val = line.split /\s/ + w[key] += val.to_f + c[key] += 1 +end + +if ARGV.size == 0 + shard_count = w["__SHARD_COUNT__"] +else + shard_count = ARGV[0].to_f +end +w.each_key { |k| + if k == shard_count_key + puts "# shard count: #{shard_count.to_i}" + else + puts "#{k}\t#{w[k]/shard_count}" + puts "# #{c[k]}" + end +} + diff --git a/dtrain/hstreaming/cdec.ini b/dtrain/hstreaming/cdec.ini new file mode 100644 index 00000000..61f13e86 --- /dev/null +++ b/dtrain/hstreaming/cdec.ini @@ -0,0 +1,21 @@ +formalism=scfg +add_pass_through_rules=true +scfg_max_span_limit=15 +intersection_strategy=cube_pruning +cubepruning_pop_limit=200 +feature_function=WordPenalty +feature_function=KLanguageModel nc-wmt11.en.srilm.gz +#feature_function=ArityPenalty +#feature_function=CMR2008ReorderingFeatures +#feature_function=InputIndicator +#feature_function=LexNullJump +#feature_function=NewJump +#feature_function=NgramFeatures +#feature_function=NonLatinCount +#feature_function=OutputIndicator +#feature_function=RuleIdentityFeatures +#feature_function=RuleNgramFeatures +#feature_function=RuleShape +#feature_function=SourceSpanSizeFeatures +#feature_function=SourceWordPenalty +#feature_function=SpanFeatures diff --git a/dtrain/hstreaming/dtrain.ini b/dtrain/hstreaming/dtrain.ini new file mode 100644 index 00000000..118a27c5 --- /dev/null +++ b/dtrain/hstreaming/dtrain.ini @@ -0,0 +1,15 @@ +input=- +output=- +decoder_config=cdec.ini +tmp=/var/hadoop/mapred/local/ +epochs=10 +k=100 +N=4 +learning_rate=0.0001 +gamma=0.00001 +scorer=stupid_bleu +sample_from=kbest +filter=uniq +pair_sampling=108010 +pair_threshold=0 +select_weights=last diff --git a/dtrain/hstreaming/dtrain.sh b/dtrain/hstreaming/dtrain.sh new file mode 100755 index 00000000..ea0276dd --- /dev/null +++ b/dtrain/hstreaming/dtrain.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +pushd . &>/dev/null +cd .. +ID=$(basename $(pwd)) # attempt_... +popd &>/dev/null +./dtrain -c dtrain.ini --hstreaming $ID + diff --git a/dtrain/hstreaming/hadoop-streaming-job.sh b/dtrain/hstreaming/hadoop-streaming-job.sh new file mode 100755 index 00000000..90c2b790 --- /dev/null +++ b/dtrain/hstreaming/hadoop-streaming-job.sh @@ -0,0 +1,31 @@ +#!/bin/sh + +EXP=a_simple_test + +# change these vars to fit your hadoop installation +HADOOP_HOME=/usr/lib/hadoop-0.20 +JAR=contrib/streaming/hadoop-streaming-0.20.2-cdh3u1.jar +HSTREAMING="$HADOOP_HOME/bin/hadoop jar $HADOOP_HOME/$JAR" +# ^^^ + + IN=input_on_hdfs +OUT=output_weights_on_hdfs + +# you can remove the -reducer line if you want to +# do feature selection/averaging locally (e.g. to +# keep weights of the iterations) +$HSTREAMING \ + -mapper "dtrain.sh" \ + -reducer "lplp.rb l2 select_k 100000" \ + -input $IN \ + -output $OUT \ + -file dtrain.sh \ + -file lplp.rb \ + -file ../dtrain \ + -file dtrain.ini \ + -file cdec.ini \ + -file ../test/example/nc-wmt11.en.srilm.gz \ + -jobconf mapred.reduce.tasks=30 \ + -jobconf mapred.max.map.failures.percent=0 \ + -jobconf mapred.job.name="dtrain $EXP" + diff --git a/dtrain/hstreaming/lplp.rb b/dtrain/hstreaming/lplp.rb new file mode 100755 index 00000000..40409bbd --- /dev/null +++ b/dtrain/hstreaming/lplp.rb @@ -0,0 +1,131 @@ +# lplp.rb + +# norms +def l0(feature_column, n) + if feature_column.size >= n then return 1 else return 0 end +end + +def l1(feature_column, n=-1) + return feature_column.map { |i| i.abs }.reduce { |sum,i| sum+i } +end + +def l2(feature_column, n=-1) + return Math.sqrt feature_column.map { |i| i.abs2 }.reduce { |sum,i| sum+i } +end + +def linfty(feature_column, n=-1) + return feature_column.map { |i| i.abs }.max +end + +# stats +def median(feature_column, n) + return feature_column.concat(0.step(n-feature_column.size-1).map{|i|0}).sort[feature_column.size/2] +end + +def mean(feature_column, n) + return feature_column.reduce { |sum, i| sum+i } / n +end + +# selection +def select_k(weights, norm_fun, n, k=10000) + weights.sort{|a,b| norm_fun.call(b[1], n) <=> norm_fun.call(a[1], n)}.each { |p| + puts "#{p[0]}\t#{mean(p[1], n)}" + k -= 1 + if k == 0 then break end + } +end + +def cut(weights, norm_fun, n, epsilon=0.0001) + weights.each { |k,v| + if norm_fun.call(v, n).abs >= epsilon + puts "#{k}\t#{mean(v, n)}" + end + } +end + +# test +def _test() + puts + w = {} + w["a"] = [1, 2, 3] + w["b"] = [1, 2] + w["c"] = [66] + w["d"] = [10, 20, 30] + n = 3 + puts w.to_s + puts + puts "select_k" + puts "l0 expect ad" + select_k(w, method(:l0), n, 2) + puts "l1 expect cd" + select_k(w, method(:l1), n, 2) + puts "l2 expect c" + select_k(w, method(:l2), n, 1) + puts + puts "cut" + puts "l1 expect cd" + cut(w, method(:l1), n, 7) + puts + puts "median" + a = [1,2,3,4,5] + puts a.to_s + puts median(a, 5) + puts + puts "#{median(a, 7)} <- that's because we add missing 0s:" + puts a.concat(0.step(7-a.size-1).map{|i|0}).to_s + puts + puts "mean expect bc" + w.clear + w["a"] = [2] + w["b"] = [2.1] + w["c"] = [2.2] + cut(w, method(:mean), 1, 2.05) + exit +end +_test() + +# actually do something +def usage() + puts "lplp.rb <l0,l1,l2,linfty,mean,median> <cut|select_k> <k|threshold> [n] < <input>" + puts " l0...: norms for selection" + puts "select_k: only output top k (according to the norm of their column vector) features" + puts " cut: output features with weight >= threshold" + puts " n: if we do not have a shard count use this number for averaging" + exit +end + +if ARGV.size < 3 then usage end +norm_fun = method(ARGV[0].to_sym) +type = ARGV[1] +x = ARGV[2].to_f + +shard_count_key = "__SHARD_COUNT__" + +STDIN.set_encoding 'utf-8' +STDOUT.set_encoding 'utf-8' + +w = {} +shard_count = 0 +while line = STDIN.gets + key, val = line.split /\t/ + if key == shard_count_key + shard_count += 1 + next + end + if w.has_key? key + w[key].push val.to_f + else + w[key] = [val.to_f] + end +end + +if ARGV.size == 4 then shard_count = ARGV[3].to_f end + +if type == 'cut' + cut(w, norm_fun, shard_count, x) +elsif type == 'select_k' + select_k(w, norm_fun, shard_count, x) +else + puts "oh oh" +end + diff --git a/dtrain/hstreaming/red-test b/dtrain/hstreaming/red-test new file mode 100644 index 00000000..2623d697 --- /dev/null +++ b/dtrain/hstreaming/red-test @@ -0,0 +1,9 @@ +a 1 +b 2 +c 3.5 +a 1 +b 2 +c 3.5 +d 1 +e 2 +__SHARD_COUNT__ 2 diff --git a/dtrain/hstreaming/rule_count/map.sh b/dtrain/hstreaming/rule_count/map.sh new file mode 100755 index 00000000..ae75fece --- /dev/null +++ b/dtrain/hstreaming/rule_count/map.sh @@ -0,0 +1,4 @@ +#!/bin/sh + +ruby rulecount.rb | sort | ruby red.rb + diff --git a/dtrain/hstreaming/rule_count/red.rb b/dtrain/hstreaming/rule_count/red.rb new file mode 100644 index 00000000..874ae7ac --- /dev/null +++ b/dtrain/hstreaming/rule_count/red.rb @@ -0,0 +1,24 @@ +#!/usr/bin/env ruby + +STDIN.set_encoding 'utf-8' +STDOUT.set_encoding 'utf-8' + +def output(key, val) + puts "#{key}\t#{val}" +end + +prev_key = nil +sum = 0 +while line = STDIN.gets + key, val = line.strip.split /\t/ + if key != prev_key && sum > 0 + output prev_key, sum + prev_key = key + sum = 0 + elsif !prev_key + prev_key = key + end + sum += val.to_i +end +output prev_key, sum + diff --git a/dtrain/hstreaming/rule_count/rulecount.rb b/dtrain/hstreaming/rule_count/rulecount.rb new file mode 100644 index 00000000..67361fa4 --- /dev/null +++ b/dtrain/hstreaming/rule_count/rulecount.rb @@ -0,0 +1,13 @@ +#!/usr/bin/env ruby + +STDIN.set_encoding 'utf-8' +STDOUT.set_encoding 'utf-8' + +while line = STDIN.gets + a = line.strip.chomp.split "\t" + a[3..a.size].each { |r| + id = r.split("|||")[0..2].join("|||").to_s.strip.gsub("\s", "_") + puts "#{id}\t1" + } +end + diff --git a/dtrain/hstreaming/rule_count/test b/dtrain/hstreaming/rule_count/test new file mode 100644 index 00000000..acd00a5e --- /dev/null +++ b/dtrain/hstreaming/rule_count/test @@ -0,0 +1,8 @@ +a 1 +a 1 +a 1 +b 1 +b 1 +c 1 +d 1 +a 1 diff --git a/dtrain/kbestget.h b/dtrain/kbestget.h new file mode 100644 index 00000000..1b96bbf4 --- /dev/null +++ b/dtrain/kbestget.h @@ -0,0 +1,141 @@ +#ifndef _DTRAIN_KBESTGET_H_ +#define _DTRAIN_KBESTGET_H_ + +#include "kbest.h" // cdec +#include "verbose.h" +#include "viterbi.h" +#include "ff_register.h" +#include "decoder.h" +#include "weights.h" +#include "logval.h" + +using namespace std; + +namespace dtrain +{ + + +typedef double score_t; + +struct ScoredHyp +{ + vector<WordID> w; + SparseVector<double> f; + score_t model; + score_t score; + unsigned rank; +}; + +struct LocalScorer +{ + unsigned N_; + vector<score_t> w_; + + virtual score_t + Score(vector<WordID>& hyp, vector<WordID>& ref, const unsigned rank)=0; + + void Reset() {} // only for approx bleu + + inline void + Init(unsigned N, vector<score_t> weights) + { + assert(N > 0); + N_ = N; + if (weights.empty()) for (unsigned i = 0; i < N_; i++) w_.push_back(1./N_); + else w_ = weights; + } + + inline score_t + brevity_penaly(const unsigned hyp_len, const unsigned ref_len) + { + if (hyp_len > ref_len) return 1; + return exp(1 - (score_t)ref_len/hyp_len); + } +}; + +struct HypSampler : public DecoderObserver +{ + LocalScorer* scorer_; + vector<WordID>* ref_; + virtual vector<ScoredHyp>* GetSamples()=0; + inline void SetScorer(LocalScorer* scorer) { scorer_ = scorer; } + inline void SetRef(vector<WordID>& ref) { ref_ = &ref; } +}; +/////////////////////////////////////////////////////////////////////////////// + + + + +struct KBestGetter : public HypSampler +{ + const unsigned k_; + const string filter_type_; + vector<ScoredHyp> s_; + + KBestGetter(const unsigned k, const string filter_type) : + k_(k), filter_type_(filter_type) {} + + virtual void + NotifyTranslationForest(const SentenceMetadata& smeta, Hypergraph* hg) + { + KBestScored(*hg); + } + + vector<ScoredHyp>* GetSamples() { return &s_; } + + void + KBestScored(const Hypergraph& forest) + { + if (filter_type_ == "uniq") { + KBestUnique(forest); + } else if (filter_type_ == "not") { + KBestNoFilter(forest); + } + } + + void + KBestUnique(const Hypergraph& forest) + { + s_.clear(); + KBest::KBestDerivations<vector<WordID>, ESentenceTraversal, + KBest::FilterUnique, prob_t, EdgeProb> kbest(forest, k_); + for (unsigned i = 0; i < k_; ++i) { + const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal, KBest::FilterUnique, + prob_t, EdgeProb>::Derivation* d = + kbest.LazyKthBest(forest.nodes_.size() - 1, i); + if (!d) break; + ScoredHyp h; + h.w = d->yield; + h.f = d->feature_values; + h.model = log(d->score); + h.rank = i; + h.score = scorer_->Score(h.w, *ref_, i); + s_.push_back(h); + } + } + + void + KBestNoFilter(const Hypergraph& forest) + { + s_.clear(); + KBest::KBestDerivations<vector<WordID>, ESentenceTraversal> kbest(forest, k_); + for (unsigned i = 0; i < k_; ++i) { + const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal>::Derivation* d = + kbest.LazyKthBest(forest.nodes_.size() - 1, i); + if (!d) break; + ScoredHyp h; + h.w = d->yield; + h.f = d->feature_values; + h.model = log(d->score); + h.rank = i; + h.score = scorer_->Score(h.w, *ref_, i); + s_.push_back(h); + } + } +}; + + +} // namespace + +#endif + diff --git a/dtrain/ksampler.h b/dtrain/ksampler.h new file mode 100644 index 00000000..8b1c09f2 --- /dev/null +++ b/dtrain/ksampler.h @@ -0,0 +1,50 @@ +#ifndef _DTRAIN_KSAMPLER_H_ +#define _DTRAIN_KSAMPLER_H_ + +#include "hg_sampler.h" // cdec +#include "kbestget.h" +#include "score.h" + +namespace dtrain +{ + + +struct KSampler : public HypSampler +{ + const unsigned k_; + vector<ScoredHyp> s_; + MT19937* prng_; + score_t (*scorer)(NgramCounts&, const unsigned, const unsigned, unsigned, vector<score_t>); + + explicit KSampler(const unsigned k, MT19937* prng) : + k_(k), prng_(prng) {} + + virtual void + NotifyTranslationForest(const SentenceMetadata& smeta, Hypergraph* hg) + { + ScoredSamples(*hg); + } + + vector<ScoredHyp>* GetSamples() { return &s_; } + + void ScoredSamples(const Hypergraph& forest) { + s_.clear(); + std::vector<HypergraphSampler::Hypothesis> samples; + HypergraphSampler::sample_hypotheses(forest, k_, prng_, &samples); + for (unsigned i = 0; i < k_; ++i) { + ScoredHyp h; + h.w = samples[i].words; + h.f = samples[i].fmap; + h.model = log(samples[i].model_score); + h.rank = i; + h.score = scorer_->Score(h.w, *ref_, i); + s_.push_back(h); + } + } +}; + + +} // namespace + +#endif + diff --git a/dtrain/pairsampling.h b/dtrain/pairsampling.h new file mode 100644 index 00000000..1fc5b8a0 --- /dev/null +++ b/dtrain/pairsampling.h @@ -0,0 +1,119 @@ +#ifndef _DTRAIN_PAIRSAMPLING_H_ +#define _DTRAIN_PAIRSAMPLING_H_ + +namespace dtrain +{ + + +bool +accept_pair(score_t a, score_t b, score_t threshold) +{ + if (fabs(a - b) < threshold) return false; + return true; +} + +inline void +all_pairs(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training, score_t threshold) +{ + for (unsigned i = 0; i < s->size()-1; i++) { + for (unsigned j = i+1; j < s->size(); j++) { + if (threshold > 0) { + if (accept_pair((*s)[i].score, (*s)[j].score, threshold)) { + training.push_back(make_pair((*s)[i], (*s)[j])); + } + } else { + training.push_back(make_pair((*s)[i], (*s)[j])); + } + } + } +} + +/* + * multipartite ranking + * sort by bleu + * compare top 10% to middle 80% and low 10% + * cmp middle 80% to low 10% + */ +bool +_108010_cmp_hyp_by_score(ScoredHyp a, ScoredHyp b) +{ + return a.score < b.score; +} +inline void +part108010(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training, score_t threshold) +{ + sort(s->begin(), s->end(), _108010_cmp_hyp_by_score); + unsigned sz = s->size(); + unsigned slice = 10; + unsigned sep = sz%slice; + if (sep == 0) sep = sz/slice; + for (unsigned i = 0; i < sep; i++) { + for (unsigned j = sep; j < sz; j++) { + if ((*s)[i].rank < (*s)[j].rank) { + if (threshold > 0) { + if (accept_pair((*s)[i].score, (*s)[j].score, threshold)) { + training.push_back(make_pair((*s)[i], (*s)[j])); + } + } else { + training.push_back(make_pair((*s)[i], (*s)[j])); + } + } + } + } + for (unsigned i = sep; i < sz-sep; i++) { + for (unsigned j = sz-sep; j < sz; j++) { + if ((*s)[i].rank < (*s)[j].rank) { + if (threshold > 0) { + if (accept_pair((*s)[i].score, (*s)[j].score, threshold)) { + training.push_back(make_pair((*s)[i], (*s)[j])); + } + } else { + training.push_back(make_pair((*s)[i], (*s)[j])); + } + } + } + } +} + +/* + * pair sampling as in + * 'Tuning as Ranking' (Hopkins & May, 2011) + * count = 5000 + * threshold = 5% BLEU + * cut = top 50 + */ +bool +_PRO_cmp_pair_by_diff(pair<ScoredHyp,ScoredHyp> a, pair<ScoredHyp,ScoredHyp> b) +{ + // descending order + return (fabs(a.first.score - a.second.score)) > (fabs(b.first.score - b.second.score)); +} +inline void +PROsampling(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training, score_t threshold=0.05) +{ + unsigned max_count = 5000, count = 0; + bool b = false; + for (unsigned i = 0; i < s->size()-1; i++) { + for (unsigned j = i+1; j < s->size(); j++) { + if (accept_pair((*s)[i].score, (*s)[j].score, threshold)) { + training.push_back(make_pair((*s)[i], (*s)[j])); + if (++count == max_count) { + b = true; + break; + } + } + } + if (b) break; + } + if (training.size() > 50) { + sort(training.begin(), training.end(), _PRO_cmp_pair_by_diff); + training.erase(training.begin()+50, training.end()); + } + return; +} + + +} // namespace + +#endif + diff --git a/dtrain/score.cc b/dtrain/score.cc new file mode 100644 index 00000000..4cde638a --- /dev/null +++ b/dtrain/score.cc @@ -0,0 +1,127 @@ +#include "score.h" + +namespace dtrain +{ + + +/* + * bleu + * + * as in "BLEU: a Method for Automatic Evaluation + * of Machine Translation" + * (Papineni et al. '02) + * + * NOTE: 0 if for one n \in {1..N} count is 0 + */ +score_t +BleuScorer::Bleu(NgramCounts& counts, const unsigned hyp_len, const unsigned ref_len) +{ + if (hyp_len == 0 || ref_len == 0) return 0; + unsigned M = N_; + if (ref_len < N_) M = ref_len; + score_t sum = 0; + for (unsigned i = 0; i < M; i++) { + if (counts.clipped[i] == 0 || counts.sum[i] == 0) return 0; + sum += w_[i] * log((score_t)counts.clipped[i]/counts.sum[i]); + } + return brevity_penaly(hyp_len, ref_len) * exp(sum); +} + +score_t +BleuScorer::Score(vector<WordID>& hyp, vector<WordID>& ref, + const unsigned rank) +{ + unsigned hyp_len = hyp.size(), ref_len = ref.size(); + if (hyp_len == 0 || ref_len == 0) return 0; + NgramCounts counts = make_ngram_counts(hyp, ref, N_); + return Bleu(counts, hyp_len, ref_len); +} + +/* + * 'stupid' bleu + * + * as in "ORANGE: a Method for Evaluating + * Automatic Evaluation Metrics + * for Machine Translation" + * (Lin & Och '04) + * + * NOTE: 0 iff no 1gram match + */ +score_t +StupidBleuScorer::Score(vector<WordID>& hyp, vector<WordID>& ref, + const unsigned rank) +{ + unsigned hyp_len = hyp.size(), ref_len = ref.size(); + if (hyp_len == 0 || ref_len == 0) return 0; + NgramCounts counts = make_ngram_counts(hyp, ref, N_); + unsigned M = N_; + if (ref_len < N_) M = ref_len; + score_t sum = 0, add = 0; + for (unsigned i = 0; i < M; i++) { + if (i == 1) add = 1; + sum += w_[i] * log(((score_t)counts.clipped[i] + add)/((counts.sum[i] + add))); + } + return brevity_penaly(hyp_len, ref_len) * exp(sum); +} + +/* + * smooth bleu + * + * as in "An End-to-End Discriminative Approach + * to Machine Translation" + * (Liang et al. '06) + * + * NOTE: max is 0.9375 + */ +score_t +SmoothBleuScorer::Score(vector<WordID>& hyp, vector<WordID>& ref, + const unsigned rank) +{ + unsigned hyp_len = hyp.size(), ref_len = ref.size(); + if (hyp_len == 0 || ref_len == 0) return 0; + NgramCounts counts = make_ngram_counts(hyp, ref, N_); + score_t sum = 0; + unsigned j = 1; + for (unsigned i = 0; i < N_; i++) { + if (counts.clipped[i] == 0 || counts.sum[i] == 0) continue; + sum += exp((w_[i] * log((score_t)counts.clipped[i]/counts.sum[i])))/pow(2, N_-j+1); + j++; + } + return brevity_penaly(hyp_len, ref_len) * sum; +} + +/* + * approx. bleu + * + * as in "Online Large-Margin Training of Syntactic + * and Structural Translation Features" + * (Chiang et al. '08) + * + * NOTE: needs some code in dtrain.cc + */ +score_t +ApproxBleuScorer::Score(vector<WordID>& hyp, vector<WordID>& ref, + const unsigned rank) +{ + unsigned hyp_len = hyp.size(), ref_len = ref.size(); + if (hyp_len == 0 || ref_len == 0) return 0; + NgramCounts counts = make_ngram_counts(hyp, ref, N_); + NgramCounts tmp(N_); + if (rank == 0) { // 'context of 1best translations' + glob_onebest_counts += counts; + glob_hyp_len += hyp_len; + glob_ref_len += ref_len; + hyp_len = glob_hyp_len; + ref_len = glob_ref_len; + tmp = glob_onebest_counts; + } else { + hyp_len = hyp.size(); + ref_len = ref.size(); + tmp = glob_onebest_counts + counts; + } + return 0.9 * Bleu(tmp, hyp_len, ref_len); +} + + +} // namespace + diff --git a/dtrain/score.h b/dtrain/score.h new file mode 100644 index 00000000..85cd0317 --- /dev/null +++ b/dtrain/score.h @@ -0,0 +1,144 @@ +#ifndef _DTRAIN_SCORE_H_ +#define _DTRAIN_SCORE_H_ + +#include "kbestget.h" + +using namespace std; + +namespace dtrain +{ + + +struct NgramCounts +{ + unsigned N_; + map<unsigned, unsigned> clipped; + map<unsigned, unsigned> sum; + + NgramCounts(const unsigned N) : N_(N) { Zero(); } + + inline void + operator+=(const NgramCounts& rhs) + { + assert(N_ == rhs.N_); + for (unsigned i = 0; i < N_; i++) { + this->clipped[i] += rhs.clipped.find(i)->second; + this->sum[i] += rhs.sum.find(i)->second; + } + } + + inline const NgramCounts + operator+(const NgramCounts &other) const + { + NgramCounts result = *this; + result += other; + return result; + } + + inline void + Add(const unsigned count, const unsigned ref_count, const unsigned i) + { + assert(i < N_); + if (count > ref_count) { + clipped[i] += ref_count; + } else { + clipped[i] += count; + } + sum[i] += count; + } + + inline void + Zero() + { + unsigned i; + for (i = 0; i < N_; i++) { + clipped[i] = 0; + sum[i] = 0; + } + } + + inline void + Print() + { + for (unsigned i = 0; i < N_; i++) { + cout << i+1 << "grams (clipped):\t" << clipped[i] << endl; + cout << i+1 << "grams:\t\t\t" << sum[i] << endl; + } + } +}; + +typedef map<vector<WordID>, unsigned> Ngrams; + +inline Ngrams +make_ngrams(const vector<WordID>& s, const unsigned N) +{ + Ngrams ngrams; + vector<WordID> ng; + for (size_t i = 0; i < s.size(); i++) { + ng.clear(); + for (unsigned j = i; j < min(i+N, s.size()); j++) { + ng.push_back(s[j]); + ngrams[ng]++; + } + } + return ngrams; +} + +inline NgramCounts +make_ngram_counts(const vector<WordID>& hyp, const vector<WordID>& ref, const unsigned N) +{ + Ngrams hyp_ngrams = make_ngrams(hyp, N); + Ngrams ref_ngrams = make_ngrams(ref, N); + NgramCounts counts(N); + Ngrams::iterator it; + Ngrams::iterator ti; + for (it = hyp_ngrams.begin(); it != hyp_ngrams.end(); it++) { + ti = ref_ngrams.find(it->first); + if (ti != ref_ngrams.end()) { + counts.Add(it->second, ti->second, it->first.size() - 1); + } else { + counts.Add(it->second, 0, it->first.size() - 1); + } + } + return counts; +} + +struct BleuScorer : public LocalScorer +{ + score_t Bleu(NgramCounts& counts, const unsigned hyp_len, const unsigned ref_len); + score_t Score(vector<WordID>& hyp, vector<WordID>& ref, const unsigned rank); +}; + +struct StupidBleuScorer : public LocalScorer +{ + score_t Score(vector<WordID>& hyp, vector<WordID>& ref, const unsigned rank); +}; + +struct SmoothBleuScorer : public LocalScorer +{ + score_t Score(vector<WordID>& hyp, vector<WordID>& ref, const unsigned rank); +}; + +struct ApproxBleuScorer : public BleuScorer +{ + NgramCounts glob_onebest_counts; + unsigned glob_hyp_len, glob_ref_len; + + ApproxBleuScorer(unsigned N) : glob_onebest_counts(NgramCounts(N)) + { + glob_hyp_len = glob_ref_len = 0; + } + + inline void Reset() { + glob_onebest_counts.Zero(); + glob_hyp_len = glob_ref_len = 0; + } + + score_t Score(vector<WordID>& hyp, vector<WordID>& ref, const unsigned rank); +}; + + +} // namespace + +#endif + diff --git a/dtrain/test/example/cdec.ini b/dtrain/test/example/cdec.ini new file mode 100644 index 00000000..fe5ca759 --- /dev/null +++ b/dtrain/test/example/cdec.ini @@ -0,0 +1,24 @@ +formalism=scfg +add_pass_through_rules=true +scfg_max_span_limit=15 +intersection_strategy=cube_pruning +cubepruning_pop_limit=30 +feature_function=WordPenalty +feature_function=KLanguageModel test/example/nc-wmt11.en.srilm.gz +# all currently working feature function for translation: +#feature_function=ArityPenalty +#feature_function=CMR2008ReorderingFeatures +#feature_function=Dwarf +#feature_function=InputIndicator +#feature_function=LexNullJump +#feature_function=NewJump +#feature_function=NgramFeatures +#feature_function=NonLatinCount +#feature_function=OutputIndicator +feature_function=RuleIdentityFeatures +feature_function=RuleNgramFeatures +feature_function=RuleShape +#feature_function=SourceSpanSizeFeatures +#feature_function=SourceWordPenalty +#feature_function=SpanFeatures +# ^^^ features active that were used in the ACL paper diff --git a/dtrain/test/example/dtrain.ini b/dtrain/test/example/dtrain.ini new file mode 100644 index 00000000..68173e11 --- /dev/null +++ b/dtrain/test/example/dtrain.ini @@ -0,0 +1,20 @@ +input=test/example/nc-wmt11.1k.gz # use '-' for stdin +output=- # a weights file or stdout +decoder_config=test/example/cdec.ini # ini for cdec +# these will be printed on each iteration +print_weights=Glue WordPenalty LanguageModel LanguageModel_OOV PhraseModel_0 PhraseModel_1 PhraseModel_2 PhraseModel_3 PhraseModel_4 PhraseModel_5 PhraseModel_6 PassThrough +tmp=/tmp +stop_after=10 # stop iteration after 10 inputs + +# interesting stuff +epochs=3 # run over input 3 times +k=200 # use 100best lists +N=4 # optimize (approx) BLEU4 +learning_rate=0.0001 # learning rate +gamma=0.00001 # use SVM reg +scorer=stupid_bleu # use stupid BLEU+1 approx. +sample_from=kbest # use kbest lists (as opposed to forest) +filter=uniq # only uniq entries in kbest +pair_sampling=108010 # 10 vs 80 vs 10 and 80 vs 10 +pair_threshold=0 # minimum distance in BLEU +select_weights=last # just output last weights diff --git a/dtrain/test/example/nc-wmt11.1k.gz b/dtrain/test/example/nc-wmt11.1k.gz Binary files differnew file mode 100644 index 00000000..45496cd8 --- /dev/null +++ b/dtrain/test/example/nc-wmt11.1k.gz diff --git a/dtrain/test/example/nc-wmt11.en.srilm.gz b/dtrain/test/example/nc-wmt11.en.srilm.gz Binary files differnew file mode 100644 index 00000000..7ce81057 --- /dev/null +++ b/dtrain/test/example/nc-wmt11.en.srilm.gz diff --git a/dtrain/test/mtm11/logreg_cd/bin_class.cc b/dtrain/test/mtm11/logreg_cd/bin_class.cc new file mode 100644 index 00000000..19bcde25 --- /dev/null +++ b/dtrain/test/mtm11/logreg_cd/bin_class.cc @@ -0,0 +1,4 @@ +#include "bin_class.h" + +Objective::~Objective() {} + diff --git a/dtrain/test/mtm11/logreg_cd/bin_class.h b/dtrain/test/mtm11/logreg_cd/bin_class.h new file mode 100644 index 00000000..3466109a --- /dev/null +++ b/dtrain/test/mtm11/logreg_cd/bin_class.h @@ -0,0 +1,22 @@ +#ifndef _BIN_CLASS_H_ +#define _BIN_CLASS_H_ + +#include <vector> +#include "sparse_vector.h" + +struct TrainingInstance { + // TODO add other info? loss for MIRA-type updates? + SparseVector<double> x_feature_map; + bool y; +}; + +struct Objective { + virtual ~Objective(); + + // returns f(x) and f'(x) + virtual double ObjectiveAndGradient(const SparseVector<double>& x, + const std::vector<TrainingInstance>& training_instances, + SparseVector<double>* g) const = 0; +}; + +#endif diff --git a/dtrain/test/mtm11/logreg_cd/log_reg.cc b/dtrain/test/mtm11/logreg_cd/log_reg.cc new file mode 100644 index 00000000..ec2331fe --- /dev/null +++ b/dtrain/test/mtm11/logreg_cd/log_reg.cc @@ -0,0 +1,39 @@ +#include "log_reg.h" + +#include <vector> +#include <cmath> + +#include "sparse_vector.h" + +using namespace std; + +double LogisticRegression::ObjectiveAndGradient(const SparseVector<double>& x, + const vector<TrainingInstance>& training_instances, + SparseVector<double>* g) const { + double cll = 0; + for (int i = 0; i < training_instances.size(); ++i) { + const double dotprod = training_instances[i].x_feature_map.dot(x); // TODO no bias, if bias, add x[0] + double lp_false = dotprod; + double lp_true = -dotprod; + if (0 < lp_true) { + lp_true += log1p(exp(-lp_true)); + lp_false = log1p(exp(lp_false)); + } else { + lp_true = log1p(exp(lp_true)); + lp_false += log1p(exp(-lp_false)); + } + lp_true *= -1; + lp_false *= -1; + if (training_instances[i].y) { // true label + cll -= lp_true; + (*g) -= training_instances[i].x_feature_map * exp(lp_false); + // (*g)[0] -= exp(lp_false); // bias + } else { // false label + cll -= lp_false; + (*g) += training_instances[i].x_feature_map * exp(lp_true); + // g += corpus[i].second * exp(lp_true); + } + } + return cll; +} + diff --git a/dtrain/test/mtm11/logreg_cd/log_reg.h b/dtrain/test/mtm11/logreg_cd/log_reg.h new file mode 100644 index 00000000..ecc560b8 --- /dev/null +++ b/dtrain/test/mtm11/logreg_cd/log_reg.h @@ -0,0 +1,14 @@ +#ifndef _LOG_REG_H_ +#define _LOG_REG_H_ + +#include <vector> +#include "sparse_vector.h" +#include "bin_class.h" + +struct LogisticRegression : public Objective { + double ObjectiveAndGradient(const SparseVector<double>& x, + const std::vector<TrainingInstance>& training_instances, + SparseVector<double>* g) const; +}; + +#endif diff --git a/dtrain/test/mtm11/mira_update/Hildreth.cpp b/dtrain/test/mtm11/mira_update/Hildreth.cpp new file mode 100644 index 00000000..0e67eb15 --- /dev/null +++ b/dtrain/test/mtm11/mira_update/Hildreth.cpp @@ -0,0 +1,187 @@ +#include "Hildreth.h" +#include "sparse_vector.h" + +using namespace std; + +namespace Mira { + vector<double> Hildreth::optimise (vector< SparseVector<double> >& a, vector<double>& b) { + + size_t i; + int max_iter = 10000; + double eps = 0.00000001; + double zero = 0.000000000001; + + vector<double> alpha ( b.size() ); + vector<double> F ( b.size() ); + vector<double> kkt ( b.size() ); + + double max_kkt = -1e100; + + size_t K = b.size(); + + double A[K][K]; + bool is_computed[K]; + for ( i = 0; i < K; i++ ) + { + A[i][i] = a[i].dot(a[i]); + is_computed[i] = false; + } + + int max_kkt_i = -1; + + + for ( i = 0; i < b.size(); i++ ) + { + F[i] = b[i]; + kkt[i] = F[i]; + if ( kkt[i] > max_kkt ) + { + max_kkt = kkt[i]; + max_kkt_i = i; + } + } + + int iter = 0; + double diff_alpha; + double try_alpha; + double add_alpha; + + while ( max_kkt >= eps && iter < max_iter ) + { + + diff_alpha = A[max_kkt_i][max_kkt_i] <= zero ? 0.0 : F[max_kkt_i]/A[max_kkt_i][max_kkt_i]; + try_alpha = alpha[max_kkt_i] + diff_alpha; + add_alpha = 0.0; + + if ( try_alpha < 0.0 ) + add_alpha = -1.0 * alpha[max_kkt_i]; + else + add_alpha = diff_alpha; + + alpha[max_kkt_i] = alpha[max_kkt_i] + add_alpha; + + if ( !is_computed[max_kkt_i] ) + { + for ( i = 0; i < K; i++ ) + { + A[i][max_kkt_i] = a[i].dot(a[max_kkt_i] ); // for version 1 + //A[i][max_kkt_i] = 0; // for version 1 + is_computed[max_kkt_i] = true; + } + } + + for ( i = 0; i < F.size(); i++ ) + { + F[i] -= add_alpha * A[i][max_kkt_i]; + kkt[i] = F[i]; + if ( alpha[i] > zero ) + kkt[i] = abs ( F[i] ); + } + max_kkt = -1e100; + max_kkt_i = -1; + for ( i = 0; i < F.size(); i++ ) + if ( kkt[i] > max_kkt ) + { + max_kkt = kkt[i]; + max_kkt_i = i; + } + + iter++; + } + + return alpha; + } + + vector<double> Hildreth::optimise (vector< SparseVector<double> >& a, vector<double>& b, double C) { + + size_t i; + int max_iter = 10000; + double eps = 0.00000001; + double zero = 0.000000000001; + + vector<double> alpha ( b.size() ); + vector<double> F ( b.size() ); + vector<double> kkt ( b.size() ); + + double max_kkt = -1e100; + + size_t K = b.size(); + + double A[K][K]; + bool is_computed[K]; + for ( i = 0; i < K; i++ ) + { + A[i][i] = a[i].dot(a[i]); + is_computed[i] = false; + } + + int max_kkt_i = -1; + + + for ( i = 0; i < b.size(); i++ ) + { + F[i] = b[i]; + kkt[i] = F[i]; + if ( kkt[i] > max_kkt ) + { + max_kkt = kkt[i]; + max_kkt_i = i; + } + } + + int iter = 0; + double diff_alpha; + double try_alpha; + double add_alpha; + + while ( max_kkt >= eps && iter < max_iter ) + { + + diff_alpha = A[max_kkt_i][max_kkt_i] <= zero ? 0.0 : F[max_kkt_i]/A[max_kkt_i][max_kkt_i]; + try_alpha = alpha[max_kkt_i] + diff_alpha; + add_alpha = 0.0; + + if ( try_alpha < 0.0 ) + add_alpha = -1.0 * alpha[max_kkt_i]; + else if (try_alpha > C) + add_alpha = C - alpha[max_kkt_i]; + else + add_alpha = diff_alpha; + + alpha[max_kkt_i] = alpha[max_kkt_i] + add_alpha; + + if ( !is_computed[max_kkt_i] ) + { + for ( i = 0; i < K; i++ ) + { + A[i][max_kkt_i] = a[i].dot(a[max_kkt_i] ); // for version 1 + //A[i][max_kkt_i] = 0; // for version 1 + is_computed[max_kkt_i] = true; + } + } + + for ( i = 0; i < F.size(); i++ ) + { + F[i] -= add_alpha * A[i][max_kkt_i]; + kkt[i] = F[i]; + if (alpha[i] > C - zero) + kkt[i]=-kkt[i]; + else if (alpha[i] > zero) + kkt[i] = abs(F[i]); + + } + max_kkt = -1e100; + max_kkt_i = -1; + for ( i = 0; i < F.size(); i++ ) + if ( kkt[i] > max_kkt ) + { + max_kkt = kkt[i]; + max_kkt_i = i; + } + + iter++; + } + + return alpha; + } +} diff --git a/dtrain/test/mtm11/mira_update/Hildreth.h b/dtrain/test/mtm11/mira_update/Hildreth.h new file mode 100644 index 00000000..8d791085 --- /dev/null +++ b/dtrain/test/mtm11/mira_update/Hildreth.h @@ -0,0 +1,10 @@ +#include "sparse_vector.h" + +namespace Mira { + class Hildreth { + public : + static std::vector<double> optimise(std::vector< SparseVector<double> >& a, std::vector<double>& b); + static std::vector<double> optimise(std::vector< SparseVector<double> >& a, std::vector<double>& b, double C); + }; +} + diff --git a/dtrain/test/mtm11/mira_update/dtrain.cc b/dtrain/test/mtm11/mira_update/dtrain.cc new file mode 100644 index 00000000..933417a4 --- /dev/null +++ b/dtrain/test/mtm11/mira_update/dtrain.cc @@ -0,0 +1,532 @@ +#include "common.h" +#include "kbestget.h" +#include "util.h" +#include "sample.h" +#include "Hildreth.h" + +#include "ksampler.h" + +// boost compression +#include <boost/iostreams/device/file.hpp> +#include <boost/iostreams/filtering_stream.hpp> +#include <boost/iostreams/filter/gzip.hpp> +//#include <boost/iostreams/filter/zlib.hpp> +//#include <boost/iostreams/filter/bzip2.hpp> +using namespace boost::iostreams; + + +#ifdef DTRAIN_DEBUG +#include "tests.h" +#endif + + +/* + * init + * + */ +bool +init(int argc, char** argv, po::variables_map* cfg) +{ + po::options_description conff( "Configuration File Options" ); + size_t k, N, T, stop, n_pairs; + string s, f, update_type; + conff.add_options() + ( "decoder_config", po::value<string>(), "configuration file for cdec" ) + ( "kbest", po::value<size_t>(&k)->default_value(DTRAIN_DEFAULT_K), "k for kbest" ) + ( "ngrams", po::value<size_t>(&N)->default_value(DTRAIN_DEFAULT_N), "N for Ngrams" ) + ( "filter", po::value<string>(&f)->default_value("unique"), "filter kbest list" ) + ( "epochs", po::value<size_t>(&T)->default_value(DTRAIN_DEFAULT_T), "# of iterations T" ) + ( "input", po::value<string>(), "input file" ) + ( "scorer", po::value<string>(&s)->default_value(DTRAIN_DEFAULT_SCORER), "scoring metric" ) + ( "output", po::value<string>(), "output weights file" ) + ( "stop_after", po::value<size_t>(&stop)->default_value(0), "stop after X input sentences" ) + ( "weights_file", po::value<string>(), "input weights file (e.g. from previous iteration)" ) + ( "wprint", po::value<string>(), "weights to print on each iteration" ) + ( "noup", po::value<bool>()->zero_tokens(), "do not update weights" ); + + po::options_description clo("Command Line Options"); + clo.add_options() + ( "config,c", po::value<string>(), "dtrain config file" ) + ( "quiet,q", po::value<bool>()->zero_tokens(), "be quiet" ) + ( "update-type", po::value<string>(&update_type)->default_value("mira"), "perceptron or mira" ) + ( "n-pairs", po::value<size_t>(&n_pairs)->default_value(10), "number of pairs used to compute update" ) + ( "verbose,v", po::value<bool>()->zero_tokens(), "be verbose" ) +#ifndef DTRAIN_DEBUG + ; +#else + ( "test", "run tests and exit"); +#endif + po::options_description config_options, cmdline_options; + + config_options.add(conff); + cmdline_options.add(clo); + cmdline_options.add(conff); + + po::store( parse_command_line(argc, argv, cmdline_options), *cfg ); + if ( cfg->count("config") ) { + ifstream config( (*cfg)["config"].as<string>().c_str() ); + po::store( po::parse_config_file(config, config_options), *cfg ); + } + po::notify(*cfg); + + if ( !cfg->count("decoder_config") || !cfg->count("input") ) { + cerr << cmdline_options << endl; + return false; + } + if ( cfg->count("noup") && cfg->count("decode") ) { + cerr << "You can't use 'noup' and 'decode' at once." << endl; + return false; + } + if ( cfg->count("filter") && (*cfg)["filter"].as<string>() != "unique" + && (*cfg)["filter"].as<string>() != "no" ) { + cerr << "Wrong 'filter' type: '" << (*cfg)["filter"].as<string>() << "'." << endl; + } + #ifdef DTRAIN_DEBUG + if ( !cfg->count("test") ) { + cerr << cmdline_options << endl; + return false; + } + #endif + return true; +} + + +// output formatting +ostream& _nopos( ostream& out ) { return out << resetiosflags( ios::showpos ); } +ostream& _pos( ostream& out ) { return out << setiosflags( ios::showpos ); } +ostream& _prec2( ostream& out ) { return out << setprecision(2); } +ostream& _prec5( ostream& out ) { return out << setprecision(5); } + + + + +/* + * dtrain + * + */ +int +main( int argc, char** argv ) +{ + cout << setprecision( 5 ); + // handle most parameters + po::variables_map cfg; + if ( ! init(argc, argv, &cfg) ) exit(1); // something is wrong +#ifdef DTRAIN_DEBUG + if ( cfg.count("test") ) run_tests(); // run tests and exit +#endif + bool quiet = false; + if ( cfg.count("quiet") ) quiet = true; + bool verbose = false; + if ( cfg.count("verbose") ) verbose = true; + bool noup = false; + if ( cfg.count("noup") ) noup = true; + const size_t k = cfg["kbest"].as<size_t>(); + const size_t N = cfg["ngrams"].as<size_t>(); + const size_t T = cfg["epochs"].as<size_t>(); + const size_t stop_after = cfg["stop_after"].as<size_t>(); + const string filter_type = cfg["filter"].as<string>(); + const string update_type = cfg["update-type"].as<string>(); + const size_t n_pairs = cfg["n-pairs"].as<size_t>(); + const string output_file = cfg["output"].as<string>(); + if ( !quiet ) { + cout << endl << "dtrain" << endl << "Parameters:" << endl; + cout << setw(25) << "k " << k << endl; + cout << setw(25) << "N " << N << endl; + cout << setw(25) << "T " << T << endl; + if ( cfg.count("stop-after") ) + cout << setw(25) << "stop_after " << stop_after << endl; + if ( cfg.count("weights") ) + cout << setw(25) << "weights " << cfg["weights"].as<string>() << endl; + cout << setw(25) << "input " << "'" << cfg["input"].as<string>() << "'" << endl; + cout << setw(25) << "filter " << "'" << filter_type << "'" << endl; + } + + vector<string> wprint; + if ( cfg.count("wprint") ) { + boost::split( wprint, cfg["wprint"].as<string>(), boost::is_any_of(" ") ); + } + + // setup decoder, observer + register_feature_functions(); + SetSilent(true); + ReadFile ini_rf( cfg["decoder_config"].as<string>() ); + if ( !quiet ) + cout << setw(25) << "cdec cfg " << "'" << cfg["decoder_config"].as<string>() << "'" << endl; + Decoder decoder( ini_rf.stream() ); + //KBestGetter observer( k, filter_type ); + MT19937 rng; + KSampler observer( k, &rng ); + + // scoring metric/scorer + string scorer_str = cfg["scorer"].as<string>(); + double (*scorer)( NgramCounts&, const size_t, const size_t, size_t, vector<float> ); + if ( scorer_str == "bleu" ) { + scorer = &bleu; + } else if ( scorer_str == "stupid_bleu" ) { + scorer = &stupid_bleu; + } else if ( scorer_str == "smooth_bleu" ) { + scorer = &smooth_bleu; + } else if ( scorer_str == "approx_bleu" ) { + scorer = &approx_bleu; + } else { + cerr << "Don't know scoring metric: '" << scorer_str << "', exiting." << endl; + exit(1); + } + // for approx_bleu + NgramCounts global_counts( N ); // counts for 1 best translations + size_t global_hyp_len = 0; // sum hypothesis lengths + size_t global_ref_len = 0; // sum reference lengths + // this is all BLEU implmentations + vector<float> bleu_weights; // we leave this empty -> 1/N; TODO? + if ( !quiet ) cout << setw(26) << "scorer '" << scorer_str << "'" << endl << endl; + + // init weights + Weights weights; + if ( cfg.count("weights") ) weights.InitFromFile( cfg["weights"].as<string>() ); + SparseVector<double> lambdas; + weights.InitSparseVector( &lambdas ); + vector<double> dense_weights; + + // input + if ( !quiet && !verbose ) + cout << "(a dot represents " << DTRAIN_DOTS << " lines of input)" << endl; + string input_fn = cfg["input"].as<string>(); + ifstream input; + if ( input_fn != "-" ) input.open( input_fn.c_str() ); + string in; + vector<string> in_split; // input: src\tref\tpsg + vector<string> ref_tok; // tokenized reference + vector<WordID> ref_ids; // reference as vector of WordID + string grammar_str; + + // buffer input for t > 0 + vector<string> src_str_buf; // source strings, TODO? memory + vector<vector<WordID> > ref_ids_buf; // references as WordID vecs + filtering_ostream grammar_buf; // written to compressed file in /tmp + // this is for writing the grammar buffer file + grammar_buf.push( gzip_compressor() ); + char grammar_buf_tmp_fn[] = DTRAIN_TMP_DIR"/dtrain-grammars-XXXXXX"; + mkstemp( grammar_buf_tmp_fn ); + grammar_buf.push( file_sink(grammar_buf_tmp_fn, ios::binary | ios::trunc) ); + + size_t sid = 0, in_sz = 99999999; // sentence id, input size + double acc_1best_score = 0., acc_1best_model = 0.; + vector<vector<double> > scores_per_iter; + double max_score = 0.; + size_t best_t = 0; + bool next = false, stop = false; + double score = 0.; + size_t cand_len = 0; + double overall_time = 0.; + + // for the perceptron/SVM; TODO as params + double eta = 0.0005; + double gamma = 0.;//01; // -> SVM + lambdas.add_value( FD::Convert("__bias"), 0 ); + + // for random sampling + srand ( time(NULL) ); + + + for ( size_t t = 0; t < T; t++ ) // T epochs + { + + time_t start, end; + time( &start ); + + // actually, we need only need this if t > 0 FIXME + ifstream grammar_file( grammar_buf_tmp_fn, ios_base::in | ios_base::binary ); + filtering_istream grammar_buf_in; + grammar_buf_in.push( gzip_decompressor() ); + grammar_buf_in.push( grammar_file ); + + // reset average scores + acc_1best_score = acc_1best_model = 0.; + + // reset sentence counter + sid = 0; + + if ( !quiet ) cout << "Iteration #" << t+1 << " of " << T << "." << endl; + + while( true ) + { + + // get input from stdin or file + in.clear(); + next = stop = false; // next iteration, premature stop + if ( t == 0 ) { + if ( input_fn == "-" ) { + if ( !getline(cin, in) ) next = true; + } else { + if ( !getline(input, in) ) next = true; + } + } else { + if ( sid == in_sz ) next = true; // stop if we reach the end of our input + } + // stop after X sentences (but still iterate for those) + if ( stop_after > 0 && stop_after == sid && !next ) stop = true; + + // produce some pretty output + if ( !quiet && !verbose ) { + if ( sid == 0 ) cout << " "; + if ( (sid+1) % (DTRAIN_DOTS) == 0 ) { + cout << "."; + cout.flush(); + } + if ( (sid+1) % (20*DTRAIN_DOTS) == 0) { + cout << " " << sid+1 << endl; + if ( !next && !stop ) cout << " "; + } + if ( stop ) { + if ( sid % (20*DTRAIN_DOTS) != 0 ) cout << " " << sid << endl; + cout << "Stopping after " << stop_after << " input sentences." << endl; + } else { + if ( next ) { + if ( sid % (20*DTRAIN_DOTS) != 0 ) { + cout << " " << sid << endl; + } + } + } + } + + // next iteration + if ( next || stop ) break; + + // weights + dense_weights.clear(); + weights.InitFromVector( lambdas ); + weights.InitVector( &dense_weights ); + decoder.SetWeights( dense_weights ); + + if ( t == 0 ) { + // handling input + in_split.clear(); + boost::split( in_split, in, boost::is_any_of("\t") ); // in_split[0] is id + // getting reference + ref_tok.clear(); ref_ids.clear(); + boost::split( ref_tok, in_split[2], boost::is_any_of(" ") ); + register_and_convert( ref_tok, ref_ids ); + ref_ids_buf.push_back( ref_ids ); + // process and set grammar + bool broken_grammar = true; + for ( string::iterator ti = in_split[3].begin(); ti != in_split[3].end(); ti++ ) { + if ( !isspace(*ti) ) { + broken_grammar = false; + break; + } + } + if ( broken_grammar ) continue; + grammar_str = boost::replace_all_copy( in_split[3], " __NEXT__RULE__ ", "\n" ) + "\n"; // FIXME copy, __ + grammar_buf << grammar_str << DTRAIN_GRAMMAR_DELIM << endl; + decoder.SetSentenceGrammarFromString( grammar_str ); + // decode, kbest + src_str_buf.push_back( in_split[1] ); + decoder.Decode( in_split[1], &observer ); + } else { + // get buffered grammar + grammar_str.clear(); + int i = 1; + while ( true ) { + string g; + getline( grammar_buf_in, g ); + if ( g == DTRAIN_GRAMMAR_DELIM ) break; + grammar_str += g+"\n"; + i += 1; + } + decoder.SetSentenceGrammarFromString( grammar_str ); + // decode, kbest + decoder.Decode( src_str_buf[sid], &observer ); + } + + // get kbest list + KBestList* kb; + //if ( ) { // TODO get from forest + kb = observer.GetKBest(); + //} + + // scoring kbest + if ( t > 0 ) ref_ids = ref_ids_buf[sid]; + for ( size_t i = 0; i < kb->GetSize(); i++ ) { + NgramCounts counts = make_ngram_counts( ref_ids, kb->sents[i], N ); + // this is for approx bleu + if ( scorer_str == "approx_bleu" ) { + if ( i == 0 ) { // 'context of 1best translations' + global_counts += counts; + global_hyp_len += kb->sents[i].size(); + global_ref_len += ref_ids.size(); + counts.reset(); + cand_len = 0; + } else { + cand_len = kb->sents[i].size(); + } + NgramCounts counts_tmp = global_counts + counts; + // TODO as param + score = 0.9 * scorer( counts_tmp, + global_ref_len, + global_hyp_len + cand_len, N, bleu_weights ); + } else { + // other scorers + cand_len = kb->sents[i].size(); + score = scorer( counts, + ref_ids.size(), + kb->sents[i].size(), N, bleu_weights ); + } + + kb->scores.push_back( score ); + + if ( i == 0 ) { + acc_1best_score += score; + acc_1best_model += kb->model_scores[i]; + } + + if ( verbose ) { + if ( i == 0 ) cout << "'" << TD::GetString( ref_ids ) << "' [ref]" << endl; + cout << _prec5 << _nopos << "[hyp " << i << "] " << "'" << TD::GetString( kb->sents[i] ) << "'"; + cout << " [SCORE=" << score << ",model="<< kb->model_scores[i] << "]" << endl; + cout << kb->feats[i] << endl; // this is maybe too verbose + } + } // Nbest loop + + if ( verbose ) cout << endl; + + + // UPDATE WEIGHTS + if ( !noup ) { + + TrainingInstances pairs; + sample_all( kb, pairs, n_pairs ); + + vector< SparseVector<double> > featureValueDiffs; + vector<double> lossMinusModelScoreDiffs; + for ( TrainingInstances::iterator ti = pairs.begin(); + ti != pairs.end(); ti++ ) { + + SparseVector<double> dv; + if ( ti->first_score - ti->second_score < 0 ) { + dv = ti->second - ti->first; + dv.add_value( FD::Convert("__bias"), -1 ); + + featureValueDiffs.push_back(dv); + double lossMinusModelScoreDiff = ti->loss_diff - ti->model_score_diff; + lossMinusModelScoreDiffs.push_back(lossMinusModelScoreDiff); + + if (update_type == "perceptron") { + lambdas += dv * eta; + cerr << "after perceptron update: " << lambdas << endl << endl; + } + + if ( verbose ) { + cout << "{{ f("<< ti->first_rank <<") > f(" << ti->second_rank << ") but g(i)="<< ti->first_score <<" < g(j)="<< ti->second_score << " so update" << endl; + cout << " i " << TD::GetString(kb->sents[ti->first_rank]) << endl; + cout << " " << kb->feats[ti->first_rank] << endl; + cout << " j " << TD::GetString(kb->sents[ti->second_rank]) << endl; + cout << " " << kb->feats[ti->second_rank] << endl; + cout << " diff vec: " << dv << endl; + cout << " lambdas after update: " << lambdas << endl; + cout << "}}" << endl; + } + } else { + //SparseVector<double> reg; + //reg = lambdas * ( 2 * gamma ); + //lambdas += reg * ( -eta ); + } + } + cerr << "Collected " << featureValueDiffs.size() << " constraints." << endl; + + double slack = 0.01; + if (update_type == "mira") { + if (featureValueDiffs.size() > 0) { + vector<double> alphas; + if (slack != 0) { + alphas = Mira::Hildreth::optimise(featureValueDiffs, lossMinusModelScoreDiffs, slack); + } else { + alphas = Mira::Hildreth::optimise(featureValueDiffs, lossMinusModelScoreDiffs); + } + + for (size_t k = 0; k < featureValueDiffs.size(); ++k) { + lambdas += featureValueDiffs[k] * alphas[k]; + } + // cerr << "after mira update: " << lambdas << endl << endl; + } + } + } + + ++sid; + + } // input loop + + if ( t == 0 ) in_sz = sid; // remember size (lines) of input + + // print some stats + double avg_1best_score = acc_1best_score/(double)in_sz; + double avg_1best_model = acc_1best_model/(double)in_sz; + double avg_1best_score_diff, avg_1best_model_diff; + if ( t > 0 ) { + avg_1best_score_diff = avg_1best_score - scores_per_iter[t-1][0]; + avg_1best_model_diff = avg_1best_model - scores_per_iter[t-1][1]; + } else { + avg_1best_score_diff = avg_1best_score; + avg_1best_model_diff = avg_1best_model; + } + cout << _prec5 << _pos << "WEIGHTS" << endl; + for (vector<string>::iterator it = wprint.begin(); it != wprint.end(); it++) { + cout << setw(16) << *it << " = " << dense_weights[FD::Convert( *it )] << endl; + } + + cout << " ---" << endl; + cout << _nopos << " avg score: " << avg_1best_score; + cout << _pos << " (" << avg_1best_score_diff << ")" << endl; + cout << _nopos << "avg model score: " << avg_1best_model; + cout << _pos << " (" << avg_1best_model_diff << ")" << endl; + vector<double> remember_scores; + remember_scores.push_back( avg_1best_score ); + remember_scores.push_back( avg_1best_model ); + scores_per_iter.push_back( remember_scores ); + if ( avg_1best_score > max_score ) { + max_score = avg_1best_score; + best_t = t; + } + + // close open files + if ( input_fn != "-" ) input.close(); + close( grammar_buf ); + grammar_file.close(); + + time ( &end ); + double time_dif = difftime( end, start ); + overall_time += time_dif; + if ( !quiet ) { + cout << _prec2 << _nopos << "(time " << time_dif/60. << " min, "; + cout << time_dif/(double)in_sz<< " s/S)" << endl; + } + + if ( t+1 != T ) cout << endl; + + if ( noup ) break; + + // write weights after every epoch + std::string s; + std::stringstream out; + out << t; + s = out.str(); + string weights_file = output_file + "." + s; + weights.WriteToFile(weights_file, true ); + + } // outer loop + + unlink( grammar_buf_tmp_fn ); + if ( !noup ) { + if ( !quiet ) cout << endl << "writing weights file '" << cfg["output"].as<string>() << "' ..."; + weights.WriteToFile( cfg["output"].as<string>(), true ); + if ( !quiet ) cout << "done" << endl; + } + + if ( !quiet ) { + cout << _prec5 << _nopos << endl << "---" << endl << "Best iteration: "; + cout << best_t+1 << " [SCORE '" << scorer_str << "'=" << max_score << "]." << endl; + cout << _prec2 << "This took " << overall_time/60. << " min." << endl; + } + + return 0; +} + diff --git a/dtrain/test/mtm11/mira_update/sample.h b/dtrain/test/mtm11/mira_update/sample.h new file mode 100644 index 00000000..5c331bba --- /dev/null +++ b/dtrain/test/mtm11/mira_update/sample.h @@ -0,0 +1,101 @@ +#ifndef _DTRAIN_SAMPLE_H_ +#define _DTRAIN_SAMPLE_H_ + + +#include "kbestget.h" + + +namespace dtrain +{ + + +struct TPair +{ + SparseVector<double> first, second; + size_t first_rank, second_rank; + double first_score, second_score; + double model_score_diff; + double loss_diff; +}; + +typedef vector<TPair> TrainingInstances; + + +void + sample_all( KBestList* kb, TrainingInstances &training, size_t n_pairs ) +{ + std::vector<double> loss_diffs; + TrainingInstances training_tmp; + for ( size_t i = 0; i < kb->GetSize()-1; i++ ) { + for ( size_t j = i+1; j < kb->GetSize(); j++ ) { + TPair p; + p.first = kb->feats[i]; + p.second = kb->feats[j]; + p.first_rank = i; + p.second_rank = j; + p.first_score = kb->scores[i]; + p.second_score = kb->scores[j]; + + bool conservative = 1; + if ( kb->scores[i] - kb->scores[j] < 0 ) { + // j=hope, i=fear + p.model_score_diff = kb->model_scores[j] - kb->model_scores[i]; + p.loss_diff = kb->scores[j] - kb->scores[i]; + training_tmp.push_back(p); + loss_diffs.push_back(p.loss_diff); + } + else if (!conservative) { + // i=hope, j=fear + p.model_score_diff = kb->model_scores[i] - kb->model_scores[j]; + p.loss_diff = kb->scores[i] - kb->scores[j]; + training_tmp.push_back(p); + loss_diffs.push_back(p.loss_diff); + } + } + } + + if (training_tmp.size() > 0) { + double threshold; + std::sort(loss_diffs.begin(), loss_diffs.end()); + std::reverse(loss_diffs.begin(), loss_diffs.end()); + threshold = loss_diffs.size() >= n_pairs ? loss_diffs[n_pairs-1] : loss_diffs[loss_diffs.size()-1]; + cerr << "threshold: " << threshold << endl; + size_t constraints = 0; + for (size_t i = 0; (i < training_tmp.size() && constraints < n_pairs); ++i) { + if (training_tmp[i].loss_diff >= threshold) { + training.push_back(training_tmp[i]); + constraints++; + } + } + } + else { + cerr << "No pairs selected." << endl; + } +} + +void +sample_rand( KBestList* kb, TrainingInstances &training ) +{ + srand( time(NULL) ); + for ( size_t i = 0; i < kb->GetSize()-1; i++ ) { + for ( size_t j = i+1; j < kb->GetSize(); j++ ) { + if ( rand() % 2 ) { + TPair p; + p.first = kb->feats[i]; + p.second = kb->feats[j]; + p.first_rank = i; + p.second_rank = j; + p.first_score = kb->scores[i]; + p.second_score = kb->scores[j]; + training.push_back( p ); + } + } + } +} + + +} // namespace + + +#endif + diff --git a/dtrain/test/toy/cdec.ini b/dtrain/test/toy/cdec.ini new file mode 100644 index 00000000..98b02d44 --- /dev/null +++ b/dtrain/test/toy/cdec.ini @@ -0,0 +1,2 @@ +formalism=scfg +add_pass_through_rules=true diff --git a/dtrain/test/toy/dtrain.ini b/dtrain/test/toy/dtrain.ini new file mode 100644 index 00000000..abf22b94 --- /dev/null +++ b/dtrain/test/toy/dtrain.ini @@ -0,0 +1,12 @@ +decoder_config=test/toy/cdec.ini +input=test/toy/input +output=- +print_weights=logp shell_rule house_rule small_rule little_rule PassThrough +k=4 +N=4 +epochs=3 +scorer=stupid_bleu +sample_from=kbest +filter=uniq +pair_sampling=all +learning_rate=1 diff --git a/dtrain/test/toy/input b/dtrain/test/toy/input new file mode 100644 index 00000000..4d10a9ea --- /dev/null +++ b/dtrain/test/toy/input @@ -0,0 +1,2 @@ +0 ich sah ein kleines haus i saw a little house [S] ||| [NP,1] [VP,2] ||| [1] [2] ||| logp=0 [NP] ||| ich ||| i ||| logp=0 [NP] ||| ein [NN,1] ||| a [1] ||| logp=0 [NN] ||| [JJ,1] haus ||| [1] house ||| logp=0 house_rule=1 [NN] ||| [JJ,1] haus ||| [1] shell ||| logp=0 shell_rule=1 [JJ] ||| kleines ||| small ||| logp=0 small_rule=1 [JJ] ||| kleines ||| little ||| logp=0 little_rule=1 [JJ] ||| grosses ||| big ||| logp=0 [JJ] ||| grosses ||| large ||| logp=0 [VP] ||| [V,1] [NP,2] ||| [1] [2] ||| logp=0 [V] ||| sah ||| saw ||| logp=0 [V] ||| fand ||| found ||| logp=0 +1 ich fand ein kleines haus i found a little house [S] ||| [NP,1] [VP,2] ||| [1] [2] ||| logp=0 [NP] ||| ich ||| i ||| logp=0 [NP] ||| ein [NN,1] ||| a [1] ||| logp=0 [NN] ||| [JJ,1] haus ||| [1] house ||| logp=0 house_rule=1 [NN] ||| [JJ,1] haus ||| [1] shell ||| logp=0 shell_rule=1 [JJ] ||| kleines ||| small ||| logp=0 small_rule=1 [JJ] ||| kleines ||| little ||| logp=0 little_rule=1 [JJ] ||| grosses ||| big ||| logp=0 [JJ] ||| grosses ||| large ||| logp=0 [VP] ||| [V,1] [NP,2] ||| [1] [2] ||| logp=0 [V] ||| sah ||| saw ||| logp=0 [V] ||| fand ||| found ||| logp=0 |