#include "dtrain.h"


/**
 * Parse the command line (and, if given via --config/-c, a config file)
 * into *cfg and validate the resulting parameters.
 *
 * @param argc  argument count as passed to main()
 * @param argv  argument vector as passed to main()
 * @param cfg   output: variables map that receives all parsed options
 * @return true if the configuration is usable; false if a required option
 *         is missing or an option has an unsupported value (a message is
 *         written to stderr in that case)
 */
bool
dtrain_init(int argc, char** argv, po::variables_map* cfg)
{
  po::options_description ini("Configuration File Options");
  ini.add_options()
    ("input",          po::value<string>()->default_value("-"),            "input file")
    ("output",         po::value<string>()->default_value("-"),            "output weights file (or VOID)")
    ("input_weights",  po::value<string>(),                                "input weights file (e.g. from previous iteration)")
    ("decoder_config", po::value<string>(),                                "configuration file for cdec")
    ("k",              po::value<unsigned>()->default_value(100),          "size of kbest or sample from forest")
    ("sample_from",    po::value<string>()->default_value("kbest"),        "where to get translations from")
    ("filter",         po::value<string>()->default_value("unique"),       "filter kbest list")
    ("pair_sampling",  po::value<string>()->default_value("all"),          "how to sample pairs: all, rand")
    ("N",              po::value<unsigned>()->default_value(3),            "N for Ngrams")
    ("epochs",         po::value<unsigned>()->default_value(2),            "# of iterations T")
    ("scorer",         po::value<string>()->default_value("stupid_bleu"),  "scoring metric")
    ("stop_after",     po::value<unsigned>()->default_value(0),            "stop after X input sentences")
    ("print_weights",  po::value<string>(),                                "weights to print on each iteration")
    ("hstreaming",     po::value<bool>()->zero_tokens(),                   "run in hadoop streaming mode")
    ("learning_rate",  po::value<double>()->default_value(0.0005),         "learning rate")
    ("gamma",          po::value<double>()->default_value(0.),             "gamma for SVM (0 for perceptron)")
    // FIXME: 'tmp' is parsed but not used; main() builds the grammar buffer
    // file name from DTRAIN_TMP_DIR instead.
    ("tmp",            po::value<string>()->default_value("/tmp"),         "temp dir to use")
    ("noup",           po::value<bool>()->zero_tokens(),                   "do not update weights");
  po::options_description cl("Command Line Options");
  cl.add_options()
    ("config,c",  po::value<string>(),               "dtrain config file")
    ("quiet,q",   po::value<bool>()->zero_tokens(),  "be quiet")
    ("verbose,v", po::value<bool>()->zero_tokens(),  "be verbose");
  cl.add(ini);

  // Command line is stored first; values already stored are not overwritten
  // by the config file parsed afterwards.
  po::store(parse_command_line(argc, argv, cl), *cfg);
  if (cfg->count("config")) {
    const string config_fn = (*cfg)["config"].as<string>();
    ifstream ini_f(config_fn.c_str());
    if (!ini_f) {
      cerr << "Cannot open config file '" << config_fn << "'." << endl;
      return false;
    }
    po::store(po::parse_config_file(ini_f, ini), *cfg);
  }
  po::notify(*cfg);

  if (!cfg->count("decoder_config")) {
    cerr << cl << endl;
    return false;
  }
  if (cfg->count("hstreaming") && (*cfg)["output"].as<string>() != "-") {
    cerr << "When using 'hstreaming' the 'output' param should be '-'." << endl;
    return false;
  }

  // Reject unsupported values instead of only warning: main() would otherwise
  // silently fall back (sample_from) or skip all updates (pair_sampling).
  const string filter = (*cfg)["filter"].as<string>();
  if (filter != "unique" && filter != "no") {
    cerr << "Wrong 'filter' param: '" << filter << "', use 'unique' or 'no'." << endl;
    return false;
  }
  const string pair_sampling = (*cfg)["pair_sampling"].as<string>();
  if (pair_sampling != "all" && pair_sampling != "rand") {
    cerr << "Wrong 'pair_sampling' param: '" << pair_sampling << "', use 'all' or 'rand'." << endl;
    return false;
  }
  const string sample_from = (*cfg)["sample_from"].as<string>();
  if (sample_from != "kbest" && sample_from != "forest") {
    cerr << "Wrong 'sample_from' param: '" << sample_from << "', use 'kbest' or 'forest'." << endl;
    return false;
  }

  return true;
}

/**
 * Training driver.
 *
 * Reads tab-separated input lines (sid, source, reference, per-sentence
 * grammar), decodes each sentence, scores the sampled hypotheses against the
 * reference and updates the weight vector from mis-ranked hypothesis pairs.
 * During the first epoch the sources, references, sentence ids and grammars
 * are buffered so later epochs can replay them without re-reading the input.
 * Finally the weights are written to stdout or to the configured file.
 *
 * @return 0 on success; exits with status 1 on configuration or I/O errors
 */
int
main(int argc, char** argv)
{
  // handle most parameters
  po::variables_map cfg;
  if (!dtrain_init(argc, argv, &cfg)) exit(1); // something is wrong
  bool quiet = cfg.count("quiet") > 0;
  const bool verbose = cfg.count("verbose") > 0;
  const bool noup = cfg.count("noup") > 0;
  const bool hstreaming = cfg.count("hstreaming") > 0;
  if (hstreaming) quiet = true; // hadoop streaming mode implies quiet
  const unsigned k = cfg["k"].as<unsigned>();
  const unsigned N = cfg["N"].as<unsigned>();
  const unsigned T = cfg["epochs"].as<unsigned>();
  const unsigned stop_after = cfg["stop_after"].as<unsigned>();
  const string filter_type = cfg["filter"].as<string>();
  const string sample_from = cfg["sample_from"].as<string>();
  const string pair_sampling = cfg["pair_sampling"].as<string>();
  vector<string> print_weights;
  if (cfg.count("print_weights"))
    boost::split(print_weights, cfg["print_weights"].as<string>(), boost::is_any_of(" "));

  // setup decoder
  register_feature_functions();
  SetSilent(true);
  ReadFile ini_rf(cfg["decoder_config"].as<string>());
  if (!quiet)
    cerr << setw(25) << "cdec cfg " << "'" << cfg["decoder_config"].as<string>() << "'" << endl;
  Decoder decoder(ini_rf.stream());

  MT19937 rng; // random number generator

  // setup decoder observer
  // NOTE(review): observer is never deleted; it lives until process exit.
  HypSampler* observer;
  if (sample_from == "kbest") {
    observer = new KBestGetter(k, filter_type);
  } else {
    observer = new KSampler(k, &rng);
  }

  // scoring metric/scorer
  string scorer_str = cfg["scorer"].as<string>();
  score_t (*scorer)(NgramCounts&, const unsigned, const unsigned, unsigned, vector<score_t>);
  if (scorer_str == "bleu") {
    scorer = &bleu;
  } else if (scorer_str == "stupid_bleu") {
    scorer = &stupid_bleu;
  } else if (scorer_str == "smooth_bleu") {
    scorer = &smooth_bleu;
  } else if (scorer_str == "approx_bleu") {
    scorer = &approx_bleu;
  } else {
    cerr << "Don't know scoring metric: '" << scorer_str << "', exiting." << endl;
    exit(1);
  }
  NgramCounts global_counts(N); // counts for 1 best translations
  unsigned global_hyp_len = 0;  // sum hypothesis lengths
  unsigned global_ref_len = 0;  // sum reference lengths
  // ^^^ global_* for approx_bleu
  vector<score_t> bleu_weights; // we leave this empty -> 1/N
  if (!quiet) cerr << setw(26) << "scorer '" << scorer_str << "'" << endl << endl;

  // init weights
  Weights weights;
  if (cfg.count("input_weights")) weights.InitFromFile(cfg["input_weights"].as<string>());
  SparseVector<double> lambdas;
  weights.InitSparseVector(&lambdas);
  vector<double> dense_weights;

  // meta params for perceptron, SVM
  double eta = cfg["learning_rate"].as<double>();
  double gamma = cfg["gamma"].as<double>();
  lambdas.add_value(FD::Convert("__bias"), 0);

  // input
  string input_fn = cfg["input"].as<string>();
  ReadFile input(input_fn);
  // buffer input for t > 0; all three buffers are indexed by the same ii
  vector<string> src_str_buf;           // source strings
  vector<string> sid_buf;               // sentence ids (for hstreaming reporting)
  vector<vector<WordID> > ref_ids_buf;  // references as WordID vecs
  // this is for writing the grammar buffer file
  char grammar_buf_fn[] = DTRAIN_TMP_DIR "/dtrain-grammars-XXXXXX";
  const int grammar_buf_fd = mkstemp(grammar_buf_fn);
  if (grammar_buf_fd == -1) {
    cerr << "Cannot create temporary grammar buffer file '" << grammar_buf_fn << "', exiting." << endl;
    exit(1);
  }
  close(grammar_buf_fd); // the file is re-opened by name below; don't leak the descriptor
  ogzstream grammar_buf_out;
  grammar_buf_out.open(grammar_buf_fn);

  unsigned in_sz = 999999999; // input size; set to the real value after the first epoch
  vector<pair<score_t,score_t> > all_scores; // (avg score, avg model score) per epoch
  score_t max_score = 0.;
  unsigned best_it = 0;
  float overall_time = 0.;

  // output cfg
  if (!quiet) {
    cerr << _p5;
    cerr << endl << "dtrain" << endl << "Parameters:" << endl;
    cerr << setw(25) << "k " << k << endl;
    cerr << setw(25) << "N " << N << endl;
    cerr << setw(25) << "T " << T << endl;
    // 'stop_after' always has a (default) value, so test the value itself
    if (stop_after > 0)
      cerr << setw(25) << "stop_after " << stop_after << endl;
    if (cfg.count("input_weights"))
      cerr << setw(25) << "weights in " << "'" << cfg["input_weights"].as<string>() << "'" << endl;
    cerr << setw(25) << "input " << "'" << cfg["input"].as<string>() << "'" << endl;
    cerr << setw(25) << "output " << "'" << cfg["output"].as<string>() << "'" << endl;
    if (sample_from == "kbest")
      cerr << setw(25) << "filter " << "'" << filter_type << "'" << endl;
    cerr << setw(25) << "learning rate " << eta << endl;
    cerr << setw(25) << "gamma " << gamma << endl;
    cerr << setw(25) << "sample from " << "'" << sample_from << "'" << endl;
    cerr << setw(25) << "pairs " << "'" << pair_sampling << "'" << endl;
    if (!verbose) cerr << "(a dot represents " << DTRAIN_DOTS << " lines of input)" << endl;
  }


  for (unsigned t = 0; t < T; t++) // T epochs
  {

  time_t start, end;
  time(&start);
  igzstream grammar_buf_in;
  if (t > 0) grammar_buf_in.open(grammar_buf_fn);
  score_t score_sum = 0., model_sum = 0.;
  unsigned ii = 0; // index of the current (accepted) input sentence
  if (!quiet) cerr << "Iteration #" << t+1 << " of " << T << "." << endl;

  while(true)
  {

    string in;
    bool next = false, stop = false; // next iteration or premature stop
    if (t == 0) {
      if(!getline(*input, in)) next = true;
    } else {
      if (ii == in_sz) next = true; // stop if we reach the end of our input
    }
    // stop after X sentences (but still iterate for those)
    if (stop_after > 0 && stop_after == ii && !next) stop = true;

    // produce some pretty output
    if (!quiet && !verbose) {
      if (ii == 0) cerr << " ";
      if ((ii+1) % (DTRAIN_DOTS) == 0) {
        cerr << ".";
        cerr.flush();
      }
      if ((ii+1) % (20*DTRAIN_DOTS) == 0) {
        cerr << " " << ii+1 << endl;
        if (!next && !stop) cerr << " ";
      }
      if (stop) {
        if (ii % (20*DTRAIN_DOTS) != 0) cerr << " " << ii << endl;
        cerr << "Stopping after " << stop_after << " input sentences." << endl;
      } else {
        if (next) {
          if (ii % (20*DTRAIN_DOTS) != 0) cerr << " " << ii << endl;
        }
      }
    }

    // next iteration
    if (next || stop) break;

    // weights
    dense_weights.clear();
    weights.InitFromVector(lambdas);
    weights.InitVector(&dense_weights);
    decoder.SetWeights(dense_weights);

    // getting input
    vector<string> in_split; // input: sid\tsrc\tref\tpsg
    vector<WordID> ref_ids;  // reference as vector<WordID>
    string sid;              // sentence id of the current input
    if (t == 0) {
      // handling input
      strsplit(in, in_split, '\t', 4);
      if (in_split.size() < 4) {
        // fields 2 and 3 are indexed below; skip lines that lack them
        cerr << "Skipping malformed input line (expected 4 tab-separated fields)." << endl;
        continue;
      }
      // a grammar consisting only of whitespace is unusable: skip the sentence
      // before anything is buffered, so all buffers stay aligned on ii
      bool broken_grammar = true;
      for (string::iterator ti = in_split[3].begin(); ti != in_split[3].end(); ti++) {
        if (!isspace(*ti)) {
          broken_grammar = false;
          break;
        }
      }
      if (broken_grammar) continue;
      // getting reference
      vector<string> ref_tok;
      strsplit(in_split[2], ref_tok, ' ');
      register_and_convert(ref_tok, ref_ids);
      // process and set grammar
      boost::replace_all(in_split[3], " __NEXT__RULE__ ", "\n"); // TODO
      in_split[3] += "\n";
      grammar_buf_out << in_split[3] << DTRAIN_GRAMMAR_DELIM << " " << in_split[0] << endl;
      decoder.SetSentenceGrammarFromString(in_split[3]);
      // buffer everything later epochs need
      sid = in_split[0];
      sid_buf.push_back(sid);
      ref_ids_buf.push_back(ref_ids);
      src_str_buf.push_back(in_split[1]);
      // decode
      decoder.Decode(in_split[1], observer);
    } else {
      // get buffered grammar: all lines up to the next delimiter line
      string grammar_str;
      string rule;
      bool found_delim = false;
      while (getline(grammar_buf_in, rule)) {
        if (boost::starts_with(rule, DTRAIN_GRAMMAR_DELIM)) {
          found_delim = true;
          break;
        }
        grammar_str += rule + "\n";
      }
      if (!found_delim) {
        // without this check a truncated buffer file would loop forever
        cerr << "Grammar buffer file '" << grammar_buf_fn << "' ended unexpectedly, exiting." << endl;
        grammar_buf_in.close();
        unlink(grammar_buf_fn);
        exit(1);
      }
      decoder.SetSentenceGrammarFromString(grammar_str);
      ref_ids = ref_ids_buf[ii];
      sid = sid_buf[ii];
      // decode
      decoder.Decode(src_str_buf[ii], observer);
    }

    vector<ScoredHyp>* samples = observer->GetSamples();

    // (local) scoring
    score_t score = 0.;
    for (unsigned i = 0; i < samples->size(); i++) {
      NgramCounts counts = make_ngram_counts(ref_ids, (*samples)[i].w, N);
      if (scorer_str == "approx_bleu") {
        unsigned hyp_len = 0;
        if (i == 0) { // 'context of 1best translations'
          global_counts += counts;
          global_hyp_len += (*samples)[i].w.size();
          global_ref_len += ref_ids.size();
          counts.reset();
        } else {
          hyp_len = (*samples)[i].w.size();
        }
        NgramCounts _c = global_counts + counts;
        score = .9 * scorer(_c,
                            global_ref_len,
                            global_hyp_len + hyp_len, N, bleu_weights);
      } else {
        score = scorer(counts,
                       ref_ids.size(),
                       (*samples)[i].w.size(), N, bleu_weights);
      }

      (*samples)[i].score = (score);

      // only the first sample contributes to the per-epoch averages
      if (i == 0) {
        score_sum += score;
        model_sum += (*samples)[i].model;
      }

      if (verbose) {
        if (i == 0) cerr << "'" << TD::GetString(ref_ids) << "' [ref]" << endl;
        cerr << _p5 << _np << "[hyp " << i << "] " << "'" << TD::GetString((*samples)[i].w) << "'";
        cerr << " [SCORE=" << score << ",model="<< (*samples)[i].model << "]" << endl;
        cerr << (*samples)[i].f << endl;
      }
    } // sample/scoring loop

    if (verbose) cerr << endl;

//////////////////////////////////////////////////////////
    // UPDATE WEIGHTS
    if (!noup) {
      vector<pair<ScoredHyp,ScoredHyp> > pairs;
      if (pair_sampling == "all")
        sample_all_pairs(samples, pairs);
      if (pair_sampling == "rand")
        sample_rand_pairs(samples, pairs, &rng);

      for (vector<pair<ScoredHyp,ScoredHyp> >::iterator ti = pairs.begin();
            ti != pairs.end(); ti++) {

        SparseVector<double> dv;
        // update only when the first element of the pair scores lower
        if (ti->first.score - ti->second.score < 0) {
          dv = ti->second.f - ti->first.f;
        //} else {
          //dv = ti->first - ti->second;
        //}
          dv.add_value(FD::Convert("__bias"), -1);

          //SparseVector<double> reg;
          //reg = lambdas * (2 * gamma);
          //dv -= reg;
          lambdas += dv * eta;

          if (verbose) {
            /*cerr << "{{ f("<< ti->first_rank <<") > f(" << ti->second_rank << ") but g(i)="<< ti->first_score <<" < g(j)="<< ti->second_score << " so update" << endl;
            cerr << " i " << TD::GetString(samples->sents[ti->first_rank]) << endl;
            cerr << " " << samples->feats[ti->first_rank] << endl;
            cerr << " j " << TD::GetString(samples->sents[ti->second_rank]) << endl;
            cerr << " " << samples->feats[ti->second_rank] << endl;
            cerr << " diff vec: " << dv << endl;
            cerr << " lambdas after update: " << lambdas << endl;
            cerr << "}}" << endl;*/
          }
        } else {
          //SparseVector<double> reg;
          //reg = lambdas * (2 * gamma);
          //lambdas += reg * (-eta);
        }

      }

      //double l2 = lambdas.l2norm();
      //if (l2) lambdas /= lambdas.l2norm();

    }
//////////////////////////////////////////////////////////

    ++ii;

    // use the buffered sid: in_split is only filled during the first epoch
    if (hstreaming) cerr << "reporter:counter:dtrain,sid," << sid << endl;

  } // input loop

  if (t == 0) {
    in_sz = ii; // remember size of input (# lines)
    grammar_buf_out.close();
  } else {
    grammar_buf_in.close();
  }

  // print some stats
  // avoid dividing by zero when no input sentence was processed
  const unsigned n_sents = in_sz > 0 ? in_sz : 1;
  score_t score_avg = score_sum/(score_t)n_sents;
  score_t model_avg = model_sum/(score_t)n_sents;
  score_t score_diff, model_diff;
  if (t > 0) {
    score_diff = score_avg - all_scores[t-1].first;
    model_diff = model_avg - all_scores[t-1].second;
  } else {
    score_diff = score_avg;
    model_diff = model_avg;
  }
  if (!quiet) {
    cerr << _p5 << _p << "WEIGHTS" << endl;
    for (vector<string>::iterator it = print_weights.begin(); it != print_weights.end(); it++) {
      cerr << setw(16) << *it << " = " << lambdas.get(FD::Convert(*it)) << endl;
    }
    cerr << " ---" << endl;
    cerr << _np << " 1best avg score: " << score_avg;
    cerr << _p << " (" << score_diff << ")" << endl;
    cerr << _np << "1best avg model score: " << model_avg;
    cerr << _p << " (" << model_diff << ")" << endl;
  }
  pair<score_t,score_t> remember;
  remember.first = score_avg;
  remember.second = model_avg;
  all_scores.push_back(remember);
  if (score_avg > max_score) {
    max_score = score_avg;
    best_it = t;
  }
  time (&end);
  float time_diff = difftime(end, start);
  overall_time += time_diff;
  if (!quiet) {
    cerr << _p2 << _np << "(time " << time_diff/60. << " min, ";
    cerr << time_diff/(float)n_sents<< " s/S)" << endl;
  }

  if (t+1 != T && !quiet) cerr << endl;

  if (noup) break; // without updates further epochs would repeat the same work

  } // outer loop

  unlink(grammar_buf_fn);

  if (!noup) {
    if (!quiet) cerr << endl << "writing weights file '" << cfg["output"].as<string>() << "' ...";
    if (cfg["output"].as<string>() == "-") {
      // "-" means stdout: one "feature\tvalue" line per non-zero weight
      cout << _p9;
      for (SparseVector<double>::const_iterator ti = lambdas.begin();
            ti != lambdas.end(); ++ti) {
        if (ti->second == 0) continue;
        cout << _np << FD::Convert(ti->first) << "\t" << ti->second << endl;
      }
      if (hstreaming) cout << "__SHARD_COUNT__\t1" << endl;
    } else if (cfg["output"].as<string>() != "VOID") {
      // "VOID" suppresses writing altogether
      weights.InitFromVector(lambdas);
      weights.WriteToFile(cfg["output"].as<string>(), true);
    }
    if (!quiet) cerr << "done" << endl;
  }

  if (!quiet) {
    cerr << _p5 << _np << endl << "---" << endl << "Best iteration: ";
    cerr << best_it+1 << " [SCORE '" << scorer_str << "'=" << max_score << "]." << endl;
    cerr << _p2 << "This took " << overall_time/60. << " min." << endl;
  }

  return 0;
}