diff options
author | mjdenkowski <michael.j.denkowski@gmail.com> | 2013-08-28 18:07:42 -0400 |
---|---|---|
committer | mjdenkowski <michael.j.denkowski@gmail.com> | 2013-08-28 18:07:42 -0400 |
commit | 6f462d23384b6e42a944feedaf6f37ae7a5b7921 (patch) | |
tree | 5aeb4ef2b9e3e83182d43f89dc637297d6e1bbad | |
parent | 03799a2d330c6dbbe12154d4bcea236210b4f6ed (diff) |
Stream support for MIRA (part of realtime)
-rw-r--r-- | mteval/scorer.cc | 25 | ||||
-rw-r--r-- | mteval/scorer.h | 26 | ||||
-rw-r--r-- | training/mira/kbest_cut_mira.cc | 122 |
3 files changed, 137 insertions, 36 deletions
diff --git a/mteval/scorer.cc b/mteval/scorer.cc index a83b9e2f..ced0cadf 100644 --- a/mteval/scorer.cc +++ b/mteval/scorer.cc @@ -595,6 +595,7 @@ void DocScorer::Init( const vector<string>& ref_files, const string& src_file, bool verbose) { scorers_.clear(); + this->type = type; // TODO stop using valarray, start using ReadFile cerr << "Loading references (" << ref_files.size() << " files)\n"; ReadFile srcrf; @@ -644,3 +645,27 @@ void DocScorer::Init( cerr << "Loaded reference translations for " << scorers_.size() << " sentences.\n"; } +void DocStreamScorer::Init( + const ScoreType type, + const vector<string>& ref_files, + const string& src_file, bool verbose) { + scorers_.clear(); + // AER not supported in stream mode + assert(type != AER); + this->type = type; + vector<vector<WordID> > refs(1); + string src_line; + // Empty reference 0 + TD::ConvertSentence("", &refs[0]); + scorers_.push_back(ScorerP(SentenceScorer::CreateSentenceScorer(type, refs, src_line))); + // Reference 1 starts empty, updated as needed + scorers_.push_back(ScorerP(SentenceScorer::CreateSentenceScorer(type, refs, src_line))); +} + +void DocStreamScorer::update(const std::string& ref) { + scorers_.pop_back(); + vector<vector<WordID> > refs(1); + string src_line; + TD::ConvertSentence(ref, &refs[0]); + scorers_.push_back(ScorerP(SentenceScorer::CreateSentenceScorer(this->type, refs, src_line))); +} diff --git a/mteval/scorer.h b/mteval/scorer.h index 8bd76982..56c39a7d 100644 --- a/mteval/scorer.h +++ b/mteval/scorer.h @@ -83,10 +83,11 @@ class SentenceScorer { //TODO: should be able to GetOne GetZero without supplying sentence (just type) class DocScorer { + friend class DocStreamScorer; public: - ~DocScorer(); + virtual ~DocScorer(); DocScorer() { } - void Init(const ScoreType type, + virtual void Init(const ScoreType type, const std::vector<std::string>& ref_files, const std::string& src_file = "", bool verbose=false @@ -102,9 +103,30 @@ class DocScorer { int size() const { return scorers_.size(); } ScorerP operator[](size_t i) const { return scorers_[i]; } + virtual void update(const std::string& ref) {} private: + ScoreType type; std::vector<ScorerP> scorers_; }; +class DocStreamScorer : public DocScorer { + public: + void Init(const ScoreType type, + const std::vector<std::string>& ref_files, + const std::string& src_file = "", + bool verbose=false + ); + DocStreamScorer(const ScoreType type, + const std::vector<std::string>& ref_files, + const std::string& src_file = "", + bool verbose=false + ) + { + Init(type,ref_files,src_file,verbose); + } + ScorerP operator[](size_t i); + int size(); + void update(const std::string& ref); +}; #endif diff --git a/training/mira/kbest_cut_mira.cc b/training/mira/kbest_cut_mira.cc index 7df9a18f..d8c42db7 100644 --- a/training/mira/kbest_cut_mira.cc +++ b/training/mira/kbest_cut_mira.cc @@ -48,6 +48,7 @@ int hope_select; bool pseudo_doc; bool sent_approx; bool checkloss; +bool stream; void SanityCheck(const vector<double>& w) { for (int i = 0; i < w.size(); ++i) { @@ -99,6 +100,7 @@ bool InitCommandLine(int argc, char** argv, po::variables_map* conf) { ("k_best_size,k", po::value<int>()->default_value(250), "Size of hypothesis list to search for oracles") ("update_k_best,b", po::value<int>()->default_value(1), "Size of good, bad lists to perform update with") ("unique_k_best,u", "Unique k-best translation list") + ("stream,t", "Stream mode (used for realtime)") ("weights_output,O",po::value<string>(),"Directory to write weights to") ("output_dir,D",po::value<string>(),"Directory to place output in") ("decoder_config,c",po::value<string>(),"Decoder configuration file"); @@ -117,7 +119,11 @@ bool InitCommandLine(int argc, char** argv, po::variables_map* conf) { } po::notify(*conf); - if (conf->count("help") || !conf->count("input_weights") || !conf->count("decoder_config") || !conf->count("reference")) { + if (conf->count("help") + || !conf->count("input_weights") + || !conf->count("decoder_config") + || (!conf->count("stream") && (!conf->count("reference") || !conf->count("weights_output") || !conf->count("output_dir"))) + ) { cerr << dcmdline_options << endl; return false; } @@ -321,6 +327,25 @@ struct GoodBadOracle { vector<shared_ptr<HypothesisInfo> > bad; }; +struct BasicObserver: public DecoderObserver { + Hypergraph* hypergraph; + BasicObserver() : hypergraph(NULL) {} + ~BasicObserver() { + if(hypergraph != NULL) delete hypergraph; + } + void NotifyDecodingStart(const SentenceMetadata& smeta) {} + void NotifySourceParseFailure(const SentenceMetadata& smeta) {} + void NotifyTranslationForest(const SentenceMetadata& smeta, Hypergraph* hg) { + if(hypergraph != NULL) delete hypergraph; + hypergraph = new Hypergraph(*hg); + } + void NotifyAlignmentFailure(const SentenceMetadata& semta) { + if(hypergraph != NULL) delete hypergraph; + } + void NotifyAlignmentForest(const SentenceMetadata& smeta, Hypergraph* hg) {} + void NotifyDecodingComplete(const SentenceMetadata& smeta) {} +}; + struct TrainingObserver : public DecoderObserver { TrainingObserver(const int k, const DocScorer& d, vector<GoodBadOracle>* o, vector<ScoreP>* cbs) : ds(d), oracles(*o), corpus_bleu_sent_stats(*cbs), kbest_size(k) { @@ -619,14 +644,15 @@ int main(int argc, char** argv) { no_select = conf.count("no_select"); update_list_size = conf["update_k_best"].as<int>(); unique_kbest = conf.count("unique_k_best"); + stream = conf.count("stream"); pseudo_doc = conf.count("pseudo_doc"); sent_approx = conf.count("sent_approx"); cerr << "Using pseudo-doc:" << pseudo_doc << " Sent:" << sent_approx << endl; if(pseudo_doc) mt_metric_scale=1; - const string weights_dir = conf["weights_output"].as<string>(); - const string output_dir = conf["output_dir"].as<string>(); + const string weights_dir = stream ? "-" : conf["weights_output"].as<string>(); + const string output_dir = stream ? "-" : conf["output_dir"].as<string>(); ScoreType type = ScoreTypeFromString(metric_name); //establish metric used for tuning @@ -636,16 +662,22 @@ int main(int argc, char** argv) { invert_score = false; } - //load references - DocScorer ds(type, conf["reference"].as<vector<string> >(), ""); - cerr << "Loaded " << ds.size() << " references for scoring with " << metric_name << endl; + shared_ptr<DocScorer> ds; + //normal: load references, stream: start stream scorer + if (stream) { + ds = shared_ptr<DocScorer>(new DocStreamScorer(type, vector<string>(0), "")); + cerr << "Scoring doc stream with " << metric_name << endl; + } else { + ds = shared_ptr<DocScorer>(new DocScorer(type, conf["reference"].as<vector<string> >(), "")); + cerr << "Loaded " << ds->size() << " references for scoring with " << metric_name << endl; + } vector<ScoreP> corpus_bleu_sent_stats; //check training pass,if >0, then use previous iterations corpus bleu stats - cur_pass = conf["pass"].as<int>(); + cur_pass = stream ? 0 : conf["pass"].as<int>(); if(cur_pass > 0) { - ReadPastTranslationForScore(cur_pass, &corpus_bleu_sent_stats, ds, output_dir); + ReadPastTranslationForScore(cur_pass, &corpus_bleu_sent_stats, *ds, output_dir); } cerr << "Using optimizer:" << optimizer << endl; @@ -659,7 +691,7 @@ int main(int argc, char** argv) { Weights::InitFromFile(conf["input_weights"].as<string>(), &dense_weights); Weights::InitSparseVector(dense_weights, &lambdas); - const string input = decoder.GetConf()["input"].as<string>(); + const string input = stream ? "-" : decoder.GetConf()["input"].as<string>(); if (!SILENT) cerr << "Reading input from " << ((input == "-") ? "STDIN" : input.c_str()) << endl; ReadFile in_read(input); istream *in = in_read.stream(); @@ -668,9 +700,10 @@ int main(int argc, char** argv) { const double max_step_size = conf["max_step_size"].as<double>(); - vector<GoodBadOracle> oracles(ds.size()); + vector<GoodBadOracle> oracles(ds->size()); - TrainingObserver observer(conf["k_best_size"].as<int>(), ds, &oracles, &corpus_bleu_sent_stats); + BasicObserver bobs; + TrainingObserver observer(conf["k_best_size"].as<int>(), *ds, &oracles, &corpus_bleu_sent_stats); int cur_sent = 0; int lcount = 0; @@ -689,12 +722,30 @@ int main(int argc, char** argv) { while(*in) { getline(*in, buf); if (buf.empty()) continue; + if (stream) { + int delim = buf.find(" ||| "); + // Translate only + if (delim == -1) { + cur_sent = 0; + decoder.SetId(cur_sent); + decoder.Decode(buf, &bobs); + vector<WordID> trans; + ViterbiESentence(bobs.hypergraph[0], &trans); + cout << TD::GetString(trans) << endl; + continue; + // Translate and update (normal MIRA) + } else { + cur_sent = 1; + ds->update(buf.substr(delim + 5)); + buf = buf.substr(0, delim); + } + } //TODO: allow batch updating lambdas.init_vector(&dense_weights); dense_w_local = dense_weights; decoder.SetId(cur_sent); decoder.Decode(buf, &observer); // decode the sentence, calling Notify to get the hope,fear, and model best hyps. - + cur_sent = observer.GetCurrentSent(); cerr << "SENT: " << cur_sent << endl; const HypothesisInfo& cur_hyp = observer.GetCurrentBestHypothesis(); @@ -708,15 +759,15 @@ int main(int argc, char** argv) { tot_loss += cur_hyp.mt_metric; //score hyps to be able to compute corpus level bleu after we finish this iteration through the corpus - ScoreP sentscore = ds[cur_sent]->ScoreCandidate(cur_hyp.hyp); + ScoreP sentscore = (*ds)[cur_sent]->ScoreCandidate(cur_hyp.hyp); if (!acc) { acc = sentscore->GetZero(); } acc->PlusEquals(*sentscore); - ScoreP hope_sentscore = ds[cur_sent]->ScoreCandidate(cur_good.hyp); + ScoreP hope_sentscore = (*ds)[cur_sent]->ScoreCandidate(cur_good.hyp); if (!acc_h) { acc_h = hope_sentscore->GetZero(); } acc_h->PlusEquals(*hope_sentscore); - ScoreP fear_sentscore = ds[cur_sent]->ScoreCandidate(cur_bad.hyp); + ScoreP fear_sentscore = (*ds)[cur_sent]->ScoreCandidate(cur_bad.hyp); if (!acc_f) { acc_f = fear_sentscore->GetZero(); } acc_f->PlusEquals(*fear_sentscore); @@ -915,11 +966,11 @@ int main(int argc, char** argv) { } - if ((cur_sent * 40 / ds.size()) > dots) { ++dots; cerr << '.'; } + if ((cur_sent * 40 / ds->size()) > dots) { ++dots; cerr << '.'; } tot += lambdas; ++lcount; cur_sent++; - + cout << TD::GetString(cur_good_v[0]->hyp) << " ||| " << TD::GetString(cur_best_v[0]->hyp) << " ||| " << TD::GetString(cur_bad_v[0]->hyp) << endl; } @@ -929,24 +980,27 @@ int main(int argc, char** argv) { cerr << "Translated " << lcount << " sentences " << endl; cerr << " [AVG METRIC LAST PASS=" << (tot_loss / lcount) << "]\n"; tot_loss = 0; + + // Write weights unless streaming + if (!stream) { + int node_id = rng->next() * 100000; + cerr << " Writing weights to " << node_id << endl; + Weights::ShowLargestFeatures(dense_weights); + dots = 0; + ostringstream os; + os << weights_dir << "/weights.mira-pass" << (cur_pass < 10 ? "0" : "") << cur_pass << "." << node_id << ".gz"; + string msg = "# MIRA tuned weights ||| " + boost::lexical_cast<std::string>(node_id) + " ||| " + boost::lexical_cast<std::string>(lcount); + lambdas.init_vector(&dense_weights); + Weights::WriteToFile(os.str(), dense_weights, true, &msg); - int node_id = rng->next() * 100000; - cerr << " Writing weights to " << node_id << endl; - Weights::ShowLargestFeatures(dense_weights); - dots = 0; - ostringstream os; - os << weights_dir << "/weights.mira-pass" << (cur_pass < 10 ? "0" : "") << cur_pass << "." << node_id << ".gz"; - string msg = "# MIRA tuned weights ||| " + boost::lexical_cast<std::string>(node_id) + " ||| " + boost::lexical_cast<std::string>(lcount); - lambdas.init_vector(&dense_weights); - Weights::WriteToFile(os.str(), dense_weights, true, &msg); - - SparseVector<double> x = tot; - x /= lcount+1; - ostringstream sa; - string msga = "# MIRA tuned weights AVERAGED ||| " + boost::lexical_cast<std::string>(node_id) + " ||| " + boost::lexical_cast<std::string>(lcount); - sa << weights_dir << "/weights.mira-pass" << (cur_pass < 10 ? "0" : "") << cur_pass << "." << node_id << "-avg.gz"; - x.init_vector(&dense_weights); - Weights::WriteToFile(sa.str(), dense_weights, true, &msga); + SparseVector<double> x = tot; + x /= lcount+1; + ostringstream sa; + string msga = "# MIRA tuned weights AVERAGED ||| " + boost::lexical_cast<std::string>(node_id) + " ||| " + boost::lexical_cast<std::string>(lcount); + sa << weights_dir << "/weights.mira-pass" << (cur_pass < 10 ? "0" : "") << cur_pass << "." << node_id << "-avg.gz"; + x.init_vector(&dense_weights); + Weights::WriteToFile(sa.str(), dense_weights, true, &msga); + } cerr << "Optimization complete.\n"; return 0; |