diff options
author | Patrick Simianer <p@simianer.de> | 2012-03-13 09:24:47 +0100 |
---|---|---|
committer | Patrick Simianer <p@simianer.de> | 2012-03-13 09:24:47 +0100 |
commit | ef6085e558e26c8819f1735425761103021b6470 (patch) | |
tree | 5cf70e4c48c64d838e1326b5a505c8c4061bff4a /vest/mbr_kbest.cc | |
parent | 10a232656a0c882b3b955d2bcfac138ce11e8a2e (diff) | |
parent | dfbc278c1057555fda9312291c8024049e00b7d8 (diff) |
merge with upstream
Diffstat (limited to 'vest/mbr_kbest.cc')
-rw-r--r-- | vest/mbr_kbest.cc | 138 |
1 files changed, 0 insertions, 138 deletions
diff --git a/vest/mbr_kbest.cc b/vest/mbr_kbest.cc deleted file mode 100644 index 2867b36b..00000000 --- a/vest/mbr_kbest.cc +++ /dev/null @@ -1,138 +0,0 @@ -#include <iostream> -#include <vector> - -#include <boost/program_options.hpp> - -#include "prob.h" -#include "tdict.h" -#include "scorer.h" -#include "filelib.h" -#include "stringlib.h" - -using namespace std; - -namespace po = boost::program_options; - -void InitCommandLine(int argc, char** argv, po::variables_map* conf) { - po::options_description opts("Configuration options"); - opts.add_options() - ("scale,a",po::value<double>()->default_value(1.0), "Posterior scaling factor (alpha)") - ("loss_function,l",po::value<string>()->default_value("bleu"), "Loss function") - ("input,i",po::value<string>()->default_value("-"), "File to read k-best lists from") - ("output_list,L", "Show reranked list as output") - ("help,h", "Help"); - po::options_description dcmdline_options; - dcmdline_options.add(opts); - po::store(parse_command_line(argc, argv, dcmdline_options), *conf); - bool flag = false; - if (flag || conf->count("help")) { - cerr << dcmdline_options << endl; - exit(1); - } -} - -struct LossComparer { - bool operator()(const pair<vector<WordID>, double>& a, const pair<vector<WordID>, double>& b) const { - return a.second < b.second; - } -}; - -bool ReadKBestList(istream* in, string* sent_id, vector<pair<vector<WordID>, prob_t> >* list) { - static string cache_id; - static pair<vector<WordID>, prob_t> cache_pair; - list->clear(); - string cur_id; - if (cache_pair.first.size() > 0) { - list->push_back(cache_pair); - cur_id = cache_id; - cache_pair.first.clear(); - } - string line; - string tstr; - while(*in) { - getline(*in, line); - if (line.empty()) continue; - size_t p1 = line.find(" ||| "); - if (p1 == string::npos) { cerr << "Bad format: " << line << endl; abort(); } - size_t p2 = line.find(" ||| ", p1 + 4); - if (p2 == string::npos) { cerr << "Bad format: " << line << endl; abort(); } - size_t p3 = line.rfind(" ||| "); - cache_id = line.substr(0, p1); - tstr = line.substr(p1 + 5, p2 - p1 - 5); - double val = strtod(line.substr(p3 + 5).c_str(), NULL); - TD::ConvertSentence(tstr, &cache_pair.first); - cache_pair.second.logeq(val); - if (cur_id.empty()) cur_id = cache_id; - if (cur_id == cache_id) { - list->push_back(cache_pair); - *sent_id = cur_id; - cache_pair.first.clear(); - } else { break; } - } - return !list->empty(); -} - -int main(int argc, char** argv) { - po::variables_map conf; - InitCommandLine(argc, argv, &conf); - const string metric = conf["loss_function"].as<string>(); - const bool output_list = conf.count("output_list") > 0; - const string file = conf["input"].as<string>(); - const double mbr_scale = conf["scale"].as<double>(); - cerr << "Posterior scaling factor (alpha) = " << mbr_scale << endl; - - ScoreType type = ScoreTypeFromString(metric); - vector<pair<vector<WordID>, prob_t> > list; - ReadFile rf(file); - string sent_id; - while(ReadKBestList(rf.stream(), &sent_id, &list)) { - vector<prob_t> joints(list.size()); - const prob_t max_score = pow(list.front().second, mbr_scale); - prob_t marginal = prob_t::Zero(); - for (int i = 0 ; i < list.size(); ++i) { - const prob_t joint = pow(list[i].second, mbr_scale) / max_score; - joints[i] = joint; - // cerr << "list[" << i << "] joint=" << log(joint) << endl; - marginal += joint; - } - int mbr_idx = -1; - vector<double> mbr_scores(output_list ? list.size() : 0); - double mbr_loss = numeric_limits<double>::max(); - for (int i = 0 ; i < list.size(); ++i) { - vector<vector<WordID> > refs(1, list[i].first); - //cerr << i << ": " << list[i].second <<"\t" << TD::GetString(list[i].first) << endl; - ScorerP scorer = SentenceScorer::CreateSentenceScorer(type, refs); - double wl_acc = 0; - for (int j = 0; j < list.size(); ++j) { - if (i != j) { - ScoreP s = scorer->ScoreCandidate(list[j].first); - double loss = 1.0 - s->ComputeScore(); - if (type == TER || type == AER) loss = 1.0 - loss; - double weighted_loss = loss * (joints[j] / marginal); - wl_acc += weighted_loss; - if ((!output_list) && wl_acc > mbr_loss) break; - } - } - if (output_list) mbr_scores[i] = wl_acc; - if (wl_acc < mbr_loss) { - mbr_loss = wl_acc; - mbr_idx = i; - } - } - // cerr << "ML translation: " << TD::GetString(list[0].first) << endl; - cerr << "MBR Best idx: " << mbr_idx << endl; - if (output_list) { - for (int i = 0; i < list.size(); ++i) - list[i].second.logeq(mbr_scores[i]); - sort(list.begin(), list.end(), LossComparer()); - for (int i = 0; i < list.size(); ++i) - cout << sent_id << " ||| " - << TD::GetString(list[i].first) << " ||| " - << log(list[i].second) << endl; - } else { - cout << TD::GetString(list[mbr_idx].first) << endl; - } - } - return 0; -} - |