summaryrefslogtreecommitdiff
path: root/vest
diff options
context:
space:
mode:
Diffstat (limited to 'vest')
-rw-r--r--vest/ces.cc42
-rw-r--r--vest/ces.h10
-rwxr-xr-xvest/dist-vest.pl4
-rw-r--r--vest/error_surface.cc11
-rw-r--r--vest/error_surface.h6
-rw-r--r--vest/line_optimizer.cc20
-rw-r--r--vest/line_optimizer.h2
-rw-r--r--vest/lo_test.cc21
-rw-r--r--vest/mr_vest_map.cc16
-rw-r--r--vest/mr_vest_reduce.cc34
10 files changed, 88 insertions, 78 deletions
diff --git a/vest/ces.cc b/vest/ces.cc
index 4ae6b695..cd89aa69 100644
--- a/vest/ces.cc
+++ b/vest/ces.cc
@@ -4,25 +4,32 @@
#include <sstream>
#include <boost/shared_ptr.hpp>
-#include "aligner.h"
+// TODO, if AER is to be optimized again, we will need this
+// #include "aligner.h"
#include "lattice.h"
#include "viterbi_envelope.h"
#include "error_surface.h"
+#include "ns.h"
using boost::shared_ptr;
using namespace std;
const bool minimize_segments = true; // if adjacent segments have equal scores, merge them
-void ComputeErrorSurface(const SentenceScorer& ss, const ViterbiEnvelope& ve, ErrorSurface* env, const ScoreType type, const Hypergraph& hg) {
+void ComputeErrorSurface(const SegmentEvaluator& ss,
+ const ViterbiEnvelope& ve,
+ ErrorSurface* env,
+ const EvaluationMetric* metric,
+ const Hypergraph& hg) {
vector<WordID> prev_trans;
const vector<shared_ptr<Segment> >& ienv = ve.GetSortedSegs();
env->resize(ienv.size());
- ScoreP prev_score;
+ SufficientStats prev_score; // defaults to 0
int j = 0;
for (int i = 0; i < ienv.size(); ++i) {
const Segment& seg = *ienv[i];
vector<WordID> trans;
+#if 0
if (type == AER) {
vector<bool> edges(hg.edges_.size(), false);
seg.CollectEdgesUsed(&edges); // get the set of edges in the viterbi
@@ -46,34 +53,31 @@ void ComputeErrorSurface(const SentenceScorer& ss, const ViterbiEnvelope& ve, Er
string tstr = os.str();
TD::ConvertSentence(tstr.substr(tstr.rfind(" ||| ") + 5), &trans);
} else {
+#endif
seg.ConstructTranslation(&trans);
- }
- // cerr << "Scoring: " << TD::GetString(trans) << endl;
+ //}
+ //cerr << "Scoring: " << TD::GetString(trans) << endl;
if (trans == prev_trans) {
if (!minimize_segments) {
- assert(prev_score); // if this fails, it means
- // the decoder can generate null translations
ErrorSegment& out = (*env)[j];
- out.delta = prev_score->GetZero();
+ out.delta.fields.clear();
out.x = seg.x;
++j;
}
- // cerr << "Identical translation, skipping scoring\n";
+ //cerr << "Identical translation, skipping scoring\n";
} else {
- ScoreP score = ss.ScoreCandidate(trans);
+ SufficientStats score;
+ ss.Evaluate(trans, &score);
// cerr << "score= " << score->ComputeScore() << "\n";
- ScoreP cur_delta_p = score->GetZero();
- Score* cur_delta = cur_delta_p.get();
- // just record the score diffs
- if (!prev_score)
- prev_score = score->GetZero();
-
- score->Subtract(*prev_score, cur_delta);
+ //string x1; score.Encode(&x1); cerr << "STATS: " << x1 << endl;
+ const SufficientStats delta = score - prev_score;
+ //string x2; delta.Encode(&x2); cerr << "DELTA: " << x2 << endl;
+ //string xx; delta.Encode(&xx); cerr << xx << endl;
prev_trans.swap(trans);
prev_score = score;
- if ((!minimize_segments) || (!cur_delta->IsAdditiveIdentity())) {
+ if ((!minimize_segments) || (!delta.IsAdditiveIdentity())) {
ErrorSegment& out = (*env)[j];
- out.delta = cur_delta_p;
+ out.delta = delta;
out.x = seg.x;
++j;
}
diff --git a/vest/ces.h b/vest/ces.h
index 2f098990..e021e715 100644
--- a/vest/ces.h
+++ b/vest/ces.h
@@ -1,12 +1,16 @@
#ifndef _CES_H_
#define _CES_H_
-#include "scorer.h"
-
class ViterbiEnvelope;
class Hypergraph;
+class SegmentEvaluator;
class ErrorSurface;
+class EvaluationMetric;
-void ComputeErrorSurface(const SentenceScorer& ss, const ViterbiEnvelope& ve, ErrorSurface* es, const ScoreType type, const Hypergraph& hg);
+void ComputeErrorSurface(const SegmentEvaluator& ss,
+ const ViterbiEnvelope& ve,
+ ErrorSurface* es,
+ const EvaluationMetric* metric,
+ const Hypergraph& hg);
#endif
diff --git a/vest/dist-vest.pl b/vest/dist-vest.pl
index c382a972..8cde748b 100755
--- a/vest/dist-vest.pl
+++ b/vest/dist-vest.pl
@@ -364,7 +364,7 @@ while (1){
$mapoutput =~ s/mapinput/mapoutput/;
push @mapoutputs, "$dir/splag.$im1/$mapoutput";
$o2i{"$dir/splag.$im1/$mapoutput"} = "$dir/splag.$im1/$shard";
- my $script = "$MAPPER -s $srcFile -l $metric $refs_comma_sep < $dir/splag.$im1/$shard | sort -t \$'\\t' -k 1 > $dir/splag.$im1/$mapoutput";
+ my $script = "$MAPPER -s $srcFile -m $metric $refs_comma_sep < $dir/splag.$im1/$shard | sort -t \$'\\t' -k 1 > $dir/splag.$im1/$mapoutput";
if ($use_make) {
my $script_file = "$dir/scripts/map.$shard";
open F, ">$script_file" or die "Can't write $script_file: $!";
@@ -424,7 +424,7 @@ while (1){
print STDERR "Results for $tol/$til lines\n";
print STDERR "\nSORTING AND RUNNING VEST REDUCER\n";
print STDERR unchecked_output("date");
- $cmd="sort -t \$'\\t' -k 1 @mapoutputs | $REDUCER -l $metric > $dir/redoutput.$im1";
+ $cmd="sort -t \$'\\t' -k 1 @mapoutputs | $REDUCER -m $metric > $dir/redoutput.$im1";
print STDERR "COMMAND:\n$cmd\n";
check_bash_call($cmd);
$cmd="sort -nk3 $DIR_FLAG '-t|' $dir/redoutput.$im1 | head -1";
diff --git a/vest/error_surface.cc b/vest/error_surface.cc
index 754aa8de..515b67f8 100644
--- a/vest/error_surface.cc
+++ b/vest/error_surface.cc
@@ -5,8 +5,7 @@
using namespace std;
-ErrorSurface::~ErrorSurface() {
-}
+ErrorSurface::~ErrorSurface() {}
void ErrorSurface::Serialize(std::string* out) const {
const int segments = this->size();
@@ -15,8 +14,8 @@ void ErrorSurface::Serialize(std::string* out) const {
for (int i = 0; i < segments; ++i) {
const ErrorSegment& cur = (*this)[i];
string senc;
- cur.delta->Encode(&senc);
- assert(senc.size() < 256);
+ cur.delta.Encode(&senc);
+ assert(senc.size() < 1024);
unsigned char len = senc.size();
os.write((const char*)&cur.x, sizeof(cur.x));
os.write((const char*)&len, sizeof(len));
@@ -25,7 +24,7 @@ void ErrorSurface::Serialize(std::string* out) const {
*out = os.str();
}
-void ErrorSurface::Deserialize(ScoreType type, const std::string& in) {
+void ErrorSurface::Deserialize(const std::string& in) {
istringstream is(in, ios::binary);
int segments;
is.read((char*)&segments, sizeof(segments));
@@ -37,7 +36,7 @@ void ErrorSurface::Deserialize(ScoreType type, const std::string& in) {
is.read((char*)&len, sizeof(len));
string senc(len, '\0'); assert(senc.size() == len);
is.read((char*)&senc[0], len);
- cur.delta = SentenceScorer::CreateScoreFromString(type, senc);
+ cur.delta = SufficientStats(senc);
}
}
diff --git a/vest/error_surface.h b/vest/error_surface.h
index ad728cfa..bb65847b 100644
--- a/vest/error_surface.h
+++ b/vest/error_surface.h
@@ -4,13 +4,13 @@
#include <vector>
#include <string>
-#include "scorer.h"
+#include "ns.h"
class Score;
struct ErrorSegment {
double x;
- ScoreP delta;
+ SufficientStats delta;
ErrorSegment() : x(0), delta() {}
};
@@ -18,7 +18,7 @@ class ErrorSurface : public std::vector<ErrorSegment> {
public:
~ErrorSurface();
void Serialize(std::string* out) const;
- void Deserialize(ScoreType type, const std::string& in);
+ void Deserialize(const std::string& in);
};
#endif
diff --git a/vest/line_optimizer.cc b/vest/line_optimizer.cc
index 7303df8d..49443fbe 100644
--- a/vest/line_optimizer.cc
+++ b/vest/line_optimizer.cc
@@ -4,7 +4,7 @@
#include <algorithm>
#include "sparse_vector.h"
-#include "scorer.h"
+#include "ns.h"
using namespace std;
@@ -18,6 +18,7 @@ struct IntervalComp {
};
double LineOptimizer::LineOptimize(
+ const EvaluationMetric* metric,
const vector<ErrorSurface>& surfaces,
const LineOptimizer::ScoreType type,
float* best_score,
@@ -32,8 +33,7 @@ double LineOptimizer::LineOptimize(
}
sort(all_ints.begin(), all_ints.end(), IntervalComp());
double last_boundary = all_ints.front()->x;
- ScoreP accp = all_ints.front()->delta->GetZero();
- Score *acc=accp.get();
+ SufficientStats acc;
float& cur_best_score = *best_score;
cur_best_score = (type == MAXIMIZE_SCORE ?
-numeric_limits<float>::max() : numeric_limits<float>::max());
@@ -42,9 +42,8 @@ double LineOptimizer::LineOptimize(
for (vector<ErrorIter>::iterator i = all_ints.begin();
i != all_ints.end(); ++i) {
const ErrorSegment& seg = **i;
- assert(seg.delta);
if (seg.x - last_boundary > epsilon) {
- float sco = acc->ComputeScore();
+ float sco = metric->ComputeScore(acc);
if ((type == MAXIMIZE_SCORE && sco > cur_best_score) ||
(type == MINIMIZE_SCORE && sco < cur_best_score) ) {
cur_best_score = sco;
@@ -54,16 +53,18 @@ double LineOptimizer::LineOptimize(
} else {
pos = last_boundary + (seg.x - last_boundary) / 2;
}
- // cerr << "NEW BEST: " << pos << " (score=" << cur_best_score << ")\n";
+ //cerr << "NEW BEST: " << pos << " (score=" << cur_best_score << ")\n";
}
- // string xx; acc->ScoreDetails(&xx); cerr << "---- " << xx;
+ // string xx = metric->DetailedScore(acc); cerr << "---- " << xx;
// cerr << "---- s=" << sco << "\n";
last_boundary = seg.x;
}
// cerr << "x-boundary=" << seg.x << "\n";
- acc->PlusEquals(*seg.delta);
+ //string x2; acc.Encode(&x2); cerr << " ACC: " << x2 << endl;
+ //string x1; seg.delta.Encode(&x1); cerr << " DELTA: " << x1 << endl;
+ acc += seg.delta;
}
- float sco = acc->ComputeScore();
+ float sco = metric->ComputeScore(acc);
if ((type == MAXIMIZE_SCORE && sco > cur_best_score) ||
(type == MINIMIZE_SCORE && sco < cur_best_score) ) {
cur_best_score = sco;
@@ -107,3 +108,4 @@ void LineOptimizer::CreateOptimizationDirections(
RandomUnitVector(features_to_optimize, &out[i], rng);
cerr << "Generated " << out.size() << " total axes to optimize along.\n";
}
+
diff --git a/vest/line_optimizer.h b/vest/line_optimizer.h
index 99a591f4..83819f41 100644
--- a/vest/line_optimizer.h
+++ b/vest/line_optimizer.h
@@ -7,6 +7,7 @@
#include "error_surface.h"
#include "sampler.h"
+class EvaluationMetric;
class Weights;
struct LineOptimizer {
@@ -18,6 +19,7 @@ struct LineOptimizer {
// merge all the error surfaces together into a global
// error surface and find (the middle of) the best segment
static double LineOptimize(
+ const EvaluationMetric* metric,
const std::vector<ErrorSurface>& envs,
const LineOptimizer::ScoreType type,
float* best_score,
diff --git a/vest/lo_test.cc b/vest/lo_test.cc
index f5638600..a67f65e1 100644
--- a/vest/lo_test.cc
+++ b/vest/lo_test.cc
@@ -5,6 +5,8 @@
#include <boost/shared_ptr.hpp>
#include <gtest/gtest.h>
+#include "ns.h"
+#include "ns_docscorer.h"
#include "ces.h"
#include "fdict.h"
#include "hg.h"
@@ -15,7 +17,6 @@
#include "viterbi.h"
#include "viterbi_envelope.h"
#include "line_optimizer.h"
-#include "scorer.h"
using namespace std;
using boost::shared_ptr;
@@ -141,9 +142,6 @@ TEST_F(OptTest, TestS1) {
TD::ConvertSentence(ref22, &refs2[1]);
TD::ConvertSentence(ref32, &refs2[2]);
TD::ConvertSentence(ref42, &refs2[3]);
- ScoreType type = ScoreTypeFromString("ibm_bleu");
- ScorerP scorer1 = SentenceScorer::CreateSentenceScorer(type, refs1);
- ScorerP scorer2 = SentenceScorer::CreateSentenceScorer(type, refs2);
vector<ViterbiEnvelope> envs(2);
RandomNumberGenerator<boost::mt19937> rng;
@@ -167,14 +165,17 @@ TEST_F(OptTest, TestS1) {
envs[1] = Inside<ViterbiEnvelope, ViterbiEnvelopeWeightFunction>(hg2, NULL, wf);
vector<ErrorSurface> es(2);
- ComputeErrorSurface(*scorer1, envs[0], &es[0], IBM_BLEU, hg);
- ComputeErrorSurface(*scorer2, envs[1], &es[1], IBM_BLEU, hg2);
+ EvaluationMetric* metric = EvaluationMetric::Instance("IBM_BLEU");
+ boost::shared_ptr<SegmentEvaluator> scorer1 = metric->CreateSegmentEvaluator(refs1);
+ boost::shared_ptr<SegmentEvaluator> scorer2 = metric->CreateSegmentEvaluator(refs2);
+ ComputeErrorSurface(*scorer1, envs[0], &es[0], metric, hg);
+ ComputeErrorSurface(*scorer2, envs[1], &es[1], metric, hg2);
cerr << envs[0].size() << " " << envs[1].size() << endl;
cerr << es[0].size() << " " << es[1].size() << endl;
envs.clear();
clock_t t_env=clock();
float score;
- double m = LineOptimizer::LineOptimize(es, LineOptimizer::MAXIMIZE_SCORE, &score);
+ double m = LineOptimizer::LineOptimize(metric,es, LineOptimizer::MAXIMIZE_SCORE, &score);
clock_t t_opt=clock();
cerr << "line optimizer returned: " << m << " (SCORE=" << score << ")\n";
EXPECT_FLOAT_EQ(0.48719698, score);
@@ -217,15 +218,15 @@ TEST_F(OptTest,TestZeroOrigin) {
vector<ViterbiEnvelope> envs(1);
envs[0] = Inside<ViterbiEnvelope, ViterbiEnvelopeWeightFunction>(hg, NULL, wf);
- ScoreType type = ScoreTypeFromString("ibm_bleu");
vector<vector<WordID> > mr(4);
TD::ConvertSentence("untitled", &mr[0]);
TD::ConvertSentence("with no title", &mr[1]);
TD::ConvertSentence("without a title", &mr[2]);
TD::ConvertSentence("without title", &mr[3]);
- ScorerP scorer1 = SentenceScorer::CreateSentenceScorer(type, mr);
+ EvaluationMetric* metric = EvaluationMetric::Instance("IBM_BLEU");
+ boost::shared_ptr<SegmentEvaluator> scorer1 = metric->CreateSegmentEvaluator(mr);
vector<ErrorSurface> es(1);
- ComputeErrorSurface(*scorer1, envs[0], &es[0], IBM_BLEU, hg);
+ ComputeErrorSurface(*scorer1, envs[0], &es[0], metric, hg);
}
int main(int argc, char **argv) {
diff --git a/vest/mr_vest_map.cc b/vest/mr_vest_map.cc
index 71dda6d7..8f6e085d 100644
--- a/vest/mr_vest_map.cc
+++ b/vest/mr_vest_map.cc
@@ -6,11 +6,12 @@
#include <boost/program_options.hpp>
#include <boost/program_options/variables_map.hpp>
+#include "ns.h"
+#include "ns_docscorer.h"
#include "ces.h"
#include "filelib.h"
#include "stringlib.h"
#include "sparse_vector.h"
-#include "scorer.h"
#include "viterbi_envelope.h"
#include "inside_outside.h"
#include "error_surface.h"
@@ -25,7 +26,7 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
opts.add_options()
("reference,r",po::value<vector<string> >(), "[REQD] Reference translation (tokenized text)")
("source,s",po::value<string>(), "Source file (ignored, except for AER)")
- ("loss_function,l",po::value<string>()->default_value("ibm_bleu"), "Loss function being optimized")
+ ("evaluation_metric,m",po::value<string>()->default_value("ibm_bleu"), "Evaluation metric being optimized")
("input,i",po::value<string>()->default_value("-"), "Input file to map (- is STDIN)")
("help,h", "Help");
po::options_description dcmdline_options;
@@ -67,10 +68,10 @@ bool ReadSparseVectorString(const string& s, SparseVector<double>* v) {
int main(int argc, char** argv) {
po::variables_map conf;
InitCommandLine(argc, argv, &conf);
- const string loss_function = conf["loss_function"].as<string>();
- ScoreType type = ScoreTypeFromString(loss_function);
- DocScorer ds(type, conf["reference"].as<vector<string> >(), conf["source"].as<string>());
- cerr << "Loaded " << ds.size() << " references for scoring with " << loss_function << endl;
+ const string evaluation_metric = conf["evaluation_metric"].as<string>();
+ EvaluationMetric* metric = EvaluationMetric::Instance(evaluation_metric);
+ DocumentScorer ds(metric, conf["reference"].as<vector<string> >());
+ cerr << "Loaded " << ds.size() << " references for scoring with " << evaluation_metric << endl;
Hypergraph hg;
string last_file;
ReadFile in_read(conf["input"].as<string>());
@@ -97,7 +98,8 @@ int main(int argc, char** argv) {
ViterbiEnvelopeWeightFunction wf(origin, axis);
ViterbiEnvelope ve = Inside<ViterbiEnvelope, ViterbiEnvelopeWeightFunction>(hg, NULL, wf);
ErrorSurface es;
- ComputeErrorSurface(*ds[sent_id], ve, &es, type, hg);
+
+ ComputeErrorSurface(*ds[sent_id], ve, &es, metric, hg);
//cerr << "Viterbi envelope has " << ve.size() << " segments\n";
// cerr << "Error surface has " << es.size() << " segments\n";
string val;
diff --git a/vest/mr_vest_reduce.cc b/vest/mr_vest_reduce.cc
index 3df52020..dda61f88 100644
--- a/vest/mr_vest_reduce.cc
+++ b/vest/mr_vest_reduce.cc
@@ -10,6 +10,7 @@
#include "error_surface.h"
#include "line_optimizer.h"
#include "b64tools.h"
+#include "stringlib.h"
using namespace std;
namespace po = boost::program_options;
@@ -17,12 +18,12 @@ namespace po = boost::program_options;
void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
po::options_description opts("Configuration options");
opts.add_options()
- ("loss_function,l",po::value<string>(), "Loss function being optimized")
+ ("evaluation_metric,m",po::value<string>(), "Evaluation metric (IBM_BLEU, etc.)")
("help,h", "Help");
po::options_description dcmdline_options;
dcmdline_options.add(opts);
po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
- bool flag = conf->count("loss_function") == 0;
+ bool flag = conf->count("evaluation_metric") == 0;
if (flag || conf->count("help")) {
cerr << dcmdline_options << endl;
exit(1);
@@ -32,30 +33,27 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
int main(int argc, char** argv) {
po::variables_map conf;
InitCommandLine(argc, argv, &conf);
- const string loss_function = conf["loss_function"].as<string>();
- ScoreType type = ScoreTypeFromString(loss_function);
+ const string evaluation_metric = conf["evaluation_metric"].as<string>();
LineOptimizer::ScoreType opt_type = LineOptimizer::MAXIMIZE_SCORE;
- if (type == TER || type == AER) {
+ if (UppercaseString(evaluation_metric) == "TER")
opt_type = LineOptimizer::MINIMIZE_SCORE;
- }
- string last_key;
+ EvaluationMetric* metric = EvaluationMetric::Instance(evaluation_metric);
+
vector<ErrorSurface> esv;
- while(cin) {
- string line;
- getline(cin, line);
- if (line.empty()) continue;
+ string last_key, line, key, val;
+ while(getline(cin, line)) {
size_t ks = line.find("\t");
assert(string::npos != ks);
assert(ks > 2);
- string key = line.substr(2, ks - 2);
- string val = line.substr(ks + 1);
+ key = line.substr(2, ks - 2);
+ val = line.substr(ks + 1);
if (key != last_key) {
if (!last_key.empty()) {
float score;
- double x = LineOptimizer::LineOptimize(esv, opt_type, &score);
+ double x = LineOptimizer::LineOptimize(metric, esv, opt_type, &score);
cout << last_key << "|" << x << "|" << score << endl;
}
- last_key = key;
+ last_key.swap(key);
esv.clear();
}
if (val.size() % 4 != 0) {
@@ -68,13 +66,11 @@ int main(int argc, char** argv) {
continue;
}
esv.push_back(ErrorSurface());
- esv.back().Deserialize(type, encoded);
+ esv.back().Deserialize(encoded);
}
if (!esv.empty()) {
- // cerr << "ESV=" << esv.size() << endl;
- // for (int i = 0; i < esv.size(); ++i) { cerr << esv[i].size() << endl; }
float score;
- double x = LineOptimizer::LineOptimize(esv, opt_type, &score);
+ double x = LineOptimizer::LineOptimize(metric, esv, opt_type, &score);
cout << last_key << "|" << x << "|" << score << endl;
}
return 0;