summaryrefslogtreecommitdiff
path: root/vest
diff options
context:
space:
mode:
authorredpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f>2010-08-11 02:37:10 +0000
committerredpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f>2010-08-11 02:37:10 +0000
commita53461650fbdcd3cfe7543d28af9647ac3e5e47e (patch)
treee812756c733b34f9c16894265204acfa9f9998a9 /vest
parent19b59489bb600f438ad96f04ec5d5c5b6616c9c2 (diff)
major refactor, break bad circular deps
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@509 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'vest')
-rw-r--r--vest/Makefile.am30
-rw-r--r--vest/aer_scorer.cc135
-rw-r--r--vest/aer_scorer.h23
-rw-r--r--vest/comb_scorer.cc97
-rw-r--r--vest/comb_scorer.h17
-rw-r--r--vest/fast_score.cc72
-rw-r--r--vest/lo_test.cc5
-rw-r--r--vest/mr_vest_map.cc5
-rw-r--r--vest/mr_vest_reduce.cc2
-rw-r--r--vest/scorer.cc708
-rw-r--r--vest/scorer.h111
-rw-r--r--vest/ter.cc535
-rw-r--r--vest/ter.h19
13 files changed, 16 insertions, 1743 deletions
diff --git a/vest/Makefile.am b/vest/Makefile.am
index abdc8146..b869672b 100644
--- a/vest/Makefile.am
+++ b/vest/Makefile.am
@@ -1,15 +1,12 @@
bin_PROGRAMS = \
- mbr_kbest \
mr_vest_map \
mr_vest_reduce \
mr_vest_generate_mapper_input \
- fast_score \
sentserver \
sentclient
if HAVE_GTEST
noinst_PROGRAMS = \
- scorer_test \
lo_test
endif
@@ -17,25 +14,16 @@ sentserver_SOURCES = sentserver.c
sentclient_SOURCES = sentclient.c
-mbr_kbest_SOURCES = mbr_kbest.cc ter.cc comb_scorer.cc aer_scorer.cc scorer.cc viterbi_envelope.cc
-mbr_kbest_LDADD = $(top_srcdir)/decoder/libcdec.a -lz
+mr_vest_generate_mapper_input_SOURCES = mr_vest_generate_mapper_input.cc line_optimizer.cc
+mr_vest_generate_mapper_input_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz
-fast_score_SOURCES = fast_score.cc ter.cc comb_scorer.cc aer_scorer.cc scorer.cc viterbi_envelope.cc
-fast_score_LDADD = $(top_srcdir)/decoder/libcdec.a -lz
+mr_vest_map_SOURCES = viterbi_envelope.cc ces.cc error_surface.cc mr_vest_map.cc line_optimizer.cc
+mr_vest_map_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz
-mr_vest_generate_mapper_input_SOURCES = mr_vest_generate_mapper_input.cc line_optimizer.cc $(top_srcdir)/decoder/timing_stats.cc
-mr_vest_generate_mapper_input_LDADD = $(top_srcdir)/decoder/libcdec.a -lz
+mr_vest_reduce_SOURCES = error_surface.cc ces.cc mr_vest_reduce.cc line_optimizer.cc viterbi_envelope.cc
+mr_vest_reduce_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz
-mr_vest_map_SOURCES = viterbi_envelope.cc error_surface.cc aer_scorer.cc mr_vest_map.cc scorer.cc ter.cc comb_scorer.cc line_optimizer.cc
-mr_vest_map_LDADD = $(top_srcdir)/decoder/libcdec.a -lz
+lo_test_SOURCES = lo_test.cc ces.cc viterbi_envelope.cc error_surface.cc line_optimizer.cc
+lo_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz
-mr_vest_reduce_SOURCES = error_surface.cc aer_scorer.cc mr_vest_reduce.cc scorer.cc ter.cc comb_scorer.cc line_optimizer.cc viterbi_envelope.cc
-mr_vest_reduce_LDADD = $(top_srcdir)/decoder/libcdec.a -lz
-
-scorer_test_SOURCES = aer_scorer.cc scorer_test.cc scorer.cc ter.cc comb_scorer.cc viterbi_envelope.cc
-scorer_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(top_srcdir)/decoder/libcdec.a -lz
-
-lo_test_SOURCES = lo_test.cc scorer.cc ter.cc aer_scorer.cc comb_scorer.cc viterbi_envelope.cc error_surface.cc line_optimizer.cc
-lo_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(top_srcdir)/decoder/libcdec.a -lz
-
-AM_CPPFLAGS = -W -Wall -Wno-sign-compare $(GTEST_CPPFLAGS) -I$(top_srcdir)/decoder
+AM_CPPFLAGS = -W -Wall -Wno-sign-compare $(GTEST_CPPFLAGS) -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval
diff --git a/vest/aer_scorer.cc b/vest/aer_scorer.cc
deleted file mode 100644
index 25b58b5e..00000000
--- a/vest/aer_scorer.cc
+++ /dev/null
@@ -1,135 +0,0 @@
-#include "aer_scorer.h"
-
-#include <cmath>
-#include <cassert>
-#include <sstream>
-
-#include "tdict.h"
-#include "aligner.h"
-
-using namespace std;
-
-class AERScore : public ScoreBase<AERScore> {
- friend class AERScorer;
- public:
- AERScore() : num_matches(), num_predicted(), num_in_ref() {}
- AERScore(int m, int p, int r) :
- num_matches(m), num_predicted(p), num_in_ref(r) {}
- virtual void PlusPartialEquals(const Score& rhs, int oracle_e_cover, int oracle_f_cover, int src_len){}
- virtual void PlusEquals(const Score& delta, const float scale) {
- const AERScore& other = static_cast<const AERScore&>(delta);
- num_matches += scale*other.num_matches;
- num_predicted += scale*other.num_predicted;
- num_in_ref += scale*other.num_in_ref;
- }
- virtual void PlusEquals(const Score& delta) {
- const AERScore& other = static_cast<const AERScore&>(delta);
- num_matches += other.num_matches;
- num_predicted += other.num_predicted;
- num_in_ref += other.num_in_ref;
- }
-
-
- virtual ScoreP GetZero() const {
- return ScoreP(new AERScore);
- }
- virtual ScoreP GetOne() const {
- return ScoreP(new AERScore);
- }
- virtual void Subtract(const Score& rhs, Score* out) const {
- AERScore* res = static_cast<AERScore*>(out);
- const AERScore& other = static_cast<const AERScore&>(rhs);
- res->num_matches = num_matches - other.num_matches;
- res->num_predicted = num_predicted - other.num_predicted;
- res->num_in_ref = num_in_ref - other.num_in_ref;
- }
- float Precision() const {
- return static_cast<float>(num_matches) / num_predicted;
- }
- float Recall() const {
- return static_cast<float>(num_matches) / num_in_ref;
- }
- float ComputePartialScore() const { return 0.0;}
- virtual float ComputeScore() const {
- const float prec = Precision();
- const float rec = Recall();
- const float f = (2.0 * prec * rec) / (rec + prec);
- if (isnan(f)) return 1.0f;
- return 1.0f - f;
- }
- virtual bool IsAdditiveIdentity() const {
- return (num_matches == 0) && (num_predicted == 0) && (num_in_ref == 0);
- }
- virtual void ScoreDetails(std::string* out) const {
- ostringstream os;
- os << "AER=" << (ComputeScore() * 100.0)
- << " F=" << (100 - ComputeScore() * 100.0)
- << " P=" << (Precision() * 100.0) << " R=" << (Recall() * 100.0)
- << " [" << num_matches << " " << num_predicted << " " << num_in_ref << "]";
- *out = os.str();
- }
- virtual void Encode(std::string*out) const {
- out->resize(sizeof(int) * 3);
- *(int *)&(*out)[sizeof(int) * 0] = num_matches;
- *(int *)&(*out)[sizeof(int) * 1] = num_predicted;
- *(int *)&(*out)[sizeof(int) * 2] = num_in_ref;
- }
- private:
- int num_matches;
- int num_predicted;
- int num_in_ref;
-};
-
-AERScorer::AERScorer(const vector<vector<WordID> >& refs, const string& src) : src_(src) {
- if (refs.size() != 1) {
- cerr << "AERScorer can only take a single reference!\n";
- abort();
- }
- ref_ = AlignerTools::ReadPharaohAlignmentGrid(TD::GetString(refs.front()));
-}
-
-static inline bool Safe(const Array2D<bool>& a, int i, int j) {
- if (i >= 0 && j >= 0 && i < a.width() && j < a.height())
- return a(i,j);
- else
- return false;
-}
-
-ScoreP AERScorer::ScoreCCandidate(const vector<WordID>& shyp) const {
- return ScoreP();
-}
-
-ScoreP AERScorer::ScoreCandidate(const vector<WordID>& shyp) const {
- boost::shared_ptr<Array2D<bool> > hyp =
- AlignerTools::ReadPharaohAlignmentGrid(TD::GetString(shyp));
-
- int m = 0;
- int r = 0;
- int p = 0;
- int i_len = ref_->width();
- int j_len = ref_->height();
- for (int i = 0; i < i_len; ++i) {
- for (int j = 0; j < j_len; ++j) {
- if ((*ref_)(i,j)) {
- ++r;
- if (Safe(*hyp, i, j)) ++m;
- }
- }
- }
- for (int i = 0; i < hyp->width(); ++i)
- for (int j = 0; j < hyp->height(); ++j)
- if ((*hyp)(i,j)) ++p;
-
- return ScoreP(new AERScore(m,p,r));
-}
-
-ScoreP AERScorer::ScoreFromString(const string& in) {
- AERScore* res = new AERScore;
- res->num_matches = *(const int *)&in[sizeof(int) * 0];
- res->num_predicted = *(const int *)&in[sizeof(int) * 1];
- res->num_in_ref = *(const int *)&in[sizeof(int) * 2];
- return ScoreP(res);
-}
-
-const std::string* AERScorer::GetSource() const { return &src_; }
-
diff --git a/vest/aer_scorer.h b/vest/aer_scorer.h
deleted file mode 100644
index 6d53d359..00000000
--- a/vest/aer_scorer.h
+++ /dev/null
@@ -1,23 +0,0 @@
-#ifndef _AER_SCORER_
-#define _AER_SCORER_
-
-#include <boost/shared_ptr.hpp>
-
-#include "scorer.h"
-#include "array2d.h"
-
-class AERScorer : public SentenceScorer {
- public:
- // when constructing alignment strings from a hypergraph, the source
- // is necessary.
- AERScorer(const std::vector<std::vector<WordID> >& refs, const std::string& src = "");
- ScoreP ScoreCandidate(const std::vector<WordID>& hyp) const;
- ScoreP ScoreCCandidate(const std::vector<WordID>& hyp) const;
- static ScoreP ScoreFromString(const std::string& in);
- const std::string* GetSource() const;
- private:
- std::string src_;
- boost::shared_ptr<Array2D<bool> > ref_;
-};
-
-#endif
diff --git a/vest/comb_scorer.cc b/vest/comb_scorer.cc
deleted file mode 100644
index 9fc37868..00000000
--- a/vest/comb_scorer.cc
+++ /dev/null
@@ -1,97 +0,0 @@
-#include "comb_scorer.h"
-
-#include <cstdio>
-
-using namespace std;
-
-class BLEUTERCombinationScore : public ScoreBase<BLEUTERCombinationScore> {
- friend class BLEUTERCombinationScorer;
- public:
- ~BLEUTERCombinationScore();
- float ComputePartialScore() const { return 0.0;}
- float ComputeScore() const {
- return (bleu->ComputeScore() - ter->ComputeScore()) / 2.0f;
- }
- void ScoreDetails(string* details) const {
- char buf[160];
- sprintf(buf, "Combi = %.2f, BLEU = %.2f, TER = %.2f",
- ComputeScore()*100.0f, bleu->ComputeScore()*100.0f, ter->ComputeScore()*100.0f);
- *details = buf;
- }
- void PlusPartialEquals(const Score& rhs, int oracle_e_cover, int oracle_f_cover, int src_len){}
-
- void PlusEquals(const Score& delta, const float scale) {
- bleu->PlusEquals(*static_cast<const BLEUTERCombinationScore&>(delta).bleu, scale);
- ter->PlusEquals(*static_cast<const BLEUTERCombinationScore&>(delta).ter, scale);
- }
- void PlusEquals(const Score& delta) {
- bleu->PlusEquals(*static_cast<const BLEUTERCombinationScore&>(delta).bleu);
- ter->PlusEquals(*static_cast<const BLEUTERCombinationScore&>(delta).ter);
- }
-
-
-
- ScoreP GetOne() const {
- BLEUTERCombinationScore* res = new BLEUTERCombinationScore;
- res->bleu = bleu->GetOne();
- res->ter = ter->GetOne();
- return ScoreP(res);
- }
- ScoreP GetZero() const {
- BLEUTERCombinationScore* res = new BLEUTERCombinationScore;
- res->bleu = bleu->GetZero();
- res->ter = ter->GetZero();
- return ScoreP(res);
- }
- void Subtract(const Score& rhs, Score* res) const {
- bleu->Subtract(*static_cast<const BLEUTERCombinationScore&>(rhs).bleu,
- static_cast<BLEUTERCombinationScore*>(res)->bleu.get());
- ter->Subtract(*static_cast<const BLEUTERCombinationScore&>(rhs).ter,
- static_cast<BLEUTERCombinationScore*>(res)->ter.get());
- }
- void Encode(std::string* out) const {
- string bs, ts;
- bleu->Encode(&bs);
- ter->Encode(&ts);
- out->clear();
- (*out) += static_cast<char>(bs.size());
- (*out) += bs;
- (*out) += ts;
- }
- bool IsAdditiveIdentity() const {
- return bleu->IsAdditiveIdentity() && ter->IsAdditiveIdentity();
- }
- private:
- ScoreP bleu;
- ScoreP ter;
-};
-
-BLEUTERCombinationScore::~BLEUTERCombinationScore() {
-}
-
-BLEUTERCombinationScorer::BLEUTERCombinationScorer(const vector<vector<WordID> >& refs) {
- bleu_ = SentenceScorer::CreateSentenceScorer(IBM_BLEU, refs);
- ter_ = SentenceScorer::CreateSentenceScorer(TER, refs);
-}
-
-BLEUTERCombinationScorer::~BLEUTERCombinationScorer() {
-}
-
-ScoreP BLEUTERCombinationScorer::ScoreCCandidate(const vector<WordID>& hyp) const {
- return ScoreP();
-}
-
-ScoreP BLEUTERCombinationScorer::ScoreCandidate(const std::vector<WordID>& hyp) const {
- BLEUTERCombinationScore* res = new BLEUTERCombinationScore;
- res->bleu = bleu_->ScoreCandidate(hyp);
- res->ter = ter_->ScoreCandidate(hyp);
- return ScoreP(res);
-}
-
-ScoreP BLEUTERCombinationScorer::ScoreFromString(const std::string& in) {
- int bss = in[0];
- BLEUTERCombinationScore* r = new BLEUTERCombinationScore;
- r->bleu = SentenceScorer::CreateScoreFromString(IBM_BLEU, in.substr(1, bss));
- r->ter = SentenceScorer::CreateScoreFromString(TER, in.substr(1 + bss));
- return ScoreP(r);
-}
diff --git a/vest/comb_scorer.h b/vest/comb_scorer.h
deleted file mode 100644
index 346be576..00000000
--- a/vest/comb_scorer.h
+++ /dev/null
@@ -1,17 +0,0 @@
-#ifndef _COMB_SCORER_
-#define _COMB_SCORER_
-
-#include "scorer.h"
-
-class BLEUTERCombinationScorer : public SentenceScorer {
- public:
- BLEUTERCombinationScorer(const std::vector<std::vector<WordID> >& refs);
- ~BLEUTERCombinationScorer();
- ScoreP ScoreCandidate(const std::vector<WordID>& hyp) const;
- ScoreP ScoreCCandidate(const std::vector<WordID>& hyp) const;
- static ScoreP ScoreFromString(const std::string& in);
- private:
- ScorerP bleu_,ter_;
-};
-
-#endif
diff --git a/vest/fast_score.cc b/vest/fast_score.cc
deleted file mode 100644
index 5ee264a6..00000000
--- a/vest/fast_score.cc
+++ /dev/null
@@ -1,72 +0,0 @@
-#include <iostream>
-#include <vector>
-
-#include <boost/program_options.hpp>
-#include <boost/program_options/variables_map.hpp>
-
-#include "filelib.h"
-#include "tdict.h"
-#include "scorer.h"
-
-using namespace std;
-namespace po = boost::program_options;
-
-void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
- po::options_description opts("Configuration options");
- opts.add_options()
- ("reference,r",po::value<vector<string> >(), "[REQD] Reference translation(s) (tokenized text file)")
- ("loss_function,l",po::value<string>()->default_value("ibm_bleu"), "Scoring metric (ibm_bleu, nist_bleu, koehn_bleu, ter, combi)")
- ("in_file,i", po::value<string>()->default_value("-"), "Input file")
- ("help,h", "Help");
- po::options_description dcmdline_options;
- dcmdline_options.add(opts);
- po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
- bool flag = false;
- if (!conf->count("reference")) {
- cerr << "Please specify one or more references using -r <REF1.TXT> -r <REF2.TXT> ...\n";
- flag = true;
- }
- if (flag || conf->count("help")) {
- cerr << dcmdline_options << endl;
- exit(1);
- }
-}
-
-int main(int argc, char** argv) {
- po::variables_map conf;
- InitCommandLine(argc, argv, &conf);
- const string loss_function = conf["loss_function"].as<string>();
- ScoreType type = ScoreTypeFromString(loss_function);
- DocScorer ds(type, conf["reference"].as<vector<string> >(), "");
- cerr << "Loaded " << ds.size() << " references for scoring with " << loss_function << endl;
-
- ReadFile rf(conf["in_file"].as<string>());
- ScoreP acc;
- istream& in = *rf.stream();
- int lc = 0;
- while(in) {
- string line;
- getline(in, line);
- if (line.empty() && !in) break;
- vector<WordID> sent;
- TD::ConvertSentence(line, &sent);
- ScoreP sentscore = ds[lc]->ScoreCandidate(sent);
- if (!acc) { acc = sentscore->GetZero(); }
- acc->PlusEquals(*sentscore);
- ++lc;
- }
- assert(lc > 0);
- if (lc > ds.size()) {
- cerr << "Too many (" << lc << ") translations in input, expected " << ds.size() << endl;
- return 1;
- }
- if (lc != ds.size())
- cerr << "Fewer sentences in hyp (" << lc << ") than refs ("
- << ds.size() << "): scoring partial set!\n";
- float score = acc->ComputeScore();
- string details;
- acc->ScoreDetails(&details);
- cerr << details << endl;
- cout << score << endl;
- return 0;
-}
diff --git a/vest/lo_test.cc b/vest/lo_test.cc
index 577113bb..9200eb34 100644
--- a/vest/lo_test.cc
+++ b/vest/lo_test.cc
@@ -5,6 +5,7 @@
#include <boost/shared_ptr.hpp>
#include <gtest/gtest.h>
+#include "ces.h"
#include "fdict.h"
#include "hg.h"
#include "kbest.h"
@@ -166,8 +167,8 @@ TEST_F(OptTest, TestS1) {
envs[1] = Inside<ViterbiEnvelope, ViterbiEnvelopeWeightFunction>(hg2, NULL, wf);
vector<ErrorSurface> es(2);
- scorer1->ComputeErrorSurface(envs[0], &es[0], IBM_BLEU, hg);
- scorer2->ComputeErrorSurface(envs[1], &es[1], IBM_BLEU, hg2);
+ ComputeErrorSurface(*scorer1, envs[0], &es[0], IBM_BLEU, hg);
+ ComputeErrorSurface(*scorer2, envs[1], &es[1], IBM_BLEU, hg2);
cerr << envs[0].size() << " " << envs[1].size() << endl;
cerr << es[0].size() << " " << es[1].size() << endl;
envs.clear();
diff --git a/vest/mr_vest_map.cc b/vest/mr_vest_map.cc
index b3acc5dd..1506a99f 100644
--- a/vest/mr_vest_map.cc
+++ b/vest/mr_vest_map.cc
@@ -6,6 +6,7 @@
#include <boost/program_options.hpp>
#include <boost/program_options/variables_map.hpp>
+#include "ces.h"
#include "filelib.h"
#include "stringlib.h"
#include "sparse_vector.h"
@@ -13,7 +14,7 @@
#include "viterbi_envelope.h"
#include "inside_outside.h"
#include "error_surface.h"
-#include "hg.h"
+#include "b64tools.h"
#include "hg_io.h"
using namespace std;
@@ -90,7 +91,7 @@ int main(int argc, char** argv) {
ViterbiEnvelopeWeightFunction wf(origin, axis);
ViterbiEnvelope ve = Inside<ViterbiEnvelope, ViterbiEnvelopeWeightFunction>(hg, NULL, wf);
ErrorSurface es;
- ds[sent_id]->ComputeErrorSurface(ve, &es, type, hg);
+ ComputeErrorSurface(*ds[sent_id], ve, &es, type, hg);
//cerr << "Viterbi envelope has " << ve.size() << " segments\n";
// cerr << "Error surface has " << es.size() << " segments\n";
string val;
diff --git a/vest/mr_vest_reduce.cc b/vest/mr_vest_reduce.cc
index 5efcc19a..3df52020 100644
--- a/vest/mr_vest_reduce.cc
+++ b/vest/mr_vest_reduce.cc
@@ -9,7 +9,7 @@
#include "sparse_vector.h"
#include "error_surface.h"
#include "line_optimizer.h"
-#include "hg_io.h"
+#include "b64tools.h"
using namespace std;
namespace po = boost::program_options;
diff --git a/vest/scorer.cc b/vest/scorer.cc
deleted file mode 100644
index 70fdef34..00000000
--- a/vest/scorer.cc
+++ /dev/null
@@ -1,708 +0,0 @@
-#include "scorer.h"
-
-#include <boost/lexical_cast.hpp>
-#include <map>
-#include <sstream>
-#include <iostream>
-#include <fstream>
-#include <cstdio>
-#include <valarray>
-#include <algorithm>
-
-#include <boost/shared_ptr.hpp>
-
-#include "filelib.h"
-#include "aligner.h"
-#include "viterbi_envelope.h"
-#include "error_surface.h"
-#include "ter.h"
-#include "aer_scorer.h"
-#include "comb_scorer.h"
-#include "tdict.h"
-#include "stringlib.h"
-#include "lattice.h"
-
-
-using boost::shared_ptr;
-using namespace std;
-
-const bool minimize_segments = true; // if adjacent segments have equal scores, merge them
-
-void Score::TimesEquals(float scale) {
- cerr<<"UNIMPLEMENTED except for BLEU (for MIRA): Score::TimesEquals"<<endl;abort();
-}
-
-ScoreType ScoreTypeFromString(const string& st) {
- const string sl = LowercaseString(st);
- if (sl == "ser")
- return SER;
- if (sl == "ter")
- return TER;
- if (sl == "aer")
- return AER;
- if (sl == "bleu" || sl == "ibm_bleu")
- return IBM_BLEU;
- if (sl == "ibm_bleu_3")
- return IBM_BLEU_3;
- if (sl == "nist_bleu")
- return NIST_BLEU;
- if (sl == "koehn_bleu")
- return Koehn_BLEU;
- if (sl == "combi")
- return BLEU_minus_TER_over_2;
- cerr << "Don't understand score type '" << st << "', defaulting to ibm_bleu.\n";
- return IBM_BLEU;
-}
-
-static char const* score_names[]={
- "IBM_BLEU", "NIST_BLEU", "Koehn_BLEU", "TER", "BLEU_minus_TER_over_2", "SER", "AER", "IBM_BLEU_3"
-};
-
-std::string StringFromScoreType(ScoreType st) {
- assert(st>=0 && st<sizeof(score_names)/sizeof(score_names[0]));
- return score_names[(int)st];
-}
-
-
-Score::~Score() {}
-SentenceScorer::~SentenceScorer() {}
-
-struct length_accum {
- template <class S>
- float operator()(float sum,S const& ref) const {
- return sum+ref.size();
- }
-};
-
-template <class S>
-float avg_reflength(vector<S> refs) {
- unsigned n=refs.size();
- return n?accumulate(refs.begin(),refs.end(),0.,length_accum())/n:0.;
-}
-
-
-float SentenceScorer::ComputeRefLength(const Sentence &hyp) const {
- return hyp.size(); // reasonable default? :)
-}
-
-const std::string* SentenceScorer::GetSource() const { return NULL; }
-
-class SERScore : public ScoreBase<SERScore> {
- friend class SERScorer;
- public:
- SERScore() : correct(0), total(0) {}
- float ComputePartialScore() const { return 0.0;}
- float ComputeScore() const {
- return static_cast<float>(correct) / static_cast<float>(total);
- }
- void ScoreDetails(string* details) const {
- ostringstream os;
- os << "SER= " << ComputeScore() << " (" << correct << '/' << total << ')';
- *details = os.str();
- }
- void PlusPartialEquals(const Score& /* delta */, int /* oracle_e_cover */, int /* oracle_f_cover */, int /* src_len */){}
-
- void PlusEquals(const Score& delta, const float scale) {
- correct += scale*static_cast<const SERScore&>(delta).correct;
- total += scale*static_cast<const SERScore&>(delta).total;
- }
- void PlusEquals(const Score& delta) {
- correct += static_cast<const SERScore&>(delta).correct;
- total += static_cast<const SERScore&>(delta).total;
- }
- ScoreP GetZero() const { return ScoreP(new SERScore); }
- ScoreP GetOne() const { return ScoreP(new SERScore); }
- void Subtract(const Score& rhs, Score* res) const {
- SERScore* r = static_cast<SERScore*>(res);
- r->correct = correct - static_cast<const SERScore&>(rhs).correct;
- r->total = total - static_cast<const SERScore&>(rhs).total;
- }
- void Encode(string* out) const {
- assert(!"not implemented");
- }
- bool IsAdditiveIdentity() const {
- return (total == 0 && correct == 0); // correct is always 0 <= n <= total
- }
- private:
- int correct, total;
-};
-
-std::string SentenceScorer::verbose_desc() const {
- return desc+",ref0={ "+TD::GetString(refs[0])+" }";
-}
-
-class SERScorer : public SentenceScorer {
- public:
- SERScorer(const vector<vector<WordID> >& references) : SentenceScorer("SERScorer",references),refs_(references) {}
- ScoreP ScoreCCandidate(const vector<WordID>& /* hyp */) const {
- return ScoreP();
- }
- ScoreP ScoreCandidate(const vector<WordID>& hyp) const {
- SERScore* res = new SERScore;
- res->total = 1;
- for (int i = 0; i < refs_.size(); ++i)
- if (refs_[i] == hyp) res->correct = 1;
- return ScoreP(res);
- }
- static ScoreP ScoreFromString(const string& data) {
- assert(!"Not implemented");
- }
- private:
- vector<vector<WordID> > refs_;
-};
-
-class BLEUScore : public ScoreBase<BLEUScore> {
- friend class BLEUScorerBase;
- public:
- BLEUScore(int n) : correct_ngram_hit_counts(float(0),n), hyp_ngram_counts(float(0),n) {
- ref_len = 0;
- hyp_len = 0; }
- BLEUScore(int n, int k) : correct_ngram_hit_counts(float(k),n), hyp_ngram_counts(float(k),n) {
- ref_len = k;
- hyp_len = k; }
- float ComputeScore() const;
- float ComputePartialScore() const;
- void ScoreDetails(string* details) const;
- void TimesEquals(float scale);
- void PlusEquals(const Score& delta);
- void PlusEquals(const Score& delta, const float scale);
- void PlusPartialEquals(const Score& delta, int oracle_e_cover, int oracle_f_cover, int src_len);
- ScoreP GetZero() const;
- ScoreP GetOne() const;
- void Subtract(const Score& rhs, Score* res) const;
- void Encode(string* out) const;
- bool IsAdditiveIdentity() const {
- if (fabs(ref_len) > 0.1f || hyp_len != 0) return false;
- for (int i = 0; i < correct_ngram_hit_counts.size(); ++i)
- if (hyp_ngram_counts[i] != 0 ||
- correct_ngram_hit_counts[i] != 0) return false;
- return true;
- }
- private:
- int N() const {
- return hyp_ngram_counts.size();
- }
- float ComputeScore(vector<float>* precs, float* bp) const;
- float ComputePartialScore(vector<float>* prec, float* bp) const;
- valarray<float> correct_ngram_hit_counts;
- valarray<float> hyp_ngram_counts;
- float ref_len;
- float hyp_len;
-};
-
-class BLEUScorerBase : public SentenceScorer {
- public:
- BLEUScorerBase(const vector<vector<WordID> >& references,
- int n
- );
- ScoreP ScoreCandidate(const vector<WordID>& hyp) const;
- ScoreP ScoreCCandidate(const vector<WordID>& hyp) const;
- static ScoreP ScoreFromString(const string& in);
-
- virtual float ComputeRefLength(const vector<WordID>& hyp) const = 0;
- private:
- struct NGramCompare {
- int operator() (const vector<WordID>& a, const vector<WordID>& b) {
- size_t as = a.size();
- size_t bs = b.size();
- const size_t s = (as < bs ? as : bs);
- for (size_t i = 0; i < s; ++i) {
- int d = a[i] - b[i];
- if (d < 0) return true;
- if (d > 0) return false;
- }
- return as < bs;
- }
- };
- typedef map<vector<WordID>, pair<int,int>, NGramCompare> NGramCountMap;
- void CountRef(const vector<WordID>& ref) {
- NGramCountMap tc;
- vector<WordID> ngram(n_);
- int s = ref.size();
- for (int j=0; j<s; ++j) {
- int remaining = s-j;
- int k = (n_ < remaining ? n_ : remaining);
- ngram.clear();
- for (int i=1; i<=k; ++i) {
- ngram.push_back(ref[j + i - 1]);
- tc[ngram].first++;
- }
- }
- for (NGramCountMap::iterator i = tc.begin(); i != tc.end(); ++i) {
- pair<int,int>& p = ngrams_[i->first];
- if (p.first < i->second.first)
- p = i->second;
- }
- }
-
- void ComputeNgramStats(const vector<WordID>& sent,
- valarray<float>* correct,
- valarray<float>* hyp,
- bool clip_counts)
- const {
- assert(correct->size() == n_);
- assert(hyp->size() == n_);
- vector<WordID> ngram(n_);
- (*correct) *= 0;
- (*hyp) *= 0;
- int s = sent.size();
- for (int j=0; j<s; ++j) {
- int remaining = s-j;
- int k = (n_ < remaining ? n_ : remaining);
- ngram.clear();
- for (int i=1; i<=k; ++i) {
- ngram.push_back(sent[j + i - 1]);
- pair<int,int>& p = ngrams_[ngram];
- if(clip_counts){
- if (p.second < p.first) {
- ++p.second;
- (*correct)[i-1]++;
- }}
- else {
- ++p.second;
- (*correct)[i-1]++;
- }
- // if the 1 gram isn't found, don't try to match don't need to match any 2- 3- .. grams:
- if (!p.first) {
- for (; i<=k; ++i)
- (*hyp)[i-1]++;
- } else {
- (*hyp)[i-1]++;
- }
- }
- }
- }
-
- mutable NGramCountMap ngrams_;
- int n_;
- vector<int> lengths_;
-};
-
-ScoreP BLEUScorerBase::ScoreFromString(const string& in) {
- istringstream is(in);
- int n;
- is >> n;
- BLEUScore* r = new BLEUScore(n);
- is >> r->ref_len >> r->hyp_len;
-
- for (int i = 0; i < n; ++i) {
- is >> r->correct_ngram_hit_counts[i];
- is >> r->hyp_ngram_counts[i];
- }
- return ScoreP(r);
-}
-
-class IBM_BLEUScorer : public BLEUScorerBase {
- public:
- IBM_BLEUScorer(const vector<vector<WordID> >& references,
- int n=4) : BLEUScorerBase(references, n), lengths_(references.size()) {
- for (int i=0; i < references.size(); ++i)
- lengths_[i] = references[i].size();
- }
- float ComputeRefLength(const vector<WordID>& hyp) const {
- if (lengths_.size() == 1) return lengths_[0];
- int bestd = 2000000;
- int hl = hyp.size();
- int bl = -1;
- for (vector<int>::const_iterator ci = lengths_.begin(); ci != lengths_.end(); ++ci) {
- int cl = *ci;
- if (abs(cl - hl) < bestd) {
- bestd = abs(cl - hl);
- bl = cl;
- }
- }
- return bl;
- }
- private:
- vector<int> lengths_;
-};
-
-class NIST_BLEUScorer : public BLEUScorerBase {
- public:
- NIST_BLEUScorer(const vector<vector<WordID> >& references,
- int n=4) : BLEUScorerBase(references, n),
- shortest_(references[0].size()) {
- for (int i=1; i < references.size(); ++i)
- if (references[i].size() < shortest_)
- shortest_ = references[i].size();
- }
- float ComputeRefLength(const vector<WordID>& /* hyp */) const {
- return shortest_;
- }
- private:
- float shortest_;
-};
-
-class Koehn_BLEUScorer : public BLEUScorerBase {
- public:
- Koehn_BLEUScorer(const vector<vector<WordID> >& references,
- int n=4) : BLEUScorerBase(references, n),
- avg_(0) {
- for (int i=0; i < references.size(); ++i)
- avg_ += references[i].size();
- avg_ /= references.size();
- }
- float ComputeRefLength(const vector<WordID>& /* hyp */) const {
- return avg_;
- }
- private:
- float avg_;
-};
-
-ScorerP SentenceScorer::CreateSentenceScorer(const ScoreType type,
- const vector<vector<WordID> >& refs,
- const string& src)
-{
- SentenceScorer *r=0;
- switch (type) {
- case IBM_BLEU: r = new IBM_BLEUScorer(refs, 4);break;
- case IBM_BLEU_3 : r = new IBM_BLEUScorer(refs,3);break;
- case NIST_BLEU: r = new NIST_BLEUScorer(refs, 4);break;
- case Koehn_BLEU: r = new Koehn_BLEUScorer(refs, 4);break;
- case AER: r = new AERScorer(refs, src);break;
- case TER: r = new TERScorer(refs);break;
- case SER: r = new SERScorer(refs);break;
- case BLEU_minus_TER_over_2: r = new BLEUTERCombinationScorer(refs);break;
- default:
- assert(!"Not implemented!");
- }
- return ScorerP(r);
-}
-
-ScoreP SentenceScorer::GetOne() const {
- Sentence s;
- return ScoreCCandidate(s)->GetOne();
-}
-
-ScoreP SentenceScorer::GetZero() const {
- Sentence s;
- return ScoreCCandidate(s)->GetZero();
-}
-
-ScoreP Score::GetOne(ScoreType type) {
- std::vector<SentenceScorer::Sentence > refs;
- return SentenceScorer::CreateSentenceScorer(type,refs)->GetOne();
-}
-
-ScoreP Score::GetZero(ScoreType type) {
- std::vector<SentenceScorer::Sentence > refs;
- return SentenceScorer::CreateSentenceScorer(type,refs)->GetZero();
-}
-
-
-ScoreP SentenceScorer::CreateScoreFromString(const ScoreType type, const string& in) {
- switch (type) {
- case IBM_BLEU:
- case IBM_BLEU_3:
- case NIST_BLEU:
- case Koehn_BLEU:
- return BLEUScorerBase::ScoreFromString(in);
- case TER:
- return TERScorer::ScoreFromString(in);
- case AER:
- return AERScorer::ScoreFromString(in);
- case SER:
- return SERScorer::ScoreFromString(in);
- case BLEU_minus_TER_over_2:
- return BLEUTERCombinationScorer::ScoreFromString(in);
- default:
- assert(!"Not implemented!");
- }
-}
-
-void SentenceScorer::ComputeErrorSurface(const ViterbiEnvelope& ve, ErrorSurface* env, const ScoreType type, const Hypergraph& hg) const {
- vector<WordID> prev_trans;
- const vector<shared_ptr<Segment> >& ienv = ve.GetSortedSegs();
- env->resize(ienv.size());
- ScoreP prev_score;
- int j = 0;
- for (int i = 0; i < ienv.size(); ++i) {
- const Segment& seg = *ienv[i];
- vector<WordID> trans;
- if (type == AER) {
- vector<bool> edges(hg.edges_.size(), false);
- seg.CollectEdgesUsed(&edges); // get the set of edges in the viterbi
- // alignment
- ostringstream os;
- const string* psrc = this->GetSource();
- if (psrc == NULL) {
- cerr << "AER scoring in VEST requires source, but it is missing!\n";
- abort();
- }
- size_t pos = psrc->rfind(" ||| ");
- if (pos == string::npos) {
- cerr << "Malformed source for AER: expected |||\nINPUT: " << *psrc << endl;
- abort();
- }
- Lattice src;
- Lattice ref;
- LatticeTools::ConvertTextOrPLF(psrc->substr(0, pos), &src);
- LatticeTools::ConvertTextOrPLF(psrc->substr(pos + 5), &ref);
- AlignerTools::WriteAlignment(src, ref, hg, &os, true, &edges);
- string tstr = os.str();
- TD::ConvertSentence(tstr.substr(tstr.rfind(" ||| ") + 5), &trans);
- } else {
- seg.ConstructTranslation(&trans);
- }
- // cerr << "Scoring: " << TD::GetString(trans) << endl;
- if (trans == prev_trans) {
- if (!minimize_segments) {
- assert(prev_score); // if this fails, it means
- // the decoder can generate null translations
- ErrorSegment& out = (*env)[j];
- out.delta = prev_score->GetZero();
- out.x = seg.x;
- ++j;
- }
- // cerr << "Identical translation, skipping scoring\n";
- } else {
- ScoreP score = ScoreCandidate(trans);
- // cerr << "score= " << score->ComputeScore() << "\n";
- ScoreP cur_delta_p = score->GetZero();
- Score* cur_delta = cur_delta_p.get();
- // just record the score diffs
- if (!prev_score)
- prev_score = score->GetZero();
-
- score->Subtract(*prev_score, cur_delta);
- prev_trans.swap(trans);
- prev_score = score;
- if ((!minimize_segments) || (!cur_delta->IsAdditiveIdentity())) {
- ErrorSegment& out = (*env)[j];
- out.delta = cur_delta_p;
- out.x = seg.x;
- ++j;
- }
- }
- }
- // cerr << " In segments: " << ienv.size() << endl;
- // cerr << "Out segments: " << j << endl;
- assert(j > 0);
- env->resize(j);
-}
-
-void BLEUScore::ScoreDetails(string* details) const {
- char buf[2000];
- vector<float> precs(max(N(),4));
- float bp;
- float bleu = ComputeScore(&precs, &bp);
- for (int i=N();i<4;++i)
- precs[i]=0.;
- char *bufn;
- bufn=buf+sprintf(buf, "BLEU = %.2f, %.1f|%.1f|%.1f|%.1f (brev=%.3f)",
- bleu*100.0,
- precs[0]*100.0,
- precs[1]*100.0,
- precs[2]*100.0,
- precs[3]*100.0,
- bp);
- *details = buf;
-}
-
-float BLEUScore::ComputeScore(vector<float>* precs, float* bp) const {
- float log_bleu = 0;
- if (precs) precs->clear();
- int count = 0;
- for (int i = 0; i < N(); ++i) {
- if (hyp_ngram_counts[i] > 0) {
- float lprec = log(correct_ngram_hit_counts[i]) - log(hyp_ngram_counts[i]);
- if (precs) precs->push_back(exp(lprec));
- log_bleu += lprec;
- ++count;
- }
- }
- log_bleu /= static_cast<float>(count);
- float lbp = 0.0;
- if (hyp_len < ref_len)
- lbp = (hyp_len - ref_len) / hyp_len;
- log_bleu += lbp;
- if (bp) *bp = exp(lbp);
- return exp(log_bleu);
-}
-
-
-//comptue scaled score for oracle retrieval
-float BLEUScore::ComputePartialScore(vector<float>* precs, float* bp) const {
- // cerr << "Then here " << endl;
- float log_bleu = 0;
- if (precs) precs->clear();
- int count = 0;
- for (int i = 0; i < N(); ++i) {
- // cerr << "In CPS " << hyp_ngram_counts[i] << " " << correct_ngram_hit_counts[i] << endl;
- if (hyp_ngram_counts[i] > 0) {
- float lprec = log(correct_ngram_hit_counts[i]) - log(hyp_ngram_counts[i]);
- if (precs) precs->push_back(exp(lprec));
- log_bleu += lprec;
- ++count;
- }
- }
- log_bleu /= static_cast<float>(count);
- float lbp = 0.0;
- if (hyp_len < ref_len)
- lbp = (hyp_len - ref_len) / hyp_len;
- log_bleu += lbp;
- if (bp) *bp = exp(lbp);
- return exp(log_bleu);
-}
-
-float BLEUScore::ComputePartialScore() const {
- // cerr << "In here first " << endl;
- return ComputePartialScore(NULL, NULL);
-}
-
-float BLEUScore::ComputeScore() const {
- return ComputeScore(NULL, NULL);
-}
-
-void BLEUScore::Subtract(const Score& rhs, Score* res) const {
- const BLEUScore& d = static_cast<const BLEUScore&>(rhs);
- BLEUScore* o = static_cast<BLEUScore*>(res);
- o->ref_len = ref_len - d.ref_len;
- o->hyp_len = hyp_len - d.hyp_len;
- o->correct_ngram_hit_counts = correct_ngram_hit_counts - d.correct_ngram_hit_counts;
- o->hyp_ngram_counts = hyp_ngram_counts - d.hyp_ngram_counts;
-}
-
-void BLEUScore::PlusEquals(const Score& delta) {
- const BLEUScore& d = static_cast<const BLEUScore&>(delta);
- correct_ngram_hit_counts += d.correct_ngram_hit_counts;
- hyp_ngram_counts += d.hyp_ngram_counts;
- ref_len += d.ref_len;
- hyp_len += d.hyp_len;
-}
-
-void BLEUScore::TimesEquals(float scale) {
- correct_ngram_hit_counts *= scale;
- hyp_ngram_counts *= scale;
- ref_len *= scale;
- hyp_len *= scale;
-}
-
-void BLEUScore::PlusEquals(const Score& delta, const float scale) {
- const BLEUScore& d = static_cast<const BLEUScore&>(delta);
- correct_ngram_hit_counts = correct_ngram_hit_counts + (d.correct_ngram_hit_counts * scale);
- hyp_ngram_counts = hyp_ngram_counts + (d.hyp_ngram_counts * scale);
- ref_len = ref_len + (d.ref_len * scale);
- hyp_len = hyp_len + (d.hyp_len * scale);
-}
-
-void BLEUScore::PlusPartialEquals(const Score& delta, int oracle_e_cover, int oracle_f_cover, int src_len){
- const BLEUScore& d = static_cast<const BLEUScore&>(delta);
- correct_ngram_hit_counts += d.correct_ngram_hit_counts;
- hyp_ngram_counts += d.hyp_ngram_counts;
- //scale the reference length according to the size of the input sentence covered by this rule
-
- ref_len *= (float)oracle_f_cover / src_len;
- ref_len += d.ref_len;
-
- hyp_len = oracle_e_cover;
- hyp_len += d.hyp_len;
-}
-
-
-ScoreP BLEUScore::GetZero() const {
- return ScoreP(new BLEUScore(N()));
-}
-
-ScoreP BLEUScore::GetOne() const {
- return ScoreP(new BLEUScore(N(),1));
-}
-
-
-void BLEUScore::Encode(string* out) const {
- ostringstream os;
- const int n = correct_ngram_hit_counts.size();
- os << n << ' ' << ref_len << ' ' << hyp_len;
- for (int i = 0; i < n; ++i)
- os << ' ' << correct_ngram_hit_counts[i] << ' ' << hyp_ngram_counts[i];
- *out = os.str();
-}
-
-BLEUScorerBase::BLEUScorerBase(const vector<vector<WordID> >& references,
- int n) : SentenceScorer("BLEU"+boost::lexical_cast<string>(n),references),n_(n) {
- for (vector<vector<WordID> >::const_iterator ci = references.begin();
- ci != references.end(); ++ci) {
- lengths_.push_back(ci->size());
- CountRef(*ci);
- }
-}
-
-ScoreP BLEUScorerBase::ScoreCandidate(const vector<WordID>& hyp) const {
- BLEUScore* bs = new BLEUScore(n_);
- for (NGramCountMap::iterator i=ngrams_.begin(); i != ngrams_.end(); ++i)
- i->second.second = 0;
- ComputeNgramStats(hyp, &bs->correct_ngram_hit_counts, &bs->hyp_ngram_counts, true);
- bs->ref_len = ComputeRefLength(hyp);
- bs->hyp_len = hyp.size();
- return ScoreP(bs);
-}
-
-ScoreP BLEUScorerBase::ScoreCCandidate(const vector<WordID>& hyp) const {
- BLEUScore* bs = new BLEUScore(n_);
- for (NGramCountMap::iterator i=ngrams_.begin(); i != ngrams_.end(); ++i)
- i->second.second = 0;
- bool clip = false;
- ComputeNgramStats(hyp, &bs->correct_ngram_hit_counts, &bs->hyp_ngram_counts,clip);
- bs->ref_len = ComputeRefLength(hyp);
- bs->hyp_len = hyp.size();
- return ScoreP(bs);
-}
-
-
-DocScorer::~DocScorer() {
-}
-
-void DocScorer::Init(
- const ScoreType type,
- const vector<string>& ref_files,
- const string& src_file, bool verbose) {
- scorers_.clear();
- // TODO stop using valarray, start using ReadFile
- cerr << "Loading references (" << ref_files.size() << " files)\n";
- ReadFile srcrf;
- if (type == AER && src_file.size() > 0) {
- cerr << " (source=" << src_file << ")\n";
- srcrf.Init(src_file);
- }
- std::vector<ReadFile> ifs(ref_files.begin(),ref_files.end());
- for (int i=0; i < ref_files.size(); ++i) ifs[i].Init(ref_files[i]);
- char buf[64000];
- bool expect_eof = false;
- int line=0;
- while (ifs[0].get()) {
- vector<vector<WordID> > refs(ref_files.size());
- for (int i=0; i < ref_files.size(); ++i) {
- istream &in=ifs[i].get();
- if (in.eof()) break;
- in.getline(buf, 64000);
- refs[i].clear();
- if (strlen(buf) == 0) {
- if (in.eof()) {
- if (!expect_eof) {
- assert(i == 0);
- expect_eof = true;
- }
- break;
- }
- } else {
- TD::ConvertSentence(buf, &refs[i]);
- assert(!refs[i].empty());
- }
- assert(!expect_eof);
- }
- if (!expect_eof) {
- string src_line;
- if (srcrf) {
- getline(srcrf.get(), src_line);
- map<string,string> dummy;
- ProcessAndStripSGML(&src_line, &dummy);
- }
- scorers_.push_back(ScorerP(SentenceScorer::CreateSentenceScorer(type, refs, src_line)));
- if (verbose)
- cerr<<"doc_scorer["<<line<<"] = "<<scorers_.back()->verbose_desc()<<endl;
- ++line;
- }
- }
- cerr << "Loaded reference translations for " << scorers_.size() << " sentences.\n";
-}
-
diff --git a/vest/scorer.h b/vest/scorer.h
deleted file mode 100644
index 0c8b380f..00000000
--- a/vest/scorer.h
+++ /dev/null
@@ -1,111 +0,0 @@
-#ifndef SCORER_H_
-#define SCORER_H_
-#include <vector>
-#include <string>
-#include <boost/shared_ptr.hpp>
-//TODO: use intrusive shared_ptr in Score (because there are many of them on ErrorSurfaces)
-#include "wordid.h"
-#include "intrusive_refcount.hpp"
-
-class Score;
-class SentenceScorer;
-typedef boost::intrusive_ptr<Score> ScoreP;
-typedef boost::shared_ptr<SentenceScorer> ScorerP;
-
-class ViterbiEnvelope;
-class ErrorSurface;
-class Hypergraph; // needed for alignment
-
-//TODO: BLEU N (N separate arg, not part of enum)?
-enum ScoreType { IBM_BLEU, NIST_BLEU, Koehn_BLEU, TER, BLEU_minus_TER_over_2, SER, AER, IBM_BLEU_3 };
-ScoreType ScoreTypeFromString(const std::string& st);
-std::string StringFromScoreType(ScoreType st);
-
-class Score : public boost::intrusive_refcount<Score> {
- public:
- virtual ~Score();
- virtual float ComputeScore() const = 0;
- virtual float ComputePartialScore() const =0;
- virtual void ScoreDetails(std::string* details) const = 0;
- std::string ScoreDetails() {
- std::string d;
- ScoreDetails(&d);
- return d;
- }
- virtual void TimesEquals(float scale); // only for bleu; for mira oracle
- /// same as rhs.TimesEquals(scale);PlusEquals(rhs) except doesn't modify rhs.
- virtual void PlusEquals(const Score& rhs, const float scale) = 0;
- virtual void PlusEquals(const Score& rhs) = 0;
- virtual void PlusPartialEquals(const Score& rhs, int oracle_e_cover, int oracle_f_cover, int src_len) = 0;
- virtual void Subtract(const Score& rhs, Score *res) const = 0;
- virtual ScoreP GetZero() const = 0;
- virtual ScoreP GetOne() const = 0;
- virtual bool IsAdditiveIdentity() const = 0; // returns true if adding this delta
- // to another score results in no score change
- // under any circumstances
- virtual void Encode(std::string* out) const = 0;
- static ScoreP GetZero(ScoreType type);
- static ScoreP GetOne(ScoreType type);
- virtual ScoreP Clone() const = 0;
-protected:
- Score() { } // we define these explicitly because refcount is noncopyable
- Score(Score const& o) { }
-};
-
-//TODO: make sure default copy ctors for score types do what we want.
-template <class Derived>
-struct ScoreBase : public Score {
- ScoreP Clone() const {
- return ScoreP(new Derived(dynamic_cast<Derived const&>(*this)));
- }
-};
-
-class SentenceScorer {
- public:
- typedef std::vector<WordID> Sentence;
- typedef std::vector<Sentence> Sentences;
- std::string desc;
- Sentences refs;
- SentenceScorer(std::string desc="SentenceScorer_unknown", Sentences const& refs=Sentences()) : desc(desc),refs(refs) { }
- std::string verbose_desc() const;
- virtual float ComputeRefLength(const Sentence& hyp) const; // default: avg of refs.length
- virtual ~SentenceScorer();
- virtual ScoreP GetOne() const;
- virtual ScoreP GetZero() const;
- void ComputeErrorSurface(const ViterbiEnvelope& ve, ErrorSurface* es, const ScoreType type, const Hypergraph& hg) const;
- virtual ScoreP ScoreCandidate(const Sentence& hyp) const = 0;
- virtual ScoreP ScoreCCandidate(const Sentence& hyp) const =0;
- virtual const std::string* GetSource() const;
- static ScoreP CreateScoreFromString(const ScoreType type, const std::string& in);
- static ScorerP CreateSentenceScorer(const ScoreType type,
- const std::vector<Sentence >& refs,
- const std::string& src = "");
-};
-
-//TODO: should be able to GetOne GetZero without supplying sentence (just type)
-class DocScorer {
- public:
- ~DocScorer();
- DocScorer() { }
- void Init(const ScoreType type,
- const std::vector<std::string>& ref_files,
- const std::string& src_file = "",
- bool verbose=false
- );
- DocScorer(const ScoreType type,
- const std::vector<std::string>& ref_files,
- const std::string& src_file = "",
- bool verbose=false
- )
- {
- Init(type,ref_files,src_file,verbose);
- }
-
- int size() const { return scorers_.size(); }
- ScorerP operator[](size_t i) const { return scorers_[i]; }
- private:
- std::vector<ScorerP> scorers_;
-};
-
-
-#endif
diff --git a/vest/ter.cc b/vest/ter.cc
deleted file mode 100644
index cacc5b00..00000000
--- a/vest/ter.cc
+++ /dev/null
@@ -1,535 +0,0 @@
-#include "ter.h"
-
-#include <cstdio>
-#include <cassert>
-#include <iostream>
-#include <limits>
-#include <sstream>
-#include <tr1/unordered_map>
-#include <set>
-#include <valarray>
-#include <boost/functional/hash.hpp>
-#include <stdexcept>
-#include "tdict.h"
-
-const bool ter_use_average_ref_len = true;
-const int ter_short_circuit_long_sentences = -1;
-
-using namespace std;
-using namespace std::tr1;
-
-struct COSTS {
- static const float substitution;
- static const float deletion;
- static const float insertion;
- static const float shift;
-};
-const float COSTS::substitution = 1.0f;
-const float COSTS::deletion = 1.0f;
-const float COSTS::insertion = 1.0f;
-const float COSTS::shift = 1.0f;
-
-static const int MAX_SHIFT_SIZE = 10;
-static const int MAX_SHIFT_DIST = 50;
-
-struct Shift {
- unsigned int d_;
- Shift() : d_() {}
- Shift(int b, int e, int m) : d_() {
- begin(b);
- end(e);
- moveto(m);
- }
- inline int begin() const {
- return d_ & 0x3ff;
- }
- inline int end() const {
- return (d_ >> 10) & 0x3ff;
- }
- inline int moveto() const {
- int m = (d_ >> 20) & 0x7ff;
- if (m > 1024) { m -= 1024; m *= -1; }
- return m;
- }
- inline void begin(int b) {
- d_ &= 0xfffffc00u;
- d_ |= (b & 0x3ff);
- }
- inline void end(int e) {
- d_ &= 0xfff003ffu;
- d_ |= (e & 0x3ff) << 10;
- }
- inline void moveto(int m) {
- bool neg = (m < 0);
- if (neg) { m *= -1; m += 1024; }
- d_ &= 0xfffff;
- d_ |= (m & 0x7ff) << 20;
- }
-};
-
-class TERScorerImpl {
-
- public:
- enum TransType { MATCH, SUBSTITUTION, INSERTION, DELETION };
-
- explicit TERScorerImpl(const vector<WordID>& ref) : ref_(ref) {
- for (int i = 0; i < ref.size(); ++i)
- rwexists_.insert(ref[i]);
- }
-
- float Calculate(const vector<WordID>& hyp, int* subs, int* ins, int* dels, int* shifts) const {
- return CalculateAllShifts(hyp, subs, ins, dels, shifts);
- }
-
- inline int GetRefLength() const {
- return ref_.size();
- }
-
- private:
- vector<WordID> ref_;
- set<WordID> rwexists_;
-
- typedef unordered_map<vector<WordID>, set<int>, boost::hash<vector<WordID> > > NgramToIntsMap;
- mutable NgramToIntsMap nmap_;
-
- static float MinimumEditDistance(
- const vector<WordID>& hyp,
- const vector<WordID>& ref,
- vector<TransType>* path) {
- vector<vector<TransType> > bmat(hyp.size() + 1, vector<TransType>(ref.size() + 1, MATCH));
- vector<vector<float> > cmat(hyp.size() + 1, vector<float>(ref.size() + 1, 0));
- for (int i = 0; i <= hyp.size(); ++i)
- cmat[i][0] = i;
- for (int j = 0; j <= ref.size(); ++j)
- cmat[0][j] = j;
- for (int i = 1; i <= hyp.size(); ++i) {
- const WordID& hw = hyp[i-1];
- for (int j = 1; j <= ref.size(); ++j) {
- const WordID& rw = ref[j-1];
- float& cur_c = cmat[i][j];
- TransType& cur_b = bmat[i][j];
-
- if (rw == hw) {
- cur_c = cmat[i-1][j-1];
- cur_b = MATCH;
- } else {
- cur_c = cmat[i-1][j-1] + COSTS::substitution;
- cur_b = SUBSTITUTION;
- }
- float cwoi = cmat[i-1][j];
- if (cur_c > cwoi + COSTS::insertion) {
- cur_c = cwoi + COSTS::insertion;
- cur_b = INSERTION;
- }
- float cwod = cmat[i][j-1];
- if (cur_c > cwod + COSTS::deletion) {
- cur_c = cwod + COSTS::deletion;
- cur_b = DELETION;
- }
- }
- }
-
- // trace back along the best path and record the transition types
- path->clear();
- int i = hyp.size();
- int j = ref.size();
- while (i > 0 || j > 0) {
- if (j == 0) {
- --i;
- path->push_back(INSERTION);
- } else if (i == 0) {
- --j;
- path->push_back(DELETION);
- } else {
- TransType t = bmat[i][j];
- path->push_back(t);
- switch (t) {
- case SUBSTITUTION:
- case MATCH:
- --i; --j; break;
- case INSERTION:
- --i; break;
- case DELETION:
- --j; break;
- }
- }
- }
- reverse(path->begin(), path->end());
- return cmat[hyp.size()][ref.size()];
- }
-
- void BuildWordMatches(const vector<WordID>& hyp, NgramToIntsMap* nmap) const {
- nmap->clear();
- set<WordID> exists_both;
- for (int i = 0; i < hyp.size(); ++i)
- if (rwexists_.find(hyp[i]) != rwexists_.end())
- exists_both.insert(hyp[i]);
- for (int start=0; start<ref_.size(); ++start) {
- if (exists_both.find(ref_[start]) == exists_both.end()) continue;
- vector<WordID> cp;
- int mlen = min(MAX_SHIFT_SIZE, static_cast<int>(ref_.size() - start));
- for (int len=0; len<mlen; ++len) {
- if (len && exists_both.find(ref_[start + len]) == exists_both.end()) break;
- cp.push_back(ref_[start + len]);
- (*nmap)[cp].insert(start);
- }
- }
- }
-
- static void PerformShift(const vector<WordID>& in,
- int start, int end, int moveto, vector<WordID>* out) {
- // cerr << "ps: " << start << " " << end << " " << moveto << endl;
- out->clear();
- if (moveto == -1) {
- for (int i = start; i <= end; ++i)
- out->push_back(in[i]);
- for (int i = 0; i < start; ++i)
- out->push_back(in[i]);
- for (int i = end+1; i < in.size(); ++i)
- out->push_back(in[i]);
- } else if (moveto < start) {
- for (int i = 0; i <= moveto; ++i)
- out->push_back(in[i]);
- for (int i = start; i <= end; ++i)
- out->push_back(in[i]);
- for (int i = moveto+1; i < start; ++i)
- out->push_back(in[i]);
- for (int i = end+1; i < in.size(); ++i)
- out->push_back(in[i]);
- } else if (moveto > end) {
- for (int i = 0; i < start; ++i)
- out->push_back(in[i]);
- for (int i = end+1; i <= moveto; ++i)
- out->push_back(in[i]);
- for (int i = start; i <= end; ++i)
- out->push_back(in[i]);
- for (int i = moveto+1; i < in.size(); ++i)
- out->push_back(in[i]);
- } else {
- for (int i = 0; i < start; ++i)
- out->push_back(in[i]);
- for (int i = end+1; (i < in.size()) && (i <= end + (moveto - start)); ++i)
- out->push_back(in[i]);
- for (int i = start; i <= end; ++i)
- out->push_back(in[i]);
- for (int i = (end + (moveto - start))+1; i < in.size(); ++i)
- out->push_back(in[i]);
- }
- if (out->size() != in.size()) {
- cerr << "ps: " << start << " " << end << " " << moveto << endl;
- cerr << "in=" << TD::GetString(in) << endl;
- cerr << "out=" << TD::GetString(*out) << endl;
- }
- assert(out->size() == in.size());
- // cerr << "ps: " << TD::GetString(*out) << endl;
- }
-
- void GetAllPossibleShifts(const vector<WordID>& hyp,
- const vector<int>& ralign,
- const vector<bool>& herr,
- const vector<bool>& rerr,
- const int min_size,
- vector<vector<Shift> >* shifts) const {
- for (int start = 0; start < hyp.size(); ++start) {
- vector<WordID> cp(1, hyp[start]);
- NgramToIntsMap::iterator niter = nmap_.find(cp);
- if (niter == nmap_.end()) continue;
- bool ok = false;
- int moveto;
- for (set<int>::iterator i = niter->second.begin(); i != niter->second.end(); ++i) {
- moveto = *i;
- int rm = ralign[moveto];
- ok = (start != rm &&
- (rm - start) < MAX_SHIFT_DIST &&
- (start - rm - 1) < MAX_SHIFT_DIST);
- if (ok) break;
- }
- if (!ok) continue;
- cp.clear();
- for (int end = start + min_size - 1;
- ok && end < hyp.size() && end < (start + MAX_SHIFT_SIZE); ++end) {
- cp.push_back(hyp[end]);
- vector<Shift>& sshifts = (*shifts)[end - start];
- ok = false;
- NgramToIntsMap::iterator niter = nmap_.find(cp);
- if (niter == nmap_.end()) break;
- bool any_herr = false;
- for (int i = start; i <= end && !any_herr; ++i)
- any_herr = herr[i];
- if (!any_herr) {
- ok = true;
- continue;
- }
- for (set<int>::iterator mi = niter->second.begin();
- mi != niter->second.end(); ++mi) {
- int moveto = *mi;
- int rm = ralign[moveto];
- if (! ((rm != start) &&
- ((rm < start) || (rm > end)) &&
- (rm - start <= MAX_SHIFT_DIST) &&
- ((start - rm - 1) <= MAX_SHIFT_DIST))) continue;
- ok = true;
- bool any_rerr = false;
- for (int i = 0; (i <= end - start) && (!any_rerr); ++i)
- any_rerr = rerr[moveto+i];
- if (!any_rerr) continue;
- for (int roff = 0; roff <= (end - start); ++roff) {
- int rmr = ralign[moveto+roff];
- if ((start != rmr) && ((roff == 0) || (rmr != ralign[moveto])))
- sshifts.push_back(Shift(start, end, moveto + roff));
- }
- }
- }
- }
- }
-
- bool CalculateBestShift(const vector<WordID>& cur,
- const vector<WordID>& hyp,
- float curerr,
- const vector<TransType>& path,
- vector<WordID>* new_hyp,
- float* newerr,
- vector<TransType>* new_path) const {
- vector<bool> herr, rerr;
- vector<int> ralign;
- int hpos = -1;
- for (int i = 0; i < path.size(); ++i) {
- switch (path[i]) {
- case MATCH:
- ++hpos;
- herr.push_back(false);
- rerr.push_back(false);
- ralign.push_back(hpos);
- break;
- case SUBSTITUTION:
- ++hpos;
- herr.push_back(true);
- rerr.push_back(true);
- ralign.push_back(hpos);
- break;
- case INSERTION:
- ++hpos;
- herr.push_back(true);
- break;
- case DELETION:
- rerr.push_back(true);
- ralign.push_back(hpos);
- break;
- }
- }
-#if 0
- cerr << "RALIGN: ";
- for (int i = 0; i < rerr.size(); ++i)
- cerr << ralign[i] << " ";
- cerr << endl;
- cerr << "RERR: ";
- for (int i = 0; i < rerr.size(); ++i)
- cerr << (bool)rerr[i] << " ";
- cerr << endl;
- cerr << "HERR: ";
- for (int i = 0; i < herr.size(); ++i)
- cerr << (bool)herr[i] << " ";
- cerr << endl;
-#endif
-
- vector<vector<Shift> > shifts(MAX_SHIFT_SIZE + 1);
- GetAllPossibleShifts(cur, ralign, herr, rerr, 1, &shifts);
- float cur_best_shift_cost = 0;
- *newerr = curerr;
- vector<TransType> cur_best_path;
- vector<WordID> cur_best_hyp;
-
- bool res = false;
- for (int i = shifts.size() - 1; i >=0; --i) {
- float curfix = curerr - (cur_best_shift_cost + *newerr);
- float maxfix = 2.0f * (1 + i) - COSTS::shift;
- if ((curfix > maxfix) || ((cur_best_shift_cost == 0) && (curfix == maxfix))) break;
- for (int j = 0; j < shifts[i].size(); ++j) {
- const Shift& s = shifts[i][j];
- curfix = curerr - (cur_best_shift_cost + *newerr);
- maxfix = 2.0f * (1 + i) - COSTS::shift; // TODO remove?
- if ((curfix > maxfix) || ((cur_best_shift_cost == 0) && (curfix == maxfix))) continue;
- vector<WordID> shifted(cur.size());
- PerformShift(cur, s.begin(), s.end(), ralign[s.moveto()], &shifted);
- vector<TransType> try_path;
- float try_cost = MinimumEditDistance(shifted, ref_, &try_path);
- float gain = (*newerr + cur_best_shift_cost) - (try_cost + COSTS::shift);
- if (gain > 0.0f || ((cur_best_shift_cost == 0.0f) && (gain == 0.0f))) {
- *newerr = try_cost;
- cur_best_shift_cost = COSTS::shift;
- new_path->swap(try_path);
- new_hyp->swap(shifted);
- res = true;
- // cerr << "Found better shift " << s.begin() << "..." << s.end() << " moveto " << s.moveto() << endl;
- }
- }
- }
-
- return res;
- }
-
- static void GetPathStats(const vector<TransType>& path, int* subs, int* ins, int* dels) {
- *subs = *ins = *dels = 0;
- for (int i = 0; i < path.size(); ++i) {
- switch (path[i]) {
- case SUBSTITUTION:
- ++(*subs);
- case MATCH:
- break;
- case INSERTION:
- ++(*ins); break;
- case DELETION:
- ++(*dels); break;
- }
- }
- }
-
- float CalculateAllShifts(const vector<WordID>& hyp,
- int* subs, int* ins, int* dels, int* shifts) const {
- BuildWordMatches(hyp, &nmap_);
- vector<TransType> path;
- float med_cost = MinimumEditDistance(hyp, ref_, &path);
- float edits = 0;
- vector<WordID> cur = hyp;
- *shifts = 0;
- if (ter_short_circuit_long_sentences < 0 ||
- ref_.size() < ter_short_circuit_long_sentences) {
- while (true) {
- vector<WordID> new_hyp;
- vector<TransType> new_path;
- float new_med_cost;
- if (!CalculateBestShift(cur, hyp, med_cost, path, &new_hyp, &new_med_cost, &new_path))
- break;
- edits += COSTS::shift;
- ++(*shifts);
- med_cost = new_med_cost;
- path.swap(new_path);
- cur.swap(new_hyp);
- }
- }
- GetPathStats(path, subs, ins, dels);
- return med_cost + edits;
- }
-};
-
-class TERScore : public ScoreBase<TERScore> {
- friend class TERScorer;
-
- public:
- static const unsigned kINSERTIONS = 0;
- static const unsigned kDELETIONS = 1;
- static const unsigned kSUBSTITUTIONS = 2;
- static const unsigned kSHIFTS = 3;
- static const unsigned kREF_WORDCOUNT = 4;
- static const unsigned kDUMMY_LAST_ENTRY = 5;
-
- TERScore() : stats(0,kDUMMY_LAST_ENTRY) {}
- float ComputePartialScore() const { return 0.0;}
- float ComputeScore() const {
- float edits = static_cast<float>(stats[kINSERTIONS] + stats[kDELETIONS] + stats[kSUBSTITUTIONS] + stats[kSHIFTS]);
- return edits / static_cast<float>(stats[kREF_WORDCOUNT]);
- }
- void ScoreDetails(string* details) const;
- void PlusPartialEquals(const Score& rhs, int oracle_e_cover, int oracle_f_cover, int src_len){}
- void PlusEquals(const Score& delta, const float scale) {
- if (scale==1)
- stats += static_cast<const TERScore&>(delta).stats;
- if (scale==-1)
- stats -= static_cast<const TERScore&>(delta).stats;
- throw std::runtime_error("TERScore::PlusEquals with scale != +-1");
- }
- void PlusEquals(const Score& delta) {
- stats += static_cast<const TERScore&>(delta).stats;
- }
-
- ScoreP GetZero() const {
- return ScoreP(new TERScore);
- }
- ScoreP GetOne() const {
- return ScoreP(new TERScore);
- }
- void Subtract(const Score& rhs, Score* res) const {
- static_cast<TERScore*>(res)->stats = stats - static_cast<const TERScore&>(rhs).stats;
- }
- void Encode(std::string* out) const {
- ostringstream os;
- os << stats[kINSERTIONS] << ' '
- << stats[kDELETIONS] << ' '
- << stats[kSUBSTITUTIONS] << ' '
- << stats[kSHIFTS] << ' '
- << stats[kREF_WORDCOUNT];
- *out = os.str();
- }
- bool IsAdditiveIdentity() const {
- for (int i = 0; i < kDUMMY_LAST_ENTRY; ++i)
- if (stats[i] != 0) return false;
- return true;
- }
- private:
- valarray<int> stats;
-};
-
-ScoreP TERScorer::ScoreFromString(const std::string& data) {
- istringstream is(data);
- TERScore* r = new TERScore;
- is >> r->stats[TERScore::kINSERTIONS]
- >> r->stats[TERScore::kDELETIONS]
- >> r->stats[TERScore::kSUBSTITUTIONS]
- >> r->stats[TERScore::kSHIFTS]
- >> r->stats[TERScore::kREF_WORDCOUNT];
- return ScoreP(r);
-}
-
-void TERScore::ScoreDetails(std::string* details) const {
- char buf[200];
- sprintf(buf, "TER = %.2f, %3d|%3d|%3d|%3d (len=%d)",
- ComputeScore() * 100.0f,
- stats[kINSERTIONS],
- stats[kDELETIONS],
- stats[kSUBSTITUTIONS],
- stats[kSHIFTS],
- stats[kREF_WORDCOUNT]);
- *details = buf;
-}
-
-TERScorer::~TERScorer() {
- for (vector<TERScorerImpl*>::iterator i = impl_.begin(); i != impl_.end(); ++i)
- delete *i;
-}
-
-TERScorer::TERScorer(const vector<vector<WordID> >& refs) : impl_(refs.size()) {
- for (int i = 0; i < refs.size(); ++i)
- impl_[i] = new TERScorerImpl(refs[i]);
-}
-
-ScoreP TERScorer::ScoreCCandidate(const vector<WordID>& hyp) const {
- return ScoreP();
-}
-
-ScoreP TERScorer::ScoreCandidate(const std::vector<WordID>& hyp) const {
- float best_score = numeric_limits<float>::max();
- TERScore* res = new TERScore;
- int avg_len = 0;
- for (int i = 0; i < impl_.size(); ++i)
- avg_len += impl_[i]->GetRefLength();
- avg_len /= impl_.size();
- for (int i = 0; i < impl_.size(); ++i) {
- int subs, ins, dels, shifts;
- float score = impl_[i]->Calculate(hyp, &subs, &ins, &dels, &shifts);
- // cerr << "Component TER cost: " << score << endl;
- if (score < best_score) {
- res->stats[TERScore::kINSERTIONS] = ins;
- res->stats[TERScore::kDELETIONS] = dels;
- res->stats[TERScore::kSUBSTITUTIONS] = subs;
- res->stats[TERScore::kSHIFTS] = shifts;
- if (ter_use_average_ref_len) {
- res->stats[TERScore::kREF_WORDCOUNT] = avg_len;
- } else {
- res->stats[TERScore::kREF_WORDCOUNT] = impl_[i]->GetRefLength();
- }
-
- best_score = score;
- }
- }
- return ScoreP(res);
-}
diff --git a/vest/ter.h b/vest/ter.h
deleted file mode 100644
index 43314791..00000000
--- a/vest/ter.h
+++ /dev/null
@@ -1,19 +0,0 @@
-#ifndef _TER_H_
-#define _TER_H_
-
-#include "scorer.h"
-
-class TERScorerImpl;
-
-class TERScorer : public SentenceScorer {
- public:
- TERScorer(const std::vector<std::vector<WordID> >& references);
- ~TERScorer();
- ScoreP ScoreCandidate(const std::vector<WordID>& hyp) const;
- ScoreP ScoreCCandidate(const std::vector<WordID>& hyp) const;
- static ScoreP ScoreFromString(const std::string& data);
- private:
- std::vector<TERScorerImpl*> impl_;
-};
-
-#endif