diff options
author | redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-08-11 02:37:10 +0000 |
---|---|---|
committer | redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-08-11 02:37:10 +0000 |
commit | 80686d4e567bae579ea39e009826a2de92cd4ace (patch) | |
tree | c3c35fcba57dde423a248f38aa121ad197c79734 /vest | |
parent | 3c85c407c333899f6b4bc26632d312b8e568b638 (diff) |
major refactor, break bad circular deps
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@509 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'vest')
-rw-r--r-- | vest/Makefile.am | 30 | ||||
-rw-r--r-- | vest/aer_scorer.cc | 135 | ||||
-rw-r--r-- | vest/aer_scorer.h | 23 | ||||
-rw-r--r-- | vest/comb_scorer.cc | 97 | ||||
-rw-r--r-- | vest/comb_scorer.h | 17 | ||||
-rw-r--r-- | vest/fast_score.cc | 72 | ||||
-rw-r--r-- | vest/lo_test.cc | 5 | ||||
-rw-r--r-- | vest/mr_vest_map.cc | 5 | ||||
-rw-r--r-- | vest/mr_vest_reduce.cc | 2 | ||||
-rw-r--r-- | vest/scorer.cc | 708 | ||||
-rw-r--r-- | vest/scorer.h | 111 | ||||
-rw-r--r-- | vest/ter.cc | 535 | ||||
-rw-r--r-- | vest/ter.h | 19 |
13 files changed, 16 insertions, 1743 deletions
diff --git a/vest/Makefile.am b/vest/Makefile.am index abdc8146..b869672b 100644 --- a/vest/Makefile.am +++ b/vest/Makefile.am @@ -1,15 +1,12 @@ bin_PROGRAMS = \ - mbr_kbest \ mr_vest_map \ mr_vest_reduce \ mr_vest_generate_mapper_input \ - fast_score \ sentserver \ sentclient if HAVE_GTEST noinst_PROGRAMS = \ - scorer_test \ lo_test endif @@ -17,25 +14,16 @@ sentserver_SOURCES = sentserver.c sentclient_SOURCES = sentclient.c -mbr_kbest_SOURCES = mbr_kbest.cc ter.cc comb_scorer.cc aer_scorer.cc scorer.cc viterbi_envelope.cc -mbr_kbest_LDADD = $(top_srcdir)/decoder/libcdec.a -lz +mr_vest_generate_mapper_input_SOURCES = mr_vest_generate_mapper_input.cc line_optimizer.cc +mr_vest_generate_mapper_input_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz -fast_score_SOURCES = fast_score.cc ter.cc comb_scorer.cc aer_scorer.cc scorer.cc viterbi_envelope.cc -fast_score_LDADD = $(top_srcdir)/decoder/libcdec.a -lz +mr_vest_map_SOURCES = viterbi_envelope.cc ces.cc error_surface.cc mr_vest_map.cc line_optimizer.cc +mr_vest_map_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz -mr_vest_generate_mapper_input_SOURCES = mr_vest_generate_mapper_input.cc line_optimizer.cc $(top_srcdir)/decoder/timing_stats.cc -mr_vest_generate_mapper_input_LDADD = $(top_srcdir)/decoder/libcdec.a -lz +mr_vest_reduce_SOURCES = error_surface.cc ces.cc mr_vest_reduce.cc line_optimizer.cc viterbi_envelope.cc +mr_vest_reduce_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz -mr_vest_map_SOURCES = viterbi_envelope.cc error_surface.cc aer_scorer.cc mr_vest_map.cc scorer.cc ter.cc comb_scorer.cc line_optimizer.cc -mr_vest_map_LDADD = $(top_srcdir)/decoder/libcdec.a -lz +lo_test_SOURCES = lo_test.cc ces.cc viterbi_envelope.cc error_surface.cc line_optimizer.cc +lo_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz -mr_vest_reduce_SOURCES = error_surface.cc aer_scorer.cc mr_vest_reduce.cc scorer.cc ter.cc comb_scorer.cc line_optimizer.cc viterbi_envelope.cc -mr_vest_reduce_LDADD = $(top_srcdir)/decoder/libcdec.a -lz - -scorer_test_SOURCES = aer_scorer.cc scorer_test.cc scorer.cc ter.cc comb_scorer.cc viterbi_envelope.cc -scorer_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(top_srcdir)/decoder/libcdec.a -lz - -lo_test_SOURCES = lo_test.cc scorer.cc ter.cc aer_scorer.cc comb_scorer.cc viterbi_envelope.cc error_surface.cc line_optimizer.cc -lo_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(top_srcdir)/decoder/libcdec.a -lz - -AM_CPPFLAGS = -W -Wall -Wno-sign-compare $(GTEST_CPPFLAGS) -I$(top_srcdir)/decoder +AM_CPPFLAGS = -W -Wall -Wno-sign-compare $(GTEST_CPPFLAGS) -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval diff --git a/vest/aer_scorer.cc b/vest/aer_scorer.cc deleted file mode 100644 index 25b58b5e..00000000 --- a/vest/aer_scorer.cc +++ /dev/null @@ -1,135 +0,0 @@ -#include "aer_scorer.h" - -#include <cmath> -#include <cassert> -#include <sstream> - -#include "tdict.h" -#include "aligner.h" - -using namespace std; - -class AERScore : public ScoreBase<AERScore> { - friend class AERScorer; - public: - AERScore() : num_matches(), num_predicted(), num_in_ref() {} - AERScore(int m, int p, int r) : - num_matches(m), num_predicted(p), num_in_ref(r) {} - virtual void PlusPartialEquals(const Score& rhs, int oracle_e_cover, int oracle_f_cover, int src_len){} - virtual void PlusEquals(const Score& delta, const float scale) { - const AERScore& other = static_cast<const AERScore&>(delta); - num_matches += scale*other.num_matches; - num_predicted += scale*other.num_predicted; - num_in_ref += scale*other.num_in_ref; - } - virtual void PlusEquals(const Score& delta) { - const AERScore& other = static_cast<const AERScore&>(delta); - num_matches += other.num_matches; - num_predicted += other.num_predicted; - num_in_ref += other.num_in_ref; - } - - - virtual ScoreP GetZero() const { - return ScoreP(new AERScore); - } - virtual ScoreP GetOne() const { - return ScoreP(new AERScore); - } - virtual void Subtract(const Score& rhs, Score* out) const { - AERScore* res = static_cast<AERScore*>(out); - const AERScore& other = static_cast<const AERScore&>(rhs); - res->num_matches = num_matches - other.num_matches; - res->num_predicted = num_predicted - other.num_predicted; - res->num_in_ref = num_in_ref - other.num_in_ref; - } - float Precision() const { - return static_cast<float>(num_matches) / num_predicted; - } - float Recall() const { - return static_cast<float>(num_matches) / num_in_ref; - } - float ComputePartialScore() const { return 0.0;} - virtual float ComputeScore() const { - const float prec = Precision(); - const float rec = Recall(); - const float f = (2.0 * prec * rec) / (rec + prec); - if (isnan(f)) return 1.0f; - return 1.0f - f; - } - virtual bool IsAdditiveIdentity() const { - return (num_matches == 0) && (num_predicted == 0) && (num_in_ref == 0); - } - virtual void ScoreDetails(std::string* out) const { - ostringstream os; - os << "AER=" << (ComputeScore() * 100.0) - << " F=" << (100 - ComputeScore() * 100.0) - << " P=" << (Precision() * 100.0) << " R=" << (Recall() * 100.0) - << " [" << num_matches << " " << num_predicted << " " << num_in_ref << "]"; - *out = os.str(); - } - virtual void Encode(std::string*out) const { - out->resize(sizeof(int) * 3); - *(int *)&(*out)[sizeof(int) * 0] = num_matches; - *(int *)&(*out)[sizeof(int) * 1] = num_predicted; - *(int *)&(*out)[sizeof(int) * 2] = num_in_ref; - } - private: - int num_matches; - int num_predicted; - int num_in_ref; -}; - -AERScorer::AERScorer(const vector<vector<WordID> >& refs, const string& src) : src_(src) { - if (refs.size() != 1) { - cerr << "AERScorer can only take a single reference!\n"; - abort(); - } - ref_ = AlignerTools::ReadPharaohAlignmentGrid(TD::GetString(refs.front())); -} - -static inline bool Safe(const Array2D<bool>& a, int i, int j) { - if (i >= 0 && j >= 0 && i < a.width() && j < a.height()) - return a(i,j); - else - return false; -} - -ScoreP AERScorer::ScoreCCandidate(const vector<WordID>& shyp) const { - return ScoreP(); -} - -ScoreP AERScorer::ScoreCandidate(const vector<WordID>& shyp) const { - boost::shared_ptr<Array2D<bool> > hyp = - AlignerTools::ReadPharaohAlignmentGrid(TD::GetString(shyp)); - - int m = 0; - int r = 0; - int p = 0; - int i_len = ref_->width(); - int j_len = ref_->height(); - for (int i = 0; i < i_len; ++i) { - for (int j = 0; j < j_len; ++j) { - if ((*ref_)(i,j)) { - ++r; - if (Safe(*hyp, i, j)) ++m; - } - } - } - for (int i = 0; i < hyp->width(); ++i) - for (int j = 0; j < hyp->height(); ++j) - if ((*hyp)(i,j)) ++p; - - return ScoreP(new AERScore(m,p,r)); -} - -ScoreP AERScorer::ScoreFromString(const string& in) { - AERScore* res = new AERScore; - res->num_matches = *(const int *)&in[sizeof(int) * 0]; - res->num_predicted = *(const int *)&in[sizeof(int) * 1]; - res->num_in_ref = *(const int *)&in[sizeof(int) * 2]; - return ScoreP(res); -} - -const std::string* AERScorer::GetSource() const { return &src_; } - diff --git a/vest/aer_scorer.h b/vest/aer_scorer.h deleted file mode 100644 index 6d53d359..00000000 --- a/vest/aer_scorer.h +++ /dev/null @@ -1,23 +0,0 @@ -#ifndef _AER_SCORER_ -#define _AER_SCORER_ - -#include <boost/shared_ptr.hpp> - -#include "scorer.h" -#include "array2d.h" - -class AERScorer : public SentenceScorer { - public: - // when constructing alignment strings from a hypergraph, the source - // is necessary. - AERScorer(const std::vector<std::vector<WordID> >& refs, const std::string& src = ""); - ScoreP ScoreCandidate(const std::vector<WordID>& hyp) const; - ScoreP ScoreCCandidate(const std::vector<WordID>& hyp) const; - static ScoreP ScoreFromString(const std::string& in); - const std::string* GetSource() const; - private: - std::string src_; - boost::shared_ptr<Array2D<bool> > ref_; -}; - -#endif diff --git a/vest/comb_scorer.cc b/vest/comb_scorer.cc deleted file mode 100644 index 9fc37868..00000000 --- a/vest/comb_scorer.cc +++ /dev/null @@ -1,97 +0,0 @@ -#include "comb_scorer.h" - -#include <cstdio> - -using namespace std; - -class BLEUTERCombinationScore : public ScoreBase<BLEUTERCombinationScore> { - friend class BLEUTERCombinationScorer; - public: - ~BLEUTERCombinationScore(); - float ComputePartialScore() const { return 0.0;} - float ComputeScore() const { - return (bleu->ComputeScore() - ter->ComputeScore()) / 2.0f; - } - void ScoreDetails(string* details) const { - char buf[160]; - sprintf(buf, "Combi = %.2f, BLEU = %.2f, TER = %.2f", - ComputeScore()*100.0f, bleu->ComputeScore()*100.0f, ter->ComputeScore()*100.0f); - *details = buf; - } - void PlusPartialEquals(const Score& rhs, int oracle_e_cover, int oracle_f_cover, int src_len){} - - void PlusEquals(const Score& delta, const float scale) { - bleu->PlusEquals(*static_cast<const BLEUTERCombinationScore&>(delta).bleu, scale); - ter->PlusEquals(*static_cast<const BLEUTERCombinationScore&>(delta).ter, scale); - } - void PlusEquals(const Score& delta) { - bleu->PlusEquals(*static_cast<const BLEUTERCombinationScore&>(delta).bleu); - ter->PlusEquals(*static_cast<const BLEUTERCombinationScore&>(delta).ter); - } - - - - ScoreP GetOne() const { - BLEUTERCombinationScore* res = new BLEUTERCombinationScore; - res->bleu = bleu->GetOne(); - res->ter = ter->GetOne(); - return ScoreP(res); - } - ScoreP GetZero() const { - BLEUTERCombinationScore* res = new BLEUTERCombinationScore; - res->bleu = bleu->GetZero(); - res->ter = ter->GetZero(); - return ScoreP(res); - } - void Subtract(const Score& rhs, Score* res) const { - bleu->Subtract(*static_cast<const BLEUTERCombinationScore&>(rhs).bleu, - static_cast<BLEUTERCombinationScore*>(res)->bleu.get()); - ter->Subtract(*static_cast<const BLEUTERCombinationScore&>(rhs).ter, - static_cast<BLEUTERCombinationScore*>(res)->ter.get()); - } - void Encode(std::string* out) const { - string bs, ts; - bleu->Encode(&bs); - ter->Encode(&ts); - out->clear(); - (*out) += static_cast<char>(bs.size()); - (*out) += bs; - (*out) += ts; - } - bool IsAdditiveIdentity() const { - return bleu->IsAdditiveIdentity() && ter->IsAdditiveIdentity(); - } - private: - ScoreP bleu; - ScoreP ter; -}; - -BLEUTERCombinationScore::~BLEUTERCombinationScore() { -} - -BLEUTERCombinationScorer::BLEUTERCombinationScorer(const vector<vector<WordID> >& refs) { - bleu_ = SentenceScorer::CreateSentenceScorer(IBM_BLEU, refs); - ter_ = SentenceScorer::CreateSentenceScorer(TER, refs); -} - -BLEUTERCombinationScorer::~BLEUTERCombinationScorer() { -} - -ScoreP BLEUTERCombinationScorer::ScoreCCandidate(const vector<WordID>& hyp) const { - return ScoreP(); -} - -ScoreP BLEUTERCombinationScorer::ScoreCandidate(const std::vector<WordID>& hyp) const { - BLEUTERCombinationScore* res = new BLEUTERCombinationScore; - res->bleu = bleu_->ScoreCandidate(hyp); - res->ter = ter_->ScoreCandidate(hyp); - return ScoreP(res); -} - -ScoreP BLEUTERCombinationScorer::ScoreFromString(const std::string& in) { - int bss = in[0]; - BLEUTERCombinationScore* r = new BLEUTERCombinationScore; - r->bleu = SentenceScorer::CreateScoreFromString(IBM_BLEU, in.substr(1, bss)); - r->ter = SentenceScorer::CreateScoreFromString(TER, in.substr(1 + bss)); - return ScoreP(r); -} diff --git a/vest/comb_scorer.h b/vest/comb_scorer.h deleted file mode 100644 index 346be576..00000000 --- a/vest/comb_scorer.h +++ /dev/null @@ -1,17 +0,0 @@ -#ifndef _COMB_SCORER_ -#define _COMB_SCORER_ - -#include "scorer.h" - -class BLEUTERCombinationScorer : public SentenceScorer { - public: - BLEUTERCombinationScorer(const std::vector<std::vector<WordID> >& refs); - ~BLEUTERCombinationScorer(); - ScoreP ScoreCandidate(const std::vector<WordID>& hyp) const; - ScoreP ScoreCCandidate(const std::vector<WordID>& hyp) const; - static ScoreP ScoreFromString(const std::string& in); - private: - ScorerP bleu_,ter_; -}; - -#endif diff --git a/vest/fast_score.cc b/vest/fast_score.cc deleted file mode 100644 index 5ee264a6..00000000 --- a/vest/fast_score.cc +++ /dev/null @@ -1,72 +0,0 @@ -#include <iostream> -#include <vector> - -#include <boost/program_options.hpp> -#include <boost/program_options/variables_map.hpp> - -#include "filelib.h" -#include "tdict.h" -#include "scorer.h" - -using namespace std; -namespace po = boost::program_options; - -void InitCommandLine(int argc, char** argv, po::variables_map* conf) { - po::options_description opts("Configuration options"); - opts.add_options() - ("reference,r",po::value<vector<string> >(), "[REQD] Reference translation(s) (tokenized text file)") - ("loss_function,l",po::value<string>()->default_value("ibm_bleu"), "Scoring metric (ibm_bleu, nist_bleu, koehn_bleu, ter, combi)") - ("in_file,i", po::value<string>()->default_value("-"), "Input file") - ("help,h", "Help"); - po::options_description dcmdline_options; - dcmdline_options.add(opts); - po::store(parse_command_line(argc, argv, dcmdline_options), *conf); - bool flag = false; - if (!conf->count("reference")) { - cerr << "Please specify one or more references using -r <REF1.TXT> -r <REF2.TXT> ...\n"; - flag = true; - } - if (flag || conf->count("help")) { - cerr << dcmdline_options << endl; - exit(1); - } -} - -int main(int argc, char** argv) { - po::variables_map conf; - InitCommandLine(argc, argv, &conf); - const string loss_function = conf["loss_function"].as<string>(); - ScoreType type = ScoreTypeFromString(loss_function); - DocScorer ds(type, conf["reference"].as<vector<string> >(), ""); - cerr << "Loaded " << ds.size() << " references for scoring with " << loss_function << endl; - - ReadFile rf(conf["in_file"].as<string>()); - ScoreP acc; - istream& in = *rf.stream(); - int lc = 0; - while(in) { - string line; - getline(in, line); - if (line.empty() && !in) break; - vector<WordID> sent; - TD::ConvertSentence(line, &sent); - ScoreP sentscore = ds[lc]->ScoreCandidate(sent); - if (!acc) { acc = sentscore->GetZero(); } - acc->PlusEquals(*sentscore); - ++lc; - } - assert(lc > 0); - if (lc > ds.size()) { - cerr << "Too many (" << lc << ") translations in input, expected " << ds.size() << endl; - return 1; - } - if (lc != ds.size()) - cerr << "Fewer sentences in hyp (" << lc << ") than refs (" - << ds.size() << "): scoring partial set!\n"; - float score = acc->ComputeScore(); - string details; - acc->ScoreDetails(&details); - cerr << details << endl; - cout << score << endl; - return 0; -} diff --git a/vest/lo_test.cc b/vest/lo_test.cc index 577113bb..9200eb34 100644 --- a/vest/lo_test.cc +++ b/vest/lo_test.cc @@ -5,6 +5,7 @@ #include <boost/shared_ptr.hpp> #include <gtest/gtest.h> +#include "ces.h" #include "fdict.h" #include "hg.h" #include "kbest.h" @@ -166,8 +167,8 @@ TEST_F(OptTest, TestS1) { envs[1] = Inside<ViterbiEnvelope, ViterbiEnvelopeWeightFunction>(hg2, NULL, wf); vector<ErrorSurface> es(2); - scorer1->ComputeErrorSurface(envs[0], &es[0], IBM_BLEU, hg); - scorer2->ComputeErrorSurface(envs[1], &es[1], IBM_BLEU, hg2); + ComputeErrorSurface(*scorer1, envs[0], &es[0], IBM_BLEU, hg); + ComputeErrorSurface(*scorer2, envs[1], &es[1], IBM_BLEU, hg2); cerr << envs[0].size() << " " << envs[1].size() << endl; cerr << es[0].size() << " " << es[1].size() << endl; envs.clear(); diff --git a/vest/mr_vest_map.cc b/vest/mr_vest_map.cc index b3acc5dd..1506a99f 100644 --- a/vest/mr_vest_map.cc +++ b/vest/mr_vest_map.cc @@ -6,6 +6,7 @@ #include <boost/program_options.hpp> #include <boost/program_options/variables_map.hpp> +#include "ces.h" #include "filelib.h" #include "stringlib.h" #include "sparse_vector.h" @@ -13,7 +14,7 @@ #include "viterbi_envelope.h" #include "inside_outside.h" #include "error_surface.h" -#include "hg.h" +#include "b64tools.h" #include "hg_io.h" using namespace std; @@ -90,7 +91,7 @@ int main(int argc, char** argv) { ViterbiEnvelopeWeightFunction wf(origin, axis); ViterbiEnvelope ve = Inside<ViterbiEnvelope, ViterbiEnvelopeWeightFunction>(hg, NULL, wf); ErrorSurface es; - ds[sent_id]->ComputeErrorSurface(ve, &es, type, hg); + ComputeErrorSurface(*ds[sent_id], ve, &es, type, hg); //cerr << "Viterbi envelope has " << ve.size() << " segments\n"; // cerr << "Error surface has " << es.size() << " segments\n"; string val; diff --git a/vest/mr_vest_reduce.cc b/vest/mr_vest_reduce.cc index 5efcc19a..3df52020 100644 --- a/vest/mr_vest_reduce.cc +++ b/vest/mr_vest_reduce.cc @@ -9,7 +9,7 @@ #include "sparse_vector.h" #include "error_surface.h" #include "line_optimizer.h" -#include "hg_io.h" +#include "b64tools.h" using namespace std; namespace po = boost::program_options; diff --git a/vest/scorer.cc b/vest/scorer.cc deleted file mode 100644 index 70fdef34..00000000 --- a/vest/scorer.cc +++ /dev/null @@ -1,708 +0,0 @@ -#include "scorer.h" - -#include <boost/lexical_cast.hpp> -#include <map> -#include <sstream> -#include <iostream> -#include <fstream> -#include <cstdio> -#include <valarray> -#include <algorithm> - -#include <boost/shared_ptr.hpp> - -#include "filelib.h" -#include "aligner.h" -#include "viterbi_envelope.h" -#include "error_surface.h" -#include "ter.h" -#include "aer_scorer.h" -#include "comb_scorer.h" -#include "tdict.h" -#include "stringlib.h" -#include "lattice.h" - - -using boost::shared_ptr; -using namespace std; - -const bool minimize_segments = true; // if adjacent segments have equal scores, merge them - -void Score::TimesEquals(float scale) { - cerr<<"UNIMPLEMENTED except for BLEU (for MIRA): Score::TimesEquals"<<endl;abort(); -} - -ScoreType ScoreTypeFromString(const string& st) { - const string sl = LowercaseString(st); - if (sl == "ser") - return SER; - if (sl == "ter") - return TER; - if (sl == "aer") - return AER; - if (sl == "bleu" || sl == "ibm_bleu") - return IBM_BLEU; - if (sl == "ibm_bleu_3") - return IBM_BLEU_3; - if (sl == "nist_bleu") - return NIST_BLEU; - if (sl == "koehn_bleu") - return Koehn_BLEU; - if (sl == "combi") - return BLEU_minus_TER_over_2; - cerr << "Don't understand score type '" << st << "', defaulting to ibm_bleu.\n"; - return IBM_BLEU; -} - -static char const* score_names[]={ - "IBM_BLEU", "NIST_BLEU", "Koehn_BLEU", "TER", "BLEU_minus_TER_over_2", "SER", "AER", "IBM_BLEU_3" -}; - -std::string StringFromScoreType(ScoreType st) { - assert(st>=0 && st<sizeof(score_names)/sizeof(score_names[0])); - return score_names[(int)st]; -} - - -Score::~Score() {} -SentenceScorer::~SentenceScorer() {} - -struct length_accum { - template <class S> - float operator()(float sum,S const& ref) const { - return sum+ref.size(); - } -}; - -template <class S> -float avg_reflength(vector<S> refs) { - unsigned n=refs.size(); - return n?accumulate(refs.begin(),refs.end(),0.,length_accum())/n:0.; -} - - -float SentenceScorer::ComputeRefLength(const Sentence &hyp) const { - return hyp.size(); // reasonable default? :) -} - -const std::string* SentenceScorer::GetSource() const { return NULL; } - -class SERScore : public ScoreBase<SERScore> { - friend class SERScorer; - public: - SERScore() : correct(0), total(0) {} - float ComputePartialScore() const { return 0.0;} - float ComputeScore() const { - return static_cast<float>(correct) / static_cast<float>(total); - } - void ScoreDetails(string* details) const { - ostringstream os; - os << "SER= " << ComputeScore() << " (" << correct << '/' << total << ')'; - *details = os.str(); - } - void PlusPartialEquals(const Score& /* delta */, int /* oracle_e_cover */, int /* oracle_f_cover */, int /* src_len */){} - - void PlusEquals(const Score& delta, const float scale) { - correct += scale*static_cast<const SERScore&>(delta).correct; - total += scale*static_cast<const SERScore&>(delta).total; - } - void PlusEquals(const Score& delta) { - correct += static_cast<const SERScore&>(delta).correct; - total += static_cast<const SERScore&>(delta).total; - } - ScoreP GetZero() const { return ScoreP(new SERScore); } - ScoreP GetOne() const { return ScoreP(new SERScore); } - void Subtract(const Score& rhs, Score* res) const { - SERScore* r = static_cast<SERScore*>(res); - r->correct = correct - static_cast<const SERScore&>(rhs).correct; - r->total = total - static_cast<const SERScore&>(rhs).total; - } - void Encode(string* out) const { - assert(!"not implemented"); - } - bool IsAdditiveIdentity() const { - return (total == 0 && correct == 0); // correct is always 0 <= n <= total - } - private: - int correct, total; -}; - -std::string SentenceScorer::verbose_desc() const { - return desc+",ref0={ "+TD::GetString(refs[0])+" }"; -} - -class SERScorer : public SentenceScorer { - public: - SERScorer(const vector<vector<WordID> >& references) : SentenceScorer("SERScorer",references),refs_(references) {} - ScoreP ScoreCCandidate(const vector<WordID>& /* hyp */) const { - return ScoreP(); - } - ScoreP ScoreCandidate(const vector<WordID>& hyp) const { - SERScore* res = new SERScore; - res->total = 1; - for (int i = 0; i < refs_.size(); ++i) - if (refs_[i] == hyp) res->correct = 1; - return ScoreP(res); - } - static ScoreP ScoreFromString(const string& data) { - assert(!"Not implemented"); - } - private: - vector<vector<WordID> > refs_; -}; - -class BLEUScore : public ScoreBase<BLEUScore> { - friend class BLEUScorerBase; - public: - BLEUScore(int n) : correct_ngram_hit_counts(float(0),n), hyp_ngram_counts(float(0),n) { - ref_len = 0; - hyp_len = 0; } - BLEUScore(int n, int k) : correct_ngram_hit_counts(float(k),n), hyp_ngram_counts(float(k),n) { - ref_len = k; - hyp_len = k; } - float ComputeScore() const; - float ComputePartialScore() const; - void ScoreDetails(string* details) const; - void TimesEquals(float scale); - void PlusEquals(const Score& delta); - void PlusEquals(const Score& delta, const float scale); - void PlusPartialEquals(const Score& delta, int oracle_e_cover, int oracle_f_cover, int src_len); - ScoreP GetZero() const; - ScoreP GetOne() const; - void Subtract(const Score& rhs, Score* res) const; - void Encode(string* out) const; - bool IsAdditiveIdentity() const { - if (fabs(ref_len) > 0.1f || hyp_len != 0) return false; - for (int i = 0; i < correct_ngram_hit_counts.size(); ++i) - if (hyp_ngram_counts[i] != 0 || - correct_ngram_hit_counts[i] != 0) return false; - return true; - } - private: - int N() const { - return hyp_ngram_counts.size(); - } - float ComputeScore(vector<float>* precs, float* bp) const; - float ComputePartialScore(vector<float>* prec, float* bp) const; - valarray<float> correct_ngram_hit_counts; - valarray<float> hyp_ngram_counts; - float ref_len; - float hyp_len; -}; - -class BLEUScorerBase : public SentenceScorer { - public: - BLEUScorerBase(const vector<vector<WordID> >& references, - int n - ); - ScoreP ScoreCandidate(const vector<WordID>& hyp) const; - ScoreP ScoreCCandidate(const vector<WordID>& hyp) const; - static ScoreP ScoreFromString(const string& in); - - virtual float ComputeRefLength(const vector<WordID>& hyp) const = 0; - private: - struct NGramCompare { - int operator() (const vector<WordID>& a, const vector<WordID>& b) { - size_t as = a.size(); - size_t bs = b.size(); - const size_t s = (as < bs ? as : bs); - for (size_t i = 0; i < s; ++i) { - int d = a[i] - b[i]; - if (d < 0) return true; - if (d > 0) return false; - } - return as < bs; - } - }; - typedef map<vector<WordID>, pair<int,int>, NGramCompare> NGramCountMap; - void CountRef(const vector<WordID>& ref) { - NGramCountMap tc; - vector<WordID> ngram(n_); - int s = ref.size(); - for (int j=0; j<s; ++j) { - int remaining = s-j; - int k = (n_ < remaining ? n_ : remaining); - ngram.clear(); - for (int i=1; i<=k; ++i) { - ngram.push_back(ref[j + i - 1]); - tc[ngram].first++; - } - } - for (NGramCountMap::iterator i = tc.begin(); i != tc.end(); ++i) { - pair<int,int>& p = ngrams_[i->first]; - if (p.first < i->second.first) - p = i->second; - } - } - - void ComputeNgramStats(const vector<WordID>& sent, - valarray<float>* correct, - valarray<float>* hyp, - bool clip_counts) - const { - assert(correct->size() == n_); - assert(hyp->size() == n_); - vector<WordID> ngram(n_); - (*correct) *= 0; - (*hyp) *= 0; - int s = sent.size(); - for (int j=0; j<s; ++j) { - int remaining = s-j; - int k = (n_ < remaining ? n_ : remaining); - ngram.clear(); - for (int i=1; i<=k; ++i) { - ngram.push_back(sent[j + i - 1]); - pair<int,int>& p = ngrams_[ngram]; - if(clip_counts){ - if (p.second < p.first) { - ++p.second; - (*correct)[i-1]++; - }} - else { - ++p.second; - (*correct)[i-1]++; - } - // if the 1 gram isn't found, don't try to match don't need to match any 2- 3- .. grams: - if (!p.first) { - for (; i<=k; ++i) - (*hyp)[i-1]++; - } else { - (*hyp)[i-1]++; - } - } - } - } - - mutable NGramCountMap ngrams_; - int n_; - vector<int> lengths_; -}; - -ScoreP BLEUScorerBase::ScoreFromString(const string& in) { - istringstream is(in); - int n; - is >> n; - BLEUScore* r = new BLEUScore(n); - is >> r->ref_len >> r->hyp_len; - - for (int i = 0; i < n; ++i) { - is >> r->correct_ngram_hit_counts[i]; - is >> r->hyp_ngram_counts[i]; - } - return ScoreP(r); -} - -class IBM_BLEUScorer : public BLEUScorerBase { - public: - IBM_BLEUScorer(const vector<vector<WordID> >& references, - int n=4) : BLEUScorerBase(references, n), lengths_(references.size()) { - for (int i=0; i < references.size(); ++i) - lengths_[i] = references[i].size(); - } - float ComputeRefLength(const vector<WordID>& hyp) const { - if (lengths_.size() == 1) return lengths_[0]; - int bestd = 2000000; - int hl = hyp.size(); - int bl = -1; - for (vector<int>::const_iterator ci = lengths_.begin(); ci != lengths_.end(); ++ci) { - int cl = *ci; - if (abs(cl - hl) < bestd) { - bestd = abs(cl - hl); - bl = cl; - } - } - return bl; - } - private: - vector<int> lengths_; -}; - -class NIST_BLEUScorer : public BLEUScorerBase { - public: - NIST_BLEUScorer(const vector<vector<WordID> >& references, - int n=4) : BLEUScorerBase(references, n), - shortest_(references[0].size()) { - for (int i=1; i < references.size(); ++i) - if (references[i].size() < shortest_) - shortest_ = references[i].size(); - } - float ComputeRefLength(const vector<WordID>& /* hyp */) const { - return shortest_; - } - private: - float shortest_; -}; - -class Koehn_BLEUScorer : public BLEUScorerBase { - public: - Koehn_BLEUScorer(const vector<vector<WordID> >& references, - int n=4) : BLEUScorerBase(references, n), - avg_(0) { - for (int i=0; i < references.size(); ++i) - avg_ += references[i].size(); - avg_ /= references.size(); - } - float ComputeRefLength(const vector<WordID>& /* hyp */) const { - return avg_; - } - private: - float avg_; -}; - -ScorerP SentenceScorer::CreateSentenceScorer(const ScoreType type, - const vector<vector<WordID> >& refs, - const string& src) -{ - SentenceScorer *r=0; - switch (type) { - case IBM_BLEU: r = new IBM_BLEUScorer(refs, 4);break; - case IBM_BLEU_3 : r = new IBM_BLEUScorer(refs,3);break; - case NIST_BLEU: r = new NIST_BLEUScorer(refs, 4);break; - case Koehn_BLEU: r = new Koehn_BLEUScorer(refs, 4);break; - case AER: r = new AERScorer(refs, src);break; - case TER: r = new TERScorer(refs);break; - case SER: r = new SERScorer(refs);break; - case BLEU_minus_TER_over_2: r = new BLEUTERCombinationScorer(refs);break; - default: - assert(!"Not implemented!"); - } - return ScorerP(r); -} - -ScoreP SentenceScorer::GetOne() const { - Sentence s; - return ScoreCCandidate(s)->GetOne(); -} - -ScoreP SentenceScorer::GetZero() const { - Sentence s; - return ScoreCCandidate(s)->GetZero(); -} - -ScoreP Score::GetOne(ScoreType type) { - std::vector<SentenceScorer::Sentence > refs; - return SentenceScorer::CreateSentenceScorer(type,refs)->GetOne(); -} - -ScoreP Score::GetZero(ScoreType type) { - std::vector<SentenceScorer::Sentence > refs; - return SentenceScorer::CreateSentenceScorer(type,refs)->GetZero(); -} - - -ScoreP SentenceScorer::CreateScoreFromString(const ScoreType type, const string& in) { - switch (type) { - case IBM_BLEU: - case IBM_BLEU_3: - case NIST_BLEU: - case Koehn_BLEU: - return BLEUScorerBase::ScoreFromString(in); - case TER: - return TERScorer::ScoreFromString(in); - case AER: - return AERScorer::ScoreFromString(in); - case SER: - return SERScorer::ScoreFromString(in); - case BLEU_minus_TER_over_2: - return BLEUTERCombinationScorer::ScoreFromString(in); - default: - assert(!"Not implemented!"); - } -} - -void SentenceScorer::ComputeErrorSurface(const ViterbiEnvelope& ve, ErrorSurface* env, const ScoreType type, const Hypergraph& hg) const { - vector<WordID> prev_trans; - const vector<shared_ptr<Segment> >& ienv = ve.GetSortedSegs(); - env->resize(ienv.size()); - ScoreP prev_score; - int j = 0; - for (int i = 0; i < ienv.size(); ++i) { - const Segment& seg = *ienv[i]; - vector<WordID> trans; - if (type == AER) { - vector<bool> edges(hg.edges_.size(), false); - seg.CollectEdgesUsed(&edges); // get the set of edges in the viterbi - // alignment - ostringstream os; - const string* psrc = this->GetSource(); - if (psrc == NULL) { - cerr << "AER scoring in VEST requires source, but it is missing!\n"; - abort(); - } - size_t pos = psrc->rfind(" ||| "); - if (pos == string::npos) { - cerr << "Malformed source for AER: expected |||\nINPUT: " << *psrc << endl; - abort(); - } - Lattice src; - Lattice ref; - LatticeTools::ConvertTextOrPLF(psrc->substr(0, pos), &src); - LatticeTools::ConvertTextOrPLF(psrc->substr(pos + 5), &ref); - AlignerTools::WriteAlignment(src, ref, hg, &os, true, &edges); - string tstr = os.str(); - TD::ConvertSentence(tstr.substr(tstr.rfind(" ||| ") + 5), &trans); - } else { - seg.ConstructTranslation(&trans); - } - // cerr << "Scoring: " << TD::GetString(trans) << endl; - if (trans == prev_trans) { - if (!minimize_segments) { - assert(prev_score); // if this fails, it means - // the decoder can generate null translations - ErrorSegment& out = (*env)[j]; - out.delta = prev_score->GetZero(); - out.x = seg.x; - ++j; - } - // cerr << "Identical translation, skipping scoring\n"; - } else { - ScoreP score = ScoreCandidate(trans); - // cerr << "score= " << score->ComputeScore() << "\n"; - ScoreP cur_delta_p = score->GetZero(); - Score* cur_delta = cur_delta_p.get(); - // just record the score diffs - if (!prev_score) - prev_score = score->GetZero(); - - score->Subtract(*prev_score, cur_delta); - prev_trans.swap(trans); - prev_score = score; - if ((!minimize_segments) || (!cur_delta->IsAdditiveIdentity())) { - ErrorSegment& out = (*env)[j]; - out.delta = cur_delta_p; - out.x = seg.x; - ++j; - } - } - } - // cerr << " In segments: " << ienv.size() << endl; - // cerr << "Out segments: " << j << endl; - assert(j > 0); - env->resize(j); -} - -void BLEUScore::ScoreDetails(string* details) const { - char buf[2000]; - vector<float> precs(max(N(),4)); - float bp; - float bleu = ComputeScore(&precs, &bp); - for (int i=N();i<4;++i) - precs[i]=0.; - char *bufn; - bufn=buf+sprintf(buf, "BLEU = %.2f, %.1f|%.1f|%.1f|%.1f (brev=%.3f)", - bleu*100.0, - precs[0]*100.0, - precs[1]*100.0, - precs[2]*100.0, - precs[3]*100.0, - bp); - *details = buf; -} - -float BLEUScore::ComputeScore(vector<float>* precs, float* bp) const { - float log_bleu = 0; - if (precs) precs->clear(); - int count = 0; - for (int i = 0; i < N(); ++i) { - if (hyp_ngram_counts[i] > 0) { - float lprec = log(correct_ngram_hit_counts[i]) - log(hyp_ngram_counts[i]); - if (precs) precs->push_back(exp(lprec)); - log_bleu += lprec; - ++count; - } - } - log_bleu /= static_cast<float>(count); - float lbp = 0.0; - if (hyp_len < ref_len) - lbp = (hyp_len - ref_len) / hyp_len; - log_bleu += lbp; - if (bp) *bp = exp(lbp); - return exp(log_bleu); -} - - -//comptue scaled score for oracle retrieval -float BLEUScore::ComputePartialScore(vector<float>* precs, float* bp) const { - // cerr << "Then here " << endl; - float log_bleu = 0; - if (precs) precs->clear(); - int count = 0; - for (int i = 0; i < N(); ++i) { - // cerr << "In CPS " << hyp_ngram_counts[i] << " " << correct_ngram_hit_counts[i] << endl; - if (hyp_ngram_counts[i] > 0) { - float lprec = log(correct_ngram_hit_counts[i]) - log(hyp_ngram_counts[i]); - if (precs) precs->push_back(exp(lprec)); - log_bleu += lprec; - ++count; - } - } - log_bleu /= static_cast<float>(count); - float lbp = 0.0; - if (hyp_len < ref_len) - lbp = (hyp_len - ref_len) / hyp_len; - log_bleu += lbp; - if (bp) *bp = exp(lbp); - return exp(log_bleu); -} - -float BLEUScore::ComputePartialScore() const { - // cerr << "In here first " << endl; - return ComputePartialScore(NULL, NULL); -} - -float BLEUScore::ComputeScore() const { - return ComputeScore(NULL, NULL); -} - -void BLEUScore::Subtract(const Score& rhs, Score* res) const { - const BLEUScore& d = static_cast<const BLEUScore&>(rhs); - BLEUScore* o = static_cast<BLEUScore*>(res); - o->ref_len = ref_len - d.ref_len; - o->hyp_len = hyp_len - d.hyp_len; - o->correct_ngram_hit_counts = correct_ngram_hit_counts - d.correct_ngram_hit_counts; - o->hyp_ngram_counts = hyp_ngram_counts - d.hyp_ngram_counts; -} - -void BLEUScore::PlusEquals(const Score& delta) { - const BLEUScore& d = static_cast<const BLEUScore&>(delta); - correct_ngram_hit_counts += d.correct_ngram_hit_counts; - hyp_ngram_counts += d.hyp_ngram_counts; - ref_len += d.ref_len; - hyp_len += d.hyp_len; -} - -void BLEUScore::TimesEquals(float scale) { - correct_ngram_hit_counts *= scale; - hyp_ngram_counts *= scale; - ref_len *= scale; - hyp_len *= scale; -} - -void BLEUScore::PlusEquals(const Score& delta, const float scale) { - const BLEUScore& d = static_cast<const BLEUScore&>(delta); - correct_ngram_hit_counts = correct_ngram_hit_counts + (d.correct_ngram_hit_counts * scale); - hyp_ngram_counts = hyp_ngram_counts + (d.hyp_ngram_counts * scale); - ref_len = ref_len + (d.ref_len * scale); - hyp_len = hyp_len + (d.hyp_len * scale); -} - -void BLEUScore::PlusPartialEquals(const Score& delta, int oracle_e_cover, int oracle_f_cover, int src_len){ - const BLEUScore& d = static_cast<const BLEUScore&>(delta); - correct_ngram_hit_counts += d.correct_ngram_hit_counts; - hyp_ngram_counts += d.hyp_ngram_counts; - //scale the reference length according to the size of the input sentence covered by this rule - - ref_len *= (float)oracle_f_cover / src_len; - ref_len += d.ref_len; - - hyp_len = oracle_e_cover; - hyp_len += d.hyp_len; -} - - -ScoreP BLEUScore::GetZero() const { - return ScoreP(new BLEUScore(N())); -} - -ScoreP BLEUScore::GetOne() const { - return ScoreP(new BLEUScore(N(),1)); -} - - -void BLEUScore::Encode(string* out) const { - ostringstream os; - const int n = correct_ngram_hit_counts.size(); - os << n << ' ' << ref_len << ' ' << hyp_len; - for (int i = 0; i < n; ++i) - os << ' ' << correct_ngram_hit_counts[i] << ' ' << hyp_ngram_counts[i]; - *out = os.str(); -} - -BLEUScorerBase::BLEUScorerBase(const vector<vector<WordID> >& references, - int n) : SentenceScorer("BLEU"+boost::lexical_cast<string>(n),references),n_(n) { - for (vector<vector<WordID> >::const_iterator ci = references.begin(); - ci != references.end(); ++ci) { - lengths_.push_back(ci->size()); - CountRef(*ci); - } -} - -ScoreP BLEUScorerBase::ScoreCandidate(const vector<WordID>& hyp) const { - BLEUScore* bs = new BLEUScore(n_); - for (NGramCountMap::iterator i=ngrams_.begin(); i != ngrams_.end(); ++i) - i->second.second = 0; - ComputeNgramStats(hyp, &bs->correct_ngram_hit_counts, &bs->hyp_ngram_counts, true); - bs->ref_len = ComputeRefLength(hyp); - bs->hyp_len = hyp.size(); - return ScoreP(bs); -} - -ScoreP BLEUScorerBase::ScoreCCandidate(const vector<WordID>& hyp) const { - BLEUScore* bs = new BLEUScore(n_); - for (NGramCountMap::iterator i=ngrams_.begin(); i != ngrams_.end(); ++i) - i->second.second = 0; - bool clip = false; - ComputeNgramStats(hyp, &bs->correct_ngram_hit_counts, &bs->hyp_ngram_counts,clip); - bs->ref_len = ComputeRefLength(hyp); - bs->hyp_len = hyp.size(); - return ScoreP(bs); -} - - -DocScorer::~DocScorer() { -} - -void DocScorer::Init( - const ScoreType type, - const vector<string>& ref_files, - const string& src_file, bool verbose) { - scorers_.clear(); - // TODO stop using valarray, start using ReadFile - cerr << "Loading references (" << ref_files.size() << " files)\n"; - ReadFile srcrf; - if (type == AER && src_file.size() > 0) { - cerr << " (source=" << src_file << ")\n"; - srcrf.Init(src_file); - } - std::vector<ReadFile> ifs(ref_files.begin(),ref_files.end()); - for (int i=0; i < ref_files.size(); ++i) ifs[i].Init(ref_files[i]); - char buf[64000]; - bool expect_eof = false; - int line=0; - while (ifs[0].get()) { - vector<vector<WordID> > refs(ref_files.size()); - for (int i=0; i < ref_files.size(); ++i) { - istream &in=ifs[i].get(); - if (in.eof()) break; - in.getline(buf, 64000); - refs[i].clear(); - if (strlen(buf) == 0) { - if (in.eof()) { - if (!expect_eof) { - assert(i == 0); - expect_eof = true; - } - break; - } - } else { - TD::ConvertSentence(buf, &refs[i]); - assert(!refs[i].empty()); - } - assert(!expect_eof); - } - if (!expect_eof) { - string src_line; - if (srcrf) { - getline(srcrf.get(), src_line); - map<string,string> dummy; - ProcessAndStripSGML(&src_line, &dummy); - } - scorers_.push_back(ScorerP(SentenceScorer::CreateSentenceScorer(type, refs, src_line))); - if (verbose) - cerr<<"doc_scorer["<<line<<"] = "<<scorers_.back()->verbose_desc()<<endl; - ++line; - } - } - cerr << "Loaded reference translations for " << scorers_.size() << " sentences.\n"; -} - diff --git a/vest/scorer.h b/vest/scorer.h deleted file mode 100644 index 0c8b380f..00000000 --- a/vest/scorer.h +++ /dev/null @@ -1,111 +0,0 @@ -#ifndef SCORER_H_ -#define SCORER_H_ -#include <vector> -#include <string> -#include <boost/shared_ptr.hpp> -//TODO: use intrusive shared_ptr in Score (because there are many of them on ErrorSurfaces) -#include "wordid.h" -#include "intrusive_refcount.hpp" - -class Score; -class SentenceScorer; -typedef boost::intrusive_ptr<Score> ScoreP; -typedef boost::shared_ptr<SentenceScorer> ScorerP; - -class ViterbiEnvelope; -class ErrorSurface; -class Hypergraph; // needed for alignment - -//TODO: BLEU N (N separate arg, not part of enum)? -enum ScoreType { IBM_BLEU, NIST_BLEU, Koehn_BLEU, TER, BLEU_minus_TER_over_2, SER, AER, IBM_BLEU_3 }; -ScoreType ScoreTypeFromString(const std::string& st); -std::string StringFromScoreType(ScoreType st); - -class Score : public boost::intrusive_refcount<Score> { - public: - virtual ~Score(); - virtual float ComputeScore() const = 0; - virtual float ComputePartialScore() const =0; - virtual void ScoreDetails(std::string* details) const = 0; - std::string ScoreDetails() { - std::string d; - ScoreDetails(&d); - return d; - } - virtual void TimesEquals(float scale); // only for bleu; for mira oracle - /// same as rhs.TimesEquals(scale);PlusEquals(rhs) except doesn't modify rhs. - virtual void PlusEquals(const Score& rhs, const float scale) = 0; - virtual void PlusEquals(const Score& rhs) = 0; - virtual void PlusPartialEquals(const Score& rhs, int oracle_e_cover, int oracle_f_cover, int src_len) = 0; - virtual void Subtract(const Score& rhs, Score *res) const = 0; - virtual ScoreP GetZero() const = 0; - virtual ScoreP GetOne() const = 0; - virtual bool IsAdditiveIdentity() const = 0; // returns true if adding this delta - // to another score results in no score change - // under any circumstances - virtual void Encode(std::string* out) const = 0; - static ScoreP GetZero(ScoreType type); - static ScoreP GetOne(ScoreType type); - virtual ScoreP Clone() const = 0; -protected: - Score() { } // we define these explicitly because refcount is noncopyable - Score(Score const& o) { } -}; - -//TODO: make sure default copy ctors for score types do what we want. -template <class Derived> -struct ScoreBase : public Score { - ScoreP Clone() const { - return ScoreP(new Derived(dynamic_cast<Derived const&>(*this))); - } -}; - -class SentenceScorer { - public: - typedef std::vector<WordID> Sentence; - typedef std::vector<Sentence> Sentences; - std::string desc; - Sentences refs; - SentenceScorer(std::string desc="SentenceScorer_unknown", Sentences const& refs=Sentences()) : desc(desc),refs(refs) { } - std::string verbose_desc() const; - virtual float ComputeRefLength(const Sentence& hyp) const; // default: avg of refs.length - virtual ~SentenceScorer(); - virtual ScoreP GetOne() const; - virtual ScoreP GetZero() const; - void ComputeErrorSurface(const ViterbiEnvelope& ve, ErrorSurface* es, const ScoreType type, const Hypergraph& hg) const; - virtual ScoreP ScoreCandidate(const Sentence& hyp) const = 0; - virtual ScoreP ScoreCCandidate(const Sentence& hyp) const =0; - virtual const std::string* GetSource() const; - static ScoreP CreateScoreFromString(const ScoreType type, const std::string& in); - static ScorerP CreateSentenceScorer(const ScoreType type, - const std::vector<Sentence >& refs, - const std::string& src = ""); -}; - -//TODO: should be able to GetOne GetZero without supplying sentence (just type) -class DocScorer { - public: - ~DocScorer(); - DocScorer() { } - void Init(const ScoreType type, - const std::vector<std::string>& ref_files, - const std::string& src_file = "", - bool verbose=false - ); - DocScorer(const ScoreType type, - const std::vector<std::string>& ref_files, - const std::string& src_file = "", - bool verbose=false - ) - { - Init(type,ref_files,src_file,verbose); - } - - int size() const { return scorers_.size(); } - ScorerP operator[](size_t i) const { return scorers_[i]; } - private: - std::vector<ScorerP> scorers_; -}; - - -#endif diff --git a/vest/ter.cc b/vest/ter.cc deleted file mode 100644 index cacc5b00..00000000 --- a/vest/ter.cc +++ /dev/null @@ -1,535 +0,0 @@ -#include "ter.h" - -#include <cstdio> -#include <cassert> -#include <iostream> -#include <limits> -#include <sstream> -#include <tr1/unordered_map> -#include <set> -#include <valarray> -#include <boost/functional/hash.hpp> -#include <stdexcept> -#include "tdict.h" - -const bool ter_use_average_ref_len = true; -const int ter_short_circuit_long_sentences = -1; - -using namespace std; -using namespace std::tr1; - -struct COSTS { - static const float substitution; - static const float deletion; - static const float insertion; - static const float shift; -}; -const float COSTS::substitution = 1.0f; -const float COSTS::deletion = 1.0f; -const float COSTS::insertion = 1.0f; -const float COSTS::shift = 1.0f; - -static const int MAX_SHIFT_SIZE = 10; -static const int MAX_SHIFT_DIST = 50; - -struct Shift { - unsigned int d_; - Shift() : d_() {} - Shift(int b, int e, int m) : d_() { - begin(b); - end(e); - moveto(m); - } - inline int begin() const { - return d_ & 0x3ff; - } - inline int end() const { - return (d_ >> 10) & 0x3ff; - } - inline int moveto() const { - int m = (d_ >> 20) & 0x7ff; - if (m > 1024) { m -= 1024; m *= -1; } - return m; - } - inline void begin(int b) { - d_ &= 0xfffffc00u; - d_ |= (b & 0x3ff); - } - inline void end(int e) { - d_ &= 0xfff003ffu; - d_ |= (e & 0x3ff) << 10; - } - inline void moveto(int m) { - bool neg = (m < 0); - if (neg) { m *= -1; m += 1024; } - d_ &= 0xfffff; - d_ |= (m & 0x7ff) << 20; - } -}; - -class TERScorerImpl { - - public: - enum TransType { MATCH, SUBSTITUTION, INSERTION, DELETION }; - - explicit TERScorerImpl(const vector<WordID>& ref) : ref_(ref) { - for (int i = 0; i < ref.size(); ++i) - rwexists_.insert(ref[i]); - } - - float Calculate(const vector<WordID>& hyp, int* subs, int* ins, int* dels, int* shifts) const { - return CalculateAllShifts(hyp, subs, ins, dels, shifts); - } - - inline int GetRefLength() const { - return ref_.size(); - } - - private: - vector<WordID> ref_; - set<WordID> rwexists_; - - typedef unordered_map<vector<WordID>, set<int>, boost::hash<vector<WordID> > > NgramToIntsMap; - mutable NgramToIntsMap nmap_; - - static float MinimumEditDistance( - const vector<WordID>& hyp, - const vector<WordID>& ref, - vector<TransType>* path) { - vector<vector<TransType> > bmat(hyp.size() + 1, vector<TransType>(ref.size() + 1, MATCH)); - vector<vector<float> > cmat(hyp.size() + 1, vector<float>(ref.size() + 1, 0)); - for (int i = 0; i <= hyp.size(); ++i) - cmat[i][0] = i; - for (int j = 0; j <= ref.size(); ++j) - cmat[0][j] = j; - for (int i = 1; i <= hyp.size(); ++i) { - const WordID& hw = hyp[i-1]; - for (int j = 1; j <= ref.size(); ++j) { - const WordID& rw = ref[j-1]; - float& cur_c = cmat[i][j]; - TransType& cur_b = bmat[i][j]; - - if (rw == hw) { - cur_c = cmat[i-1][j-1]; - cur_b = MATCH; - } else { - cur_c = cmat[i-1][j-1] + COSTS::substitution; - cur_b = SUBSTITUTION; - } - float cwoi = cmat[i-1][j]; - if (cur_c > cwoi + COSTS::insertion) { - cur_c = cwoi + COSTS::insertion; - cur_b = INSERTION; - } - float cwod = cmat[i][j-1]; - if (cur_c > cwod + COSTS::deletion) { - cur_c = cwod + COSTS::deletion; - cur_b = DELETION; - } - } - } - - // trace back along the best path and record the transition types - path->clear(); - int i = hyp.size(); - int j = ref.size(); - while (i > 0 || j > 0) { - if (j == 0) { - --i; - path->push_back(INSERTION); - } else if (i == 0) { - --j; - path->push_back(DELETION); - } else { - TransType t = bmat[i][j]; - path->push_back(t); - switch (t) { - case SUBSTITUTION: - case MATCH: - --i; --j; break; - case INSERTION: - --i; break; - case DELETION: - --j; break; - } - } - } - reverse(path->begin(), path->end()); - return cmat[hyp.size()][ref.size()]; - } - - void BuildWordMatches(const vector<WordID>& hyp, NgramToIntsMap* nmap) const { - nmap->clear(); - set<WordID> exists_both; - for (int i = 0; i < hyp.size(); ++i) - if (rwexists_.find(hyp[i]) != rwexists_.end()) - exists_both.insert(hyp[i]); - for (int start=0; start<ref_.size(); ++start) { - if (exists_both.find(ref_[start]) == exists_both.end()) continue; - vector<WordID> cp; - int mlen = min(MAX_SHIFT_SIZE, static_cast<int>(ref_.size() - start)); - for (int len=0; len<mlen; ++len) { - if (len && exists_both.find(ref_[start + len]) == exists_both.end()) break; - cp.push_back(ref_[start + len]); - (*nmap)[cp].insert(start); - } - } - } - - static void PerformShift(const vector<WordID>& in, - int start, int end, int moveto, vector<WordID>* out) { - // cerr << "ps: " << start << " " << end << " " << moveto << endl; - out->clear(); - if (moveto == -1) { - for (int i = start; i <= end; ++i) - out->push_back(in[i]); - for (int i = 0; i < start; ++i) - out->push_back(in[i]); - for (int i = end+1; i < in.size(); ++i) - out->push_back(in[i]); - } else if (moveto < start) { - for (int i = 0; i <= moveto; ++i) - out->push_back(in[i]); - for (int i = start; i <= end; ++i) - out->push_back(in[i]); - for (int i = moveto+1; i < start; ++i) - out->push_back(in[i]); - for (int i = end+1; i < in.size(); ++i) - out->push_back(in[i]); - } else if (moveto > end) { - for (int i = 0; i < start; ++i) - out->push_back(in[i]); - for (int i = end+1; i <= moveto; ++i) - out->push_back(in[i]); - for (int i = start; i <= end; ++i) - out->push_back(in[i]); - for (int i = moveto+1; i < in.size(); ++i) - out->push_back(in[i]); - } else { - for (int i = 0; i < start; ++i) - out->push_back(in[i]); - for (int i = end+1; (i < in.size()) && (i <= end + (moveto - start)); ++i) - out->push_back(in[i]); - for (int i = start; i <= end; ++i) - out->push_back(in[i]); - for (int i = (end + (moveto - start))+1; i < in.size(); ++i) - out->push_back(in[i]); - } - if (out->size() != in.size()) { - cerr << "ps: " << start << " " << end << " " << moveto << endl; - cerr << "in=" << TD::GetString(in) << endl; - cerr << "out=" << TD::GetString(*out) << endl; - } - assert(out->size() == in.size()); - // cerr << "ps: " << TD::GetString(*out) << endl; - } - - void GetAllPossibleShifts(const vector<WordID>& hyp, - const vector<int>& ralign, - const vector<bool>& herr, - const vector<bool>& rerr, - const int min_size, - vector<vector<Shift> >* shifts) const { - for (int start = 0; start < hyp.size(); ++start) { - vector<WordID> cp(1, hyp[start]); - NgramToIntsMap::iterator niter = nmap_.find(cp); - if (niter == nmap_.end()) continue; - bool ok = false; - int moveto; - for (set<int>::iterator i = niter->second.begin(); i != niter->second.end(); ++i) { - moveto = *i; - int rm = ralign[moveto]; - ok = (start != rm && - (rm - start) < MAX_SHIFT_DIST && - (start - rm - 1) < MAX_SHIFT_DIST); - if (ok) break; - } - if (!ok) continue; - cp.clear(); - for (int end = start + min_size - 1; - ok && end < hyp.size() && end < (start + MAX_SHIFT_SIZE); ++end) { - cp.push_back(hyp[end]); - vector<Shift>& sshifts = (*shifts)[end - start]; - ok = false; - NgramToIntsMap::iterator niter = nmap_.find(cp); - if (niter == nmap_.end()) break; - bool any_herr = false; - for (int i = start; i <= end && !any_herr; ++i) - any_herr = herr[i]; - if (!any_herr) { - ok = true; - continue; - } - for (set<int>::iterator mi = niter->second.begin(); - mi != niter->second.end(); ++mi) { - int moveto = *mi; - int rm = ralign[moveto]; - if (! ((rm != start) && - ((rm < start) || (rm > end)) && - (rm - start <= MAX_SHIFT_DIST) && - ((start - rm - 1) <= MAX_SHIFT_DIST))) continue; - ok = true; - bool any_rerr = false; - for (int i = 0; (i <= end - start) && (!any_rerr); ++i) - any_rerr = rerr[moveto+i]; - if (!any_rerr) continue; - for (int roff = 0; roff <= (end - start); ++roff) { - int rmr = ralign[moveto+roff]; - if ((start != rmr) && ((roff == 0) || (rmr != ralign[moveto]))) - sshifts.push_back(Shift(start, end, moveto + roff)); - } - } - } - } - } - - bool CalculateBestShift(const vector<WordID>& cur, - const vector<WordID>& hyp, - float curerr, - const vector<TransType>& path, - vector<WordID>* new_hyp, - float* newerr, - vector<TransType>* new_path) const { - vector<bool> herr, rerr; - vector<int> ralign; - int hpos = -1; - for (int i = 0; i < path.size(); ++i) { - switch (path[i]) { - case MATCH: - ++hpos; - herr.push_back(false); - rerr.push_back(false); - ralign.push_back(hpos); - break; - case SUBSTITUTION: - ++hpos; - herr.push_back(true); - rerr.push_back(true); - ralign.push_back(hpos); - break; - case INSERTION: - ++hpos; - herr.push_back(true); - break; - case DELETION: - rerr.push_back(true); - ralign.push_back(hpos); - break; - } - } -#if 0 - cerr << "RALIGN: "; - for (int i = 0; i < rerr.size(); ++i) - cerr << ralign[i] << " "; - cerr << endl; - cerr << "RERR: "; - for (int i = 0; i < rerr.size(); ++i) - cerr << (bool)rerr[i] << " "; - cerr << endl; - cerr << "HERR: "; - for (int i = 0; i < herr.size(); ++i) - cerr << (bool)herr[i] << " "; - cerr << endl; -#endif - - vector<vector<Shift> > shifts(MAX_SHIFT_SIZE + 1); - GetAllPossibleShifts(cur, ralign, herr, rerr, 1, &shifts); - float cur_best_shift_cost = 0; - *newerr = curerr; - vector<TransType> cur_best_path; - vector<WordID> cur_best_hyp; - - bool res = false; - for (int i = shifts.size() - 1; i >=0; --i) { - float curfix = curerr - (cur_best_shift_cost + *newerr); - float maxfix = 2.0f * (1 + i) - COSTS::shift; - if ((curfix > maxfix) || ((cur_best_shift_cost == 0) && (curfix == maxfix))) break; - for (int j = 0; j < shifts[i].size(); ++j) { - const Shift& s = shifts[i][j]; - curfix = curerr - (cur_best_shift_cost + *newerr); - maxfix = 2.0f * (1 + i) - COSTS::shift; // TODO remove? - if ((curfix > maxfix) || ((cur_best_shift_cost == 0) && (curfix == maxfix))) continue; - vector<WordID> shifted(cur.size()); - PerformShift(cur, s.begin(), s.end(), ralign[s.moveto()], &shifted); - vector<TransType> try_path; - float try_cost = MinimumEditDistance(shifted, ref_, &try_path); - float gain = (*newerr + cur_best_shift_cost) - (try_cost + COSTS::shift); - if (gain > 0.0f || ((cur_best_shift_cost == 0.0f) && (gain == 0.0f))) { - *newerr = try_cost; - cur_best_shift_cost = COSTS::shift; - new_path->swap(try_path); - new_hyp->swap(shifted); - res = true; - // cerr << "Found better shift " << s.begin() << "..." << s.end() << " moveto " << s.moveto() << endl; - } - } - } - - return res; - } - - static void GetPathStats(const vector<TransType>& path, int* subs, int* ins, int* dels) { - *subs = *ins = *dels = 0; - for (int i = 0; i < path.size(); ++i) { - switch (path[i]) { - case SUBSTITUTION: - ++(*subs); - case MATCH: - break; - case INSERTION: - ++(*ins); break; - case DELETION: - ++(*dels); break; - } - } - } - - float CalculateAllShifts(const vector<WordID>& hyp, - int* subs, int* ins, int* dels, int* shifts) const { - BuildWordMatches(hyp, &nmap_); - vector<TransType> path; - float med_cost = MinimumEditDistance(hyp, ref_, &path); - float edits = 0; - vector<WordID> cur = hyp; - *shifts = 0; - if (ter_short_circuit_long_sentences < 0 || - ref_.size() < ter_short_circuit_long_sentences) { - while (true) { - vector<WordID> new_hyp; - vector<TransType> new_path; - float new_med_cost; - if (!CalculateBestShift(cur, hyp, med_cost, path, &new_hyp, &new_med_cost, &new_path)) - break; - edits += COSTS::shift; - ++(*shifts); - med_cost = new_med_cost; - path.swap(new_path); - cur.swap(new_hyp); - } - } - GetPathStats(path, subs, ins, dels); - return med_cost + edits; - } -}; - -class TERScore : public ScoreBase<TERScore> { - friend class TERScorer; - - public: - static const unsigned kINSERTIONS = 0; - static const unsigned kDELETIONS = 1; - static const unsigned kSUBSTITUTIONS = 2; - static const unsigned kSHIFTS = 3; - static const unsigned kREF_WORDCOUNT = 4; - static const unsigned kDUMMY_LAST_ENTRY = 5; - - TERScore() : stats(0,kDUMMY_LAST_ENTRY) {} - float ComputePartialScore() const { return 0.0;} - float ComputeScore() const { - float edits = static_cast<float>(stats[kINSERTIONS] + stats[kDELETIONS] + stats[kSUBSTITUTIONS] + stats[kSHIFTS]); - return edits / static_cast<float>(stats[kREF_WORDCOUNT]); - } - void ScoreDetails(string* details) const; - void PlusPartialEquals(const Score& rhs, int oracle_e_cover, int oracle_f_cover, int src_len){} - void PlusEquals(const Score& delta, const float scale) { - if (scale==1) - stats += static_cast<const TERScore&>(delta).stats; - if (scale==-1) - stats -= static_cast<const TERScore&>(delta).stats; - throw std::runtime_error("TERScore::PlusEquals with scale != +-1"); - } - void PlusEquals(const Score& delta) { - stats += static_cast<const TERScore&>(delta).stats; - } - - ScoreP GetZero() const { - return ScoreP(new TERScore); - } - ScoreP GetOne() const { - return ScoreP(new TERScore); - } - void Subtract(const Score& rhs, Score* res) const { - static_cast<TERScore*>(res)->stats = stats - static_cast<const TERScore&>(rhs).stats; - } - void Encode(std::string* out) const { - ostringstream os; - os << stats[kINSERTIONS] << ' ' - << stats[kDELETIONS] << ' ' - << stats[kSUBSTITUTIONS] << ' ' - << stats[kSHIFTS] << ' ' - << stats[kREF_WORDCOUNT]; - *out = os.str(); - } - bool IsAdditiveIdentity() const { - for (int i = 0; i < kDUMMY_LAST_ENTRY; ++i) - if (stats[i] != 0) return false; - return true; - } - private: - valarray<int> stats; -}; - -ScoreP TERScorer::ScoreFromString(const std::string& data) { - istringstream is(data); - TERScore* r = new TERScore; - is >> r->stats[TERScore::kINSERTIONS] - >> r->stats[TERScore::kDELETIONS] - >> r->stats[TERScore::kSUBSTITUTIONS] - >> r->stats[TERScore::kSHIFTS] - >> r->stats[TERScore::kREF_WORDCOUNT]; - return ScoreP(r); -} - -void TERScore::ScoreDetails(std::string* details) const { - char buf[200]; - sprintf(buf, "TER = %.2f, %3d|%3d|%3d|%3d (len=%d)", - ComputeScore() * 100.0f, - stats[kINSERTIONS], - stats[kDELETIONS], - stats[kSUBSTITUTIONS], - stats[kSHIFTS], - stats[kREF_WORDCOUNT]); - *details = buf; -} - -TERScorer::~TERScorer() { - for (vector<TERScorerImpl*>::iterator i = impl_.begin(); i != impl_.end(); ++i) - delete *i; -} - -TERScorer::TERScorer(const vector<vector<WordID> >& refs) : impl_(refs.size()) { - for (int i = 0; i < refs.size(); ++i) - impl_[i] = new TERScorerImpl(refs[i]); -} - -ScoreP TERScorer::ScoreCCandidate(const vector<WordID>& hyp) const { - return ScoreP(); -} - -ScoreP TERScorer::ScoreCandidate(const std::vector<WordID>& hyp) const { - float best_score = numeric_limits<float>::max(); - TERScore* res = new TERScore; - int avg_len = 0; - for (int i = 0; i < impl_.size(); ++i) - avg_len += impl_[i]->GetRefLength(); - avg_len /= impl_.size(); - for (int i = 0; i < impl_.size(); ++i) { - int subs, ins, dels, shifts; - float score = impl_[i]->Calculate(hyp, &subs, &ins, &dels, &shifts); - // cerr << "Component TER cost: " << score << endl; - if (score < best_score) { - res->stats[TERScore::kINSERTIONS] = ins; - res->stats[TERScore::kDELETIONS] = dels; - res->stats[TERScore::kSUBSTITUTIONS] = subs; - res->stats[TERScore::kSHIFTS] = shifts; - if (ter_use_average_ref_len) { - res->stats[TERScore::kREF_WORDCOUNT] = avg_len; - } else { - res->stats[TERScore::kREF_WORDCOUNT] = impl_[i]->GetRefLength(); - } - - best_score = score; - } - } - return ScoreP(res); -} diff --git a/vest/ter.h b/vest/ter.h deleted file mode 100644 index 43314791..00000000 --- a/vest/ter.h +++ /dev/null @@ -1,19 +0,0 @@ -#ifndef _TER_H_ -#define _TER_H_ - -#include "scorer.h" - -class TERScorerImpl; - -class TERScorer : public SentenceScorer { - public: - TERScorer(const std::vector<std::vector<WordID> >& references); - ~TERScorer(); - ScoreP ScoreCandidate(const std::vector<WordID>& hyp) const; - ScoreP ScoreCCandidate(const std::vector<WordID>& hyp) const; - static ScoreP ScoreFromString(const std::string& data); - private: - std::vector<TERScorerImpl*> impl_; -}; - -#endif |