From 27ed3c0fecde089a761ccf718748413bb572a3a4 Mon Sep 17 00:00:00 2001 From: graehl Date: Thu, 15 Jul 2010 03:50:05 +0000 Subject: oracle bleu refactor git-svn-id: https://ws10smt.googlecode.com/svn/trunk@259 ec762483-ff6d-05da-a07a-a48fb63a330f --- vest/mr_vest_generate_mapper_input.cc | 15 ++++++++--- vest/scorer.cc | 48 +++++++++++++++++++++++++++-------- vest/scorer.h | 11 +++++--- 3 files changed, 56 insertions(+), 18 deletions(-) (limited to 'vest') diff --git a/vest/mr_vest_generate_mapper_input.cc b/vest/mr_vest_generate_mapper_input.cc index 5c3e8181..c0f80d0c 100644 --- a/vest/mr_vest_generate_mapper_input.cc +++ b/vest/mr_vest_generate_mapper_input.cc @@ -11,6 +11,8 @@ #include "line_optimizer.h" #include "hg.h" #include "hg_io.h" +#include "scorer.h" +#include "oracle_bleu.h" using namespace std; namespace po = boost::program_options; @@ -30,16 +32,20 @@ struct oracle_directions { return o.str(); } - oracle_directions(string forest_repository,unsigned dev_set_size,vector const& fids=vector()): forest_repository(forest_repository),dev_set_size(dev_set_size),fids(fids) { + void set_dev_set_size(int i) { + dev_set_size=i; dirs.resize(dev_set_size); } + + oracle_directions(string forest_repository="",unsigned dev_set_sz=0,vector const& fids=vector()): forest_repository(forest_repository),fids(fids) { + set_dev_set_size(dev_set_sz); + } + Dir const& operator[](unsigned i) { Dir &dir=dirs[i]; if (dir.empty()) { ReadFile rf(forest_file(i)); - Hypergraph hg; - HypergraphIO::ReadFromJSON(rf.stream(), &hg); - cerr<<"oracle: forest["< &dirs,double min_dist,ostream *log=&cerr,bool void InitCommandLine(int argc, char** argv, po::variables_map* conf) { po::options_description opts("Configuration options"); + OracleBleu::AddOptions(&opts); opts.add_options() ("dev_set_size,s",po::value(),"[REQD] Development set size (# of parallel sentences)") ("forest_repository,r",po::value(),"[REQD] Path to forest repository") diff --git a/vest/scorer.cc b/vest/scorer.cc index 524b15a5..8f981af6 100644 --- a/vest/scorer.cc +++ b/vest/scorer.cc @@ -6,6 +6,7 @@ #include #include #include +#include #include @@ -47,8 +48,37 @@ ScoreType ScoreTypeFromString(const string& st) { return IBM_BLEU; } +static char const* score_names[]={ + "IBM_BLEU", "NIST_BLEU", "Koehn_BLEU", "TER", "BLEU_minus_TER_over_2", "SER", "AER", "IBM_BLEU_3" +}; + +std::string StringFromScoreType(ScoreType st) { + assert(st>=0 && st + float operator()(float sum,S const& ref) const { + return sum+ref.size(); + } +}; + +template +float avg_reflength(vector refs) { + unsigned n=refs.size(); + return n?accumulate(refs.begin(),refs.end(),0.,length_accum())/n:0.; +} + + +float SentenceScorer::ComputeRefLength(const Sentence &hyp) const { + return hyp.size(); // reasonable default? :) +} + const std::string* SentenceScorer::GetSource() const { return NULL; } class SERScore : public Score { @@ -64,9 +94,9 @@ class SERScore : public Score { os << "SER= " << ComputeScore() << " (" << correct << '/' << total << ')'; *details = os.str(); } - void PlusPartialEquals(const Score& delta, int oracle_e_cover, int oracle_f_cover, int src_len){} - - void PlusEquals(const Score& delta, const float scale) { + void PlusPartialEquals(const Score& /* delta */, int /* oracle_e_cover */, int /* oracle_f_cover */, int /* src_len */){} + + void PlusEquals(const Score& delta, const float /* scale */) { correct += static_cast(delta).correct; total += static_cast(delta).total; } @@ -94,7 +124,7 @@ class SERScore : public Score { class SERScorer : public SentenceScorer { public: SERScorer(const vector >& references) : refs_(references) {} - Score* ScoreCCandidate(const vector& hyp) const { + Score* ScoreCCandidate(const vector& /* hyp */) const { Score* a = NULL; return a; } @@ -120,7 +150,7 @@ class BLEUScore : public Score { hyp_len = 0; } BLEUScore(int n, int k) : correct_ngram_hit_counts(float(k),float(n)), hyp_ngram_counts(float(k),float(n)) { ref_len = k; - hyp_len = k; } + hyp_len = k; } float ComputeScore() const; float ComputePartialScore() const; void ScoreDetails(string* details) const; @@ -156,7 +186,6 @@ class BLEUScorerBase : public SentenceScorer { Score* ScoreCCandidate(const vector& hyp) const; static Score* ScoreFromString(const string& in); - protected: virtual float ComputeRefLength(const vector& hyp) const = 0; private: struct NGramCompare { @@ -257,7 +286,6 @@ class IBM_BLEUScorer : public BLEUScorerBase { for (int i=0; i < references.size(); ++i) lengths_[i] = references[i].size(); } - protected: float ComputeRefLength(const vector& hyp) const { if (lengths_.size() == 1) return lengths_[0]; int bestd = 2000000; @@ -285,7 +313,6 @@ class NIST_BLEUScorer : public BLEUScorerBase { if (references[i].size() < shortest_) shortest_ = references[i].size(); } - protected: float ComputeRefLength(const vector& /* hyp */) const { return shortest_; } @@ -302,7 +329,6 @@ class Koehn_BLEUScorer : public BLEUScorerBase { avg_ += references[i].size(); avg_ /= references.size(); } - protected: float ComputeRefLength(const vector& /* hyp */) const { return avg_; } @@ -520,10 +546,10 @@ void BLEUScore::PlusPartialEquals(const Score& delta, int oracle_e_cover, int or correct_ngram_hit_counts += d.correct_ngram_hit_counts; hyp_ngram_counts += d.hyp_ngram_counts; //scale the reference length according to the size of the input sentence covered by this rule - + ref_len *= (float)oracle_f_cover / src_len; ref_len += d.ref_len; - + hyp_len = oracle_e_cover; hyp_len += d.hyp_len; } diff --git a/vest/scorer.h b/vest/scorer.h index 7ce688c4..5bfeee0f 100644 --- a/vest/scorer.h +++ b/vest/scorer.h @@ -12,6 +12,7 @@ class Hypergraph; // needed for alignment enum ScoreType { IBM_BLEU, NIST_BLEU, Koehn_BLEU, TER, BLEU_minus_TER_over_2, SER, AER, IBM_BLEU_3 }; ScoreType ScoreTypeFromString(const std::string& st); +std::string StringFromScoreType(ScoreType st); class Score { public: @@ -33,20 +34,24 @@ class Score { class SentenceScorer { public: + typedef std::vector Sentence; + virtual float ComputeRefLength(const Sentence& hyp) const; // default: avg of refs.length virtual ~SentenceScorer(); void ComputeErrorSurface(const ViterbiEnvelope& ve, ErrorSurface* es, const ScoreType type, const Hypergraph& hg) const; - virtual Score* ScoreCandidate(const std::vector& hyp) const = 0; - virtual Score* ScoreCCandidate(const std::vector& hyp) const =0; + virtual Score* ScoreCandidate(const Sentence& hyp) const = 0; + virtual Score* ScoreCCandidate(const Sentence& hyp) const =0; virtual const std::string* GetSource() const; static Score* CreateScoreFromString(const ScoreType type, const std::string& in); static SentenceScorer* CreateSentenceScorer(const ScoreType type, - const std::vector >& refs, + const std::vector& refs, const std::string& src = ""); }; +//TODO: should be able to GetOne GetZero without supplying sentence (just type) class DocScorer { public: ~DocScorer(); + DocScorer() { } DocScorer( const ScoreType type, const std::vector& ref_files, -- cgit v1.2.3