diff options
author | graehl <graehl@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-07-15 03:50:05 +0000 |
---|---|---|
committer | graehl <graehl@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-07-15 03:50:05 +0000 |
commit | f819992b0b22b4fec88c15fe13118aa6b484b91b (patch) | |
tree | 1bf835e4b29ca926a4ca33a2a57743559c9ba58f /vest | |
parent | c61c0f2f664eebcc434ce76e6767fccdbdf6fae2 (diff) |
oracle bleu refactor
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@259 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'vest')
-rw-r--r-- | vest/mr_vest_generate_mapper_input.cc | 15 | ||||
-rw-r--r-- | vest/scorer.cc | 48 | ||||
-rw-r--r-- | vest/scorer.h | 11 |
3 files changed, 56 insertions, 18 deletions
diff --git a/vest/mr_vest_generate_mapper_input.cc b/vest/mr_vest_generate_mapper_input.cc index 5c3e8181..c0f80d0c 100644 --- a/vest/mr_vest_generate_mapper_input.cc +++ b/vest/mr_vest_generate_mapper_input.cc @@ -11,6 +11,8 @@ #include "line_optimizer.h" #include "hg.h" #include "hg_io.h" +#include "scorer.h" +#include "oracle_bleu.h" using namespace std; namespace po = boost::program_options; @@ -30,16 +32,20 @@ struct oracle_directions { return o.str(); } - oracle_directions(string forest_repository,unsigned dev_set_size,vector<int> const& fids=vector<int>()): forest_repository(forest_repository),dev_set_size(dev_set_size),fids(fids) { + void set_dev_set_size(int i) { + dev_set_size=i; dirs.resize(dev_set_size); } + + oracle_directions(string forest_repository="",unsigned dev_set_sz=0,vector<int> const& fids=vector<int>()): forest_repository(forest_repository),fids(fids) { + set_dev_set_size(dev_set_sz); + } + Dir const& operator[](unsigned i) { Dir &dir=dirs[i]; if (dir.empty()) { ReadFile rf(forest_file(i)); - Hypergraph hg; - HypergraphIO::ReadFromJSON(rf.stream(), &hg); - cerr<<"oracle: forest["<<i<<"] loaded: "<<hg.stats()<<endl; + FeatureVector fear,hope,best; //TODO: get hope/oracle from vlad. random for now. LineOptimizer::RandomUnitVector(fids,&dir,&rng); } @@ -86,6 +92,7 @@ void compress_similar(vector<Dir> &dirs,double min_dist,ostream *log=&cerr,bool void InitCommandLine(int argc, char** argv, po::variables_map* conf) { po::options_description opts("Configuration options"); + OracleBleu::AddOptions(&opts); opts.add_options() ("dev_set_size,s",po::value<unsigned int>(),"[REQD] Development set size (# of parallel sentences)") ("forest_repository,r",po::value<string>(),"[REQD] Path to forest repository") diff --git a/vest/scorer.cc b/vest/scorer.cc index 524b15a5..8f981af6 100644 --- a/vest/scorer.cc +++ b/vest/scorer.cc @@ -6,6 +6,7 @@ #include <fstream> #include <cstdio> #include <valarray> +#include <algorithm> #include <boost/shared_ptr.hpp> @@ -47,8 +48,37 @@ ScoreType ScoreTypeFromString(const string& st) { return IBM_BLEU; } +static char const* score_names[]={ + "IBM_BLEU", "NIST_BLEU", "Koehn_BLEU", "TER", "BLEU_minus_TER_over_2", "SER", "AER", "IBM_BLEU_3" +}; + +std::string StringFromScoreType(ScoreType st) { + assert(st>=0 && st<sizeof(score_names)/sizeof(score_names[0])); + return score_names[(int)st]; +} + + Score::~Score() {} SentenceScorer::~SentenceScorer() {} + +struct length_accum { + template <class S> + float operator()(float sum,S const& ref) const { + return sum+ref.size(); + } +}; + +template <class S> +float avg_reflength(vector<S> refs) { + unsigned n=refs.size(); + return n?accumulate(refs.begin(),refs.end(),0.,length_accum())/n:0.; +} + + +float SentenceScorer::ComputeRefLength(const Sentence &hyp) const { + return hyp.size(); // reasonable default? :) +} + const std::string* SentenceScorer::GetSource() const { return NULL; } class SERScore : public Score { @@ -64,9 +94,9 @@ class SERScore : public Score { os << "SER= " << ComputeScore() << " (" << correct << '/' << total << ')'; *details = os.str(); } - void PlusPartialEquals(const Score& delta, int oracle_e_cover, int oracle_f_cover, int src_len){} - - void PlusEquals(const Score& delta, const float scale) { + void PlusPartialEquals(const Score& /* delta */, int /* oracle_e_cover */, int /* oracle_f_cover */, int /* src_len */){} + + void PlusEquals(const Score& delta, const float /* scale */) { correct += static_cast<const SERScore&>(delta).correct; total += static_cast<const SERScore&>(delta).total; } @@ -94,7 +124,7 @@ class SERScore : public Score { class SERScorer : public SentenceScorer { public: SERScorer(const vector<vector<WordID> >& references) : refs_(references) {} - Score* ScoreCCandidate(const vector<WordID>& hyp) const { + Score* ScoreCCandidate(const vector<WordID>& /* hyp */) const { Score* a = NULL; return a; } @@ -120,7 +150,7 @@ class BLEUScore : public Score { hyp_len = 0; } BLEUScore(int n, int k) : correct_ngram_hit_counts(float(k),float(n)), hyp_ngram_counts(float(k),float(n)) { ref_len = k; - hyp_len = k; } + hyp_len = k; } float ComputeScore() const; float ComputePartialScore() const; void ScoreDetails(string* details) const; @@ -156,7 +186,6 @@ class BLEUScorerBase : public SentenceScorer { Score* ScoreCCandidate(const vector<WordID>& hyp) const; static Score* ScoreFromString(const string& in); - protected: virtual float ComputeRefLength(const vector<WordID>& hyp) const = 0; private: struct NGramCompare { @@ -257,7 +286,6 @@ class IBM_BLEUScorer : public BLEUScorerBase { for (int i=0; i < references.size(); ++i) lengths_[i] = references[i].size(); } - protected: float ComputeRefLength(const vector<WordID>& hyp) const { if (lengths_.size() == 1) return lengths_[0]; int bestd = 2000000; @@ -285,7 +313,6 @@ class NIST_BLEUScorer : public BLEUScorerBase { if (references[i].size() < shortest_) shortest_ = references[i].size(); } - protected: float ComputeRefLength(const vector<WordID>& /* hyp */) const { return shortest_; } @@ -302,7 +329,6 @@ class Koehn_BLEUScorer : public BLEUScorerBase { avg_ += references[i].size(); avg_ /= references.size(); } - protected: float ComputeRefLength(const vector<WordID>& /* hyp */) const { return avg_; } @@ -520,10 +546,10 @@ void BLEUScore::PlusPartialEquals(const Score& delta, int oracle_e_cover, int or correct_ngram_hit_counts += d.correct_ngram_hit_counts; hyp_ngram_counts += d.hyp_ngram_counts; //scale the reference length according to the size of the input sentence covered by this rule - + ref_len *= (float)oracle_f_cover / src_len; ref_len += d.ref_len; - + hyp_len = oracle_e_cover; hyp_len += d.hyp_len; } diff --git a/vest/scorer.h b/vest/scorer.h index 7ce688c4..5bfeee0f 100644 --- a/vest/scorer.h +++ b/vest/scorer.h @@ -12,6 +12,7 @@ class Hypergraph; // needed for alignment enum ScoreType { IBM_BLEU, NIST_BLEU, Koehn_BLEU, TER, BLEU_minus_TER_over_2, SER, AER, IBM_BLEU_3 }; ScoreType ScoreTypeFromString(const std::string& st); +std::string StringFromScoreType(ScoreType st); class Score { public: @@ -33,20 +34,24 @@ class Score { class SentenceScorer { public: + typedef std::vector<WordID> Sentence; + virtual float ComputeRefLength(const Sentence& hyp) const; // default: avg of refs.length virtual ~SentenceScorer(); void ComputeErrorSurface(const ViterbiEnvelope& ve, ErrorSurface* es, const ScoreType type, const Hypergraph& hg) const; - virtual Score* ScoreCandidate(const std::vector<WordID>& hyp) const = 0; - virtual Score* ScoreCCandidate(const std::vector<WordID>& hyp) const =0; + virtual Score* ScoreCandidate(const Sentence& hyp) const = 0; + virtual Score* ScoreCCandidate(const Sentence& hyp) const =0; virtual const std::string* GetSource() const; static Score* CreateScoreFromString(const ScoreType type, const std::string& in); static SentenceScorer* CreateSentenceScorer(const ScoreType type, - const std::vector<std::vector<WordID> >& refs, + const std::vector<Sentence >& refs, const std::string& src = ""); }; +//TODO: should be able to GetOne GetZero without supplying sentence (just type) class DocScorer { public: ~DocScorer(); + DocScorer() { } DocScorer( const ScoreType type, const std::vector<std::string>& ref_files, |