summary | refs | log | tree | commit | diff
path: root/vest
diff options
context:
space:
mode:
author graehl <graehl@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-07-15 03:50:05 +0000
committer graehl <graehl@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-07-15 03:50:05 +0000
commit f819992b0b22b4fec88c15fe13118aa6b484b91b (patch)
tree 1bf835e4b29ca926a4ca33a2a57743559c9ba58f /vest
parent c61c0f2f664eebcc434ce76e6767fccdbdf6fae2 (diff)
oracle bleu refactor
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@259 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'vest')
-rw-r--r-- vest/mr_vest_generate_mapper_input.cc 15
-rw-r--r-- vest/scorer.cc 48
-rw-r--r-- vest/scorer.h 11
3 files changed, 56 insertions, 18 deletions
diff --git a/vest/mr_vest_generate_mapper_input.cc b/vest/mr_vest_generate_mapper_input.cc
index 5c3e8181..c0f80d0c 100644
--- a/vest/mr_vest_generate_mapper_input.cc
+++ b/vest/mr_vest_generate_mapper_input.cc
@@ -11,6 +11,8 @@
#include "line_optimizer.h"
#include "hg.h"
#include "hg_io.h"
+#include "scorer.h"
+#include "oracle_bleu.h"
using namespace std;
namespace po = boost::program_options;
@@ -30,16 +32,20 @@ struct oracle_directions {
return o.str();
}
- oracle_directions(string forest_repository,unsigned dev_set_size,vector<int> const& fids=vector<int>()): forest_repository(forest_repository),dev_set_size(dev_set_size),fids(fids) {
+ void set_dev_set_size(int i) {
+ dev_set_size=i;
dirs.resize(dev_set_size);
}
+
+ oracle_directions(string forest_repository="",unsigned dev_set_sz=0,vector<int> const& fids=vector<int>()): forest_repository(forest_repository),fids(fids) {
+ set_dev_set_size(dev_set_sz);
+ }
+
Dir const& operator[](unsigned i) {
Dir &dir=dirs[i];
if (dir.empty()) {
ReadFile rf(forest_file(i));
- Hypergraph hg;
- HypergraphIO::ReadFromJSON(rf.stream(), &hg);
- cerr<<"oracle: forest["<<i<<"] loaded: "<<hg.stats()<<endl;
+ FeatureVector fear,hope,best;
//TODO: get hope/oracle from vlad. random for now.
LineOptimizer::RandomUnitVector(fids,&dir,&rng);
}
@@ -86,6 +92,7 @@ void compress_similar(vector<Dir> &dirs,double min_dist,ostream *log=&cerr,bool
void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
po::options_description opts("Configuration options");
+ OracleBleu::AddOptions(&opts);
opts.add_options()
("dev_set_size,s",po::value<unsigned int>(),"[REQD] Development set size (# of parallel sentences)")
("forest_repository,r",po::value<string>(),"[REQD] Path to forest repository")
diff --git a/vest/scorer.cc b/vest/scorer.cc
index 524b15a5..8f981af6 100644
--- a/vest/scorer.cc
+++ b/vest/scorer.cc
@@ -6,6 +6,7 @@
#include <fstream>
#include <cstdio>
#include <valarray>
+#include <algorithm>
#include <boost/shared_ptr.hpp>
@@ -47,8 +48,37 @@ ScoreType ScoreTypeFromString(const string& st) {
return IBM_BLEU;
}
+static char const* score_names[]={
+ "IBM_BLEU", "NIST_BLEU", "Koehn_BLEU", "TER", "BLEU_minus_TER_over_2", "SER", "AER", "IBM_BLEU_3"
+};
+
+std::string StringFromScoreType(ScoreType st) {
+ assert(st>=0 && st<sizeof(score_names)/sizeof(score_names[0]));
+ return score_names[(int)st];
+}
+
+
Score::~Score() {}
SentenceScorer::~SentenceScorer() {}
+
+struct length_accum {
+ template <class S>
+ float operator()(float sum,S const& ref) const {
+ return sum+ref.size();
+ }
+};
+
+template <class S>
+float avg_reflength(vector<S> refs) {
+ unsigned n=refs.size();
+ return n?accumulate(refs.begin(),refs.end(),0.,length_accum())/n:0.;
+}
+
+
+float SentenceScorer::ComputeRefLength(const Sentence &hyp) const {
+ return hyp.size(); // reasonable default? :)
+}
+
const std::string* SentenceScorer::GetSource() const { return NULL; }
class SERScore : public Score {
@@ -64,9 +94,9 @@ class SERScore : public Score {
os << "SER= " << ComputeScore() << " (" << correct << '/' << total << ')';
*details = os.str();
}
- void PlusPartialEquals(const Score& delta, int oracle_e_cover, int oracle_f_cover, int src_len){}
-
- void PlusEquals(const Score& delta, const float scale) {
+ void PlusPartialEquals(const Score& /* delta */, int /* oracle_e_cover */, int /* oracle_f_cover */, int /* src_len */){}
+
+ void PlusEquals(const Score& delta, const float /* scale */) {
correct += static_cast<const SERScore&>(delta).correct;
total += static_cast<const SERScore&>(delta).total;
}
@@ -94,7 +124,7 @@ class SERScore : public Score {
class SERScorer : public SentenceScorer {
public:
SERScorer(const vector<vector<WordID> >& references) : refs_(references) {}
- Score* ScoreCCandidate(const vector<WordID>& hyp) const {
+ Score* ScoreCCandidate(const vector<WordID>& /* hyp */) const {
Score* a = NULL;
return a;
}
@@ -120,7 +150,7 @@ class BLEUScore : public Score {
hyp_len = 0; }
BLEUScore(int n, int k) : correct_ngram_hit_counts(float(k),float(n)), hyp_ngram_counts(float(k),float(n)) {
ref_len = k;
- hyp_len = k; }
+ hyp_len = k; }
float ComputeScore() const;
float ComputePartialScore() const;
void ScoreDetails(string* details) const;
@@ -156,7 +186,6 @@ class BLEUScorerBase : public SentenceScorer {
Score* ScoreCCandidate(const vector<WordID>& hyp) const;
static Score* ScoreFromString(const string& in);
- protected:
virtual float ComputeRefLength(const vector<WordID>& hyp) const = 0;
private:
struct NGramCompare {
@@ -257,7 +286,6 @@ class IBM_BLEUScorer : public BLEUScorerBase {
for (int i=0; i < references.size(); ++i)
lengths_[i] = references[i].size();
}
- protected:
float ComputeRefLength(const vector<WordID>& hyp) const {
if (lengths_.size() == 1) return lengths_[0];
int bestd = 2000000;
@@ -285,7 +313,6 @@ class NIST_BLEUScorer : public BLEUScorerBase {
if (references[i].size() < shortest_)
shortest_ = references[i].size();
}
- protected:
float ComputeRefLength(const vector<WordID>& /* hyp */) const {
return shortest_;
}
@@ -302,7 +329,6 @@ class Koehn_BLEUScorer : public BLEUScorerBase {
avg_ += references[i].size();
avg_ /= references.size();
}
- protected:
float ComputeRefLength(const vector<WordID>& /* hyp */) const {
return avg_;
}
@@ -520,10 +546,10 @@ void BLEUScore::PlusPartialEquals(const Score& delta, int oracle_e_cover, int or
correct_ngram_hit_counts += d.correct_ngram_hit_counts;
hyp_ngram_counts += d.hyp_ngram_counts;
//scale the reference length according to the size of the input sentence covered by this rule
-
+
ref_len *= (float)oracle_f_cover / src_len;
ref_len += d.ref_len;
-
+
hyp_len = oracle_e_cover;
hyp_len += d.hyp_len;
}
diff --git a/vest/scorer.h b/vest/scorer.h
index 7ce688c4..5bfeee0f 100644
--- a/vest/scorer.h
+++ b/vest/scorer.h
@@ -12,6 +12,7 @@ class Hypergraph; // needed for alignment
enum ScoreType { IBM_BLEU, NIST_BLEU, Koehn_BLEU, TER, BLEU_minus_TER_over_2, SER, AER, IBM_BLEU_3 };
ScoreType ScoreTypeFromString(const std::string& st);
+std::string StringFromScoreType(ScoreType st);
class Score {
public:
@@ -33,20 +34,24 @@ class Score {
class SentenceScorer {
public:
+ typedef std::vector<WordID> Sentence;
+ virtual float ComputeRefLength(const Sentence& hyp) const; // default: avg of refs.length
virtual ~SentenceScorer();
void ComputeErrorSurface(const ViterbiEnvelope& ve, ErrorSurface* es, const ScoreType type, const Hypergraph& hg) const;
- virtual Score* ScoreCandidate(const std::vector<WordID>& hyp) const = 0;
- virtual Score* ScoreCCandidate(const std::vector<WordID>& hyp) const =0;
+ virtual Score* ScoreCandidate(const Sentence& hyp) const = 0;
+ virtual Score* ScoreCCandidate(const Sentence& hyp) const =0;
virtual const std::string* GetSource() const;
static Score* CreateScoreFromString(const ScoreType type, const std::string& in);
static SentenceScorer* CreateSentenceScorer(const ScoreType type,
- const std::vector<std::vector<WordID> >& refs,
+ const std::vector<Sentence >& refs,
const std::string& src = "");
};
+//TODO: should be able to GetOne GetZero without supplying sentence (just type)
class DocScorer {
public:
~DocScorer();
+ DocScorer() { }
DocScorer(
const ScoreType type,
const std::vector<std::string>& ref_files,