diff options
Diffstat (limited to 'vest')
| -rw-r--r-- | vest/ces.cc | 42 | ||||
| -rw-r--r-- | vest/ces.h | 10 | ||||
| -rwxr-xr-x | vest/dist-vest.pl | 4 | ||||
| -rw-r--r-- | vest/error_surface.cc | 11 | ||||
| -rw-r--r-- | vest/error_surface.h | 6 | ||||
| -rw-r--r-- | vest/line_optimizer.cc | 20 | ||||
| -rw-r--r-- | vest/line_optimizer.h | 2 | ||||
| -rw-r--r-- | vest/lo_test.cc | 21 | ||||
| -rw-r--r-- | vest/mr_vest_map.cc | 16 | ||||
| -rw-r--r-- | vest/mr_vest_reduce.cc | 34 | 
10 files changed, 88 insertions, 78 deletions
| diff --git a/vest/ces.cc b/vest/ces.cc index 4ae6b695..cd89aa69 100644 --- a/vest/ces.cc +++ b/vest/ces.cc @@ -4,25 +4,32 @@  #include <sstream>  #include <boost/shared_ptr.hpp> -#include "aligner.h" +// TODO, if AER is to be optimized again, we will need this +// #include "aligner.h"  #include "lattice.h"  #include "viterbi_envelope.h"  #include "error_surface.h" +#include "ns.h"  using boost::shared_ptr;  using namespace std;  const bool minimize_segments = true;    // if adjacent segments have equal scores, merge them -void ComputeErrorSurface(const SentenceScorer& ss, const ViterbiEnvelope& ve, ErrorSurface* env, const ScoreType type, const Hypergraph& hg) { +void ComputeErrorSurface(const SegmentEvaluator& ss, +                         const ViterbiEnvelope& ve, +                         ErrorSurface* env, +                         const EvaluationMetric* metric, +                         const Hypergraph& hg) {    vector<WordID> prev_trans;    const vector<shared_ptr<Segment> >& ienv = ve.GetSortedSegs();    env->resize(ienv.size()); -  ScoreP prev_score; +  SufficientStats prev_score; // defaults to 0    int j = 0;    for (int i = 0; i < ienv.size(); ++i) {      const Segment& seg = *ienv[i];      vector<WordID> trans; +#if 0      if (type == AER) {        vector<bool> edges(hg.edges_.size(), false);        seg.CollectEdgesUsed(&edges);  // get the set of edges in the viterbi @@ -46,34 +53,31 @@ void ComputeErrorSurface(const SentenceScorer& ss, const ViterbiEnvelope& ve, Er        string tstr = os.str();        TD::ConvertSentence(tstr.substr(tstr.rfind(" ||| ") + 5), &trans);      } else { +#endif        seg.ConstructTranslation(&trans); -    } -    // cerr << "Scoring: " << TD::GetString(trans) << endl; +    //} +    //cerr << "Scoring: " << TD::GetString(trans) << endl;      if (trans == prev_trans) {        if (!minimize_segments) { -        assert(prev_score); // if this fails, it means -	                    // the decoder can generate null translations          ErrorSegment& out = (*env)[j]; -        out.delta = prev_score->GetZero(); +        out.delta.fields.clear();          out.x = seg.x;  	++j;        } -      // cerr << "Identical translation, skipping scoring\n"; +      //cerr << "Identical translation, skipping scoring\n";      } else { -      ScoreP score = ss.ScoreCandidate(trans); +      SufficientStats score; +      ss.Evaluate(trans, &score);        // cerr << "score= " << score->ComputeScore() << "\n"; -      ScoreP cur_delta_p = score->GetZero(); -      Score* cur_delta = cur_delta_p.get(); -      // just record the score diffs -      if (!prev_score) -        prev_score = score->GetZero(); - -      score->Subtract(*prev_score, cur_delta); +      //string x1; score.Encode(&x1); cerr << "STATS: " << x1 << endl; +      const SufficientStats delta = score - prev_score; +      //string x2; delta.Encode(&x2); cerr << "DELTA: " << x2 << endl; +      //string xx; delta.Encode(&xx); cerr << xx << endl;        prev_trans.swap(trans);        prev_score = score; -      if ((!minimize_segments) || (!cur_delta->IsAdditiveIdentity())) { +      if ((!minimize_segments) || (!delta.IsAdditiveIdentity())) {          ErrorSegment& out = (*env)[j]; -        out.delta = cur_delta_p; +        out.delta = delta;          out.x = seg.x;          ++j;        } @@ -1,12 +1,16 @@  #ifndef _CES_H_  #define _CES_H_ -#include "scorer.h" -  class ViterbiEnvelope;  class Hypergraph; +class SegmentEvaluator;  class ErrorSurface; +class EvaluationMetric; -void ComputeErrorSurface(const SentenceScorer& ss, const ViterbiEnvelope& ve, ErrorSurface* es, const ScoreType type, const Hypergraph& hg); +void ComputeErrorSurface(const SegmentEvaluator& ss, +                         const ViterbiEnvelope& ve, +                         ErrorSurface* es, +                         const EvaluationMetric* metric, +                         const Hypergraph& hg);  #endif diff --git a/vest/dist-vest.pl b/vest/dist-vest.pl index c382a972..8cde748b 100755 --- a/vest/dist-vest.pl +++ b/vest/dist-vest.pl @@ -364,7 +364,7 @@ while (1){  			$mapoutput =~ s/mapinput/mapoutput/;  			push @mapoutputs, "$dir/splag.$im1/$mapoutput";  			$o2i{"$dir/splag.$im1/$mapoutput"} = "$dir/splag.$im1/$shard"; -			my $script = "$MAPPER -s $srcFile -l $metric $refs_comma_sep < $dir/splag.$im1/$shard | sort -t \$'\\t' -k 1 > $dir/splag.$im1/$mapoutput"; +			my $script = "$MAPPER -s $srcFile -m $metric $refs_comma_sep < $dir/splag.$im1/$shard | sort -t \$'\\t' -k 1 > $dir/splag.$im1/$mapoutput";  			if ($use_make) {  				my $script_file = "$dir/scripts/map.$shard";  				open F, ">$script_file" or die "Can't write $script_file: $!"; @@ -424,7 +424,7 @@ while (1){  		print STDERR "Results for $tol/$til lines\n";  		print STDERR "\nSORTING AND RUNNING VEST REDUCER\n";  		print STDERR unchecked_output("date"); -		$cmd="sort -t \$'\\t' -k 1 @mapoutputs | $REDUCER -l $metric > $dir/redoutput.$im1"; +		$cmd="sort -t \$'\\t' -k 1 @mapoutputs | $REDUCER -m $metric > $dir/redoutput.$im1";  		print STDERR "COMMAND:\n$cmd\n";  		check_bash_call($cmd);  		$cmd="sort -nk3 $DIR_FLAG '-t|' $dir/redoutput.$im1 | head -1"; diff --git a/vest/error_surface.cc b/vest/error_surface.cc index 754aa8de..515b67f8 100644 --- a/vest/error_surface.cc +++ b/vest/error_surface.cc @@ -5,8 +5,7 @@  using namespace std; -ErrorSurface::~ErrorSurface() { -} +ErrorSurface::~ErrorSurface() {}  void ErrorSurface::Serialize(std::string* out) const {    const int segments = this->size(); @@ -15,8 +14,8 @@ void ErrorSurface::Serialize(std::string* out) const {    for (int i = 0; i < segments; ++i) {      const ErrorSegment& cur = (*this)[i];      string senc; -    cur.delta->Encode(&senc); -    assert(senc.size() < 256); +    cur.delta.Encode(&senc); +    assert(senc.size() < 1024);      unsigned char len = senc.size();      os.write((const char*)&cur.x, sizeof(cur.x));      os.write((const char*)&len, sizeof(len)); @@ -25,7 +24,7 @@ void ErrorSurface::Serialize(std::string* out) const {    *out = os.str();  } -void ErrorSurface::Deserialize(ScoreType type, const std::string& in) { +void ErrorSurface::Deserialize(const std::string& in) {    istringstream is(in, ios::binary);    int segments;    is.read((char*)&segments, sizeof(segments)); @@ -37,7 +36,7 @@ void ErrorSurface::Deserialize(ScoreType type, const std::string& in) {      is.read((char*)&len, sizeof(len));      string senc(len, '\0'); assert(senc.size() == len);      is.read((char*)&senc[0], len); -    cur.delta = SentenceScorer::CreateScoreFromString(type, senc); +    cur.delta = SufficientStats(senc);    }  } diff --git a/vest/error_surface.h b/vest/error_surface.h index ad728cfa..bb65847b 100644 --- a/vest/error_surface.h +++ b/vest/error_surface.h @@ -4,13 +4,13 @@  #include <vector>  #include <string> -#include "scorer.h" +#include "ns.h"  class Score;  struct ErrorSegment {    double x; -  ScoreP delta; +  SufficientStats delta;    ErrorSegment() : x(0), delta() {}  }; @@ -18,7 +18,7 @@ class ErrorSurface : public std::vector<ErrorSegment> {   public:    ~ErrorSurface();    void Serialize(std::string* out) const; -  void Deserialize(ScoreType type, const std::string& in); +  void Deserialize(const std::string& in);  };  #endif diff --git a/vest/line_optimizer.cc b/vest/line_optimizer.cc index 7303df8d..49443fbe 100644 --- a/vest/line_optimizer.cc +++ b/vest/line_optimizer.cc @@ -4,7 +4,7 @@  #include <algorithm>  #include "sparse_vector.h" -#include "scorer.h" +#include "ns.h"  using namespace std; @@ -18,6 +18,7 @@ struct IntervalComp {  };  double LineOptimizer::LineOptimize( +    const EvaluationMetric* metric,      const vector<ErrorSurface>& surfaces,      const LineOptimizer::ScoreType type,      float* best_score, @@ -32,8 +33,7 @@ double LineOptimizer::LineOptimize(    }    sort(all_ints.begin(), all_ints.end(), IntervalComp());    double last_boundary = all_ints.front()->x; -  ScoreP accp = all_ints.front()->delta->GetZero(); -  Score *acc=accp.get(); +  SufficientStats acc;    float& cur_best_score = *best_score;    cur_best_score = (type == MAXIMIZE_SCORE ?      -numeric_limits<float>::max() : numeric_limits<float>::max()); @@ -42,9 +42,8 @@ double LineOptimizer::LineOptimize(    for (vector<ErrorIter>::iterator i = all_ints.begin();         i != all_ints.end(); ++i) {      const ErrorSegment& seg = **i; -    assert(seg.delta);      if (seg.x - last_boundary > epsilon) { -      float sco = acc->ComputeScore(); +      float sco = metric->ComputeScore(acc);        if ((type == MAXIMIZE_SCORE && sco > cur_best_score) ||            (type == MINIMIZE_SCORE && sco < cur_best_score) ) {          cur_best_score = sco; @@ -54,16 +53,18 @@ double LineOptimizer::LineOptimize(  	} else {  	  pos = last_boundary + (seg.x - last_boundary) / 2;  	} -	// cerr << "NEW BEST: " << pos << "  (score=" << cur_best_score << ")\n"; +	//cerr << "NEW BEST: " << pos << "  (score=" << cur_best_score << ")\n";        } -      // string xx; acc->ScoreDetails(&xx); cerr << "---- " << xx; +      // string xx = metric->DetailedScore(acc); cerr << "---- " << xx;        // cerr << "---- s=" << sco << "\n";        last_boundary = seg.x;      }      // cerr << "x-boundary=" << seg.x << "\n"; -    acc->PlusEquals(*seg.delta); +    //string x2; acc.Encode(&x2); cerr << "   ACC: " << x2 << endl; +    //string x1; seg.delta.Encode(&x1); cerr << " DELTA: " << x1 << endl; +    acc += seg.delta;    } -  float sco = acc->ComputeScore(); +  float sco = metric->ComputeScore(acc);    if ((type == MAXIMIZE_SCORE && sco > cur_best_score) ||        (type == MINIMIZE_SCORE && sco < cur_best_score) ) {      cur_best_score = sco; @@ -107,3 +108,4 @@ void LineOptimizer::CreateOptimizationDirections(       RandomUnitVector(features_to_optimize, &out[i], rng);    cerr << "Generated " << out.size() << " total axes to optimize along.\n";  } + diff --git a/vest/line_optimizer.h b/vest/line_optimizer.h index 99a591f4..83819f41 100644 --- a/vest/line_optimizer.h +++ b/vest/line_optimizer.h @@ -7,6 +7,7 @@  #include "error_surface.h"  #include "sampler.h" +class EvaluationMetric;  class Weights;  struct LineOptimizer { @@ -18,6 +19,7 @@ struct LineOptimizer {    // merge all the error surfaces together into a global    // error surface and find (the middle of) the best segment    static double LineOptimize( +     const EvaluationMetric* metric,       const std::vector<ErrorSurface>& envs,       const LineOptimizer::ScoreType type,       float* best_score, diff --git a/vest/lo_test.cc b/vest/lo_test.cc index f5638600..a67f65e1 100644 --- a/vest/lo_test.cc +++ b/vest/lo_test.cc @@ -5,6 +5,8 @@  #include <boost/shared_ptr.hpp>  #include <gtest/gtest.h> +#include "ns.h" +#include "ns_docscorer.h"  #include "ces.h"  #include "fdict.h"  #include "hg.h" @@ -15,7 +17,6 @@  #include "viterbi.h"  #include "viterbi_envelope.h"  #include "line_optimizer.h" -#include "scorer.h"  using namespace std;  using boost::shared_ptr; @@ -141,9 +142,6 @@ TEST_F(OptTest, TestS1) {    TD::ConvertSentence(ref22, &refs2[1]);    TD::ConvertSentence(ref32, &refs2[2]);    TD::ConvertSentence(ref42, &refs2[3]); -  ScoreType type = ScoreTypeFromString("ibm_bleu"); -  ScorerP scorer1 = SentenceScorer::CreateSentenceScorer(type, refs1); -  ScorerP scorer2 = SentenceScorer::CreateSentenceScorer(type, refs2);    vector<ViterbiEnvelope> envs(2);    RandomNumberGenerator<boost::mt19937> rng; @@ -167,14 +165,17 @@ TEST_F(OptTest, TestS1) {    envs[1] = Inside<ViterbiEnvelope, ViterbiEnvelopeWeightFunction>(hg2, NULL, wf);    vector<ErrorSurface> es(2); -  ComputeErrorSurface(*scorer1, envs[0], &es[0], IBM_BLEU, hg); -  ComputeErrorSurface(*scorer2, envs[1], &es[1], IBM_BLEU, hg2); +  EvaluationMetric* metric = EvaluationMetric::Instance("IBM_BLEU"); +  boost::shared_ptr<SegmentEvaluator> scorer1 = metric->CreateSegmentEvaluator(refs1); +  boost::shared_ptr<SegmentEvaluator> scorer2 = metric->CreateSegmentEvaluator(refs2); +  ComputeErrorSurface(*scorer1, envs[0], &es[0], metric, hg); +  ComputeErrorSurface(*scorer2, envs[1], &es[1], metric, hg2);    cerr << envs[0].size() << " " << envs[1].size() << endl;    cerr << es[0].size() << " " << es[1].size() << endl;    envs.clear();    clock_t t_env=clock();    float score; -  double m = LineOptimizer::LineOptimize(es, LineOptimizer::MAXIMIZE_SCORE, &score); +  double m = LineOptimizer::LineOptimize(metric,es, LineOptimizer::MAXIMIZE_SCORE, &score);    clock_t t_opt=clock();    cerr << "line optimizer returned: " << m << " (SCORE=" << score << ")\n";    EXPECT_FLOAT_EQ(0.48719698, score); @@ -217,15 +218,15 @@ TEST_F(OptTest,TestZeroOrigin) {    vector<ViterbiEnvelope> envs(1);    envs[0] = Inside<ViterbiEnvelope, ViterbiEnvelopeWeightFunction>(hg, NULL, wf); -  ScoreType type = ScoreTypeFromString("ibm_bleu");    vector<vector<WordID> > mr(4);    TD::ConvertSentence("untitled", &mr[0]);    TD::ConvertSentence("with no title", &mr[1]);    TD::ConvertSentence("without a title", &mr[2]);    TD::ConvertSentence("without title", &mr[3]); -  ScorerP scorer1 = SentenceScorer::CreateSentenceScorer(type, mr); +  EvaluationMetric* metric = EvaluationMetric::Instance("IBM_BLEU"); +  boost::shared_ptr<SegmentEvaluator> scorer1 = metric->CreateSegmentEvaluator(mr);    vector<ErrorSurface> es(1); -  ComputeErrorSurface(*scorer1, envs[0], &es[0], IBM_BLEU, hg); +  ComputeErrorSurface(*scorer1, envs[0], &es[0], metric, hg);  }  int main(int argc, char **argv) { diff --git a/vest/mr_vest_map.cc b/vest/mr_vest_map.cc index 71dda6d7..8f6e085d 100644 --- a/vest/mr_vest_map.cc +++ b/vest/mr_vest_map.cc @@ -6,11 +6,12 @@  #include <boost/program_options.hpp>  #include <boost/program_options/variables_map.hpp> +#include "ns.h" +#include "ns_docscorer.h"  #include "ces.h"  #include "filelib.h"  #include "stringlib.h"  #include "sparse_vector.h" -#include "scorer.h"  #include "viterbi_envelope.h"  #include "inside_outside.h"  #include "error_surface.h" @@ -25,7 +26,7 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) {    opts.add_options()          ("reference,r",po::value<vector<string> >(), "[REQD] Reference translation (tokenized text)")          ("source,s",po::value<string>(), "Source file (ignored, except for AER)") -        ("loss_function,l",po::value<string>()->default_value("ibm_bleu"), "Loss function being optimized") +        ("evaluation_metric,m",po::value<string>()->default_value("ibm_bleu"), "Evaluation metric being optimized")          ("input,i",po::value<string>()->default_value("-"), "Input file to map (- is STDIN)")          ("help,h", "Help");    po::options_description dcmdline_options; @@ -67,10 +68,10 @@ bool ReadSparseVectorString(const string& s, SparseVector<double>* v) {  int main(int argc, char** argv) {    po::variables_map conf;    InitCommandLine(argc, argv, &conf); -  const string loss_function = conf["loss_function"].as<string>(); -  ScoreType type = ScoreTypeFromString(loss_function); -  DocScorer ds(type, conf["reference"].as<vector<string> >(), conf["source"].as<string>()); -  cerr << "Loaded " << ds.size() << " references for scoring with " << loss_function << endl; +  const string evaluation_metric = conf["evaluation_metric"].as<string>(); +  EvaluationMetric* metric = EvaluationMetric::Instance(evaluation_metric); +  DocumentScorer ds(metric, conf["reference"].as<vector<string> >()); +  cerr << "Loaded " << ds.size() << " references for scoring with " << evaluation_metric << endl;    Hypergraph hg;    string last_file;    ReadFile in_read(conf["input"].as<string>()); @@ -97,7 +98,8 @@ int main(int argc, char** argv) {      ViterbiEnvelopeWeightFunction wf(origin, axis);      ViterbiEnvelope ve = Inside<ViterbiEnvelope, ViterbiEnvelopeWeightFunction>(hg, NULL, wf);      ErrorSurface es; -    ComputeErrorSurface(*ds[sent_id], ve, &es, type, hg); + +    ComputeErrorSurface(*ds[sent_id], ve, &es, metric, hg);      //cerr << "Viterbi envelope has " << ve.size() << " segments\n";      // cerr << "Error surface has " << es.size() << " segments\n";      string val; diff --git a/vest/mr_vest_reduce.cc b/vest/mr_vest_reduce.cc index 3df52020..dda61f88 100644 --- a/vest/mr_vest_reduce.cc +++ b/vest/mr_vest_reduce.cc @@ -10,6 +10,7 @@  #include "error_surface.h"  #include "line_optimizer.h"  #include "b64tools.h" +#include "stringlib.h"  using namespace std;  namespace po = boost::program_options; @@ -17,12 +18,12 @@ namespace po = boost::program_options;  void InitCommandLine(int argc, char** argv, po::variables_map* conf) {    po::options_description opts("Configuration options");    opts.add_options() -        ("loss_function,l",po::value<string>(), "Loss function being optimized") +        ("evaluation_metric,m",po::value<string>(), "Evaluation metric (IBM_BLEU, etc.)")          ("help,h", "Help");    po::options_description dcmdline_options;    dcmdline_options.add(opts);    po::store(parse_command_line(argc, argv, dcmdline_options), *conf); -  bool flag = conf->count("loss_function") == 0; +  bool flag = conf->count("evaluation_metric") == 0;    if (flag || conf->count("help")) {      cerr << dcmdline_options << endl;      exit(1); @@ -32,30 +33,27 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) {  int main(int argc, char** argv) {    po::variables_map conf;    InitCommandLine(argc, argv, &conf); -  const string loss_function = conf["loss_function"].as<string>(); -  ScoreType type = ScoreTypeFromString(loss_function); +  const string evaluation_metric = conf["evaluation_metric"].as<string>();    LineOptimizer::ScoreType opt_type = LineOptimizer::MAXIMIZE_SCORE; -  if (type == TER || type == AER) { +  if (UppercaseString(evaluation_metric) == "TER")      opt_type = LineOptimizer::MINIMIZE_SCORE; -  } -  string last_key; +  EvaluationMetric* metric = EvaluationMetric::Instance(evaluation_metric); +    vector<ErrorSurface> esv; -  while(cin) { -    string line; -    getline(cin, line); -    if (line.empty()) continue; +  string last_key, line, key, val; +  while(getline(cin, line)) {      size_t ks = line.find("\t");      assert(string::npos != ks);      assert(ks > 2); -    string key = line.substr(2, ks - 2); -    string val = line.substr(ks + 1); +    key = line.substr(2, ks - 2); +    val = line.substr(ks + 1);      if (key != last_key) {        if (!last_key.empty()) {  	float score; -        double x = LineOptimizer::LineOptimize(esv, opt_type, &score); +        double x = LineOptimizer::LineOptimize(metric, esv, opt_type, &score);  	cout << last_key << "|" << x << "|" << score << endl;        } -      last_key = key; +      last_key.swap(key);        esv.clear();      }      if (val.size() % 4 != 0) { @@ -68,13 +66,11 @@ int main(int argc, char** argv) {        continue;      }      esv.push_back(ErrorSurface()); -    esv.back().Deserialize(type, encoded); +    esv.back().Deserialize(encoded);    }    if (!esv.empty()) { -    // cerr << "ESV=" << esv.size() << endl; -    // for (int i = 0; i < esv.size(); ++i) { cerr << esv[i].size() << endl; }      float score; -    double x = LineOptimizer::LineOptimize(esv, opt_type, &score); +    double x = LineOptimizer::LineOptimize(metric, esv, opt_type, &score);      cout << last_key << "|" << x << "|" << score << endl;    }    return 0; | 
