summary | refs | log | tree | commit | diff
path: root/dtrain
diff options
context:
space:
mode:
author Patrick Simianer <p@simianer.de> 2011-09-26 21:51:52 +0200
committer Patrick Simianer <p@simianer.de> 2011-09-26 21:51:52 +0200
commit 36de7283576dd22a91577ef175c62434f3d933b4 (patch)
tree 544916f6305deb5c281153e7f4e208b6e3a8b568 /dtrain
parent e16b311246f9f2c309b257debd5f50a28b04802b (diff)
got rid of scoring loop
Diffstat (limited to 'dtrain')
-rw-r--r-- dtrain/dtrain.cc 77
-rw-r--r-- dtrain/dtrain.h 25
-rw-r--r-- dtrain/kbestget.h 46
-rw-r--r-- dtrain/ksampler.h 11
-rw-r--r-- dtrain/pairsampling.h 6
-rw-r--r-- dtrain/score.cc 37
-rw-r--r-- dtrain/score.h 44
-rw-r--r-- dtrain/test/example/dtrain.ini 8
-rw-r--r-- dtrain/test/example/weights.gz bin 12001 -> 395 bytes
9 files changed, 98 insertions, 156 deletions
diff --git a/dtrain/dtrain.cc b/dtrain/dtrain.cc
index 35e6cc46..622cd01e 100644
--- a/dtrain/dtrain.cc
+++ b/dtrain/dtrain.cc
@@ -95,38 +95,32 @@ main(int argc, char** argv)
cerr << setw(25) << "cdec cfg " << "'" << cfg["decoder_config"].as<string>() << "'" << endl;
Decoder decoder(ini_rf.stream());
- MT19937 rng; // random number generator
- // setup decoder observer
- HypSampler* observer;
- if (sample_from == "kbest") {
- observer = dynamic_cast<KBestGetter*>(new KBestGetter(k, filter_type));
- } else {
- observer = dynamic_cast<KSampler*>(new KSampler(k, &rng));
- }
-
// scoring metric/scorer
string scorer_str = cfg["scorer"].as<string>();
- /*score_t (*scorer)(NgramCounts&, const unsigned, const unsigned, unsigned, vector<score_t>);
+ LocalScorer* scorer;
if (scorer_str == "bleu") {
- scorer = &bleu;
} else if (scorer_str == "stupid_bleu") {
- scorer = &stupid_bleu;
+ scorer = dynamic_cast<StupidBleuScorer*>(new StupidBleuScorer);
} else if (scorer_str == "smooth_bleu") {
- scorer = &smooth_bleu;
+ scorer = dynamic_cast<SmoothBleuScorer*>(new SmoothBleuScorer);
} else if (scorer_str == "approx_bleu") {
- scorer = &approx_bleu;
+ scorer = dynamic_cast<StupidBleuScorer*>(new StupidBleuScorer); // FIXME
} else {
cerr << "Don't know scoring metric: '" << scorer_str << "', exiting." << endl;
exit(1);
}
- NgramCounts global_counts(N); // counts for 1 best translations
- unsigned global_hyp_len = 0; // sum hypothesis lengths
- unsigned global_ref_len = 0; // sum reference lengths
- // ^^^ global_* for approx_bleu*/
- vector<score_t> bleu_weights; // we leave this empty -> 1/N
- //if (!quiet) cerr << setw(26) << "scorer '" << scorer_str << "'" << endl << endl;
- StupidBleuScorer scorer;
- scorer.Init(N, bleu_weights);
+ vector<score_t> bleu_weights;
+ scorer->Init(N, bleu_weights);
+ if (!quiet) cerr << setw(26) << "scorer '" << scorer_str << "'" << endl << endl;
+
+ // setup decoder observer
+ MT19937 rng; // random number generator
+ HypSampler* observer;
+ if (sample_from == "kbest")
+ observer = dynamic_cast<KBestGetter*>(new KBestGetter(k, filter_type));
+ else
+ observer = dynamic_cast<KSampler*>(new KSampler(k, &rng));
+ observer->SetScorer(scorer);
// init weights
Weights weights;
@@ -240,10 +234,10 @@ main(int argc, char** argv)
vector<WordID> ref_ids; // reference as vector<WordID>
if (t == 0) {
// handling input
- strsplit(in, in_split, '\t', 4);
+ boost::split(in_split, in, boost::is_any_of("\t"));
// getting reference
vector<string> ref_tok;
- strsplit(in_split[2], ref_tok, ' ');
+ boost::split(ref_tok, in_split[2], boost::is_any_of(" "));
register_and_convert(ref_tok, ref_ids);
ref_ids_buf.push_back(ref_ids);
// process and set grammar
@@ -259,8 +253,9 @@ main(int argc, char** argv)
in_split[3] += "\n";
grammar_buf_out << in_split[3] << DTRAIN_GRAMMAR_DELIM << " " << in_split[0] << endl;
decoder.SetSentenceGrammarFromString(in_split[3]);
- // decode
src_str_buf.push_back(in_split[1]);
+ // decode
+ observer->SetRef(ref_ids);
decoder.Decode(in_split[1], observer);
} else {
// get buffered grammar
@@ -273,32 +268,24 @@ main(int argc, char** argv)
}
decoder.SetSentenceGrammarFromString(grammar_str);
// decode
+ observer->SetRef(ref_ids_buf[ii]);
decoder.Decode(src_str_buf[ii], observer);
}
+ // get (scored) samples
vector<ScoredHyp>* samples = observer->GetSamples();
- // (local) scoring
- if (t > 0) ref_ids = ref_ids_buf[ii];
- for (unsigned i = 0; i < samples->size(); i++) {
- //cout << ii << " " << i << endl;
-
- cout << _p9;
- (*samples)[i].score = scorer.Score((*samples)[i], ref_ids, ii);
- if (i == 0) {
- score_sum += (*samples)[i].score;
- model_sum += (*samples)[i].model;
- }
-
- if (verbose) {
- if (i == 0) cerr << "'" << TD::GetString(ref_ids) << "' [ref]" << endl;
- cerr << _p5 << _np << "[hyp " << i << "] " << "'" << TD::GetString((*samples)[i].w) << "'";
- cerr << " [SCORE=" << (*samples)[i].score << ",model="<< (*samples)[i].model << "]" << endl;
- cerr << (*samples)[i].f << endl;
- }
+ if (verbose) {
+ cout << "[ref: '";
+ if (t > 0) cout << ref_ids_buf[ii];
+ else cout << ref_ids;
+ cout << endl;
+ cout << _p5 << _np << "1best: " << "'" << (*samples)[0].w << "'" << endl;
+ cout << "SCORE=" << (*samples)[0].score << ",model="<< (*samples)[0].model << endl;
+ cout << "F{" << (*samples)[0].f << "} ]" << endl << endl;
}
-
- if (verbose) cerr << endl;
+ score_sum += (*samples)[0].score;
+ model_sum += (*samples)[0].model;
//////////////////////////////////////////////////////////
// UPDATE WEIGHTS
diff --git a/dtrain/dtrain.h b/dtrain/dtrain.h
index ed75a297..0c27167d 100644
--- a/dtrain/dtrain.h
+++ b/dtrain/dtrain.h
@@ -7,14 +7,6 @@
#include <boost/algorithm/string.hpp>
#include <boost/program_options.hpp>
-#include "verbose.h"
-#include "viterbi.h"
-#include "ff_register.h"
-#include "decoder.h"
-#include "weights.h"
-
-#include "score.h"
-#include "kbestget.h"
#include "ksampler.h"
#include "pairsampling.h"
@@ -31,27 +23,12 @@ inline void register_and_convert(const vector<string>& strs, vector<WordID>& ids
for (it = strs.begin(); it < strs.end(); it++)
ids.push_back(TD::Convert(*it));
}
+
inline ostream& _np(ostream& out) { return out << resetiosflags(ios::showpos); }
inline ostream& _p(ostream& out) { return out << setiosflags(ios::showpos); }
inline ostream& _p2(ostream& out) { return out << setprecision(2); }
inline ostream& _p5(ostream& out) { return out << setprecision(5); }
inline ostream& _p9(ostream& out) { return out << setprecision(9); }
-inline void strsplit(string &s, vector<string>& v, char d = '\t', unsigned parts = 0) {
- stringstream ss(s);
- string t;
- unsigned i = 0;
- while(true)
- {
- if (parts > 0 && i == parts-1) {
- getline(ss, t);
- v.push_back(t);
- break;
- }
- if (!getline(ss, t, d)) break;
- v.push_back(t);
- i++;
- }
-}
#endif
diff --git a/dtrain/kbestget.h b/dtrain/kbestget.h
index 2a2c6073..c0fd3f47 100644
--- a/dtrain/kbestget.h
+++ b/dtrain/kbestget.h
@@ -1,12 +1,6 @@
#ifndef _DTRAIN_KBESTGET_H_
#define _DTRAIN_KBESTGET_H_
-
-#include <vector>
-#include <string>
-
-using namespace std;
-
#include "kbest.h" // cdec
#include "verbose.h"
#include "viterbi.h"
@@ -14,11 +8,13 @@ using namespace std;
#include "decoder.h"
#include "weights.h"
+using namespace std;
+
namespace dtrain
{
-typedef double score_t; // float
+typedef double score_t; // float
struct ScoredHyp
{
@@ -29,10 +25,44 @@ struct ScoredHyp
unsigned rank;
};
+struct LocalScorer
+{
+ unsigned N_;
+ vector<score_t> w_;
+
+ virtual score_t
+ Score(vector<WordID>& hyp, vector<WordID>& ref)=0;
+
+ void
+ Init(unsigned N, vector<score_t> weights)
+ {
+ assert(N > 0);
+ N_ = N;
+ if (weights.empty()) for (unsigned i = 0; i < N_; i++) w_.push_back(1./N_);
+ else w_ = weights;
+ }
+
+ score_t
+ brevity_penaly(const unsigned hyp_len, const unsigned ref_len)
+ {
+ if (hyp_len > ref_len) return 1;
+ return exp(1 - (score_t)ref_len/hyp_len);
+ }
+};
+
struct HypSampler : public DecoderObserver
{
+ LocalScorer* scorer_;
+ vector<WordID>* ref_;
virtual vector<ScoredHyp>* GetSamples()=0;
+ void SetScorer(LocalScorer* scorer) { scorer_ = scorer; }
+ void SetRef(vector<WordID>& ref) { ref_ = &ref; }
};
+/////////////////////////////////////////////////////////////////////
+// wtf
+
+
+
struct KBestGetter : public HypSampler
{
@@ -77,6 +107,7 @@ struct KBestGetter : public HypSampler
h.f = d->feature_values;
h.model = log(d->score);
h.rank = i;
+ h.score = scorer_->Score(h.w, *ref_);
s_.push_back(h);
}
}
@@ -95,6 +126,7 @@ struct KBestGetter : public HypSampler
h.f = d->feature_values;
h.model = log(d->score);
h.rank = i;
+ h.score = scorer_->Score(h.w, *ref_);
s_.push_back(h);
}
}
diff --git a/dtrain/ksampler.h b/dtrain/ksampler.h
index 767dc42e..7567f43a 100644
--- a/dtrain/ksampler.h
+++ b/dtrain/ksampler.h
@@ -1,15 +1,9 @@
#ifndef _DTRAIN_KSAMPLER_H_
#define _DTRAIN_KSAMPLER_H_
-#include "kbestget.h"
#include "hgsampler.h"
-#include <vector>
-#include <string>
-
-using namespace std;
-
-#include "kbest.h" // cdec
-#include "sampler.h"
+#include "kbestget.h"
+#include "score.h"
namespace dtrain
{
@@ -43,6 +37,7 @@ struct KSampler : public HypSampler
h.f = samples[i].fmap;
h.model = log(samples[i].model_score);
h.rank = i;
+ h.score = scorer_->Score(h.w, *ref_);
s_.push_back(h);
}
}
diff --git a/dtrain/pairsampling.h b/dtrain/pairsampling.h
index 4a6d93d1..6db0c045 100644
--- a/dtrain/pairsampling.h
+++ b/dtrain/pairsampling.h
@@ -1,12 +1,6 @@
#ifndef _DTRAIN_PAIRSAMPLING_H_
#define _DTRAIN_PAIRSAMPLING_H_
-#include "kbestget.h"
-#include "score.h"
-#include <vector>
-#include <string>
-using namespace std;
-#include "sampler.h" // cdec, MT19937
namespace dtrain
{
diff --git a/dtrain/score.cc b/dtrain/score.cc
index 9b22508b..93c4e80b 100644
--- a/dtrain/score.cc
+++ b/dtrain/score.cc
@@ -22,17 +22,17 @@ BleuScorer::Bleu(NgramCounts& counts, const unsigned hyp_len, const unsigned ref
score_t sum = 0;
for (unsigned i = 0; i < M; i++) {
if (counts.clipped[i] == 0 || counts.sum[i] == 0) return 0;
- sum += w_[i] * log((score_t)counts.clipped[i] / counts.sum[i]);
+ sum += w_[i] * log((score_t)counts.clipped[i]/counts.sum[i]);
}
return brevity_penaly(hyp_len, ref_len) * exp(sum);
}
score_t
-BleuScorer::Score(ScoredHyp& hyp, vector<WordID>& ref_ids, unsigned id)
+BleuScorer::Score(vector<WordID>& hyp, vector<WordID>& ref)
{
- unsigned hyp_len = hyp.w.size(), ref_len = ref_ids.size();
+ unsigned hyp_len = hyp.size(), ref_len = ref.size();
if (hyp_len == 0 || ref_len == 0) return 0;
- NgramCounts counts = make_ngram_counts(hyp.w, ref_ids, N_);
+ NgramCounts counts = make_ngram_counts(hyp, ref, N_);
return Bleu(counts, hyp_len, ref_len);
}
@@ -47,30 +47,18 @@ BleuScorer::Score(ScoredHyp& hyp, vector<WordID>& ref_ids, unsigned id)
* NOTE: 0 iff no 1gram match
*/
score_t
-StupidBleuScorer::Score(ScoredHyp& hyp, vector<WordID>& ref_ids, unsigned id)
+StupidBleuScorer::Score(vector<WordID>& hyp, vector<WordID>& ref)
{
- unsigned hyp_len = hyp.w.size(), ref_len = ref_ids.size();
+ unsigned hyp_len = hyp.size(), ref_len = ref.size();
if (hyp_len == 0 || ref_len == 0) return 0;
- NgramCounts counts = make_ngram_counts(hyp.w, ref_ids, N_);
+ NgramCounts counts = make_ngram_counts(hyp, ref, N_);
unsigned M = N_;
if (ref_len < N_) M = ref_len;
score_t sum = 0, add = 0;
for (unsigned i = 0; i < M; i++) {
if (i == 1) add = 1;
- //cout << ((score_t)counts.clipped[i] + add) << "/" << counts.sum[i] +add << "." << endl;
- //cout << "w_[i] " << w_[i] << endl;
- sum += w_[i] * log(((score_t)counts.clipped[i] + add) / ((counts.sum[i] + add)));
- //cout << "sum += "<< w_[i] * log(((score_t)counts.clipped[i] + add) / ((counts.sum[i] + add))) << endl;
+ sum += w_[i] * log(((score_t)counts.clipped[i] + add)/((counts.sum[i] + add)));
}
- /*cout << ref_ids << endl;
- cout << hyp.w << endl;
- cout << "ref_len " << ref_len << endl;
- cout << "hyp_len " << hyp_len << endl;
- cout << "bp " << brevity_penaly(hyp_len, ref_len) << endl;
- cout << "exp(sum) " << exp(sum) << endl;
- counts.Print();
- cout << brevity_penaly(hyp_len, ref_len) * exp(sum) << endl;
- cout << "---" << endl;*/
return brevity_penaly(hyp_len, ref_len) * exp(sum);
}
@@ -84,21 +72,22 @@ StupidBleuScorer::Score(ScoredHyp& hyp, vector<WordID>& ref_ids, unsigned id)
* NOTE: max is 0.9375
*/
score_t
-SmoothBleuScorer::Score(ScoredHyp& hyp, vector<WordID>& ref_ids, unsigned id)
+SmoothBleuScorer::Score(vector<WordID>& hyp, vector<WordID>& ref)
{
- unsigned hyp_len = hyp.w.size(), ref_len = ref_ids.size();
+ unsigned hyp_len = hyp.size(), ref_len = ref.size();
if (hyp_len == 0 || ref_len == 0) return 0;
- NgramCounts counts = make_ngram_counts(hyp.w, ref_ids, N_);
+ NgramCounts counts = make_ngram_counts(hyp, ref, N_);
score_t sum = 0;
unsigned j = 1;
for (unsigned i = 0; i < N_; i++) {
if (counts.clipped[i] == 0 || counts.sum[i] == 0) continue;
- sum += exp((w_[i] * log((score_t)counts.clipped[i]/counts.sum[i]))) / pow(2, N_-j+1);
+ sum += exp((w_[i] * log((score_t)counts.clipped[i]/counts.sum[i])))/pow(2, N_-j+1);
j++;
}
return brevity_penaly(hyp_len, ref_len) * sum;
}
+// FIXME
/*
* approx. bleu
*
diff --git a/dtrain/score.h b/dtrain/score.h
index f87d708c..9af56ef9 100644
--- a/dtrain/score.h
+++ b/dtrain/score.h
@@ -1,16 +1,8 @@
#ifndef _DTRAIN_SCORE_H_
#define _DTRAIN_SCORE_H_
-#include <iostream>
-#include <vector>
-#include <map>
-#include <cassert>
-#include <cmath>
-
#include "kbestget.h"
-#include "wordid.h" // cdec
-
using namespace std;
namespace dtrain
@@ -111,51 +103,28 @@ make_ngram_counts(const vector<WordID>& hyp, const vector<WordID>& ref, const un
return counts;
}
-struct LocalScorer
-{
- unsigned N_;
- vector<score_t> w_;
-
- virtual score_t
- Score(ScoredHyp& hyp, vector<WordID>& ref_ids, unsigned id)=0;
-
- void
- Init(unsigned N, vector<score_t> weights)
- {
- assert(N > 0);
- N_ = N;
- if (weights.empty()) for (unsigned i = 0; i < N_; i++) w_.push_back(1./N_);
- else w_ = weights;
- }
-
- score_t
- brevity_penaly(const unsigned hyp_len, const unsigned ref_len)
- {
- if (hyp_len > ref_len) return 1;
- return exp(1 - (score_t)ref_len/hyp_len);
- }
-};
-
struct BleuScorer : public LocalScorer
{
score_t Bleu(NgramCounts& counts, const unsigned hyp_len, const unsigned ref_len);
- score_t Score(ScoredHyp& hyp, vector<WordID>& ref_ids, unsigned id);
+ score_t Score(vector<WordID>& hyp, vector<WordID>& ref_ids);
};
struct StupidBleuScorer : public LocalScorer
{
- score_t Score(ScoredHyp& hyp, vector<WordID>& ref_ids, unsigned id);
+ score_t Score(vector<WordID>& hyp, vector<WordID>& ref);
};
struct SmoothBleuScorer : public LocalScorer
{
- score_t Score(ScoredHyp& hyp, vector<WordID>& ref_ids, unsigned id);
+ score_t Score(vector<WordID>& hyp, vector<WordID>& ref);
};
// FIXME
/*struct ApproxBleuScorer : public LocalScorer
{
- NgramCounts glob_onebest_counts;
+ bool prepped;
+
+ NgramCounts* glob_onebest_counts;
unsigned glob_hyp_len, glob_ref_len;
void Prep(NgramCounts& counts, const unsigned hyp_len, const unsigned ref_len);
@@ -171,7 +140,6 @@ struct SmoothBleuScorer : public LocalScorer
};*/
-
} // namespace
#endif
diff --git a/dtrain/test/example/dtrain.ini b/dtrain/test/example/dtrain.ini
index df746e51..fd3a3841 100644
--- a/dtrain/test/example/dtrain.ini
+++ b/dtrain/test/example/dtrain.ini
@@ -1,11 +1,11 @@
decoder_config=test/example/cdec.ini
k=100
N=4
-epochs=10
+epochs=100
input=test/example/nc-1k.gz
scorer=stupid_bleu
output=test/example/weights.gz
-stop_after=10
-sample_from=kbest
-pair_sampling=all
+stop_after=0
+sample_from=forest
+pair_sampling=rand
print_weights=Glue WordPenalty LanguageModel LanguageModel_OOV PhraseModel_0 PhraseModel_1 PhraseModel_2 PhraseModel_3 PhraseModel_4 PassThrough
diff --git a/dtrain/test/example/weights.gz b/dtrain/test/example/weights.gz
index e7baa367..7960a05a 100644
--- a/dtrain/test/example/weights.gz
+++ b/dtrain/test/example/weights.gz
Binary files differ