From b732e625ffcf59da8440db577183110488f5c4b7 Mon Sep 17 00:00:00 2001 From: Patrick Simianer Date: Fri, 29 Jul 2011 00:48:04 +0200 Subject: first cut for sofia-ml, little change in utils/dict.h, coarse refactoring --- dtrain/dcommon.h | 230 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 230 insertions(+) create mode 100644 dtrain/dcommon.h (limited to 'dtrain/dcommon.h') diff --git a/dtrain/dcommon.h b/dtrain/dcommon.h new file mode 100644 index 00000000..ff796642 --- /dev/null +++ b/dtrain/dcommon.h @@ -0,0 +1,230 @@ +#include +#include +#include +#include +#include + +#include "config.h" + +#include +#include +#include +#include + +#include "sentence_metadata.h" +#include "scorer.h" +#include "verbose.h" +#include "viterbi.h" +#include "hg.h" +#include "prob.h" +#include "kbest.h" +#include "ff_register.h" +#include "decoder.h" +#include "filelib.h" +#include "fdict.h" +#include "weights.h" +#include "sparse_vector.h" +#include "sampler.h" + +using namespace std; +namespace po = boost::program_options; + + +struct ScorePair +{ + ScorePair(double modelscore, double score) : modelscore_(modelscore), score_(score) {} + double modelscore_, score_; + double GetModelScore() { return modelscore_; } + double GetScore() { return score_; } +}; +typedef vector Scores; + + +/* + * KBestGetter + * + */ +struct KBestList { + vector > feats; + vector > sents; + vector scores; +}; +struct KBestGetter : public DecoderObserver +{ + KBestGetter( const size_t k ) : k_(k) {} + const size_t k_; + KBestList kb; + + virtual void + NotifyTranslationForest(const SentenceMetadata& smeta, Hypergraph* hg) + { + GetKBest(smeta.GetSentenceID(), *hg); + } + + KBestList* GetKBest() { return &kb; } + + void + GetKBest(int sent_id, const Hypergraph& forest) + { + kb.scores.clear(); + kb.sents.clear(); + kb.feats.clear(); + KBest::KBestDerivations, ESentenceTraversal> kbest( forest, k_ ); + for ( size_t i = 0; i < k_; ++i ) { + const KBest::KBestDerivations, ESentenceTraversal>::Derivation* d = + kbest.LazyKthBest( forest.nodes_.size() - 1, i ); + if (!d) break; + kb.sents.push_back( d->yield); + kb.feats.push_back( d->feature_values ); + kb.scores.push_back( d->score ); + } + } +}; + + +/* + * NgramCounts + * + */ +struct NgramCounts +{ + NgramCounts( const size_t N ) : N_( N ) { + reset(); + } + size_t N_; + map clipped; + map sum; + + void + operator+=( const NgramCounts& rhs ) + { + assert( N_ == rhs.N_ ); + for ( size_t i = 0; i < N_; i++ ) { + this->clipped[i] += rhs.clipped.find(i)->second; + this->sum[i] += rhs.sum.find(i)->second; + } + } + + void + add( size_t count, size_t ref_count, size_t i ) + { + assert( i < N_ ); + if ( count > ref_count ) { + clipped[i] += ref_count; + sum[i] += count; + } else { + clipped[i] += count; + sum[i] += count; + } + } + + void + reset() + { + size_t i; + for ( i = 0; i < N_; i++ ) { + clipped[i] = 0; + sum[i] = 0; + } + } + + void + print() + { + for ( size_t i = 0; i < N_; i++ ) { + cout << i+1 << "grams (clipped):\t" << clipped[i] << endl; + cout << i+1 << "grams:\t\t\t" << sum[i] << endl; + } + } +}; + + +/*class Learnerx +{ + public: + virtual void Init(const vector >& kbest, const Scores& scores) {}; + virtual void Update(SparseVector& lambdas); +};*/ + +class SofiaLearner //: public Learnerx FIXME +{ + // TODO bool invert_score + public: + void + Init( const size_t sid, const vector >& kbest, /*const*/ Scores& scores ) + { + assert( kbest.size() == scores.size() ); + ofstream o; + unlink( "/tmo/sofia_ml_training" ); + o.open( "/tmp/sofia_ml_training", ios::trunc ); // TODO randomize, filename exists + int fid = 0; + map::iterator ff; + for ( size_t k = 0; k < kbest.size(); ++k ) { + SparseVector::const_iterator it = kbest[k].begin(); + o << scores[k].GetScore(); + for ( ; it != kbest[k].end(); ++it) { + ff = fmap.find( it->first ); + if ( ff == fmap.end() ) { + fmap.insert( pair(it->first, fid) ); + fmap1.insert( pair(fid, it->first) ); + fid++; + } + o << " "<< fmap[it->first] << ":" << it->second; + } + o << endl; + } + o.close(); + } + + void + Update(SparseVector& lambdas) + { + string call = "./sofia-ml --training_file /tmp/sofia_ml_training --model_out /tmp/sofia_ml_model --loop_type stochastic --lambda 100 --dimensionality "; + std::stringstream out; + out << fmap.size(); + call += out.str(); + call += " &>/dev/null"; + system ( call.c_str() ); + ifstream i; + unlink( "/tmo/sofia_ml_model" ); + i.open( "/tmp/sofia_ml_model", ios::in ); + string model; + getline( i, model ); + //cout << model << endl; + vector strs; + boost::split( strs, model, boost::is_any_of(" ") ); + int j = 0; + for ( vector::iterator it = strs.begin(); it != strs.end(); ++it ) { + lambdas.set_value(fmap1[j], atof( it->c_str() ) ); + j++; + } + + } + + private: + map fmap; + map fmap1; +}; + +typedef map, size_t> Ngrams; +Ngrams make_ngrams( vector& s, size_t N ); +NgramCounts make_ngram_counts( vector hyp, vector ref, size_t N ); +double brevity_penaly( const size_t hyp_len, const size_t ref_len ); +double bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len, size_t N, vector weights = vector() ); +double stupid_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len, size_t N, vector weights = vector() ); +double smooth_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len, const size_t N, vector weights = vector() ); +double approx_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len, const size_t N, vector weights = vector() ); +void register_and_convert(const vector& strs, vector& ids); + + + + +void print_FD(); +void run_tests(); +void test_SetWeights(); +#include +#include +void test_metrics(); +double approx_equal( double x, double y ); +void test_ngrams(); + -- cgit v1.2.3