From b732e625ffcf59da8440db577183110488f5c4b7 Mon Sep 17 00:00:00 2001
From: Patrick Simianer
Date: Fri, 29 Jul 2011 00:48:04 +0200
Subject: first cut for sofia-ml, little change in utils/dict.h, coarse
refactoring
---
dtrain/dcommon.h | 230 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 230 insertions(+)
create mode 100644 dtrain/dcommon.h
(limited to 'dtrain/dcommon.h')
diff --git a/dtrain/dcommon.h b/dtrain/dcommon.h
new file mode 100644
index 00000000..ff796642
--- /dev/null
+++ b/dtrain/dcommon.h
@@ -0,0 +1,230 @@
+#include
+#include
+#include
+#include
+#include
+
+#include "config.h"
+
+#include
+#include
+#include
+#include
+
+#include "sentence_metadata.h"
+#include "scorer.h"
+#include "verbose.h"
+#include "viterbi.h"
+#include "hg.h"
+#include "prob.h"
+#include "kbest.h"
+#include "ff_register.h"
+#include "decoder.h"
+#include "filelib.h"
+#include "fdict.h"
+#include "weights.h"
+#include "sparse_vector.h"
+#include "sampler.h"
+
+using namespace std;
+namespace po = boost::program_options;
+
+
+/*
+ * ScorePair
+ *
+ * Holds the two scores recorded for one hypothesis: the value passed as
+ * `modelscore` and the value passed as `score`.
+ */
+struct ScorePair
+{
+  ScorePair(double modelscore, double score) : modelscore_(modelscore), score_(score) {}
+  double modelscore_, score_;
+  // Accessors are const-qualified so they can also be called through a
+  // const ScorePair (for example an element of a const container).
+  double GetModelScore() const { return modelscore_; }
+  double GetScore() const { return score_; }
+};
+typedef vector Scores; // indexed in parallel with the k-best list; SofiaLearner::Init calls GetScore() on each element
+
+
+/*
+ * KBestGetter
+ *
+ */
+struct KBestList { // parallel arrays: KBestGetter::GetKBest pushes one entry per derivation onto each
+ vector > feats; // receives d->feature_values
+ vector > sents; // receives d->yield
+ vector scores; // receives d->score
+};
+struct KBestGetter : public DecoderObserver // observer that rebuilds `kb` from each translation forest it is notified of
+{
+ KBestGetter( const size_t k ) : k_(k) {}
+ const size_t k_; // upper bound on the number of derivations extracted per forest
+ KBestList kb; // overwritten by every GetKBest(sent_id, forest) call
+
+ virtual void
+ NotifyTranslationForest(const SentenceMetadata& smeta, Hypergraph* hg) // hg is dereferenced without a null check
+ {
+ GetKBest(smeta.GetSentenceID(), *hg);
+ }
+
+ KBestList* GetKBest() { return &kb; } // pointer to the member list; its contents change on the next notification
+
+ void
+ GetKBest(int sent_id, const Hypergraph& forest) // sent_id is not used in the body
+ {
+ kb.scores.clear(); // discard the results of the previous call
+ kb.sents.clear();
+ kb.feats.clear();
+ KBest::KBestDerivations, ESentenceTraversal> kbest( forest, k_ );
+ for ( size_t i = 0; i < k_; ++i ) {
+ const KBest::KBestDerivations, ESentenceTraversal>::Derivation* d =
+ kbest.LazyKthBest( forest.nodes_.size() - 1, i ); // queries the last node in forest.nodes_ (presumably the goal node -- TODO confirm)
+ if (!d) break; // null result ends extraction early, so kb may hold fewer than k_ entries
+ kb.sents.push_back( d->yield);
+ kb.feats.push_back( d->feature_values );
+ kb.scores.push_back( d->score );
+ }
+ }
+};
+
+
+/*
+ * NgramCounts
+ *
+ * Per-order n-gram tallies. Order n (1..N) lives under key n-1 in both maps:
+ * `sum` accumulates raw counts, `clipped` accumulates counts capped at the
+ * reference count passed to add().
+ */
+struct NgramCounts
+{
+  NgramCounts( const size_t N ) : N_( N ) {
+    reset();
+  }
+  size_t N_;
+  map clipped;
+  map sum;
+
+  // Adds rhs to this object key by key; both sides must share the same N_.
+  void
+  operator+=( const NgramCounts& rhs )
+  {
+    assert( N_ == rhs.N_ );
+    size_t order = 0;
+    while ( order < N_ ) {
+      clipped[order] += rhs.clipped.find( order )->second;
+      sum[order] += rhs.sum.find( order )->second;
+      ++order;
+    }
+  }
+
+  // Records `count` occurrences for key i; the clipped tally grows by the
+  // smaller of count and ref_count.
+  void
+  add( size_t count, size_t ref_count, size_t i )
+  {
+    assert( i < N_ );
+    const size_t capped = ( count > ref_count ) ? ref_count : count;
+    clipped[i] += capped;
+    sum[i] += count;
+  }
+
+  // Sets both tallies to zero for every key 0..N_-1, creating missing keys.
+  void
+  reset()
+  {
+    for ( size_t order = 0; order < N_; ++order ) {
+      clipped[order] = 0;
+      sum[order] = 0;
+    }
+  }
+
+  // Writes the clipped and the raw tally of every order to stdout.
+  void
+  print()
+  {
+    for ( size_t order = 0; order < N_; ++order ) {
+      const size_t n = order + 1;
+      cout << n << "grams (clipped):\t" << clipped[order] << endl;
+      cout << n << "grams:\t\t\t" << sum[order] << endl;
+    }
+  }
+};
+
+
+/*class Learnerx
+{
+ public:
+ virtual void Init(const vector >& kbest, const Scores& scores) {};
+ virtual void Update(SparseVector& lambdas);
+};*/
+
+/*
+ * SofiaLearner
+ *
+ * Bridges a k-best list to the external `sofia-ml` binary through two fixed
+ * files: Init() writes /tmp/sofia_ml_training, Update() runs ./sofia-ml on it
+ * and reads the resulting weights back from /tmp/sofia_ml_model.
+ * Feature keys are renumbered to dense ids; fmap maps key -> id and fmap1
+ * maps id -> key.
+ */
+class SofiaLearner //: public Learnerx FIXME
+{
+  // TODO bool invert_score
+  public:
+    // Writes one training line per hypothesis: the hypothesis' score followed
+    // by " <id>:<value>" for each of its features.
+    // kbest and scores must have the same length; sid is currently unused.
+    void
+    Init( const size_t sid, const vector >& kbest, /*const*/ Scores& scores )
+    {
+      assert( kbest.size() == scores.size() );
+      ofstream o;
+      unlink( "/tmp/sofia_ml_training" );
+      o.open( "/tmp/sofia_ml_training", ios::trunc ); // TODO randomize, filename exists
+      if ( !o ) {
+        cerr << "SofiaLearner: cannot write /tmp/sofia_ml_training" << endl;
+        return;
+      }
+      // fmap/fmap1 persist across calls, so continue numbering after the ids
+      // already handed out instead of restarting at 0 and colliding with them.
+      int fid = static_cast<int>( fmap.size() );
+      map::iterator ff;
+      for ( size_t k = 0; k < kbest.size(); ++k ) {
+        SparseVector::const_iterator it = kbest[k].begin();
+        o << scores[k].GetScore();
+        for ( ; it != kbest[k].end(); ++it) {
+          ff = fmap.find( it->first );
+          if ( ff == fmap.end() ) {
+            // first time this feature is seen: assign the next dense id
+            fmap.insert( pair(it->first, fid) );
+            fmap1.insert( pair(fid, it->first) );
+            fid++;
+          }
+          o << " "<< fmap[it->first] << ":" << it->second;
+        }
+        o << '\n';
+      }
+      o.close();
+    }
+
+    // Trains with ./sofia-ml on the file written by Init() and copies the
+    // learned weights into lambdas: token j of the model's first line becomes
+    // the value of the feature that was given id j.
+    // lambdas is left untouched when no model could be read.
+    void
+    Update(SparseVector& lambdas)
+    {
+      // Remove a model left over from an earlier run *before* training, so a
+      // failed run cannot be mistaken for a fresh result.
+      unlink( "/tmp/sofia_ml_model" );
+      string call = "./sofia-ml --training_file /tmp/sofia_ml_training --model_out /tmp/sofia_ml_model --loop_type stochastic --lambda 100 --dimensionality ";
+      std::stringstream out;
+      out << fmap.size();
+      call += out.str();
+      // system() hands the command to /bin/sh. "&>" is a bash extension that
+      // a POSIX shell parses as "run in background, then redirect", which
+      // would race with the read below; use the portable spelling.
+      call += " >/dev/null 2>&1";
+      const int status = system ( call.c_str() );
+      ifstream i;
+      i.open( "/tmp/sofia_ml_model", ios::in );
+      string model;
+      if ( !i.is_open() || !getline( i, model ) ) {
+        cerr << "SofiaLearner: no model in /tmp/sofia_ml_model (system() returned "
+             << status << "), weights left unchanged" << endl;
+        return;
+      }
+      //cout << model << endl;
+      vector strs;
+      boost::split( strs, model, boost::is_any_of(" ") );
+      int j = 0;
+      for ( vector::iterator it = strs.begin(); it != strs.end(); ++it, ++j ) {
+        // Only ids handed out by Init() map back to a feature; operator[] on
+        // an unknown id would insert a bogus entry into fmap1.
+        if ( fmap1.count( j ) == 0 ) continue;
+        lambdas.set_value(fmap1[j], atof( it->c_str() ) );
+      }
+    }
+
+  private:
+    map fmap;
+    map fmap1;
+};
+
+typedef map, size_t> Ngrams; // table with size_t values (presumably n-gram -> occurrence count -- TODO confirm)
+Ngrams make_ngrams( vector& s, size_t N );
+NgramCounts make_ngram_counts( vector hyp, vector ref, size_t N ); // hyp and ref are taken by value
+double brevity_penaly( const size_t hyp_len, const size_t ref_len ); // NOTE(review): spelled "penaly" (sic); renaming would break callers
+double bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len, size_t N, vector weights = vector() ); // weights defaults to an empty vector, as in the three variants below
+double stupid_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len, size_t N, vector weights = vector() );
+double smooth_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len, const size_t N, vector weights = vector() );
+double approx_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len, const size_t N, vector weights = vector() );
+void register_and_convert(const vector& strs, vector& ids); // ids is a non-const reference (presumably the output -- TODO confirm)
+
+
+
+
+void print_FD();
+void run_tests(); // this and the test_* functions below are only declared here; no definitions in this header
+void test_SetWeights();
+#include
+#include
+void test_metrics();
+double approx_equal( double x, double y ); // NOTE(review): returns double rather than bool -- confirm what the value means
+void test_ngrams();
+
--
cgit v1.2.3