From b7568a8dad2720d5ea0418171e9b85229adbbcc5 Mon Sep 17 00:00:00 2001
From: Patrick Simianer <p@simianer.de>
Date: Sun, 31 Jul 2011 19:24:02 +0200
Subject: bugfixing, begin refactoring

---
 dtrain/dcommon.cc | 79 ++++++++++++++++++++++++++++++++++---------------------
 dtrain/dcommon.h  | 71 ++-----------------------------------------------
 dtrain/dtest.cc   | 33 +++++++++++++++--------
 dtrain/dtrain.cc  | 48 ++++++++++++++++-----------------
 dtrain/learner.h  | 71 +++++++++++++++++++++++++++++++++++++++++++++++++
 dtrain/test.sh    |  2 +-
 6 files changed, 168 insertions(+), 136 deletions(-)
 create mode 100644 dtrain/learner.h

(limited to 'dtrain')
diff --git a/dtrain/dcommon.cc b/dtrain/dcommon.cc
index a6bdc92c..6657bed6 100644
--- a/dtrain/dcommon.cc
+++ b/dtrain/dcommon.cc
@@ -2,7 +2,11 @@
 
 
 
-/*
+
+/******************************************************************************
+ * NGRAMS
+ *
+ *
  * make_ngrams
  *
  */
@@ -23,9 +27,6 @@ make_ngrams( vector<WordID>& s, size_t N )
 }
 
 
-
-
-
 /*
  * ngram_matches
  *
@@ -50,7 +51,12 @@ make_ngram_counts( vector<WordID> hyp, vector<WordID> ref, size_t N )
 }
 
 
-/*
+
+
+/******************************************************************************
+ * SCORES
+ *
+ *
  * brevity_penaly
  *
  */
@@ -156,7 +162,12 @@ approx_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len,
 }
 
 
-/*
+
+
+/******************************************************************************
+ * UTILS
+ *
+ *
  * register_and_convert
  *
  */
@@ -170,11 +181,39 @@ register_and_convert(const vector<string>& strs, vector<WordID>& ids)
 }
 
 
+/*
+ * approx_equal: nonzero iff x and y differ by at most 1E-5 relative to the
+ * larger magnitude; if x or y is exactly 0 the other is tested against 1E-5.
+ */
+double
+approx_equal( double x, double y )
+{
+  const double EPSILON = 1E-5;
+  if ( x == 0 ) return fabs( y ) <= EPSILON; // relative error is 0/0 or 1 at zero, so test absolutely
+  if ( y == 0 ) return fabs( x ) <= EPSILON;
+  return fabs( x - y ) / max( fabs(x), fabs(y) ) <= EPSILON;
+}
 
 
 /*
+ * print_FD
+ * Prints FD::Convert(i) for every i in [0, FD::NumFeats()) to stdout, one per line.
+ */
+void
+print_FD()
+{
+  for ( size_t i = 0; i < FD::NumFeats(); i++ ) cout << FD::Convert(i)<< endl;
+}
+
+
+
+
+/******************************************************************************
+ * TESTS
  *
  *
+ * test_ngrams
+ *
  */
 void
 test_ngrams()
@@ -207,21 +246,7 @@ test_ngrams()
 
 
 /*
- *
- *
- */
-double
-approx_equal( double x, double y )
-{
-  const double EPSILON = 1E-5;
-  if ( x == 0 ) return fabs( y ) <= EPSILON;
-  if ( y == 0 ) return fabs( x ) <= EPSILON;
-  return fabs( x - y ) / max( fabs(x), fabs(y) ) <= EPSILON;
-}
-
-
-/*
- *
+ * test_metrics
  *
  */
 void
@@ -263,8 +288,9 @@ test_metrics()
   cout << endl;
 }
 
+
 /*
- *
+ * test_SetWeights
  *
  */
 void
@@ -287,7 +313,7 @@ test_SetWeights()
 
 
 /*
- *
+ * run_tests
  *
  */
 void
@@ -302,10 +328,3 @@ run_tests()
   exit(0);
 }
 
-
-void
-print_FD()
-{
-  for ( size_t i = 0; i < FD::NumFeats(); i++ ) cout << FD::Convert(i)<< endl;
-}
-
diff --git a/dtrain/dcommon.h b/dtrain/dcommon.h
index ff796642..6df841bb 100644
--- a/dtrain/dcommon.h
+++ b/dtrain/dcommon.h
@@ -30,6 +30,8 @@ using namespace std;
 namespace po = boost::program_options;
 
 
+
+
 struct ScorePair
 {
   ScorePair(double modelscore, double score) : modelscore_(modelscore), score_(score) {} 
@@ -139,72 +141,7 @@ struct NgramCounts
 };
 
 
-/*class Learnerx
-{
-  public:
-    virtual void Init(const vector<SparseVector<double> >& kbest, const Scores& scores) {};
-    virtual void Update(SparseVector<double>& lambdas);
-};*/
-
-class SofiaLearner //: public Learnerx FIXME
-{
-  // TODO bool invert_score
-  public:
-  void
-  Init( const size_t sid, const vector<SparseVector<double> >& kbest, /*const*/ Scores& scores )
-  {
-    assert( kbest.size() == scores.size() );
-    ofstream o;
-    unlink( "/tmo/sofia_ml_training" );
-    o.open( "/tmp/sofia_ml_training", ios::trunc ); // TODO randomize, filename exists
-    int fid = 0;
-    map<int,int>::iterator ff;
-    for ( size_t k = 0; k < kbest.size(); ++k ) {
-      SparseVector<double>::const_iterator it = kbest[k].begin();
-      o << scores[k].GetScore();
-      for ( ; it != kbest[k].end(); ++it) {
-        ff = fmap.find( it->first );
-        if ( ff == fmap.end() ) {
-          fmap.insert( pair<int,int>(it->first, fid) );
-          fmap1.insert( pair<int,int>(fid, it->first) );
-          fid++;
-        }
-        o << " "<< fmap[it->first] << ":" << it->second;
-      }
-      o << endl;
-    }
-    o.close();
-  }
-
-  void
-  Update(SparseVector<double>& lambdas)
-  {
-    string call = "./sofia-ml --training_file /tmp/sofia_ml_training --model_out /tmp/sofia_ml_model --loop_type stochastic --lambda 100 --dimensionality ";
-    std::stringstream out;
-    out << fmap.size();
-    call += out.str();
-    call += " &>/dev/null";
-    system ( call.c_str() );
-    ifstream i;
-    unlink( "/tmo/sofia_ml_model" );
-    i.open( "/tmp/sofia_ml_model", ios::in );
-    string model;
-    getline( i, model );
-    //cout << model << endl;
-    vector<string> strs;
-    boost::split( strs, model, boost::is_any_of(" ") );
-    int j = 0;
-    for ( vector<string>::iterator it = strs.begin(); it != strs.end(); ++it ) {
-      lambdas.set_value(fmap1[j], atof( it->c_str() ) );
-      j++;
-    }
-
-  }
 
-  private:
-    map<int,int> fmap;
-    map<int,int> fmap1;
-};
 
 typedef map<vector<WordID>, size_t> Ngrams;
 Ngrams make_ngrams( vector<WordID>& s, size_t N );
@@ -215,10 +152,6 @@ double stupid_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_
 double smooth_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len, const size_t N, vector<float> weights = vector<float>() );
 double approx_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len, const size_t N, vector<float> weights = vector<float>() );
 void register_and_convert(const vector<string>& strs, vector<WordID>& ids);
-
-
-
-
 void print_FD();
 void run_tests();
 void test_SetWeights();
diff --git a/dtrain/dtest.cc b/dtrain/dtest.cc
index 9975794f..5ae473e6 100644
--- a/dtrain/dtest.cc
+++ b/dtrain/dtest.cc
@@ -11,11 +11,13 @@ bool
 init(int argc, char** argv, po::variables_map* conf)
 {
   int N;
+  bool q;
   po::options_description opts( "Options" );
   opts.add_options()
-    ( "decoder-config,c", po::value<string>(),                  "configuration file for cdec" )
-    ( "weights,w",        po::value<string>(),                  "weights file")
-    ( "ngrams,n",         po::value<int>(&N)->default_value(4), "N for Ngrams (default 5)" );
+    ( "decoder-config,c", po::value<string>(),                      "configuration file for cdec" )
+    ( "weights,w",        po::value<string>(),                      "weights file")
+    ( "ngrams,n",         po::value<int>(&N)->default_value(4),     "N for Ngrams (default 5)" )
+    ( "quiet,q",          po::value<bool>(&q)->default_value(true), "do not output translations" );
   po::options_description cmdline_options;
   cmdline_options.add(opts);
   po::store( parse_command_line(argc, argv, cmdline_options), *conf );
@@ -44,6 +46,7 @@ main(int argc, char** argv)
   Decoder decoder(ini_rf.stream());
   KBestGetter observer(k);
   size_t N = conf["ngrams"].as<int>();
+  bool quiet = conf["quiet"].as<bool>();
 
   Weights weights;
   weights.InitFromFile(conf["weights"].as<string>());
@@ -56,13 +59,15 @@ main(int argc, char** argv)
   string in, psg;
   size_t sid = 0;
   double overall = 0.0;
-  cerr << "(1 dot equals 100 lines of input)" << endl;
+  double overall1 = 0.0;
+  double overall2 = 0.0;
+  cerr << "(a dot equals 100 lines of input)" << endl;
   while( getline(cin, in) ) {
     if ( (sid+1) % 100 == 0 ) {
         cerr << ".";
         if ( (sid+1)%1000 == 0 ) cerr << endl;
     }
-    if ( sid > 5000 ) break;
+    //if ( sid > 5000 ) break;
     strs.clear();
     boost::split( strs, in, boost::is_any_of("\t") );
     // grammar
@@ -75,19 +80,25 @@ main(int argc, char** argv)
     boost::split( ref_strs, strs[1], boost::is_any_of(" ") );
     register_and_convert( ref_strs, ref_ids );
     // scoring kbest
-    double score = 0;
-    Scores scores;
+    double score = 0.0;
+    double score1 = 0.0;
+    double score2 = 0.0;
     NgramCounts counts = make_ngram_counts( ref_ids, kb->sents[0], 4 );
     score = smooth_bleu( counts,
                          ref_ids.size(),
                          kb->sents[0].size(), N );
-    ScorePair sp( kb->scores[0], score );
-    scores.push_back( sp );
-    //cout << TD::GetString( kb->sents[0] ) << endl;
+    score1 = stupid_bleu( counts, ref_ids.size(), kb->sents[0].size(), N) ;
+    score2 = bleu( counts, ref_ids.size(), kb->sents[0].size(), N );
+    //if ( ! quiet )
+    cout << TD::GetString( kb->sents[0] ) << endl;
     overall += score;
+    overall1 += score1;
+    overall2 += score2;
     sid += 1;
   }
-  cout << "Average score: " << overall/(sid+1) << endl;
+  cerr << "Average score (smooth): " << overall/(double)(sid+1) << endl;
+  cerr << "Average score (stupid): " << overall1/(double)(sid+1) << endl;
+  cerr << "Average score (normal): " << overall2/(double)(sid+1) << endl;
   cerr << endl;
 
   return 0;
diff --git a/dtrain/dtrain.cc b/dtrain/dtrain.cc
index 95fc81af..373458e8 100644
--- a/dtrain/dtrain.cc
+++ b/dtrain/dtrain.cc
@@ -1,4 +1,5 @@
 #include "dcommon.h"
+#include "learner.h"
 
 
 
@@ -45,41 +46,35 @@ main(int argc, char** argv)
   ReadFile ini_rf(conf["decoder-config"].as<string>());
   Decoder decoder(ini_rf.stream());
   KBestGetter observer(k);
-  size_t N = 4; // TODO as parameter/in config 
+  size_t N = 3; // TODO as parameter/in config 
 
   // TODO scoring metric as parameter/in config 
   // for approx. bleu
-  //NgramCounts global_counts;
-  //size_t global_hyp_len;
-  //size_t global_ref_len;
+  NgramCounts global_counts(N);
+  size_t global_hyp_len = 0;
+  size_t global_ref_len = 0;
 
   Weights weights;
   SparseVector<double> lambdas;
   weights.InitSparseVector(&lambdas);
   vector<double> dense_weights;
 
-  lambdas.set_value(FD::Convert("logp"), 0);
-
- 
   vector<string> strs, ref_strs;
   vector<WordID> ref_ids;
   string in, psg;
   size_t sid = 0;
   cerr << "(1 dot equals 100 lines of input)" << endl;
   while( getline(cin, in) ) {
-    //if ( !SILENT )
-    //    cerr << endl << endl << "Getting kbest for sentence #" << sid << endl;
     if ( (sid+1) % 100 == 0 ) {
         cerr << ".";
         if ( (sid+1)%1000 == 0 ) cerr << endl;
     }
-    if ( sid > 5000 ) break;
+    //if ( sid > 5000 ) break;
     // weights
     dense_weights.clear();
     weights.InitFromVector( lambdas );
     weights.InitVector( &dense_weights );
     decoder.SetWeights( dense_weights );
-    //if ( sid > 100 ) break;
     // handling input..
     strs.clear();
     boost::split( strs, in, boost::is_any_of("\t") );
@@ -94,33 +89,36 @@ main(int argc, char** argv)
     register_and_convert( ref_strs, ref_ids );
     // scoring kbest
     double score = 0;
+    size_t cand_len = 0;
     Scores scores;
-    for ( size_t i = 0; i < k; i++ ) {
-      NgramCounts counts = make_ngram_counts( ref_ids, kb->sents[i], 4 );
-      score = smooth_bleu( counts,
-                           ref_ids.size(),
-                           kb->sents[i].size(), N );
+    for ( size_t i = 0; i < kb->sents.size(); i++ ) {
+      NgramCounts counts = make_ngram_counts( ref_ids, kb->sents[i], N );
+      if ( i == 0) {
+        global_counts += counts;
+        global_hyp_len += kb->sents[i].size();
+        global_ref_len += ref_ids.size();
+        cand_len = 0;
+      } else {
+        cand_len = kb->sents[i].size();
+      }
+      //score = bleu( global_counts,
+      //                     global_ref_len,
+       //                    global_hyp_len + cand_len, N );
+      score = bleu ( counts, ref_ids.size(), kb->sents[i].size(), N );
       ScorePair sp( kb->scores[i], score );
       scores.push_back( sp );
       //cout << "'" << TD::GetString( ref_ids ) << "' vs '" << TD::GetString( kb->sents[i] ) << "' SCORE=" << score << endl;
       //cout << kb->feats[i] << endl;
     }
-    //cout << "###" << endl;
+    // learner
     SofiaLearner learner;
     learner.Init( sid, kb->feats, scores );
     learner.Update(lambdas);
-    // initializing learner
-    // TODO
-    // updating weights
-    //lambdas.set_value( FD::Convert("use_shell"), 1 );
-    //lambdas.set_value( FD::Convert("use_a"), 1 );
     //print_FD();
     sid += 1; // TODO does cdec count this already?
   }
-
-  weights.WriteToFile( "weights-final", true );
-  
   cerr << endl;
+  weights.WriteToFile( "data/weights-final-normalx", true );
 
   return 0;
 }
diff --git a/dtrain/learner.h b/dtrain/learner.h
new file mode 100644
index 00000000..a953284d
--- /dev/null
+++ b/dtrain/learner.h
@@ -0,0 +1,71 @@
+/*class Learnerx
+{
+  public:
+    virtual void Init(const vector<SparseVector<double> >& kbest, const Scores& scores) {};
+    virtual void Update(SparseVector<double>& lambdas);
+};*/
+
+class SofiaLearner //: public Learnerx FIXME
+{
+  // Thin wrapper around the external ./sofia-ml binary: Init() dumps a k-best
+  // list as a training file, Update() runs sofia-ml and reads the model back.
+  // TODO bool invert_score
+  public:
+  // Writes one training line per k-best entry: its score, then " id:value" pairs
+  // in ascending order of dense local id (see fmap/fmap1). sid is unused here.
+  void
+  Init( const size_t sid, const vector<SparseVector<double> >& kbest, /*const*/ Scores& scores )
+  {
+    assert( kbest.size() == scores.size() );
+    ofstream o( "/tmp/sofia_ml_training_normalx", ios::trunc ); // TODO randomize, filename exists
+    for ( size_t k = 0; k < kbest.size(); ++k ) {
+      map<int,double> m; // keyed by local id, so pairs come out in ascending order
+      o << scores[k].GetScore();
+      for ( SparseVector<double>::const_iterator it = kbest[k].begin(); it != kbest[k].end(); ++it ) {
+        map<int,int>::iterator ff = fmap.find( it->first );
+        if ( ff == fmap.end() ) {
+          // next free id taken from fmap, so a repeated Init() cannot reuse one
+          const int fid = static_cast<int>( fmap.size() );
+          ff = fmap.insert( pair<int,int>(it->first, fid) ).first;
+          fmap1.insert( pair<int,int>(fid, it->first) );
+        }
+        m.insert( pair<int,double>(ff->second, it->second) );
+      }
+      for ( map<int,double>::iterator ti = m.begin(); ti != m.end(); ++ti ) {
+        o << " " << ti->first << ":" << ti->second;
+      }
+      o << endl;
+    }
+    o.close();
+  }
+
+  // Runs sofia-ml on the training file and copies token j of the model's first
+  // line into lambdas as the weight of feature fmap1[j].
+  void
+  Update(SparseVector<double>& lambdas)
+  {
+    string call = "./sofia-ml --training_file /tmp/sofia_ml_training_normalx --model_out /tmp/sofia_ml_model_normalx --loop_type stochastic --lambda 100 --dimensionality ";
+    std::stringstream out;
+    out << fmap.size();
+    call += out.str();
+    // system() goes through /bin/sh, where bash's "&>" would background the job
+    call += " >/dev/null 2>&1";
+    system ( call.c_str() );
+    ifstream i( "/tmp/sofia_ml_model_normalx", ios::in );
+    string model;
+    if ( !getline( i, model ) ) return; // no readable model: leave lambdas as is
+    vector<string> strs;
+    boost::split( strs, model, boost::is_any_of(" ") );
+    int j = 0;
+    for ( vector<string>::iterator it = strs.begin(); it != strs.end(); ++it, ++j ) {
+      // tokens without a known feature are skipped, not default-mapped to id 0
+      map<int,int>::const_iterator f = fmap1.find( j );
+      if ( f != fmap1.end() ) lambdas.set_value( f->second, atof( it->c_str() ) );
+    }
+  }
+
+  private:
+    map<int,int> fmap;  // feature id as found in kbest -> dense local id
+    map<int,int> fmap1; // dense local id -> feature id as found in kbest
+};
+
diff --git a/dtrain/test.sh b/dtrain/test.sh
index ad45bd1e..bc318ae7 100755
--- a/dtrain/test.sh
+++ b/dtrain/test.sh
@@ -1,4 +1,4 @@
 #!/bin/sh
 
-./dtrain -c data/cdec.ini -k 4 < data/in.blunsom08 #< data/in.toy
+./dtrain -c data/cdec.ini -k 200 < data/in.blunsom08 #< data/in.toy
 
-- 
cgit v1.2.3