added svm, ksampler

author: Patrick Simianer <p@simianer.de> 2011-09-09 00:41:17 +0200
committer: Patrick Simianer <p@simianer.de> 2011-09-23 19:13:58 +0200
commit: 14637f89c899179f54a5bc327857db8ea1e1d427 (patch)
tree: 66cdbfd8039107075509faa4a4d1dc5dc33698cc /dtrain
parent: 0269777fc54bc554c12107bdd5498f743df2a1ce (diff)
8 files changed, 133 insertions, 92 deletions
diff --git a/dtrain/Makefile.am b/dtrain/Makefile.am
index afab00f2..c08cd1ea 100644
--- a/dtrain/Makefile.am
+++ b/dtrain/Makefile.am
@@ -1,7 +1,7 @@
 # TODO I'm sure I can leave something out.
 bin_PROGRAMS = dtrain dtest
 
-dtrain_SOURCES = dtrain.cc score.cc tests.cc util.cc
+dtrain_SOURCES = dtrain.cc score.cc tests.cc util.cc sample_hg.cc
 dtrain_LDADD   = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz -lboost_filesystem -lboost_iostreams
 
 dtest_SOURCES = dtest.cc score.cc util.cc
diff --git a/dtrain/dtrain.cc b/dtrain/dtrain.cc
index 4554e417..d58478a8 100644
--- a/dtrain/dtrain.cc
+++ b/dtrain/dtrain.cc
@@ -3,6 +3,8 @@
 #include "util.h"
 #include "sample.h"
 
+#include "ksampler.h"
+
 // boost compression
 #include <boost/iostreams/device/file.hpp> 
 #include <boost/iostreams/filtering_stream.hpp>
@@ -11,6 +13,7 @@
 //#include <boost/iostreams/filter/bzip2.hpp>
 using namespace boost::iostreams;
 
+
 #ifdef DTRAIN_DEBUG
 #include "tests.h"
 #endif
@@ -101,6 +104,7 @@ ostream& _prec5( ostream& out ) { return out << setprecision(5); }
 int
 main( int argc, char** argv )
 {
+  cout << setprecision( 5 );
   // handle most parameters
   po::variables_map cfg;
   if ( ! init(argc, argv, &cfg) ) exit(1); // something is wrong 
@@ -143,7 +147,9 @@ main( int argc, char** argv )
   if ( !quiet )
     cout << setw(25) << "cdec cfg " << "'" << cfg["decoder_config"].as<string>() << "'" << endl;
   Decoder decoder( ini_rf.stream() );
-  KBestGetter observer( k, filter_type );
+  //KBestGetter observer( k, filter_type );
+  MT19937 rng;
+  KSampler observer( k, &rng );
 
   // scoring metric/scorer
   string scorer_str = cfg["scorer"].as<string>();
@@ -207,11 +213,13 @@ main( int argc, char** argv )
   size_t cand_len = 0;
   double overall_time = 0.;
 
-  cout << setprecision( 5 );
-
-  // for the perceptron
-  double eta = 0.5; // TODO as parameter
+  // for the perceptron/SVM; TODO as params
+  double eta = 0.0005;
+  double gamma = 0.01; // -> SVM
   lambdas.add_value( FD::Convert("__bias"), 0 );
+  
+  // for random sampling
+  srand ( time(NULL) );
 
 
   for ( size_t t = 0; t < T; t++ ) // T epochs
@@ -284,44 +292,44 @@ main( int argc, char** argv )
     weights.InitVector( &dense_weights );
     decoder.SetWeights( dense_weights );
 
-    srand ( time(NULL) );
-
-    switch ( t ) {
-      case 0:
-        // handling input
-        in_split.clear();
-        boost::split( in_split, in, boost::is_any_of("\t") );
-        // in_split[0] is id
-        //cout << in_split[0] << endl;
-        // getting reference
-        ref_tok.clear(); ref_ids.clear();
-        boost::split( ref_tok, in_split[2], boost::is_any_of(" ") );
-        register_and_convert( ref_tok, ref_ids );
-        ref_ids_buf.push_back( ref_ids );
-        // process and set grammar
-        //grammar_buf << in_split[3] << endl;
-        grammar_str = boost::replace_all_copy( in_split[3], " __NEXT__RULE__ ", "\n" ) + "\n"; // FIXME copy, __
-        grammar_buf << grammar_str << DTRAIN_GRAMMAR_DELIM << endl;
-        decoder.SetSentenceGrammarFromString( grammar_str );
-        // decode, kbest
-        src_str_buf.push_back( in_split[1] );
-        decoder.Decode( in_split[1], &observer );
-        break;
-      default:
-        // get buffered grammar
-        grammar_str.clear();
-        int i = 1;
-        while ( true ) {
-          string g;  
-          getline( grammar_buf_in, g );
-          if ( g == DTRAIN_GRAMMAR_DELIM ) break;
-          grammar_str += g+"\n";
-          i += 1;
+    if ( t == 0 ) {
+      // handling input
+      in_split.clear();
+      boost::split( in_split, in, boost::is_any_of("\t") ); // in_split[0] is id
+      // getting reference
+      ref_tok.clear(); ref_ids.clear();
+      boost::split( ref_tok, in_split[2], boost::is_any_of(" ") );
+      register_and_convert( ref_tok, ref_ids );
+      ref_ids_buf.push_back( ref_ids );
+      // process and set grammar
+      bool broken_grammar = true;
+      for ( string::iterator ti = in_split[3].begin(); ti != in_split[3].end(); ti++ ) {
+        if ( !isspace(*ti) ) {
+          broken_grammar = false;
+          break;
         }
-        decoder.SetSentenceGrammarFromString( grammar_str );
-        // decode, kbest
-        decoder.Decode( src_str_buf[sid], &observer );
-        break;
+      }
+      if ( broken_grammar ) continue;
+      grammar_str = boost::replace_all_copy( in_split[3], " __NEXT__RULE__ ", "\n" ) + "\n"; // FIXME copy, __
+      grammar_buf << grammar_str << DTRAIN_GRAMMAR_DELIM << endl;
+      decoder.SetSentenceGrammarFromString( grammar_str );
+      // decode, kbest
+      src_str_buf.push_back( in_split[1] );
+      decoder.Decode( in_split[1], &observer );
+    } else {
+      // get buffered grammar
+      grammar_str.clear();
+      int i = 1;
+      while ( true ) {
+        string g;  
+        getline( grammar_buf_in, g );
+        if ( g == DTRAIN_GRAMMAR_DELIM ) break;
+        grammar_str += g+"\n";
+        i += 1;
+      }
+      decoder.SetSentenceGrammarFromString( grammar_str );
+      // decode, kbest
+      decoder.Decode( src_str_buf[sid], &observer );
     }
 
     // get kbest list
@@ -346,6 +354,7 @@ main( int argc, char** argv )
             cand_len = kb->sents[i].size();
         }
         NgramCounts counts_tmp = global_counts + counts;
+        // TODO as param
         score = 0.9 * scorer( counts_tmp,
                               global_ref_len,
                               global_hyp_len + cand_len, N, bleu_weights );
@@ -380,31 +389,48 @@ main( int argc, char** argv )
 
       TrainingInstances pairs;
 
-      sample_all(kb, pairs);
+      sample_all_rand(kb, pairs);
+      cout << pairs.size() << endl;
             
       for ( TrainingInstances::iterator ti = pairs.begin();
             ti != pairs.end(); ti++ ) {
-        // perceptron
+
         SparseVector<double> dv;
-        if ( ti->type == -1 ) {
+        if ( ti->first_score - ti->second_score < 0 ) {
           dv = ti->second - ti->first;
-        } else {
-          dv = ti->first - ti->second;
-        }
-        dv.add_value(FD::Convert("__bias"), -1);
+      //} else {
+        //dv = ti->first - ti->second;
+      //}
+        dv.add_value( FD::Convert("__bias"), -1 );
+        
+        SparseVector<double> reg;
+        reg = lambdas * ( 2 * gamma );
+        dv -= reg;
         lambdas += dv * eta;
 
-        /*if ( verbose ) {
-          cout << "{{ f(i) > f(j) but g(i) < g(j), so update" << endl;
-          cout << " i  " << TD::GetString(kb->sents[ii]) << endl;
-          cout << "    " << kb->feats[ii] << endl;
-          cout << " j  " << TD::GetString(kb->sents[jj]) << endl;
-          cout << "    " << kb->feats[jj] << endl; 
-          cout << " dv " << dv << endl;
+        if ( verbose ) {
+          cout << "{{ f("<< ti->first_rank <<") > f(" << ti->second_rank << ") but g(i)="<< ti->first_score <<" < g(j)="<< ti->second_score << " so update" << endl;
+          cout << " i  " << TD::GetString(kb->sents[ti->first_rank]) << endl;
+          cout << "    " << kb->feats[ti->first_rank] << endl;
+          cout << " j  " << TD::GetString(kb->sents[ti->second_rank]) << endl;
+          cout << "    " << kb->feats[ti->second_rank] << endl; 
+          cout << " diff vec: " << dv << endl;
+          cout << " lambdas after update: " << lambdas << endl;
           cout << "}}" << endl;
-        }*/
+        }
+
+        } else {
+            //if ( 0 ) {
+            SparseVector<double> reg;
+            reg = lambdas * ( gamma * 2 );
+            lambdas += reg * ( -eta );
+            //}
+        }
       }
 
+      //double l2 = lambdas.l2norm();
+      //if ( l2 ) lambdas /= lambdas.l2norm();
+
     }
 
     ++sid;
diff --git a/dtrain/kbestget.h b/dtrain/kbestget.h
index ae4588c9..cf466fe4 100644
--- a/dtrain/kbestget.h
+++ b/dtrain/kbestget.h
@@ -1,8 +1,10 @@
 #ifndef _DTRAIN_KBESTGET_H_
 #define _DTRAIN_KBESTGET_H_
 
+
 #include "kbest.h"
 
+
 namespace dtrain
 {
 
@@ -65,7 +67,7 @@ struct KBestGetter : public DecoderObserver
       if (!d) break;
       kb.sents.push_back( d->yield);
       kb.feats.push_back( d->feature_values );
-      kb.model_scores.push_back( d->score );
+      kb.model_scores.push_back( log(d->score) );
     }
   }
 
@@ -83,7 +85,7 @@ struct KBestGetter : public DecoderObserver
       if (!d) break;
       kb.sents.push_back( d->yield);
       kb.feats.push_back( d->feature_values );
-      kb.model_scores.push_back( d->score );
+      kb.model_scores.push_back( log(d->score) );
     }
   }
 };
diff --git a/dtrain/run.sh b/dtrain/run.sh
index b2012bcf..16575c25 100755
--- a/dtrain/run.sh
+++ b/dtrain/run.sh
@@ -3,7 +3,8 @@
 #INI=test/blunsom08.dtrain.ini
 #INI=test/nc-wmt11/dtrain.ini
 #INI=test/EXAMPLE/dtrain.ini
-INI=test/toy.dtrain.ini
+INI=test/EXAMPLE/dtrain.ruleids.ini
+#INI=test/toy.dtrain.ini
 
 rm /tmp/dtrain-*
 ./dtrain -c $INI $1 $2 $3 $4 
diff --git a/dtrain/sample.h b/dtrain/sample.h
index b9bc4461..b6aa9abd 100644
--- a/dtrain/sample.h
+++ b/dtrain/sample.h
@@ -1,3 +1,7 @@
+#ifndef _DTRAIN_SAMPLE_H_
+#define _DTRAIN_SAMPLE_H_
+
+
 #include "kbestget.h"
 
 
@@ -7,9 +11,9 @@ namespace dtrain
 
 struct TPair
 {
-  double type;
-  SparseVector<double> first;
-  SparseVector<double> second;
+  SparseVector<double> first, second; // feature vectors of the two paired hypotheses (kb->feats[i], kb->feats[j])
+  size_t first_rank, second_rank; // positions i and j of the two hypotheses in the k-best list
+  double first_score, second_score; // their entries in kb->scores
 };
 
 typedef vector<TPair> TrainingInstances;
@@ -18,35 +22,43 @@ typedef vector<TPair> TrainingInstances;
 void
 sample_all( KBestList* kb, TrainingInstances &training )
 {
-  double type;
   for ( size_t i = 0; i < kb->GetSize()-1; i++ ) {
-   for ( size_t j = i+1; j < kb->GetSize(); j++ ) {
-     if ( kb->scores[i] - kb->scores[j] < 0 ) {
-       type = -1; 
-     } else {
-       type = 1;
-     }
-     TPair p;
-     p.type = type;
-     p.first = kb->feats[i];
-     p.second = kb->feats[j];
-     training.push_back( p );
-   }
- }
-}
-
-/*void
-sample_all_only_neg(, vector<pair<SparSparseVector<double> > pairs)
-{
-
+    for ( size_t j = i+1; j < kb->GetSize(); j++ ) {
+      TPair p;
+      p.first = kb->feats[i];
+      p.second = kb->feats[j];
+      p.first_rank = i;
+      p.second_rank = j;
+      p.first_score = kb->scores[i];
+      p.second_score = kb->scores[j];
+      training.push_back( p );
+    }
+  }
 }
 
 void
-sample_random_pos()
+sample_all_rand( KBestList* kb, TrainingInstances &training )
 {
-  if ( rand() % 2 ) { // sample it?
-}*/
+  // Appends roughly half of all pairs (i,j), i<j, of kb to training; no srand() here, dtrain's main() seeds rand() once.
+  for ( size_t i = 0; i+1 < kb->GetSize(); i++ ) { // i+1: no wrap-around when kb is empty
+    for ( size_t j = i+1; j < kb->GetSize(); j++ ) {
+      if ( rand() % 2 ) { // coin flip: keep this pair?
+        TPair p;
+        p.first = kb->feats[i];
+        p.second = kb->feats[j];
+        p.first_rank = i;
+        p.second_rank = j;
+        p.first_score = kb->scores[i];
+        p.second_score = kb->scores[j];
+        training.push_back( p );
+      }
+    }
+  }
+}
 
 
 } // namespace
 
+
+#endif
+
diff --git a/dtrain/test/EXAMPLE/cdec.ini b/dtrain/test/EXAMPLE/cdec.ini
index e57138b0..bda0d12c 100644
--- a/dtrain/test/EXAMPLE/cdec.ini
+++ b/dtrain/test/EXAMPLE/cdec.ini
@@ -2,6 +2,6 @@ formalism=scfg
 add_pass_through_rules=true
 feature_function=WordPenalty
 cubepruning_pop_limit=30
-feature_function=KLanguageModel data/nc-wmt11.en.srilm.gz
+feature_function=KLanguageModel /home/pks/src/cdec/dtrain/data/nc-wmt11.en.srilm.gz
 scfg_max_span_limit=15
 
diff --git a/dtrain/test/EXAMPLE/dtrain.ini b/dtrain/test/EXAMPLE/dtrain.ini
index ffafd0b8..7645921a 100644
--- a/dtrain/test/EXAMPLE/dtrain.ini
+++ b/dtrain/test/EXAMPLE/dtrain.ini
@@ -1,10 +1,10 @@
 decoder_config=test/EXAMPLE/cdec.ini
 kbest=100
 ngrams=3
-epochs=8
+epochs=3
 input=test/EXAMPLE/dtrain.nc-1k
-scorer=approx_bleu
+scorer=stupid_bleu
 output=test/EXAMPLE/weights.gz
-stop_after=1000
+stop_after=100
 wprint=Glue WordPenalty LanguageModel LanguageModel_OOV PhraseModel_0 PhraseModel_1 PhraseModel_2 PhraseModel_3 PhraseModel_4 PassThrough
 
diff --git a/dtrain/test/toy.dtrain.ini b/dtrain/test/toy.dtrain.ini
index 35f76281..e9ed0ce5 100644
--- a/dtrain/test/toy.dtrain.ini
+++ b/dtrain/test/toy.dtrain.ini
@@ -4,7 +4,7 @@ ngrams=1
 epochs=3
 input=test/toy.in
 scorer=bleu
-output=toy.gz
+output=data/w/toy.gz
 #stop_after=1000
 wprint=logp use_shell use_house PassThrough
author	Patrick Simianer <p@simianer.de>	2011-09-09 00:41:17 +0200
committer	Patrick Simianer <p@simianer.de>	2011-09-23 19:13:58 +0200
commit	14637f89c899179f54a5bc327857db8ea1e1d427 (patch)
tree	66cdbfd8039107075509faa4a4d1dc5dc33698cc /dtrain
parent	0269777fc54bc554c12107bdd5498f743df2a1ce (diff)