hacking in weights setting, getting

author: Patrick Simianer <p@simianer.de> 2011-07-27 00:03:35 +0200
committer: Patrick Simianer <p@simianer.de> 2011-09-23 19:13:57 +0200
commit: 4a1d2e56744cc97c11ef8220623bd7c5467d6c02 (patch)
tree: beb2ce8b03cc18ae52b9cf7e4e28ee394096dc24 /dtrain
parent: 6057a8ea61eebd4b698b78814b05f3de1c96944f (diff)
6 files changed, 152 insertions, 72 deletions
diff --git a/dtrain/cdec.ini b/dtrain/cdec.ini
index 596fbf71..92a4a335 100644
--- a/dtrain/cdec.ini
+++ b/dtrain/cdec.ini
@@ -1,3 +1,4 @@
 formalism=scfg
+#feature_function=KLanguageModel europarl-v6.tok.lc.s-tag.en.arpa.kenlm.v4.mma
 #k_best=2
 #add_pass_through_rules=true
diff --git a/dtrain/dtrain b/dtrain/dtrain
deleted file mode 100755
index e200e05c..00000000
--- a/dtrain/dtrain
+++ /dev/null
diff --git a/dtrain/dtrain.cc b/dtrain/dtrain.cc
index 25249c7f..8464a429 100644
--- a/dtrain/dtrain.cc
+++ b/dtrain/dtrain.cc
@@ -40,14 +40,15 @@ init(int argc, char** argv, boostpo::variables_map* conf)
   boostpo::options_description opts( "Options" );
   opts.add_options()
     ( "decoder-config,c", boostpo::value<string>(), "configuration file for cdec" )
-    ( "kbest,k",          boostpo::value<int>(),    "k for kbest" )
+    ( "kbest,k",          boostpo::value<size_t>(), "k for kbest" )
     ( "ngrams,n",         boostpo::value<int>(),    "n for Ngrams" )
-    ( "filter,f",         boostpo::value<string>(), "filter kbest list" );
+    ( "filter,f",         boostpo::value<string>(), "filter kbest list" )
+    ( "test",                                       "run tests and exit");
   boostpo::options_description cmdline_options;
   cmdline_options.add(opts);
   boostpo::store( parse_command_line(argc, argv, cmdline_options), *conf );
   boostpo::notify( *conf );
-  if ( ! conf->count("decoder-config") ) {
+  if ( ! (conf->count("decoder-config") || conf->count("test")) ) {
     cerr << cmdline_options << endl;
     return false;
   }
@@ -67,7 +68,7 @@ struct KBestList {
 struct KBestGetter : public DecoderObserver
 {
   KBestGetter( const size_t k ) : k_(k) {}
-  size_t k_;
+  const size_t k_;
   KBestList kb;
 
   virtual void
@@ -164,7 +165,7 @@ struct NgramCounts
   map<size_t, size_t> clipped;
   map<size_t, size_t> sum;
 
-  NgramCounts&
+  void
   operator+=( const NgramCounts& rhs )
   {
     assert( N_ == rhs.N_ );
@@ -247,6 +248,7 @@ brevity_penaly( const size_t hyp_len, const size_t ref_len )
 /*
  * bleu
  * as in "BLEU: a Method for Automatic Evaluation of Machine Translation" (Papineni et al. '02)
+ * page TODO
  * 0 if for N one of the counts = 0
  */
 double
@@ -272,6 +274,7 @@ bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len,
 /*
  * stupid_bleu
  * as in "ORANGE: a Method for Evaluating Automatic Evaluation Metrics for Machine Translation (Lin & Och '04)
+ * page TODO
  * 0 iff no 1gram match
  */
 double
@@ -298,6 +301,7 @@ stupid_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len,
 /*
  * smooth_bleu
  * as in "An End-to-End Discriminative Approach to Machine Translation" (Liang et al. '06)
+ * page TODO
  * max. 0.9375
  */
 double
@@ -324,6 +328,7 @@ smooth_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len,
 /*
  * approx_bleu
  * as in "Online Large-Margin Training for Statistical Machine Translation" (Watanabe et al. '07)
+ * page TODO
  *
  */
 double
@@ -348,11 +353,16 @@ register_and_convert(const vector<string>& strs, vector<WordID>& ids)
 }
 
 
+/*
+ *
+ *
+ */
 void
 test_ngrams()
 {
   cout << "Testing ngrams..." << endl << endl;
   size_t N = 5;
+  cout << "N = " << N << endl;
   vector<int> a; // hyp
   vector<int> b; // ref
   cout << "a ";
@@ -373,18 +383,28 @@ test_ngrams()
   c += c;
   cout << endl;
   c.print();
+  cout << endl;
 }
 
+
+/*
+ *
+ *
+ */
 double
 approx_equal( double x, double y )
 {
   const double EPSILON = 1E-5;
-  if ( x == 0 ) return fabs(y) <= EPSILON;
-  if ( y == 0 ) return fabs(x) <= EPSILON;
+  if ( x == 0 ) return fabs( y ) <= EPSILON;
+  if ( y == 0 ) return fabs( x ) <= EPSILON;
   return fabs( x - y ) / max( fabs(x), fabs(y) ) <= EPSILON;
 }
 
 
+/*
+ *
+ *
+ */
 #include <boost/assign/std/vector.hpp>
 #include <iomanip>
 void
@@ -423,104 +443,162 @@ test_metrics()
     cout << setw(14) << "smooth bleu = " << smooth << endl;
     cout << setw(14) << "stupid bleu = " << stupid << endl << endl;
   }
+  cout << endl;
 }
 
-
 /*
- * main
+ *
  *
  */
-int
-main(int argc, char** argv)
+void
+test_SetWeights()
 {
-  /*vector<string> v;
-  for (int i = 0; i <= 10; i++) {
-      v.push_back("asdf");
-  }
-  vector<vector<string> > ng = ngrams(v, 5);
-  for (int i = 0; i < ng.size(); i++) {
-    for (int j = 0; j < ng[i].size(); j++) {
-        cout << " " << ng[i][j];
-    }
-    cout << endl;
-  }*/
-
-  test_metrics();
-
-
-  //NgramCounts counts2 = make_ngram_counts( ref_ids, ref_ids, 4);
-  //counts += counts2;
-  //cout << counts.cNipped[1] << endl;
-
-  //size_t c, r; // c length of candidates, r of references
-  //c += cand.size();
-  //r += ref.size();
-  /*NgramMatches ngm; // for approx bleu
-  ngm.sum = 1;
-  ngm.clipped = 1;
+  cout << "Testing Weights::SetWeight..." << endl << endl;
+  Weights weights;
+  SparseVector<double> lambdas;
+  weights.InitSparseVector( &lambdas );
+  weights.SetWeight( &lambdas, "test", 0 );
+  weights.SetWeight( &lambdas, "test1", 1 );
+  WordID fid = FD::Convert( "test2" );
+  weights.SetWeight( &lambdas, fid, 2 );
+  string fn = "weights-test";
+  cout << "FD::NumFeats() " << FD::NumFeats() << endl;
+  assert( FD::NumFeats() == 4 );
+  weights.WriteToFile( fn, true );
+  cout << endl;
+}
 
-  NgramMatches x;
-  x.clipped = 1;
-  x.sum = 1;
 
-  x += ngm;
-  x += x;
-  x+= ngm;
+/*
+ *
+ *
+ */
+void
+run_tests()
+{
+  cout << endl;
+  test_ngrams();
+  cout << endl;
+  test_metrics();
+  cout << endl;
+  test_SetWeights();
+  exit(0);
+}
 
-  cout << x.clipped << " " << x.sum << endl;*/
 
+void
+print_FD()
+{
+  for ( size_t i = 0; i < FD::NumFeats(); i++ ) cout << FD::Convert(i)<< endl;
+}
 
-  /*register_feature_functions();
-  SetSilent(true);
 
-  boost::program_options::variables_map conf;
+/*
+ * main
+ *
+ */
+int
+main(int argc, char** argv)
+{
+  //SetSilent(true);
+  boostpo::variables_map conf;
   if (!init(argc, argv, &conf)) return 1;
+  if ( conf.count("test") ) run_tests(); 
+  register_feature_functions();
+  size_t k = conf["kbest"].as<size_t>();
   ReadFile ini_rf(conf["decoder-config"].as<string>());
   Decoder decoder(ini_rf.stream());
+  KBestGetter observer(k);
+  
+  // for approx. bleu
+  //NgramCounts global_counts;
+  //size_t global_hyp_len;
+  //size_t global_ref_len;
+
   Weights weights;
   SparseVector<double> lambdas;
   weights.InitSparseVector(&lambdas);
+  vector<double> dense_weights;
 
-  int k = conf["kbest"].as<int>();
+  lambdas.set_value(FD::Convert("logp"), 0);
 
-  KBestGetter observer(k);
-  string in, psg;
+ 
   vector<string> strs;
-  int i = 0;
-  while(getline(cin, in)) {
-    if (!SILENT) cerr << "getting kbest for sentence #" << i << endl;
+  string in, psg;
+  size_t i = 0;
+  while( getline(cin, in) ) {
+    if ( !SILENT ) cerr << endl << endl << "Getting kbest for sentence #" << i << endl;
+    // why? why!?
+    dense_weights.clear();
+    weights.InitFromVector( lambdas );
+    weights.InitVector( &dense_weights );
+    decoder.SetWeights( dense_weights );
+    //cout << "use_shell " << dense_weights[FD::Convert("use_shell")] << endl;
     strs.clear();
-    boost::split(strs, in, boost::is_any_of("\t"));
-    psg = boost::replace_all_copy(strs[2], " __NEXT_RULE__ ", "\n"); psg += "\n";
+    boost::split( strs, in, boost::is_any_of("\t") );
+    psg = boost::replace_all_copy( strs[2], " __NEXT_RULE__ ", "\n" ); psg += "\n";
+    //decoder.SetId(i);
     decoder.SetSentenceGrammar( psg );
     decoder.Decode( strs[0], &observer );
     KBestList* kb = observer.getkb();
-    // FIXME not pretty iterating twice over k
-    for (int i = 0; i < k; i++) {
-      for (int j = 0; j < kb->sents[i].size(); ++j) {
-        cout << TD::Convert(kb->sents[i][j]) << endl;
+    for ( size_t i = 0; i < k; i++ ) {
+      cout << i << " ";
+      for (size_t j = 0; j < kb->sents[i].size(); ++j ) {
+        cout << TD::Convert( kb->sents[i][j] ) << " ";
       }
+      cout << kb->scores[i];
+      cout << endl;
     }
+    lambdas.set_value( FD::Convert("use_shell"), 1 );
+    lambdas.set_value( FD::Convert("use_a"), 1 );
+    //print_FD();
   }
+  
+  weights.WriteToFile( "weights-final", true );
 
-  return 0;*/
+  return 0;
 }
 
+    // next: FMap, ->sofia, ->FMap, -> Weights
+    // learner gets all used features (binary! and dense (logprob is sum of logprobs!))
+    // only for those feats with weight > 0 after learning
+    // see decoder line 548
+
 
 /*
  * TODO
- *  for t =1..T
- *  mapper, reducer (average, handle ngram statistics for approx bleu)
- *    1st streaming
- *  batch, non-batch in the mapper (what sofia gets)
- *  filter yes/no
+ *  iterate over training set, for t=1..T
+ *  mapred impl
+ *   mapper:  main
+ *   reducer: average weights, global NgramCounts for approx. bleu
+ *  1st cut: hadoop streaming?
+ *  batch, non-batch in the mapper (what sofia gets, regenerated Kbest lists)
+ *  filter kbest yes/no
  *  sofia: --eta_type explicit
- *  psg preparation
- *  set ref?
- *  shared LM?
+ *  psg preparation source\tref\tpsg
+ *  set reference for cdec?
+ *  LM
+ *   shared?
+ *   startup?
  *  X reference(s) for *bleu!?
- *  kbest nicer!? shared_ptr
- *  multipartite
+ *  kbest nicer (do not iterate twice)!? -> shared_ptr
+ *  multipartite ranking
  *  weights! global, per sentence from global, featuremap
- * todo const
+ *  const decl...
+ *  sketch: batch/iter options
+ *  weights.cc: why wv_?
+ *  --weights cmd line (for iterations): script to call again/hadoop streaming?
+ *  I do not need to remember features, cdec does
+ *  rescore hg?
+ *  do not use Decoder::Decode!?
+ *  what happens if feature not in FD? 0???
  */
+
+/*
+ * PROBLEMS
+ *  cdec kbest vs 1best (no -k param)
+ *  FD, Weights::wv_ grow too large, see utils/weights.cc; decoder/hg.h; decoder/scfg_translator.cc; utils/fdict.cc!?
+ *  sparse vector instead of vector<double> for weights in Decoder?
+ *  PhraseModel_* features for psg!? (seem to be generated)
+ */
+
diff --git a/dtrain/in.toy b/dtrain/in.toy
new file mode 100644
index 00000000..71b736a6
--- /dev/null
+++ b/dtrain/in.toy
@@ -0,0 +1,2 @@
+ich sah ein kleines haus	i saw a little shell	[S] ||| [NP,1] [VP,2] ||| [1] [2] ||| logp=0 __NEXT_RULE__ [NP] ||| ich ||| i ||| logp=0 __NEXT_RULE__ [NP] ||| ein [NN,1] ||| a [1] ||| logp=0 __NEXT_RULE__ [NN] ||| [JJ,1] haus ||| [1] house ||| logp=-0.5 __NEXT_RULE__ [NN] ||| [JJ,1] haus ||| [1] shell ||| logp=-0.5 use_shell=1 __NEXT_RULE__ [JJ] ||| kleines ||| small ||| logp=-0.5 __NEXT_RULE__ [JJ] ||| kleines ||| little ||| logp=-0.9 __NEXT_RULE__ [JJ] ||| grosses ||| big ||| logp=-0.5 __NEXT_RULE__ [JJ] ||| grosses ||| large ||| logp=-1.5 __NEXT_RULE__ [VP] ||| [V,1] [NP,2] ||| [1] [2] ||| logp=0 __NEXT_RULE__ [V] ||| sah ||| saw ||| logp=0 __NEXT_RULE__ [V] ||| fand ||| found ||| logp=0
+ich fand ein grosses haus	i found a little shell	[S] ||| [NP,1] [VP,2] ||| [1] [2] ||| logp=0 __NEXT_RULE__ [NP] ||| ich ||| i ||| logp=0 __NEXT_RULE__ [NP] ||| ein [NN,1] ||| a [1] ||| logp=0 use_a=1 __NEXT_RULE__ [NN] ||| [JJ,1] haus ||| [1] house ||| logp=-1000 __NEXT_RULE__ [NN] ||| [JJ,1] haus ||| [1] shell ||| logp=-1 use_shell=1 __NEXT_RULE__ [JJ] ||| kleines ||| small ||| logp=-0.5 __NEXT_RULE__ [JJ] ||| kleines ||| little ||| logp=-0.9 __NEXT_RULE__ [JJ] ||| grosses ||| big ||| logp=-0.5 __NEXT_RULE__ [JJ] ||| grosses ||| large ||| logp=-1.5 __NEXT_RULE__ [VP] ||| [V,1] [NP,2] ||| [1] [2] ||| logp=0 __NEXT_RULE__ [V] ||| sah ||| saw ||| logp=0 __NEXT_RULE__ [V] ||| fand ||| found ||| logp=0
diff --git a/dtrain/input b/dtrain/input
deleted file mode 100644
index ff005d22..00000000
--- a/dtrain/input
+++ /dev/null
@@ -1 +0,0 @@
-ich sah ein kleines haus @@@@@@@@ i saw a little shell @@@@@@@@ [S] ||| [NP,1] [VP,2] ||| [1] [2] ||| logp=0 __NEXT_RULE__ [NP] ||| ich ||| i ||| logp=0 __NEXT_RULE__ [NP] ||| ein [NN,1] ||| a [1] ||| logp=0 __NEXT_RULE__ [NN] ||| [JJ,1] haus ||| [1] house ||| logp=-0.5 __NEXT_RULE__ [NN] ||| [JJ,1] haus ||| [1] shell ||| logp=-0.9 __NEXT_RULE__ [JJ] ||| kleines ||| small ||| logp=-0.5 __NEXT_RULE__ [JJ] ||| kleines ||| little ||| logp=-0.9 __NEXT_RULE__ [JJ] ||| grosses ||| big ||| logp=-0.5 __NEXT_RULE__ [JJ] ||| grosses ||| large ||| logp=-1.5 __NEXT_RULE__ [VP] ||| [V,1] [NP,2] ||| [1] [2] ||| logp=0 __NEXT_RULE__ [V] ||| sah ||| saw ||| logp=0 __NEXT_RULE__ [V] ||| fand ||| found ||| logp=0 @@@@@@@@ logp
diff --git a/dtrain/test.sh b/dtrain/test.sh
index 508038ec..a0ebb420 100755
--- a/dtrain/test.sh
+++ b/dtrain/test.sh
@@ -1,4 +1,4 @@
 #!/bin/sh
 
-./dtrain -c cdec.ini -k 2 < in
+./dtrain -c cdec.ini -k 4 < in.toy
author	Patrick Simianer <p@simianer.de>	2011-07-27 00:03:35 +0200
committer	Patrick Simianer <p@simianer.de>	2011-09-23 19:13:57 +0200
commit	4a1d2e56744cc97c11ef8220623bd7c5467d6c02 (patch)
tree	beb2ce8b03cc18ae52b9cf7e4e28ee394096dc24 /dtrain
parent	6057a8ea61eebd4b698b78814b05f3de1c96944f (diff)