From 0269777fc54bc554c12107bdd5498f743df2a1ce Mon Sep 17 00:00:00 2001
From: Patrick Simianer <p@simianer.de>
Date: Thu, 8 Sep 2011 00:06:52 +0200
Subject: a lot of stuff, fast_sparse_vector, perceptron, removed sofia, sample
 [...]

---
 dtrain/README                    | 15 +++----
 dtrain/dtrain.cc                 | 87 +++++++++++++++++++++++++++-------------
 dtrain/kbestget.h                | 12 ++++--
 dtrain/run.sh                    |  8 ++--
 dtrain/sample.h                  | 52 ++++++++++++++++++++++++
 dtrain/score.h                   | 16 --------
 dtrain/test/EXAMPLE/cdec.ini     |  1 +
 dtrain/test/EXAMPLE/dtrain.ini   |  6 +--
 dtrain/test/log_reg/bin_class.cc |  4 ++
 dtrain/test/log_reg/bin_class.h  | 22 ++++++++++
 dtrain/test/log_reg/log_reg.cc   | 39 ++++++++++++++++++
 dtrain/test/log_reg/log_reg.h    | 14 +++++++
 dtrain/test/nc-wmt11/dtrain.ini  |  2 +-
 dtrain/test/toy.dtrain.ini       |  3 +-
 dtrain/test/toy.in               |  4 +-
 dtrain/test/toy_cdec/cdec.ini    |  3 ++
 dtrain/test/toy_cdec/grammar     | 12 ++++++
 dtrain/test/toy_cdec/in          |  1 +
 dtrain/test/toy_cdec/weights     |  2 +
 utils/fast_sparse_vector.h       | 64 ++++++++++++++++++++++++-----
 20 files changed, 293 insertions(+), 74 deletions(-)
 create mode 100644 dtrain/sample.h
 create mode 100644 dtrain/test/log_reg/bin_class.cc
 create mode 100644 dtrain/test/log_reg/bin_class.h
 create mode 100644 dtrain/test/log_reg/log_reg.cc
 create mode 100644 dtrain/test/log_reg/log_reg.h
 create mode 100644 dtrain/test/toy_cdec/cdec.ini
 create mode 100644 dtrain/test/toy_cdec/grammar
 create mode 100644 dtrain/test/toy_cdec/in
 create mode 100644 dtrain/test/toy_cdec/weights
diff --git a/dtrain/README b/dtrain/README
index 74bac6a0..b3f513be 100644
--- a/dtrain/README
+++ b/dtrain/README
@@ -1,7 +1,7 @@
 NOTES
  learner gets all used features (binary! and dense (logprob is sum of logprobs!))
  weights: see decoder/decoder.cc line 548
- 40k sents, k=100 = ~400M mem, 1 iteration 45min
+ (40k sents, k=100 = ~400M mem, 1 iteration 45min)?
  utils/weights.cc: why wv_?
  FD, Weights::wv_ grow too large, see utils/weights.cc;
      decoder/hg.h; decoder/scfg_translator.cc; utils/fdict.cc
@@ -15,25 +15,26 @@ TODO
  GENERATED data? (multi-task, ability to learn, perfect translation in nbest, at first all modelscore 1)
  CACHING (ngrams for scoring)
  hadoop PIPES imlementation
- SHARED LM?
+ SHARED LM (kenlm actually does this!)?
  ITERATION variants
   once -> average
   shuffle resulting weights
  weights AVERAGING in reducer (global Ngram counts)
  BATCH implementation (no update after each Kbest list)
- SOFIA --eta_type explicit
  set REFERENCE for cdec (rescoring)?
  MORE THAN ONE reference for BLEU?
  kbest NICER (do not iterate twice)!? -> shared_ptr?
  DO NOT USE Decoder::Decode (input caching as WordID)!?
   sparse vector instead of vector<double> for weights in Decoder(::SetWeights)?
  reactivate DTEST and tests
- non deterministic, high variance, RANDOWM RESTARTS
+ non deterministic, high variance, RANDOM RESTARTS
  use separate TEST SET
 
 KNOWN BUGS PROBLEMS
- does probably OVERFIT
- cdec kbest vs 1best (no -k param) fishy!
+ cdec kbest vs 1best (no -k param), rescoring? => ok(?)
+ no sparse vector in decoder => ok
+ ? ok
  sh: error while loading shared libraries: libreadline.so.6: cannot open shared object file: Error 24
- PhraseModel_* features (0..99 seem to be generated, default?)
+ PhraseModel_* features (0..99 seem to be generated, why 99?)
+ flex scanner jams on malicious input, we could skip that
 
diff --git a/dtrain/dtrain.cc b/dtrain/dtrain.cc
index 30ced234..4554e417 100644
--- a/dtrain/dtrain.cc
+++ b/dtrain/dtrain.cc
@@ -1,7 +1,7 @@
 #include "common.h"
 #include "kbestget.h"
-#include "updater.h"
 #include "util.h"
+#include "sample.h"
 
 // boost compression
 #include <boost/iostreams/device/file.hpp> 
@@ -85,18 +85,21 @@ init(int argc, char** argv, po::variables_map* cfg)
 }
 
 
+// output formatting
 ostream& _nopos( ostream& out ) { return out << resetiosflags( ios::showpos ); }
 ostream& _pos( ostream& out ) { return out << setiosflags( ios::showpos ); }
 ostream& _prec2( ostream& out ) { return out << setprecision(2); }
 ostream& _prec5( ostream& out ) { return out << setprecision(5); }
 
 
+
+
 /*
- * main
+ * dtrain
  *
  */
 int
-main(int argc, char** argv)
+main( int argc, char** argv )
 {
   // handle most parameters
   po::variables_map cfg;
@@ -202,11 +205,14 @@ main(int argc, char** argv)
   bool next = false, stop = false;
   double score = 0.;
   size_t cand_len = 0;
-  Scores scores;
   double overall_time = 0.;
 
   cout << setprecision( 5 );
 
+  // for the perceptron
+  double eta = 0.5; // TODO as parameter
+  lambdas.add_value( FD::Convert("__bias"), 0 );
+
 
   for ( size_t t = 0; t < T; t++ ) // T epochs
   {
@@ -278,12 +284,15 @@ main(int argc, char** argv)
     weights.InitVector( &dense_weights );
     decoder.SetWeights( dense_weights );
 
+    srand ( time(NULL) );
+
     switch ( t ) {
       case 0:
         // handling input
         in_split.clear();
         boost::split( in_split, in, boost::is_any_of("\t") );
         // in_split[0] is id
+        //cout << in_split[0] << endl;
         // getting reference
         ref_tok.clear(); ref_ids.clear();
         boost::split( ref_tok, in_split[2], boost::is_any_of(" ") );
@@ -291,7 +300,7 @@ main(int argc, char** argv)
         ref_ids_buf.push_back( ref_ids );
         // process and set grammar
         //grammar_buf << in_split[3] << endl;
-        grammar_str = boost::replace_all_copy( in_split[3], " __NEXT__RULE__ ", "\n" ) + "\n";
+        grammar_str = boost::replace_all_copy( in_split[3], " __NEXT__RULE__ ", "\n" ) + "\n"; // FIXME copy, __
         grammar_buf << grammar_str << DTRAIN_GRAMMAR_DELIM << endl;
         decoder.SetSentenceGrammarFromString( grammar_str );
         // decode, kbest
@@ -316,14 +325,16 @@ main(int argc, char** argv)
     }
 
     // get kbest list
-    KBestList* kb = observer.GetKBest();
+    KBestList* kb;
+    //if ( ) { // TODO get from forest
+      kb = observer.GetKBest();
+    //}
 
     // scoring kbest
-    scores.clear();
     if ( t > 0 ) ref_ids = ref_ids_buf[sid];
-    for ( size_t i = 0; i < kb->sents.size(); i++ ) {
+    for ( size_t i = 0; i < kb->GetSize(); i++ ) {
       NgramCounts counts = make_ngram_counts( ref_ids, kb->sents[i], N );
-      // for approx bleu
+      // this is for approx bleu
       if ( scorer_str == "approx_bleu" ) {
         if ( i == 0 ) { // 'context of 1best translations'
           global_counts  += counts;
@@ -346,29 +357,54 @@ main(int argc, char** argv)
                         kb->sents[i].size(), N, bleu_weights );
       }
 
+      kb->scores.push_back( score );
+
       if ( i == 0 ) {
         acc_1best_score += score;
-        acc_1best_model += kb->scores[i];
+        acc_1best_model += kb->model_scores[i];
       }
 
-      // scorer score and model score
-      ScorePair sp( kb->scores[i], score );
-      scores.push_back( sp );
-
       if ( verbose ) {
-        cout << "k=" << i+1 << " '" << TD::GetString( ref_ids ) << "'[ref] vs '";
-        cout << _prec5 << _nopos << TD::GetString( kb->sents[i] ) << "'[hyp]";
-        cout << " [SCORE=" << score << ",model="<< kb->scores[i] << "]" << endl;
-        //cout << kb->feats[i] << endl; // this is maybe too verbose
+        if ( i == 0 ) cout << "'" << TD::GetString( ref_ids ) << "' [ref]" << endl;
+        cout << _prec5 << _nopos << "[hyp " << i << "] " << "'" << TD::GetString( kb->sents[i] ) << "'";
+        cout << " [SCORE=" << score << ",model="<< kb->model_scores[i] << "]" << endl;
+        cout << kb->feats[i] << endl; // this is maybe too verbose
       }
     } // Nbest loop
+
     if ( verbose ) cout << endl;
 
-    // update weights; TODO other updaters
+
+    // UPDATE WEIGHTS
     if ( !noup ) {
-      SofiaUpdater updater;
-      updater.Init( sid, kb->feats, scores );
-      updater.Update( lambdas );
+
+      TrainingInstances pairs;
+
+      sample_all(kb, pairs);
+            
+      for ( TrainingInstances::iterator ti = pairs.begin();
+            ti != pairs.end(); ti++ ) {
+        // perceptron
+        SparseVector<double> dv;
+        if ( ti->type == -1 ) {
+          dv = ti->second - ti->first;
+        } else {
+          dv = ti->first - ti->second;
+        }
+        dv.add_value(FD::Convert("__bias"), -1);
+        lambdas += dv * eta;
+
+        /*if ( verbose ) {
+          cout << "{{ f(i) > f(j) but g(i) < g(j), so update" << endl;
+          cout << " i  " << TD::GetString(kb->sents[ii]) << endl;
+          cout << "    " << kb->feats[ii] << endl;
+          cout << " j  " << TD::GetString(kb->sents[jj]) << endl;
+          cout << "    " << kb->feats[jj] << endl; 
+          cout << " dv " << dv << endl;
+          cout << "}}" << endl;
+        }*/
+      }
+
     }
 
     ++sid;
@@ -426,7 +462,7 @@ main(int argc, char** argv)
 
   } // outer loop
 
-  //unlink( grammar_buf_tmp_fn );
+  unlink( grammar_buf_tmp_fn );
   if ( !noup ) {
     if ( !quiet ) cout << endl << "writing weights file '" << cfg["output"].as<string>() << "' ...";
     weights.WriteToFile( cfg["output"].as<string>(), true );
@@ -439,11 +475,6 @@ main(int argc, char** argv)
     cout << _prec2 << "This took " << overall_time/60. << " min." << endl;
   }
 
-  // don't do this with many features...
-  /*for ( size_t i = 0; i < FD::NumFeats(); i++ ) {
-      cout << FD::Convert(i) << " " << dense_weights[i] << endl;
-  }*/
-
   return 0;
 }
 
diff --git a/dtrain/kbestget.h b/dtrain/kbestget.h
index bb430b85..ae4588c9 100644
--- a/dtrain/kbestget.h
+++ b/dtrain/kbestget.h
@@ -14,7 +14,9 @@ namespace dtrain
 struct KBestList {
   vector<SparseVector<double> > feats;
   vector<vector<WordID> > sents;
+  vector<double> model_scores;  // decoder model score per entry (d->score, filled by KBestGetter)
   vector<double> scores;
+  size_t GetSize() const { return sents.size(); }  // number of entries; const so const lists can call it
 };
 
 
@@ -52,9 +54,10 @@ struct KBestGetter : public DecoderObserver
   void
   KBestUnique( const Hypergraph& forest )
   {
-    kb.scores.clear();
     kb.sents.clear();
     kb.feats.clear();
+    kb.model_scores.clear();
+    kb.scores.clear();
     KBest::KBestDerivations<vector<WordID>, ESentenceTraversal, KBest::FilterUnique, prob_t, EdgeProb> kbest( forest, k_ );
     for ( size_t i = 0; i < k_; ++i ) {
       const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal, KBest::FilterUnique, prob_t, EdgeProb>::Derivation* d =
@@ -62,16 +65,17 @@ struct KBestGetter : public DecoderObserver
       if (!d) break;
       kb.sents.push_back( d->yield);
       kb.feats.push_back( d->feature_values );
-      kb.scores.push_back( d->score );
+      kb.model_scores.push_back( d->score );
     }
   }
 
   void
   KBestNoFilter( const Hypergraph& forest )
   {
-    kb.scores.clear();
     kb.sents.clear();
     kb.feats.clear();
+    kb.model_scores.clear();
+    kb.scores.clear();
     KBest::KBestDerivations<vector<WordID>, ESentenceTraversal> kbest( forest, k_ );
     for ( size_t i = 0; i < k_; ++i ) {
       const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal>::Derivation* d =
@@ -79,7 +83,7 @@ struct KBestGetter : public DecoderObserver
       if (!d) break;
       kb.sents.push_back( d->yield);
       kb.feats.push_back( d->feature_values );
-      kb.scores.push_back( d->score );
+      kb.model_scores.push_back( d->score );
     }
   }
 };
diff --git a/dtrain/run.sh b/dtrain/run.sh
index cdaea067..b2012bcf 100755
--- a/dtrain/run.sh
+++ b/dtrain/run.sh
@@ -1,8 +1,10 @@
 #!/bin/sh
 
-INI=test/blunsom08.dtrain.ini
-#INI=test/nc-wmt11/nc-wmt11.loo.dtrain.ini
+#INI=test/blunsom08.dtrain.ini
+#INI=test/nc-wmt11/dtrain.ini
+#INI=test/EXAMPLE/dtrain.ini
+INI=test/toy.dtrain.ini
 
 rm /tmp/dtrain-*
-./dtrain -c $INI $1 $2 $3 $4 2>/dev/null
+./dtrain -c $INI $1 $2 $3 $4 
 
diff --git a/dtrain/sample.h b/dtrain/sample.h
new file mode 100644
index 00000000..b9bc4461
--- /dev/null
+++ b/dtrain/sample.h
@@ -0,0 +1,52 @@
+#include "kbestget.h"
+
+
+namespace dtrain
+{
+
+
+struct TPair
+{
+  double type;                 // -1 or 1, as assigned by sample_all()
+  SparseVector<double> first;  // feature vector of k-best entry i
+  SparseVector<double> second; // feature vector of k-best entry j (j > i)
+};
+
+typedef vector<TPair> TrainingInstances;
+
+
+// Appends one TPair to `training` for every pair (i,j), i<j, of k-best entries.
+// type is -1 if scores[i] - scores[j] < 0, otherwise 1.
+// Reads kb->scores[i] for every entry, so scores must be as long as sents.
+// inline: defined in a header, so several .cc files may include it without link errors.
+inline void
+sample_all( KBestList* kb, TrainingInstances &training )
+{
+  const size_t n = kb->GetSize();
+  // `i+1 < n` rather than `i < n-1`: n is unsigned, so n-1 wraps around for an empty list
+  for ( size_t i = 0; i+1 < n; i++ ) {
+    for ( size_t j = i+1; j < n; j++ ) {
+      TPair p;
+      p.type = ( kb->scores[i] - kb->scores[j] < 0 ) ? -1 : 1;
+      p.first = kb->feats[i];
+      p.second = kb->feats[j];
+      training.push_back( p );
+    }
+  }
+}
+
+/*void
+sample_all_only_neg(, vector<pair<SparSparseVector<double> > pairs)
+{
+
+}
+
+void
+sample_random_pos()
+{
+  if ( rand() % 2 ) { // sample it?
+}*/
+
+
+} // namespace
+
diff --git a/dtrain/score.h b/dtrain/score.h
index 4314157b..e88387c5 100644
--- a/dtrain/score.h
+++ b/dtrain/score.h
@@ -17,22 +17,6 @@ namespace dtrain
 {
 
 
-/*
- * ScorePair
- *
- */
-struct ScorePair
-{
-  ScorePair(double modelscore, double score) : modelscore_(modelscore), score_(score) {} 
-  double modelscore_, score_;
-  double GetModelScore() { return modelscore_; }
-  double GetScore() { return score_; }
-};
-
-
-typedef vector<ScorePair> Scores;
-
-
 /*
  * NgramCounts
  *
diff --git a/dtrain/test/EXAMPLE/cdec.ini b/dtrain/test/EXAMPLE/cdec.ini
index b6e92b5f..e57138b0 100644
--- a/dtrain/test/EXAMPLE/cdec.ini
+++ b/dtrain/test/EXAMPLE/cdec.ini
@@ -2,5 +2,6 @@ formalism=scfg
 add_pass_through_rules=true
 feature_function=WordPenalty
 cubepruning_pop_limit=30
+feature_function=KLanguageModel data/nc-wmt11.en.srilm.gz
 scfg_max_span_limit=15
 
diff --git a/dtrain/test/EXAMPLE/dtrain.ini b/dtrain/test/EXAMPLE/dtrain.ini
index 1467b332..ffafd0b8 100644
--- a/dtrain/test/EXAMPLE/dtrain.ini
+++ b/dtrain/test/EXAMPLE/dtrain.ini
@@ -1,10 +1,10 @@
 decoder_config=test/EXAMPLE/cdec.ini
 kbest=100
 ngrams=3
-epochs=22
+epochs=8
 input=test/EXAMPLE/dtrain.nc-1k
 scorer=approx_bleu
 output=test/EXAMPLE/weights.gz
-stop_after=5
-wprint=Glue WordPenalty LanguageModel LanguageModel_OOV PhraseModel_0 PhraseModel_1 PhraseModel_2 PhraseModel_3 PhraseModel_4
+stop_after=1000
+wprint=Glue WordPenalty LanguageModel LanguageModel_OOV PhraseModel_0 PhraseModel_1 PhraseModel_2 PhraseModel_3 PhraseModel_4 PassThrough
 
diff --git a/dtrain/test/log_reg/bin_class.cc b/dtrain/test/log_reg/bin_class.cc
new file mode 100644
index 00000000..19bcde25
--- /dev/null
+++ b/dtrain/test/log_reg/bin_class.cc
@@ -0,0 +1,4 @@
+#include "bin_class.h"
+
+Objective::~Objective() {}  // out-of-line definition of the virtual destructor declared in bin_class.h
+
diff --git a/dtrain/test/log_reg/bin_class.h b/dtrain/test/log_reg/bin_class.h
new file mode 100644
index 00000000..3466109a
--- /dev/null
+++ b/dtrain/test/log_reg/bin_class.h
@@ -0,0 +1,22 @@
+#ifndef _BIN_CLASS_H_
+#define _BIN_CLASS_H_
+
+#include <vector>
+#include "sparse_vector.h"
+
+struct TrainingInstance {
+  // TODO add other info? loss for MIRA-type updates?
+  SparseVector<double> x_feature_map;  // feature vector of this instance
+  bool y;                              // binary label (true / false class)
+};
+
+struct Objective {
+  virtual ~Objective();
+
+  // returns f(x); f'(x) is passed back through g (LogisticRegression adds it to *g)
+  virtual double ObjectiveAndGradient(const SparseVector<double>& x,
+                  const std::vector<TrainingInstance>& training_instances,
+                  SparseVector<double>* g) const = 0;
+};
+
+#endif
diff --git a/dtrain/test/log_reg/log_reg.cc b/dtrain/test/log_reg/log_reg.cc
new file mode 100644
index 00000000..ec2331fe
--- /dev/null
+++ b/dtrain/test/log_reg/log_reg.cc
@@ -0,0 +1,39 @@
+#include "log_reg.h"
+
+#include <vector>
+#include <cmath>
+
+#include "sparse_vector.h"
+
+using namespace std;
+
+// Returns the negative conditional log-likelihood under weights x; adds its gradient to *g.
+double LogisticRegression::ObjectiveAndGradient(const SparseVector<double>& x,
+                              const vector<TrainingInstance>& training_instances,
+                              SparseVector<double>* g) const {
+  double cll = 0;
+  for (size_t i = 0; i < training_instances.size(); ++i) {  // size_t matches size()'s unsigned type
+    const double dotprod = training_instances[i].x_feature_map.dot(x); // TODO no bias, if bias, add x[0]
+    double lp_false = dotprod;
+    double lp_true = -dotprod;
+    if (0 < lp_true) {  // both branches only ever call exp() on a non-positive value
+      lp_true += log1p(exp(-lp_true));
+      lp_false = log1p(exp(lp_false));
+    } else {
+      lp_true = log1p(exp(lp_true));
+      lp_false += log1p(exp(-lp_false));
+    }
+    lp_true *= -1;   // log P(y = true | x)
+    lp_false *= -1;  // log P(y = false | x)
+    if (training_instances[i].y) {  // true label
+      cll -= lp_true;
+      (*g) -= training_instances[i].x_feature_map * exp(lp_false);
+      // (*g)[0] -= exp(lp_false); // bias
+    } else {                  // false label
+      cll -= lp_false;
+      (*g) += training_instances[i].x_feature_map * exp(lp_true);
+    }
+  }
+  return cll;
+}
+
diff --git a/dtrain/test/log_reg/log_reg.h b/dtrain/test/log_reg/log_reg.h
new file mode 100644
index 00000000..ecc560b8
--- /dev/null
+++ b/dtrain/test/log_reg/log_reg.h
@@ -0,0 +1,14 @@
+#ifndef _LOG_REG_H_
+#define _LOG_REG_H_
+
+#include <vector>
+#include "sparse_vector.h"
+#include "bin_class.h"
+
+struct LogisticRegression : public Objective {  // Objective implementation; method body is in log_reg.cc
+  double ObjectiveAndGradient(const SparseVector<double>& x,
+                              const std::vector<TrainingInstance>& training_instances,
+                              SparseVector<double>* g) const;
+};
+
+#endif
diff --git a/dtrain/test/nc-wmt11/dtrain.ini b/dtrain/test/nc-wmt11/dtrain.ini
index 51033f2d..ddbf5da7 100644
--- a/dtrain/test/nc-wmt11/dtrain.ini
+++ b/dtrain/test/nc-wmt11/dtrain.ini
@@ -2,7 +2,7 @@ decoder_config=test/nc-wmt11/cdec.ini
 kbest=100
 ngrams=3
 epochs=8
-input=data/nc-wmt11.loo.localf.p0.500.rule-id #nc-wmt11-de-en-dyer-cs-joshua.tok.lc.fixamp1.loo.psg.dtrain.localf.p0
+input=data/nc-wmt11.loo.localf.p0.500.rule-id
 scorer=approx_bleu
 output=data/w/nc-wmt11.loo.p0.weights.gz
 #stop_after=100
diff --git a/dtrain/test/toy.dtrain.ini b/dtrain/test/toy.dtrain.ini
index cacb3a2c..35f76281 100644
--- a/dtrain/test/toy.dtrain.ini
+++ b/dtrain/test/toy.dtrain.ini
@@ -2,8 +2,9 @@ decoder_config=test/cdec.ini
 kbest=4
 ngrams=1
 epochs=3
-input=data/in.toy
+input=test/toy.in
 scorer=bleu
 output=toy.gz
 #stop_after=1000
+wprint=logp use_shell use_house PassThrough
 
diff --git a/dtrain/test/toy.in b/dtrain/test/toy.in
index 63f97158..989a1f77 100644
--- a/dtrain/test/toy.in
+++ b/dtrain/test/toy.in
@@ -1,2 +1,2 @@
-0	ich sah ein kleines haus	i saw a little house	[S] ||| [NP,1] [VP,2] ||| [1] [2] ||| logp=0 __NEXT_RULE__ [NP] ||| ich ||| i ||| logp=0 __NEXT_RULE__ [NP] ||| ein [NN,1] ||| a [1] ||| logp=0 __NEXT_RULE__ [NN] ||| [JJ,1] haus ||| [1] house ||| logp=0 use_house=1 __NEXT_RULE__ [NN] ||| [JJ,1] haus ||| [1] shell ||| logp=0 use_shell=1 __NEXT_RULE__ [JJ] ||| kleines ||| small ||| logp=0 __NEXT_RULE__ [JJ] ||| kleines ||| little ||| logp=0 __NEXT_RULE__ [JJ] ||| grosses ||| big ||| logp=0 __NEXT_RULE__ [JJ] ||| grosses ||| large ||| logp=0 __NEXT_RULE__ [VP] ||| [V,1] [NP,2] ||| [1] [2] ||| logp=0 __NEXT_RULE__ [V] ||| sah ||| saw ||| logp=0 __NEXT_RULE__ [V] ||| fand ||| found ||| logp=0
-1	ich fand ein grosses haus	i found a large house	[S] ||| [NP,1] [VP,2] ||| [1] [2] ||| logp=0 __NEXT_RULE__ [NP] ||| ich ||| i ||| logp=0 __NEXT_RULE__ [NP] ||| ein [NN,1] ||| a [1] ||| logp=0 __NEXT_RULE__ [NN] ||| [JJ,1] haus ||| [1] house ||| logp=0 use_house=1 __NEXT_RULE__ [NN] ||| [JJ,1] haus ||| [1] shell ||| logp=0 use_shell=1 __NEXT_RULE__ [JJ] ||| kleines ||| small ||| logp=0 __NEXT_RULE__ [JJ] ||| kleines ||| little ||| logp=0 __NEXT_RULE__ [JJ] ||| grosses ||| big ||| logp=0 __NEXT_RULE__ [JJ] ||| grosses ||| large ||| logp=0 __NEXT_RULE__ [VP] ||| [V,1] [NP,2] ||| [1] [2] ||| logp=0 __NEXT_RULE__ [V] ||| sah ||| saw ||| logp=0 __NEXT_RULE__ [V] ||| fand ||| found ||| logp=0
+0	ich sah ein kleines haus	i saw a little house	[S] ||| [NP,1] [VP,2] ||| [1] [2] ||| logp=0 __NEXT__RULE__ [NP] ||| ich ||| i ||| logp=0 __NEXT__RULE__ [NP] ||| ein [NN,1] ||| a [1] ||| logp=0 __NEXT__RULE__ [NN] ||| [JJ,1] haus ||| [1] house ||| logp=0 use_house=1 __NEXT__RULE__ [NN] ||| [JJ,1] haus ||| [1] shell ||| logp=0 use_shell=1 __NEXT__RULE__ [JJ] ||| kleines ||| small ||| logp=0 __NEXT__RULE__ [JJ] ||| kleines ||| little ||| logp=0 __NEXT__RULE__ [JJ] ||| grosses ||| big ||| logp=0 __NEXT__RULE__ [JJ] ||| grosses ||| large ||| logp=0 __NEXT__RULE__ [VP] ||| [V,1] [NP,2] ||| [1] [2] ||| logp=0 __NEXT__RULE__ [V] ||| sah ||| saw ||| logp=0 __NEXT__RULE__ [V] ||| fand ||| found ||| logp=0
+1	ich fand ein grosses haus	i found a large house	[S] ||| [NP,1] [VP,2] ||| [1] [2] ||| logp=0 __NEXT__RULE__ [NP] ||| ich ||| i ||| logp=0 __NEXT__RULE__ [NP] ||| ein [NN,1] ||| a [1] ||| logp=0 __NEXT__RULE__ [NN] ||| [JJ,1] haus ||| [1] house ||| logp=0 use_house=1 __NEXT__RULE__ [NN] ||| [JJ,1] haus ||| [1] shell ||| logp=0 use_shell=1 __NEXT__RULE__ [JJ] ||| kleines ||| small ||| logp=0 __NEXT__RULE__ [JJ] ||| kleines ||| little ||| logp=0 __NEXT__RULE__ [JJ] ||| grosses ||| big ||| logp=0 __NEXT__RULE__ [JJ] ||| grosses ||| large ||| logp=0 __NEXT__RULE__ [VP] ||| [V,1] [NP,2] ||| [1] [2] ||| logp=0 __NEXT__RULE__ [V] ||| sah ||| saw ||| logp=0 __NEXT__RULE__ [V] ||| fand ||| found ||| logp=0
diff --git a/dtrain/test/toy_cdec/cdec.ini b/dtrain/test/toy_cdec/cdec.ini
new file mode 100644
index 00000000..3a6bab68
--- /dev/null
+++ b/dtrain/test/toy_cdec/cdec.ini
@@ -0,0 +1,3 @@
+formalism=scfg
+grammar=../dtrain/test/toy_cdec/grammar
+add_pass_through_rules=true
diff --git a/dtrain/test/toy_cdec/grammar b/dtrain/test/toy_cdec/grammar
new file mode 100644
index 00000000..aeed75ef
--- /dev/null
+++ b/dtrain/test/toy_cdec/grammar
@@ -0,0 +1,12 @@
+[S] ||| [NP,1] [VP,2] ||| [1] [2] ||| logp=0
+[NP] ||| ich ||| i ||| logp=0
+[NP] ||| ein [NN,1] ||| a [1] ||| logp=0
+[NN] ||| [JJ,1] haus ||| [1] house ||| logp=0 use_house=1
+[NN] ||| [JJ,1] haus ||| [1] shell ||| logp=0 use_shell=1
+[JJ] ||| kleines ||| small ||| logp=0
+[JJ] ||| kleines ||| little ||| logp=0
+[JJ] ||| grosses ||| big ||| logp=0
+[JJ] ||| grosses ||| large ||| logp=0
+[VP] ||| [V,1] [NP,2] ||| [1] [2] ||| logp=0
+[V] ||| sah ||| saw ||| logp=0
+[V] ||| fand ||| found ||| logp=0
diff --git a/dtrain/test/toy_cdec/in b/dtrain/test/toy_cdec/in
new file mode 100644
index 00000000..e6df9275
--- /dev/null
+++ b/dtrain/test/toy_cdec/in
@@ -0,0 +1 @@
+ich sah ein kleines haus
diff --git a/dtrain/test/toy_cdec/weights b/dtrain/test/toy_cdec/weights
new file mode 100644
index 00000000..10d7ed83
--- /dev/null
+++ b/dtrain/test/toy_cdec/weights
@@ -0,0 +1,2 @@
+logp 1
+use_shell 1
diff --git a/utils/fast_sparse_vector.h b/utils/fast_sparse_vector.h
index 4aae2039..1301581a 100644
--- a/utils/fast_sparse_vector.h
+++ b/utils/fast_sparse_vector.h
@@ -7,6 +7,8 @@
 // important: indexes are integers
 // important: iterators may return elements in any order
 
+#include "config.h"
+
 #include <cmath>
 #include <cstring>
 #include <climits>
@@ -16,6 +18,12 @@
 
 #include <boost/static_assert.hpp>
 
+#if HAVE_BOOST_ARCHIVE_TEXT_OARCHIVE_HPP
+#include <boost/serialization/map.hpp>
+#endif
+
+#include "fdict.h"
+
 // this is architecture dependent, it should be
 // detected in some way but it's probably easiest (for me)
 // to just set it
@@ -235,6 +243,13 @@ class FastSparseVector {
     }
     return *this;
   }
+  // Returns a copy that keeps only the entries whose absolute value exceeds EPSILON.
+  FastSparseVector<T> erase_zeros(const T& EPSILON = 1e-4) const {
+    FastSparseVector<T> kept;
+    for (const_iterator cur = begin(); cur != end(); ++cur)
+      if (fabs(cur->second) > EPSILON) kept.set_value(cur->first, cur->second);
+    return kept;
+  }
   const_iterator begin() const {
     return const_iterator(*this, false);
   }
@@ -327,8 +342,45 @@ class FastSparseVector {
   } data_;
   unsigned char local_size_;
   bool is_remote_;
+
+#if HAVE_BOOST_ARCHIVE_TEXT_OARCHIVE_HPP
+ private:
+  friend class boost::serialization::access;
+  template<class Archive>
+  void save(Archive & ar, const unsigned int version) const {  // writes the entry count, then one (string, value) pair per entry
+    (void) version;  // parameter is not used
+    int eff_size = size();
+    const_iterator it = this->begin();
+    if (eff_size > 0) {
+      // 0 index is reserved as empty
+      if (it->first == 0) { ++it; --eff_size; }  // NOTE(review): index 0 is skipped only if it is the first entry iterated - confirm ordering
+    }
+    ar & eff_size;
+    while (it != this->end()) {
+      const std::pair<const std::string&, const T&> wire_pair(FD::Convert(it->first), it->second);  // index -> string via FD::Convert
+      ar & wire_pair;
+      ++it;
+    }
+  }
+  template<class Archive>
+  void load(Archive & ar, const unsigned int version) {  // counterpart of save(): clears this vector, then reads sz pairs
+    (void) version;  // parameter is not used
+    this->clear();
+    int sz; ar & sz;  // number of (string, value) pairs that follow
+    for (int i = 0; i < sz; ++i) {
+      std::pair<std::string, T> wire_pair;
+      ar & wire_pair;
+      this->set_value(FD::Convert(wire_pair.first), wire_pair.second);  // string -> index via FD::Convert
+    }
+  }
+  BOOST_SERIALIZATION_SPLIT_MEMBER()
+#endif
 };
 
+#if HAVE_BOOST_ARCHIVE_TEXT_OARCHIVE_HPP
+BOOST_CLASS_TRACKING(FastSparseVector<double>,track_never)
+#endif
+
 template <typename T>
 const FastSparseVector<T> operator+(const FastSparseVector<T>& x, const FastSparseVector<T>& y) {
   if (x.size() > y.size()) {
@@ -344,15 +396,9 @@ const FastSparseVector<T> operator+(const FastSparseVector<T>& x, const FastSpar
 
 template <typename T>
 const FastSparseVector<T> operator-(const FastSparseVector<T>& x, const FastSparseVector<T>& y) {
-  if (x.size() > y.size()) {
-    FastSparseVector<T> res(x);
-    res -= y;
-    return res;
-  } else {
-    FastSparseVector<T> res(y);
-    res -= x;
-    return res;
-  }
+  FastSparseVector<T> res(x);
+  res -= y;
+  return res;
 }
 
 template <class T>
-- 
cgit v1.2.3