From 574e2336348e5d3960b3232209d01845b40e6ea8 Mon Sep 17 00:00:00 2001
From: Patrick Simianer <p@simianer.de>
Date: Mon, 21 Nov 2011 12:21:08 +0100
Subject: added pro stuff,clean up

---
 dtrain/README.md                      | 125 +++++++++++++++++++++++++---------
 dtrain/dtrain.cc                      | 107 +++++++++++++++++------------
 dtrain/dtrain.h                       |   4 +-
 dtrain/hstreaming/cdec.ini            |   3 +-
 dtrain/hstreaming/dtrain.sh           |   2 +-
 dtrain/kbestget.h                     |   2 +-
 dtrain/pairsampling.h                 |  55 +++++++++++++--
 dtrain/test/example/cdec.ini          |   5 +-
 dtrain/test/example/dtrain.ini        |  24 ++++---
 dtrain/test/example/nc-1k-tabs.gz     | Bin 21185883 -> 0 bytes
 dtrain/test/example/nc-1k.gz          | Bin 21474865 -> 0 bytes
 dtrain/test/example/nc-wmt11.1k.gz    | Bin 0 -> 21185883 bytes
 dtrain/test/log_reg_dyer/bin_class.cc |   4 --
 dtrain/test/log_reg_dyer/bin_class.h  |  22 ------
 dtrain/test/log_reg_dyer/log_reg.cc   |  39 -----------
 dtrain/test/log_reg_dyer/log_reg.h    |  14 ----
 dtrain/test/logreg_cd/bin_class.cc    |   4 ++
 dtrain/test/logreg_cd/bin_class.h     |  22 ++++++
 dtrain/test/logreg_cd/log_reg.cc      |  39 +++++++++++
 dtrain/test/logreg_cd/log_reg.h       |  14 ++++
 dtrain/test/toy/dtrain.ini            |   4 +-
 21 files changed, 308 insertions(+), 181 deletions(-)
 delete mode 100644 dtrain/test/example/nc-1k-tabs.gz
 delete mode 100644 dtrain/test/example/nc-1k.gz
 create mode 100644 dtrain/test/example/nc-wmt11.1k.gz
 delete mode 100644 dtrain/test/log_reg_dyer/bin_class.cc
 delete mode 100644 dtrain/test/log_reg_dyer/bin_class.h
 delete mode 100644 dtrain/test/log_reg_dyer/log_reg.cc
 delete mode 100644 dtrain/test/log_reg_dyer/log_reg.h
 create mode 100644 dtrain/test/logreg_cd/bin_class.cc
 create mode 100644 dtrain/test/logreg_cd/bin_class.h
 create mode 100644 dtrain/test/logreg_cd/log_reg.cc
 create mode 100644 dtrain/test/logreg_cd/log_reg.h

(limited to 'dtrain')
diff --git a/dtrain/README.md b/dtrain/README.md
index 46f783b0..c50f3cad 100644
--- a/dtrain/README.md
+++ b/dtrain/README.md
@@ -23,67 +23,60 @@ Ideas
 -----
 * *MULTIPARTITE* ranking (1 vs rest, cluster model/score)
 * *REMEMBER* sampled translations (merge kbest lists)
-* *SELECT* iteration with highest real BLEU on devtest?
-* *GENERATED* data? (perfect translation always in kbest)
+* *SELECT* iteration with highest _real_ BLEU on devtest?
+* *SYNTHETIC* data? (perfect translation always in kbest)
 * *CACHE* ngrams for scoring
-* hadoop *PIPES* imlementation
+* hadoop *PIPES* implementation
 * *ITERATION* variants (shuffle resulting weights, re-iterate)
-* *MORE THAN ONE* reference for BLEU?
-* *RANDOM RESTARTS* or directions
+* *MORE THAN ONE* reference for BLEU, paraphrases?
+* *RANDOM RESTARTS* or random directions
 * use separate *TEST SET* for each shard
 * *REDUCE* training set (50k?)
 * *SYNTAX* features (CD)
 * distribute *DEV* set to all nodes, avg
-* *PARAPHRASES* for better approx BLEU?
 
-
-Uncertain, known bugs, problems
+Notes
 -------------------------------
 * cdec kbest vs 1best (no -k param), rescoring (ref?)? => ok(?)
-* no sparse vector in decoder => ok/fixed
-* PhraseModel features, mapping?
+* no sparse vector in decoder => fixed/'ok'
+* PhraseModel features 0..99, mapping?
 * flex scanner jams on bad input, we could skip that
-* input/grammar caching (strings -> WordIDs)
-* look at forest sampling...
-* devtest loo or not? why loo grammars larger? (sort psgs | uniq -> grammar)
+* input/grammar caching (vector<string> -> vector<WordID>)
+* why loo grammars larger? are they? (sort psgs | uniq -> grammar)
 * lower beam size to be faster?
 * why is <unk> -100 in lm so good?
 * noise helps for discriminative training?
 * what does srilm do with -unk but nothing mapped to unk (<unk> unigram)?
   => this: http://www-speech.sri.com/pipermail/srilm-user/2007q4/000543.html
-* mira translation sampling? => done
-* does AER correlate with BLEU?
-
-random notes
-------------
-* learning rate tuned with perceptron
-* aer correlation with bleu?
-* dtrain (perc) used for some tests because no optimizer instability
+* does AER correlate with BLEU? paper?
+* learning rate tuned with perceptron?
+* dtrain (perceptron) used for some tests because no optimizer instability
 * http://www.ark.cs.cmu.edu/cdyer/dtrain/
 * repeat as often as max needed by any learner!
-* don't compare lms with diff vocab (stupid backoff paper)
-* what does mira/pro optimize?
-* early stopping
-* 10-20k rules per sent normal
-* shard size 500 -> 2k
-* giza vs. berkeleyaligner: giza less noise?
+* don't compare lms (perplex.) with diff vocab (see stupid backoff paper)
+* what does mira/pro optimize exactly?
+* early stopping (epsilon, no change in kbest list)
+* 10-20k rules per sent are normal
+* giza vs. berkeleyaligner: giza more/less noise?
 * compound splitting -> more rules?
-* loo => ref can't be reached? (jackknifing)
+* loo (jackknifing) => ref can't be reached?
 * prune singletons -> less noise? (do I do this?)
-* random sample: take 100 at random
+* random sample: take fixed X at random
+* scale of features/weights?
 
-features
+Features
 --------
 * baseline features (take whatever cdec implements for VEST)
 * rule identifiers (feature name = rule as string)
 * rule discounts (taken from frequency i or frequency interval [i,j] of rule in extraction from parallel training data) bins
+  => from PRO
 * target ngrams (from nonterminals in rule rhs), with gaps?
 * source-target unigrams (from word alignments used in rule extraction, if they are?)
 * lhs, rhs, rule length features
 * all other features depend on syntax annotation. 
 * word alignment
 
-FIXME, todo
+Todo
 -----------
 * merge dtrain part-X files, for better blocks (how to do this with 4.5tb ep)
 * mapred count shard sents
@@ -114,7 +107,6 @@ FIXME, todo
 * sample pairs like in pro
 * mira forest sampling
 
-
 Data
 ----
 <pre>
@@ -274,3 +266,72 @@ loo vs non-loo? => generalization
  train on dev, test on devtest
  train on devtest, test on dev
  as above ^^^
+
+
+ ---
+
+as PRO
+ - UPDATES:       perceptron
+ - LEARNING RATE: 0.0005
+ - GAMMA:         -
+ - #ITERATIONS:   30
+ - SCORER:        stupid_bleu@4
+ - K:             100, 1500?(top X pairs)
+ - SAMPLE:        kbest uniq, kbest no
+ - PAIR SAMPLING: all, PRO?TODO
+ - SELECT:        best
+ - FEATURES:      baseline, RuleShape+SpanFeatures
+ ---
+ - Note: no weight interpolation
+         no early stopping based on kbest lists (epsilon?TODO)
+
+dtrain tune reg 
+ - updates: SVM
+ - pair sampling important!
+ - learning_rate= 100 50 10 5 1 0.5 0.1 0.05 0.01 0.005 0.001 0.0005 0.0001 0.00005 0.00001 0.000005 0.000001 0.0000005 0.0000001 0.0000000001
+   
+ - gamma=
+   
+ - scorer: stupid_bleu 3
+ - test weights: last
+ -
+ -
+ - test: devtest
+
+
+---
+weights visualization (blocks, color coded)
+zig zag!?
+repeat all basic exps with training set
+merge?
+
+
+
+
+--sample_from
+--k
+--filter
+--pair_sampling
+--N
+--epochs
+--scorer
+--learning_rate
+--gamma
+--select_weights
+[--unit_wv]
+[--l1_reg]
+[--l1_reg_strength]
+
+---------
+corr best = really best?
+108010gaps
+
+coltrane:  9
+gillespie: 9
+staley:    2
+io:        6
+ioh:       4
+         slots
+
+
+when does overfitting begin?
diff --git a/dtrain/dtrain.cc b/dtrain/dtrain.cc
index 0853173f..3d3aa2d3 100644
--- a/dtrain/dtrain.cc
+++ b/dtrain/dtrain.cc
@@ -6,32 +6,33 @@ dtrain_init(int argc, char** argv, po::variables_map* cfg)
 {
   po::options_description ini("Configuration File Options");
   ini.add_options()
-    ("input",          po::value<string>()->default_value("-"),                                                "input file")
-    ("output",         po::value<string>()->default_value("-"),                       "output weights file, '-' for STDOUT")
-    ("input_weights",  po::value<string>(),                             "input weights file (e.g. from previous iteration)")
-    ("decoder_config", po::value<string>(),                                                   "configuration file for cdec")
-    ("sample_from",    po::value<string>()->default_value("kbest"),      "where to sample translations from: kbest, forest")
-    ("k",              po::value<unsigned>()->default_value(100),                         "how many translations to sample")
-    ("filter",         po::value<string>()->default_value("unique"),                        "filter kbest list: no, unique")
-    ("pair_sampling",  po::value<string>()->default_value("all"),                  "how to sample pairs: all, rand, 108010")
-    ("N",              po::value<unsigned>()->default_value(3),                                       "N for Ngrams (BLEU)")
-    ("epochs",         po::value<unsigned>()->default_value(2),                             "# of iterations T (per shard)") 
-    ("scorer",         po::value<string>()->default_value("stupid_bleu"),     "scoring: bleu, stupid_*, smooth_*, approx_*")
-    ("stop_after",     po::value<unsigned>()->default_value(0),                              "stop after X input sentences")
-    ("print_weights",  po::value<string>(),                                            "weights to print on each iteration")
-    ("hstreaming",     po::value<string>(),                                "run in hadoop streaming mode, arg is a task id")
-    ("learning_rate",  po::value<weight_t>()->default_value(0.0005),                                        "learning rate")
-    ("gamma",          po::value<weight_t>()->default_value(0),                          "gamma for SVM (0 for perceptron)")
-    ("tmp",            po::value<string>()->default_value("/tmp"),                                        "temp dir to use")
-    ("select_weights", po::value<string>()->default_value("last"), "output 'best' or 'last' weights ('VOID' to throw away)")
-    ("keep_w",         po::value<bool>()->zero_tokens(),                              "protocol weights for each iteration")
-    ("unit_weight_vector", po::value<bool>()->zero_tokens(),                       "Rescale weight vector after each input")
-    ("l1_reg",         po::value<string>()->default_value("no"),         "apply l1 regularization as in Tsuroka et al 2010")
-    ("l1_reg_strength", po::value<weight_t>(),                                                 "l1 regularization strength")
+    ("input",           po::value<string>()->default_value("-"),                                                "input file")
+    ("output",          po::value<string>()->default_value("-"),                       "output weights file, '-' for STDOUT")
+    ("input_weights",   po::value<string>(),                             "input weights file (e.g. from previous iteration)")
+    ("decoder_config",  po::value<string>(),                                                   "configuration file for cdec")
+    ("sample_from",     po::value<string>()->default_value("kbest"),      "where to sample translations from: kbest, forest")
+    ("k",               po::value<unsigned>()->default_value(100),                         "how many translations to sample")
+    ("filter",          po::value<string>()->default_value("uniq"),                            "filter kbest list: no, uniq")
+    ("pair_sampling",   po::value<string>()->default_value("all"),             "how to sample pairs: all, 5050, 108010, PRO")
+    ("N",               po::value<unsigned>()->default_value(3),                                       "N for Ngrams (BLEU)")
+    ("epochs",          po::value<unsigned>()->default_value(2),                             "# of iterations T (per shard)") 
+    ("scorer",          po::value<string>()->default_value("stupid_bleu"),     "scoring: bleu, stupid_*, smooth_*, approx_*")
+    ("learning_rate",   po::value<weight_t>()->default_value(0.0005),                                        "learning rate")
+    ("gamma",           po::value<weight_t>()->default_value(0),                          "gamma for SVM (0 for perceptron)")
+    ("select_weights",  po::value<string>()->default_value("last"), "output 'best' or 'last' weights ('VOID' to throw away)")
+    ("unit_wv",         po::value<bool>()->zero_tokens(),                           "Rescale weight vector after each input")
+    ("l1_reg",          po::value<string>()->default_value("no"),         "apply l1 regularization as in Tsuroka et al 2010")
+    ("l1_reg_strength", po::value<weight_t>(),                                                  "l1 regularization strength")
+    ("update_ok",       po::value<bool>()->zero_tokens(),                      "include correctly ranked pairs into updates")
+    ("stop_after",      po::value<unsigned>()->default_value(0),                              "stop after X input sentences")
+    ("keep_w",          po::value<bool>()->zero_tokens(),                            "keep weights files for each iteration")
+    ("print_weights",   po::value<string>(),                                            "weights to print on each iteration")
+    ("hstreaming",      po::value<string>(),                                "run in hadoop streaming mode, arg is a task id")
+    ("tmp",             po::value<string>()->default_value("/tmp"),                                        "temp dir to use")
 #ifdef DTRAIN_LOCAL
-    ("refs,r",         po::value<string>(),                                                      "references in local mode")
+    ("refs,r",         po::value<string>(),                                                       "references in local mode")
 #endif
-    ("noup",           po::value<bool>()->zero_tokens(),                                            "do not update weights");
+    ("noup",           po::value<bool>()->zero_tokens(),                                             "do not update weights");
   po::options_description cl("Command Line Options");
   cl.add_options()
     ("config,c",         po::value<string>(),              "dtrain config file")
@@ -63,13 +64,14 @@ dtrain_init(int argc, char** argv, po::variables_map* cfg)
     cerr << "Wrong 'sample_from' param: '" << (*cfg)["sample_from"].as<string>() << "', use 'kbest' or 'forest'." << endl;
     return false;
   }
-  if ((*cfg)["sample_from"].as<string>() == "kbest" && (*cfg)["filter"].as<string>() != "unique"
+  if ((*cfg)["sample_from"].as<string>() == "kbest" && (*cfg)["filter"].as<string>() != "uniq"
        && (*cfg)["filter"].as<string>() != "no") {
-    cerr << "Wrong 'filter' param: '" << (*cfg)["filter"].as<string>() << "', use 'unique' or 'no'." << endl;
+    cerr << "Wrong 'filter' param: '" << (*cfg)["filter"].as<string>() << "', use 'uniq' or 'no'." << endl;
     return false;
   }
   if ((*cfg)["pair_sampling"].as<string>() != "all"
-       && (*cfg)["pair_sampling"].as<string>() != "rand" && (*cfg)["pair_sampling"].as<string>() != "108010") {
+       && (*cfg)["pair_sampling"].as<string>() != "5050" && (*cfg)["pair_sampling"].as<string>() != "108010"
+       && (*cfg)["pair_sampling"].as<string>() != "PRO") {
-    cerr << "Wrong 'pair_sampling' param: '" << (*cfg)["pair_sampling"].as<string>() << "', use 'all' or 'rand'." << endl;
+    cerr << "Wrong 'pair_sampling' param: '" << (*cfg)["pair_sampling"].as<string>() << "', use 'all', '5050', '108010' or 'PRO'." << endl;
     return false;
   }
@@ -101,11 +103,14 @@ main(int argc, char** argv)
     task_id = cfg["hstreaming"].as<string>();
     cerr.precision(17);
   }
-  bool unit_weight_vector = false;
-  if (cfg.count("unit_weight_vector")) unit_weight_vector = true;
+  bool unit_wv = false;
+  if (cfg.count("unit_wv")) unit_wv = true;
   HSReporter rep(task_id);
   bool keep_w = false;
   if (cfg.count("keep_w")) keep_w = true;
+  bool update_ok = false;
+  if (cfg.count("update_ok"))
+    update_ok = true;
 
   const unsigned k = cfg["k"].as<unsigned>();
   const unsigned N = cfg["N"].as<unsigned>(); 
@@ -118,7 +123,7 @@ main(int argc, char** argv)
   vector<string> print_weights;
   if (cfg.count("print_weights"))
     boost::split(print_weights, cfg["print_weights"].as<string>(), boost::is_any_of(" "));
-  
+
   // setup decoder
   register_feature_functions();
   SetSilent(true);
@@ -187,7 +192,7 @@ main(int argc, char** argv)
   vector<vector<WordID> > ref_ids_buf; // references as WordID vecs
   // where temp files go
   string tmp_path = cfg["tmp"].as<string>();
-  vector<string> w_tmp_files; // used for protocol_w
+  vector<string> w_tmp_files; // used for keep_w 
 #ifdef DTRAIN_LOCAL
   string refs_fn = cfg["refs"].as<string>();
   ReadFile refs(refs_fn);
@@ -226,6 +231,12 @@ main(int argc, char** argv)
     cerr << setw(25) << "sample from " << "'" << sample_from << "'" << endl;
     cerr << setw(25) << "pairs " << "'" << pair_sampling << "'" << endl;
     cerr << setw(25) << "select weights " << "'" << select_weights << "'" << endl;
+    if (cfg.count("l1_reg"))
+      cerr << setw(25) << "l1 reg " << l1_reg << " '" << cfg["l1_reg"].as<string>() << "'" << endl;
+    if (update_ok)
+      cerr << setw(25) << "up ok " << update_ok << endl;
+    if (unit_wv)
+      cerr << setw(25) << "unit weight vec " << unit_wv << endl;
     if (!verbose) cerr << "(a dot represents " << DTRAIN_DOTS << " lines of input)" << endl;
   }
 
@@ -320,7 +331,7 @@ main(int argc, char** argv)
       // get buffered grammar
       string grammar_str;
       while (true) {
-        string rule;  
+        string rule;
         getline(grammar_buf_in, rule);
         if (boost::starts_with(rule, DTRAIN_GRAMMAR_DELIM)) break;
         grammar_str += rule + "\n";
@@ -372,13 +383,15 @@ main(int argc, char** argv)
     if (!noup) {
       vector<pair<ScoredHyp,ScoredHyp> > pairs;
       if (pair_sampling == "all")
-        sample_all_pairs(samples, pairs);
-      if (pair_sampling == "rand")
-        sample_rand_pairs(samples, pairs, &rng);
+        all_pairs(samples, pairs);
+      if (pair_sampling == "5050")
+        rand_pairs_5050(samples, pairs, &rng);
       if (pair_sampling == "108010")
-        sample108010(samples, pairs);
+        multpart108010(samples, pairs);
+      if (pair_sampling == "PRO")
+        PROsampling(samples, pairs);
       npairs += pairs.size();
-       
+
       for (vector<pair<ScoredHyp,ScoredHyp> >::iterator it = pairs.begin();
            it != pairs.end(); it++) {
         score_t rank_error = it->second.score - it->first.score;
@@ -388,6 +401,11 @@ main(int argc, char** argv)
             SparseVector<weight_t> diff_vec = it->second.f - it->first.f;
             lambdas.plus_eq_v_times_s(diff_vec, eta);
             rank_errors++;
+          } else {
+            if (update_ok) {
+              SparseVector<weight_t> diff_vec = it->first.f - it->second.f;
+              lambdas.plus_eq_v_times_s(diff_vec, eta);
+            }
           }
           if (it->first.model - it->second.model < 1) margin_violations++;
         } else {
@@ -404,6 +422,8 @@ main(int argc, char** argv)
         }
       }
 
+      ////////
+      // TEST THIS
       // reset cumulative_penalties after 1 iter? 
       // do this only once per INPUT (not per pair)
       if (l1naive) {
@@ -439,8 +459,9 @@ main(int argc, char** argv)
         }
       }
     }
+    ////////
 
-    if (unit_weight_vector && sample_from == "forest") lambdas /= lambdas.l2norm();
+    if (unit_wv && sample_from == "forest") lambdas /= lambdas.l2norm();
     
     ++ii;
 
@@ -501,11 +522,11 @@ main(int argc, char** argv)
   }
 
   if (hstreaming) {
-    rep.update_counter("Score 1best avg #"+boost::lexical_cast<string>(t+1), (unsigned)(score_avg*_SCALE)); 
-    rep.update_counter("Model 1best avg #"+boost::lexical_cast<string>(t+1), (unsigned)(model_avg*_SCALE)); 
-    rep.update_counter("Pairs avg #"+boost::lexical_cast<string>(t+1), (unsigned)((npairs/(weight_t)in_sz)*_SCALE)); 
-    rep.update_counter("Rank errors avg #"+boost::lexical_cast<string>(t+1), (unsigned)((rank_errors/(weight_t)in_sz)*_SCALE)); 
-    rep.update_counter("Margin violations avg #"+boost::lexical_cast<string>(t+1), (unsigned)((margin_violations/(weight_t)in_sz)*_SCALE)); 
+    rep.update_counter("Score 1best avg #"+boost::lexical_cast<string>(t+1), (unsigned)(score_avg*DTRAIN_SCALE)); 
+    rep.update_counter("Model 1best avg #"+boost::lexical_cast<string>(t+1), (unsigned)(model_avg*DTRAIN_SCALE)); 
+    rep.update_counter("Pairs avg #"+boost::lexical_cast<string>(t+1), (unsigned)((npairs/(weight_t)in_sz)*DTRAIN_SCALE)); 
+    rep.update_counter("Rank errors avg #"+boost::lexical_cast<string>(t+1), (unsigned)((rank_errors/(weight_t)in_sz)*DTRAIN_SCALE)); 
+    rep.update_counter("Margin violations avg #"+boost::lexical_cast<string>(t+1), (unsigned)((margin_violations/(weight_t)in_sz)*DTRAIN_SCALE)); 
     unsigned nonz = (unsigned)lambdas.size_nonzero();
     rep.update_counter("Non zero feature count #"+boost::lexical_cast<string>(t+1), nonz); 
     rep.update_gcounter("Non zero feature count #"+boost::lexical_cast<string>(t+1), nonz);
diff --git a/dtrain/dtrain.h b/dtrain/dtrain.h
index f0d8fd45..3d76bd7f 100644
--- a/dtrain/dtrain.h
+++ b/dtrain/dtrain.h
@@ -1,8 +1,6 @@
 #ifndef _DTRAIN_COMMON_H_
 #define _DTRAIN_COMMON_H_
 
-
-
 #include <iomanip>
 #include <climits>
 #include <string.h>
@@ -19,7 +17,7 @@
 
 #define DTRAIN_DOTS 100 // when to display a '.'
 #define DTRAIN_GRAMMAR_DELIM "########EOS########"
-#define _SCALE 100000
+#define DTRAIN_SCALE 100000
 
 using namespace std;
 using namespace dtrain;
diff --git a/dtrain/hstreaming/cdec.ini b/dtrain/hstreaming/cdec.ini
index bea54afe..5afa89a9 100644
--- a/dtrain/hstreaming/cdec.ini
+++ b/dtrain/hstreaming/cdec.ini
@@ -1,7 +1,8 @@
 formalism=scfg
 add_pass_through_rules=true
-cubepruning_pop_limit=30
 scfg_max_span_limit=15
+intersection_strategy=cube_pruning
+cubepruning_pop_limit=200
 feature_function=WordPenalty
 feature_function=KLanguageModel nc-wmt11.en.srilm.3.gz
 feature_function=RuleIdentityFeatures
diff --git a/dtrain/hstreaming/dtrain.sh b/dtrain/hstreaming/dtrain.sh
index 6d34012a..b6847591 100755
--- a/dtrain/hstreaming/dtrain.sh
+++ b/dtrain/hstreaming/dtrain.sh
@@ -2,7 +2,7 @@
 
 pushd .
 cd ..
-ID=$(basename $(pwd))
+ID=$(basename $(pwd)) # attempt_...
 popd
 ./dtrain -c dtrain.ini --hstreaming $ID 
 
diff --git a/dtrain/kbestget.h b/dtrain/kbestget.h
index 88f8bc17..08104dec 100644
--- a/dtrain/kbestget.h
+++ b/dtrain/kbestget.h
@@ -86,7 +86,7 @@ struct KBestGetter : public HypSampler
   void
   KBestScored(const Hypergraph& forest)
   {
-    if (filter_type_ == "unique") {
+    if (filter_type_ == "uniq") {
       KBestUnique(forest);
     } else if (filter_type_ == "no") {
       KBestNoFilter(forest);
diff --git a/dtrain/pairsampling.h b/dtrain/pairsampling.h
index 131e90ca..4399dfee 100644
--- a/dtrain/pairsampling.h
+++ b/dtrain/pairsampling.h
@@ -6,7 +6,7 @@ namespace dtrain
 
 
 inline void
-sample_all_pairs(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training)
+all_pairs(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training)
 {
   for (unsigned i = 0; i < s->size()-1; i++) {
     for (unsigned j = i+1; j < s->size(); j++) {
@@ -19,7 +19,7 @@ sample_all_pairs(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& train
 }
 
 inline void
-sample_rand_pairs(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training,
+rand_pairs_5050(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training,
                   MT19937* prng)
 {
   for (unsigned i = 0; i < s->size()-1; i++) {
@@ -35,15 +35,14 @@ sample_rand_pairs(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& trai
 }
 
 bool
-sort_samples_by_score(ScoredHyp a, ScoredHyp b)
+_multpart_cmp_hyp_by_score(ScoredHyp a, ScoredHyp b)
 {
   return a.score < b.score;
 }
-
 inline void
-sample108010(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training)
+multpart108010(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training)
 {
-  sort(s->begin(), s->end(), sort_samples_by_score);
+  sort(s->begin(), s->end(), _multpart_cmp_hyp_by_score);
   pair<ScoredHyp,ScoredHyp>  p;
   unsigned sz = s->size();
   unsigned slice = 10;
@@ -66,6 +65,50 @@ sample108010(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training)
 }
 
 
+// Keep a pair only if its absolute score difference is not below 0.05.
+inline bool
+_PRO_accept_pair(const pair<ScoredHyp,ScoredHyp>& p)
+{
+  return !(fabs(p.first.score - p.second.score) < 0.05);
+}
+
+// Ordering for sort(): pairs with the larger absolute score difference first.
+// 'inline' so that including this header from several .cc files still links.
+inline bool
+_PRO_cmp_pair_by_diff(const pair<ScoredHyp,ScoredHyp>& a, const pair<ScoredHyp,ScoredHyp>& b)
+{
+  return fabs(a.first.score - a.second.score) > fabs(b.first.score - b.second.score);
+}
+
+// PRO-style pair selection: walk all pairs (i,j), i<j, of *s in index order,
+// append the first max_count accepted ones to 'training', then sort 'training'
+// by descending score difference and cut it down to at most max_keep pairs.
+// Pairs already in 'training' on entry take part in the sort and the cut, too.
+// NOTE(review): candidates are taken in index order, not drawn at random -- TODO?
+inline void
+PROsampling(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training)
+{
+  const unsigned max_count = 5000; // cap on accepted candidate pairs
+  const unsigned max_keep = 50;    // pairs left after sorting
+  const unsigned sz = s->size();
+  unsigned count = 0;
+  // 'i+1 < sz' rather than 'i < sz-1': the latter wraps around for an empty *s.
+  // Each (i,j) is visited exactly once, so duplicate pairs cannot occur.
+  for (unsigned i = 0; i+1 < sz && count < max_count; i++) {
+    for (unsigned j = i+1; j < sz && count < max_count; j++) {
+      const pair<ScoredHyp,ScoredHyp> p((*s)[i], (*s)[j]);
+      if (_PRO_accept_pair(p)) {
+        training.push_back(p);
+        count++;
+      }
+    }
+  }
+  sort(training.begin(), training.end(), _PRO_cmp_pair_by_diff);
+  if (training.size() > max_keep)
+    training.erase(training.begin()+max_keep, training.end());
+}
+
+
 } // namespace
 
 #endif
diff --git a/dtrain/test/example/cdec.ini b/dtrain/test/example/cdec.ini
index 31a205c7..ff99de7b 100644
--- a/dtrain/test/example/cdec.ini
+++ b/dtrain/test/example/cdec.ini
@@ -1,7 +1,8 @@
 formalism=scfg
 add_pass_through_rules=true
-cubepruning_pop_limit=30
 scfg_max_span_limit=15
+intersection_strategy=cube_pruning
+cubepruning_pop_limit=30
 feature_function=WordPenalty
 feature_function=KLanguageModel test/example/nc-wmt11.en.srilm.gz
-feature_function=RuleIdentityFeatures
+#feature_function=RuleIdentityFeatures
diff --git a/dtrain/test/example/dtrain.ini b/dtrain/test/example/dtrain.ini
index 0b066013..fab4d317 100644
--- a/dtrain/test/example/dtrain.ini
+++ b/dtrain/test/example/dtrain.ini
@@ -1,18 +1,20 @@
 decoder_config=test/example/cdec.ini
 k=100
 N=3
-gamma=0.001
-epochs=20
-input=test/example/nc-1k-tabs.gz
-scorer=smooth_bleu
-output=- #weights.gz
-stop_after=5
+learning_rate=0.0005
+gamma=0
+epochs=3
+input=test/example/nc-wmt11.1k.gz
+output=-
+scorer=stupid_bleu
 sample_from=forest
-pair_sampling=108010
-select_weights=VOID
+#filter=uniq
+pair_sampling=5050
+select_weights=last
 print_weights=Glue WordPenalty LanguageModel LanguageModel_OOV PhraseModel_0 PhraseModel_1 PhraseModel_2 PhraseModel_3 PhraseModel_4 PassThrough
 tmp=/tmp
-#unit_weight_vector=
-keep_w=true
+stop_after=10
+#keep_w=
+#update_ok=
 #l1_reg=clip
-#l1_reg_strength=0.00001
+#l1_reg_strength=0.0001
diff --git a/dtrain/test/example/nc-1k-tabs.gz b/dtrain/test/example/nc-1k-tabs.gz
deleted file mode 100644
index 45496cd8..00000000
Binary files a/dtrain/test/example/nc-1k-tabs.gz and /dev/null differ
diff --git a/dtrain/test/example/nc-1k.gz b/dtrain/test/example/nc-1k.gz
deleted file mode 100644
index f638a166..00000000
Binary files a/dtrain/test/example/nc-1k.gz and /dev/null differ
diff --git a/dtrain/test/example/nc-wmt11.1k.gz b/dtrain/test/example/nc-wmt11.1k.gz
new file mode 100644
index 00000000..45496cd8
Binary files /dev/null and b/dtrain/test/example/nc-wmt11.1k.gz differ
diff --git a/dtrain/test/log_reg_dyer/bin_class.cc b/dtrain/test/log_reg_dyer/bin_class.cc
deleted file mode 100644
index 19bcde25..00000000
--- a/dtrain/test/log_reg_dyer/bin_class.cc
+++ /dev/null
@@ -1,4 +0,0 @@
-#include "bin_class.h"
-
-Objective::~Objective() {}
-
diff --git a/dtrain/test/log_reg_dyer/bin_class.h b/dtrain/test/log_reg_dyer/bin_class.h
deleted file mode 100644
index 3466109a..00000000
--- a/dtrain/test/log_reg_dyer/bin_class.h
+++ /dev/null
@@ -1,22 +0,0 @@
-#ifndef _BIN_CLASS_H_
-#define _BIN_CLASS_H_
-
-#include <vector>
-#include "sparse_vector.h"
-
-struct TrainingInstance {
-  // TODO add other info? loss for MIRA-type updates?
-  SparseVector<double> x_feature_map;
-  bool y;
-};
-
-struct Objective {
-  virtual ~Objective();
-
-  // returns f(x) and f'(x)
-  virtual double ObjectiveAndGradient(const SparseVector<double>& x,
-                  const std::vector<TrainingInstance>& training_instances,
-                  SparseVector<double>* g) const = 0;
-};
-
-#endif
diff --git a/dtrain/test/log_reg_dyer/log_reg.cc b/dtrain/test/log_reg_dyer/log_reg.cc
deleted file mode 100644
index ec2331fe..00000000
--- a/dtrain/test/log_reg_dyer/log_reg.cc
+++ /dev/null
@@ -1,39 +0,0 @@
-#include "log_reg.h"
-
-#include <vector>
-#include <cmath>
-
-#include "sparse_vector.h"
-
-using namespace std;
-
-double LogisticRegression::ObjectiveAndGradient(const SparseVector<double>& x,
-                              const vector<TrainingInstance>& training_instances,
-                              SparseVector<double>* g) const {
-  double cll = 0;
-  for (int i = 0; i < training_instances.size(); ++i) {
-    const double dotprod = training_instances[i].x_feature_map.dot(x); // TODO no bias, if bias, add x[0]
-    double lp_false = dotprod;
-    double lp_true = -dotprod;
-    if (0 < lp_true) {
-      lp_true += log1p(exp(-lp_true));
-      lp_false = log1p(exp(lp_false));
-    } else {
-      lp_true = log1p(exp(lp_true));
-      lp_false += log1p(exp(-lp_false));
-    }
-    lp_true *= -1;
-    lp_false *= -1;
-    if (training_instances[i].y) {  // true label
-      cll -= lp_true;
-      (*g) -= training_instances[i].x_feature_map * exp(lp_false);
-      // (*g)[0] -= exp(lp_false); // bias
-    } else {                  // false label
-      cll -= lp_false;
-      (*g) += training_instances[i].x_feature_map * exp(lp_true);
-      // g += corpus[i].second * exp(lp_true);
-    }
-  }
-  return cll;
-}
-
diff --git a/dtrain/test/log_reg_dyer/log_reg.h b/dtrain/test/log_reg_dyer/log_reg.h
deleted file mode 100644
index ecc560b8..00000000
--- a/dtrain/test/log_reg_dyer/log_reg.h
+++ /dev/null
@@ -1,14 +0,0 @@
-#ifndef _LOG_REG_H_
-#define _LOG_REG_H_
-
-#include <vector>
-#include "sparse_vector.h"
-#include "bin_class.h"
-
-struct LogisticRegression : public Objective {
-  double ObjectiveAndGradient(const SparseVector<double>& x,
-                              const std::vector<TrainingInstance>& training_instances,
-                              SparseVector<double>* g) const;
-};
-
-#endif
diff --git a/dtrain/test/logreg_cd/bin_class.cc b/dtrain/test/logreg_cd/bin_class.cc
new file mode 100644
index 00000000..19bcde25
--- /dev/null
+++ b/dtrain/test/logreg_cd/bin_class.cc
@@ -0,0 +1,4 @@
+#include "bin_class.h"
+
+Objective::~Objective() {}
+
diff --git a/dtrain/test/logreg_cd/bin_class.h b/dtrain/test/logreg_cd/bin_class.h
new file mode 100644
index 00000000..3466109a
--- /dev/null
+++ b/dtrain/test/logreg_cd/bin_class.h
@@ -0,0 +1,22 @@
+#ifndef _BIN_CLASS_H_
+#define _BIN_CLASS_H_
+
+#include <vector>
+#include "sparse_vector.h"
+
+struct TrainingInstance {
+  // TODO add other info? loss for MIRA-type updates?
+  SparseVector<double> x_feature_map;
+  bool y;
+};
+
+struct Objective {
+  virtual ~Objective();
+
+  // returns f(x) and f'(x)
+  virtual double ObjectiveAndGradient(const SparseVector<double>& x,
+                  const std::vector<TrainingInstance>& training_instances,
+                  SparseVector<double>* g) const = 0;
+};
+
+#endif
diff --git a/dtrain/test/logreg_cd/log_reg.cc b/dtrain/test/logreg_cd/log_reg.cc
new file mode 100644
index 00000000..ec2331fe
--- /dev/null
+++ b/dtrain/test/logreg_cd/log_reg.cc
@@ -0,0 +1,39 @@
+#include "log_reg.h"
+
+#include <vector>
+#include <cmath>
+
+#include "sparse_vector.h"
+
+using namespace std;
+
+double LogisticRegression::ObjectiveAndGradient(const SparseVector<double>& x,
+                              const vector<TrainingInstance>& training_instances,
+                              SparseVector<double>* g) const {
+  double cll = 0;
+  for (int i = 0; i < training_instances.size(); ++i) {
+    const double dotprod = training_instances[i].x_feature_map.dot(x); // TODO no bias, if bias, add x[0]
+    double lp_false = dotprod;
+    double lp_true = -dotprod;
+    if (0 < lp_true) {
+      lp_true += log1p(exp(-lp_true));
+      lp_false = log1p(exp(lp_false));
+    } else {
+      lp_true = log1p(exp(lp_true));
+      lp_false += log1p(exp(-lp_false));
+    }
+    lp_true *= -1;
+    lp_false *= -1;
+    if (training_instances[i].y) {  // true label
+      cll -= lp_true;
+      (*g) -= training_instances[i].x_feature_map * exp(lp_false);
+      // (*g)[0] -= exp(lp_false); // bias
+    } else {                  // false label
+      cll -= lp_false;
+      (*g) += training_instances[i].x_feature_map * exp(lp_true);
+      // g += corpus[i].second * exp(lp_true);
+    }
+  }
+  return cll;
+}
+
diff --git a/dtrain/test/logreg_cd/log_reg.h b/dtrain/test/logreg_cd/log_reg.h
new file mode 100644
index 00000000..ecc560b8
--- /dev/null
+++ b/dtrain/test/logreg_cd/log_reg.h
@@ -0,0 +1,14 @@
+#ifndef _LOG_REG_H_
+#define _LOG_REG_H_
+
+#include <vector>
+#include "sparse_vector.h"
+#include "bin_class.h"
+
+struct LogisticRegression : public Objective {
+  double ObjectiveAndGradient(const SparseVector<double>& x,
+                              const std::vector<TrainingInstance>& training_instances,
+                              SparseVector<double>* g) const;
+};
+
+#endif
diff --git a/dtrain/test/toy/dtrain.ini b/dtrain/test/toy/dtrain.ini
index 5bfa5b2d..105c07df 100644
--- a/dtrain/test/toy/dtrain.ini
+++ b/dtrain/test/toy/dtrain.ini
@@ -3,7 +3,7 @@ k=4
 N=3
 epochs=2
 input=test/toy/in
-scorer=stupid_bleu
-sample_from=forest
 output=-
+scorer=stupid_bleu
+sample_from=kbest
 print_weights=logp use_shell use_house PassThrough
-- 
cgit v1.2.3