4 files changed, 120 insertions, 26 deletions
diff --git a/dtrain/README.md b/dtrain/README.md
index b1dbf481..58c6dddc 100644
--- a/dtrain/README.md
+++ b/dtrain/README.md
@@ -45,6 +45,29 @@ Uncertain, known bugs, problems
 * input/grammar caching (strings -> WordIDs)
 * look at forest sampling...
 * devtest loo or not? why loo grammars larger? (sort psgs | uniq -> grammar)
+* lower beam size to be faster?
+* why is <unk> -100 in lm so good?
+* noise helps?
+
+random notes
+------------
+* learning rate tuned with perceptron
+* aer correlation with bleu?
+* dtrain (perc) used for some tests because no optimizer instability
+* http://www.ark.cs.cmu.edu/cdyer/dtrain/
+* repeat as often as max needed by any learner!
+* don't compare lms with diff vocab (stupid backoff paper)
+* what does mira/pro optimize?
+
+features
+--------
+* baseline features (take whatever cdec implements for VEST)
+* rule identifiers (feature name = rule as string)
+* rule discounts (taken from frequency i or frequency interval [i,j] of rule in extraction from parallel training data)
+* target ngrams (from nonterminals in rule rhs)
+* source-target unigrams (from word alignments used in rule extraction, if they are?)
+* lhs, rhs, rule length features
+* all other features depend on syntax annotation. 
 
 FIXME, todo
 -----------
@@ -52,12 +75,26 @@ FIXME, todo
 * mapred count shard sents
 * mapred stats for learning curve (output weights per iter for eval on devtest)
 * 250 forest sampling is real bad, bug?
-* metric reporter of bleu for each shard
+* metric reporter of bleu for each shard (reporters, status?)
+  to draw learning curves for all shards in 1 plot
 * kenlm not portable (i7-2620M vs Intel(R) Xeon(R) CPU E5620 @ 2.40GHz)
 * mapred chaining? hamake?
 * make our sigtest work with cdec
-* l1l2 red
-* tsuroke?
+* l1l2 red (tsuroke)?
+* epsilon stopping criterion
+* normalize weight vector to get proper model scores for forest sampling
+* 108010 with gap(s), and/or fix (same score in diff groups)
+* 108010: combine model score + bleu
+* visualize weight vector
+* *100 runs stats
+* correlation of *_bleu to ibm_bleu
+* ep: open lm, cutoff @1
+* tune regs
+* 3x3 4x4 5x5 .. 10x10 until standard dev ok
+* avg weight vector for dtrain? (mira non-avg)
+* repeat lm choose with mira/pro
+* shuffle training data
+
 
 Data
 ----
@@ -116,6 +153,8 @@ lm?
  lm oov weight pos? -100
  no tuning, -100 prob for unk EXPECT: nounk
  tuning with dtrain EXPECT: open
+ =>
+  lmtest on cs.giza.loo???
 
 [2]
 cs?
@@ -167,3 +206,40 @@ variables to control
 
 [pro]
 
+
+--------
+In PRO, a continually growing list of candidates is maintained for
+each sentence by concatenating k-best lists from each decoding run,
+and the training pairs are sampled from them. This is done to ensure
+that the optimizer doesn't forget about bad places in the parameter
+space that it visited previously (since some training samples will be
+selected from that space). Something like your approach should work
+well though, provided you don't overfit to the sentence pair you're
+looking at in each iteration. So I guess the question is: what are you
+doing in step 2 exactly? A complete optimization? Taking one step? The
+other thing is, do you maintain n-best hypotheses from previous
+iterations?
+
+--------
+good grammar? => ability to overfit
+ berkeley vs giza
+ not LOO
+ NO optimizer instability
+ 20+ iterations
+ approx_bleu-4
+ train on dev => test on dev
+ train on devtest => test on devtest
+ dev on dev better?
+ devtest on devtest better?
+ (train/test on loo? => lower!)
+ (test on others => real bad)
+
+
+loo vs non-loo? => generalization
+ (cs vs non-cs?)
+ giza||berkeley
+ LOO + non LOO
+ 2 fold cross validation 
+ train on dev, test on devtest
+ train on devtest, test on dev
+ as above ^^^
diff --git a/dtrain/dtrain.cc b/dtrain/dtrain.cc
index 4668ad66..2fe7afd7 100644
--- a/dtrain/dtrain.cc
+++ b/dtrain/dtrain.cc
@@ -24,6 +24,7 @@ dtrain_init(int argc, char** argv, po::variables_map* cfg)
     ("gamma",          po::value<weight_t>()->default_value(0),                          "gamma for SVM (0 for perceptron)")
     ("tmp",            po::value<string>()->default_value("/tmp"),                                        "temp dir to use")
     ("select_weights", po::value<string>()->default_value("last"), "output 'best' or 'last' weights ('VOID' to throw away)")
+    ("keep_w",         po::value<bool>()->zero_tokens(),                              "protocol weights for each iteration")
 #ifdef DTRAIN_LOCAL
     ("refs,r",         po::value<string>(),                                                      "references in local mode")
 #endif
@@ -92,7 +93,12 @@ main(int argc, char** argv)
   bool hstreaming = false;
   if (cfg.count("hstreaming")) {
     hstreaming = true;
+    quiet = true;
+    cerr.precision(17);
   }
+  bool keep_w = false;
+  if (cfg.count("keep_w")) keep_w = true;
+
   const unsigned k = cfg["k"].as<unsigned>();
   const unsigned N = cfg["N"].as<unsigned>(); 
   const unsigned T = cfg["epochs"].as<unsigned>();
@@ -104,7 +110,7 @@ main(int argc, char** argv)
   vector<string> print_weights;
   if (cfg.count("print_weights"))
     boost::split(print_weights, cfg["print_weights"].as<string>(), boost::is_any_of(" "));
-
+  
   // setup decoder
   register_feature_functions();
   SetSilent(true);
@@ -151,6 +157,7 @@ main(int argc, char** argv)
   weight_t eta = cfg["learning_rate"].as<weight_t>();
   weight_t gamma = cfg["gamma"].as<weight_t>();
 
+  // output
   string output_fn = cfg["output"].as<string>();
   // input
   string input_fn = cfg["input"].as<string>();
@@ -158,9 +165,9 @@ main(int argc, char** argv)
   // buffer input for t > 0
   vector<string> src_str_buf;          // source strings (decoder takes only strings)
   vector<vector<WordID> > ref_ids_buf; // references as WordID vecs
-  vector<string> weights_files;        // remember weights for each iteration
   // where temp files go
   string tmp_path = cfg["tmp"].as<string>();
+  vector<string> w_tmp_files; // used for protocol_w
 #ifdef DTRAIN_LOCAL
   string refs_fn = cfg["refs"].as<string>();
   ReadFile refs(refs_fn);
@@ -169,7 +176,7 @@ main(int argc, char** argv)
   ogzstream grammar_buf_out;
   grammar_buf_out.open(grammar_buf_fn.c_str());
 #endif
-  
+
   unsigned in_sz = UINT_MAX; // input index, input size
   vector<pair<score_t, score_t> > all_scores;
   score_t max_score = 0.;
@@ -206,6 +213,8 @@ main(int argc, char** argv)
   for (unsigned t = 0; t < T; t++) // T epochs
   {
 
+  if (hstreaming) cerr << "reporter:status:Iteration #" << t+1 << " of " << T << endl;
+
   time_t start, end;  
   time(&start);
 #ifndef DTRAIN_LOCAL
@@ -231,7 +240,7 @@ main(int argc, char** argv)
     if (stop_after > 0 && stop_after == ii && !next) stop = true;
     
     // produce some pretty output
-    if (!hstreaming && !quiet && !verbose) {
+    if (!quiet && !verbose) {
       if (ii == 0) cerr << " ";
       if ((ii+1) % (DTRAIN_DOTS) == 0) {
         cerr << ".";
@@ -375,10 +384,12 @@ main(int argc, char** argv)
     
     ++ii;
 
-    if (hstreaming) cerr << "reporter:counter:dtrain,sid," << ii << endl;
+    if (hstreaming) cerr << "reporter:counter:dtrain,count,1" << endl;
 
   } // input loop
 
+  if (hstreaming && t == 0) cerr << "reporter:counter:dtrain,|input|," << ii+1 << endl;
+
   if (scorer_str == "approx_bleu") scorer->Reset();
 
   if (t == 0) {
@@ -404,6 +415,11 @@ main(int argc, char** argv)
     score_diff = score_avg;
     model_diff = model_avg;
   }
+  if (hstreaming) {
+    cerr << "reporter:counter:dtrain,score avg it " << t+1 << "," << score_avg << endl;
+    cerr << "reporter:counter:dtrain,model avg it " << t+1 << "," << model_avg << endl;
+  }
+
   if (!quiet) {
     cerr << _p5 << _p << "WEIGHTS" << endl;
     for (vector<string>::iterator it = print_weights.begin(); it != print_weights.end(); it++) {
@@ -439,12 +455,10 @@ main(int argc, char** argv)
   if (noup) break;
 
   // write weights to file
-  if (select_weights == "best") {
-    string infix = "dtrain-weights-" + boost::lexical_cast<string>(t);
+  if (select_weights == "best" || keep_w) {
     lambdas.init_vector(&dense_weights);
-    string w_fn = gettmpf(tmp_path, infix, "gz");
+    string w_fn = "weights." + boost::lexical_cast<string>(t) + ".gz";
     Weights::WriteToFile(w_fn, dense_weights, true); 
-    weights_files.push_back(w_fn);
   }
 
   } // outer loop
@@ -467,18 +481,19 @@ main(int argc, char** argv)
     } else if (select_weights == "VOID") { // do nothing with the weights
     } else { // best
       if (output_fn != "-") {
-        CopyFile(weights_files[best_it], output_fn);  // always gzipped
+        CopyFile("weights."+boost::lexical_cast<string>(best_it)+".gz", output_fn);
       } else {
-        ReadFile bestw(weights_files[best_it]);
+        ReadFile bestw("weights."+boost::lexical_cast<string>(best_it)+".gz");
         string o;
         cout.precision(17);
         cout << _np;
         while(getline(*bestw, o)) cout << o << endl;
       }
-      for (vector<string>::iterator it = weights_files.begin(); it != weights_files.end(); ++it) {
-        unlink(it->c_str());
-        it->erase(it->end()-3, it->end());
-        unlink(it->c_str());
+      if (!keep_w) {
+        for (unsigned i = 0; i < T; i++) {
+          string s = "weights." + boost::lexical_cast<string>(i) + ".gz";
+          unlink(s.c_str());
+        }
       }
     }
     if (output_fn == "-" && hstreaming) cout << "__SHARD_COUNT__\t1" << endl;
@@ -491,6 +506,13 @@ main(int argc, char** argv)
     cerr << _p2 << "This took " << overall_time/60. << " min." << endl;
   }
 
+  if (keep_w) {
+    cout << endl << "Weight files per iteration:" << endl;
+    for (unsigned i = 0; i < w_tmp_files.size(); i++) {
+      cout << w_tmp_files[i] << endl;
+    }
+  }
+
   return 0;
 }
 
diff --git a/dtrain/dtrain.h b/dtrain/dtrain.h
index 6c9decf4..6742f343 100644
--- a/dtrain/dtrain.h
+++ b/dtrain/dtrain.h
@@ -30,17 +30,13 @@ inline void register_and_convert(const vector<string>& strs, vector<WordID>& ids
     ids.push_back(TD::Convert(*it));
 }
 
-inline string gettmpf(const string path, const string infix, const string suffix="") {
+inline string gettmpf(const string path, const string infix) {
   char fn[1024];
   strcpy(fn, path.c_str());
   strcat(fn, "/");
   strcat(fn, infix.c_str());
   strcat(fn, "-XXXXXX");
   mkstemp(fn);
-  if (suffix != "") { // we will get 2 files
-    strcat(fn, ".");
-    strcat(fn, suffix.c_str());
-  }
   return string(fn);
 }
 
diff --git a/dtrain/test/example/dtrain.ini b/dtrain/test/example/dtrain.ini
index 40f8e03f..e8a20759 100644
--- a/dtrain/test/example/dtrain.ini
+++ b/dtrain/test/example/dtrain.ini
@@ -1,13 +1,13 @@
 decoder_config=test/example/cdec.ini
 k=100
 N=3
-gamma=0.001
+gamma=0
 epochs=20
 input=test/example/nc-1k-tabs.gz
 scorer=stupid_bleu
 output=weights.gz
-#stop_after=100
-sample_from=forest
+stop_after=10
+sample_from=kbest
 pair_sampling=108010
 select_weights=VOID
 print_weights=Glue WordPenalty LanguageModel LanguageModel_OOV PhraseModel_0 PhraseModel_1 PhraseModel_2 PhraseModel_3 PhraseModel_4 PassThrough