3 files changed, 125 insertions, 37 deletions
diff --git a/training/dtrain/dtrain.cc b/training/dtrain/dtrain.cc
index 3e9902ab..53e8cd50 100644
--- a/training/dtrain/dtrain.cc
+++ b/training/dtrain/dtrain.cc
@@ -41,6 +41,13 @@ main(int argc, char** argv)
   const bool output_updates      = output_updates_fn!="";
   const string output_raw_fn     = conf["output_raw"].as<string>();
   const bool output_raw          = output_raw_fn!="";
+  const bool use_adadelta        = conf["adadelta"].as<bool>(); // selects the AdaDelta branch of the update step
+  const weight_t adadelta_decay  = conf["adadelta_decay"].as<weight_t>(); // multiplier applied to both AdaDelta accumulators
+  const weight_t adadelta_eta    = 0.000001; // fixed constant added under both square roots of the AdaDelta rate (not configurable)
+  const string adadelta_input    = conf["adadelta_input"].as<string>(); // prefix; "" means do not load saved accumulators
+  const string adadelta_output   = conf["adadelta_output"].as<string>(); // prefix; "" means do not write the accumulators
+  const size_t max_input         = conf["stop_after"].as<size_t>(); // input index at which the input loop is told to stop
+  const bool batch               = conf["batch"].as<bool>(); // sum the updates and apply them after the input loop
 
   // setup decoder
   register_feature_functions();
@@ -89,8 +96,8 @@ main(int argc, char** argv)
   vector<vector<size_t> > buffered_lengths;  // (just once)
   size_t input_sz = 0;
 
-  cerr << setprecision(4);
   // output configuration
+  cerr << fixed << setprecision(4);
   cerr << "Parameters:" << endl;
   cerr << setw(25) << "bitext " << "'" << input_fn << "'" << endl;
   cerr << setw(25) << "k " << k << endl;
@@ -109,10 +116,10 @@ main(int argc, char** argv)
     cerr << setw(25) << "chiang decay " << chiang_decay << endl;
   cerr << setw(25) << "N " << N << endl;
   cerr << setw(25) << "T " << T << endl;
-  cerr << setw(25) << "learning rate " << eta << endl;
+  cerr << scientific << setw(25) << "learning rate " << eta << endl;
   cerr << setw(25) << "margin " << margin << endl;
   if (!structured) {
-    cerr << setw(25) << "cut " << round(cut*100) << "%" << endl;
+    cerr << fixed << setw(25) << "cut " << round(cut*100) << "%" << endl;
     cerr << setw(25) << "adjust " << adjust_cut << endl;
   } else {
     cerr << setw(25) << "struct. obj " << structured << endl;
@@ -124,7 +131,7 @@ main(int argc, char** argv)
   if (noup)
     cerr << setw(25) << "no up. " << noup << endl;
   cerr << setw(25) << "average " << average << endl;
-  cerr << setw(25) << "l1 reg. " << l1_reg << endl;
+  cerr << scientific << setw(25) << "l1 reg. " << l1_reg << endl;
   cerr << setw(25) << "decoder conf " << "'"
        << conf["decoder_conf"].as<string>() << "'" << endl;
   cerr << setw(25) << "input " << "'" << input_fn << "'" << endl;
@@ -133,8 +140,17 @@ main(int argc, char** argv)
     cerr << setw(25) << "weights in " << "'"
          << conf["input_weights"].as<string>() << "'" << endl;
   }
+  cerr << setw(25) << "batch " << batch << endl;
   if (noup)
     cerr << setw(25) << "no updates!" << endl;
+  if (use_adadelta) {
+    cerr << setw(25) << "adadelta " << use_adadelta << endl;
+    cerr << setw(25) << "   decay " << adadelta_decay << endl;
+    if (adadelta_input != "")
+      cerr << setw(25) << "-input "  << adadelta_input << endl;
+    if (adadelta_output != "")
+      cerr << setw(25) << "-output "  << adadelta_output << endl;
+  }
   cerr << "(1 dot per processed input)" << endl;
 
   // meta
@@ -153,10 +169,23 @@ main(int argc, char** argv)
     *out_up << setprecision(numeric_limits<double>::digits10+1);
   }
 
+  // adadelta state: decayed per-feature sums of squared gradients and of squared applied updates
+  SparseVector<weight_t> gradient_accum, update_accum;
+  if (use_adadelta && adadelta_input!="") { // only load when AdaDelta is on and an input prefix was given
+    vector<weight_t> grads_tmp;
+    Weights::InitFromFile(adadelta_input+".gradient", &grads_tmp); // reads <prefix>.gradient
+    Weights::InitSparseVector(grads_tmp, &gradient_accum);
+    vector<weight_t> update_tmp;
+    Weights::InitFromFile(adadelta_input+".update", &update_tmp); // reads <prefix>.update
+    Weights::InitSparseVector(update_tmp, &update_accum);
+  }
 
   for (size_t t = 0; t < T; t++) // T iterations
   {
 
+  // batch update: declared inside the iteration loop, so it starts out empty in every iteration
+  SparseVector<weight_t> batch_update;
+
   time_t start, end;
   time(&start);
   weight_t gold_sum=0., model_sum=0.;
@@ -194,6 +223,9 @@ main(int argc, char** argv)
       next = i<input_sz;
     }
 
+    if (max_input == i)
+      next = false;
+
     // produce some pretty output
     if (next) {
       if (i%20 == 0)
@@ -225,7 +257,7 @@ main(int argc, char** argv)
     list_sz += observer->effective_size;
 
     if (output_raw)
-      output_sample(sample, *out_raw, i);
+      output_sample(sample, out_raw, i);
 
     // update model
     if (!noup) {
@@ -233,21 +265,46 @@ main(int argc, char** argv)
     SparseVector<weight_t> updates;
     if (structured)
       num_up += update_structured(sample, updates, margin,
-                                  output_updates, *out_up, i);
+                                  out_up, i); // the WriteFile is passed itself; the separate bool flag is gone
     else if (all_pairs)
       num_up += updates_all(sample, updates, max_up, threshold,
-                            output_updates, *out_up, i);
+                            out_up, i);
     else if (pro)
       num_up += updates_pro(sample, updates, cut, max_up, threshold,
-                            output_updates, *out_up, i);
+                            out_up, i);
     else
       num_up += updates_multipartite(sample, updates, cut, margin,
                                      max_up, threshold, adjust_cut,
-                                     output_updates, *out_up, i);
+                                     out_up, i);
+
     SparseVector<weight_t> lambdas_copy;
     if (l1_reg)
       lambdas_copy = lambdas;
-    lambdas.plus_eq_v_times_s(updates, eta);
+
+    if (use_adadelta) { // adadelta update; checked first, so it wins when --batch is set as well
+      SparseVector<weight_t> squared;
+      for (auto it: updates)
+        squared[it.first] = pow(it.second, 2.0); // element-wise square of this step's update vector
+      gradient_accum *= adadelta_decay; // these three lines: gradient_accum = decay*gradient_accum + (1-decay)*updates^2
+      squared *= 1.0-adadelta_decay;
+      gradient_accum += squared;
+      SparseVector<weight_t> u = gradient_accum + update_accum; // starting values; the loop below assigns u[k] for each entry it visits
+      for (auto it: u)
+          u[it.first] = -1.0*( // per-feature rate: sqrt(update_accum+eta) / sqrt(gradient_accum+eta)
+                              sqrt(update_accum[it.first]+adadelta_eta)
+                              /
+                              sqrt(gradient_accum[it.first]+adadelta_eta)
+                             ) * updates[it.first]; // NOTE(review): this adds -rate*updates to lambdas, the regular branch adds +eta*updates; confirm the sign is intended
+      lambdas += u;
+      update_accum *= adadelta_decay; // from here: update_accum = decay*update_accum + (1-decay)*u^2
+      for (auto it: u)
+          u[it.first] = pow(it.second, 2.0); // u is reused to hold the squared applied step
+      update_accum = update_accum + (u*(1.0-adadelta_decay));
+    } else if (batch) { // batch mode: only sum up here, lambdas are not touched in this branch
+      batch_update += updates;
+    } else { // regular update
+      lambdas.plus_eq_v_times_s(updates, eta);
+    }
 
     // update context for Chiang's approx. BLEU
     if (score_name == "chiang") {
@@ -290,23 +347,47 @@ main(int argc, char** argv)
   if (t == 0)
     input_sz = i; // remember size of input (# lines)
 
+  // batch: apply the summed updates once, scaled by 1/num_up and the learning rate
+  if (batch) {
+    batch_update /= (weight_t)num_up;
+    lambdas.plus_eq_v_times_s(batch_update, eta);
+    lambdas.init_vector(&decoder_weights); // copy the changed weights into decoder_weights
+  }
+
   // update average
   if (average)
     w_average += lambdas;
 
+  if (use_adadelta && adadelta_output != "") { // dump the accumulators only when AdaDelta is actually in use
+    WriteFile g(adadelta_output+".gradient"); // same name the loader reads: <prefix>.gradient
+    for (auto it: gradient_accum)
+      *g << FD::Convert(it.first) << " " << it.second << endl;
+    WriteFile u(adadelta_output+".update"); // same name the loader reads: <prefix>.update
+    for (auto it: update_accum)
+      *u << FD::Convert(it.first) << " " << it.second << endl;
+  }
+
   // stats
   weight_t gold_avg = gold_sum/(weight_t)input_sz;
-  cerr << setiosflags(ios::showpos) << "WEIGHTS" << endl;
-  for (auto name: print_weights)
+  cerr << setiosflags(ios::showpos) << scientific << "WEIGHTS" << endl;
+  for (auto name: print_weights) {
     cerr << setw(18) << name << " = "
-         << lambdas.get(FD::Convert(name)) << endl;
+         << lambdas.get(FD::Convert(name));
+    if (use_adadelta) { // also print the per-feature AdaDelta rate in braces
+      weight_t rate = -1.0*(sqrt(update_accum.get(FD::Convert(name))+adadelta_eta) // .get(): same read accessor as used for lambdas above
+                          / sqrt(gradient_accum.get(FD::Convert(name))+adadelta_eta));
+      cerr << " {" << rate << "}";
+    }
+    cerr << endl;
+  }
   cerr << "        ---" << endl;
   cerr << resetiosflags(ios::showpos)
        << "       1best avg score: "   << gold_avg*100;
-  cerr << setiosflags(ios::showpos)    << " ("
+  cerr << setiosflags(ios::showpos)    << fixed << " ("
        << (gold_avg-gold_prev)*100     << ")" << endl;
-  cerr << " 1best avg model score: "
+  cerr << scientific << " 1best avg model score: "
        << model_sum/(weight_t)input_sz << endl;
+  cerr << fixed; // back to fixed notation for the remaining stats
   cerr << "         avg # updates: ";
   cerr << resetiosflags(ios::showpos)  <<  num_up/(float)input_sz << endl;
   cerr << "   non-0 feature count: "   << lambdas.num_nonzero() << endl;
diff --git a/training/dtrain/dtrain.h b/training/dtrain/dtrain.h
index b07edfdf..ce5b2101 100644
--- a/training/dtrain/dtrain.h
+++ b/training/dtrain/dtrain.h
@@ -57,11 +57,18 @@ dtrain_init(int argc,
      "learning rate [only meaningful if margin>0 or input weights are given]")
     ("l1_reg,r",           po::value<weight_t>()->default_value(0.),
      "l1 regularization strength [see Tsuruoka, Tsujii and Ananiadou (2009)]")
+    ("adadelta,D",         po::bool_switch()->default_value(false),
+     "use AdaDelta dynamic learning rates")
+    ("adadelta_decay",     po::value<weight_t>()->default_value(0.9),
+     "decay for AdaDelta algorithm")
+    ("adadelta_input",     po::value<string>()->default_value(""),
+     "input for AdaDelta's parameters, two files: file.gradient, and file.update")
+    ("adadelta_output",    po::value<string>()->default_value(""),
+     "prefix for outputting AdaDelta's parameters")
     ("margin,m",           po::value<weight_t>()->default_value(1.0),
      "margin for margin perceptron [set =0 for standard perceptron]")
     ("cut,u",              po::value<weight_t>()->default_value(0.1),
-     "use top/bottom 10% (default) of k-best as 'good' and 'bad' for \
-pair sampling, 0 to use all pairs TODO")
+     "use top/bottom 10% (default) of k-best as 'good' and 'bad' for pair sampling, 0 to use all pairs TODO")
     ("adjust,A",           po::bool_switch()->default_value(false),
      "adjust cut for optimal pos. in k-best to cut")
     ("score,s",            po::value<string>()->default_value("nakov"),
@@ -87,6 +94,8 @@ pair sampling, 0 to use all pairs TODO")
     ("max_pairs",
      po::value<size_t>()->default_value(numeric_limits<size_t>::max()),
      "max. number of updates/pairs")
+    ("batch,B",            po::bool_switch()->default_value(false),
+     "perform batch updates")
     ("output,o",           po::value<string>()->default_value("-"),
      "output weights file, '-' for STDOUT")
     ("disable_learning,X", po::bool_switch()->default_value(false),
@@ -95,6 +104,8 @@ pair sampling, 0 to use all pairs TODO")
      "output updates (diff. vectors) [to filename]")
     ("output_raw,R",       po::value<string>()->default_value(""),
      "output raw data (e.g. k-best lists) [to filename]")
+    ("stop_after",         po::value<size_t>()->default_value(numeric_limits<size_t>::max()),
+     "only look at this number of segments")
     ("print_weights,P",    po::value<string>()->default_value("EgivenFCoherent SampleCountF CountEF MaxLexFgivenE MaxLexEgivenF IsSingletonF IsSingletonFE Glue WordPenalty PassThrough LanguageModel LanguageModel_OOV"),
      "list of weights to print after each iteration");
   po::options_description clopts("Command Line Options");
diff --git a/training/dtrain/update.h b/training/dtrain/update.h
index f6aa9842..405a3f76 100644
--- a/training/dtrain/update.h
+++ b/training/dtrain/update.h
@@ -20,9 +20,8 @@ updates_multipartite(vector<Hyp>* sample,
                      size_t max_up,
                      weight_t threshold,
                      bool adjust,
-                     bool output=false,
-                     ostream& os=cout,
-                     size_t id=0)
+                     WriteFile& output,
+                     size_t id)
 {
   size_t up = 0;
   size_t sz = sample->size();
@@ -50,7 +49,7 @@ updates_multipartite(vector<Hyp>* sample,
            || (threshold && (first.gold-second.gold < threshold)))
         continue;
       if (output)
-        os << id << "\t" << first.f-second.f << endl;
+        *output << id << "\t" << first.f-second.f << endl;
       updates += first.f-second.f;
       if (++up==max_up)
         return up;
@@ -70,7 +69,7 @@ updates_multipartite(vector<Hyp>* sample,
            || (threshold && (first.gold-second.gold < threshold)))
         continue;
       if (output)
-        os << id << "\t" << first.f-second.f << endl;
+        *output << id << "\t" << first.f-second.f << endl;
       updates += first.f-second.f;
       if (++up==max_up)
         break;
@@ -91,9 +90,8 @@ updates_all(vector<Hyp>* sample,
             SparseVector<weight_t>& updates,
             size_t max_up,
             weight_t threshold,
-            bool output=false,
-            ostream& os=cout,
-            size_t id=0)
+            WriteFile& output, // by reference, like updates_multipartite/updates_pro; written to only when it tests true
+            size_t id) // written in front of every diff vector that is output
 {
   size_t up = 0;
   size_t sz = sample->size();
@@ -108,7 +106,7 @@ updates_all(vector<Hyp>* sample,
            || (threshold && (first.gold-second.gold < threshold)))
         continue;
       if (output)
-        os << id << "\t" << first.f-second.f << endl;
+        *output << id << "\t" << first.f-second.f << endl;
       updates += first.f-second.f;
       if (++up==max_up)
         break;
@@ -127,9 +125,8 @@ inline size_t
 update_structured(vector<Hyp>* sample,
                   SparseVector<weight_t>& updates,
                   weight_t margin,
-                  bool output=false,
-                  ostream& os=cout,
-                  size_t id=0)
+                  WriteFile& output, // by reference, like updates_multipartite/updates_pro; written to only when it tests true
+                  size_t id) // written in front of the hope/fear feature vectors that are output
 {
   // hope
   sort(sample->begin(), sample->end(), [](Hyp first, Hyp second)
@@ -147,13 +144,13 @@ update_structured(vector<Hyp>* sample,
   if (hope.gold != fear.gold) {
     updates += hope.f - fear.f;
     if (output)
-      os << id << "\t" << hope.f << "\t" << fear.f << endl;
+      *output << id << "\t" << hope.f << "\t" << fear.f << endl;
 
     return 1;
   }
 
   if (output)
-    os << endl;
+    *output << endl;
 
   return 0;
 }
@@ -172,9 +169,8 @@ updates_pro(vector<Hyp>* sample,
            size_t maxs,
            size_t max_up,
            weight_t threshold,
-           bool output=false,
-           ostream& os=cout,
-           size_t id=0)
+           WriteFile& output,
+           size_t id)
 {
 
   size_t sz = sample->size(), s;
@@ -202,7 +198,7 @@ updates_pro(vector<Hyp>* sample,
 
   for (auto i: g) {
     if (output)
-      os << id << "\t" << i.first->f-i.second->f << endl;
+      *output << id << "\t" << i.first->f-i.second->f << endl;
     updates += i.first->f-i.second->f;
   }
 
@@ -215,7 +211,7 @@ updates_pro(vector<Hyp>* sample,
  */
 inline void
 output_sample(vector<Hyp>* sample,
-              ostream& os=cout,
+              WriteFile& output,
               size_t id=0,
               bool sorted=true)
 {
@@ -227,7 +223,7 @@ output_sample(vector<Hyp>* sample,
   }
   size_t j = 0;
   for (auto k: *sample) {
-    os << id << "\t" << j << "\t" << k.gold << "\t" << k.model
+    *output << id << "\t" << j << "\t" << k.gold << "\t" << k.model
        << "\t" << k.f << endl;
     j++;
   }