Diffstat (limited to 'dtrain')
-rw-r--r--  dtrain/dtrain.cc                140
-rw-r--r--  dtrain/test/example/dtrain.ini    8
-rw-r--r--  dtrain/test/toy/dtrain.ini       13
-rw-r--r--  dtrain/test/toy/in                4
4 files changed, 81 insertions, 84 deletions
diff --git a/dtrain/dtrain.cc b/dtrain/dtrain.cc
index 5d84f250..25858738 100644
--- a/dtrain/dtrain.cc
+++ b/dtrain/dtrain.cc
@@ -6,25 +6,25 @@ dtrain_init(int argc, char** argv, po::variables_map* cfg)
{
po::options_description ini("Configuration File Options");
ini.add_options()
- ("input", po::value<string>()->default_value("-"), "input file")
- ("output", po::value<string>()->default_value("-"), "output weights file (or VOID)")
- ("input_weights", po::value<string>(), "input weights file (e.g. from previous iteration)")
- ("decoder_config", po::value<string>(), "configuration file for cdec")
- ("k", po::value<unsigned>()->default_value(100), "size of kbest or sample from forest")
- ("sample_from", po::value<string>()->default_value("kbest"), "where to get translations from")
- ("filter", po::value<string>()->default_value("unique"), "filter kbest list")
- ("pair_sampling", po::value<string>()->default_value("all"), "how to sample pairs: all, rand")
- ("N", po::value<unsigned>()->default_value(3), "N for Ngrams")
- ("epochs", po::value<unsigned>()->default_value(2), "# of iterations T")
- ("scorer", po::value<string>()->default_value("stupid_bleu"), "scoring metric")
- ("stop_after", po::value<unsigned>()->default_value(0), "stop after X input sentences")
- ("print_weights", po::value<string>(), "weights to print on each iteration")
- ("hstreaming", po::value<bool>()->zero_tokens(), "run in hadoop streaming mode")
- ("learning_rate", po::value<double>()->default_value(0.0005), "learning rate")
- ("gamma", po::value<double>()->default_value(0.), "gamma for SVM (0 for perceptron)")
- ("tmp", po::value<string>()->default_value("/tmp"), "temp dir to use")
- ("select_weights", po::value<string>()->default_value("last"), "output 'best' or 'last' weights")
- ("noup", po::value<bool>()->zero_tokens(), "do not update weights");
+ ("input", po::value<string>()->default_value("-"), "input file")
+ ("output", po::value<string>()->default_value("-"), "output weights file")
+ ("input_weights", po::value<string>(), "input weights file (e.g. from previous iteration)")
+ ("decoder_config", po::value<string>(), "configuration file for cdec")
+ ("k", po::value<unsigned>()->default_value(100), "size of kbest or sample from forest")
+ ("sample_from", po::value<string>()->default_value("kbest"), "where to get translations from")
+ ("filter", po::value<string>()->default_value("unique"), "filter kbest list")
+ ("pair_sampling", po::value<string>()->default_value("all"), "how to sample pairs: all, rand")
+ ("N", po::value<unsigned>()->default_value(3), "N for Ngrams")
+ ("epochs", po::value<unsigned>()->default_value(2), "# of iterations T")
+ ("scorer", po::value<string>()->default_value("stupid_bleu"), "scoring metric")
+ ("stop_after", po::value<unsigned>()->default_value(0), "stop after X input sentences")
+ ("print_weights", po::value<string>(), "weights to print on each iteration")
+ ("hstreaming", po::value<bool>()->zero_tokens(), "run in hadoop streaming mode")
+ ("learning_rate", po::value<double>()->default_value(0.0005), "learning rate")
+ ("gamma", po::value<double>()->default_value(0), "gamma for SVM (0 for perceptron)")
+ ("tmp", po::value<string>()->default_value("/tmp"), "temp dir to use")
+ ("select_weights", po::value<string>()->default_value("last"), "output 'best' or 'last' weights ('VOID' to throw away)")
+ ("noup", po::value<bool>()->zero_tokens(), "do not update weights");
po::options_description cl("Command Line Options");
cl.add_options()
("config,c", po::value<string>(), "dtrain config file")
@@ -108,9 +108,9 @@ main(int argc, char** argv)
} else if (scorer_str == "stupid_bleu") {
scorer = dynamic_cast<StupidBleuScorer*>(new StupidBleuScorer);
} else if (scorer_str == "smooth_bleu") {
- scorer = dynamic_cast<SmoothBleuScorer*>(new SmoothBleuScorer);
+ scorer = dynamic_cast<SmoothBleuScorer*>(new SmoothBleuScorer);
} else if (scorer_str == "approx_bleu") {
- scorer = dynamic_cast<StupidBleuScorer*>(new StupidBleuScorer); // FIXME
+ scorer = dynamic_cast<StupidBleuScorer*>(new StupidBleuScorer); // FIXME
} else {
cerr << "Don't know scoring metric: '" << scorer_str << "', exiting." << endl;
exit(1);
@@ -138,8 +138,10 @@ main(int argc, char** argv)
// meta params for perceptron, SVM
double eta = cfg["learning_rate"].as<double>();
double gamma = cfg["gamma"].as<double>();
- lambdas.add_value(FD::Convert("__bias"), 0);
+ WordID __bias = FD::Convert("__bias");
+ lambdas.add_value(__bias, 0);
+ string output_fn = cfg["output"].as<string>();
// input
string input_fn = cfg["input"].as<string>();
ReadFile input(input_fn);
@@ -169,8 +171,8 @@ main(int argc, char** argv)
cerr << setw(25) << "stop_after " << stop_after << endl;
if (cfg.count("input_weights"))
cerr << setw(25) << "weights in" << cfg["input_weights"].as<string>() << endl;
- cerr << setw(25) << "input " << "'" << cfg["input"].as<string>() << "'" << endl;
- cerr << setw(25) << "output " << "'" << cfg["output"].as<string>() << "'" << endl;
+ cerr << setw(25) << "input " << "'" << input_fn << "'" << endl;
+ cerr << setw(25) << "output " << "'" << output_fn << "'" << endl;
if (sample_from == "kbest")
cerr << setw(25) << "filter " << "'" << filter_type << "'" << endl;
cerr << setw(25) << "learning rate " << eta << endl;
@@ -190,9 +192,9 @@ main(int argc, char** argv)
igzstream grammar_buf_in;
if (t > 0) grammar_buf_in.open(grammar_buf_fn.c_str());
score_t score_sum = 0., model_sum = 0.;
- unsigned ii = 0;
+ unsigned ii = 0, nup = 0, npairs = 0;
if (!quiet) cerr << "Iteration #" << t+1 << " of " << T << "." << endl;
-
+
while(true)
{
@@ -294,8 +296,7 @@ main(int argc, char** argv)
score_sum += (*samples)[0].score;
model_sum += (*samples)[0].model;
-//////////////////////////////////////////////////////////
- // UPDATE WEIGHTS
+ // weight updates
if (!noup) {
vector<pair<ScoredHyp,ScoredHyp> > pairs;
if (pair_sampling == "all")
@@ -304,45 +305,35 @@ main(int argc, char** argv)
sample_rand_pairs(samples, pairs, &rng);
if (pair_sampling == "108010")
sample108010(samples, pairs);
+ npairs += pairs.size();
for (vector<pair<ScoredHyp,ScoredHyp> >::iterator it = pairs.begin();
it != pairs.end(); it++) {
-
- SparseVector<double> dv;
- if (it->first.score - it->second.score < 0) {
- dv = it->second.f - it->first.f;
- //} else {
- //dv = it->first - it->second;
- //}
- dv.add_value(FD::Convert("__bias"), -1);
-
- //SparseVector<double> reg;
- //reg = lambdas * (2 * gamma);
- //dv -= reg;
- lambdas += dv * eta;
-
- if (verbose) {
- /*cerr << "{{ f("<< it->first_rank <<") > f(" << it->second_rank << ") but g(i)="<< it->first_score <<" < g(j)="<< it->second_score << " so update" << endl;
- cerr << " i " << TD::GetString(samples->sents[ti->first_rank]) << endl;
- cerr << " " << samples->feats[ti->first_rank] << endl;
- cerr << " j " << TD::GetString(samples->sents[ti->second_rank]) << endl;
- cerr << " " << samples->feats[ti->second_rank] << endl;
- cerr << " diff vec: " << dv << endl;
- cerr << " lambdas after update: " << lambdas << endl;
- cerr << "}}" << endl;*/
+ if (!gamma) {
+ // perceptron
+ if (it->first.score - it->second.score < 0) { // rank error
+ SparseVector<double> dv = it->second.f - it->first.f;
+ dv.add_value(__bias, -1);
+ lambdas.plus_eq_v_times_s(dv, eta);
+ nup++;
}
} else {
- //SparseVector<double> reg;
- //reg = lambdas * (2 * gamma);
- //lambdas += reg * (-eta);
+ // SVM
+ double rank_error = it->second.score - it->first.score;
+ if (rank_error > 0) {
+ SparseVector<double> dv = it->second.f - it->first.f;
+ dv.add_value(__bias, -1);
+ lambdas.plus_eq_v_times_s(dv, eta);
+ }
+ // regularization
+ double margin = it->first.model - it->second.model;
+ if (rank_error || margin < 1) {
+ lambdas.plus_eq_v_times_s(lambdas, -2*gamma*eta); // reg /= #EXAMPLES or #UPDATES ?
+ nup++;
+ }
}
-
}
-
- //double l2 = lambdas.l2norm();
- //if (l2) lambdas /= lambdas.l2norm();
}
-//////////////////////////////////////////////////////////
++ii;
@@ -369,15 +360,19 @@ main(int argc, char** argv)
model_diff = model_avg;
}
if (!quiet) {
- cerr << _p5 << _p << "WEIGHTS" << endl;
- for (vector<string>::iterator it = print_weights.begin(); it != print_weights.end(); it++) {
- cerr << setw(16) << *it << " = " << lambdas.get(FD::Convert(*it)) << endl;
- }
- cerr << " ---" << endl;
- cerr << _np << " 1best avg score: " << score_avg;
- cerr << _p << " (" << score_diff << ")" << endl;
- cerr << _np << "1best avg model score: " << model_avg;
- cerr << _p << " (" << model_diff << ")" << endl;
+ cerr << _p5 << _p << "WEIGHTS" << endl;
+ for (vector<string>::iterator it = print_weights.begin(); it != print_weights.end(); it++) {
+ cerr << setw(18) << *it << " = " << lambdas.get(FD::Convert(*it)) << endl;
+ }
+ cerr << " ---" << endl;
+ cerr << _np << " 1best avg score: " << score_avg;
+ cerr << _p << " (" << score_diff << ")" << endl;
+ cerr << _np << "1best avg model score: " << model_avg;
+ cerr << _p << " (" << model_diff << ")" << endl;
+ cerr << " avg #pairs: ";
+ cerr << _np << npairs/(float)in_sz << endl;
+ cerr << " avg #up: ";
+ cerr << nup/(float)in_sz << endl;
}
pair<score_t,score_t> remember;
remember.first = score_avg;
@@ -412,9 +407,9 @@ main(int argc, char** argv)
unlink(grammar_buf_fn.c_str());
if (!noup) {
- if (!quiet) cerr << endl << "writing weights file to '" << cfg["output"].as<string>() << "' ...";
+ if (!quiet) cerr << endl << "writing weights file to '" << output_fn << "' ..." << endl;
if (select_weights == "last") { // last
- WriteFile out(cfg["output"].as<string>());
+ WriteFile out(output_fn);
ostream& o = *out.stream();
o.precision(17);
o << _np;
@@ -423,9 +418,10 @@ main(int argc, char** argv)
o << FD::Convert(it->first) << '\t' << it->second << endl;
}
if (hstreaming) cout << "__SHARD_COUNT__\t1" << endl;
+ } else if (select_weights == "VOID") { // do nothing
} else { // best
- if (cfg["output"].as<string>() != "-") {
- CopyFile(weights_files[best_it], cfg["output"].as<string>());
+ if (output_fn != "-") {
+ CopyFile(weights_files[best_it], output_fn);
} else {
ReadFile(weights_files[best_it]);
string o;
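
The heart of this patch is the rewritten update loop in dtrain.cc: with gamma=0 it is a plain pairwise perceptron that steps only on a rank error, and with gamma>0 it becomes the SVM variant, adding an L2 regularization step whenever the pair is misranked or falls inside the margin. Below is a minimal, compilable sketch of that logic; ScoredHyp, the map-based weight vector, and the toy pair in main() are simplified stand-ins for dtrain's real SparseVector-based types and its k-best pair sampling, for illustration only.

// Sketch of the pairwise update this patch introduces; types are
// simplified stand-ins for cdec's SparseVector<double>.
#include <iostream>
#include <map>
#include <string>
#include <utility>
#include <vector>

typedef std::map<std::string, double> Weights;

struct ScoredHyp {
  Weights f;     // feature vector of the hypothesis
  double score;  // metric score (e.g. stupid_bleu)
  double model;  // model score under the current weights
};

// w += v * s, mirroring SparseVector::plus_eq_v_times_s in the patch
static void plus_eq_v_times_s(Weights& w, const Weights& v, double s) {
  for (Weights::const_iterator it = v.begin(); it != v.end(); ++it)
    w[it->first] += it->second * s;
}

// dv = hi.f - lo.f, with the same extra __bias feature as the patch
static Weights diff_vec(const ScoredHyp& hi, const ScoredHyp& lo) {
  Weights dv = hi.f;
  for (Weights::const_iterator it = lo.f.begin(); it != lo.f.end(); ++it)
    dv[it->first] -= it->second;
  dv["__bias"] -= 1;
  return dv;
}

// One pass over the sampled pairs: plain perceptron when gamma == 0,
// otherwise the SVM variant with an extra L2 regularization step.
static void update(const std::vector<std::pair<ScoredHyp, ScoredHyp> >& pairs,
                   Weights& lambdas, double eta, double gamma, unsigned& nup) {
  for (unsigned i = 0; i < pairs.size(); i++) {
    const ScoredHyp& a = pairs[i].first;   // ranked higher by the model
    const ScoredHyp& b = pairs[i].second;
    if (!gamma) {
      // perceptron: step only on a rank error (metric prefers b)
      if (a.score - b.score < 0) {
        plus_eq_v_times_s(lambdas, diff_vec(b, a), eta);
        nup++;
      }
    } else {
      // SVM: same step on a rank error ...
      double rank_error = b.score - a.score;
      if (rank_error > 0)
        plus_eq_v_times_s(lambdas, diff_vec(b, a), eta);
      // ... plus an L2 shrinkage step when the metric scores differ
      // or the pair sits inside the margin, as in the patch
      double margin = a.model - b.model;
      if (rank_error || margin < 1) {
        plus_eq_v_times_s(lambdas, Weights(lambdas), -2 * gamma * eta);
        nup++;
      }
    }
  }
}

int main() {
  // one toy pair where the model's top hypothesis loses on the metric
  ScoredHyp a, b;
  a.f["Glue"] = 1; a.score = 0.2; a.model = 0.7;
  b.f["Glue"] = 2; b.score = 0.4; b.model = 0.5;
  std::vector<std::pair<ScoredHyp, ScoredHyp> > pairs;
  pairs.push_back(std::make_pair(a, b));
  Weights lambdas;
  lambdas["__bias"] = 0;
  unsigned nup = 0;
  update(pairs, lambdas, 0.0005, 0.0, nup); // eta = learning_rate, perceptron
  std::cout << "updates: " << nup << ", Glue = " << lambdas["Glue"] << std::endl;
  return 0;
}

With the values above, the rank error b.score > a.score triggers one step: lambdas picks up (b.f - a.f) * eta, i.e. Glue = 0.0005 and __bias = -0.0005, and nup is 1, matching the counters the patch adds for the "avg #pairs" / "avg #up" report.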
diff --git a/dtrain/test/example/dtrain.ini b/dtrain/test/example/dtrain.ini
index 09c876d9..3e5c2cd1 100644
--- a/dtrain/test/example/dtrain.ini
+++ b/dtrain/test/example/dtrain.ini
@@ -1,15 +1,15 @@
decoder_config=test/example/cdec.ini
k=100
N=3
+gamma=0
+#gamma=0.00001
epochs=4
-#input=test/example/nc-1k.gz
input=test/example/nc-1k-tabs.gz
scorer=stupid_bleu
-output=VOID
-#/tmp/weights.gz
+output=-
stop_after=100
sample_from=kbest
pair_sampling=all
+select_weights=VOID
print_weights=Glue WordPenalty LanguageModel LanguageModel_OOV PhraseModel_0 PhraseModel_1 PhraseModel_2 PhraseModel_3 PhraseModel_4 PassThrough
tmp=/tmp
-select_weights=best
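
Note the config idiom this change establishes: discarding the final weights is now expressed through the new select_weights=VOID mode (cf. the updated option help above), not through output=VOID. A short sketch of the resulting variants, with the file path as a placeholder:

# smoke test: skip writing a final weights file entirely
output=-
select_weights=VOID

# keep the last iteration's weights (the default) ...
#output=/tmp/weights.gz
#select_weights=last

# ... or copy over the weights from the best-scoring iteration
#select_weights=best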
diff --git a/dtrain/test/toy/dtrain.ini b/dtrain/test/toy/dtrain.ini
index 3ab4f8d4..5bfa5b2d 100644
--- a/dtrain/test/toy/dtrain.ini
+++ b/dtrain/test/toy/dtrain.ini
@@ -1,8 +1,9 @@
-decoder_config=test/toy_example/cdec.ini
-ksamples=4
-ngrams=3
+decoder_config=test/toy/cdec.ini
+k=4
+N=3
epochs=2
-input=test/toy_example/toy.in
-scorer=bleu
+input=test/toy/in
+scorer=stupid_bleu
+sample_from=forest
output=-
-wprint=logp use_shell use_house PassThrough
+print_weights=logp use_shell use_house PassThrough
diff --git a/dtrain/test/toy/in b/dtrain/test/toy/in
index 63f97158..d7b7d080 100644
--- a/dtrain/test/toy/in
+++ b/dtrain/test/toy/in
@@ -1,2 +1,2 @@
-0 ich sah ein kleines haus i saw a little house [S] ||| [NP,1] [VP,2] ||| [1] [2] ||| logp=0 __NEXT_RULE__ [NP] ||| ich ||| i ||| logp=0 __NEXT_RULE__ [NP] ||| ein [NN,1] ||| a [1] ||| logp=0 __NEXT_RULE__ [NN] ||| [JJ,1] haus ||| [1] house ||| logp=0 use_house=1 __NEXT_RULE__ [NN] ||| [JJ,1] haus ||| [1] shell ||| logp=0 use_shell=1 __NEXT_RULE__ [JJ] ||| kleines ||| small ||| logp=0 __NEXT_RULE__ [JJ] ||| kleines ||| little ||| logp=0 __NEXT_RULE__ [JJ] ||| grosses ||| big ||| logp=0 __NEXT_RULE__ [JJ] ||| grosses ||| large ||| logp=0 __NEXT_RULE__ [VP] ||| [V,1] [NP,2] ||| [1] [2] ||| logp=0 __NEXT_RULE__ [V] ||| sah ||| saw ||| logp=0 __NEXT_RULE__ [V] ||| fand ||| found ||| logp=0
-1 ich fand ein grosses haus i found a large house [S] ||| [NP,1] [VP,2] ||| [1] [2] ||| logp=0 __NEXT_RULE__ [NP] ||| ich ||| i ||| logp=0 __NEXT_RULE__ [NP] ||| ein [NN,1] ||| a [1] ||| logp=0 __NEXT_RULE__ [NN] ||| [JJ,1] haus ||| [1] house ||| logp=0 use_house=1 __NEXT_RULE__ [NN] ||| [JJ,1] haus ||| [1] shell ||| logp=0 use_shell=1 __NEXT_RULE__ [JJ] ||| kleines ||| small ||| logp=0 __NEXT_RULE__ [JJ] ||| kleines ||| little ||| logp=0 __NEXT_RULE__ [JJ] ||| grosses ||| big ||| logp=0 __NEXT_RULE__ [JJ] ||| grosses ||| large ||| logp=0 __NEXT_RULE__ [VP] ||| [V,1] [NP,2] ||| [1] [2] ||| logp=0 __NEXT_RULE__ [V] ||| sah ||| saw ||| logp=0 __NEXT_RULE__ [V] ||| fand ||| found ||| logp=0
+0 ich sah ein kleines haus i saw a little house [S] ||| [NP,1] [VP,2] ||| [1] [2] ||| logp=0 [NP] ||| ich ||| i ||| logp=0 [NP] ||| ein [NN,1] ||| a [1] ||| logp=0 [NN] ||| [JJ,1] haus ||| [1] house ||| logp=0 use_house=1 [NN] ||| [JJ,1] haus ||| [1] shell ||| logp=0 use_shell=1 [JJ] ||| kleines ||| small ||| logp=0 [JJ] ||| kleines ||| little ||| logp=0 [JJ] ||| grosses ||| big ||| logp=0 [JJ] ||| grosses ||| large ||| logp=0 [VP] ||| [V,1] [NP,2] ||| [1] [2] ||| logp=0 [V] ||| sah ||| saw ||| logp=0 [V] ||| fand ||| found ||| logp=0
+1 ich fand ein grosses haus i found a large house [S] ||| [NP,1] [VP,2] ||| [1] [2] ||| logp=0 [NP] ||| ich ||| i ||| logp=0 [NP] ||| ein [NN,1] ||| a [1] ||| logp=0 [NN] ||| [JJ,1] haus ||| [1] house ||| logp=0 use_house=1 [NN] ||| [JJ,1] haus ||| [1] shell ||| logp=0 use_shell=1 [JJ] ||| kleines ||| small ||| logp=0 [JJ] ||| kleines ||| little ||| logp=0 [JJ] ||| grosses ||| big ||| logp=0 [JJ] ||| grosses ||| large ||| logp=0 [VP] ||| [V,1] [NP,2] ||| [1] [2] ||| logp=0 [V] ||| sah ||| saw ||| logp=0 [V] ||| fand ||| found ||| logp=0
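
The toy input drops the __NEXT_RULE__ separators between grammar rules. Judging by the renamed example corpus (nc-1k-tabs.gz), the fields and the individual rules are now tab-separated, though the tabs are not visible in this rendering. A hedged parsing sketch under that assumption; the field layout (id, source, reference, then one rule per field) is inferred from the lines above, not confirmed by the patch:

// Hedged sketch: read one line of the new toy input format, assuming
// tab-separated fields: id, source, reference, then grammar rules
// (an assumption based on the dropped __NEXT_RULE__ markers and the
// nc-1k-tabs.gz file name).
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

int main() {
  std::string line;
  while (std::getline(std::cin, line)) {
    std::vector<std::string> fields;
    std::istringstream ss(line);
    std::string field;
    while (std::getline(ss, field, '\t'))
      fields.push_back(field);
    if (fields.size() < 4) continue; // need id, src, ref + >= 1 rule
    std::cout << "id:  " << fields[0] << std::endl;
    std::cout << "src: " << fields[1] << std::endl;
    std::cout << "ref: " << fields[2] << std::endl;
    for (unsigned i = 3; i < fields.size(); i++)
      std::cout << "rule: " << fields[i] << std::endl;
  }
  return 0;
}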