Diffstat (limited to 'dtrain')
-rw-r--r--   dtrain/Makefile.am             |  2
-rw-r--r--   dtrain/NEXT                    | 11
-rw-r--r--   dtrain/README.md               | 29
-rw-r--r--   dtrain/dtrain.cc               | 53
-rw-r--r--   dtrain/dtrain.h                | 13
-rw-r--r--   dtrain/kbestget.h              |  6
-rw-r--r--   dtrain/ksampler.h              |  2
-rw-r--r--   dtrain/score.cc                | 34
-rw-r--r--   dtrain/test/example/dtrain.ini | 22
9 files changed, 97 insertions, 75 deletions
diff --git a/dtrain/Makefile.am b/dtrain/Makefile.am
index 471977e1..64fef489 100644
--- a/dtrain/Makefile.am
+++ b/dtrain/Makefile.am
@@ -3,5 +3,5 @@ bin_PROGRAMS = dtrain
 dtrain_SOURCES = dtrain.cc score.cc
 dtrain_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz
-AM_CPPFLAGS = -W -Wall -Wno-sign-compare -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval -O3
+AM_CPPFLAGS = -W -Wall -Wno-sign-compare -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval
diff --git a/dtrain/NEXT b/dtrain/NEXT
index 24939cf3..eccfb313 100644
--- a/dtrain/NEXT
+++ b/dtrain/NEXT
@@ -1,6 +1,7 @@
-cuda vecs?
-target side rule ngrams
-decoder meta-parameters testing
-cdyer -> sa-extract -> loo?
-reranking while sgd
+make svm faster (cuda)?
+ other learning frameworks
+target side rule ngram feature template
+decoder meta-parameters test
+sa-extract -> leave-one-out?
+rerank while sgd?
diff --git a/dtrain/README.md b/dtrain/README.md
index f4e1abed..2a24ec22 100644
--- a/dtrain/README.md
+++ b/dtrain/README.md
@@ -1,13 +1,20 @@
-This is a simple (but parallelizable) tuning method for cdec, as used here:
+This is a simple (and parallelizable) tuning method for cdec
+which is able to train the weights of very many (sparse) features.
+It was used here:
   "Joint Feature Selection in Distributed Stochastic
    Learning for Large-Scale Discriminative Training in
-   SMT" Simianer, Riezler, Dyer
-   ACL 2012
+   SMT" Simianer, Riezler, Dyer; ACL 2012
 
 Building
 --------
-builds when building cdec, see ../BUILDING
+Builds when building cdec, see ../BUILDING .
+To build only parts needed for dtrain do
+```
+ autoreconf -ifv
+ ./configure [--disable-test]
+ cd dtrain/; make
+```
 
 Running
 -------
@@ -15,10 +22,10 @@ To run this on a dev set locally:
 ```
 #define DTRAIN_LOCAL
 ```
-otherwise remove that line or undef. You need a single grammar file
-or per-sentence-grammars (psg) as you would use with cdec.
-Additionally you need to give dtrain a file with
-references (--refs).
+otherwise remove that line or undef, then recompile. You need a single
+grammar file or input annotated with per-sentence grammars (psg) as you
+would use with cdec. Additionally you need to give dtrain a file with
+references (--refs) when running locally.
 
 The input for use with hadoop streaming looks like this:
 ```
@@ -27,12 +34,12 @@ The input for use with hadoop streaming looks like this:
 ```
 To convert a psg to this format you need to replace all "\n" by "\t".
 Make sure there are no tabs in your data.
 
-For an example of local usage (with 'distributed' format)
+For an example of local usage (with the 'distributed' format)
 the see test/example/ . This expects dtrain to be built without DTRAIN_LOCAL.
 
-Legal stuff
------------
+Legal
+-----
 Copyright (c) 2012 by Patrick Simianer <p@simianer.de>
 
 See the file ../LICENSE.txt for the licensing terms that this software is
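The README hunk above only names the conversion rule for the 'distributed' input format: replace every newline of a per-sentence grammar with a tab. A minimal standalone sketch of that step (not part of this commit; it assumes stdin holds the grammar of exactly one sentence and leaves out the other fields of an input line):

```
// hypothetical helper, not shipped with dtrain:
// flatten a per-sentence grammar to one line, "\n" -> "\t"
#include <iostream>
#include <string>

int main()
{
  std::string flat, line;
  while (std::getline(std::cin, line)) {
    if (!flat.empty()) flat += '\t';
    flat += line; // per the README, the data itself must contain no tabs
  }
  std::cout << flat << std::endl;
  return 0;
}
```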
diff --git a/dtrain/dtrain.cc b/dtrain/dtrain.cc
index fb6c6880..e7a1244c 100644
--- a/dtrain/dtrain.cc
+++ b/dtrain/dtrain.cc
@@ -29,8 +29,8 @@ dtrain_init(int argc, char** argv, po::variables_map* cfg)
     ("rescale",         po::value<bool>()->zero_tokens(),           "rescale weight vector after each input")
     ("l1_reg",          po::value<string>()->default_value("none"), "apply l1 regularization as in 'Tsuroka et al' (2010)")
     ("l1_reg_strength", po::value<weight_t>(),                      "l1 regularization strength")
-    ("funny",           po::value<bool>()->zero_tokens(),           "include correctly ranked pairs into updates")
-    ("fselect",         po::value<weight_t>()->default_value(-1),   "select top x percent of features after each epoch")
+    ("inc_correct",     po::value<bool>()->zero_tokens(),           "include correctly ranked pairs into updates")
+    ("fselect",         po::value<weight_t>()->default_value(-1),   "TODO select top x percent of features after each epoch")
 #ifdef DTRAIN_LOCAL
     ("refs,r",          po::value<string>(),                        "references in local mode")
 #endif
@@ -113,9 +113,9 @@ main(int argc, char** argv)
   HSReporter rep(task_id);
   bool keep = false;
   if (cfg.count("keep")) keep = true;
-  bool funny = false;
-  if (cfg.count("funny"))
-    funny = true;
+  bool inc_correct = false;
+  if (cfg.count("inc_correct"))
+    inc_correct = true;
 
   const unsigned k = cfg["k"].as<unsigned>();
   const unsigned N = cfg["N"].as<unsigned>();
@@ -158,10 +158,9 @@ main(int argc, char** argv)
   }
   vector<score_t> bleu_weights;
   scorer->Init(N, bleu_weights);
-  if (!quiet) cerr << setw(26) << "scorer '" << scorer_str << "'" << endl << endl;
 
   // setup decoder observer
-  MT19937 rng; // random number generator
+  MT19937 rng; // random number generator, only for forest sampling
   HypSampler* observer;
   if (sample_from == "kbest")
     observer = dynamic_cast<KBestGetter*>(new KBestGetter(k, filter_type));
@@ -225,6 +224,7 @@ main(int argc, char** argv)
     cerr << setw(25) << "k " << k << endl;
     cerr << setw(25) << "N " << N << endl;
     cerr << setw(25) << "T " << T << endl;
+    cerr << setw(25) << "scorer '" << scorer_str << "'" << endl;
     cerr << setw(25) << "sample from " << "'" << sample_from << "'" << endl;
     if (sample_from == "kbest")
       cerr << setw(25) << "filter " << "'" << filter_type << "'" << endl;
@@ -235,8 +235,8 @@ main(int argc, char** argv)
     cerr << setw(25) << "select weights " << "'" << select_weights << "'" << endl;
     if (cfg.count("l1_reg"))
       cerr << setw(25) << "l1 reg " << l1_reg << " '" << cfg["l1_reg"].as<string>() << "'" << endl;
-    if (funny)
-      cerr << setw(25) << "funny " << funny << endl;
+    if (inc_correct)
+      cerr << setw(25) << "inc. correct " << inc_correct << endl;
     if (rescale)
       cerr << setw(25) << "rescale " << rescale << endl;
     cerr << setw(25) << "cdec cfg " << "'" << cfg["decoder_config"].as<string>() << "'" << endl;
@@ -246,7 +246,7 @@ main(int argc, char** argv)
 #endif
     cerr << setw(25) << "output " << "'" << output_fn << "'" << endl;
     if (cfg.count("input_weights"))
-      cerr << setw(25) << "weights in" << cfg["input_weights"].as<string>() << endl;
+      cerr << setw(25) << "weights in " << "'" << cfg["input_weights"].as<string>() << "'" << endl;
     if (cfg.count("stop-after"))
       cerr << setw(25) << "stop_after " << stop_after << endl;
     if (!verbose) cerr << "(a dot represents " << DTRAIN_DOTS << " lines of input)" << endl;
@@ -279,7 +279,7 @@ main(int argc, char** argv)
     } else {
       if (ii == in_sz) next = true; // stop if we reach the end of our input
     }
-    // stop after X sentences (but still iterate for those)
+    // stop after X sentences (but still go on for those)
     if (stop_after > 0 && stop_after == ii && !next) stop = true;
 
     // produce some pretty output
@@ -323,14 +323,17 @@ main(int argc, char** argv)
         register_and_convert(ref_tok, ref_ids);
         ref_ids_buf.push_back(ref_ids);
         // process and set grammar
-        bool broken_grammar = true;
+        bool broken_grammar = true; // ignore broken grammars
         for (string::iterator it = in.begin(); it != in.end(); it++) {
           if (!isspace(*it)) {
             broken_grammar = false;
             break;
           }
         }
-        if (broken_grammar) continue;
+        if (broken_grammar) {
+          cerr << "Broken grammar for " << ii+1 << "! Ignoring this input." << endl;
+          continue;
+        }
         boost::replace_all(in, "\t", "\n");
         in += "\n";
         grammar_buf_out << in << DTRAIN_GRAMMAR_DELIM << " " << in_split[0] << endl;
@@ -389,7 +392,7 @@ main(int argc, char** argv)
       }
     }
 
-    score_sum += (*samples)[0].score;
+    score_sum += (*samples)[0].score; // stats for 1best
     model_sum += (*samples)[0].model;
 
     // weight updates
@@ -415,7 +418,7 @@ main(int argc, char** argv)
           lambdas.plus_eq_v_times_s(diff_vec, eta);
           rank_errors++;
         } else {
-          if (funny) {
+          if (inc_correct) {
             SparseVector<weight_t> diff_vec = it->first.f - it->second.f;
             lambdas.plus_eq_v_times_s(diff_vec, eta);
           }
@@ -453,7 +456,7 @@ main(int argc, char** argv)
           }
         }
       } else if (l1cumul) {
-        weight_t acc_penalty = (ii+1) * l1_reg; // ii is the index of the current input
+        weight_t acc_penalty = (ii+1) * l1_reg; // Note: ii is the index of the current input
         for (unsigned d = 0; d < lambdas.size(); d++) {
           if (lambdas.nonzero(d)) {
             weight_t v = lambdas.get(d);
@@ -515,7 +518,7 @@ main(int argc, char** argv)
     model_diff = model_avg;
   }
 
-  unsigned nonz;
+  unsigned nonz = 0;
   if (!quiet || hstreaming) nonz = (unsigned)lambdas.size_nonzero();
 
   if (!quiet) {
@@ -524,18 +527,18 @@ main(int argc, char** argv)
       cerr << setw(18) << *it << " = " << lambdas.get(FD::Convert(*it)) << endl;
     }
     cerr << " ---" << endl;
-    cerr << _np << " 1best avg score: " << score_avg;
+    cerr << _np << "       1best avg score: " << score_avg;
     cerr << _p << " (" << score_diff << ")" << endl;
-    cerr << _np << "1best avg model score: " << model_avg;
+    cerr << _np << " 1best avg model score: " << model_avg;
     cerr << _p << " (" << model_diff << ")" << endl;
-    cerr << "         avg #pairs: ";
+    cerr << "           avg # pairs: ";
     cerr << _np << npairs/(float)in_sz << endl;
-    cerr << "      avg #rank err: ";
+    cerr << "        avg # rank err: ";
     cerr << rank_errors/(float)in_sz << endl;
-    cerr << "   avg #margin viol: ";
+    cerr << "     avg # margin viol: ";
     cerr << margin_violations/(float)in_sz << endl;
-    cerr << " non0 feature count: " << nonz << endl;
-    cerr << "        avg f count: ";
+    cerr << "    non0 feature count: " << nonz << endl;
+    cerr << "           avg f count: ";
     cerr << feature_count/(float)pair_count << endl;
   }
@@ -628,7 +631,5 @@ main(int argc, char** argv)
     cerr << best_it+1 << " [SCORE '" << scorer_str << "'=" << max_score << "]." << endl;
     cerr << _p2 << "This took " << overall_time/60. << " min." << endl;
   }
-
-  return 0;
 }
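The `acc_penalty = (ii+1) * l1_reg` line touched above drives the cumulative L1 mode named in the option help ('Tsuroka et al'). As a reading aid, a sketch of the clipping step under assumed names (`w` and `paid` are illustrative; dtrain operates on its `SparseVector` of lambdas rather than the dense vector used here):

```
#include <algorithm>
#include <vector>

typedef double weight_t;

// Sketch (not dtrain code) of cumulative L1 clipping: a weight may be
// moved toward zero by at most the penalty budget accumulated so far
// (acc_penalty) minus what this dimension has already paid.
void
l1_cumul_clip(std::vector<weight_t>& w, std::vector<weight_t>& paid,
              weight_t acc_penalty) // e.g. (ii+1) * l1_reg
{
  for (size_t d = 0; d < w.size(); d++) {
    weight_t v = w[d];
    if (v > 0)
      w[d] = std::max((weight_t)0, v - (acc_penalty + paid[d]));
    else if (v < 0)
      w[d] = std::min((weight_t)0, v + (acc_penalty - paid[d]));
    paid[d] += w[d] - v; // record the penalty actually applied
  }
}
```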
correct " << inc_correct << endl; if (rescale) cerr << setw(25) << "rescale " << rescale << endl; cerr << setw(25) << "cdec cfg " << "'" << cfg["decoder_config"].as<string>() << "'" << endl; @@ -246,7 +246,7 @@ main(int argc, char** argv) #endif cerr << setw(25) << "output " << "'" << output_fn << "'" << endl; if (cfg.count("input_weights")) - cerr << setw(25) << "weights in" << cfg["input_weights"].as<string>() << endl; + cerr << setw(25) << "weights in " << "'" << cfg["input_weights"].as<string>() << "'" << endl; if (cfg.count("stop-after")) cerr << setw(25) << "stop_after " << stop_after << endl; if (!verbose) cerr << "(a dot represents " << DTRAIN_DOTS << " lines of input)" << endl; @@ -279,7 +279,7 @@ main(int argc, char** argv) } else { if (ii == in_sz) next = true; // stop if we reach the end of our input } - // stop after X sentences (but still iterate for those) + // stop after X sentences (but still go on for those) if (stop_after > 0 && stop_after == ii && !next) stop = true; // produce some pretty output @@ -323,14 +323,17 @@ main(int argc, char** argv) register_and_convert(ref_tok, ref_ids); ref_ids_buf.push_back(ref_ids); // process and set grammar - bool broken_grammar = true; + bool broken_grammar = true; // ignore broken grammars for (string::iterator it = in.begin(); it != in.end(); it++) { if (!isspace(*it)) { broken_grammar = false; break; } } - if (broken_grammar) continue; + if (broken_grammar) { + cerr << "Broken grammar for " << ii+1 << "! Ignoring this input." << endl; + continue; + } boost::replace_all(in, "\t", "\n"); in += "\n"; grammar_buf_out << in << DTRAIN_GRAMMAR_DELIM << " " << in_split[0] << endl; @@ -389,7 +392,7 @@ main(int argc, char** argv) } } - score_sum += (*samples)[0].score; + score_sum += (*samples)[0].score; // stats for 1best model_sum += (*samples)[0].model; // weight updates @@ -415,7 +418,7 @@ main(int argc, char** argv) lambdas.plus_eq_v_times_s(diff_vec, eta); rank_errors++; } else { - if (funny) { + if (inc_correct) { SparseVector<weight_t> diff_vec = it->first.f - it->second.f; lambdas.plus_eq_v_times_s(diff_vec, eta); } @@ -453,7 +456,7 @@ main(int argc, char** argv) } } } else if (l1cumul) { - weight_t acc_penalty = (ii+1) * l1_reg; // ii is the index of the current input + weight_t acc_penalty = (ii+1) * l1_reg; // Note: ii is the index of the current input for (unsigned d = 0; d < lambdas.size(); d++) { if (lambdas.nonzero(d)) { weight_t v = lambdas.get(d); @@ -515,7 +518,7 @@ main(int argc, char** argv) model_diff = model_avg; } - unsigned nonz; + unsigned nonz = 0; if (!quiet || hstreaming) nonz = (unsigned)lambdas.size_nonzero(); if (!quiet) { @@ -524,18 +527,18 @@ main(int argc, char** argv) cerr << setw(18) << *it << " = " << lambdas.get(FD::Convert(*it)) << endl; } cerr << " ---" << endl; - cerr << _np << " 1best avg score: " << score_avg; + cerr << _np << " 1best avg score: " << score_avg; cerr << _p << " (" << score_diff << ")" << endl; - cerr << _np << "1best avg model score: " << model_avg; + cerr << _np << " 1best avg model score: " << model_avg; cerr << _p << " (" << model_diff << ")" << endl; - cerr << " avg #pairs: "; + cerr << " avg # pairs: "; cerr << _np << npairs/(float)in_sz << endl; - cerr << " avg #rank err: "; + cerr << " avg # rank err: "; cerr << rank_errors/(float)in_sz << endl; - cerr << " avg #margin viol: "; + cerr << " avg # margin viol: "; cerr << margin_violations/(float)in_sz << endl; - cerr << " non0 feature count: " << nonz << endl; - cerr << " avg f count: "; + cerr << " non0 feature count: " << 
nonz << endl; + cerr << " avg f count: "; cerr << feature_count/(float)pair_count << endl; } @@ -628,7 +631,5 @@ main(int argc, char** argv) cerr << best_it+1 << " [SCORE '" << scorer_str << "'=" << max_score << "]." << endl; cerr << _p2 << "This took " << overall_time/60. << " min." << endl; } - - return 0; } diff --git a/dtrain/dtrain.h b/dtrain/dtrain.h index 783aa179..61d60657 100644 --- a/dtrain/dtrain.h +++ b/dtrain/dtrain.h @@ -1,5 +1,5 @@ -#ifndef _DTRAIN_COMMON_H_ -#define _DTRAIN_COMMON_H_ +#ifndef _DTRAIN_H_ +#define _DTRAIN_H_ #include <iomanip> #include <climits> @@ -13,9 +13,9 @@ #include "filelib.h" -#define DTRAIN_LOCAL +//#define DTRAIN_LOCAL -#define DTRAIN_DOTS 10 // when to display a '.' +#define DTRAIN_DOTS 10 // after how many inputs to display a '.' #define DTRAIN_GRAMMAR_DELIM "########EOS########" #define DTRAIN_SCALE 100000 @@ -35,7 +35,10 @@ inline string gettmpf(const string path, const string infix) { strcat(fn, "/"); strcat(fn, infix.c_str()); strcat(fn, "-XXXXXX"); - mkstemp(fn); + if (!mkstemp(fn)) { + cerr << "Cannot make temp file in" << path << " , exiting." << endl; + exit(1); + } return string(fn); } diff --git a/dtrain/kbestget.h b/dtrain/kbestget.h index 1b96bbf4..0c2da994 100644 --- a/dtrain/kbestget.h +++ b/dtrain/kbestget.h @@ -46,7 +46,7 @@ struct LocalScorer } inline score_t - brevity_penaly(const unsigned hyp_len, const unsigned ref_len) + brevity_penalty(const unsigned hyp_len, const unsigned ref_len) { if (hyp_len > ref_len) return 1; return exp(1 - (score_t)ref_len/hyp_len); @@ -61,7 +61,7 @@ struct HypSampler : public DecoderObserver inline void SetScorer(LocalScorer* scorer) { scorer_ = scorer; } inline void SetRef(vector<WordID>& ref) { ref_ = &ref; } }; -/////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////// @@ -76,7 +76,7 @@ struct KBestGetter : public HypSampler k_(k), filter_type_(filter_type) {} virtual void - NotifyTranslationForest(const SentenceMetadata& smeta, Hypergraph* hg) + NotifyTranslationForest(const SentenceMetadata& /*smeta*/, Hypergraph* hg) { KBestScored(*hg); } diff --git a/dtrain/ksampler.h b/dtrain/ksampler.h index 8b1c09f2..c45c8f64 100644 --- a/dtrain/ksampler.h +++ b/dtrain/ksampler.h @@ -20,7 +20,7 @@ struct KSampler : public HypSampler k_(k), prng_(prng) {} virtual void - NotifyTranslationForest(const SentenceMetadata& smeta, Hypergraph* hg) + NotifyTranslationForest(const SentenceMetadata& /*smeta*/, Hypergraph* hg) { ScoredSamples(*hg); } diff --git a/dtrain/score.cc b/dtrain/score.cc index 4cde638a..ec844437 100644 --- a/dtrain/score.cc +++ b/dtrain/score.cc @@ -24,12 +24,12 @@ BleuScorer::Bleu(NgramCounts& counts, const unsigned hyp_len, const unsigned ref if (counts.clipped[i] == 0 || counts.sum[i] == 0) return 0; sum += w_[i] * log((score_t)counts.clipped[i]/counts.sum[i]); } - return brevity_penaly(hyp_len, ref_len) * exp(sum); + return brevity_penalty(hyp_len, ref_len) * exp(sum); } score_t BleuScorer::Score(vector<WordID>& hyp, vector<WordID>& ref, - const unsigned rank) + const unsigned /*rank*/) { unsigned hyp_len = hyp.size(), ref_len = ref.size(); if (hyp_len == 0 || ref_len == 0) return 0; @@ -49,7 +49,7 @@ BleuScorer::Score(vector<WordID>& hyp, vector<WordID>& ref, */ score_t StupidBleuScorer::Score(vector<WordID>& hyp, vector<WordID>& ref, - const unsigned rank) + const unsigned /*rank*/) { unsigned hyp_len = hyp.size(), ref_len = ref.size(); if (hyp_len == 0 || ref_len == 0) return 0; @@ 
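For reference, the brevity penalty that kbestget.h now spells correctly, and that every scorer in score.cc below multiplies into its result, is (with h the hypothesis length and r the reference length):

```
\mathrm{BP}(h, r) = \begin{cases} 1 & \text{if } h > r \\ e^{\,1 - r/h} & \text{otherwise} \end{cases}
```

For example, a 9-token hypothesis against a 10-token reference is damped by e^(1 - 10/9) ≈ 0.89.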
@@ -58,10 +58,11 @@ StupidBleuScorer::Score(vector<WordID>& hyp, vector<WordID>& ref,
   if (ref_len < N_) M = ref_len;
   score_t sum = 0, add = 0;
   for (unsigned i = 0; i < M; i++) {
+    if (i == 0 && (counts.clipped[i] == 0 || counts.sum[i] == 0)) return 0;
     if (i == 1) add = 1;
     sum += w_[i] * log(((score_t)counts.clipped[i] + add)/((counts.sum[i] + add)));
   }
-  return brevity_penaly(hyp_len, ref_len) * exp(sum);
+  return brevity_penalty(hyp_len, ref_len) * exp(sum);
 }
 
 /*
@@ -75,19 +76,28 @@ StupidBleuScorer::Score(vector<WordID>& hyp, vector<WordID>& ref,
  */
 score_t
 SmoothBleuScorer::Score(vector<WordID>& hyp, vector<WordID>& ref,
-                        const unsigned rank)
+                        const unsigned /*rank*/)
 {
   unsigned hyp_len = hyp.size(), ref_len = ref.size();
   if (hyp_len == 0 || ref_len == 0) return 0;
   NgramCounts counts = make_ngram_counts(hyp, ref, N_);
-  score_t sum = 0;
-  unsigned j = 1;
-  for (unsigned i = 0; i < N_; i++) {
-    if (counts.clipped[i] == 0 || counts.sum[i] == 0) continue;
-    sum += exp((w_[i] * log((score_t)counts.clipped[i]/counts.sum[i])))/pow(2, N_-j+1);
-    j++;
+  unsigned M = N_;
+  if (ref_len < N_) M = ref_len;
+  score_t sum = 0.;
+  vector<score_t> i_bleu;
+  for (unsigned i = 0; i < M; i++) i_bleu.push_back(0.);
+  for (unsigned i = 0; i < M; i++) {
+    if (counts.clipped[i] == 0 || counts.sum[i] == 0) {
+      break;
+    } else {
+      score_t i_ng = log((score_t)counts.clipped[i]/counts.sum[i]);
+      for (unsigned j = i; j < M; j++) {
+        i_bleu[j] += (1/((score_t)j+1)) * i_ng;
+      }
+    }
+    sum += exp(i_bleu[i])/(pow(2, N_-i));
   }
-  return brevity_penaly(hyp_len, ref_len) * sum;
+  return brevity_penalty(hyp_len, ref_len) * sum;
 }
 
 /*
diff --git a/dtrain/test/example/dtrain.ini b/dtrain/test/example/dtrain.ini
index 68173e11..66be6bf2 100644
--- a/dtrain/test/example/dtrain.ini
+++ b/dtrain/test/example/dtrain.ini
@@ -1,20 +1,20 @@
 input=test/example/nc-wmt11.1k.gz # use '-' for stdin
-output=- # a weights file or stdout
-decoder_config=test/example/cdec.ini # ini for cdec
-# these will be printed on each iteration
+output=weights.gz # a weights file (add .gz for gzip compression) or STDOUT '-'
+decoder_config=test/example/cdec.ini # config for cdec
+# weights for these features will be printed on each iteration
 print_weights=Glue WordPenalty LanguageModel LanguageModel_OOV PhraseModel_0 PhraseModel_1 PhraseModel_2 PhraseModel_3 PhraseModel_4 PhraseModel_5 PhraseModel_6 PassThrough
 tmp=/tmp
-stop_after=10 # stop iteration after 10 inputs
+stop_after=100 # stop epoch after 10 inputs
 # interesting stuff
-epochs=3 # run over input 3 times
-k=200 # use 100best lists
-N=4 # optimize (approx) BLEU4
+epochs=100 # run over input 3 times
+k=100 # use 100best lists
+N=4 # optimize (approx) BLEU4
 learning_rate=0.0001 # learning rate
-gamma=0.00001 # use SVM reg
-scorer=stupid_bleu # use stupid BLEU+1 approx.
+gamma=0 # use SVM reg
+scorer=smooth_bleu # use smooth BLEU of (Liang et al. '06)
 sample_from=kbest # use kbest lists (as opposed to forest)
-filter=uniq # only uniq entries in kbest
+filter=uniq # only unique entries in kbest (surface form)
 pair_sampling=108010 # 10 vs 80 vs 10 and 80 vs 10
-pair_threshold=0 # minimum distance in BLEU
+pair_threshold=0 # minimum distance in BLEU (this will still only use pairs with diff > 0)
 select_weights=last # just output last weights
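The rewritten SmoothBleuScorer above no longer skips zero-count orders: for each order n up to M it accumulates the geometric mean of the first n n-gram precisions (`i_bleu`), then sums these per-order BLEU scores weighted by 2^-(N-n+1) (with N=4: 1/2 for the 4-gram score down to 1/16 for the unigram score), before applying the brevity penalty. A self-contained sketch of the same computation with hypothetical counts (`smooth_bleu` and its inputs are illustrative stand-ins, not dtrain's classes):

```
#include <cmath>
#include <iostream>
#include <vector>

double brevity_penalty(unsigned hyp_len, unsigned ref_len)
{
  if (hyp_len > ref_len) return 1.;
  return exp(1. - (double)ref_len/hyp_len);
}

// interpolated BLEU as in the score.cc hunk above; clipped[i]/sum[i]
// is the (i+1)-gram precision of the hypothesis against the reference
double smooth_bleu(const std::vector<unsigned>& clipped,
                   const std::vector<unsigned>& sum,
                   unsigned hyp_len, unsigned ref_len, unsigned N)
{
  unsigned M = (ref_len < N) ? ref_len : N;
  std::vector<double> i_bleu(M, 0.);
  double total = 0.;
  for (unsigned i = 0; i < M; i++) {
    if (clipped[i] == 0 || sum[i] == 0) break; // unreachable orders score 0
    double i_ng = log((double)clipped[i]/sum[i]);
    for (unsigned j = i; j < M; j++)
      i_bleu[j] += i_ng/(j+1.); // running average of log precisions 1..j+1
    total += exp(i_bleu[i])/pow(2., (double)(N-i));
  }
  return brevity_penalty(hyp_len, ref_len) * total;
}

int main()
{
  // hypothetical counts for a 10-token hypothesis vs. an 11-token reference
  std::vector<unsigned> clipped(4), sum(4);
  clipped[0] = 8; clipped[1] = 5; clipped[2] = 3; clipped[3] = 1;
  sum[0] = 10; sum[1] = 9; sum[2] = 8; sum[3] = 7;
  std::cout << smooth_bleu(clipped, sum, 10, 11, 4) << std::endl;
  return 0;
}
```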