9 files changed, 95 insertions, 105 deletions
diff --git a/dtrain/dtrain.cc b/dtrain/dtrain.cc
index 0481cf96..44090242 100644
--- a/dtrain/dtrain.cc
+++ b/dtrain/dtrain.cc
@@ -6,23 +6,24 @@ dtrain_init(int argc, char** argv, po::variables_map* cfg)
 {
   po::options_description ini("Configuration File Options");
   ini.add_options()
-    ("input",          po::value<string>()->default_value("-"),                          "input file")
-    ("output",         po::value<string>()->default_value("-"),       "output weights file (or VOID)")
-    ("input_weights",  po::value<string>(),       "input weights file (e.g. from previous iteration)")
-    ("decoder_config", po::value<string>(),                             "configuration file for cdec")
-    ("k",              po::value<size_t>()->default_value(100), "size of kbest or sample from forest")
-    ("sample_from",    po::value<string>()->default_value("kbest"),  "where to get translations from")
-    ("filter",         po::value<string>()->default_value("unique"),              "filter kbest list")
-    ("pair_sampling",  po::value<string>()->default_value("all"),    "how to sample pairs: all, rand")
-    ("N",              po::value<size_t>()->default_value(3),                          "N for Ngrams")
-    ("epochs",         po::value<size_t>()->default_value(2),                     "# of iterations T") 
-    ("scorer",         po::value<string>()->default_value("stupid_bleu"),            "scoring metric")
-    ("stop_after",     po::value<size_t>()->default_value(0),          "stop after X input sentences")
-    ("print_weights",  po::value<string>(),                      "weights to print on each iteration")
-    ("hstreaming",     po::value<bool>()->zero_tokens(),               "run in hadoop streaming mode")
-    ("learning_rate",  po::value<double>()->default_value(0.0005),                    "learning rate")
-    ("gamma",          po::value<double>()->default_value(0.),     "gamma for SVM (0 for perceptron)")
-    ("noup",           po::value<bool>()->zero_tokens(),                      "do not update weights");
+    ("input",          po::value<string>()->default_value("-"),                            "input file")
+    ("output",         po::value<string>()->default_value("-"),         "output weights file (or VOID)")
+    ("input_weights",  po::value<string>(),         "input weights file (e.g. from previous iteration)")
+    ("decoder_config", po::value<string>(),                               "configuration file for cdec")
+    ("k",              po::value<unsigned>()->default_value(100), "size of kbest or sample from forest")
+    ("sample_from",    po::value<string>()->default_value("kbest"),    "where to get translations from")
+    ("filter",         po::value<string>()->default_value("unique"),                "filter kbest list")
+    ("pair_sampling",  po::value<string>()->default_value("all"),      "how to sample pairs: all, rand")
+    ("N",              po::value<unsigned>()->default_value(3),                          "N for Ngrams")
+    ("epochs",         po::value<unsigned>()->default_value(2),                     "# of iterations T") 
+    ("scorer",         po::value<string>()->default_value("stupid_bleu"),              "scoring metric")
+    ("stop_after",     po::value<unsigned>()->default_value(0),          "stop after X input sentences")
+    ("print_weights",  po::value<string>(),                        "weights to print on each iteration")
+    ("hstreaming",     po::value<bool>()->zero_tokens(),                 "run in hadoop streaming mode")
+    ("learning_rate",  po::value<double>()->default_value(0.0005),                      "learning rate")
+    ("gamma",          po::value<double>()->default_value(0.),       "gamma for SVM (0 for perceptron)")
+    ("tmp",            po::value<string>()->default_value("/tmp"),                    "temp dir to use") // FIXME
+    ("noup",           po::value<bool>()->zero_tokens(),                        "do not update weights");
   po::options_description cl("Command Line Options");
   cl.add_options()
     ("config,c",         po::value<string>(),              "dtrain config file")
@@ -75,10 +76,10 @@ main(int argc, char** argv)
     hstreaming = true;
     quiet = true;
   }
-  const size_t k = cfg["k"].as<size_t>();
-  const size_t N = cfg["N"].as<size_t>(); 
-  const size_t T = cfg["epochs"].as<size_t>();
-  const size_t stop_after = cfg["stop_after"].as<size_t>();
+  const unsigned k = cfg["k"].as<unsigned>();
+  const unsigned N = cfg["N"].as<unsigned>(); 
+  const unsigned T = cfg["epochs"].as<unsigned>();
+  const unsigned stop_after = cfg["stop_after"].as<unsigned>();
   const string filter_type = cfg["filter"].as<string>();
   const string sample_from = cfg["sample_from"].as<string>();
   const string pair_sampling = cfg["pair_sampling"].as<string>();
@@ -105,7 +106,7 @@ main(int argc, char** argv)
 
   // scoring metric/scorer
   string scorer_str = cfg["scorer"].as<string>();
-  score_t (*scorer)(NgramCounts&, const size_t, const size_t, size_t, vector<score_t>);
+  score_t (*scorer)(NgramCounts&, const unsigned, const unsigned, unsigned, vector<score_t>);
   if (scorer_str == "bleu") {
     scorer = &bleu;
   } else if (scorer_str == "stupid_bleu") {
@@ -119,8 +120,8 @@ main(int argc, char** argv)
     exit(1);
   }
   NgramCounts global_counts(N); // counts for 1 best translations
-  size_t global_hyp_len = 0;    // sum hypothesis lengths
-  size_t global_ref_len = 0;    // sum reference lengths
+  unsigned global_hyp_len = 0;    // sum hypothesis lengths
+  unsigned global_ref_len = 0;    // sum reference lengths
   // ^^^ global_* for approx_bleu
   vector<score_t> bleu_weights;   // we leave this empty -> 1/N 
   if (!quiet) cerr << setw(26) << "scorer '" << scorer_str << "'" << endl << endl;
@@ -149,10 +150,10 @@ main(int argc, char** argv)
   ogzstream grammar_buf_out;
   grammar_buf_out.open(grammar_buf_fn);
   
-  size_t in_sz = 999999999; // input index, input size
+  unsigned in_sz = 999999999; // input index, input size
   vector<pair<score_t,score_t> > all_scores;
   score_t max_score = 0.;
-  size_t best_it = 0;
+  unsigned best_it = 0;
   float overall_time = 0.;
 
   // output cfg
@@ -178,7 +179,7 @@ main(int argc, char** argv)
   }
 
 
-  for (size_t t = 0; t < T; t++) // T epochs
+  for (unsigned t = 0; t < T; t++) // T epochs
   {
 
   time_t start, end;  
@@ -186,7 +187,7 @@ main(int argc, char** argv)
   igzstream grammar_buf_in;
   if (t > 0) grammar_buf_in.open(grammar_buf_fn);
   score_t score_sum = 0., model_sum = 0.;
-  size_t ii = 0;
+  unsigned ii = 0;
   if (!quiet) cerr << "Iteration #" << t+1 << " of " << T << "." << endl;
   
   while(true)
@@ -279,10 +280,10 @@ main(int argc, char** argv)
     // (local) scoring
     if (t > 0) ref_ids = ref_ids_buf[ii];
     score_t score = 0.;
-    for (size_t i = 0; i < samples->size(); i++) {
+    for (unsigned i = 0; i < samples->size(); i++) {
       NgramCounts counts = make_ngram_counts(ref_ids, (*samples)[i].w, N);
       if (scorer_str == "approx_bleu") {
-        size_t hyp_len = 0;
+        unsigned hyp_len = 0;
         if (i == 0) { // 'context of 1best translations'
           global_counts  += counts;
           global_hyp_len += (*samples)[i].w.size();
diff --git a/dtrain/dtrain.h b/dtrain/dtrain.h
index 9bc5be93..ed75a297 100644
--- a/dtrain/dtrain.h
+++ b/dtrain/dtrain.h
@@ -18,8 +18,8 @@
 #include "ksampler.h"
 #include "pairsampling.h"
 
-#define DTRAIN_DOTS 100                     // when to display a '.'
-#define DTRAIN_TMP_DIR "/var/hadoop/mapred/local"               // put this on a SSD?
+#define DTRAIN_DOTS 100 // when to display a '.'
+#define DTRAIN_TMP_DIR "/tmp"
 #define DTRAIN_GRAMMAR_DELIM "########EOS########"
 
 using namespace std;
@@ -36,20 +36,20 @@ inline ostream& _p(ostream& out)  { return out << setiosflags(ios::showpos); }
 inline ostream& _p2(ostream& out) { return out << setprecision(2); }
 inline ostream& _p5(ostream& out) { return out << setprecision(5); }
 inline ostream& _p9(ostream& out) { return out << setprecision(9); }
-inline void strsplit(string &s, vector<string>& v, char d = '\t', size_t parts = 0) { 
+inline void strsplit(string &s, vector<string>& v, char d = '\t', unsigned parts = 0) { // split s on d, appending the fields to v; parts > 0 limits the number of fields
   stringstream ss(s);
   string t;
-  size_t c = 0;
+  unsigned i = 0; // number of fields appended to v so far
   while(true)
   {
-    if (parts > 0 && i == parts-1) {
+    if (parts > 0 && i == parts-1) { // last allowed field: take the rest of the input unsplit (getline result is not checked here)
       getline(ss, t);
       v.push_back(t);
       break;
     }
     if (!getline(ss, t, d)) break;
     v.push_back(t);
-    c++;
+    i++; // one more delimiter-separated field stored
   }
 }
 
diff --git a/dtrain/kbestget.h b/dtrain/kbestget.h
index 403384de..935998a0 100644
--- a/dtrain/kbestget.h
+++ b/dtrain/kbestget.h
@@ -22,11 +22,11 @@ struct HypSampler : public DecoderObserver
 
 struct KBestGetter : public HypSampler
 {
-  const size_t k_;
+  const unsigned k_;
   const string filter_type_;
   vector<ScoredHyp> s_;
 
-  KBestGetter(const size_t k, const string filter_type) :
+  KBestGetter(const unsigned k, const string filter_type) :
     k_(k), filter_type_(filter_type) {}
 
   virtual void
@@ -51,9 +51,11 @@ struct KBestGetter : public HypSampler
   KBestUnique(const Hypergraph& forest)
   {
     s_.clear();
-    KBest::KBestDerivations<vector<WordID>, ESentenceTraversal, KBest::FilterUnique, prob_t, EdgeProb> kbest(forest, k_);
-    for (size_t i = 0; i < k_; ++i) {
-      const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal, KBest::FilterUnique, prob_t, EdgeProb>::Derivation* d =
+    KBest::KBestDerivations<vector<WordID>, ESentenceTraversal,
+      KBest::FilterUnique, prob_t, EdgeProb> kbest(forest, k_);
+    for (unsigned i = 0; i < k_; ++i) {
+      const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal, KBest::FilterUnique,
+              prob_t, EdgeProb>::Derivation* d =
             kbest.LazyKthBest(forest.nodes_.size() - 1, i);
       if (!d) break;
       ScoredHyp h;
@@ -69,7 +71,7 @@ struct KBestGetter : public HypSampler
   {
     s_.clear();
     KBest::KBestDerivations<vector<WordID>, ESentenceTraversal> kbest(forest, k_);
-    for (size_t i = 0; i < k_; ++i) {
+    for (unsigned i = 0; i < k_; ++i) {
       const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal>::Derivation* d =
             kbest.LazyKthBest(forest.nodes_.size() - 1, i);
       if (!d) break;
diff --git a/dtrain/ksampler.h b/dtrain/ksampler.h
index 08bf1498..17b0ba56 100644
--- a/dtrain/ksampler.h
+++ b/dtrain/ksampler.h
@@ -11,11 +11,11 @@ namespace dtrain
 
 struct KSampler : public HypSampler
 {
-  const size_t k_;
+  const unsigned k_;
   vector<ScoredHyp> s_;
   MT19937* prng_;
 
-  explicit KSampler(const size_t k, MT19937* prng) :
+  explicit KSampler(const unsigned k, MT19937* prng) :
     k_(k), prng_(prng) {}
 
   virtual void
@@ -30,7 +30,7 @@ struct KSampler : public HypSampler
     s_.clear();
     std::vector<HypergraphSampler::Hypothesis> samples;
     HypergraphSampler::sample_hypotheses(forest, k_, prng_, &samples);
-    for (size_t i = 0; i < k_; ++i) {
+    for (unsigned i = 0; i < k_; ++i) {
       ScoredHyp h;
       h.w = samples[i].words;
       h.f = samples[i].fmap;
diff --git a/dtrain/pairsampling.h b/dtrain/pairsampling.h
index 2e4ab155..9546a945 100644
--- a/dtrain/pairsampling.h
+++ b/dtrain/pairsampling.h
@@ -11,8 +11,8 @@ namespace dtrain
 inline void
 sample_all_pairs(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> > &training)
 {
-  for (size_t i = 0; i < s->size()-1; i++) {
-    for (size_t j = i+1; j < s->size(); j++) {
+  for (unsigned i = 0; i+1 < s->size(); i++) { // i+1 < size(), not i < size()-1: the latter wraps around when *s is empty
+    for (unsigned j = i+1; j < s->size(); j++) { // visit every index pair (i, j) with i < j
       pair<ScoredHyp,ScoredHyp> p;
       p.first = (*s)[i];
       p.second = (*s)[j];
@@ -25,8 +25,8 @@ inline void
 sample_rand_pairs(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> > &training,
                   MT19937* prng)
 {
-  for (size_t i = 0; i < s->size()-1; i++) {
-    for (size_t j = i+1; j < s->size(); j++) {
+  for (unsigned i = 0; i+1 < s->size(); i++) { // i+1 < size(), not i < size()-1: the latter wraps around when *s is empty
+    for (unsigned j = i+1; j < s->size(); j++) { // candidate pair (i, j), i < j; used only if the draw below is < .5
       if (prng->next() < .5) {
         pair<ScoredHyp,ScoredHyp> p;
         p.first = (*s)[i];
diff --git a/dtrain/score.cc b/dtrain/score.cc
index c6d3a05f..52644250 100644
--- a/dtrain/score.cc
+++ b/dtrain/score.cc
@@ -5,13 +5,13 @@ namespace dtrain
 
 
 Ngrams
-make_ngrams(vector<WordID>& s, size_t N)
+make_ngrams(vector<WordID>& s, unsigned N)
 {
   Ngrams ngrams;
   vector<WordID> ng;
   for (size_t i = 0; i < s.size(); i++) {
     ng.clear();
-    for (size_t j = i; j < min(i+N, s.size()); j++) {
+    for (unsigned j = i; j < min(i+N, s.size()); j++) {
       ng.push_back(s[j]);
       ngrams[ng]++;
     }
@@ -20,7 +20,7 @@ make_ngrams(vector<WordID>& s, size_t N)
 }
 
 NgramCounts
-make_ngram_counts(vector<WordID> hyp, vector<WordID> ref, size_t N)
+make_ngram_counts(vector<WordID> hyp, vector<WordID> ref, unsigned N)
 {
   Ngrams hyp_ngrams = make_ngrams(hyp, N);
   Ngrams ref_ngrams = make_ngrams(ref, N);
@@ -48,26 +48,22 @@ make_ngram_counts(vector<WordID> hyp, vector<WordID> ref, size_t N)
  * NOTE: 0 if one n in {1..N} has 0 count
  */
 score_t
-brevity_penaly(const size_t hyp_len, const size_t ref_len)
+brevity_penaly(const unsigned hyp_len, const unsigned ref_len) // 1 if hyp is longer than ref, else exp(1 - ref_len/hyp_len)
 {
   if (hyp_len > ref_len) return 1;
-  return exp(1 - (score_t)ref_len/(score_t)hyp_len);
+  return hyp_len == 0 ? 0 : exp(1 - (score_t)ref_len/hyp_len); // empty hyp: 0 without dividing by zero (0/0 used to give NaN)
 }
 score_t
-bleu(NgramCounts& counts, const size_t hyp_len, const size_t ref_len,
-      size_t N, vector<score_t> weights )
+bleu(NgramCounts& counts, const unsigned hyp_len, const unsigned ref_len,
+      unsigned N, vector<score_t> weights ) // N and weights are by-value copies; both may be adjusted locally below
 {
   if (hyp_len == 0 || ref_len == 0) return 0;
   if (ref_len < N) N = ref_len;
-  score_t N_ = (score_t)N;
-  if (weights.empty())
-  {
-    for (size_t i = 0; i < N; i++) weights.push_back(1/N_);
-  }
+  if (weights.empty()) for (unsigned i = 0; i < N; i++) weights.push_back(1./N); // no weights given: uniform 1/N over the (already clamped) N
   score_t sum = 0;
-  for (size_t i = 0; i < N; i++) {
+  for (unsigned i = 0; i < N; i++) { // one term per n-gram order; any order with a zero count makes the score 0
     if (counts.clipped[i] == 0 || counts.sum[i] == 0) return 0;
-    sum += weights[i] * log((score_t)counts.clipped[i] / (score_t)counts.sum[i]);
+    sum += weights[i] * log((score_t)counts.clipped[i] / counts.sum[i]); // weighted log of clipped/sum; the cast forces floating-point division
   }
   return brevity_penaly(hyp_len, ref_len) * exp(sum);
 }
@@ -83,21 +79,16 @@ bleu(NgramCounts& counts, const size_t hyp_len, const size_t ref_len,
  * NOTE: 0 iff no 1gram match
  */
 score_t
-stupid_bleu(NgramCounts& counts, const size_t hyp_len, const size_t ref_len,
-             size_t N, vector<score_t> weights )
+stupid_bleu(NgramCounts& counts, const unsigned hyp_len, const unsigned ref_len,
+             unsigned N, vector<score_t> weights ) // N and weights are by-value copies; both may be adjusted locally below
 {
   if (hyp_len == 0 || ref_len == 0) return 0;
   if (ref_len < N) N = ref_len;
-  score_t N_ = (score_t)N;
-  if (weights.empty())
-  {
-    for (size_t i = 0; i < N; i++) weights.push_back(1/N_);
-  }
-  score_t sum = 0;
-  score_t add = 0;
-  for (size_t i = 0; i < N; i++) {
+  if (weights.empty()) for (unsigned i = 0; i < N; i++) weights.push_back(1./N); // no weights given: uniform 1/N over the (already clamped) N
+  score_t sum = 0, add = 0; // add: smoothing term, 0 for i == 0 and 1 from i == 1 onwards
+  for (unsigned i = 0; i < N; i++) { // unlike bleu(), zero counts do not end the loop early
     if (i == 1) add = 1;
-    sum += weights[i] * log(((score_t)counts.clipped[i] + add) / ((score_t)counts.sum[i] + add));
+    sum += weights[i] * log(((score_t)counts.clipped[i] + add) / ((counts.sum[i] + add))); // add is score_t, so both sides of the division are floating point
   }
   return brevity_penaly(hyp_len, ref_len) * exp(sum);
 }
@@ -112,20 +103,16 @@ stupid_bleu(NgramCounts& counts, const size_t hyp_len, const size_t ref_len,
  * NOTE: max is 0.9375
  */
 score_t
-smooth_bleu(NgramCounts& counts, const size_t hyp_len, const size_t ref_len,
-            const size_t N, vector<score_t> weights )
+smooth_bleu(NgramCounts& counts, const unsigned hyp_len, const unsigned ref_len,
+            const unsigned N, vector<score_t> weights ) // N is const here: it is not clamped to ref_len in this scorer
 {
   if (hyp_len == 0 || ref_len == 0) return 0;
-  score_t N_ = (score_t)N;
-  if (weights.empty())
-  {
-    for (size_t i = 0; i < N; i++) weights.push_back(1/N_);
-  }
+  if (weights.empty()) for (unsigned i = 0; i < N; i++) weights.push_back(1./N); // no weights given: uniform 1/N
   score_t sum = 0;
-  score_t j = 1;
-  for (size_t i = 0; i < N; i++) {
+  unsigned j = 1; // advances only for orders with non-zero counts, so 1 <= j <= N whenever it is used
+  for (unsigned i = 0; i < N; i++) { // orders with a zero count are skipped instead of zeroing the score
     if (counts.clipped[i] == 0 || counts.sum[i] == 0) continue;
-    sum += exp((weights[i] * log((score_t)counts.clipped[i]/(score_t)counts.sum[i]))) / pow(2, N_-j+1);
+    sum += exp((weights[i] * log((score_t)counts.clipped[i]/counts.sum[i]))) / pow(2, N-j+1); // (clipped/sum)^weight scaled by 1/2^(N-j+1)
     j++;
   }
   return brevity_penaly(hyp_len, ref_len) * sum;
@@ -139,11 +126,11 @@ smooth_bleu(NgramCounts& counts, const size_t hyp_len, const size_t ref_len,
  * (Chiang et al. '08)
  */
 score_t
-approx_bleu(NgramCounts& counts, const size_t hyp_len, const size_t ref_len,
-            const size_t N, vector<score_t> weights)
+approx_bleu(NgramCounts& counts, const unsigned hyp_len, const unsigned ref_len,
+            const unsigned N, vector<score_t> weights) // same parameter list as the other scorers; computes BP * 0.9 * bleu(...)
 {
   return brevity_penaly(hyp_len, ref_len) 
-         * 0.9 * bleu(counts, hyp_len, ref_len, N, weights);
+           * 0.9 * bleu(counts, hyp_len, ref_len, N, weights); // NOTE(review): bleu() already multiplies by brevity_penaly(), so the penalty is applied twice - confirm this is intended
 }
 
 
diff --git a/dtrain/score.h b/dtrain/score.h
index bff0b10c..3e5d82a9 100644
--- a/dtrain/score.h
+++ b/dtrain/score.h
@@ -19,17 +19,17 @@ typedef double score_t; // float
 
 struct NgramCounts
 {
-  size_t N_;
-  map<size_t, size_t> clipped;
-  map<size_t, size_t> sum;
+  unsigned N_;
+  map<unsigned, unsigned> clipped;
+  map<unsigned, unsigned> sum;
 
-  NgramCounts(const size_t N) : N_(N) { reset(); } 
+  NgramCounts(const unsigned N) : N_(N) { reset(); } 
 
   void
   operator+=(const NgramCounts& rhs)
   {
     assert(N_ == rhs.N_);
-    for (size_t i = 0; i < N_; i++) {
+    for (unsigned i = 0; i < N_; i++) {
       this->clipped[i] += rhs.clipped.find(i)->second;
       this->sum[i] += rhs.sum.find(i)->second;
     }
@@ -44,7 +44,7 @@ struct NgramCounts
   }
 
   void
-  add(size_t count, size_t ref_count, size_t i)
+  add(unsigned count, unsigned ref_count, unsigned i)
   {
     assert(i < N_);
     if (count > ref_count) {
@@ -59,7 +59,7 @@ struct NgramCounts
   void
   reset()
   {
-    size_t i;
+    unsigned i;
     for (i = 0; i < N_; i++) {
       clipped[i] = 0;
       sum[i] = 0;
@@ -69,26 +69,26 @@ struct NgramCounts
   void
   print()
   {
-    for (size_t i = 0; i < N_; i++) {
+    for (unsigned i = 0; i < N_; i++) {
       cout << i+1 << "grams (clipped):\t" << clipped[i] << endl;
       cout << i+1 << "grams:\t\t\t" << sum[i] << endl;
     }
   }
 };
 
-typedef map<vector<WordID>, size_t> Ngrams;
+typedef map<vector<WordID>, unsigned> Ngrams;
 
-Ngrams make_ngrams(vector<WordID>& s, size_t N);
-NgramCounts make_ngram_counts(vector<WordID> hyp, vector<WordID> ref, size_t N);
+Ngrams make_ngrams(vector<WordID>& s, unsigned N);
+NgramCounts make_ngram_counts(vector<WordID> hyp, vector<WordID> ref, unsigned N);
 
-score_t brevity_penaly(const size_t hyp_len, const size_t ref_len);
-score_t bleu(NgramCounts& counts, const size_t hyp_len, const size_t ref_len, const size_t N,
+score_t brevity_penaly(const unsigned hyp_len, const unsigned ref_len);
+score_t bleu(NgramCounts& counts, const unsigned hyp_len, const unsigned ref_len, const unsigned N,
              vector<score_t> weights = vector<score_t>());
-score_t stupid_bleu(NgramCounts& counts, const size_t hyp_len, const size_t ref_len, size_t N,
+score_t stupid_bleu(NgramCounts& counts, const unsigned hyp_len, const unsigned ref_len, unsigned N,
                     vector<score_t> weights = vector<score_t>());
-score_t smooth_bleu(NgramCounts& counts, const size_t hyp_len, const size_t ref_len, const size_t N,
+score_t smooth_bleu(NgramCounts& counts, const unsigned hyp_len, const unsigned ref_len, const unsigned N,
                     vector<score_t> weights = vector<score_t>());
-score_t approx_bleu(NgramCounts& counts, const size_t hyp_len, const size_t ref_len, const size_t N,
+score_t approx_bleu(NgramCounts& counts, const unsigned hyp_len, const unsigned ref_len, const unsigned N,
                     vector<score_t> weights = vector<score_t>());
 
 
diff --git a/dtrain/test/example/dtrain.ini b/dtrain/test/example/dtrain.ini
index 00ba72f9..fbddb915 100644
--- a/dtrain/test/example/dtrain.ini
+++ b/dtrain/test/example/dtrain.ini
@@ -5,7 +5,7 @@ epochs=1000
 input=test/example/nc-1k.gz
 scorer=stupid_bleu
 output=test/example/weights.gz
-stop_after=100
-sample_from=forest
+stop_after=10
+sample_from=kbest
 pair_sampling=all
 print_weights=Glue WordPenalty LanguageModel LanguageModel_OOV PhraseModel_0 PhraseModel_1 PhraseModel_2 PhraseModel_3 PhraseModel_4 PassThrough
diff --git a/dtrain/test/example/weights.gz b/dtrain/test/example/weights.gz
index 21157427..e2e1ecce 100644
--- a/dtrain/test/example/weights.gz
+++ b/dtrain/test/example/weights.gz