From 529c8f0671ce0b09c2a797278a8f84242c86465d Mon Sep 17 00:00:00 2001
From: Patrick Simianer <p@simianer.de>
Date: Fri, 15 Mar 2013 10:29:13 +0100
Subject: removed hadoop/hstreaming mode
---
training/dtrain/README.md | 28 +----
training/dtrain/dtrain.cc | 121 +------------------
training/dtrain/dtrain.h | 8 +-
training/dtrain/hstreaming/avg.rb | 32 -----
training/dtrain/hstreaming/cdec.ini | 22 ----
training/dtrain/hstreaming/dtrain.ini | 15 ---
training/dtrain/hstreaming/dtrain.sh | 9 --
training/dtrain/hstreaming/hadoop-streaming-job.sh | 30 -----
training/dtrain/hstreaming/lplp.rb | 131 ---------------------
training/dtrain/hstreaming/red-test | 9 --
training/dtrain/lplp.rb | 131 +++++++++++++++++++++
training/dtrain/parallelize.rb | 4 +-
training/dtrain/test/example/cdec.ini | 2 +-
13 files changed, 144 insertions(+), 398 deletions(-)
delete mode 100755 training/dtrain/hstreaming/avg.rb
delete mode 100644 training/dtrain/hstreaming/cdec.ini
delete mode 100644 training/dtrain/hstreaming/dtrain.ini
delete mode 100755 training/dtrain/hstreaming/dtrain.sh
delete mode 100755 training/dtrain/hstreaming/hadoop-streaming-job.sh
delete mode 100755 training/dtrain/hstreaming/lplp.rb
delete mode 100644 training/dtrain/hstreaming/red-test
create mode 100755 training/dtrain/lplp.rb
diff --git a/training/dtrain/README.md b/training/dtrain/README.md
index 7edabbf1..2ab2f232 100644
--- a/training/dtrain/README.md
+++ b/training/dtrain/README.md
@@ -13,36 +13,18 @@ Builds when building cdec, see ../BUILDING .
To build only parts needed for dtrain do
```
autoreconf -ifv
- ./configure [--disable-gtest]
- cd dtrain/; make
+ ./configure
+ cd training/dtrain/; make
```
Running
-------
-To run this on a dev set locally:
-```
- #define DTRAIN_LOCAL
-```
-otherwise remove that line or undef, then recompile. You need a single
-grammar file or input annotated with per-sentence grammars (psg) as you
-would use with cdec. Additionally you need to give dtrain a file with
-references (--refs) when running locally.
-
-The input for use with hadoop streaming looks like this:
-```
- <sid>\t<src>\t<ref>[\t<grammar>]
-```
-To convert a psg to this format you need to replace all "\n"
-by "\t". Make sure there are no tabs in your data.
-
-For an example of local usage (with the 'distributed' format)
-see test/example/ . This expects dtrain to be built without
-DTRAIN_LOCAL.
+See directories under test/ .
Legal
-----
-Copyright (c) 2012 by Patrick Simianer
+Copyright (c) 2012-2013 by Patrick Simianer
-See the file ../LICENSE.txt for the licensing terms that this software is
+See the file LICENSE.txt in the root folder for the licensing terms that this software is
released under.
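
The conversion the removed README passage describes, replacing every "\n" in a per-sentence grammar (psg) block with "\t", can be sketched in a few lines of Ruby; the input file name here is a hypothetical example:

```ruby
#!/usr/bin/env ruby
# Flatten a per-sentence grammar block into the single tab-separated
# line the removed hadoop streaming mode expected ('sentence0.psg' is
# a hypothetical example file; the data itself must not contain tabs).
record = File.read('sentence0.psg')
puts record.strip.gsub("\n", "\t")
```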
diff --git a/training/dtrain/dtrain.cc b/training/dtrain/dtrain.cc
index 53487d34..dfb5b351 100644
--- a/training/dtrain/dtrain.cc
+++ b/training/dtrain/dtrain.cc
@@ -12,9 +12,7 @@ dtrain_init(int argc, char** argv, po::variables_map* cfg)
("decoder_config", po::value(), "configuration file for cdec")
("print_weights", po::value(), "weights to print on each iteration")
("stop_after", po::value()->default_value(0), "stop after X input sentences")
- ("tmp", po::value()->default_value("/tmp"), "temp dir to use")
("keep", po::value()->zero_tokens(), "keep weights files for each iteration")
- ("hstreaming", po::value(), "run in hadoop streaming mode, arg is a task id")
("epochs", po::value()->default_value(10), "# of iterations T (per shard)")
("k", po::value()->default_value(100), "how many translations to sample")
("sample_from", po::value()->default_value("kbest"), "where to sample translations from: 'kbest', 'forest'")
@@ -28,16 +26,14 @@ dtrain_init(int argc, char** argv, po::variables_map* cfg)
("gamma", po::value()->default_value(0.), "gamma for SVM (0 for perceptron)")
("select_weights", po::value()->default_value("last"), "output best, last, avg weights ('VOID' to throw away)")
("rescale", po::value()->zero_tokens(), "rescale weight vector after each input")
- ("l1_reg", po::value()->default_value("none"), "apply l1 regularization as in 'Tsuroka et al' (2010)")
+ ("l1_reg", po::value()->default_value("none"), "apply l1 regularization as in 'Tsuroka et al' (2010) UNTESTED")
("l1_reg_strength", po::value(), "l1 regularization strength")
("fselect", po::value()->default_value(-1), "select top x percent (or by threshold) of features after each epoch NOT IMPLEMENTED") // TODO
("approx_bleu_d", po::value()->default_value(0.9), "discount for approx. BLEU")
("scale_bleu_diff", po::value()->zero_tokens(), "learning rate <- bleu diff of a misranked pair")
("loss_margin", po::value()->default_value(0.), "update if no error in pref pair but model scores this near")
("max_pairs", po::value()->default_value(std::numeric_limits::max()), "max. # of pairs per Sent.")
-#ifdef DTRAIN_LOCAL
("refs,r", po::value(), "references in local mode")
-#endif
("noup", po::value()->zero_tokens(), "do not update weights");
po::options_description cl("Command Line Options");
cl.add_options()
@@ -55,16 +51,6 @@ dtrain_init(int argc, char** argv, po::variables_map* cfg)
cerr << cl << endl;
return false;
}
- if (cfg->count("hstreaming") && (*cfg)["output"].as<string>() != "-") {
- cerr << "When using 'hstreaming' the 'output' param should be '-'." << endl;
- return false;
- }
-#ifdef DTRAIN_LOCAL
- if ((*cfg)["input"].as() == "-") {
- cerr << "Can't use stdin as input with this binary. Recompile without DTRAIN_LOCAL" << endl;
- return false;
- }
-#endif
if ((*cfg)["sample_from"].as() != "kbest"
&& (*cfg)["sample_from"].as() != "forest") {
cerr << "Wrong 'sample_from' param: '" << (*cfg)["sample_from"].as() << "', use 'kbest' or 'forest'." << endl;
@@ -111,17 +97,8 @@ main(int argc, char** argv)
if (cfg.count("verbose")) verbose = true;
bool noup = false;
if (cfg.count("noup")) noup = true;
- bool hstreaming = false;
- string task_id;
- if (cfg.count("hstreaming")) {
- hstreaming = true;
- quiet = true;
- task_id = cfg["hstreaming"].as<string>();
- cerr.precision(17);
- }
bool rescale = false;
if (cfg.count("rescale")) rescale = true;
- HSReporter rep(task_id);
bool keep = false;
if (cfg.count("keep")) keep = true;
@@ -224,16 +201,8 @@ main(int argc, char** argv)
// buffer input for t > 0
vector<string> src_str_buf; // source strings (decoder takes only strings)
vector<vector<WordID> > ref_ids_buf; // references as WordID vecs
- // where temp files go
- string tmp_path = cfg["tmp"].as<string>();
-#ifdef DTRAIN_LOCAL
string refs_fn = cfg["refs"].as<string>();
ReadFile refs(refs_fn);
-#else
- string grammar_buf_fn = gettmpf(tmp_path, "dtrain-grammars");
- ogzstream grammar_buf_out;
- grammar_buf_out.open(grammar_buf_fn.c_str());
-#endif
unsigned in_sz = std::numeric_limits<unsigned>::max(); // input index, input size
vector<pair<score_t, score_t> > all_scores;
@@ -270,9 +239,7 @@ main(int argc, char** argv)
cerr << setw(25) << "max pairs " << max_pairs << endl;
cerr << setw(25) << "cdec cfg " << "'" << cfg["decoder_config"].as() << "'" << endl;
cerr << setw(25) << "input " << "'" << input_fn << "'" << endl;
-#ifdef DTRAIN_LOCAL
cerr << setw(25) << "refs " << "'" << refs_fn << "'" << endl;
-#endif
cerr << setw(25) << "output " << "'" << output_fn << "'" << endl;
if (cfg.count("input_weights"))
cerr << setw(25) << "weights in " << "'" << cfg["input_weights"].as() << "'" << endl;
@@ -285,14 +252,10 @@ main(int argc, char** argv)
for (unsigned t = 0; t < T; t++) // T epochs
{
- if (hstreaming) cerr << "reporter:status:Iteration #" << t+1 << " of " << T << endl;
-
time_t start, end;
time(&start);
-#ifndef DTRAIN_LOCAL
igzstream grammar_buf_in;
if (t > 0) grammar_buf_in.open(grammar_buf_fn.c_str());
-#endif
score_t score_sum = 0.;
score_t model_sum(0);
unsigned ii = 0, rank_errors = 0, margin_violations = 0, npairs = 0, f_count = 0, list_sz = 0;
@@ -340,52 +303,6 @@ main(int argc, char** argv)
// getting input
vector<WordID> ref_ids; // reference as vector<WordID>
-#ifndef DTRAIN_LOCAL
- vector<string> in_split; // input: sid\tsrc\tref\tpsg
- if (t == 0) {
- // handling input
- split_in(in, in_split);
- if (hstreaming && ii == 0) cerr << "reporter:counter:" << task_id << ",First ID," << in_split[0] << endl;
- // getting reference
- vector<string> ref_tok;
- boost::split(ref_tok, in_split[2], boost::is_any_of(" "));
- register_and_convert(ref_tok, ref_ids);
- ref_ids_buf.push_back(ref_ids);
- // process and set grammar
- bool broken_grammar = true; // ignore broken grammars
- for (string::iterator it = in.begin(); it != in.end(); it++) {
- if (!isspace(*it)) {
- broken_grammar = false;
- break;
- }
- }
- if (broken_grammar) {
- cerr << "Broken grammar for " << ii+1 << "! Ignoring this input." << endl;
- continue;
- }
- boost::replace_all(in, "\t", "\n");
- in += "\n";
- grammar_buf_out << in << DTRAIN_GRAMMAR_DELIM << " " << in_split[0] << endl;
- decoder.AddSupplementalGrammarFromString(in);
- src_str_buf.push_back(in_split[1]);
- // decode
- observer->SetRef(ref_ids);
- decoder.Decode(in_split[1], observer);
- } else {
- // get buffered grammar
- string grammar_str;
- while (true) {
- string rule;
- getline(grammar_buf_in, rule);
- if (boost::starts_with(rule, DTRAIN_GRAMMAR_DELIM)) break;
- grammar_str += rule + "\n";
- }
- decoder.AddSupplementalGrammarFromString(grammar_str);
- // decode
- observer->SetRef(ref_ids_buf[ii]);
- decoder.Decode(src_str_buf[ii], observer);
- }
-#else
if (t == 0) {
string r_;
getline(*refs, r_);
@@ -402,7 +319,6 @@ main(int argc, char** argv)
decoder.Decode(in, observer);
else
decoder.Decode(src_str_buf[ii], observer);
-#endif
// get (scored) samples
vector<ScoredHyp>* samples = observer->GetSamples();
@@ -505,11 +421,6 @@ main(int argc, char** argv)
++ii;
- if (hstreaming) {
- rep.update_counter("Seen #"+boost::lexical_cast(t+1), 1u);
- rep.update_counter("Seen", 1u);
- }
-
} // input loop
if (average) w_average += lambdas;
@@ -518,21 +429,8 @@ main(int argc, char** argv)
if (t == 0) {
in_sz = ii; // remember size of input (# lines)
- if (hstreaming) {
- rep.update_counter("|Input|", ii);
- rep.update_gcounter("|Input|", ii);
- rep.update_gcounter("Shards", 1u);
- }
}
-#ifndef DTRAIN_LOCAL
- if (t == 0) {
- grammar_buf_out.close();
- } else {
- grammar_buf_in.close();
- }
-#endif
-
// print some stats
score_t score_avg = score_sum/(score_t)in_sz;
score_t model_avg = model_sum/(score_t)in_sz;
@@ -546,7 +444,7 @@ main(int argc, char** argv)
}
unsigned nonz = 0;
- if (!quiet || hstreaming) nonz = (unsigned)lambdas.num_nonzero();
+ if (!quiet) nonz = (unsigned)lambdas.num_nonzero();
if (!quiet) {
cerr << _p5 << _p << "WEIGHTS" << endl;
@@ -571,16 +469,6 @@ main(int argc, char** argv)
cerr << " avg f count: " << f_count/(float)list_sz << endl;
}
- if (hstreaming) {
- rep.update_counter("Score 1best avg #"+boost::lexical_cast(t+1), (unsigned)(score_avg*DTRAIN_SCALE));
- rep.update_counter("Model 1best avg #"+boost::lexical_cast(t+1), (unsigned)(model_avg*DTRAIN_SCALE));
- rep.update_counter("Pairs avg #"+boost::lexical_cast(t+1), (unsigned)((npairs/(weight_t)in_sz)*DTRAIN_SCALE));
- rep.update_counter("Rank errors avg #"+boost::lexical_cast(t+1), (unsigned)((rank_errors/(weight_t)in_sz)*DTRAIN_SCALE));
- rep.update_counter("Margin violations avg #"+boost::lexical_cast(t+1), (unsigned)((margin_violations/(weight_t)in_sz)*DTRAIN_SCALE));
- rep.update_counter("Non zero feature count #"+boost::lexical_cast(t+1), nonz);
- rep.update_gcounter("Non zero feature count #"+boost::lexical_cast(t+1), nonz);
- }
-
pair<score_t, score_t> remember;
remember.first = score_avg;
remember.second = model_avg;
@@ -611,10 +499,6 @@ main(int argc, char** argv)
if (average) w_average /= (weight_t)T;
-#ifndef DTRAIN_LOCAL
- unlink(grammar_buf_fn.c_str());
-#endif
-
if (!noup) {
if (!quiet) cerr << endl << "Writing weights file to '" << output_fn << "' ..." << endl;
if (select_weights == "last" || average) { // last, average
@@ -651,7 +535,6 @@ main(int argc, char** argv)
}
}
}
- if (output_fn == "-" && hstreaming) cout << "__SHARD_COUNT__\t1" << endl;
if (!quiet) cerr << "done" << endl;
}
diff --git a/training/dtrain/dtrain.h b/training/dtrain/dtrain.h
index 572fd613..f368d810 100644
--- a/training/dtrain/dtrain.h
+++ b/training/dtrain/dtrain.h
@@ -1,14 +1,12 @@
#ifndef _DTRAIN_H_
#define _DTRAIN_H_
-#undef DTRAIN_FASTER_PERCEPTRON // only look at misranked pairs
- // DO NOT USE WITH SVM!
-#define DTRAIN_LOCAL
+#undef DTRAIN_FASTER_PERCEPTRON // only consider actually misranked pairs
+ // DO NOT ENABLE WITH SVM (gamma > 0) OR loss_margin!
+
#define DTRAIN_DOTS 10 // after how many inputs to display a '.'
-#define DTRAIN_GRAMMAR_DELIM "########EOS########"
#define DTRAIN_SCALE 100000
-
#include <iomanip>
#include <climits>
#include <string.h>
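
The rewritten comment warns against enabling DTRAIN_FASTER_PERCEPTRON together with SVM (gamma > 0) or loss_margin: those settings also update on correctly ranked pairs whose model-score difference falls inside the margin, exactly the pairs the flag skips. A toy Ruby sketch with invented numbers:

```ruby
# Each pair: model score difference and whether it is actually misranked.
pairs = [
  { diff:  1.2, misranked: false },  # safely correct, no update either way
  { diff:  0.3, misranked: false },  # correct but inside the margin
  { diff: -0.4, misranked: true  }   # misranked
]
loss_margin = 0.5
only_misranked = pairs.count { |p| p[:misranked] }
margin_based   = pairs.count { |p| p[:misranked] || p[:diff] < loss_margin }
puts "updates, misranked pairs only: #{only_misranked}" # => 1
puts "updates, margin-based:         #{margin_based}"   # => 2
```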
diff --git a/training/dtrain/hstreaming/avg.rb b/training/dtrain/hstreaming/avg.rb
deleted file mode 100755
index 2599c732..00000000
--- a/training/dtrain/hstreaming/avg.rb
+++ /dev/null
@@ -1,32 +0,0 @@
-#!/usr/bin/env ruby
-# first arg may be an int of custom shard count
-
-shard_count_key = "__SHARD_COUNT__"
-
-STDIN.set_encoding 'utf-8'
-STDOUT.set_encoding 'utf-8'
-
-w = {}
-c = {}
-w.default = 0
-c.default = 0
-while line = STDIN.gets
- key, val = line.split /\s/
- w[key] += val.to_f
- c[key] += 1
-end
-
-if ARGV.size == 0
- shard_count = w["__SHARD_COUNT__"]
-else
- shard_count = ARGV[0].to_f
-end
-w.each_key { |k|
- if k == shard_count_key
- next
- else
- puts "#{k}\t#{w[k]/shard_count}"
- #puts "# #{c[k]}"
- end
-}
-
diff --git a/training/dtrain/hstreaming/cdec.ini b/training/dtrain/hstreaming/cdec.ini
deleted file mode 100644
index d4f5cecd..00000000
--- a/training/dtrain/hstreaming/cdec.ini
+++ /dev/null
@@ -1,22 +0,0 @@
-formalism=scfg
-add_pass_through_rules=true
-scfg_max_span_limit=15
-intersection_strategy=cube_pruning
-cubepruning_pop_limit=30
-feature_function=WordPenalty
-feature_function=KLanguageModel nc-wmt11.en.srilm.gz
-#feature_function=ArityPenalty
-#feature_function=CMR2008ReorderingFeatures
-#feature_function=Dwarf
-#feature_function=InputIndicator
-#feature_function=LexNullJump
-#feature_function=NewJump
-#feature_function=NgramFeatures
-#feature_function=NonLatinCount
-#feature_function=OutputIndicator
-#feature_function=RuleIdentityFeatures
-#feature_function=RuleNgramFeatures
-#feature_function=RuleShape
-#feature_function=SourceSpanSizeFeatures
-#feature_function=SourceWordPenalty
-#feature_function=SpanFeatures
diff --git a/training/dtrain/hstreaming/dtrain.ini b/training/dtrain/hstreaming/dtrain.ini
deleted file mode 100644
index a2c219a1..00000000
--- a/training/dtrain/hstreaming/dtrain.ini
+++ /dev/null
@@ -1,15 +0,0 @@
-input=-
-output=-
-decoder_config=cdec.ini
-tmp=/var/hadoop/mapred/local/
-epochs=1
-k=100
-N=4
-learning_rate=0.0001
-gamma=0
-scorer=stupid_bleu
-sample_from=kbest
-filter=uniq
-pair_sampling=XYX
-pair_threshold=0
-select_weights=last
diff --git a/training/dtrain/hstreaming/dtrain.sh b/training/dtrain/hstreaming/dtrain.sh
deleted file mode 100755
index 877ff94c..00000000
--- a/training/dtrain/hstreaming/dtrain.sh
+++ /dev/null
@@ -1,9 +0,0 @@
-#!/bin/bash
-# script to run dtrain with a task id
-
-pushd . &>/dev/null
-cd ..
-ID=$(basename $(pwd)) # attempt_...
-popd &>/dev/null
-./dtrain -c dtrain.ini --hstreaming $ID
-
diff --git a/training/dtrain/hstreaming/hadoop-streaming-job.sh b/training/dtrain/hstreaming/hadoop-streaming-job.sh
deleted file mode 100755
index 92419956..00000000
--- a/training/dtrain/hstreaming/hadoop-streaming-job.sh
+++ /dev/null
@@ -1,30 +0,0 @@
-#!/bin/sh
-
-EXP=a_simple_test
-
-# change these vars to fit your hadoop installation
-HADOOP_HOME=/usr/lib/hadoop-0.20
-JAR=contrib/streaming/hadoop-streaming-0.20.2-cdh3u1.jar
-HSTREAMING="$HADOOP_HOME/bin/hadoop jar $HADOOP_HOME/$JAR"
-
- IN=input_on_hdfs
-OUT=output_weights_on_hdfs
-
-# you can set -reducer to NONE if you want to
-# do feature selection/averaging locally (e.g. to
-# keep weights of all epochs)
-$HSTREAMING \
- -mapper "dtrain.sh" \
- -reducer "ruby lplp.rb l2 select_k 100000" \
- -input $IN \
- -output $OUT \
- -file dtrain.sh \
- -file lplp.rb \
- -file ../dtrain \
- -file dtrain.ini \
- -file cdec.ini \
- -file ../test/example/nc-wmt11.en.srilm.gz \
- -jobconf mapred.reduce.tasks=30 \
- -jobconf mapred.max.map.failures.percent=0 \
- -jobconf mapred.job.name="dtrain $EXP"
-
diff --git a/training/dtrain/hstreaming/lplp.rb b/training/dtrain/hstreaming/lplp.rb
deleted file mode 100755
index f0cd58c5..00000000
--- a/training/dtrain/hstreaming/lplp.rb
+++ /dev/null
@@ -1,131 +0,0 @@
-# lplp.rb
-
-# norms
-def l0(feature_column, n)
- if feature_column.size >= n then return 1 else return 0 end
-end
-
-def l1(feature_column, n=-1)
- return feature_column.map { |i| i.abs }.reduce { |sum,i| sum+i }
-end
-
-def l2(feature_column, n=-1)
- return Math.sqrt feature_column.map { |i| i.abs2 }.reduce { |sum,i| sum+i }
-end
-
-def linfty(feature_column, n=-1)
- return feature_column.map { |i| i.abs }.max
-end
-
-# stats
-def median(feature_column, n)
- return feature_column.concat(0.step(n-feature_column.size-1).map{|i|0}).sort[feature_column.size/2]
-end
-
-def mean(feature_column, n)
- return feature_column.reduce { |sum, i| sum+i } / n
-end
-
-# selection
-def select_k(weights, norm_fun, n, k=10000)
- weights.sort{|a,b| norm_fun.call(b[1], n) <=> norm_fun.call(a[1], n)}.each { |p|
- puts "#{p[0]}\t#{mean(p[1], n)}"
- k -= 1
- if k == 0 then break end
- }
-end
-
-def cut(weights, norm_fun, n, epsilon=0.0001)
- weights.each { |k,v|
- if norm_fun.call(v, n).abs >= epsilon
- puts "#{k}\t#{mean(v, n)}"
- end
- }
-end
-
-# test
-def _test()
- puts
- w = {}
- w["a"] = [1, 2, 3]
- w["b"] = [1, 2]
- w["c"] = [66]
- w["d"] = [10, 20, 30]
- n = 3
- puts w.to_s
- puts
- puts "select_k"
- puts "l0 expect ad"
- select_k(w, method(:l0), n, 2)
- puts "l1 expect cd"
- select_k(w, method(:l1), n, 2)
- puts "l2 expect c"
- select_k(w, method(:l2), n, 1)
- puts
- puts "cut"
- puts "l1 expect cd"
- cut(w, method(:l1), n, 7)
- puts
- puts "median"
- a = [1,2,3,4,5]
- puts a.to_s
- puts median(a, 5)
- puts
- puts "#{median(a, 7)} <- that's because we add missing 0s:"
- puts a.concat(0.step(7-a.size-1).map{|i|0}).to_s
- puts
- puts "mean expect bc"
- w.clear
- w["a"] = [2]
- w["b"] = [2.1]
- w["c"] = [2.2]
- cut(w, method(:mean), 1, 2.05)
- exit
-end
-#_test()
-
-# actually do something
-def usage()
- puts "lplp.rb [n] < "
- puts " l0...: norms for selection"
- puts "select_k: only output top k (according to the norm of their column vector) features"
- puts " cut: output features with weight >= threshold"
- puts " n: if we do not have a shard count use this number for averaging"
- exit
-end
-
-if ARGV.size < 3 then usage end
-norm_fun = method(ARGV[0].to_sym)
-type = ARGV[1]
-x = ARGV[2].to_f
-
-shard_count_key = "__SHARD_COUNT__"
-
-STDIN.set_encoding 'utf-8'
-STDOUT.set_encoding 'utf-8'
-
-w = {}
-shard_count = 0
-while line = STDIN.gets
- key, val = line.split /\s+/
- if key == shard_count_key
- shard_count += 1
- next
- end
- if w.has_key? key
- w[key].push val.to_f
- else
- w[key] = [val.to_f]
- end
-end
-
-if ARGV.size == 4 then shard_count = ARGV[3].to_f end
-
-if type == 'cut'
- cut(w, norm_fun, shard_count, x)
-elsif type == 'select_k'
- select_k(w, norm_fun, shard_count, x)
-else
- puts "oh oh"
-end
-
diff --git a/training/dtrain/hstreaming/red-test b/training/dtrain/hstreaming/red-test
deleted file mode 100644
index 2623d697..00000000
--- a/training/dtrain/hstreaming/red-test
+++ /dev/null
@@ -1,9 +0,0 @@
-a 1
-b 2
-c 3.5
-a 1
-b 2
-c 3.5
-d 1
-e 2
-__SHARD_COUNT__ 2
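
The deleted red-test fixture is reducer input covering two shards; avg.rb summed each feature's values and divided by the __SHARD_COUNT__ total, so features missing from a shard count as zero. A self-contained sketch of that computation on the fixture:

```ruby
#!/usr/bin/env ruby
# avg.rb's averaging applied to the red-test fixture (inlined below).
sums = Hash.new(0)
DATA.each_line { |line|
  key, val = line.split
  sums[key] += val.to_f
}
shards = sums.delete("__SHARD_COUNT__") # 2.0
sums.each { |k, v| puts "#{k}\t#{v / shards}" }
# => a 1.0, b 2.0, c 3.5, d 0.5, e 1.0

__END__
a 1
b 2
c 3.5
a 1
b 2
c 3.5
d 1
e 2
__SHARD_COUNT__ 2
```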
diff --git a/training/dtrain/lplp.rb b/training/dtrain/lplp.rb
new file mode 100755
index 00000000..f0cd58c5
--- /dev/null
+++ b/training/dtrain/lplp.rb
@@ -0,0 +1,131 @@
+# lplp.rb
+
+# norms
+def l0(feature_column, n)
+ if feature_column.size >= n then return 1 else return 0 end
+end
+
+def l1(feature_column, n=-1)
+ return feature_column.map { |i| i.abs }.reduce { |sum,i| sum+i }
+end
+
+def l2(feature_column, n=-1)
+ return Math.sqrt feature_column.map { |i| i.abs2 }.reduce { |sum,i| sum+i }
+end
+
+def linfty(feature_column, n=-1)
+ return feature_column.map { |i| i.abs }.max
+end
+
+# stats
+def median(feature_column, n)
+ return feature_column.concat(0.step(n-feature_column.size-1).map{|i|0}).sort[feature_column.size/2]
+end
+
+def mean(feature_column, n)
+ return feature_column.reduce { |sum, i| sum+i } / n
+end
+
+# selection
+def select_k(weights, norm_fun, n, k=10000)
+ weights.sort{|a,b| norm_fun.call(b[1], n) <=> norm_fun.call(a[1], n)}.each { |p|
+ puts "#{p[0]}\t#{mean(p[1], n)}"
+ k -= 1
+ if k == 0 then break end
+ }
+end
+
+def cut(weights, norm_fun, n, epsilon=0.0001)
+ weights.each { |k,v|
+ if norm_fun.call(v, n).abs >= epsilon
+ puts "#{k}\t#{mean(v, n)}"
+ end
+ }
+end
+
+# test
+def _test()
+ puts
+ w = {}
+ w["a"] = [1, 2, 3]
+ w["b"] = [1, 2]
+ w["c"] = [66]
+ w["d"] = [10, 20, 30]
+ n = 3
+ puts w.to_s
+ puts
+ puts "select_k"
+ puts "l0 expect ad"
+ select_k(w, method(:l0), n, 2)
+ puts "l1 expect cd"
+ select_k(w, method(:l1), n, 2)
+ puts "l2 expect c"
+ select_k(w, method(:l2), n, 1)
+ puts
+ puts "cut"
+ puts "l1 expect cd"
+ cut(w, method(:l1), n, 7)
+ puts
+ puts "median"
+ a = [1,2,3,4,5]
+ puts a.to_s
+ puts median(a, 5)
+ puts
+ puts "#{median(a, 7)} <- that's because we add missing 0s:"
+ puts a.concat(0.step(7-a.size-1).map{|i|0}).to_s
+ puts
+ puts "mean expect bc"
+ w.clear
+ w["a"] = [2]
+ w["b"] = [2.1]
+ w["c"] = [2.2]
+ cut(w, method(:mean), 1, 2.05)
+ exit
+end
+#_test()
+
+# actually do something
+def usage()
+ puts "lplp.rb [n] < "
+ puts " l0...: norms for selection"
+ puts "select_k: only output top k (according to the norm of their column vector) features"
+ puts " cut: output features with weight >= threshold"
+ puts " n: if we do not have a shard count use this number for averaging"
+ exit
+end
+
+if ARGV.size < 3 then usage end
+norm_fun = method(ARGV[0].to_sym)
+type = ARGV[1]
+x = ARGV[2].to_f
+
+shard_count_key = "__SHARD_COUNT__"
+
+STDIN.set_encoding 'utf-8'
+STDOUT.set_encoding 'utf-8'
+
+w = {}
+shard_count = 0
+while line = STDIN.gets
+ key, val = line.split /\s+/
+ if key == shard_count_key
+ shard_count += 1
+ next
+ end
+ if w.has_key? key
+ w[key].push val.to_f
+ else
+ w[key] = [val.to_f]
+ end
+end
+
+if ARGV.size == 4 then shard_count = ARGV[3].to_f end
+
+if type == 'cut'
+ cut(w, norm_fun, shard_count, x)
+elsif type == 'select_k'
+ select_k(w, norm_fun, shard_count, x)
+else
+ puts "oh oh"
+end
+
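
With lplp.rb now directly under training/dtrain/, a hedged usage sketch (run from that directory; feature names and weights are invented): the trailing argument supplies the shard count, so mean() divides each feature's column by 2, and select_k keeps the 3 features with the largest l2 column norm.

```ruby
# Pipe two shards' concatenated weights through the reducer by hand.
input = "LanguageModel\t2.0\n"  \
        "LanguageModel\t1.8\n"  \
        "PhraseModel_0\t0.5\n"  \
        "WordPenalty\t-0.1\n"
IO.popen("ruby lplp.rb l2 select_k 3 2", "r+") { |io|
  io.write input
  io.close_write
  print io.read
}
# => LanguageModel  1.9
#    PhraseModel_0  0.25
#    WordPenalty    -0.05
```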
diff --git a/training/dtrain/parallelize.rb b/training/dtrain/parallelize.rb
index fca9b10d..24e7f49e 100755
--- a/training/dtrain/parallelize.rb
+++ b/training/dtrain/parallelize.rb
@@ -80,7 +80,7 @@ def make_shards(input, refs, num_shards, epoch, rand)
shard_refs = File.new refs_fn, 'w+'
refs_fns << refs_fn
0.upto(shard_sz-1) { |i|
- j = index.pop
+ j = index.pop
shard_in.write in_lines[j]
shard_refs.write refs_lines[j]
}
@@ -125,7 +125,7 @@ end
if use_qsub
qsub_str_start = "qsub -cwd -sync y -b y -j y -o work/out.#{shard}.#{epoch} -N dtrain.#{shard}.#{epoch} \""
qsub_str_end = "\""
- local_end = ''
+ local_end = ''
else
local_end = "&>work/out.#{shard}.#{epoch}"
end
diff --git a/training/dtrain/test/example/cdec.ini b/training/dtrain/test/example/cdec.ini
index 068ebd4d..0215416d 100644
--- a/training/dtrain/test/example/cdec.ini
+++ b/training/dtrain/test/example/cdec.ini
@@ -2,7 +2,7 @@ formalism=scfg
add_pass_through_rules=true
scfg_max_span_limit=15
intersection_strategy=cube_pruning
-cubepruning_pop_limit=30
+cubepruning_pop_limit=200
feature_function=WordPenalty
feature_function=KLanguageModel ./nc-wmt11.en.srilm.gz
# all currently working feature functions for translation:
--