From 529c8f0671ce0b09c2a797278a8f84242c86465d Mon Sep 17 00:00:00 2001
From: Patrick Simianer <p@simianer.de>
Date: Fri, 15 Mar 2013 10:29:13 +0100
Subject: removed hadoop/hstreaming mode
---
training/dtrain/README.md | 28 +----
training/dtrain/dtrain.cc | 121 +------------------
training/dtrain/dtrain.h | 8 +-
training/dtrain/hstreaming/avg.rb | 32 -----
training/dtrain/hstreaming/cdec.ini | 22 ----
training/dtrain/hstreaming/dtrain.ini | 15 ---
training/dtrain/hstreaming/dtrain.sh | 9 --
training/dtrain/hstreaming/hadoop-streaming-job.sh | 30 -----
training/dtrain/hstreaming/lplp.rb | 131 ---------------------
training/dtrain/hstreaming/red-test | 9 --
training/dtrain/lplp.rb | 131 +++++++++++++++++++++
training/dtrain/parallelize.rb | 4 +-
training/dtrain/test/example/cdec.ini | 2 +-
13 files changed, 144 insertions(+), 398 deletions(-)
delete mode 100755 training/dtrain/hstreaming/avg.rb
delete mode 100644 training/dtrain/hstreaming/cdec.ini
delete mode 100644 training/dtrain/hstreaming/dtrain.ini
delete mode 100755 training/dtrain/hstreaming/dtrain.sh
delete mode 100755 training/dtrain/hstreaming/hadoop-streaming-job.sh
delete mode 100755 training/dtrain/hstreaming/lplp.rb
delete mode 100644 training/dtrain/hstreaming/red-test
create mode 100755 training/dtrain/lplp.rb
diff --git a/training/dtrain/README.md b/training/dtrain/README.md
index 7edabbf1..2ab2f232 100644
--- a/training/dtrain/README.md
+++ b/training/dtrain/README.md
@@ -13,36 +13,18 @@ Builds when building cdec, see ../BUILDING .
To build only parts needed for dtrain do
```
autoreconf -ifv
- ./configure [--disable-gtest]
- cd dtrain/; make
+ ./configure
+ cd training/dtrain/; make
```
Running
-------
-To run this on a dev set locally:
-```
- #define DTRAIN_LOCAL
-```
-otherwise remove that line or undef, then recompile. You need a single
-grammar file or input annotated with per-sentence grammars (psg) as you
-would use with cdec. Additionally you need to give dtrain a file with
-references (--refs) when running locally.
-
-The input for use with hadoop streaming looks like this:
-```
- <sid>\t<src>\t<ref>[\t<grammar>]
-```
-To convert a psg to this format you need to replace all "\n"
-by "\t". Make sure there are no tabs in your data.
-
-For an example of local usage (with the 'distributed' format)
-see test/example/ . This expects dtrain to be built without
-DTRAIN_LOCAL.
+See directories under test/ .
Legal
-----
-Copyright (c) 2012 by Patrick Simianer
+Copyright (c) 2012-2013 by Patrick Simianer
-See the file ../LICENSE.txt for the licensing terms that this software is
+See the file LICENSE.txt in the root folder for the licensing terms that this software is
released under.
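
The conversion the removed README passage describes, replacing every "\n" in a per-sentence grammar (psg) block with "\t", can be sketched in a few lines of Ruby; the input file name here is a hypothetical example:

```ruby
#!/usr/bin/env ruby
# Flatten a per-sentence grammar block into the single tab-separated
# line the removed hadoop streaming mode expected ('sentence0.psg' is
# a hypothetical example file; the data itself must not contain tabs).
record = File.read('sentence0.psg')
puts record.strip.gsub("\n", "\t")
```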
diff --git a/training/dtrain/dtrain.cc b/training/dtrain/dtrain.cc
index 53487d34..dfb5b351 100644
--- a/training/dtrain/dtrain.cc
+++ b/training/dtrain/dtrain.cc
@@ -12,9 +12,7 @@ dtrain_init(int argc, char** argv, po::variables_map* cfg)
("decoder_config", po::value(), "configuration file for cdec")
("print_weights", po::value(), "weights to print on each iteration")
("stop_after", po::value()->default_value(0), "stop after X input sentences")
- ("tmp", po::value()->default_value("/tmp"), "temp dir to use")
("keep", po::value()->zero_tokens(), "keep weights files for each iteration")
- ("hstreaming", po::value(), "run in hadoop streaming mode, arg is a task id")
("epochs", po::value()->default_value(10), "# of iterations T (per shard)")
("k", po::value()->default_value(100), "how many translations to sample")
("sample_from", po::value()->default_value("kbest"), "where to sample translations from: 'kbest', 'forest'")
@@ -28,16 +26,14 @@ dtrain_init(int argc, char** argv, po::variables_map* cfg)
("gamma", po::value()->default_value(0.), "gamma for SVM (0 for perceptron)")
("select_weights", po::value()->default_value("last"), "output best, last, avg weights ('VOID' to throw away)")
("rescale", po::value()->zero_tokens(), "rescale weight vector after each input")
- ("l1_reg", po::value()->default_value("none"), "apply l1 regularization as in 'Tsuroka et al' (2010)")
+ ("l1_reg", po::value()->default_value("none"), "apply l1 regularization as in 'Tsuroka et al' (2010) UNTESTED")
("l1_reg_strength", po::value(), "l1 regularization strength")
("fselect", po::value()->default_value(-1), "select top x percent (or by threshold) of features after each epoch NOT IMPLEMENTED") // TODO
("approx_bleu_d", po::value()->default_value(0.9), "discount for approx. BLEU")
("scale_bleu_diff", po::value()->zero_tokens(), "learning rate <- bleu diff of a misranked pair")
("loss_margin", po::value()->default_value(0.), "update if no error in pref pair but model scores this near")
("max_pairs", po::value()->default_value(std::numeric_limits::max()), "max. # of pairs per Sent.")
-#ifdef DTRAIN_LOCAL
("refs,r", po::value(), "references in local mode")
-#endif
("noup", po::value()->zero_tokens(), "do not update weights");
po::options_description cl("Command Line Options");
cl.add_options()
@@ -55,16 +51,6 @@ dtrain_init(int argc, char** argv, po::variables_map* cfg)
cerr << cl << endl;
return false;
}
- if (cfg->count("hstreaming") && (*cfg)["output"].as<string>() != "-") {
- cerr << "When using 'hstreaming' the 'output' param should be '-'." << endl;
- return false;
- }
-#ifdef DTRAIN_LOCAL
- if ((*cfg)["input"].as() == "-") {
- cerr << "Can't use stdin as input with this binary. Recompile without DTRAIN_LOCAL" << endl;
- return false;
- }
-#endif
if ((*cfg)["sample_from"].as() != "kbest"
&& (*cfg)["sample_from"].as() != "forest") {
cerr << "Wrong 'sample_from' param: '" << (*cfg)["sample_from"].as() << "', use 'kbest' or 'forest'." << endl;
@@ -111,17 +97,8 @@ main(int argc, char** argv)
if (cfg.count("verbose")) verbose = true;
bool noup = false;
if (cfg.count("noup")) noup = true;
- bool hstreaming = false;
- string task_id;
- if (cfg.count("hstreaming")) {
- hstreaming = true;
- quiet = true;
- task_id = cfg["hstreaming"].as<string>();
- cerr.precision(17);
- }
bool rescale = false;
if (cfg.count("rescale")) rescale = true;
- HSReporter rep(task_id);
bool keep = false;
if (cfg.count("keep")) keep = true;
@@ -224,16 +201,8 @@ main(int argc, char** argv)
// buffer input for t > 0
vector<string> src_str_buf; // source strings (decoder takes only strings)
vector<vector<WordID> > ref_ids_buf; // references as WordID vecs
- // where temp files go
- string tmp_path = cfg["tmp"].as<string>();
-#ifdef DTRAIN_LOCAL
string refs_fn = cfg["refs"].as<string>();
ReadFile refs(refs_fn);
-#else
- string grammar_buf_fn = gettmpf(tmp_path, "dtrain-grammars");
- ogzstream grammar_buf_out;
- grammar_buf_out.open(grammar_buf_fn.c_str());
-#endif
unsigned in_sz = std::numeric_limits<unsigned>::max(); // input index, input size
vector<pair<score_t, score_t> > all_scores;
@@ -270,9 +239,7 @@ main(int argc, char** argv)
cerr << setw(25) << "max pairs " << max_pairs << endl;
cerr << setw(25) << "cdec cfg " << "'" << cfg["decoder_config"].as() << "'" << endl;
cerr << setw(25) << "input " << "'" << input_fn << "'" << endl;
-#ifdef DTRAIN_LOCAL
cerr << setw(25) << "refs " << "'" << refs_fn << "'" << endl;
-#endif
cerr << setw(25) << "output " << "'" << output_fn << "'" << endl;
if (cfg.count("input_weights"))
cerr << setw(25) << "weights in " << "'" << cfg["input_weights"].as() << "'" << endl;
@@ -285,14 +252,10 @@ main(int argc, char** argv)
for (unsigned t = 0; t < T; t++) // T epochs
{
- if (hstreaming) cerr << "reporter:status:Iteration #" << t+1 << " of " << T << endl;
-
time_t start, end;
time(&start);
-#ifndef DTRAIN_LOCAL
igzstream grammar_buf_in;
if (t > 0) grammar_buf_in.open(grammar_buf_fn.c_str());
-#endif
score_t score_sum = 0.;
score_t model_sum(0);
unsigned ii = 0, rank_errors = 0, margin_violations = 0, npairs = 0, f_count = 0, list_sz = 0;
@@ -340,52 +303,6 @@ main(int argc, char** argv)
// getting input
vector<WordID> ref_ids; // reference as vector<WordID>
-#ifndef DTRAIN_LOCAL
- vector<string> in_split; // input: sid\tsrc\tref\tpsg
- if (t == 0) {
- // handling input
- split_in(in, in_split);
- if (hstreaming && ii == 0) cerr << "reporter:counter:" << task_id << ",First ID," << in_split[0] << endl;
- // getting reference
- vector<string> ref_tok;
- boost::split(ref_tok, in_split[2], boost::is_any_of(" "));
- register_and_convert(ref_tok, ref_ids);
- ref_ids_buf.push_back(ref_ids);
- // process and set grammar
- bool broken_grammar = true; // ignore broken grammars
- for (string::iterator it = in.begin(); it != in.end(); it++) {
- if (!isspace(*it)) {
- broken_grammar = false;
- break;
- }
- }
- if (broken_grammar) {
- cerr << "Broken grammar for " << ii+1 << "! Ignoring this input." << endl;
- continue;
- }
- boost::replace_all(in, "\t", "\n");
- in += "\n";
- grammar_buf_out << in << DTRAIN_GRAMMAR_DELIM << " " << in_split[0] << endl;
- decoder.AddSupplementalGrammarFromString(in);
- src_str_buf.push_back(in_split[1]);
- // decode
- observer->SetRef(ref_ids);
- decoder.Decode(in_split[1], observer);
- } else {
- // get buffered grammar
- string grammar_str;
- while (true) {
- string rule;
- getline(grammar_buf_in, rule);
- if (boost::starts_with(rule, DTRAIN_GRAMMAR_DELIM)) break;
- grammar_str += rule + "\n";
- }
- decoder.AddSupplementalGrammarFromString(grammar_str);
- // decode
- observer->SetRef(ref_ids_buf[ii]);
- decoder.Decode(src_str_buf[ii], observer);
- }
-#else
if (t == 0) {
string r_;
getline(*refs, r_);
@@ -402,7 +319,6 @@ main(int argc, char** argv)
decoder.Decode(in, observer);
else
decoder.Decode(src_str_buf[ii], observer);
-#endif
// get (scored) samples
vector<ScoredHyp>* samples = observer->GetSamples();
@@ -505,11 +421,6 @@ main(int argc, char** argv)
++ii;
- if (hstreaming) {
- rep.update_counter("Seen #"+boost::lexical_cast(t+1), 1u);
- rep.update_counter("Seen", 1u);
- }
-
} // input loop
if (average) w_average += lambdas;
@@ -518,21 +429,8 @@ main(int argc, char** argv)
if (t == 0) {
in_sz = ii; // remember size of input (# lines)
- if (hstreaming) {
- rep.update_counter("|Input|", ii);
- rep.update_gcounter("|Input|", ii);
- rep.update_gcounter("Shards", 1u);
- }
}
-#ifndef DTRAIN_LOCAL
- if (t == 0) {
- grammar_buf_out.close();
- } else {
- grammar_buf_in.close();
- }
-#endif
-
// print some stats
score_t score_avg = score_sum/(score_t)in_sz;
score_t model_avg = model_sum/(score_t)in_sz;
@@ -546,7 +444,7 @@ main(int argc, char** argv)
}
unsigned nonz = 0;
- if (!quiet || hstreaming) nonz = (unsigned)lambdas.num_nonzero();
+ if (!quiet) nonz = (unsigned)lambdas.num_nonzero();
if (!quiet) {
cerr << _p5 << _p << "WEIGHTS" << endl;
@@ -571,16 +469,6 @@ main(int argc, char** argv)
cerr << " avg f count: " << f_count/(float)list_sz << endl;
}
- if (hstreaming) {
- rep.update_counter("Score 1best avg #"+boost::lexical_cast(t+1), (unsigned)(score_avg*DTRAIN_SCALE));
- rep.update_counter("Model 1best avg #"+boost::lexical_cast(t+1), (unsigned)(model_avg*DTRAIN_SCALE));
- rep.update_counter("Pairs avg #"+boost::lexical_cast(t+1), (unsigned)((npairs/(weight_t)in_sz)*DTRAIN_SCALE));
- rep.update_counter("Rank errors avg #"+boost::lexical_cast(t+1), (unsigned)((rank_errors/(weight_t)in_sz)*DTRAIN_SCALE));
- rep.update_counter("Margin violations avg #"+boost::lexical_cast(t+1), (unsigned)((margin_violations/(weight_t)in_sz)*DTRAIN_SCALE));
- rep.update_counter("Non zero feature count #"+boost::lexical_cast(t+1), nonz);
- rep.update_gcounter("Non zero feature count #"+boost::lexical_cast(t+1), nonz);
- }
-
pair<score_t, score_t> remember;
remember.first = score_avg;
remember.second = model_avg;
@@ -611,10 +499,6 @@ main(int argc, char** argv)
if (average) w_average /= (weight_t)T;
-#ifndef DTRAIN_LOCAL
- unlink(grammar_buf_fn.c_str());
-#endif
-
if (!noup) {
if (!quiet) cerr << endl << "Writing weights file to '" << output_fn << "' ..." << endl;
if (select_weights == "last" || average) { // last, average
@@ -651,7 +535,6 @@ main(int argc, char** argv)
}
}
}
- if (output_fn == "-" && hstreaming) cout << "__SHARD_COUNT__\t1" << endl;
if (!quiet) cerr << "done" << endl;
}
diff --git a/training/dtrain/dtrain.h b/training/dtrain/dtrain.h
index 572fd613..f368d810 100644
--- a/training/dtrain/dtrain.h
+++ b/training/dtrain/dtrain.h
@@ -1,14 +1,12 @@
#ifndef _DTRAIN_H_
#define _DTRAIN_H_
-#undef DTRAIN_FASTER_PERCEPTRON // only look at misranked pairs
- // DO NOT USE WITH SVM!
-#define DTRAIN_LOCAL
+#undef DTRAIN_FASTER_PERCEPTRON // only consider actually misranked pairs
+ // DO NOT ENABLE WITH SVM (gamma > 0) OR loss_margin!
+
#define DTRAIN_DOTS 10 // after how many inputs to display a '.'
-#define DTRAIN_GRAMMAR_DELIM "########EOS########"
#define DTRAIN_SCALE 100000
-
#include <iomanip>
#include <climits>
#include <string.h>
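
The rewritten comment warns against enabling DTRAIN_FASTER_PERCEPTRON together with SVM (gamma > 0) or loss_margin: those settings also update on correctly ranked pairs whose model-score difference falls inside the margin, exactly the pairs the flag skips. A toy Ruby sketch with invented numbers:

```ruby
# Each pair: model score difference and whether it is actually misranked.
pairs = [
  { diff:  1.2, misranked: false },  # safely correct, no update either way
  { diff:  0.3, misranked: false },  # correct but inside the margin
  { diff: -0.4, misranked: true  }   # misranked
]
loss_margin = 0.5
only_misranked = pairs.count { |p| p[:misranked] }
margin_based   = pairs.count { |p| p[:misranked] || p[:diff] < loss_margin }
puts "updates, misranked pairs only: #{only_misranked}" # => 1
puts "updates, margin-based:         #{margin_based}"   # => 2
```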
diff --git a/training/dtrain/hstreaming/avg.rb b/training/dtrain/hstreaming/avg.rb
deleted file mode 100755
index 2599c732..00000000
--- a/training/dtrain/hstreaming/avg.rb
+++ /dev/null
@@ -1,32 +0,0 @@
-#!/usr/bin/env ruby
-# first arg may be an int of custom shard count
-
-shard_count_key = "__SHARD_COUNT__"
-
-STDIN.set_encoding 'utf-8'
-STDOUT.set_encoding 'utf-8'
-
-w = {}
-c = {}
-w.default = 0
-c.default = 0
-while line = STDIN.gets
- key, val = line.split /\s/
- w[key] += val.to_f
- c[key] += 1
-end
-
-if ARGV.size == 0
- shard_count = w["__SHARD_COUNT__"]
-else
- shard_count = ARGV[0].to_f
-end
-w.each_key { |k|
- if k == shard_count_key
- next
- else
- puts "#{k}\t#{w[k]/shard_count}"
- #puts "# #{c[k]}"
- end
-}
-
diff --git a/training/dtrain/hstreaming/cdec.ini b/training/dtrain/hstreaming/cdec.ini
deleted file mode 100644
index d4f5cecd..00000000
--- a/training/dtrain/hstreaming/cdec.ini
+++ /dev/null
@@ -1,22 +0,0 @@
-formalism=scfg
-add_pass_through_rules=true
-scfg_max_span_limit=15
-intersection_strategy=cube_pruning
-cubepruning_pop_limit=30
-feature_function=WordPenalty
-feature_function=KLanguageModel nc-wmt11.en.srilm.gz
-#feature_function=ArityPenalty
-#feature_function=CMR2008ReorderingFeatures
-#feature_function=Dwarf
-#feature_function=InputIndicator
-#feature_function=LexNullJump
-#feature_function=NewJump
-#feature_function=NgramFeatures
-#feature_function=NonLatinCount
-#feature_function=OutputIndicator
-#feature_function=RuleIdentityFeatures
-#feature_function=RuleNgramFeatures
-#feature_function=RuleShape
-#feature_function=SourceSpanSizeFeatures
-#feature_function=SourceWordPenalty
-#feature_function=SpanFeatures
diff --git a/training/dtrain/hstreaming/dtrain.ini b/training/dtrain/hstreaming/dtrain.ini
deleted file mode 100644
index a2c219a1..00000000
--- a/training/dtrain/hstreaming/dtrain.ini
+++ /dev/null
@@ -1,15 +0,0 @@
-input=-
-output=-
-decoder_config=cdec.ini
-tmp=/var/hadoop/mapred/local/
-epochs=1
-k=100
-N=4
-learning_rate=0.0001
-gamma=0
-scorer=stupid_bleu
-sample_from=kbest
-filter=uniq
-pair_sampling=XYX
-pair_threshold=0
-select_weights=last
diff --git a/training/dtrain/hstreaming/dtrain.sh b/training/dtrain/hstreaming/dtrain.sh
deleted file mode 100755
index 877ff94c..00000000
--- a/training/dtrain/hstreaming/dtrain.sh
+++ /dev/null
@@ -1,9 +0,0 @@
-#!/bin/bash
-# script to run dtrain with a task id
-
-pushd . &>/dev/null
-cd ..
-ID=$(basename $(pwd)) # attempt_...
-popd &>/dev/null
-./dtrain -c dtrain.ini --hstreaming $ID
-
diff --git a/training/dtrain/hstreaming/hadoop-streaming-job.sh b/training/dtrain/hstreaming/hadoop-streaming-job.sh
deleted file mode 100755
index 92419956..00000000
--- a/training/dtrain/hstreaming/hadoop-streaming-job.sh
+++ /dev/null
@@ -1,30 +0,0 @@
-#!/bin/sh
-
-EXP=a_simple_test
-
-# change these vars to fit your hadoop installation
-HADOOP_HOME=/usr/lib/hadoop-0.20
-JAR=contrib/streaming/hadoop-streaming-0.20.2-cdh3u1.jar
-HSTREAMING="$HADOOP_HOME/bin/hadoop jar $HADOOP_HOME/$JAR"
-
- IN=input_on_hdfs
-OUT=output_weights_on_hdfs
-
-# you can set -reducer to NONE if you want to
-# do feature selection/averaging locally (e.g. to
-# keep weights of all epochs)
-$HSTREAMING \
- -mapper "dtrain.sh" \
- -reducer "ruby lplp.rb l2 select_k 100000" \
- -input $IN \
- -output $OUT \
- -file dtrain.sh \
- -file lplp.rb \
- -file ../dtrain \
- -file dtrain.ini \
- -file cdec.ini \
- -file ../test/example/nc-wmt11.en.srilm.gz \
- -jobconf mapred.reduce.tasks=30 \
- -jobconf mapred.max.map.failures.percent=0 \
- -jobconf mapred.job.name="dtrain $EXP"
-
diff --git a/training/dtrain/hstreaming/lplp.rb b/training/dtrain/hstreaming/lplp.rb
deleted file mode 100755
index f0cd58c5..00000000
--- a/training/dtrain/hstreaming/lplp.rb
+++ /dev/null
@@ -1,131 +0,0 @@
-# lplp.rb
-
-# norms
-def l0(feature_column, n)
- if feature_column.size >= n then return 1 else return 0 end
-end
-
-def l1(feature_column, n=-1)
- return feature_column.map { |i| i.abs }.reduce { |sum,i| sum+i }
-end
-
-def l2(feature_column, n=-1)
- return Math.sqrt feature_column.map { |i| i.abs2 }.reduce { |sum,i| sum+i }
-end
-
-def linfty(feature_column, n=-1)
- return feature_column.map { |i| i.abs }.max
-end
-
-# stats
-def median(feature_column, n)
- return feature_column.concat(0.step(n-feature_column.size-1).map{|i|0}).sort[feature_column.size/2]
-end
-
-def mean(feature_column, n)
- return feature_column.reduce { |sum, i| sum+i } / n
-end
-
-# selection
-def select_k(weights, norm_fun, n, k=10000)
- weights.sort{|a,b| norm_fun.call(b[1], n) <=> norm_fun.call(a[1], n)}.each { |p|
- puts "#{p[0]}\t#{mean(p[1], n)}"
- k -= 1
- if k == 0 then break end
- }
-end
-
-def cut(weights, norm_fun, n, epsilon=0.0001)
- weights.each { |k,v|
- if norm_fun.call(v, n).abs >= epsilon
- puts "#{k}\t#{mean(v, n)}"
- end
- }
-end
-
-# test
-def _test()
- puts
- w = {}
- w["a"] = [1, 2, 3]
- w["b"] = [1, 2]
- w["c"] = [66]
- w["d"] = [10, 20, 30]
- n = 3
- puts w.to_s
- puts
- puts "select_k"
- puts "l0 expect ad"
- select_k(w, method(:l0), n, 2)
- puts "l1 expect cd"
- select_k(w, method(:l1), n, 2)
- puts "l2 expect c"
- select_k(w, method(:l2), n, 1)
- puts
- puts "cut"
- puts "l1 expect cd"
- cut(w, method(:l1), n, 7)
- puts
- puts "median"
- a = [1,2,3,4,5]
- puts a.to_s
- puts median(a, 5)
- puts
- puts "#{median(a, 7)} <- that's because we add missing 0s:"
- puts a.concat(0.step(7-a.size-1).map{|i|0}).to_s
- puts
- puts "mean expect bc"
- w.clear
- w["a"] = [2]
- w["b"] = [2.1]
- w["c"] = [2.2]
- cut(w, method(:mean), 1, 2.05)
- exit
-end
-#_test()
-
-# actually do something
-def usage()
- puts "lplp.rb [n] < "
- puts " l0...: norms for selection"
- puts "select_k: only output top k (according to the norm of their column vector) features"
- puts " cut: output features with weight >= threshold"
- puts " n: if we do not have a shard count use this number for averaging"
- exit
-end
-
-if ARGV.size < 3 then usage end
-norm_fun = method(ARGV[0].to_sym)
-type = ARGV[1]
-x = ARGV[2].to_f
-
-shard_count_key = "__SHARD_COUNT__"
-
-STDIN.set_encoding 'utf-8'
-STDOUT.set_encoding 'utf-8'
-
-w = {}
-shard_count = 0
-while line = STDIN.gets
- key, val = line.split /\s+/
- if key == shard_count_key
- shard_count += 1
- next
- end
- if w.has_key? key
- w[key].push val.to_f
- else
- w[key] = [val.to_f]
- end
-end
-
-if ARGV.size == 4 then shard_count = ARGV[3].to_f end
-
-if type == 'cut'
- cut(w, norm_fun, shard_count, x)
-elsif type == 'select_k'
- select_k(w, norm_fun, shard_count, x)
-else
- puts "oh oh"
-end
-
diff --git a/training/dtrain/hstreaming/red-test b/training/dtrain/hstreaming/red-test
deleted file mode 100644
index 2623d697..00000000
--- a/training/dtrain/hstreaming/red-test
+++ /dev/null
@@ -1,9 +0,0 @@
-a 1
-b 2
-c 3.5
-a 1
-b 2
-c 3.5
-d 1
-e 2
-__SHARD_COUNT__ 2
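
The deleted red-test fixture is reducer input covering two shards; avg.rb summed each feature's values and divided by the __SHARD_COUNT__ total, so features missing from a shard count as zero. A self-contained sketch of that computation on the fixture:

```ruby
#!/usr/bin/env ruby
# avg.rb's averaging applied to the red-test fixture (inlined below).
sums = Hash.new(0)
DATA.each_line { |line|
  key, val = line.split
  sums[key] += val.to_f
}
shards = sums.delete("__SHARD_COUNT__") # 2.0
sums.each { |k, v| puts "#{k}\t#{v / shards}" }
# => a 1.0, b 2.0, c 3.5, d 0.5, e 1.0

__END__
a 1
b 2
c 3.5
a 1
b 2
c 3.5
d 1
e 2
__SHARD_COUNT__ 2
```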
diff --git a/training/dtrain/lplp.rb b/training/dtrain/lplp.rb
new file mode 100755
index 00000000..f0cd58c5
--- /dev/null
+++ b/training/dtrain/lplp.rb
@@ -0,0 +1,131 @@
+# lplp.rb
+
+# norms
+def l0(feature_column, n)
+ if feature_column.size >= n then return 1 else return 0 end
+end
+
+def l1(feature_column, n=-1)
+ return feature_column.map { |i| i.abs }.reduce { |sum,i| sum+i }
+end
+
+def l2(feature_column, n=-1)
+ return Math.sqrt feature_column.map { |i| i.abs2 }.reduce { |sum,i| sum+i }
+end
+
+def linfty(feature_column, n=-1)
+ return feature_column.map { |i| i.abs }.max
+end
+
+# stats
+def median(feature_column, n)
+ return feature_column.concat(0.step(n-feature_column.size-1).map{|i|0}).sort[feature_column.size/2]
+end
+
+def mean(feature_column, n)
+ return feature_column.reduce { |sum, i| sum+i } / n
+end
+
+# selection
+def select_k(weights, norm_fun, n, k=10000)
+ weights.sort{|a,b| norm_fun.call(b[1], n) <=> norm_fun.call(a[1], n)}.each { |p|
+ puts "#{p[0]}\t#{mean(p[1], n)}"
+ k -= 1
+ if k == 0 then break end
+ }
+end
+
+def cut(weights, norm_fun, n, epsilon=0.0001)
+ weights.each { |k,v|
+ if norm_fun.call(v, n).abs >= epsilon
+ puts "#{k}\t#{mean(v, n)}"
+ end
+ }
+end
+
+# test
+def _test()
+ puts
+ w = {}
+ w["a"] = [1, 2, 3]
+ w["b"] = [1, 2]
+ w["c"] = [66]
+ w["d"] = [10, 20, 30]
+ n = 3
+ puts w.to_s
+ puts
+ puts "select_k"
+ puts "l0 expect ad"
+ select_k(w, method(:l0), n, 2)
+ puts "l1 expect cd"
+ select_k(w, method(:l1), n, 2)
+ puts "l2 expect c"
+ select_k(w, method(:l2), n, 1)
+ puts
+ puts "cut"
+ puts "l1 expect cd"
+ cut(w, method(:l1), n, 7)
+ puts
+ puts "median"
+ a = [1,2,3,4,5]
+ puts a.to_s
+ puts median(a, 5)
+ puts
+ puts "#{median(a, 7)} <- that's because we add missing 0s:"
+ puts a.concat(0.step(7-a.size-1).map{|i|0}).to_s
+ puts
+ puts "mean expect bc"
+ w.clear
+ w["a"] = [2]
+ w["b"] = [2.1]
+ w["c"] = [2.2]
+ cut(w, method(:mean), 1, 2.05)
+ exit
+end
+#_test()
+
+# actually do something
+def usage()
+ puts "lplp.rb [n] < "
+ puts " l0...: norms for selection"
+ puts "select_k: only output top k (according to the norm of their column vector) features"
+ puts " cut: output features with weight >= threshold"
+ puts " n: if we do not have a shard count use this number for averaging"
+ exit
+end
+
+if ARGV.size < 3 then usage end
+norm_fun = method(ARGV[0].to_sym)
+type = ARGV[1]
+x = ARGV[2].to_f
+
+shard_count_key = "__SHARD_COUNT__"
+
+STDIN.set_encoding 'utf-8'
+STDOUT.set_encoding 'utf-8'
+
+w = {}
+shard_count = 0
+while line = STDIN.gets
+ key, val = line.split /\s+/
+ if key == shard_count_key
+ shard_count += 1
+ next
+ end
+ if w.has_key? key
+ w[key].push val.to_f
+ else
+ w[key] = [val.to_f]
+ end
+end
+
+if ARGV.size == 4 then shard_count = ARGV[3].to_f end
+
+if type == 'cut'
+ cut(w, norm_fun, shard_count, x)
+elsif type == 'select_k'
+ select_k(w, norm_fun, shard_count, x)
+else
+ puts "oh oh"
+end
+
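
With lplp.rb now directly under training/dtrain/, a hedged usage sketch (run from that directory; feature names and weights are invented): the trailing argument supplies the shard count, so mean() divides each feature's column by 2, and select_k keeps the 3 features with the largest l2 column norm.

```ruby
# Pipe two shards' concatenated weights through the reducer by hand.
input = "LanguageModel\t2.0\n"  \
        "LanguageModel\t1.8\n"  \
        "PhraseModel_0\t0.5\n"  \
        "WordPenalty\t-0.1\n"
IO.popen("ruby lplp.rb l2 select_k 3 2", "r+") { |io|
  io.write input
  io.close_write
  print io.read
}
# => LanguageModel  1.9
#    PhraseModel_0  0.25
#    WordPenalty    -0.05
```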
diff --git a/training/dtrain/parallelize.rb b/training/dtrain/parallelize.rb
index fca9b10d..24e7f49e 100755
--- a/training/dtrain/parallelize.rb
+++ b/training/dtrain/parallelize.rb
@@ -80,7 +80,7 @@ def make_shards(input, refs, num_shards, epoch, rand)
shard_refs = File.new refs_fn, 'w+'
refs_fns << refs_fn
0.upto(shard_sz-1) { |i|
- j = index.pop
+ j = index.pop
shard_in.write in_lines[j]
shard_refs.write refs_lines[j]
}
@@ -125,7 +125,7 @@ end
if use_qsub
qsub_str_start = "qsub -cwd -sync y -b y -j y -o work/out.#{shard}.#{epoch} -N dtrain.#{shard}.#{epoch} \""
qsub_str_end = "\""
- local_end = ''
+ local_end = ''
else
local_end = "&>work/out.#{shard}.#{epoch}"
end
diff --git a/training/dtrain/test/example/cdec.ini b/training/dtrain/test/example/cdec.ini
index 068ebd4d..0215416d 100644
--- a/training/dtrain/test/example/cdec.ini
+++ b/training/dtrain/test/example/cdec.ini
@@ -2,7 +2,7 @@ formalism=scfg
add_pass_through_rules=true
scfg_max_span_limit=15
intersection_strategy=cube_pruning
-cubepruning_pop_limit=30
+cubepruning_pop_limit=200
feature_function=WordPenalty
feature_function=KLanguageModel ./nc-wmt11.en.srilm.gz
# all currently working feature functions for translation:
--