summaryrefslogtreecommitdiff
path: root/training/dtrain
diff options
context:
space:
mode:
authorPatrick Simianer <p@simianer.de>2013-03-15 10:29:13 +0100
committerPatrick Simianer <p@simianer.de>2013-03-15 10:29:13 +0100
commit529c8f0671ce0b09c2a797278a8f84242c86465d (patch)
tree35e44d37ab45bd53749afb1ea93c8693055cfb4f /training/dtrain
parent72b07dfc1534862aea06c102b4382513183ce253 (diff)
removed hadoop/hstreaming mode
Diffstat (limited to 'training/dtrain')
-rw-r--r--training/dtrain/README.md28
-rw-r--r--training/dtrain/dtrain.cc121
-rw-r--r--training/dtrain/dtrain.h8
-rwxr-xr-xtraining/dtrain/hstreaming/avg.rb32
-rw-r--r--training/dtrain/hstreaming/cdec.ini22
-rw-r--r--training/dtrain/hstreaming/dtrain.ini15
-rwxr-xr-xtraining/dtrain/hstreaming/dtrain.sh9
-rwxr-xr-xtraining/dtrain/hstreaming/hadoop-streaming-job.sh30
-rw-r--r--training/dtrain/hstreaming/red-test9
-rwxr-xr-xtraining/dtrain/lplp.rb (renamed from training/dtrain/hstreaming/lplp.rb)0
-rwxr-xr-xtraining/dtrain/parallelize.rb4
-rw-r--r--training/dtrain/test/example/cdec.ini2
12 files changed, 13 insertions, 267 deletions
diff --git a/training/dtrain/README.md b/training/dtrain/README.md
index 7edabbf1..2ab2f232 100644
--- a/training/dtrain/README.md
+++ b/training/dtrain/README.md
@@ -13,36 +13,18 @@ Builds when building cdec, see ../BUILDING .
To build only parts needed for dtrain do
```
autoreconf -ifv
- ./configure [--disable-gtest]
- cd dtrain/; make
+ ./configure
+ cd training/dtrain/; make
```
Running
-------
-To run this on a dev set locally:
-```
- #define DTRAIN_LOCAL
-```
-otherwise remove that line or undef, then recompile. You need a single
-grammar file or input annotated with per-sentence grammars (psg) as you
-would use with cdec. Additionally you need to give dtrain a file with
-references (--refs) when running locally.
-
-The input for use with hadoop streaming looks like this:
-```
- <sid>\t<source>\t<ref>\t<grammar rules separated by \t>
-```
-To convert a psg to this format you need to replace all "\n"
-by "\t". Make sure there are no tabs in your data.
-
-For an example of local usage (with the 'distributed' format)
-the see test/example/ . This expects dtrain to be built without
-DTRAIN_LOCAL.
+See directories under test/ .
Legal
-----
-Copyright (c) 2012 by Patrick Simianer <p@simianer.de>
+Copyright (c) 2012-2013 by Patrick Simianer <p@simianer.de>
-See the file ../LICENSE.txt for the licensing terms that this software is
+See the file LICENSE.txt in the root folder for the licensing terms that this software is
released under.
diff --git a/training/dtrain/dtrain.cc b/training/dtrain/dtrain.cc
index 53487d34..dfb5b351 100644
--- a/training/dtrain/dtrain.cc
+++ b/training/dtrain/dtrain.cc
@@ -12,9 +12,7 @@ dtrain_init(int argc, char** argv, po::variables_map* cfg)
("decoder_config", po::value<string>(), "configuration file for cdec")
("print_weights", po::value<string>(), "weights to print on each iteration")
("stop_after", po::value<unsigned>()->default_value(0), "stop after X input sentences")
- ("tmp", po::value<string>()->default_value("/tmp"), "temp dir to use")
("keep", po::value<bool>()->zero_tokens(), "keep weights files for each iteration")
- ("hstreaming", po::value<string>(), "run in hadoop streaming mode, arg is a task id")
("epochs", po::value<unsigned>()->default_value(10), "# of iterations T (per shard)")
("k", po::value<unsigned>()->default_value(100), "how many translations to sample")
("sample_from", po::value<string>()->default_value("kbest"), "where to sample translations from: 'kbest', 'forest'")
@@ -28,16 +26,14 @@ dtrain_init(int argc, char** argv, po::variables_map* cfg)
("gamma", po::value<weight_t>()->default_value(0.), "gamma for SVM (0 for perceptron)")
("select_weights", po::value<string>()->default_value("last"), "output best, last, avg weights ('VOID' to throw away)")
("rescale", po::value<bool>()->zero_tokens(), "rescale weight vector after each input")
- ("l1_reg", po::value<string>()->default_value("none"), "apply l1 regularization as in 'Tsuroka et al' (2010)")
+ ("l1_reg", po::value<string>()->default_value("none"), "apply l1 regularization as in 'Tsuroka et al' (2010) UNTESTED")
("l1_reg_strength", po::value<weight_t>(), "l1 regularization strength")
("fselect", po::value<weight_t>()->default_value(-1), "select top x percent (or by threshold) of features after each epoch NOT IMPLEMENTED") // TODO
("approx_bleu_d", po::value<score_t>()->default_value(0.9), "discount for approx. BLEU")
("scale_bleu_diff", po::value<bool>()->zero_tokens(), "learning rate <- bleu diff of a misranked pair")
("loss_margin", po::value<weight_t>()->default_value(0.), "update if no error in pref pair but model scores this near")
("max_pairs", po::value<unsigned>()->default_value(std::numeric_limits<unsigned>::max()), "max. # of pairs per Sent.")
-#ifdef DTRAIN_LOCAL
("refs,r", po::value<string>(), "references in local mode")
-#endif
("noup", po::value<bool>()->zero_tokens(), "do not update weights");
po::options_description cl("Command Line Options");
cl.add_options()
@@ -55,16 +51,6 @@ dtrain_init(int argc, char** argv, po::variables_map* cfg)
cerr << cl << endl;
return false;
}
- if (cfg->count("hstreaming") && (*cfg)["output"].as<string>() != "-") {
- cerr << "When using 'hstreaming' the 'output' param should be '-'." << endl;
- return false;
- }
-#ifdef DTRAIN_LOCAL
- if ((*cfg)["input"].as<string>() == "-") {
- cerr << "Can't use stdin as input with this binary. Recompile without DTRAIN_LOCAL" << endl;
- return false;
- }
-#endif
if ((*cfg)["sample_from"].as<string>() != "kbest"
&& (*cfg)["sample_from"].as<string>() != "forest") {
cerr << "Wrong 'sample_from' param: '" << (*cfg)["sample_from"].as<string>() << "', use 'kbest' or 'forest'." << endl;
@@ -111,17 +97,8 @@ main(int argc, char** argv)
if (cfg.count("verbose")) verbose = true;
bool noup = false;
if (cfg.count("noup")) noup = true;
- bool hstreaming = false;
- string task_id;
- if (cfg.count("hstreaming")) {
- hstreaming = true;
- quiet = true;
- task_id = cfg["hstreaming"].as<string>();
- cerr.precision(17);
- }
bool rescale = false;
if (cfg.count("rescale")) rescale = true;
- HSReporter rep(task_id);
bool keep = false;
if (cfg.count("keep")) keep = true;
@@ -224,16 +201,8 @@ main(int argc, char** argv)
// buffer input for t > 0
vector<string> src_str_buf; // source strings (decoder takes only strings)
vector<vector<WordID> > ref_ids_buf; // references as WordID vecs
- // where temp files go
- string tmp_path = cfg["tmp"].as<string>();
-#ifdef DTRAIN_LOCAL
string refs_fn = cfg["refs"].as<string>();
ReadFile refs(refs_fn);
-#else
- string grammar_buf_fn = gettmpf(tmp_path, "dtrain-grammars");
- ogzstream grammar_buf_out;
- grammar_buf_out.open(grammar_buf_fn.c_str());
-#endif
unsigned in_sz = std::numeric_limits<unsigned>::max(); // input index, input size
vector<pair<score_t, score_t> > all_scores;
@@ -270,9 +239,7 @@ main(int argc, char** argv)
cerr << setw(25) << "max pairs " << max_pairs << endl;
cerr << setw(25) << "cdec cfg " << "'" << cfg["decoder_config"].as<string>() << "'" << endl;
cerr << setw(25) << "input " << "'" << input_fn << "'" << endl;
-#ifdef DTRAIN_LOCAL
cerr << setw(25) << "refs " << "'" << refs_fn << "'" << endl;
-#endif
cerr << setw(25) << "output " << "'" << output_fn << "'" << endl;
if (cfg.count("input_weights"))
cerr << setw(25) << "weights in " << "'" << cfg["input_weights"].as<string>() << "'" << endl;
@@ -285,14 +252,10 @@ main(int argc, char** argv)
for (unsigned t = 0; t < T; t++) // T epochs
{
- if (hstreaming) cerr << "reporter:status:Iteration #" << t+1 << " of " << T << endl;
-
time_t start, end;
time(&start);
-#ifndef DTRAIN_LOCAL
igzstream grammar_buf_in;
if (t > 0) grammar_buf_in.open(grammar_buf_fn.c_str());
-#endif
score_t score_sum = 0.;
score_t model_sum(0);
unsigned ii = 0, rank_errors = 0, margin_violations = 0, npairs = 0, f_count = 0, list_sz = 0;
@@ -340,52 +303,6 @@ main(int argc, char** argv)
// getting input
vector<WordID> ref_ids; // reference as vector<WordID>
-#ifndef DTRAIN_LOCAL
- vector<string> in_split; // input: sid\tsrc\tref\tpsg
- if (t == 0) {
- // handling input
- split_in(in, in_split);
- if (hstreaming && ii == 0) cerr << "reporter:counter:" << task_id << ",First ID," << in_split[0] << endl;
- // getting reference
- vector<string> ref_tok;
- boost::split(ref_tok, in_split[2], boost::is_any_of(" "));
- register_and_convert(ref_tok, ref_ids);
- ref_ids_buf.push_back(ref_ids);
- // process and set grammar
- bool broken_grammar = true; // ignore broken grammars
- for (string::iterator it = in.begin(); it != in.end(); it++) {
- if (!isspace(*it)) {
- broken_grammar = false;
- break;
- }
- }
- if (broken_grammar) {
- cerr << "Broken grammar for " << ii+1 << "! Ignoring this input." << endl;
- continue;
- }
- boost::replace_all(in, "\t", "\n");
- in += "\n";
- grammar_buf_out << in << DTRAIN_GRAMMAR_DELIM << " " << in_split[0] << endl;
- decoder.AddSupplementalGrammarFromString(in);
- src_str_buf.push_back(in_split[1]);
- // decode
- observer->SetRef(ref_ids);
- decoder.Decode(in_split[1], observer);
- } else {
- // get buffered grammar
- string grammar_str;
- while (true) {
- string rule;
- getline(grammar_buf_in, rule);
- if (boost::starts_with(rule, DTRAIN_GRAMMAR_DELIM)) break;
- grammar_str += rule + "\n";
- }
- decoder.AddSupplementalGrammarFromString(grammar_str);
- // decode
- observer->SetRef(ref_ids_buf[ii]);
- decoder.Decode(src_str_buf[ii], observer);
- }
-#else
if (t == 0) {
string r_;
getline(*refs, r_);
@@ -402,7 +319,6 @@ main(int argc, char** argv)
decoder.Decode(in, observer);
else
decoder.Decode(src_str_buf[ii], observer);
-#endif
// get (scored) samples
vector<ScoredHyp>* samples = observer->GetSamples();
@@ -505,11 +421,6 @@ main(int argc, char** argv)
++ii;
- if (hstreaming) {
- rep.update_counter("Seen #"+boost::lexical_cast<string>(t+1), 1u);
- rep.update_counter("Seen", 1u);
- }
-
} // input loop
if (average) w_average += lambdas;
@@ -518,21 +429,8 @@ main(int argc, char** argv)
if (t == 0) {
in_sz = ii; // remember size of input (# lines)
- if (hstreaming) {
- rep.update_counter("|Input|", ii);
- rep.update_gcounter("|Input|", ii);
- rep.update_gcounter("Shards", 1u);
- }
}
-#ifndef DTRAIN_LOCAL
- if (t == 0) {
- grammar_buf_out.close();
- } else {
- grammar_buf_in.close();
- }
-#endif
-
// print some stats
score_t score_avg = score_sum/(score_t)in_sz;
score_t model_avg = model_sum/(score_t)in_sz;
@@ -546,7 +444,7 @@ main(int argc, char** argv)
}
unsigned nonz = 0;
- if (!quiet || hstreaming) nonz = (unsigned)lambdas.num_nonzero();
+ if (!quiet) nonz = (unsigned)lambdas.num_nonzero();
if (!quiet) {
cerr << _p5 << _p << "WEIGHTS" << endl;
@@ -571,16 +469,6 @@ main(int argc, char** argv)
cerr << " avg f count: " << f_count/(float)list_sz << endl;
}
- if (hstreaming) {
- rep.update_counter("Score 1best avg #"+boost::lexical_cast<string>(t+1), (unsigned)(score_avg*DTRAIN_SCALE));
- rep.update_counter("Model 1best avg #"+boost::lexical_cast<string>(t+1), (unsigned)(model_avg*DTRAIN_SCALE));
- rep.update_counter("Pairs avg #"+boost::lexical_cast<string>(t+1), (unsigned)((npairs/(weight_t)in_sz)*DTRAIN_SCALE));
- rep.update_counter("Rank errors avg #"+boost::lexical_cast<string>(t+1), (unsigned)((rank_errors/(weight_t)in_sz)*DTRAIN_SCALE));
- rep.update_counter("Margin violations avg #"+boost::lexical_cast<string>(t+1), (unsigned)((margin_violations/(weight_t)in_sz)*DTRAIN_SCALE));
- rep.update_counter("Non zero feature count #"+boost::lexical_cast<string>(t+1), nonz);
- rep.update_gcounter("Non zero feature count #"+boost::lexical_cast<string>(t+1), nonz);
- }
-
pair<score_t,score_t> remember;
remember.first = score_avg;
remember.second = model_avg;
@@ -611,10 +499,6 @@ main(int argc, char** argv)
if (average) w_average /= (weight_t)T;
-#ifndef DTRAIN_LOCAL
- unlink(grammar_buf_fn.c_str());
-#endif
-
if (!noup) {
if (!quiet) cerr << endl << "Writing weights file to '" << output_fn << "' ..." << endl;
if (select_weights == "last" || average) { // last, average
@@ -651,7 +535,6 @@ main(int argc, char** argv)
}
}
}
- if (output_fn == "-" && hstreaming) cout << "__SHARD_COUNT__\t1" << endl;
if (!quiet) cerr << "done" << endl;
}
diff --git a/training/dtrain/dtrain.h b/training/dtrain/dtrain.h
index 572fd613..f368d810 100644
--- a/training/dtrain/dtrain.h
+++ b/training/dtrain/dtrain.h
@@ -1,14 +1,12 @@
#ifndef _DTRAIN_H_
#define _DTRAIN_H_
-#undef DTRAIN_FASTER_PERCEPTRON // only look at misranked pairs
- // DO NOT USE WITH SVM!
-#define DTRAIN_LOCAL
+#undef DTRAIN_FASTER_PERCEPTRON // only consider actually misranked pairs
+ // DO NOT ENABLE WITH SVM (gamma > 0) OR loss_margin!
+
#define DTRAIN_DOTS 10 // after how many inputs to display a '.'
-#define DTRAIN_GRAMMAR_DELIM "########EOS########"
#define DTRAIN_SCALE 100000
-
#include <iomanip>
#include <climits>
#include <string.h>
diff --git a/training/dtrain/hstreaming/avg.rb b/training/dtrain/hstreaming/avg.rb
deleted file mode 100755
index 2599c732..00000000
--- a/training/dtrain/hstreaming/avg.rb
+++ /dev/null
@@ -1,32 +0,0 @@
-#!/usr/bin/env ruby
-# first arg may be an int of custom shard count
-
-shard_count_key = "__SHARD_COUNT__"
-
-STDIN.set_encoding 'utf-8'
-STDOUT.set_encoding 'utf-8'
-
-w = {}
-c = {}
-w.default = 0
-c.default = 0
-while line = STDIN.gets
- key, val = line.split /\s/
- w[key] += val.to_f
- c[key] += 1
-end
-
-if ARGV.size == 0
- shard_count = w["__SHARD_COUNT__"]
-else
- shard_count = ARGV[0].to_f
-end
-w.each_key { |k|
- if k == shard_count_key
- next
- else
- puts "#{k}\t#{w[k]/shard_count}"
- #puts "# #{c[k]}"
- end
-}
-
diff --git a/training/dtrain/hstreaming/cdec.ini b/training/dtrain/hstreaming/cdec.ini
deleted file mode 100644
index d4f5cecd..00000000
--- a/training/dtrain/hstreaming/cdec.ini
+++ /dev/null
@@ -1,22 +0,0 @@
-formalism=scfg
-add_pass_through_rules=true
-scfg_max_span_limit=15
-intersection_strategy=cube_pruning
-cubepruning_pop_limit=30
-feature_function=WordPenalty
-feature_function=KLanguageModel nc-wmt11.en.srilm.gz
-#feature_function=ArityPenalty
-#feature_function=CMR2008ReorderingFeatures
-#feature_function=Dwarf
-#feature_function=InputIndicator
-#feature_function=LexNullJump
-#feature_function=NewJump
-#feature_function=NgramFeatures
-#feature_function=NonLatinCount
-#feature_function=OutputIndicator
-#feature_function=RuleIdentityFeatures
-#feature_function=RuleNgramFeatures
-#feature_function=RuleShape
-#feature_function=SourceSpanSizeFeatures
-#feature_function=SourceWordPenalty
-#feature_function=SpanFeatures
diff --git a/training/dtrain/hstreaming/dtrain.ini b/training/dtrain/hstreaming/dtrain.ini
deleted file mode 100644
index a2c219a1..00000000
--- a/training/dtrain/hstreaming/dtrain.ini
+++ /dev/null
@@ -1,15 +0,0 @@
-input=-
-output=-
-decoder_config=cdec.ini
-tmp=/var/hadoop/mapred/local/
-epochs=1
-k=100
-N=4
-learning_rate=0.0001
-gamma=0
-scorer=stupid_bleu
-sample_from=kbest
-filter=uniq
-pair_sampling=XYX
-pair_threshold=0
-select_weights=last
diff --git a/training/dtrain/hstreaming/dtrain.sh b/training/dtrain/hstreaming/dtrain.sh
deleted file mode 100755
index 877ff94c..00000000
--- a/training/dtrain/hstreaming/dtrain.sh
+++ /dev/null
@@ -1,9 +0,0 @@
-#!/bin/bash
-# script to run dtrain with a task id
-
-pushd . &>/dev/null
-cd ..
-ID=$(basename $(pwd)) # attempt_...
-popd &>/dev/null
-./dtrain -c dtrain.ini --hstreaming $ID
-
diff --git a/training/dtrain/hstreaming/hadoop-streaming-job.sh b/training/dtrain/hstreaming/hadoop-streaming-job.sh
deleted file mode 100755
index 92419956..00000000
--- a/training/dtrain/hstreaming/hadoop-streaming-job.sh
+++ /dev/null
@@ -1,30 +0,0 @@
-#!/bin/sh
-
-EXP=a_simple_test
-
-# change these vars to fit your hadoop installation
-HADOOP_HOME=/usr/lib/hadoop-0.20
-JAR=contrib/streaming/hadoop-streaming-0.20.2-cdh3u1.jar
-HSTREAMING="$HADOOP_HOME/bin/hadoop jar $HADOOP_HOME/$JAR"
-
- IN=input_on_hdfs
-OUT=output_weights_on_hdfs
-
-# you can -reducer to NONE if you want to
-# do feature selection/averaging locally (e.g. to
-# keep weights of all epochs)
-$HSTREAMING \
- -mapper "dtrain.sh" \
- -reducer "ruby lplp.rb l2 select_k 100000" \
- -input $IN \
- -output $OUT \
- -file dtrain.sh \
- -file lplp.rb \
- -file ../dtrain \
- -file dtrain.ini \
- -file cdec.ini \
- -file ../test/example/nc-wmt11.en.srilm.gz \
- -jobconf mapred.reduce.tasks=30 \
- -jobconf mapred.max.map.failures.percent=0 \
- -jobconf mapred.job.name="dtrain $EXP"
-
diff --git a/training/dtrain/hstreaming/red-test b/training/dtrain/hstreaming/red-test
deleted file mode 100644
index 2623d697..00000000
--- a/training/dtrain/hstreaming/red-test
+++ /dev/null
@@ -1,9 +0,0 @@
-a 1
-b 2
-c 3.5
-a 1
-b 2
-c 3.5
-d 1
-e 2
-__SHARD_COUNT__ 2
diff --git a/training/dtrain/hstreaming/lplp.rb b/training/dtrain/lplp.rb
index f0cd58c5..f0cd58c5 100755
--- a/training/dtrain/hstreaming/lplp.rb
+++ b/training/dtrain/lplp.rb
diff --git a/training/dtrain/parallelize.rb b/training/dtrain/parallelize.rb
index fca9b10d..24e7f49e 100755
--- a/training/dtrain/parallelize.rb
+++ b/training/dtrain/parallelize.rb
@@ -80,7 +80,7 @@ def make_shards(input, refs, num_shards, epoch, rand)
shard_refs = File.new refs_fn, 'w+'
refs_fns << refs_fn
0.upto(shard_sz-1) { |i|
- j = index.pop
+ j = index.pop
shard_in.write in_lines[j]
shard_refs.write refs_lines[j]
}
@@ -125,7 +125,7 @@ end
if use_qsub
qsub_str_start = "qsub -cwd -sync y -b y -j y -o work/out.#{shard}.#{epoch} -N dtrain.#{shard}.#{epoch} \""
qsub_str_end = "\""
- local_end = ''
+ local_end = ''
else
local_end = "&>work/out.#{shard}.#{epoch}"
end
diff --git a/training/dtrain/test/example/cdec.ini b/training/dtrain/test/example/cdec.ini
index 068ebd4d..0215416d 100644
--- a/training/dtrain/test/example/cdec.ini
+++ b/training/dtrain/test/example/cdec.ini
@@ -2,7 +2,7 @@ formalism=scfg
add_pass_through_rules=true
scfg_max_span_limit=15
intersection_strategy=cube_pruning
-cubepruning_pop_limit=30
+cubepruning_pop_limit=200
feature_function=WordPenalty
feature_function=KLanguageModel ./nc-wmt11.en.srilm.gz
# all currently working feature functions for translation: