diff options
-rw-r--r-- | dtrain/README.md | 3 | ||||
-rw-r--r-- | dtrain/dtrain.cc | 25 | ||||
-rw-r--r-- | dtrain/dtrain.h | 19 | ||||
-rwxr-xr-x | dtrain/hstreaming/dtrain.sh | 8 |
4 files changed, 47 insertions, 8 deletions
diff --git a/dtrain/README.md b/dtrain/README.md index 58c6dddc..faedf8a7 100644 --- a/dtrain/README.md +++ b/dtrain/README.md @@ -48,6 +48,9 @@ Uncertain, known bugs, problems * lower beam size to be faster? * why is <unk> -100 in lm so good? * noise helps? +* what does srilm do with -unk but nothing mapped to unk (<unk> unigram)? + => this: http://www-speech.sri.com/pipermail/srilm-user/2007q4/000543.html +* mira translation sampling? random notes ------------ diff --git a/dtrain/dtrain.cc b/dtrain/dtrain.cc index 2fe7afd7..2d15f059 100644 --- a/dtrain/dtrain.cc +++ b/dtrain/dtrain.cc @@ -19,7 +19,7 @@ dtrain_init(int argc, char** argv, po::variables_map* cfg) ("scorer", po::value<string>()->default_value("stupid_bleu"), "scoring: bleu, stupid_*, smooth_*, approx_*") ("stop_after", po::value<unsigned>()->default_value(0), "stop after X input sentences") ("print_weights", po::value<string>(), "weights to print on each iteration") - ("hstreaming", po::value<bool>()->zero_tokens(), "run in hadoop streaming mode") + ("hstreaming", po::value<string>()->default_value("N/A"), "run in hadoop streaming mode, arg is a task id") ("learning_rate", po::value<weight_t>()->default_value(0.0005), "learning rate") ("gamma", po::value<weight_t>()->default_value(0), "gamma for SVM (0 for perceptron)") ("tmp", po::value<string>()->default_value("/tmp"), "temp dir to use") @@ -91,11 +91,14 @@ main(int argc, char** argv) bool noup = false; if (cfg.count("noup")) noup = true; bool hstreaming = false; + string task_id; if (cfg.count("hstreaming")) { hstreaming = true; quiet = true; + task_id = cfg["hstreaming"].as<string>(); cerr.precision(17); } + HSReporter rep(task_id); bool keep_w = false; if (cfg.count("keep_w")) keep_w = true; @@ -384,16 +387,18 @@ main(int argc, char** argv) ++ii; - if (hstreaming) cerr << "reporter:counter:dtrain,count,1" << endl; + if (hstreaming) rep.update_counter("Seen", 1u); } // input loop - if (hstreaming && t == 0) cerr << "reporter:counter:dtrain,|input|," << ii+1 << endl; - if (scorer_str == "approx_bleu") scorer->Reset(); if (t == 0) { in_sz = ii; // remember size of input (# lines) + if (hstreaming) { + rep.update_counter("|Input|", ii+1); + rep.update_gcounter("|Input|", ii+1); + } } #ifndef DTRAIN_LOCAL @@ -415,10 +420,6 @@ main(int argc, char** argv) score_diff = score_avg; model_diff = model_avg; } - if (hstreaming) { - cerr << "reporter:counter:dtrain,score avg it " << t+1 << "," << score_avg << endl; - cerr << "reporter:counter:dtrain,model avg it " << t+1 << "," << model_avg << endl; - } if (!quiet) { cerr << _p5 << _p << "WEIGHTS" << endl; @@ -435,6 +436,14 @@ main(int argc, char** argv) cerr << " avg #up: "; cerr << nup/(float)in_sz << endl; } + + if (hstreaming) { + rep.update_counter("Score avg #"+boost::lexical_cast<string>(t+1), score_avg); + rep.update_counter("Model avg #"+boost::lexical_cast<string>(t+1), model_avg); + rep.update_counter("Pairs avg #"+boost::lexical_cast<string>(t+1), npairs/(weight_t)in_sz); + rep.update_counter("Updates avg #"+boost::lexical_cast<string>(t+1), nup/(weight_t)in_sz); + } + pair<score_t,score_t> remember; remember.first = score_avg; remember.second = model_avg; diff --git a/dtrain/dtrain.h b/dtrain/dtrain.h index 6742f343..84f3f1f5 100644 --- a/dtrain/dtrain.h +++ b/dtrain/dtrain.h @@ -52,6 +52,25 @@ inline void split_in(string& s, vector<string>& parts) s.erase(0, f+1); } +struct HSReporter +{ + string task_id_; + + HSReporter(string task_id) : task_id_(task_id) {} + inline void update_counter(string name, weight_t amount) { + cerr << "reporter:counter:" << task_id_ << "," << name << "," << amount << endl; + } + inline void update_counter(string name, unsigned amount) { + cerr << "reporter:counter:" << task_id_ << "," << name << "," << amount << endl; + } + inline void update_gcounter(string name, weight_t amount) { + cerr << "reporter:counter:Global," << name << "," << amount << endl; + } + inline void update_gcounter(string name, unsigned amount) { + cerr << "reporter:counter:Global," << name << "," << amount << endl; + } +}; + inline ostream& _np(ostream& out) { return out << resetiosflags(ios::showpos); } inline ostream& _p(ostream& out) { return out << setiosflags(ios::showpos); } inline ostream& _p2(ostream& out) { return out << setprecision(2); } diff --git a/dtrain/hstreaming/dtrain.sh b/dtrain/hstreaming/dtrain.sh new file mode 100755 index 00000000..6d34012a --- /dev/null +++ b/dtrain/hstreaming/dtrain.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +pushd . +cd .. +ID=$(basename $(pwd)) +popd +./dtrain -c dtrain.ini --hstreaming $ID + |