From 606750f2487ed294dcdadcd99638eb5de80d1a0c Mon Sep 17 00:00:00 2001
From: Patrick Simianer
Date: Fri, 11 Nov 2011 11:28:24 +0100
Subject: better hstreaming reporting, new hstreaming wrapper
---
dtrain/README.md | 3 +++
dtrain/dtrain.cc | 25 +++++++++++++++++--------
dtrain/dtrain.h | 19 +++++++++++++++++++
dtrain/hstreaming/dtrain.sh | 8 ++++++++
4 files changed, 47 insertions(+), 8 deletions(-)
create mode 100755 dtrain/hstreaming/dtrain.sh
diff --git a/dtrain/README.md b/dtrain/README.md
index 58c6dddc..faedf8a7 100644
--- a/dtrain/README.md
+++ b/dtrain/README.md
@@ -48,6 +48,9 @@ Uncertain, known bugs, problems
* lower beam size to be faster?
* why is -100 in lm so good?
* noise helps?
+* what does srilm do with -unk but nothing mapped to unk (<unk> unigram)?
+ => this: http://www-speech.sri.com/pipermail/srilm-user/2007q4/000543.html
+* mira translation sampling?
random notes
------------
diff --git a/dtrain/dtrain.cc b/dtrain/dtrain.cc
index 2fe7afd7..2d15f059 100644
--- a/dtrain/dtrain.cc
+++ b/dtrain/dtrain.cc
@@ -19,7 +19,7 @@ dtrain_init(int argc, char** argv, po::variables_map* cfg)
("scorer", po::value()->default_value("stupid_bleu"), "scoring: bleu, stupid_*, smooth_*, approx_*")
("stop_after", po::value()->default_value(0), "stop after X input sentences")
("print_weights", po::value(), "weights to print on each iteration")
- ("hstreaming", po::value()->zero_tokens(), "run in hadoop streaming mode")
+ // deliberately no default_value(): a defaulted option is always present in
+ // the variables_map, so the cfg.count("hstreaming") check in main() would
+ // always succeed and hadoop streaming mode (and quiet) would always be on
+ ("hstreaming", po::value<string>(), "run in hadoop streaming mode, arg is a task id")
("learning_rate", po::value()->default_value(0.0005), "learning rate")
("gamma", po::value()->default_value(0), "gamma for SVM (0 for perceptron)")
("tmp", po::value()->default_value("/tmp"), "temp dir to use")
@@ -91,11 +91,14 @@ main(int argc, char** argv)
bool noup = false;
if (cfg.count("noup")) noup = true;
bool hstreaming = false;
+ string task_id;
if (cfg.count("hstreaming")) {
hstreaming = true;
quiet = true;
+ task_id = cfg["hstreaming"].as();
cerr.precision(17);
}
+ HSReporter rep(task_id);
bool keep_w = false;
if (cfg.count("keep_w")) keep_w = true;
@@ -384,16 +387,18 @@ main(int argc, char** argv)
++ii;
- if (hstreaming) cerr << "reporter:counter:dtrain,count,1" << endl;
+ if (hstreaming) rep.update_counter("Seen", 1u);
} // input loop
- if (hstreaming && t == 0) cerr << "reporter:counter:dtrain,|input|," << ii+1 << endl;
-
if (scorer_str == "approx_bleu") scorer->Reset();
if (t == 0) {
in_sz = ii; // remember size of input (# lines)
+ if (hstreaming) {
+ rep.update_counter("|Input|", ii+1);
+ rep.update_gcounter("|Input|", ii+1);
+ }
}
#ifndef DTRAIN_LOCAL
@@ -415,10 +420,6 @@ main(int argc, char** argv)
score_diff = score_avg;
model_diff = model_avg;
}
- if (hstreaming) {
- cerr << "reporter:counter:dtrain,score avg it " << t+1 << "," << score_avg << endl;
- cerr << "reporter:counter:dtrain,model avg it " << t+1 << "," << model_avg << endl;
- }
if (!quiet) {
cerr << _p5 << _p << "WEIGHTS" << endl;
@@ -435,6 +436,14 @@ main(int argc, char** argv)
cerr << " avg #up: ";
cerr << nup/(float)in_sz << endl;
}
+
+ if (hstreaming) {
+ rep.update_counter("Score avg #"+boost::lexical_cast(t+1), score_avg);
+ rep.update_counter("Model avg #"+boost::lexical_cast(t+1), model_avg);
+ rep.update_counter("Pairs avg #"+boost::lexical_cast(t+1), npairs/(weight_t)in_sz);
+ rep.update_counter("Updates avg #"+boost::lexical_cast(t+1), nup/(weight_t)in_sz);
+ }
+
pair remember;
remember.first = score_avg;
remember.second = model_avg;
diff --git a/dtrain/dtrain.h b/dtrain/dtrain.h
index 6742f343..84f3f1f5 100644
--- a/dtrain/dtrain.h
+++ b/dtrain/dtrain.h
@@ -52,6 +52,25 @@ inline void split_in(string& s, vector& parts)
s.erase(0, f+1);
}
+// Writes counter updates to stderr, one line per update, in the form
+//   reporter:counter:<group>,<name>,<amount>
+// update_counter() uses the task id passed to the constructor as <group>,
+// update_gcounter() uses the fixed group "Global".
+// NOTE(review): Hadoop streaming appears to parse <amount> as an integer, so
+// updates sent through the weight_t overloads may be dropped by the
+// framework -- confirm against the Hadoop version in use.
+struct HSReporter
+{
+  string task_id_; // counter group used by update_counter()
+
+  HSReporter(string task_id) : task_id_(task_id) {}
+
+  // Report `amount` for counter `name` in this task's group.
+  inline void update_counter(const string& name, weight_t amount) {
+    emit(task_id_, name, amount);
+  }
+  inline void update_counter(const string& name, unsigned amount) {
+    emit(task_id_, name, amount);
+  }
+
+  // Report `amount` for counter `name` in the "Global" group.
+  inline void update_gcounter(const string& name, weight_t amount) {
+    emit("Global", name, amount);
+  }
+  inline void update_gcounter(const string& name, unsigned amount) {
+    emit("Global", name, amount);
+  }
+
+private:
+  // Single place that knows the reporter line format; all overloads above
+  // funnel into it so the format cannot drift between them.
+  template <typename T>
+  static void emit(const string& group, const string& name, const T& amount) {
+    cerr << "reporter:counter:" << group << "," << name << "," << amount << endl;
+  }
+};
+
inline ostream& _np(ostream& out) { return out << resetiosflags(ios::showpos); }
inline ostream& _p(ostream& out) { return out << setiosflags(ios::showpos); }
inline ostream& _p2(ostream& out) { return out << setprecision(2); }
diff --git a/dtrain/hstreaming/dtrain.sh b/dtrain/hstreaming/dtrain.sh
new file mode 100755
index 00000000..6d34012a
--- /dev/null
+++ b/dtrain/hstreaming/dtrain.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+
+pushd .
+cd ..
+ID=$(basename $(pwd))
+popd
+./dtrain -c dtrain.ini --hstreaming $ID
+
--
cgit v1.2.3