diff options
| -rw-r--r-- | dtrain/README.md | 3 | ||||
| -rw-r--r-- | dtrain/dtrain.cc | 25 | ||||
| -rw-r--r-- | dtrain/dtrain.h | 19 | ||||
| -rwxr-xr-x | dtrain/hstreaming/dtrain.sh | 8 | 
4 files changed, 47 insertions, 8 deletions
diff --git a/dtrain/README.md b/dtrain/README.md index 58c6dddc..faedf8a7 100644 --- a/dtrain/README.md +++ b/dtrain/README.md @@ -48,6 +48,9 @@ Uncertain, known bugs, problems  * lower beam size to be faster?  * why is <unk> -100 in lm so good?  * noise helps? +* what does srilm do with -unk but nothing mapped to unk (<unk> unigram)? +  => this: http://www-speech.sri.com/pipermail/srilm-user/2007q4/000543.html +* mira translation sampling?  random notes  ------------ diff --git a/dtrain/dtrain.cc b/dtrain/dtrain.cc index 2fe7afd7..2d15f059 100644 --- a/dtrain/dtrain.cc +++ b/dtrain/dtrain.cc @@ -19,7 +19,7 @@ dtrain_init(int argc, char** argv, po::variables_map* cfg)      ("scorer",         po::value<string>()->default_value("stupid_bleu"),     "scoring: bleu, stupid_*, smooth_*, approx_*")      ("stop_after",     po::value<unsigned>()->default_value(0),                              "stop after X input sentences")      ("print_weights",  po::value<string>(),                                            "weights to print on each iteration") -    ("hstreaming",     po::value<bool>()->zero_tokens(),                                     "run in hadoop streaming mode") +    ("hstreaming",     po::value<string>()->default_value("N/A"),          "run in hadoop streaming mode, arg is a task id")      ("learning_rate",  po::value<weight_t>()->default_value(0.0005),                                        "learning rate")      ("gamma",          po::value<weight_t>()->default_value(0),                          "gamma for SVM (0 for perceptron)")      ("tmp",            po::value<string>()->default_value("/tmp"),                                        "temp dir to use") @@ -91,11 +91,14 @@ main(int argc, char** argv)    bool noup = false;    if (cfg.count("noup")) noup = true;    bool hstreaming = false; +  string task_id;    if (cfg.count("hstreaming")) {      hstreaming = true;      quiet = true; +    task_id = cfg["hstreaming"].as<string>();      cerr.precision(17);    } +  HSReporter rep(task_id);    bool keep_w = false;    if (cfg.count("keep_w")) keep_w = true; @@ -384,16 +387,18 @@ main(int argc, char** argv)      ++ii; -    if (hstreaming) cerr << "reporter:counter:dtrain,count,1" << endl; +    if (hstreaming) rep.update_counter("Seen", 1u);    } // input loop -  if (hstreaming && t == 0) cerr << "reporter:counter:dtrain,|input|," << ii+1 << endl; -    if (scorer_str == "approx_bleu") scorer->Reset();    if (t == 0) {      in_sz = ii; // remember size of input (# lines) +    if (hstreaming) { +      rep.update_counter("|Input|", ii+1); +      rep.update_gcounter("|Input|", ii+1); +    }    }  #ifndef DTRAIN_LOCAL @@ -415,10 +420,6 @@ main(int argc, char** argv)      score_diff = score_avg;      model_diff = model_avg;    } -  if (hstreaming) { -    cerr << "reporter:counter:dtrain,score avg it " << t+1 << "," << score_avg << endl; -    cerr << "reporter:counter:dtrain,model avg it " << t+1 << "," << model_avg << endl; -  }    if (!quiet) {      cerr << _p5 << _p << "WEIGHTS" << endl; @@ -435,6 +436,14 @@ main(int argc, char** argv)      cerr << "              avg #up: ";      cerr << nup/(float)in_sz << endl;    } + +  if (hstreaming) { +    rep.update_counter("Score avg #"+boost::lexical_cast<string>(t+1), score_avg);  +    rep.update_counter("Model avg #"+boost::lexical_cast<string>(t+1), model_avg);  +    rep.update_counter("Pairs avg #"+boost::lexical_cast<string>(t+1), npairs/(weight_t)in_sz);  +    rep.update_counter("Updates avg #"+boost::lexical_cast<string>(t+1), nup/(weight_t)in_sz);  +  } +    pair<score_t,score_t> remember;    remember.first = score_avg;    remember.second = model_avg; diff --git a/dtrain/dtrain.h b/dtrain/dtrain.h index 6742f343..84f3f1f5 100644 --- a/dtrain/dtrain.h +++ b/dtrain/dtrain.h @@ -52,6 +52,25 @@ inline void split_in(string& s, vector<string>& parts)    s.erase(0, f+1);  } +struct HSReporter +{ +  string task_id_; + +  HSReporter(string task_id) : task_id_(task_id) {} +  inline void update_counter(string name, weight_t amount) { +    cerr << "reporter:counter:" << task_id_ << "," << name << "," << amount << endl; +  } +  inline void update_counter(string name, unsigned amount) { +    cerr << "reporter:counter:" << task_id_ << "," << name << "," << amount << endl; +  } +  inline void update_gcounter(string name, weight_t amount) { +    cerr << "reporter:counter:Global," << name << "," << amount << endl; +  } +  inline void update_gcounter(string name, unsigned amount) { +    cerr << "reporter:counter:Global," << name << "," << amount << endl; +  } +}; +  inline ostream& _np(ostream& out) { return out << resetiosflags(ios::showpos); }  inline ostream& _p(ostream& out)  { return out << setiosflags(ios::showpos); }  inline ostream& _p2(ostream& out) { return out << setprecision(2); } diff --git a/dtrain/hstreaming/dtrain.sh b/dtrain/hstreaming/dtrain.sh new file mode 100755 index 00000000..6d34012a --- /dev/null +++ b/dtrain/hstreaming/dtrain.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +pushd . +cd .. +ID=$(basename $(pwd)) +popd +./dtrain -c dtrain.ini --hstreaming $ID  +  | 
