From e377e9de738773d03e600681b0f0d2797df717c6 Mon Sep 17 00:00:00 2001 From: Patrick Simianer Date: Mon, 10 Oct 2011 22:39:41 +0200 Subject: speed --- .gitignore | 1 + dtrain/README | 5 +++++ dtrain/dtrain.cc | 12 ++++++------ dtrain/dtrain.h | 12 ++++++++++++ dtrain/test/example/dtrain.ini | 10 ++++++---- 5 files changed, 30 insertions(+), 10 deletions(-) diff --git a/.gitignore b/.gitignore index 5b703e41..0590b009 100644 --- a/.gitignore +++ b/.gitignore @@ -132,3 +132,4 @@ utils/ts training/compute_cllh dtrain/dtrain weights.gz +dtrain/test/eval/ diff --git a/dtrain/README b/dtrain/README index 42b91b9b..a6c48fcc 100644 --- a/dtrain/README +++ b/dtrain/README @@ -31,3 +31,8 @@ KNOWN BUGS, PROBLEMS PhraseModel_* features (0..99 seem to be generated, why 99?) flex scanner jams on malicious input, we could skip that +FIX + approx BLEU fix + merge + ep data + r\tr\tr\tr\tr diff --git a/dtrain/dtrain.cc b/dtrain/dtrain.cc index 69f83633..5d84f250 100644 --- a/dtrain/dtrain.cc +++ b/dtrain/dtrain.cc @@ -241,7 +241,7 @@ main(int argc, char** argv) vector ref_ids; // reference as vector if (t == 0) { // handling input - boost::split(in_split, in, boost::is_any_of("\t")); + split_in(in, in_split); // getting reference vector<string> ref_tok; boost::split(ref_tok, in_split[2], boost::is_any_of(" ")); @@ -249,17 +249,17 @@ main(int argc, char** argv) ref_ids_buf.push_back(ref_ids); // process and set grammar bool broken_grammar = true; - for (string::iterator it = in_split[3].begin(); it != in_split[3].end(); it++) { + for (string::iterator it = in.begin(); it != in.end(); it++) { if (!isspace(*it)) { broken_grammar = false; break; } } if (broken_grammar) continue; - boost::replace_all(in_split[3], " __NEXT__RULE__ ", "\n"); // TODO - in_split[3] += "\n"; - grammar_buf_out << in_split[3] << DTRAIN_GRAMMAR_DELIM << " " << in_split[0] << endl; - decoder.SetSentenceGrammarFromString(in_split[3]); + boost::replace_all(in, "\t", "\n"); + in += "\n"; + grammar_buf_out << in << 
DTRAIN_GRAMMAR_DELIM << " " << in_split[0] << endl; + decoder.SetSentenceGrammarFromString(in); src_str_buf.push_back(in_split[1]); // decode observer->SetRef(ref_ids); diff --git a/dtrain/dtrain.h b/dtrain/dtrain.h index 34464e3c..e98ef470 100644 --- a/dtrain/dtrain.h +++ b/dtrain/dtrain.h @@ -38,6 +38,18 @@ inline string gettmpf(const string path, const string infix, const string suffix return string(fn); } +inline void split_in(string& s, vector<string>& parts) +{ + unsigned f = 0; + for(unsigned i = 0; i < 3; i++) { + unsigned e = f; + f = s.find("\t", f+1); + if (e != 0) parts.push_back(s.substr(e+1, f-e-1)); + else parts.push_back(s.substr(0, f)); + } + s.erase(0, f+1); +} + inline ostream& _np(ostream& out) { return out << resetiosflags(ios::showpos); } inline ostream& _p(ostream& out) { return out << setiosflags(ios::showpos); } inline ostream& _p2(ostream& out) { return out << setprecision(2); } diff --git a/dtrain/test/example/dtrain.ini b/dtrain/test/example/dtrain.ini index c560a3a6..09c876d9 100644 --- a/dtrain/test/example/dtrain.ini +++ b/dtrain/test/example/dtrain.ini @@ -1,11 +1,13 @@ decoder_config=test/example/cdec.ini k=100 N=3 -epochs=1000 -input=test/example/nc-1k.gz +epochs=4 +#input=test/example/nc-1k.gz +input=test/example/nc-1k-tabs.gz scorer=stupid_bleu -output=/tmp/weights.gz -stop_after=10 +output=VOID +#/tmp/weights.gz +stop_after=100 sample_from=kbest pair_sampling=all print_weights=Glue WordPenalty LanguageModel LanguageModel_OOV PhraseModel_0 PhraseModel_1 PhraseModel_2 PhraseModel_3 PhraseModel_4 PassThrough -- cgit v1.2.3