summary refs log tree commit diff
path: root/dtrain
diff options
context:
space:
mode:
author Patrick Simianer <p@simianer.de> 2011-10-10 22:39:41 +0200
committer Patrick Simianer <p@simianer.de> 2011-10-10 22:39:41 +0200
commit e377e9de738773d03e600681b0f0d2797df717c6 (patch)
tree 4bd4a32c498c115886ae3b1165a469678241cc00 /dtrain
parent abc2919ccf6cb57dd0320716cad378866b08054a (diff)
speed
Diffstat (limited to 'dtrain')
-rw-r--r-- dtrain/README 5
-rw-r--r-- dtrain/dtrain.cc 12
-rw-r--r-- dtrain/dtrain.h 12
-rw-r--r-- dtrain/test/example/dtrain.ini 10
4 files changed, 29 insertions, 10 deletions
diff --git a/dtrain/README b/dtrain/README
index 42b91b9b..a6c48fcc 100644
--- a/dtrain/README
+++ b/dtrain/README
@@ -31,3 +31,8 @@ KNOWN BUGS, PROBLEMS
PhraseModel_* features (0..99 seem to be generated, why 99?)
flex scanner jams on malicious input, we could skip that
+FIX
+ approx BLEU fix
+ merge
+ ep data
+ r\tr\tr\tr\tr
diff --git a/dtrain/dtrain.cc b/dtrain/dtrain.cc
index 69f83633..5d84f250 100644
--- a/dtrain/dtrain.cc
+++ b/dtrain/dtrain.cc
@@ -241,7 +241,7 @@ main(int argc, char** argv)
vector<WordID> ref_ids; // reference as vector<WordID>
if (t == 0) {
// handling input
- boost::split(in_split, in, boost::is_any_of("\t"));
+ split_in(in, in_split);
// getting reference
vector<string> ref_tok;
boost::split(ref_tok, in_split[2], boost::is_any_of(" "));
@@ -249,17 +249,17 @@ main(int argc, char** argv)
ref_ids_buf.push_back(ref_ids);
// process and set grammar
bool broken_grammar = true;
- for (string::iterator it = in_split[3].begin(); it != in_split[3].end(); it++) {
+ for (string::iterator it = in.begin(); it != in.end(); it++) {
if (!isspace(*it)) {
broken_grammar = false;
break;
}
}
if (broken_grammar) continue;
- boost::replace_all(in_split[3], " __NEXT__RULE__ ", "\n"); // TODO
- in_split[3] += "\n";
- grammar_buf_out << in_split[3] << DTRAIN_GRAMMAR_DELIM << " " << in_split[0] << endl;
- decoder.SetSentenceGrammarFromString(in_split[3]);
+ boost::replace_all(in, "\t", "\n");
+ in += "\n";
+ grammar_buf_out << in << DTRAIN_GRAMMAR_DELIM << " " << in_split[0] << endl;
+ decoder.SetSentenceGrammarFromString(in);
src_str_buf.push_back(in_split[1]);
// decode
observer->SetRef(ref_ids);
diff --git a/dtrain/dtrain.h b/dtrain/dtrain.h
index 34464e3c..e98ef470 100644
--- a/dtrain/dtrain.h
+++ b/dtrain/dtrain.h
@@ -38,6 +38,18 @@ inline string gettmpf(const string path, const string infix, const string suffix
return string(fn);
}
+// Appends the first three tab-separated fields of s to parts and removes
+// them (together with their trailing tabs) from s, so that s is left holding
+// only what follows the third tab.
+// Exactly three strings are always appended: if s contains fewer than three
+// tabs, the last available field runs to the end of s, the missing fields
+// are appended as empty strings, and s is left empty.
+// NOTE(review): parts is appended to, never cleared -- confirm that callers
+// pass an empty vector (or index relative to its previous size).
+inline void split_in(string& s, vector<string>& parts)
+{
+  // size_type, not unsigned: string::npos must survive the assignment intact.
+  string::size_type start = 0;
+  bool exhausted = false;
+  for (unsigned i = 0; i < 3; i++) {
+    if (exhausted) {
+      parts.push_back(string());
+      continue;
+    }
+    // Search from start itself so that a tab at index 0 (empty first field)
+    // is found instead of being skipped.
+    const string::size_type tab = s.find('\t', start);
+    if (tab == string::npos) {
+      parts.push_back(s.substr(start));
+      exhausted = true;
+    } else {
+      parts.push_back(s.substr(start, tab - start));
+      start = tab + 1;
+    }
+  }
+  if (exhausted) s.clear();
+  else s.erase(0, start);
+}
+
inline ostream& _np(ostream& out) { return out << resetiosflags(ios::showpos); }
inline ostream& _p(ostream& out) { return out << setiosflags(ios::showpos); }
inline ostream& _p2(ostream& out) { return out << setprecision(2); }
diff --git a/dtrain/test/example/dtrain.ini b/dtrain/test/example/dtrain.ini
index c560a3a6..09c876d9 100644
--- a/dtrain/test/example/dtrain.ini
+++ b/dtrain/test/example/dtrain.ini
@@ -1,11 +1,13 @@
decoder_config=test/example/cdec.ini
k=100
N=3
-epochs=1000
-input=test/example/nc-1k.gz
+epochs=4
+#input=test/example/nc-1k.gz
+input=test/example/nc-1k-tabs.gz
scorer=stupid_bleu
-output=/tmp/weights.gz
-stop_after=10
+output=VOID
+#/tmp/weights.gz
+stop_after=100
sample_from=kbest
pair_sampling=all
print_weights=Glue WordPenalty LanguageModel LanguageModel_OOV PhraseModel_0 PhraseModel_1 PhraseModel_2 PhraseModel_3 PhraseModel_4 PassThrough