summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- .gitignore 1
-rw-r--r-- dtrain/README 5
-rw-r--r-- dtrain/dtrain.cc 12
-rw-r--r-- dtrain/dtrain.h 12
-rw-r--r-- dtrain/test/example/dtrain.ini 10
5 files changed, 30 insertions, 10 deletions
diff --git a/.gitignore b/.gitignore
index 5b703e41..0590b009 100644
--- a/.gitignore
+++ b/.gitignore
@@ -132,3 +132,4 @@ utils/ts
training/compute_cllh
dtrain/dtrain
weights.gz
+dtrain/test/eval/
diff --git a/dtrain/README b/dtrain/README
index 42b91b9b..a6c48fcc 100644
--- a/dtrain/README
+++ b/dtrain/README
@@ -31,3 +31,8 @@ KNOWN BUGS, PROBLEMS
PhraseModel_* features (0..99 seem to be generated, why 99?)
flex scanner jams on malicious input, we could skip that
+FIX
+ approx BLEU fix
+ merge
+ ep data
+ r\tr\tr\tr\tr
diff --git a/dtrain/dtrain.cc b/dtrain/dtrain.cc
index 69f83633..5d84f250 100644
--- a/dtrain/dtrain.cc
+++ b/dtrain/dtrain.cc
@@ -241,7 +241,7 @@ main(int argc, char** argv)
vector<WordID> ref_ids; // reference as vector<WordID>
if (t == 0) {
// handling input
- boost::split(in_split, in, boost::is_any_of("\t"));
+ split_in(in, in_split);
// getting reference
vector<string> ref_tok;
boost::split(ref_tok, in_split[2], boost::is_any_of(" "));
@@ -249,17 +249,17 @@ main(int argc, char** argv)
ref_ids_buf.push_back(ref_ids);
// process and set grammar
bool broken_grammar = true;
- for (string::iterator it = in_split[3].begin(); it != in_split[3].end(); it++) {
+ for (string::iterator it = in.begin(); it != in.end(); it++) {
if (!isspace(*it)) {
broken_grammar = false;
break;
}
}
if (broken_grammar) continue;
- boost::replace_all(in_split[3], " __NEXT__RULE__ ", "\n"); // TODO
- in_split[3] += "\n";
- grammar_buf_out << in_split[3] << DTRAIN_GRAMMAR_DELIM << " " << in_split[0] << endl;
- decoder.SetSentenceGrammarFromString(in_split[3]);
+ boost::replace_all(in, "\t", "\n");
+ in += "\n";
+ grammar_buf_out << in << DTRAIN_GRAMMAR_DELIM << " " << in_split[0] << endl;
+ decoder.SetSentenceGrammarFromString(in);
src_str_buf.push_back(in_split[1]);
// decode
observer->SetRef(ref_ids);
diff --git a/dtrain/dtrain.h b/dtrain/dtrain.h
index 34464e3c..e98ef470 100644
--- a/dtrain/dtrain.h
+++ b/dtrain/dtrain.h
@@ -38,6 +38,18 @@ inline string gettmpf(const string path, const string infix, const string suffix
return string(fn);
}
+inline void split_in(string& s, vector<string>& parts) // Appends the first three tab-delimited fields of s to parts (parts is not cleared), then erases those fields from s.
+{
+ unsigned f = 0; // f: index of the most recently found tab; 0 before any search
+ for(unsigned i = 0; i < 3; i++) { // exactly three fields are extracted
+ unsigned e = f; // e: index of the tab that precedes the current field (0 on the first pass)
+ f = s.find("\t", f+1); // search starts at f+1, so a tab at index 0 is never matched; NOTE(review): find() returns size_t, so npos is narrowed if size_t is wider than unsigned
+ if (e != 0) parts.push_back(s.substr(e+1, f-e-1)); // later fields: the text strictly between tab e and tab f
+ else parts.push_back(s.substr(0, f)); // first field: everything before the first tab found; NOTE(review): an empty first field (s starting with a tab) is not split correctly
+ }
+ s.erase(0, f+1); // removes the three fields and the tab after the third; NOTE(review): assumes all three tabs were found, since a failed find() is not checked
+}
+
inline ostream& _np(ostream& out) { return out << resetiosflags(ios::showpos); } // manipulator: clears ios::showpos on out and returns the stream
inline ostream& _p(ostream& out) { return out << setiosflags(ios::showpos); } // manipulator: sets ios::showpos on out and returns the stream
inline ostream& _p2(ostream& out) { return out << setprecision(2); } // manipulator: sets out's precision to 2 and returns the stream
diff --git a/dtrain/test/example/dtrain.ini b/dtrain/test/example/dtrain.ini
index c560a3a6..09c876d9 100644
--- a/dtrain/test/example/dtrain.ini
+++ b/dtrain/test/example/dtrain.ini
@@ -1,11 +1,13 @@
decoder_config=test/example/cdec.ini
k=100
N=3
-epochs=1000
-input=test/example/nc-1k.gz
+epochs=4
+#input=test/example/nc-1k.gz
+input=test/example/nc-1k-tabs.gz
scorer=stupid_bleu
-output=/tmp/weights.gz
-stop_after=10
+output=VOID
+#/tmp/weights.gz
+stop_after=100
sample_from=kbest
pair_sampling=all
print_weights=Glue WordPenalty LanguageModel LanguageModel_OOV PhraseModel_0 PhraseModel_1 PhraseModel_2 PhraseModel_3 PhraseModel_4 PassThrough