diff options
Diffstat (limited to 'dtrain')
| -rw-r--r-- | dtrain/README | 5 | ||||
| -rw-r--r-- | dtrain/dtrain.cc | 12 | ||||
| -rw-r--r-- | dtrain/dtrain.h | 12 | ||||
| -rw-r--r-- | dtrain/test/example/dtrain.ini | 10 | 
4 files changed, 29 insertions, 10 deletions
| diff --git a/dtrain/README b/dtrain/README index 42b91b9b..a6c48fcc 100644 --- a/dtrain/README +++ b/dtrain/README @@ -31,3 +31,8 @@ KNOWN BUGS, PROBLEMS   PhraseModel_* features (0..99 seem to be generated, why 99?)   flex scanner jams on malicious input, we could skip that +FIX +  approx BLEU fix +  merge +  ep data +  r\tr\tr\tr\tr diff --git a/dtrain/dtrain.cc b/dtrain/dtrain.cc index 69f83633..5d84f250 100644 --- a/dtrain/dtrain.cc +++ b/dtrain/dtrain.cc @@ -241,7 +241,7 @@ main(int argc, char** argv)      vector<WordID> ref_ids;  // reference as vector<WordID>      if (t == 0) {        // handling input -      boost::split(in_split, in, boost::is_any_of("\t")); +      split_in(in, in_split);         // getting reference        vector<string> ref_tok;        boost::split(ref_tok, in_split[2], boost::is_any_of(" ")); @@ -249,17 +249,17 @@ main(int argc, char** argv)        ref_ids_buf.push_back(ref_ids);        // process and set grammar        bool broken_grammar = true; -      for (string::iterator it = in_split[3].begin(); it != in_split[3].end(); it++) { +      for (string::iterator it = in.begin(); it != in.end(); it++) {          if (!isspace(*it)) {            broken_grammar = false;            break;          }        }        if (broken_grammar) continue; -      boost::replace_all(in_split[3], " __NEXT__RULE__ ", "\n"); // TODO -      in_split[3] += "\n"; -      grammar_buf_out << in_split[3] << DTRAIN_GRAMMAR_DELIM << " " << in_split[0] << endl; -      decoder.SetSentenceGrammarFromString(in_split[3]); +      boost::replace_all(in, "\t", "\n"); +      in += "\n"; +      grammar_buf_out << in << DTRAIN_GRAMMAR_DELIM << " " << in_split[0] << endl; +      decoder.SetSentenceGrammarFromString(in);        src_str_buf.push_back(in_split[1]);        // decode        observer->SetRef(ref_ids); diff --git a/dtrain/dtrain.h b/dtrain/dtrain.h index 34464e3c..e98ef470 100644 --- a/dtrain/dtrain.h +++ b/dtrain/dtrain.h @@ -38,6 +38,18 @@ inline string gettmpf(const string path, const string infix, const string suffix    return string(fn);  } +inline void split_in(string& s, vector<string>& parts) +{ +  unsigned f = 0; +  for(unsigned i = 0; i < 3; i++) { +    unsigned e = f; +    f = s.find("\t", f+1); +    if (e != 0) parts.push_back(s.substr(e+1, f-e-1)); +    else parts.push_back(s.substr(0, f));  +  } +  s.erase(0, f+1); +} +  inline ostream& _np(ostream& out) { return out << resetiosflags(ios::showpos); }  inline ostream& _p(ostream& out)  { return out << setiosflags(ios::showpos); }  inline ostream& _p2(ostream& out) { return out << setprecision(2); } diff --git a/dtrain/test/example/dtrain.ini b/dtrain/test/example/dtrain.ini index c560a3a6..09c876d9 100644 --- a/dtrain/test/example/dtrain.ini +++ b/dtrain/test/example/dtrain.ini @@ -1,11 +1,13 @@  decoder_config=test/example/cdec.ini  k=100  N=3 -epochs=1000 -input=test/example/nc-1k.gz +epochs=4 +#input=test/example/nc-1k.gz +input=test/example/nc-1k-tabs.gz  scorer=stupid_bleu -output=/tmp/weights.gz -stop_after=10 +output=VOID +#/tmp/weights.gz +stop_after=100  sample_from=kbest  pair_sampling=all  print_weights=Glue WordPenalty LanguageModel LanguageModel_OOV PhraseModel_0 PhraseModel_1 PhraseModel_2 PhraseModel_3 PhraseModel_4 PassThrough | 
