1 files changed, 86 insertions, 45 deletions
diff --git a/dtrain/dtrain.cc b/dtrain/dtrain.cc
index a141a576..30ced234 100644
--- a/dtrain/dtrain.cc
+++ b/dtrain/dtrain.cc
@@ -25,18 +25,20 @@ init(int argc, char** argv, po::variables_map* cfg)
 {
   po::options_description conff( "Configuration File Options" );
   size_t k, N, T, stop;
-  string s;
+  string s, f;
   conff.add_options()
     ( "decoder_config", po::value<string>(),                            "configuration file for cdec" )
     ( "kbest",          po::value<size_t>(&k)->default_value(DTRAIN_DEFAULT_K),         "k for kbest" )
-    ( "ngrams",         po::value<size_t>(&N)->default_value(DTRAIN_DEFAULT_N),        "n for Ngrams" )
-    ( "filter",         po::value<string>(),                                      "filter kbest list" ) // FIXME
+    ( "ngrams",         po::value<size_t>(&N)->default_value(DTRAIN_DEFAULT_N),        "N for Ngrams" )
+    ( "filter",         po::value<string>(&f)->default_value("unique"),           "filter kbest list" )
     ( "epochs",         po::value<size_t>(&T)->default_value(DTRAIN_DEFAULT_T),   "# of iterations T" ) 
     ( "input",          po::value<string>(),                                             "input file" )
     ( "scorer",         po::value<string>(&s)->default_value(DTRAIN_DEFAULT_SCORER), "scoring metric" )
     ( "output",         po::value<string>(),                                    "output weights file" )
     ( "stop_after",     po::value<size_t>(&stop)->default_value(0),    "stop after X input sentences" )
-    ( "weights_file",   po::value<string>(),       "input weights file (e.g. from previous iteration" );
+    ( "weights_file",   po::value<string>(),      "input weights file (e.g. from previous iteration)" )
+    ( "wprint",         po::value<string>(),                     "weights to print on each iteration" )
+    ( "noup",           po::value<bool>()->zero_tokens(),                     "do not update weights" );
 
   po::options_description clo("Command Line Options");
   clo.add_options()
@@ -65,6 +67,14 @@ init(int argc, char** argv, po::variables_map* cfg)
     cerr << cmdline_options << endl;
     return false;
   }
+  if ( cfg->count("noup") && cfg->count("decode") ) {
+    cerr << "You can't use 'noup' and 'decode' at once." << endl;
+    return false;
+  }
+  if ( cfg->count("filter") && (*cfg)["filter"].as<string>() != "unique"
+       && (*cfg)["filter"].as<string>() != "no" ) {
+    cerr << "Wrong 'filter' type: '" << (*cfg)["filter"].as<string>() << "'." << endl;
+  }
   #ifdef DTRAIN_DEBUG       
   if ( !cfg->count("test") ) {
     cerr << cmdline_options << endl;
@@ -98,20 +108,29 @@ main(int argc, char** argv)
   if ( cfg.count("quiet") ) quiet = true;
   bool verbose = false;  
   if ( cfg.count("verbose") ) verbose = true;
+  bool noup = false;
+  if ( cfg.count("noup") ) noup = true;
   const size_t k = cfg["kbest"].as<size_t>();
   const size_t N = cfg["ngrams"].as<size_t>(); 
   const size_t T = cfg["epochs"].as<size_t>();
   const size_t stop_after = cfg["stop_after"].as<size_t>();
+  const string filter_type = cfg["filter"].as<string>();
   if ( !quiet ) {
     cout << endl << "dtrain" << endl << "Parameters:" << endl;
-    cout << setw(16) << "k " << k << endl;
-    cout << setw(16) << "N " << N << endl;
-    cout << setw(16) << "T " << T << endl;
+    cout << setw(25) << "k " << k << endl;
+    cout << setw(25) << "N " << N << endl;
+    cout << setw(25) << "T " << T << endl;
     if ( cfg.count("stop-after") )
-      cout << setw(16) << "stop_after " << stop_after << endl;
+      cout << setw(25) << "stop_after " << stop_after << endl;
     if ( cfg.count("weights") )
-      cout << setw(16) << "weights " << cfg["weights"].as<string>() << endl;
-    cout << setw(16) << "input " << "'" << cfg["input"].as<string>() << "'" << endl;
+      cout << setw(25) << "weights " << cfg["weights"].as<string>() << endl;
+    cout << setw(25) << "input " << "'" << cfg["input"].as<string>() << "'" << endl;
+    cout << setw(25) << "filter " << "'" << filter_type << "'" << endl;
+  }
+
+  vector<string> wprint;
+  if ( cfg.count("wprint") ) {
+    boost::split( wprint, cfg["wprint"].as<string>(), boost::is_any_of(" ") );
   }
 
   // setup decoder, observer
@@ -119,9 +138,9 @@ main(int argc, char** argv)
   SetSilent(true);
   ReadFile ini_rf( cfg["decoder_config"].as<string>() );
   if ( !quiet )
-    cout << setw(16) << "cdec cfg " << "'" << cfg["decoder_config"].as<string>() << "'" << endl;
-  Decoder decoder(ini_rf.stream());
-  KBestGetter observer( k );
+    cout << setw(25) << "cdec cfg " << "'" << cfg["decoder_config"].as<string>() << "'" << endl;
+  Decoder decoder( ini_rf.stream() );
+  KBestGetter observer( k, filter_type );
 
   // scoring metric/scorer
   string scorer_str = cfg["scorer"].as<string>();
@@ -144,13 +163,13 @@ main(int argc, char** argv)
   size_t global_ref_len = 0;      // sum reference lengths
   // this is all BLEU implmentations
   vector<float> bleu_weights; // we leave this empty -> 1/N; TODO? 
-  if ( !quiet ) cout << setw(16) << "scorer '" << scorer_str << "'" << endl << endl;
+  if ( !quiet ) cout << setw(26) << "scorer '" << scorer_str << "'" << endl << endl;
 
   // init weights
   Weights weights;
   if ( cfg.count("weights") ) weights.InitFromFile( cfg["weights"].as<string>() );
   SparseVector<double> lambdas;
-  weights.InitSparseVector(&lambdas);
+  weights.InitSparseVector( &lambdas );
   vector<double> dense_weights;
 
   // input
@@ -203,12 +222,14 @@ main(int argc, char** argv)
 
   // reset average scores
   acc_1best_score = acc_1best_model = 0.;
-
-  sid = 0;                   // reset sentence counter
+  
+  // reset sentence counter
+  sid = 0;
   
   if ( !quiet ) cout << "Iteration #" << t+1 << " of " << T << "." << endl;
   
-  while( true ) {
+  while( true )
+  {
 
     // get input from stdin or file
     in.clear();
@@ -262,26 +283,32 @@ main(int argc, char** argv)
         // handling input
         in_split.clear();
         boost::split( in_split, in, boost::is_any_of("\t") );
+        // in_split[0] is id
         // getting reference
         ref_tok.clear(); ref_ids.clear();
-        boost::split( ref_tok, in_split[1], boost::is_any_of(" ") );
+        boost::split( ref_tok, in_split[2], boost::is_any_of(" ") );
         register_and_convert( ref_tok, ref_ids );
         ref_ids_buf.push_back( ref_ids );
         // process and set grammar
-        grammar_buf << in_split[2] << endl;
-        grammar_str = boost::replace_all_copy( in_split[2], " __NEXT_RULE__ ", "\n" );
-        grammar_str += "\n";
+        //grammar_buf << in_split[3] << endl;
+        grammar_str = boost::replace_all_copy( in_split[3], " __NEXT__RULE__ ", "\n" ) + "\n";
+        grammar_buf << grammar_str << DTRAIN_GRAMMAR_DELIM << endl;
         decoder.SetSentenceGrammarFromString( grammar_str );
         // decode, kbest
-        src_str_buf.push_back( in_split[0] );
-        decoder.Decode( in_split[0], &observer );
+        src_str_buf.push_back( in_split[1] );
+        decoder.Decode( in_split[1], &observer );
         break;
       default:
         // get buffered grammar
-        string g;
-        getline(grammar_buf_in, g);
-        grammar_str = boost::replace_all_copy( g, " __NEXT_RULE__ ", "\n" );
-        grammar_str += "\n";
+        grammar_str.clear();
+        int i = 1;
+        while ( true ) {
+          string g;  
+          getline( grammar_buf_in, g );
+          if ( g == DTRAIN_GRAMMAR_DELIM ) break;
+          grammar_str += g+"\n";
+          i += 1;
+        }
         decoder.SetSentenceGrammarFromString( grammar_str );
         // decode, kbest
         decoder.Decode( src_str_buf[sid], &observer );
@@ -308,9 +335,9 @@ main(int argc, char** argv)
             cand_len = kb->sents[i].size();
         }
         NgramCounts counts_tmp = global_counts + counts;
-        score = scorer( counts_tmp,
-                        global_ref_len,
-                        global_hyp_len + cand_len, N, bleu_weights );
+        score = 0.9 * scorer( counts_tmp,
+                              global_ref_len,
+                              global_hyp_len + cand_len, N, bleu_weights );
       } else {
         // other scorers
         cand_len = kb->sents[i].size();
@@ -332,15 +359,17 @@ main(int argc, char** argv)
         cout << "k=" << i+1 << " '" << TD::GetString( ref_ids ) << "'[ref] vs '";
         cout << _prec5 << _nopos << TD::GetString( kb->sents[i] ) << "'[hyp]";
         cout << " [SCORE=" << score << ",model="<< kb->scores[i] << "]" << endl;
-        //cout << kb->feats[i] << endl; this is maybe too verbose
+        //cout << kb->feats[i] << endl; // this is maybe too verbose
       }
     } // Nbest loop
     if ( verbose ) cout << endl;
 
-    // update weights; FIXME others
-    SofiaUpdater updater;
-    updater.Init( sid, kb->feats, scores );
-    updater.Update( lambdas );
+    // update weights; TODO other updaters
+    if ( !noup ) {
+      SofiaUpdater updater;
+      updater.Init( sid, kb->feats, scores );
+      updater.Update( lambdas );
+    }
 
     ++sid;
 
@@ -359,12 +388,15 @@ main(int argc, char** argv)
     avg_1best_score_diff = avg_1best_score;
     avg_1best_model_diff = avg_1best_model;
   }
-  cout << _prec5 << _nopos << "(sanity weights Glue=" << dense_weights[FD::Convert( "Glue" )];
-  cout << " LexEF=" << dense_weights[FD::Convert( "LexEF" )];
-  cout << " LexFE=" << dense_weights[FD::Convert( "LexFE" )] << ")" << endl;
-  cout << "     avg score: " << avg_1best_score;
+  cout << _prec5 << _pos << "WEIGHTS" << endl;
+  for (vector<string>::iterator it = wprint.begin(); it != wprint.end(); it++) {
+    cout << setw(16) << *it << " = " << dense_weights[FD::Convert( *it )] << endl;
+  }
+
+  cout << "        ---" << endl;
+  cout << _nopos << "      avg score: " << avg_1best_score;
   cout << _pos << " (" << avg_1best_score_diff << ")" << endl;
-  cout << _nopos << "avg modelscore: " << avg_1best_model;
+  cout << _nopos << "avg model score: " << avg_1best_model;
   cout << _pos << " (" << avg_1best_model_diff << ")" << endl;
   vector<double> remember_scores;
   remember_scores.push_back( avg_1best_score );
@@ -390,12 +422,16 @@ main(int argc, char** argv)
   
   if ( t+1 != T ) cout << endl;
 
+  if ( noup ) break;
+
   } // outer loop
 
-  unlink( grammar_buf_tmp_fn );
-  if ( !quiet ) cout << endl << "writing weights file '" << cfg["output"].as<string>() << "' ...";
-  weights.WriteToFile( cfg["output"].as<string>(), true );
-  if ( !quiet ) cout << "done" << endl;
+  //unlink( grammar_buf_tmp_fn );
+  if ( !noup ) {
+    if ( !quiet ) cout << endl << "writing weights file '" << cfg["output"].as<string>() << "' ...";
+    weights.WriteToFile( cfg["output"].as<string>(), true );
+    if ( !quiet ) cout << "done" << endl;
+  }
   
   if ( !quiet ) {
     cout << _prec5 << _nopos << endl << "---" << endl << "Best iteration: ";
@@ -403,6 +439,11 @@ main(int argc, char** argv)
     cout << _prec2 << "This took " << overall_time/60. << " min." << endl;
   }
 
+  // don't do this with many features...
+  /*for ( size_t i = 0; i < FD::NumFeats(); i++ ) {
+      cout << FD::Convert(i) << " " << dense_weights[i] << endl;
+  }*/
+
   return 0;
 }