diff options
Diffstat (limited to 'dtrain')
| -rw-r--r-- | dtrain/Makefile.am | 5 | ||||
| -rw-r--r-- | dtrain/README | 1 | ||||
| -rw-r--r-- | dtrain/dtrain.cc | 429 | ||||
| -rw-r--r-- | dtrain/dtrain.h | 69 | ||||
| -rw-r--r-- | dtrain/pairsampling.h | 17 | 
5 files changed, 237 insertions, 284 deletions
diff --git a/dtrain/Makefile.am b/dtrain/Makefile.am index 12084a70..baf6883a 100644 --- a/dtrain/Makefile.am +++ b/dtrain/Makefile.am @@ -1,8 +1,7 @@ -# TODO I'm sure I can leave something out.  bin_PROGRAMS = dtrain  dtrain_SOURCES = dtrain.cc score.cc hgsampler.cc -dtrain_LDADD   = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz -lboost_filesystem -lboost_iostreams +dtrain_LDADD   = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz -AM_CPPFLAGS = -W -Wall -Wno-sign-compare -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval +AM_CPPFLAGS = -W -Wall -Wno-sign-compare -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval -O3 diff --git a/dtrain/README b/dtrain/README index 0cc52acc..137c1b48 100644 --- a/dtrain/README +++ b/dtrain/README @@ -31,6 +31,7 @@ TODO   use separate TEST SET  KNOWN BUGS PROBLEMS + doesn't select best iteration for weigts   if size of candidate < N => 0 score   cdec kbest vs 1best (no -k param), rescoring? => ok(?)   no sparse vector in decoder => ok diff --git a/dtrain/dtrain.cc b/dtrain/dtrain.cc index 01119997..76fdb49c 100644 --- a/dtrain/dtrain.cc +++ b/dtrain/dtrain.cc @@ -1,199 +1,161 @@  #include "dtrain.h" - -/* - * register_and_convert - * - */ -void -register_and_convert(const vector<string>& strs, vector<WordID>& ids) -{ -  vector<string>::const_iterator it; -  for ( it = strs.begin(); it < strs.end(); it++ ) { -    ids.push_back( TD::Convert( *it ) ); -  } -} - - -/* - * init - * - */  bool -init(int argc, char** argv, po::variables_map* cfg) +dtrain_init(int argc, char** argv, po::variables_map* cfg)  { -  po::options_description conff( "Configuration File Options" ); -  size_t k, N, T, stop; -  string s, f; +  po::options_description conff("Configuration File Options");    conff.add_options() -    ( "decoder_config", po::value<string>(),                            "configuration file for cdec" ) -    ( "kbest",          po::value<size_t>(&k)->default_value(DTRAIN_DEFAULT_K),         "k for kbest" ) -    ( "ngrams",         po::value<size_t>(&N)->default_value(DTRAIN_DEFAULT_N),        "N for Ngrams" ) -    ( "filter",         po::value<string>(&f)->default_value("unique"),           "filter kbest list" ) -    ( "epochs",         po::value<size_t>(&T)->default_value(DTRAIN_DEFAULT_T),   "# of iterations T" )  -    ( "input",          po::value<string>(),                                             "input file" ) -    ( "scorer",         po::value<string>(&s)->default_value(DTRAIN_DEFAULT_SCORER), "scoring metric" ) -    ( "output",         po::value<string>(),                                    "output weights file" ) -    ( "stop_after",     po::value<size_t>(&stop)->default_value(0),    "stop after X input sentences" ) -    ( "weights_file",   po::value<string>(),      "input weights file (e.g. from previous iteration)" ) -    ( "wprint",         po::value<string>(),                     "weights to print on each iteration" ) -    ( "noup",           po::value<bool>()->zero_tokens(),                     "do not update weights" ); +    ("decoder_config", po::value<string>(),                       "configuration file for cdec") +    ("kbest",          po::value<size_t>()->default_value(100),                   "k for kbest") +    ("ngrams",         po::value<size_t>()->default_value(3),                    "N for Ngrams") +    ("filter",         po::value<string>()->default_value("unique"),        "filter kbest list") +    ("epochs",         po::value<size_t>()->default_value(2),               "# of iterations T")  +    ("input",          po::value<string>()->default_value("-"),                    "input file") +    ("output",         po::value<string>()->default_value("-"),           "output weights file") +    ("scorer",         po::value<string>()->default_value("stupid_bleu"),      "scoring metric") +    ("stop_after",     po::value<size_t>()->default_value(0),    "stop after X input sentences") +    ("input_weights",  po::value<string>(), "input weights file (e.g. from previous iteration)") +    ("wprint",         po::value<string>(),                "weights to print on each iteration") +    ("hstreaming",     po::value<bool>()->zero_tokens(),         "run in hadoop streaming mode") +    ("noup",           po::value<bool>()->zero_tokens(),                "do not update weights");    po::options_description clo("Command Line Options");    clo.add_options() -    ( "config,c",         po::value<string>(),              "dtrain config file" ) -    ( "quiet,q",          po::value<bool>()->zero_tokens(),           "be quiet" ) -    ( "verbose,v",        po::value<bool>()->zero_tokens(),         "be verbose" ); +    ("config,c",         po::value<string>(),              "dtrain config file") +    ("quiet,q",          po::value<bool>()->zero_tokens(),           "be quiet") +    ("verbose,v",        po::value<bool>()->zero_tokens(),         "be verbose");    po::options_description config_options, cmdline_options;    config_options.add(conff);    cmdline_options.add(clo);    cmdline_options.add(conff); -  po::store( parse_command_line(argc, argv, cmdline_options), *cfg ); -  if ( cfg->count("config") ) { -    ifstream config( (*cfg)["config"].as<string>().c_str() ); -    po::store( po::parse_config_file(config, config_options), *cfg ); +  po::store(parse_command_line(argc, argv, cmdline_options), *cfg); +  if (cfg->count("config")) { +    ifstream config((*cfg)["config"].as<string>().c_str()); +    po::store(po::parse_config_file(config, config_options), *cfg);    }    po::notify(*cfg); -  if ( !cfg->count("decoder_config") || !cfg->count("input") ) {  +  if (!cfg->count("decoder_config")) {       cerr << cmdline_options << endl;      return false;    } -  if ( cfg->count("noup") && cfg->count("decode") ) { -    cerr << "You can't use 'noup' and 'decode' at once." << endl; +  if (cfg->count("hstreaming") && (*cfg)["output"].as<string>() != "-") { +    cerr << "When using 'hstreaming' the 'output' param should be '-'.";      return false;    } -  if ( cfg->count("filter") && (*cfg)["filter"].as<string>() != "unique" -       && (*cfg)["filter"].as<string>() != "no" ) { +  if (cfg->count("filter") && (*cfg)["filter"].as<string>() != "unique" +       && (*cfg)["filter"].as<string>() != "no") {      cerr << "Wrong 'filter' type: '" << (*cfg)["filter"].as<string>() << "'." << endl;    } -  #ifdef DTRAIN_DEBUG        -  if ( !cfg->count("test") ) { -    cerr << cmdline_options << endl; -    return false; -  } -  #endif    return true;  } +#include "filelib.h" -// output formatting -ostream& _nopos( ostream& out ) { return out << resetiosflags( ios::showpos ); } -ostream& _pos( ostream& out ) { return out << setiosflags( ios::showpos ); } -ostream& _prec2( ostream& out ) { return out << setprecision(2); } -ostream& _prec5( ostream& out ) { return out << setprecision(5); } - - - - -/* - * dtrain - * - */  int -main( int argc, char** argv ) +main(int argc, char** argv)  { -  cout << setprecision( 5 ); +  cout << _p5;    // handle most parameters    po::variables_map cfg; -  if ( ! init(argc, argv, &cfg) ) exit(1); // something is wrong  -#ifdef DTRAIN_DEBUG -  if ( cfg.count("test") ) run_tests(); // run tests and exit  -#endif +  if (! dtrain_init(argc, argv, &cfg)) exit(1); // something is wrong     bool quiet = false; -  if ( cfg.count("quiet") ) quiet = true; +  if (cfg.count("quiet")) quiet = true;    bool verbose = false;   -  if ( cfg.count("verbose") ) verbose = true; +  if (cfg.count("verbose")) verbose = true;    bool noup = false; -  if ( cfg.count("noup") ) noup = true; +  if (cfg.count("noup")) noup = true; +  bool hstreaming = false; +  if (cfg.count("hstreaming")) { +    hstreaming = true; +    quiet = true; +  }    const size_t k = cfg["kbest"].as<size_t>();    const size_t N = cfg["ngrams"].as<size_t>();     const size_t T = cfg["epochs"].as<size_t>();    const size_t stop_after = cfg["stop_after"].as<size_t>();    const string filter_type = cfg["filter"].as<string>(); -  if ( !quiet ) { +  if (!quiet) {      cout << endl << "dtrain" << endl << "Parameters:" << endl;      cout << setw(25) << "k " << k << endl;      cout << setw(25) << "N " << N << endl;      cout << setw(25) << "T " << T << endl; -    if ( cfg.count("stop-after") ) +    if (cfg.count("stop-after"))        cout << setw(25) << "stop_after " << stop_after << endl; -    if ( cfg.count("weights") ) +    if (cfg.count("input_weights"))        cout << setw(25) << "weights " << cfg["weights"].as<string>() << endl;      cout << setw(25) << "input " << "'" << cfg["input"].as<string>() << "'" << endl;      cout << setw(25) << "filter " << "'" << filter_type << "'" << endl;    }    vector<string> wprint; -  if ( cfg.count("wprint") ) { -    boost::split( wprint, cfg["wprint"].as<string>(), boost::is_any_of(" ") ); +  if (cfg.count("wprint")) { +    boost::split(wprint, cfg["wprint"].as<string>(), boost::is_any_of(" "));    }    // setup decoder, observer    register_feature_functions();    SetSilent(true); -  ReadFile ini_rf( cfg["decoder_config"].as<string>() ); -  if ( !quiet ) +  ReadFile ini_rf(cfg["decoder_config"].as<string>()); +  if (!quiet)      cout << setw(25) << "cdec cfg " << "'" << cfg["decoder_config"].as<string>() << "'" << endl; -  Decoder decoder( ini_rf.stream() ); -  KBestGetter observer( k, filter_type ); +  Decoder decoder(ini_rf.stream()); +  KBestGetter observer(k, filter_type);    MT19937 rng; -  //KSampler observer( k, &rng ); +  //KSampler observer(k, &rng);    // scoring metric/scorer    string scorer_str = cfg["scorer"].as<string>(); -  double (*scorer)( NgramCounts&, const size_t, const size_t, size_t, vector<float> ); -  if ( scorer_str == "bleu" ) { +  double (*scorer)(NgramCounts&, const size_t, const size_t, size_t, vector<float>); +  if (scorer_str == "bleu") {      scorer = &bleu; -  } else if ( scorer_str == "stupid_bleu" ) { +  } else if (scorer_str == "stupid_bleu") {      scorer = &stupid_bleu; -  } else if ( scorer_str == "smooth_bleu" ) { +  } else if (scorer_str == "smooth_bleu") {      scorer = &smooth_bleu; -  } else if ( scorer_str == "approx_bleu" ) { +  } else if (scorer_str == "approx_bleu") {      scorer = &approx_bleu;    } else {      cerr << "Don't know scoring metric: '" << scorer_str << "', exiting." << endl;      exit(1);    }    // for approx_bleu -  NgramCounts global_counts( N ); // counts for 1 best translations +  NgramCounts global_counts(N); // counts for 1 best translations    size_t global_hyp_len = 0;      // sum hypothesis lengths    size_t global_ref_len = 0;      // sum reference lengths    // this is all BLEU implmentations    vector<float> bleu_weights; // we leave this empty -> 1/N; TODO?  -  if ( !quiet ) cout << setw(26) << "scorer '" << scorer_str << "'" << endl << endl; +  if (!quiet) cout << setw(26) << "scorer '" << scorer_str << "'" << endl << endl;    // init weights    Weights weights; -  if ( cfg.count("weights") ) weights.InitFromFile( cfg["weights"].as<string>() ); +  if (cfg.count("weights")) weights.InitFromFile(cfg["weights"].as<string>());    SparseVector<double> lambdas; -  weights.InitSparseVector( &lambdas ); +  weights.InitSparseVector(&lambdas);    vector<double> dense_weights;    // input -  if ( !quiet && !verbose ) +  if (!quiet && !verbose)      cout << "(a dot represents " << DTRAIN_DOTS << " lines of input)" << endl;    string input_fn = cfg["input"].as<string>();    ifstream input; -  if ( input_fn != "-" ) input.open( input_fn.c_str() ); +  if (input_fn != "-") input.open(input_fn.c_str());    string in;    vector<string> in_split; // input: src\tref\tpsg    vector<string> ref_tok;  // tokenized reference    vector<WordID> ref_ids;  // reference as vector of WordID -  string grammar_str;    // buffer input for t > 0    vector<string> src_str_buf;           // source strings, TODO? memory    vector<vector<WordID> > ref_ids_buf;  // references as WordID vecs -  filtering_ostream grammar_buf;        // written to compressed file in /tmp    // this is for writing the grammar buffer file -  grammar_buf.push( gzip_compressor() ); -  char grammar_buf_tmp_fn[] = DTRAIN_TMP_DIR"/dtrain-grammars-XXXXXX"; -  mkstemp( grammar_buf_tmp_fn ); -  grammar_buf.push( file_sink(grammar_buf_tmp_fn, ios::binary | ios::trunc) ); +  char grammar_buf_fn[] = DTRAIN_TMP_DIR"/dtrain-grammars-XXXXXX"; +  mkstemp(grammar_buf_fn); +  ogzstream grammar_buf_out; +  grammar_buf_out.open(grammar_buf_fn);    size_t sid = 0, in_sz = 99999999; // sentence id, input size    double acc_1best_score = 0., acc_1best_model = 0.; @@ -208,23 +170,21 @@ main( int argc, char** argv )    // for the perceptron/SVM; TODO as params    double eta = 0.0005;    double gamma = 0.;//01; // -> SVM -  lambdas.add_value( FD::Convert("__bias"), 0 ); +  lambdas.add_value(FD::Convert("__bias"), 0);    // for random sampling -  srand ( time(NULL) ); +  srand (time(NULL)); -  for ( size_t t = 0; t < T; t++ ) // T epochs +  for (size_t t = 0; t < T; t++) // T epochs    {    time_t start, end;   -  time( &start ); +  time(&start);    // actually, we need only need this if t > 0 FIXME -  ifstream grammar_file( grammar_buf_tmp_fn, ios_base::in | ios_base::binary ); -  filtering_istream grammar_buf_in; -  grammar_buf_in.push( gzip_decompressor() ); -  grammar_buf_in.push( grammar_file ); +  igzstream grammar_buf_in; +  if (t > 0) grammar_buf_in.open(grammar_buf_fn);    // reset average scores    acc_1best_score = acc_1best_model = 0.; @@ -232,43 +192,43 @@ main( int argc, char** argv )    // reset sentence counter    sid = 0; -  if ( !quiet ) cout << "Iteration #" << t+1 << " of " << T << "." << endl; +  if (!quiet) cout << "Iteration #" << t+1 << " of " << T << "." << endl; -  while( true ) +  while(true)    {      // get input from stdin or file      in.clear();      next = stop = false; // next iteration, premature stop -    if ( t == 0 ) {     -      if ( input_fn == "-" ) { -        if ( !getline(cin, in) ) next = true; +    if (t == 0) {     +      if (input_fn == "-") { +        if (!getline(cin, in)) next = true;        } else { -        if ( !getline(input, in) ) next = true;  +        if (!getline(input, in)) next = true;         }      } else { -      if ( sid == in_sz ) next = true; // stop if we reach the end of our input +      if (sid == in_sz) next = true; // stop if we reach the end of our input      }      // stop after X sentences (but still iterate for those) -    if ( stop_after > 0 && stop_after == sid && !next ) stop = true; +    if (stop_after > 0 && stop_after == sid && !next) stop = true;      // produce some pretty output -    if ( !quiet && !verbose ) { -        if ( sid == 0 ) cout << " "; -        if ( (sid+1) % (DTRAIN_DOTS) == 0 ) { +    if (!quiet && !verbose) { +        if (sid == 0) cout << " "; +        if ((sid+1) % (DTRAIN_DOTS) == 0) {              cout << ".";              cout.flush();          } -        if ( (sid+1) % (20*DTRAIN_DOTS) == 0) { +        if ((sid+1) % (20*DTRAIN_DOTS) == 0) {              cout << " " << sid+1 << endl; -            if ( !next && !stop ) cout << " "; +            if (!next && !stop) cout << " ";          } -        if ( stop ) { -          if ( sid % (20*DTRAIN_DOTS) != 0 ) cout << " " << sid << endl; +        if (stop) { +          if (sid % (20*DTRAIN_DOTS) != 0) cout << " " << sid << endl;            cout << "Stopping after " << stop_after << " input sentences." << endl;          } else { -          if ( next ) { -            if ( sid % (20*DTRAIN_DOTS) != 0 ) { +          if (next) { +            if (sid % (20*DTRAIN_DOTS) != 0) {                cout << " " << sid << endl;              }            } @@ -276,68 +236,65 @@ main( int argc, char** argv )      }      // next iteration -    if ( next || stop ) break; +    if (next || stop) break;      // weights      dense_weights.clear(); -    weights.InitFromVector( lambdas ); -    weights.InitVector( &dense_weights ); -    decoder.SetWeights( dense_weights ); +    weights.InitFromVector(lambdas); +    weights.InitVector(&dense_weights); +    decoder.SetWeights(dense_weights); -    if ( t == 0 ) { +    if (t == 0) {        // handling input        in_split.clear(); -      boost::split( in_split, in, boost::is_any_of("\t") ); // in_split[0] is id +      strsplit(in, in_split, '\t', 4);        // getting reference        ref_tok.clear(); ref_ids.clear(); -      boost::split( ref_tok, in_split[2], boost::is_any_of(" ") ); -      register_and_convert( ref_tok, ref_ids ); -      ref_ids_buf.push_back( ref_ids ); +      strsplit(in_split[2], ref_tok, ' '); +      register_and_convert(ref_tok, ref_ids); +      ref_ids_buf.push_back(ref_ids);        // process and set grammar        bool broken_grammar = true; -      for ( string::iterator ti = in_split[3].begin(); ti != in_split[3].end(); ti++ ) { -        if ( !isspace(*ti) ) { +      for (string::iterator ti = in_split[3].begin(); ti != in_split[3].end(); ti++) { +        if (!isspace(*ti)) {            broken_grammar = false;            break;          }        } -      if ( broken_grammar ) continue; -      grammar_str = boost::replace_all_copy( in_split[3], " __NEXT__RULE__ ", "\n" ) + "\n"; // FIXME copy, __ -      grammar_buf << grammar_str << DTRAIN_GRAMMAR_DELIM << " " << in_split[0] << endl; -      decoder.SetSentenceGrammarFromString( grammar_str ); -      // decode, kbest -      src_str_buf.push_back( in_split[1] ); -      decoder.Decode( in_split[1], &observer ); +      if (broken_grammar) continue; +      boost::replace_all(in_split[3], " __NEXT__RULE__ ", "\n"); +      in_split[3] += "\n"; +      grammar_buf_out << in_split[3] << DTRAIN_GRAMMAR_DELIM << " " << in_split[0] << endl; +      decoder.SetSentenceGrammarFromString(in_split[3]); +      // decode +      src_str_buf.push_back(in_split[1]); +      decoder.Decode(in_split[1], &observer);      } else {        // get buffered grammar -      grammar_str.clear(); -      int i = 1; -      while ( true ) { -        string g;   -        getline( grammar_buf_in, g ); -        //if ( g == DTRAIN_GRAMMAR_DELIM ) break; -        if (boost::starts_with(g, DTRAIN_GRAMMAR_DELIM)) break; -        grammar_str += g+"\n"; -        i += 1; +      string grammar_str; +      while (true) { +        string rule;   +        getline(grammar_buf_in, rule); +        if (boost::starts_with(rule, DTRAIN_GRAMMAR_DELIM)) break; +        grammar_str += rule + "\n";        } -      decoder.SetSentenceGrammarFromString( grammar_str ); -      // decode, kbest -      decoder.Decode( src_str_buf[sid], &observer ); +      decoder.SetSentenceGrammarFromString(grammar_str); +      // decode +      decoder.Decode(src_str_buf[sid], &observer);      }      // get kbest list      KBestList* kb; -    //if ( ) { // TODO get from forest +    //if () { // TODO get from forest        kb = observer.GetKBest();      //} -    // scoring kbest -    if ( t > 0 ) ref_ids = ref_ids_buf[sid]; -    for ( size_t i = 0; i < kb->GetSize(); i++ ) { -      NgramCounts counts = make_ngram_counts( ref_ids, kb->sents[i], N ); -      // this is for approx bleu -      if ( scorer_str == "approx_bleu" ) { -        if ( i == 0 ) { // 'context of 1best translations' +    // (local) scoring +    if (t > 0) ref_ids = ref_ids_buf[sid]; +    for (size_t i = 0; i < kb->GetSize(); i++) { +      NgramCounts counts = make_ngram_counts(ref_ids, kb->sents[i], N); +      if (scorer_str == "approx_bleu") { +        if (i == 0) { // 'context of 1best translations'            global_counts  += counts;            global_hyp_len += kb->sents[i].size();            global_ref_len += ref_ids.size(); @@ -347,59 +304,61 @@ main( int argc, char** argv )              cand_len = kb->sents[i].size();          }          NgramCounts counts_tmp = global_counts + counts; -        score = .9*scorer( counts_tmp, +        score = .9*scorer(counts_tmp,                          global_ref_len, -                        global_hyp_len + cand_len, N, bleu_weights ); +                        global_hyp_len + cand_len, N, bleu_weights);        } else { -        // other scorers          cand_len = kb->sents[i].size(); -        score = scorer( counts, +        score = scorer(counts,                          ref_ids.size(), -                        kb->sents[i].size(), N, bleu_weights ); +                        kb->sents[i].size(), N, bleu_weights);        } -      kb->scores.push_back( score ); +      kb->scores.push_back(score); -      if ( i == 0 ) { +      if (i == 0) {          acc_1best_score += score;          acc_1best_model += kb->model_scores[i];        } -      if ( verbose ) { -        if ( i == 0 ) cout << "'" << TD::GetString( ref_ids ) << "' [ref]" << endl; -        cout << _prec5 << _nopos << "[hyp " << i << "] " << "'" << TD::GetString( kb->sents[i] ) << "'"; +      if (verbose) { +        if (i == 0) cout << "'" << TD::GetString(ref_ids) << "' [ref]" << endl; +        cout << _p5 << _np << "[hyp " << i << "] " << "'" << TD::GetString(kb->sents[i]) << "'";          cout << " [SCORE=" << score << ",model="<< kb->model_scores[i] << "]" << endl; -        cout << kb->feats[i] << endl; // this is maybe too verbose +        //cout << kb->feats[i] << endl; // too verbose        }      } // Nbest loop -    if ( verbose ) cout << endl; - +    if (verbose) cout << endl; +//////////////////////////////////////////////////////////      // UPDATE WEIGHTS -    if ( !noup ) { +    if (!noup) { + +      int up = 0;        TrainingInstances pairs;        sample_all_pairs(kb, pairs); -      //sample_rand_pairs( kb, pairs, &rng ); +      //sample_rand_pairs(kb, pairs, &rng); -      for ( TrainingInstances::iterator ti = pairs.begin(); -            ti != pairs.end(); ti++ ) { +      for (TrainingInstances::iterator ti = pairs.begin(); +            ti != pairs.end(); ti++) {          SparseVector<double> dv; -        if ( ti->first_score - ti->second_score < 0 ) { +        if (ti->first_score - ti->second_score < 0) { +            up++;            dv = ti->second - ti->first;        //} else {          //dv = ti->first - ti->second;        //} -          dv.add_value( FD::Convert("__bias"), -1 ); +          dv.add_value(FD::Convert("__bias"), -1);            //SparseVector<double> reg; -          //reg = lambdas * ( 2 * gamma ); +          //reg = lambdas * (2 * gamma);            //dv -= reg;            lambdas += dv * eta; -          if ( verbose ) { +          if (verbose) {              cout << "{{ f("<< ti->first_rank <<") > f(" << ti->second_rank << ") but g(i)="<< ti->first_score <<" < g(j)="<< ti->second_score << " so update" << endl;              cout << " i  " << TD::GetString(kb->sents[ti->first_rank]) << endl;              cout << "    " << kb->feats[ti->first_rank] << endl; @@ -411,99 +370,99 @@ main( int argc, char** argv )            }          } else {            //SparseVector<double> reg; -          //reg = lambdas * ( 2 * gamma ); -          //lambdas += reg * ( -eta ); +          //reg = lambdas * (2 * gamma); +          //lambdas += reg * (-eta);          }        }        //double l2 = lambdas.l2norm(); -      //if ( l2 ) lambdas /= lambdas.l2norm(); - +      //if (l2) lambdas /= lambdas.l2norm(); +      //cout << up << endl;      } +//////////////////////////////////////////////////////////      ++sid; -    //cerr << "reporter:counter:dtrain,sent," << sid << endl; + +    if (hstreaming) cerr << "reporter:counter:dtrain,sid," << sid << endl;    } // input loop -  if ( t == 0 ) in_sz = sid; // remember size (lines) of input +  if (t == 0) { +    in_sz = sid; // remember size (lines) of input +    grammar_buf_out.close(); +    if (input_fn != "-") input.close(); +  } else { +    grammar_buf_in.close(); +  }    // print some stats    double avg_1best_score = acc_1best_score/(double)in_sz;    double avg_1best_model = acc_1best_model/(double)in_sz;    double avg_1best_score_diff, avg_1best_model_diff; -  if ( t > 0 ) { +  if (t > 0) {      avg_1best_score_diff = avg_1best_score - scores_per_iter[t-1][0];      avg_1best_model_diff = avg_1best_model - scores_per_iter[t-1][1];    } else {      avg_1best_score_diff = avg_1best_score;      avg_1best_model_diff = avg_1best_model;    } -  if ( !quiet ) { -  cout << _prec5 << _pos << "WEIGHTS" << endl; +  if (!quiet) { +  cout << _p5 << _p << "WEIGHTS" << endl;    for (vector<string>::iterator it = wprint.begin(); it != wprint.end(); it++) { -    cout << setw(16) << *it << " = " << dense_weights[FD::Convert( *it )] << endl; +    cout << setw(16) << *it << " = " << dense_weights[FD::Convert(*it)] << endl;    } -    cout << "        ---" << endl; -  cout << _nopos << "      avg score: " << avg_1best_score; -  cout << _pos << " (" << avg_1best_score_diff << ")" << endl; -  cout << _nopos << "avg model score: " << avg_1best_model; -  cout << _pos << " (" << avg_1best_model_diff << ")" << endl; +  cout << _np << "      avg score: " << avg_1best_score; +  cout << _p << " (" << avg_1best_score_diff << ")" << endl; +  cout << _np << "avg model score: " << avg_1best_model; +  cout << _p << " (" << avg_1best_model_diff << ")" << endl;    }    vector<double> remember_scores; -  remember_scores.push_back( avg_1best_score ); -  remember_scores.push_back( avg_1best_model ); -  scores_per_iter.push_back( remember_scores ); -  if ( avg_1best_score > max_score ) { +  remember_scores.push_back(avg_1best_score); +  remember_scores.push_back(avg_1best_model); +  scores_per_iter.push_back(remember_scores); +  if (avg_1best_score > max_score) {      max_score = avg_1best_score;      best_t = t;    } - -  // close open files -  if ( input_fn != "-" ) input.close(); -  close( grammar_buf ); -  grammar_file.close(); - -  time ( &end ); -  double time_dif = difftime( end, start ); +  time (&end); +  double time_dif = difftime(end, start);    overall_time += time_dif; -  if ( !quiet ) { -    cout << _prec2 << _nopos << "(time " << time_dif/60. << " min, "; +  if (!quiet) { +    cout << _p2 << _np << "(time " << time_dif/60. << " min, ";      cout << time_dif/(double)in_sz<< " s/S)" << endl;    } -  if ( t+1 != T && !quiet ) cout << endl; +  if (t+1 != T && !quiet) cout << endl; -  if ( noup ) break; +  if (noup) break;    } // outer loop -  unlink( grammar_buf_tmp_fn ); -  if ( !noup ) { -    // TODO BEST ITER -    if ( !quiet ) cout << endl << "writing weights file '" << cfg["output"].as<string>() << "' ..."; -    if ( cfg["output"].as<string>() == "-" ) { -      for ( SparseVector<double>::const_iterator ti = lambdas.begin(); -            ti != lambdas.end(); ++ti ) { -	if ( ti->second == 0 ) continue; -        //if ( ti->first == "__bias" ) continue; -        cout << setprecision(9); -        cout << _nopos << FD::Convert(ti->first) << "\t" << ti->second << endl; -        //cout << "__SHARD_COUNT__\t1" << endl; +  //unlink(grammar_buf_fn); + +  if (!noup) { +    if (!quiet) cout << endl << "writing weights file '" << cfg["output"].as<string>() << "' ..."; +    if (cfg["output"].as<string>() == "-") { +      for (SparseVector<double>::const_iterator ti = lambdas.begin(); +            ti != lambdas.end(); ++ti) { +	if (ti->second == 0) continue; +        cout << _p9; +        cout << _np << FD::Convert(ti->first) << "\t" << ti->second << endl;        } +      if (hstreaming) cout << "__SHARD_COUNT__\t1" << endl;      } else { -      weights.InitFromVector( lambdas ); -      weights.WriteToFile( cfg["output"].as<string>(), true ); +      weights.InitFromVector(lambdas); +      weights.WriteToFile(cfg["output"].as<string>(), true);      } -    if ( !quiet ) cout << "done" << endl; +    if (!quiet) cout << "done" << endl;    } -  if ( !quiet ) { -    cout << _prec5 << _nopos << endl << "---" << endl << "Best iteration: "; +  if (!quiet) { +    cout << _p5 << _np << endl << "---" << endl << "Best iteration: ";      cout << best_t+1 << " [SCORE '" << scorer_str << "'=" << max_score << "]." << endl; -    cout << _prec2 << "This took " << overall_time/60. << " min." << endl; +    cout << _p2 << "This took " << overall_time/60. << " min." << endl;    }    return 0; diff --git a/dtrain/dtrain.h b/dtrain/dtrain.h index 3d319233..9bc5be93 100644 --- a/dtrain/dtrain.h +++ b/dtrain/dtrain.h @@ -2,59 +2,56 @@  #define _DTRAIN_COMMON_H_ -#include <sstream> -#include <iostream> -#include <vector> -#include <cassert> -#include <cmath>  #include <iomanip> -// cdec includes -#include "sentence_metadata.h" +#include <boost/algorithm/string.hpp> +#include <boost/program_options.hpp> +  #include "verbose.h"  #include "viterbi.h" -#include "kbest.h"  #include "ff_register.h"  #include "decoder.h"  #include "weights.h" -// boost includes -#include <boost/algorithm/string.hpp> -#include <boost/program_options.hpp> - -// own headers  #include "score.h" - -#define DTRAIN_DEFAULT_K 100                // k for kbest lists -#define DTRAIN_DEFAULT_N 4                  // N for ngrams (e.g. BLEU) -#define DTRAIN_DEFAULT_T 1                  // iterations -#define DTRAIN_DEFAULT_SCORER "stupid_bleu" // scorer -#define DTRAIN_DOTS 100                     // when to display a '.' -#define DTRAIN_TMP_DIR "/tmp"               // put this on a SSD? -#define DTRAIN_GRAMMAR_DELIM "########EOS########" - -  #include "kbestget.h" -#include "pairsampling.h" -  #include "ksampler.h" +#include "pairsampling.h" -// boost compression -#include <boost/iostreams/device/file.hpp>  -#include <boost/iostreams/filtering_stream.hpp> -#include <boost/iostreams/filter/gzip.hpp> -//#include <boost/iostreams/filter/zlib.hpp> -//#include <boost/iostreams/filter/bzip2.hpp> -using namespace boost::iostreams; - -#include <boost/algorithm/string/predicate.hpp> -#include <boost/lexical_cast.hpp> - +#define DTRAIN_DOTS 100                     // when to display a '.' +#define DTRAIN_TMP_DIR "/var/hadoop/mapred/local"               // put this on a SSD? +#define DTRAIN_GRAMMAR_DELIM "########EOS########"  using namespace std;  using namespace dtrain;  namespace po = boost::program_options; +inline void register_and_convert(const vector<string>& strs, vector<WordID>& ids) { +  vector<string>::const_iterator it; +  for (it = strs.begin(); it < strs.end(); it++) +    ids.push_back(TD::Convert(*it)); +} +inline ostream& _np(ostream& out) { return out << resetiosflags(ios::showpos); } +inline ostream& _p(ostream& out)  { return out << setiosflags(ios::showpos); } +inline ostream& _p2(ostream& out) { return out << setprecision(2); } +inline ostream& _p5(ostream& out) { return out << setprecision(5); } +inline ostream& _p9(ostream& out) { return out << setprecision(9); } +inline void strsplit(string &s, vector<string>& v, char d = '\t', size_t parts = 0) {  +  stringstream ss(s); +  string t; +  size_t c = 0; +  while(true) +  { +    if (parts > 0 && c == parts-1) { +      getline(ss, t); +      v.push_back(t); +      break; +    } +    if (!getline(ss, t, d)) break; +    v.push_back(t); +    c++; +  } +}  #endif diff --git a/dtrain/pairsampling.h b/dtrain/pairsampling.h index 9774ba4a..e06036ca 100644 --- a/dtrain/pairsampling.h +++ b/dtrain/pairsampling.h @@ -2,7 +2,7 @@  #define _DTRAIN_PAIRSAMPLING_H_  #include "kbestget.h" -#include "sampler.h" // cdec MT19937 +#include "sampler.h" // cdec, MT19937  namespace dtrain  { @@ -17,7 +17,7 @@ struct TPair  typedef vector<TPair> TrainingInstances; -void +inline void  sample_all_pairs(KBestList* kb, TrainingInstances &training)  {    for (size_t i = 0; i < kb->GetSize()-1; i++) { @@ -30,14 +30,13 @@ sample_all_pairs(KBestList* kb, TrainingInstances &training)        p.first_score = kb->scores[i];        p.second_score = kb->scores[j];        training.push_back(p); -    } -  } +    } // j +  } // i  } -void +inline void  sample_rand_pairs(KBestList* kb, TrainingInstances &training, MT19937* prng)  { -  srand(time(NULL));    for (size_t i = 0; i < kb->GetSize()-1; i++) {      for (size_t j = i+1; j < kb->GetSize(); j++) {        if (prng->next() < .5) { @@ -50,14 +49,12 @@ sample_rand_pairs(KBestList* kb, TrainingInstances &training, MT19937* prng)          p.second_score = kb->scores[j];          training.push_back(p);        } -    } -  } -  cout << training.size() << " sampled" << endl; +    } // j +  } // i  }  } // namespace -  #endif  | 
