Diffstat (limited to 'dtrain')
-rw-r--r--  dtrain/dcommon.cc  79
-rw-r--r--  dtrain/dcommon.h   71
-rw-r--r--  dtrain/dtest.cc    33
-rw-r--r--  dtrain/dtrain.cc   48
-rw-r--r--  dtrain/learner.h   71
-rwxr-xr-x  dtrain/test.sh      2
6 files changed, 168 insertions, 136 deletions
diff --git a/dtrain/dcommon.cc b/dtrain/dcommon.cc
index a6bdc92c..6657bed6 100644
--- a/dtrain/dcommon.cc
+++ b/dtrain/dcommon.cc
@@ -2,7 +2,11 @@
-/*
+
+/******************************************************************************
+ * NGRAMS
+ *
+ *
  * make_ngrams
  *
  */
@@ -23,9 +27,6 @@ make_ngrams( vector<WordID>& s, size_t N )
 }
-
-
-
 /*
  * ngram_matches
  *
@@ -50,7 +51,12 @@ make_ngram_counts( vector<WordID> hyp, vector<WordID> ref, size_t N )
 }
-/*
+
+
+/******************************************************************************
+ * SCORES
+ *
+ *
  * brevity_penaly
  *
  */
@@ -156,7 +162,12 @@ approx_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len,
 }
-/*
+
+
+/******************************************************************************
+ * UTILS
+ *
+ *
  * register_and_convert
  *
  */
@@ -170,11 +181,39 @@ register_and_convert(const vector<string>& strs, vector<WordID>& ids)
 }
+/*
+ * approx_equal
+ *
+ */
+double
+approx_equal( double x, double y )
+{
+  const double EPSILON = 1E-5;
+  if ( x == 0 ) return fabs( y ) <= EPSILON;
+  if ( y == 0 ) return fabs( x ) <= EPSILON;
+  return fabs( x - y ) / max( fabs(x), fabs(y) ) <= EPSILON;
+}
 /*
+ * print_FD
+ *
+ */
+void
+print_FD()
+{
+  for ( size_t i = 0; i < FD::NumFeats(); i++ ) cout << FD::Convert(i)<< endl;
+}
+
+
+
+
+/******************************************************************************
+ * TESTS
  *
  *
+ * test_ngrams
+ *
  */
 void
 test_ngrams()
@@ -207,21 +246,7 @@ test_ngrams()
 /*
- *
- *
- */
-double
-approx_equal( double x, double y )
-{
-  const double EPSILON = 1E-5;
-  if ( x == 0 ) return fabs( y ) <= EPSILON;
-  if ( y == 0 ) return fabs( x ) <= EPSILON;
-  return fabs( x - y ) / max( fabs(x), fabs(y) ) <= EPSILON;
-}
-
-
-/*
- *
+ * test_metrics
  *
  */
 void
@@ -263,8 +288,9 @@ test_metrics()
   cout << endl;
 }
+
 /*
- *
+ * test_SetWeights
  *
  */
 void
@@ -287,7 +313,7 @@ test_SetWeights()
 /*
- *
+ * run_tests
  *
  */
 void
@@ -302,10 +328,3 @@ run_tests()
   exit(0);
 }
-
-void
-print_FD()
-{
-  for ( size_t i = 0; i < FD::NumFeats(); i++ ) cout << FD::Convert(i)<< endl;
-}
-
diff --git a/dtrain/dcommon.h b/dtrain/dcommon.h
index ff796642..6df841bb 100644
--- a/dtrain/dcommon.h
+++ b/dtrain/dcommon.h
@@ -30,6 +30,8 @@ using namespace std;
 namespace po = boost::program_options;
+
+
 struct ScorePair
 {
   ScorePair(double modelscore, double score) : modelscore_(modelscore), score_(score) {}
@@ -139,72 +141,7 @@ struct NgramCounts
 };
-/*class Learnerx
-{
-  public:
-    virtual void Init(const vector<SparseVector<double> >& kbest, const Scores& scores) {};
-    virtual void Update(SparseVector<double>& lambdas);
-};*/
-
-class SofiaLearner //: public Learnerx FIXME
-{
-  // TODO bool invert_score
-  public:
-  void
-  Init( const size_t sid, const vector<SparseVector<double> >& kbest, /*const*/ Scores& scores )
-  {
-    assert( kbest.size() == scores.size() );
-    ofstream o;
-    unlink( "/tmo/sofia_ml_training" );
-    o.open( "/tmp/sofia_ml_training", ios::trunc ); // TODO randomize, filename exists
-    int fid = 0;
-    map<int,int>::iterator ff;
-    for ( size_t k = 0; k < kbest.size(); ++k ) {
-      SparseVector<double>::const_iterator it = kbest[k].begin();
-      o << scores[k].GetScore();
-      for ( ; it != kbest[k].end(); ++it) {
-        ff = fmap.find( it->first );
-        if ( ff == fmap.end() ) {
-          fmap.insert( pair<int,int>(it->first, fid) );
-          fmap1.insert( pair<int,int>(fid, it->first) );
-          fid++;
-        }
-        o << " "<< fmap[it->first] << ":" << it->second;
-      }
-      o << endl;
-    }
-    o.close();
-  }
-
-  void
-  Update(SparseVector<double>& lambdas)
-  {
-    string call = "./sofia-ml --training_file /tmp/sofia_ml_training --model_out /tmp/sofia_ml_model --loop_type stochastic --lambda 100 --dimensionality ";
-    std::stringstream out;
-    out << fmap.size();
-    call += out.str();
-    call += " &>/dev/null";
-    system ( call.c_str() );
-    ifstream i;
-    unlink( "/tmo/sofia_ml_model" );
-    i.open( "/tmp/sofia_ml_model", ios::in );
-    string model;
-    getline( i, model );
-    //cout << model << endl;
-    vector<string> strs;
-    boost::split( strs, model, boost::is_any_of(" ") );
-    int j = 0;
-    for ( vector<string>::iterator it = strs.begin(); it != strs.end(); ++it ) {
-      lambdas.set_value(fmap1[j], atof( it->c_str() ) );
-      j++;
-    }
-
-  }
-  private:
-    map<int,int> fmap;
-    map<int,int> fmap1;
-};
 typedef map<vector<WordID>, size_t> Ngrams;
 Ngrams make_ngrams( vector<WordID>& s, size_t N );
@@ -215,10 +152,6 @@ double stupid_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_
 double smooth_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len, const size_t N, vector<float> weights = vector<float>() );
 double approx_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len, const size_t N, vector<float> weights = vector<float>() );
 void register_and_convert(const vector<string>& strs, vector<WordID>& ids);
-
-
-
-
 void print_FD();
 void run_tests();
 void test_SetWeights();
diff --git a/dtrain/dtest.cc b/dtrain/dtest.cc
index 9975794f..5ae473e6 100644
--- a/dtrain/dtest.cc
+++ b/dtrain/dtest.cc
@@ -11,11 +11,13 @@ bool
 init(int argc, char** argv, po::variables_map* conf)
 {
   int N;
+  bool q;
   po::options_description opts( "Options" );
   opts.add_options()
-    ( "decoder-config,c", po::value<string>(),                  "configuration file for cdec" )
-    ( "weights,w",        po::value<string>(),                  "weights file")
-    ( "ngrams,n",         po::value<int>(&N)->default_value(4), "N for Ngrams (default 5)" );
+    ( "decoder-config,c", po::value<string>(),                      "configuration file for cdec" )
+    ( "weights,w",        po::value<string>(),                      "weights file")
+    ( "ngrams,n",         po::value<int>(&N)->default_value(4),     "N for Ngrams (default 5)" )
+    ( "quiet,q",          po::value<bool>(&q)->default_value(true), "do not output translations" );
   po::options_description cmdline_options;
   cmdline_options.add(opts);
   po::store( parse_command_line(argc, argv, cmdline_options), *conf );
@@ -44,6 +46,7 @@ main(int argc, char** argv)
   Decoder decoder(ini_rf.stream());
   KBestGetter observer(k);
   size_t N = conf["ngrams"].as<int>();
+  bool quiet = conf["quiet"].as<bool>();
   Weights weights;
   weights.InitFromFile(conf["weights"].as<string>());
@@ -56,13 +59,15 @@ main(int argc, char** argv)
   string in, psg;
   size_t sid = 0;
   double overall = 0.0;
-  cerr << "(1 dot equals 100 lines of input)" << endl;
+  double overall1 = 0.0;
+  double overall2 = 0.0;
+  cerr << "(a dot equals 100 lines of input)" << endl;
   while( getline(cin, in) ) {
     if ( (sid+1) % 100 == 0 ) {
         cerr << ".";
         if ( (sid+1)%1000 == 0 ) cerr << endl;
     }
-    if ( sid > 5000 ) break;
+    //if ( sid > 5000 ) break;
     strs.clear();
     boost::split( strs, in, boost::is_any_of("\t") );
     // grammar
@@ -75,19 +80,25 @@ main(int argc, char** argv)
     boost::split( ref_strs, strs[1], boost::is_any_of(" ") );
     register_and_convert( ref_strs, ref_ids );
     // scoring kbest
-    double score = 0;
-    Scores scores;
+    double score = 0.0;
+    double score1 = 0.0;
+    double score2 = 0.0;
     NgramCounts counts = make_ngram_counts( ref_ids, kb->sents[0], 4 );
     score = smooth_bleu( counts,
                          ref_ids.size(),
                          kb->sents[0].size(), N );
-    ScorePair sp( kb->scores[0], score );
-    scores.push_back( sp );
-    //cout << TD::GetString( kb->sents[0] ) << endl;
+    score1 = stupid_bleu( counts, ref_ids.size(), kb->sents[0].size(), N) ;
+    score2 = bleu( counts, ref_ids.size(), kb->sents[0].size(), N );
+    //if ( ! quiet )
+    cout << TD::GetString( kb->sents[0] ) << endl;
     overall += score;
+    overall1 += score1;
+    overall2 += score2;
     sid += 1;
   }
-  cout << "Average score: " << overall/(sid+1) << endl;
+  cerr << "Average score (smooth): " << overall/(double)(sid+1) << endl;
+  cerr << "Average score (stupid): " << overall1/(double)(sid+1) << endl;
+  cerr << "Average score (normal): " << overall2/(double)(sid+1) << endl;
   cerr << endl;
   return 0;
diff --git a/dtrain/dtrain.cc b/dtrain/dtrain.cc
index 95fc81af..373458e8 100644
--- a/dtrain/dtrain.cc
+++ b/dtrain/dtrain.cc
@@ -1,4 +1,5 @@
 #include "dcommon.h"
+#include "learner.h"
@@ -45,41 +46,35 @@ main(int argc, char** argv)
   ReadFile ini_rf(conf["decoder-config"].as<string>());
   Decoder decoder(ini_rf.stream());
   KBestGetter observer(k);
-  size_t N = 4; // TODO as parameter/in config
+  size_t N = 3; // TODO as parameter/in config
   // TODO scoring metric as parameter/in config
   // for approx. bleu
-  //NgramCounts global_counts;
-  //size_t global_hyp_len;
-  //size_t global_ref_len;
+  NgramCounts global_counts(N);
+  size_t global_hyp_len = 0;
+  size_t global_ref_len = 0;
   Weights weights;
   SparseVector<double> lambdas;
   weights.InitSparseVector(&lambdas);
   vector<double> dense_weights;
-  lambdas.set_value(FD::Convert("logp"), 0);
-
-
   vector<string> strs, ref_strs;
   vector<WordID> ref_ids;
   string in, psg;
   size_t sid = 0;
   cerr << "(1 dot equals 100 lines of input)" << endl;
   while( getline(cin, in) ) {
-    //if ( !SILENT )
-    //    cerr << endl << endl << "Getting kbest for sentence #" << sid << endl;
     if ( (sid+1) % 100 == 0 ) {
         cerr << ".";
         if ( (sid+1)%1000 == 0 ) cerr << endl;
     }
-    if ( sid > 5000 ) break;
+    //if ( sid > 5000 ) break;
     // weights
     dense_weights.clear();
    weights.InitFromVector( lambdas );
     weights.InitVector( &dense_weights );
     decoder.SetWeights( dense_weights );
-    //if ( sid > 100 ) break;
     // handling input..
     strs.clear();
     boost::split( strs, in, boost::is_any_of("\t") );
@@ -94,33 +89,36 @@ main(int argc, char** argv)
     register_and_convert( ref_strs, ref_ids );
     // scoring kbest
     double score = 0;
+    size_t cand_len = 0;
     Scores scores;
-    for ( size_t i = 0; i < k; i++ ) {
-      NgramCounts counts = make_ngram_counts( ref_ids, kb->sents[i], 4 );
-      score = smooth_bleu( counts,
-                           ref_ids.size(),
-                           kb->sents[i].size(), N );
+    for ( size_t i = 0; i < kb->sents.size(); i++ ) {
+      NgramCounts counts = make_ngram_counts( ref_ids, kb->sents[i], N );
+      if ( i == 0) {
+        global_counts += counts;
+        global_hyp_len += kb->sents[i].size();
+        global_ref_len += ref_ids.size();
+        cand_len = 0;
+      } else {
+        cand_len = kb->sents[i].size();
+      }
+      //score = bleu( global_counts,
+      //                     global_ref_len,
+      //                    global_hyp_len + cand_len, N );
+      score = bleu ( counts, ref_ids.size(), kb->sents[i].size(), N );
       ScorePair sp( kb->scores[i], score );
       scores.push_back( sp );
       //cout << "'" << TD::GetString( ref_ids ) << "' vs '" << TD::GetString( kb->sents[i] ) << "' SCORE=" << score << endl;
       //cout << kb->feats[i] << endl;
     }
-    //cout << "###" << endl;
+    // learner
     SofiaLearner learner;
     learner.Init( sid, kb->feats, scores );
     learner.Update(lambdas);
-    // initializing learner
-    // TODO
-    // updating weights
-    //lambdas.set_value( FD::Convert("use_shell"), 1 );
-    //lambdas.set_value( FD::Convert("use_a"), 1 );
     //print_FD();
     sid += 1; // TODO does cdec count this already?
   }
-
-  weights.WriteToFile( "weights-final", true );
-
   cerr << endl;
+  weights.WriteToFile( "data/weights-final-normalx", true );
   return 0;
 }
diff --git a/dtrain/learner.h b/dtrain/learner.h
new file mode 100644
index 00000000..a953284d
--- /dev/null
+++ b/dtrain/learner.h
@@ -0,0 +1,71 @@
+/*class Learnerx
+{
+  public:
+    virtual void Init(const vector<SparseVector<double> >& kbest, const Scores& scores) {};
+    virtual void Update(SparseVector<double>& lambdas);
+};*/
+
+class SofiaLearner //: public Learnerx FIXME
+{
+  // TODO bool invert_score
+  public:
+  void
+  Init( const size_t sid, const vector<SparseVector<double> >& kbest, /*const*/ Scores& scores )
+  {
+    assert( kbest.size() == scores.size() );
+    ofstream o;
+    //unlink( "/tmp/sofia_ml_training_stupid" );
+    o.open( "/tmp/sofia_ml_training_normalx", ios::trunc ); // TODO randomize, filename exists
+    int fid = 0;
+    map<int,int>::iterator ff;
+
+    for ( size_t k = 0; k < kbest.size(); ++k ) {
+      map<int,double> m;
+      SparseVector<double>::const_iterator it = kbest[k].begin();
+      o << scores[k].GetScore();
+      for ( ; it != kbest[k].end(); ++it) {
+        ff = fmap.find( it->first );
+        if ( ff == fmap.end() ) {
+          fmap.insert( pair<int,int>(it->first, fid) );
+          fmap1.insert( pair<int,int>(fid, it->first) );
+          fid++;
+        }
+        m.insert(pair<int,double>(fmap[it->first], it->second));
+      }
+      map<int,double>::iterator ti = m.begin();
+      for ( ; ti != m.end(); ++ti ) {
+        o << " " << ti->first << ":" << ti->second;
+      }
+      o << endl;
+    }
+    o.close();
+  }
+
+  void
+  Update(SparseVector<double>& lambdas)
+  {
+    string call = "./sofia-ml --training_file /tmp/sofia_ml_training_normalx --model_out /tmp/sofia_ml_model_normalx --loop_type stochastic --lambda 100 --dimensionality ";
+    std::stringstream out;
+    out << fmap.size();
+    call += out.str();
+    call += " &>/dev/null";
+    system ( call.c_str() );
+    ifstream i;
+    //unlink( "/tmp/sofia_ml_model_stupid" );
+    i.open( "/tmp/sofia_ml_model_normalx", ios::in );
+    string model;
+    getline( i, model );
+    vector<string> strs;
+    boost::split( strs, model, boost::is_any_of(" ") );
+    int j = 0;
+    for ( vector<string>::iterator it = strs.begin(); it != strs.end(); ++it ) {
+      lambdas.set_value(fmap1[j], atof( it->c_str() ) );
+      j++;
+    }
+  }
+
+  private:
+    map<int,int> fmap;
+    map<int,int> fmap1;
+};
diff --git a/dtrain/test.sh b/dtrain/test.sh
index ad45bd1e..bc318ae7 100755
--- a/dtrain/test.sh
+++ b/dtrain/test.sh
@@ -1,4 +1,4 @@
 #!/bin/sh
-./dtrain -c data/cdec.ini -k 4 < data/in.blunsom08 #< data/in.toy
+./dtrain -c data/cdec.ini -k 200 < data/in.blunsom08 #< data/in.toy
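
Note on the sofia-ml interface introduced in learner.h above: training data and the learned model are exchanged through flat files. Init() writes one line per k-best candidate in the form "score fid:value ...", renumbering cdec's sparse feature ids densely through fmap, and Update() reads the model back as a single line of space-separated weights, where position j holds the weight of dense feature id j (mapped back to cdec ids via fmap1). A minimal, self-contained sketch of that round trip; the file name and all numeric values here are invented for illustration, this is not the dtrain code itself:

// Sketch: the flat-file protocol between dtrain and sofia-ml (illustrative).
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <map>
#include <string>
#include <vector>

#include <boost/algorithm/string.hpp>

using namespace std;

int main()
{
  // Init(): one line per k-best candidate, "score fid:value ...",
  // with feature ids already mapped to a dense 0..d-1 range.
  ofstream o( "/tmp/sofia_ml_training_example", ios::trunc );
  o << 0.31 << " 0:1 3:0.5" << endl; // hypothetical candidate 1
  o << 0.27 << " 1:2 3:0.1" << endl; // hypothetical candidate 2
  o.close();

  // Update(): the model comes back as one line of space-separated weights,
  // position j holding the weight of dense feature id j.
  string model = "0.1 -0.4 0.0 0.7"; // stands in for /tmp/sofia_ml_model_*
  vector<string> strs;
  boost::split( strs, model, boost::is_any_of(" ") );
  map<int,double> lambdas; // dense id -> weight; dtrain maps back via fmap1
  for ( size_t j = 0; j < strs.size(); ++j )
    lambdas[(int)j] = atof( strs[j].c_str() );
  cout << "weight of feature 3: " << lambdas[3] << endl; // prints 0.7
  return 0;
}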
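
On the three averages now printed by dtest.cc: plain BLEU is the brevity penalty times the geometric mean of the first N clipped n-gram precisions, and it drops to zero as soon as any order has no match, which is why the smoothed variants exist for per-sentence scoring. A generic sketch of the plain variant, not the exact dtrain implementation; clipped[n] and total[n] stand in for the per-order fields of NgramCounts, and hyp_len is assumed nonzero:

// Sketch: sentence-level BLEU from clipped n-gram counts (generic, assumed
// interface; dtrain's bleu()/smooth_bleu()/stupid_bleu() differ in smoothing).
#include <cmath>
#include <cstddef>
#include <vector>

double brevity_penalty( const size_t hyp_len, const size_t ref_len )
{
  if ( hyp_len > ref_len ) return 1;
  return std::exp( 1 - (double)ref_len/hyp_len );
}

double plain_bleu( const std::vector<double>& clipped, // matches per order 1..N
                   const std::vector<double>& total,   // hyp n-grams per order
                   const size_t hyp_len, const size_t ref_len, const size_t N )
{
  double sum = 0;
  for ( size_t n = 0; n < N; n++ ) {
    if ( clipped[n] == 0 || total[n] == 0 ) return 0; // one empty order zeroes BLEU
    sum += (1./N) * std::log( clipped[n]/total[n] );  // uniform weights 1/N
  }
  return brevity_penalty( hyp_len, ref_len ) * std::exp( sum );
}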
