diff options
Diffstat (limited to 'dtrain')
| -rw-r--r-- | dtrain/Makefile.am | 11 | ||||
| -rw-r--r-- | dtrain/cdec.ini | 4 | ||||
| -rw-r--r-- | dtrain/dcommon.cc | 311 | ||||
| -rw-r--r-- | dtrain/dcommon.h | 230 | ||||
| -rw-r--r-- | dtrain/dtest.cc | 95 | ||||
| -rw-r--r-- | dtrain/dtrain.cc | 595 | ||||
| -rw-r--r-- | dtrain/dtrain.ini | 0 | ||||
| -rw-r--r-- | dtrain/in | 2 | ||||
| -rw-r--r-- | dtrain/in.toy | 2 | ||||
| -rwxr-xr-x | dtrain/test.sh | 2 | 
10 files changed, 704 insertions, 548 deletions
| diff --git a/dtrain/Makefile.am b/dtrain/Makefile.am index daa20cf3..c3f14bb0 100644 --- a/dtrain/Makefile.am +++ b/dtrain/Makefile.am @@ -1,6 +1,11 @@ -bin_PROGRAMS = dtrain +# TODO I'm sure I can leave something out. +bin_PROGRAMS = dtrain dtest -dtrain_SOURCES = dtrain.cc -dtrain_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz +dtrain_SOURCES = dtrain.cc dcommon.cc +dtrain_LDADD   = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz + +dtest_SOURCES = dtest.cc dcommon.cc +dtest_LDADD   = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz  AM_CPPFLAGS = -W -Wall -Wno-sign-compare -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval + diff --git a/dtrain/cdec.ini b/dtrain/cdec.ini deleted file mode 100644 index 92a4a335..00000000 --- a/dtrain/cdec.ini +++ /dev/null @@ -1,4 +0,0 @@ -formalism=scfg -#feature_function=KLanguageModel europarl-v6.tok.lc.s-tag.en.arpa.kenlm.v4.mma -#k_best=2 -#add_pass_through_rules=true diff --git a/dtrain/dcommon.cc b/dtrain/dcommon.cc new file mode 100644 index 00000000..a6bdc92c --- /dev/null +++ b/dtrain/dcommon.cc @@ -0,0 +1,311 @@ +#include "dcommon.h" + + + +/* + * make_ngrams + * + */ +typedef map<vector<WordID>, size_t> Ngrams; +Ngrams +make_ngrams( vector<WordID>& s, size_t N ) +{ +  Ngrams ngrams; +  vector<WordID> ng; +  for ( size_t i = 0; i < s.size(); i++ ) { +    ng.clear(); +    for ( size_t j = i; j < min( i+N, s.size() ); j++ ) { +      ng.push_back( s[j] ); +      ngrams[ng]++; +    } +  } +  return ngrams; +} + + + + + +/* + * ngram_matches + * + */ +NgramCounts +make_ngram_counts( vector<WordID> hyp, vector<WordID> ref, size_t N ) +{ +  Ngrams hyp_ngrams = make_ngrams( hyp, N ); +  Ngrams ref_ngrams = make_ngrams( ref, N ); +  NgramCounts counts( N ); +  Ngrams::iterator it; +  Ngrams::iterator ti; +  for ( it = hyp_ngrams.begin(); it != hyp_ngrams.end(); it++ ) { +    ti = ref_ngrams.find( it->first ); +    if ( ti != ref_ngrams.end() ) { +      counts.add( it->second, ti->second, it->first.size() - 1 ); +    } else { +      counts.add( it->second, 0, it->first.size() - 1 ); +    } +  } +  return counts; +} + + +/* + * brevity_penaly + * + */ +double +brevity_penaly( const size_t hyp_len, const size_t ref_len ) +{ +  if ( hyp_len > ref_len ) return 1; +  return exp( 1 - (double)ref_len/(double)hyp_len ); +} + + +/* + * bleu + * as in "BLEU: a Method for Automatic Evaluation of Machine Translation" (Papineni et al. '02) + * page TODO + * 0 if for N one of the counts = 0 + */ +double +bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len, +      size_t N, vector<float> weights  ) +{ +  if ( hyp_len == 0 || ref_len == 0 ) return 0; +  if ( ref_len < N ) N = ref_len; +  float N_ = (float)N; +  if ( weights.empty() ) +  { +    for ( size_t i = 0; i < N; i++ ) weights.push_back( 1/N_ ); +  } +  double sum = 0; +  for ( size_t i = 0; i < N; i++ ) { +    if ( counts.clipped[i] == 0 || counts.sum[i] == 0 ) return 0; +    sum += weights[i] * log( (double)counts.clipped[i] / (double)counts.sum[i] ); +  } +  return brevity_penaly( hyp_len, ref_len ) * exp( sum ); +} + + +/* + * stupid_bleu + * as in "ORANGE: a Method for Evaluating Automatic Evaluation Metrics for Machine Translation (Lin & Och '04) + * page TODO + * 0 iff no 1gram match + */ +double +stupid_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len, +             size_t N, vector<float> weights  ) +{ +  if ( hyp_len == 0 || ref_len == 0 ) return 0; +  if ( ref_len < N ) N = ref_len; +  float N_ = (float)N; +  if ( weights.empty() ) +  { +    for ( size_t i = 0; i < N; i++ ) weights.push_back( 1/N_ ); +  } +  double sum = 0; +  float add = 0; +  for ( size_t i = 0; i < N; i++ ) { +    if ( i == 1 ) add = 1; +    sum += weights[i] * log( ((double)counts.clipped[i] + add) / ((double)counts.sum[i] + add) ); +  } +  return brevity_penaly( hyp_len, ref_len ) * exp( sum ); +} + + +/* + * smooth_bleu + * as in "An End-to-End Discriminative Approach to Machine Translation" (Liang et al. '06) + * page TODO + * max. 0.9375 + */ +double +smooth_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len, +             const size_t N, vector<float> weights  ) +{ +  if ( hyp_len == 0 || ref_len == 0 ) return 0; +  float N_ = (float)N; +  if ( weights.empty() ) +  { +    for ( size_t i = 0; i < N; i++ ) weights.push_back( 1/N_ ); +  } +  double sum = 0; +  float j = 1; +  for ( size_t i = 0; i < N; i++ ) { +    if ( counts.clipped[i] == 0 || counts.sum[i] == 0) continue; +    sum += exp((weights[i] * log((double)counts.clipped[i]/(double)counts.sum[i]))) / pow( 2, N_-j+1 ); +    j++; +  } +  return brevity_penaly( hyp_len, ref_len ) * sum; +} + + +/* + * approx_bleu + * as in "Online Large-Margin Training for Statistical Machine Translation" (Watanabe et al. '07) + * page TODO + * + */ +double +approx_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len, +     const size_t N, vector<float> weights ) +{ +  return bleu( counts, hyp_len, ref_len, N, weights ); +} + + +/* + * register_and_convert + * + */ +void +register_and_convert(const vector<string>& strs, vector<WordID>& ids) +{ +  vector<string>::const_iterator it; +  for ( it = strs.begin(); it < strs.end(); it++ ) { +    ids.push_back( TD::Convert( *it ) ); +  } +} + + + + +/* + * + * + */ +void +test_ngrams() +{ +  cout << "Testing ngrams..." << endl << endl; +  size_t N = 5; +  cout << "N = " << N << endl; +  vector<int> a; // hyp +  vector<int> b; // ref +  cout << "a "; +  for (size_t i = 1; i <= 8; i++) { +    cout << i << " "; +    a.push_back(i); +  } +  cout << endl << "b "; +  for (size_t i = 1; i <= 4; i++) { +    cout << i << " "; +    b.push_back(i); +  } +  cout << endl << endl; +  NgramCounts c = make_ngram_counts( a, b, N ); +  assert( c.clipped[N-1] == 0 ); +  assert( c.sum[N-1] == 4 ); +  c.print(); +  c += c; +  cout << endl; +  c.print(); +  cout << endl; +} + + +/* + * + * + */ +double +approx_equal( double x, double y ) +{ +  const double EPSILON = 1E-5; +  if ( x == 0 ) return fabs( y ) <= EPSILON; +  if ( y == 0 ) return fabs( x ) <= EPSILON; +  return fabs( x - y ) / max( fabs(x), fabs(y) ) <= EPSILON; +} + + +/* + * + * + */ +void +test_metrics() +{ +  cout << "Testing metrics..." << endl << endl; +  using namespace boost::assign; +  vector<string> a, b; +  vector<double> expect_vanilla, expect_smooth, expect_stupid; +  a +=              "a a a a", "a a a a", "a",   "a", "b",        "a a a a", "a a",  "a a a", "a b a"; // hyp +  b +=              "b b b b", "a a a a", "a",   "b", "b b b b",  "a",       "a a",  "a a a", "a b b"; // ref +  expect_vanilla += 0,         1,         1,      0,  0,          .25,       1,      1,       0; +  expect_smooth  += 0,          .9375,     .0625, 0,   .00311169, .0441942,   .1875,  .4375,   .161587; +  expect_stupid  += 0,         1,         1,      0,   .0497871,  .25,       1,      1,        .605707; +  vector<string> aa, bb; +  vector<WordID> aai, bbi; +  double vanilla, smooth, stupid; +  size_t N = 4; +  cout << "N = " << N << endl << endl; +  for ( size_t i = 0; i < a.size(); i++ ) { +    cout << " hyp: " << a[i] << endl; +    cout << " ref: " << b[i] << endl; +    aa.clear(); bb.clear(); aai.clear(); bbi.clear(); +    boost::split( aa, a[i], boost::is_any_of(" ") ); +    boost::split( bb, b[i], boost::is_any_of(" ") ); +    register_and_convert( aa, aai ); +    register_and_convert( bb, bbi ); +    NgramCounts counts = make_ngram_counts( aai, bbi, N ); +    vanilla =        bleu( counts, aa.size(), bb.size(), N); +    smooth  = smooth_bleu( counts, aa.size(), bb.size(), N); +    stupid  = stupid_bleu( counts, aa.size(), bb.size(), N); +    assert( approx_equal(vanilla, expect_vanilla[i]) ); +    assert( approx_equal(smooth, expect_smooth[i]) ); +    assert( approx_equal(stupid, expect_stupid[i]) ); +    cout << setw(14) << "bleu = "      << vanilla << endl; +    cout << setw(14) << "smooth bleu = " << smooth << endl; +    cout << setw(14) << "stupid bleu = " << stupid << endl << endl; +  } +  cout << endl; +} + +/* + * + * + */ +void +test_SetWeights() +{ +  cout << "Testing Weights::SetWeight..." << endl << endl; +  Weights weights; +  SparseVector<double> lambdas; +  weights.InitSparseVector( &lambdas ); +  weights.SetWeight( &lambdas, "test", 0 ); +  weights.SetWeight( &lambdas, "test1", 1 ); +  WordID fid = FD::Convert( "test2" ); +  weights.SetWeight( &lambdas, fid, 2 ); +  string fn = "weights-test"; +  cout << "FD::NumFeats() " << FD::NumFeats() << endl; +  assert( FD::NumFeats() == 4 ); +  weights.WriteToFile( fn, true ); +  cout << endl; +} + + +/* + * + * + */ +void +run_tests() +{ +  cout << endl; +  test_ngrams(); +  cout << endl; +  test_metrics(); +  cout << endl; +  test_SetWeights(); +  exit(0); +} + + +void +print_FD() +{ +  for ( size_t i = 0; i < FD::NumFeats(); i++ ) cout << FD::Convert(i)<< endl; +} + diff --git a/dtrain/dcommon.h b/dtrain/dcommon.h new file mode 100644 index 00000000..ff796642 --- /dev/null +++ b/dtrain/dcommon.h @@ -0,0 +1,230 @@ +#include <sstream> +#include <iostream> +#include <vector> +#include <cassert> +#include <cmath> + +#include "config.h" + +#include <boost/shared_ptr.hpp> +#include <boost/algorithm/string.hpp> +#include <boost/program_options.hpp> +#include <boost/program_options/variables_map.hpp> + +#include "sentence_metadata.h" +#include "scorer.h" +#include "verbose.h" +#include "viterbi.h" +#include "hg.h" +#include "prob.h" +#include "kbest.h" +#include "ff_register.h" +#include "decoder.h" +#include "filelib.h" +#include "fdict.h" +#include "weights.h" +#include "sparse_vector.h" +#include "sampler.h" + +using namespace std; +namespace po = boost::program_options; + + +struct ScorePair +{ +  ScorePair(double modelscore, double score) : modelscore_(modelscore), score_(score) {}  +  double modelscore_, score_; +  double GetModelScore() { return modelscore_; } +  double GetScore() { return score_; } +}; +typedef vector<ScorePair> Scores; + + +/* + * KBestGetter + * + */ +struct KBestList { +  vector<SparseVector<double> > feats; +  vector<vector<WordID> > sents; +  vector<double> scores; +}; +struct KBestGetter : public DecoderObserver +{ +  KBestGetter( const size_t k ) : k_(k) {} +  const size_t k_; +  KBestList kb; + +  virtual void +  NotifyTranslationForest(const SentenceMetadata& smeta, Hypergraph* hg) +  { +    GetKBest(smeta.GetSentenceID(), *hg); +  } + +  KBestList* GetKBest() { return &kb; } + +  void +  GetKBest(int sent_id, const Hypergraph& forest) +  { +    kb.scores.clear(); +    kb.sents.clear(); +    kb.feats.clear(); +    KBest::KBestDerivations<vector<WordID>, ESentenceTraversal> kbest( forest, k_ ); +    for ( size_t i = 0; i < k_; ++i ) { +      const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal>::Derivation* d = +        kbest.LazyKthBest( forest.nodes_.size() - 1, i ); +      if (!d) break; +      kb.sents.push_back( d->yield); +      kb.feats.push_back( d->feature_values ); +      kb.scores.push_back( d->score ); +    } +  } +}; + + +/* + * NgramCounts + * + */ +struct NgramCounts +{ +  NgramCounts( const size_t N ) : N_( N ) { +    reset(); +  }  +  size_t N_; +  map<size_t, size_t> clipped; +  map<size_t, size_t> sum; + +  void +  operator+=( const NgramCounts& rhs ) +  { +    assert( N_ == rhs.N_ ); +    for ( size_t i = 0; i < N_; i++ ) { +      this->clipped[i] += rhs.clipped.find(i)->second; +      this->sum[i] += rhs.sum.find(i)->second; +    } +  } + +  void +  add( size_t count, size_t ref_count, size_t i ) +  { +    assert( i < N_ ); +    if ( count > ref_count ) { +      clipped[i] += ref_count; +      sum[i] += count; +    } else { +      clipped[i] += count; +      sum[i] += count; +    } +  } + +  void +  reset() +  { +    size_t i; +    for ( i = 0; i < N_; i++ ) { +      clipped[i] = 0; +      sum[i] = 0; +    } +  } + +  void +  print() +  { +    for ( size_t i = 0; i < N_; i++ ) { +      cout << i+1 << "grams (clipped):\t" << clipped[i] << endl; +      cout << i+1 << "grams:\t\t\t" << sum[i] << endl; +    } +  } +}; + + +/*class Learnerx +{ +  public: +    virtual void Init(const vector<SparseVector<double> >& kbest, const Scores& scores) {}; +    virtual void Update(SparseVector<double>& lambdas); +};*/ + +class SofiaLearner //: public Learnerx FIXME +{ +  // TODO bool invert_score +  public: +  void +  Init( const size_t sid, const vector<SparseVector<double> >& kbest, /*const*/ Scores& scores ) +  { +    assert( kbest.size() == scores.size() ); +    ofstream o; +    unlink( "/tmo/sofia_ml_training" ); +    o.open( "/tmp/sofia_ml_training", ios::trunc ); // TODO randomize, filename exists +    int fid = 0; +    map<int,int>::iterator ff; +    for ( size_t k = 0; k < kbest.size(); ++k ) { +      SparseVector<double>::const_iterator it = kbest[k].begin(); +      o << scores[k].GetScore(); +      for ( ; it != kbest[k].end(); ++it) { +        ff = fmap.find( it->first ); +        if ( ff == fmap.end() ) { +          fmap.insert( pair<int,int>(it->first, fid) ); +          fmap1.insert( pair<int,int>(fid, it->first) ); +          fid++; +        } +        o << " "<< fmap[it->first] << ":" << it->second; +      } +      o << endl; +    } +    o.close(); +  } + +  void +  Update(SparseVector<double>& lambdas) +  { +    string call = "./sofia-ml --training_file /tmp/sofia_ml_training --model_out /tmp/sofia_ml_model --loop_type stochastic --lambda 100 --dimensionality "; +    std::stringstream out; +    out << fmap.size(); +    call += out.str(); +    call += " &>/dev/null"; +    system ( call.c_str() ); +    ifstream i; +    unlink( "/tmo/sofia_ml_model" ); +    i.open( "/tmp/sofia_ml_model", ios::in ); +    string model; +    getline( i, model ); +    //cout << model << endl; +    vector<string> strs; +    boost::split( strs, model, boost::is_any_of(" ") ); +    int j = 0; +    for ( vector<string>::iterator it = strs.begin(); it != strs.end(); ++it ) { +      lambdas.set_value(fmap1[j], atof( it->c_str() ) ); +      j++; +    } + +  } + +  private: +    map<int,int> fmap; +    map<int,int> fmap1; +}; + +typedef map<vector<WordID>, size_t> Ngrams; +Ngrams make_ngrams( vector<WordID>& s, size_t N ); +NgramCounts make_ngram_counts( vector<WordID> hyp, vector<WordID> ref, size_t N ); +double brevity_penaly( const size_t hyp_len, const size_t ref_len ); +double bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len, size_t N, vector<float> weights = vector<float>() ); +double stupid_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len, size_t N, vector<float> weights = vector<float>() ); +double smooth_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len, const size_t N, vector<float> weights = vector<float>() ); +double approx_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len, const size_t N, vector<float> weights = vector<float>() ); +void register_and_convert(const vector<string>& strs, vector<WordID>& ids); + + + + +void print_FD(); +void run_tests(); +void test_SetWeights(); +#include <boost/assign/std/vector.hpp> +#include <iomanip> +void test_metrics(); +double approx_equal( double x, double y ); +void test_ngrams(); + diff --git a/dtrain/dtest.cc b/dtrain/dtest.cc new file mode 100644 index 00000000..9975794f --- /dev/null +++ b/dtrain/dtest.cc @@ -0,0 +1,95 @@ +#include "dcommon.h" + + + + +/* + * init + * + */ +bool +init(int argc, char** argv, po::variables_map* conf) +{ +  int N; +  po::options_description opts( "Options" ); +  opts.add_options() +    ( "decoder-config,c", po::value<string>(),                  "configuration file for cdec" ) +    ( "weights,w",        po::value<string>(),                  "weights file") +    ( "ngrams,n",         po::value<int>(&N)->default_value(4), "N for Ngrams (default 5)" ); +  po::options_description cmdline_options; +  cmdline_options.add(opts); +  po::store( parse_command_line(argc, argv, cmdline_options), *conf ); +  po::notify( *conf ); +  if ( ! (conf->count("decoder-config") || conf->count("weights")) ) { +    cerr << cmdline_options << endl; +    return false; +  } +  return true; +} + + +/* + * main + * + */ +int +main(int argc, char** argv) +{ +  SetSilent(true); +  po::variables_map conf; +  if (!init(argc, argv, &conf)) return 1; +  register_feature_functions(); +  size_t k = 1; +  ReadFile ini_rf(conf["decoder-config"].as<string>()); +  Decoder decoder(ini_rf.stream()); +  KBestGetter observer(k); +  size_t N = conf["ngrams"].as<int>(); + +  Weights weights; +  weights.InitFromFile(conf["weights"].as<string>()); +  vector<double> w; +  weights.InitVector(&w); +  decoder.SetWeights(w); +  +  vector<string> strs, ref_strs; +  vector<WordID> ref_ids; +  string in, psg; +  size_t sid = 0; +  double overall = 0.0; +  cerr << "(1 dot equals 100 lines of input)" << endl; +  while( getline(cin, in) ) { +    if ( (sid+1) % 100 == 0 ) { +        cerr << "."; +        if ( (sid+1)%1000 == 0 ) cerr << endl; +    } +    if ( sid > 5000 ) break; +    strs.clear(); +    boost::split( strs, in, boost::is_any_of("\t") ); +    // grammar +    psg = boost::replace_all_copy( strs[2], " __NEXT_RULE__ ", "\n" ); psg += "\n"; +    decoder.SetSentenceGrammar( psg ); +    decoder.Decode( strs[0], &observer ); +    KBestList* kb = observer.GetKBest(); +    // reference +    ref_strs.clear(); ref_ids.clear(); +    boost::split( ref_strs, strs[1], boost::is_any_of(" ") ); +    register_and_convert( ref_strs, ref_ids ); +    // scoring kbest +    double score = 0; +    Scores scores; +    NgramCounts counts = make_ngram_counts( ref_ids, kb->sents[0], 4 ); +    score = smooth_bleu( counts, +                         ref_ids.size(), +                         kb->sents[0].size(), N ); +    ScorePair sp( kb->scores[0], score ); +    scores.push_back( sp ); +    //cout << TD::GetString( kb->sents[0] ) << endl; +    overall += score; +    sid += 1; +  } +  cout << "Average score: " << overall/(sid+1) << endl; +  cerr << endl; + +  return 0; +} + diff --git a/dtrain/dtrain.cc b/dtrain/dtrain.cc index 8464a429..95fc81af 100644 --- a/dtrain/dtrain.cc +++ b/dtrain/dtrain.cc @@ -1,33 +1,6 @@ -#include <sstream> -#include <iostream> -#include <vector> -#include <cassert> -#include <cmath> +#include "dcommon.h" -#include "config.h" -#include <boost/shared_ptr.hpp> -#include <boost/algorithm/string.hpp> -#include <boost/program_options.hpp> -#include <boost/program_options/variables_map.hpp> - -#include "sentence_metadata.h" -#include "scorer.h" -#include "verbose.h" -#include "viterbi.h" -#include "hg.h" -#include "prob.h" -#include "kbest.h" -#include "ff_register.h" -#include "decoder.h" -#include "filelib.h" -#include "fdict.h" -#include "weights.h" -#include "sparse_vector.h" -#include "sampler.h" - -using namespace std; -namespace boostpo = boost::program_options;  /* @@ -35,19 +8,19 @@ namespace boostpo = boost::program_options;   *   */  bool -init(int argc, char** argv, boostpo::variables_map* conf) +init(int argc, char** argv, po::variables_map* conf)  { -  boostpo::options_description opts( "Options" ); +  po::options_description opts( "Options" );    opts.add_options() -    ( "decoder-config,c", boostpo::value<string>(), "configuration file for cdec" ) -    ( "kbest,k",          boostpo::value<size_t>(), "k for kbest" ) -    ( "ngrams,n",         boostpo::value<int>(),    "n for Ngrams" ) -    ( "filter,f",         boostpo::value<string>(), "filter kbest list" ) +    ( "decoder-config,c", po::value<string>(), "configuration file for cdec" ) +    ( "kbest,k",          po::value<size_t>(), "k for kbest" ) +    ( "ngrams,n",         po::value<int>(),    "n for Ngrams" ) +    ( "filter,f",         po::value<string>(), "filter kbest list" )      ( "test",                                       "run tests and exit"); -  boostpo::options_description cmdline_options; +  po::options_description cmdline_options;    cmdline_options.add(opts); -  boostpo::store( parse_command_line(argc, argv, cmdline_options), *conf ); -  boostpo::notify( *conf ); +  po::store( parse_command_line(argc, argv, cmdline_options), *conf ); +  po::notify( *conf );    if ( ! (conf->count("decoder-config") || conf->count("test")) ) {      cerr << cmdline_options << endl;      return false; @@ -57,451 +30,14 @@ init(int argc, char** argv, boostpo::variables_map* conf)  /* - * KBestGetter - * - */ -struct KBestList { -  vector<SparseVector<double> > feats; -  vector<vector<WordID> > sents; -  vector<double> scores; -}; -struct KBestGetter : public DecoderObserver -{ -  KBestGetter( const size_t k ) : k_(k) {} -  const size_t k_; -  KBestList kb; - -  virtual void -  NotifyTranslationForest(const SentenceMetadata& smeta, Hypergraph* hg) -  { -    GetKBest(smeta.GetSentenceID(), *hg); -  } - -  KBestList* getkb() { return &kb; } - -  void -  GetKBest(int sent_id, const Hypergraph& forest) -  { -    kb.scores.clear(); -    kb.sents.clear(); -    kb.feats.clear(); -    KBest::KBestDerivations<vector<WordID>, ESentenceTraversal> kbest( forest, k_ ); -    for ( size_t i = 0; i < k_; ++i ) { -      const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal>::Derivation* d = -        kbest.LazyKthBest( forest.nodes_.size() - 1, i ); -      if (!d) break; -      kb.sents.push_back( d->yield); -      kb.feats.push_back( d->feature_values ); -      kb.scores.push_back( d->score ); -    } -  } -}; - - -/* - * write_training_data_for_sofia - * - */ -void -sofia_write_training_data() -{ -  // TODO -} - - -/* - * call_sofia - * - */ -void -sofia_call() -{ -  // TODO -} - - -/* - * sofia_model2weights - * - */ -void -sofia_read_model() -{ -  // TODO -} - - -/* - * make_ngrams - * - */ -typedef map<vector<WordID>, size_t> Ngrams; -Ngrams -make_ngrams( vector<WordID>& s, size_t N ) -{ -  Ngrams ngrams; -  vector<WordID> ng; -  for ( size_t i = 0; i < s.size(); i++ ) { -    ng.clear(); -    for ( size_t j = i; j < min( i+N, s.size() ); j++ ) { -      ng.push_back( s[j] ); -      ngrams[ng]++; -    } -  } -  return ngrams; -} - - -/* - * NgramCounts - * - */ -struct NgramCounts -{ -  NgramCounts( const size_t N ) : N_( N ) { -    reset(); -  }  -  size_t N_; -  map<size_t, size_t> clipped; -  map<size_t, size_t> sum; - -  void -  operator+=( const NgramCounts& rhs ) -  { -    assert( N_ == rhs.N_ ); -    for ( size_t i = 0; i < N_; i++ ) { -      this->clipped[i] += rhs.clipped.find(i)->second; -      this->sum[i] += rhs.sum.find(i)->second; -    } -  } - -  void -  add( size_t count, size_t ref_count, size_t i ) -  { -    assert( i < N_ ); -    if ( count > ref_count ) { -      clipped[i] += ref_count; -      sum[i] += count; -    } else { -      clipped[i] += count; -      sum[i] += count; -    } -  } - -  void -  reset() -  { -    size_t i; -    for ( i = 0; i < N_; i++ ) { -      clipped[i] = 0; -      sum[i] = 0; -    } -  } - -  void -  print() -  { -    for ( size_t i = 0; i < N_; i++ ) { -      cout << i+1 << "grams (clipped):\t" << clipped[i] << endl; -      cout << i+1 << "grams:\t\t\t" << sum[i] << endl; -    } -  } -}; - - -/* - * ngram_matches - * - */ -NgramCounts -make_ngram_counts( vector<WordID> hyp, vector<WordID> ref, size_t N ) -{ -  Ngrams hyp_ngrams = make_ngrams( hyp, N ); -  Ngrams ref_ngrams = make_ngrams( ref, N ); -  NgramCounts counts( N ); -  Ngrams::iterator it; -  Ngrams::iterator ti; -  for ( it = hyp_ngrams.begin(); it != hyp_ngrams.end(); it++ ) { -    ti = ref_ngrams.find( it->first ); -    if ( ti != ref_ngrams.end() ) { -      counts.add( it->second, ti->second, it->first.size() - 1 ); -    } else { -      counts.add( it->second, 0, it->first.size() - 1 ); -    } -  } -  return counts; -} - - -/* - * brevity_penaly - * - */ -double -brevity_penaly( const size_t hyp_len, const size_t ref_len ) -{ -  if ( hyp_len > ref_len ) return 1; -  return exp( 1 - (double)ref_len/(double)hyp_len ); -} - - -/* - * bleu - * as in "BLEU: a Method for Automatic Evaluation of Machine Translation" (Papineni et al. '02) - * page TODO - * 0 if for N one of the counts = 0 - */ -double -bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len, -      size_t N, vector<float> weights = vector<float>() ) -{ -  if ( hyp_len == 0 || ref_len == 0 ) return 0; -  if ( ref_len < N ) N = ref_len; -  float N_ = (float)N; -  if ( weights.empty() ) -  { -    for ( size_t i = 0; i < N; i++ ) weights.push_back( 1/N_ ); -  } -  double sum = 0; -  for ( size_t i = 0; i < N; i++ ) { -    if ( counts.clipped[i] == 0 || counts.sum[i] == 0 ) return 0; -    sum += weights[i] * log( (double)counts.clipped[i] / (double)counts.sum[i] ); -  } -  return brevity_penaly( hyp_len, ref_len ) * exp( sum ); -} - - -/* - * stupid_bleu - * as in "ORANGE: a Method for Evaluating Automatic Evaluation Metrics for Machine Translation (Lin & Och '04) - * page TODO - * 0 iff no 1gram match - */ -double -stupid_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len, -             size_t N, vector<float> weights = vector<float>() ) -{ -  if ( hyp_len == 0 || ref_len == 0 ) return 0; -  if ( ref_len < N ) N = ref_len; -  float N_ = (float)N; -  if ( weights.empty() ) -  { -    for ( size_t i = 0; i < N; i++ ) weights.push_back( 1/N_ ); -  } -  double sum = 0; -  float add = 0; -  for ( size_t i = 0; i < N; i++ ) { -    if ( i == 1 ) add = 1; -    sum += weights[i] * log( ((double)counts.clipped[i] + add) / ((double)counts.sum[i] + add) ); -  } -  return brevity_penaly( hyp_len, ref_len ) * exp( sum ); -} - - -/* - * smooth_bleu - * as in "An End-to-End Discriminative Approach to Machine Translation" (Liang et al. '06) - * page TODO - * max. 0.9375 - */ -double -smooth_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len, -             const size_t N, vector<float> weights = vector<float>() ) -{ -  if ( hyp_len == 0 || ref_len == 0 ) return 0; -  float N_ = (float)N; -  if ( weights.empty() ) -  { -    for ( size_t i = 0; i < N; i++ ) weights.push_back( 1/N_ ); -  } -  double sum = 0; -  float j = 1; -  for ( size_t i = 0; i < N; i++ ) { -    if ( counts.clipped[i] == 0 || counts.sum[i] == 0) continue; -    sum += exp((weights[i] * log((double)counts.clipped[i]/(double)counts.sum[i]))) / pow( 2, N_-j+1 ); -    j++; -  } -  return brevity_penaly( hyp_len, ref_len ) * sum; -} - - -/* - * approx_bleu - * as in "Online Large-Margin Training for Statistical Machine Translation" (Watanabe et al. '07) - * page TODO - * - */ -double -approx_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len, -     const size_t N, vector<float> weights = vector<float>() ) -{ -  return bleu( counts, hyp_len, ref_len, N, weights ); -} - - -/* - * register_and_convert - * - */ -void -register_and_convert(const vector<string>& strs, vector<WordID>& ids) -{ -  vector<string>::const_iterator it; -  for ( it = strs.begin(); it < strs.end(); it++ ) { -    ids.push_back( TD::Convert( *it ) ); -  } -} - - -/* - * - * - */ -void -test_ngrams() -{ -  cout << "Testing ngrams..." << endl << endl; -  size_t N = 5; -  cout << "N = " << N << endl; -  vector<int> a; // hyp -  vector<int> b; // ref -  cout << "a "; -  for (size_t i = 1; i <= 8; i++) { -    cout << i << " "; -    a.push_back(i); -  } -  cout << endl << "b "; -  for (size_t i = 1; i <= 4; i++) { -    cout << i << " "; -    b.push_back(i); -  } -  cout << endl << endl; -  NgramCounts c = make_ngram_counts( a, b, N ); -  assert( c.clipped[N-1] == 0 ); -  assert( c.sum[N-1] == 4 ); -  c.print(); -  c += c; -  cout << endl; -  c.print(); -  cout << endl; -} - - -/* - * - * - */ -double -approx_equal( double x, double y ) -{ -  const double EPSILON = 1E-5; -  if ( x == 0 ) return fabs( y ) <= EPSILON; -  if ( y == 0 ) return fabs( x ) <= EPSILON; -  return fabs( x - y ) / max( fabs(x), fabs(y) ) <= EPSILON; -} - - -/* - * - * - */ -#include <boost/assign/std/vector.hpp> -#include <iomanip> -void -test_metrics() -{ -  cout << "Testing metrics..." << endl << endl; -  using namespace boost::assign; -  vector<string> a, b; -  vector<double> expect_vanilla, expect_smooth, expect_stupid; -  a +=              "a a a a", "a a a a", "a",   "a", "b",        "a a a a", "a a",  "a a a", "a b a"; // hyp -  b +=              "b b b b", "a a a a", "a",   "b", "b b b b",  "a",       "a a",  "a a a", "a b b"; // ref -  expect_vanilla += 0,         1,         1,      0,  0,          .25,       1,      1,       0; -  expect_smooth  += 0,          .9375,     .0625, 0,   .00311169, .0441942,   .1875,  .4375,   .161587; -  expect_stupid  += 0,         1,         1,      0,   .0497871,  .25,       1,      1,        .605707; -  vector<string> aa, bb; -  vector<WordID> aai, bbi; -  double vanilla, smooth, stupid; -  size_t N = 4; -  cout << "N = " << N << endl << endl; -  for ( size_t i = 0; i < a.size(); i++ ) { -    cout << " hyp: " << a[i] << endl; -    cout << " ref: " << b[i] << endl; -    aa.clear(); bb.clear(); aai.clear(); bbi.clear(); -    boost::split( aa, a[i], boost::is_any_of(" ") ); -    boost::split( bb, b[i], boost::is_any_of(" ") ); -    register_and_convert( aa, aai ); -    register_and_convert( bb, bbi ); -    NgramCounts counts = make_ngram_counts( aai, bbi, N ); -    vanilla =        bleu( counts, aa.size(), bb.size(), N); -    smooth  = smooth_bleu( counts, aa.size(), bb.size(), N); -    stupid  = stupid_bleu( counts, aa.size(), bb.size(), N); -    assert( approx_equal(vanilla, expect_vanilla[i]) ); -    assert( approx_equal(smooth, expect_smooth[i]) ); -    assert( approx_equal(stupid, expect_stupid[i]) ); -    cout << setw(14) << "bleu = "      << vanilla << endl; -    cout << setw(14) << "smooth bleu = " << smooth << endl; -    cout << setw(14) << "stupid bleu = " << stupid << endl << endl; -  } -  cout << endl; -} - -/* - * - * - */ -void -test_SetWeights() -{ -  cout << "Testing Weights::SetWeight..." << endl << endl; -  Weights weights; -  SparseVector<double> lambdas; -  weights.InitSparseVector( &lambdas ); -  weights.SetWeight( &lambdas, "test", 0 ); -  weights.SetWeight( &lambdas, "test1", 1 ); -  WordID fid = FD::Convert( "test2" ); -  weights.SetWeight( &lambdas, fid, 2 ); -  string fn = "weights-test"; -  cout << "FD::NumFeats() " << FD::NumFeats() << endl; -  assert( FD::NumFeats() == 4 ); -  weights.WriteToFile( fn, true ); -  cout << endl; -} - - -/* - * - * - */ -void -run_tests() -{ -  cout << endl; -  test_ngrams(); -  cout << endl; -  test_metrics(); -  cout << endl; -  test_SetWeights(); -  exit(0); -} - - -void -print_FD() -{ -  for ( size_t i = 0; i < FD::NumFeats(); i++ ) cout << FD::Convert(i)<< endl; -} - - -/*   * main   *   */  int  main(int argc, char** argv)  { -  //SetSilent(true); -  boostpo::variables_map conf; +  SetSilent(true); +  po::variables_map conf;    if (!init(argc, argv, &conf)) return 1;    if ( conf.count("test") ) run_tests();     register_feature_functions(); @@ -509,7 +45,9 @@ main(int argc, char** argv)    ReadFile ini_rf(conf["decoder-config"].as<string>());    Decoder decoder(ini_rf.stream());    KBestGetter observer(k); -   +  size_t N = 4; // TODO as parameter/in config  + +  // TODO scoring metric as parameter/in config     // for approx. bleu    //NgramCounts global_counts;    //size_t global_hyp_len; @@ -523,82 +61,67 @@ main(int argc, char** argv)    lambdas.set_value(FD::Convert("logp"), 0); -  vector<string> strs; +  vector<string> strs, ref_strs; +  vector<WordID> ref_ids;    string in, psg; -  size_t i = 0; +  size_t sid = 0; +  cerr << "(1 dot equals 100 lines of input)" << endl;    while( getline(cin, in) ) { -    if ( !SILENT ) cerr << endl << endl << "Getting kbest for sentence #" << i << endl; -    // why? why!? +    //if ( !SILENT ) +    //    cerr << endl << endl << "Getting kbest for sentence #" << sid << endl; +    if ( (sid+1) % 100 == 0 ) { +        cerr << "."; +        if ( (sid+1)%1000 == 0 ) cerr << endl; +    } +    if ( sid > 5000 ) break; +    // weights      dense_weights.clear();      weights.InitFromVector( lambdas );      weights.InitVector( &dense_weights );      decoder.SetWeights( dense_weights ); -    //cout << "use_shell " << dense_weights[FD::Convert("use_shell")] << endl; +    //if ( sid > 100 ) break; +    // handling input..      strs.clear();      boost::split( strs, in, boost::is_any_of("\t") ); +    // grammar      psg = boost::replace_all_copy( strs[2], " __NEXT_RULE__ ", "\n" ); psg += "\n"; -    //decoder.SetId(i);      decoder.SetSentenceGrammar( psg );      decoder.Decode( strs[0], &observer ); -    KBestList* kb = observer.getkb(); +    KBestList* kb = observer.GetKBest(); +    // reference +    ref_strs.clear(); ref_ids.clear(); +    boost::split( ref_strs, strs[1], boost::is_any_of(" ") ); +    register_and_convert( ref_strs, ref_ids ); +    // scoring kbest +    double score = 0; +    Scores scores;      for ( size_t i = 0; i < k; i++ ) { -      cout << i << " "; -      for (size_t j = 0; j < kb->sents[i].size(); ++j ) { -        cout << TD::Convert( kb->sents[i][j] ) << " "; -      } -      cout << kb->scores[i]; -      cout << endl; +      NgramCounts counts = make_ngram_counts( ref_ids, kb->sents[i], 4 ); +      score = smooth_bleu( counts, +                           ref_ids.size(), +                           kb->sents[i].size(), N ); +      ScorePair sp( kb->scores[i], score ); +      scores.push_back( sp ); +      //cout << "'" << TD::GetString( ref_ids ) << "' vs '" << TD::GetString( kb->sents[i] ) << "' SCORE=" << score << endl; +      //cout << kb->feats[i] << endl;      } -    lambdas.set_value( FD::Convert("use_shell"), 1 ); -    lambdas.set_value( FD::Convert("use_a"), 1 ); +    //cout << "###" << endl; +    SofiaLearner learner; +    learner.Init( sid, kb->feats, scores ); +    learner.Update(lambdas); +    // initializing learner +    // TODO +    // updating weights +    //lambdas.set_value( FD::Convert("use_shell"), 1 ); +    //lambdas.set_value( FD::Convert("use_a"), 1 );      //print_FD(); +    sid += 1; // TODO does cdec count this already?    } -   +    weights.WriteToFile( "weights-final", true ); +   +  cerr << endl;    return 0;  } -    // next: FMap, ->sofia, ->FMap, -> Weights -    // learner gets all used features (binary! and dense (logprob is sum of logprobs!)) -    // only for those feats with weight > 0 after learning -    // see decoder line 548 - - -/* - * TODO - *  iterate over training set, for t=1..T - *  mapred impl - *   mapper:  main - *   reducer: average weights, global NgramCounts for approx. bleu - *  1st cut: hadoop streaming? - *  batch, non-batch in the mapper (what sofia gets, regenerated Kbest lists) - *  filter kbest yes/no - *  sofia: --eta_type explicit - *  psg preparation source\tref\tpsg - *  set reference for cdec? - *  LM - *   shared? - *   startup? - *  X reference(s) for *bleu!? - *  kbest nicer (do not iterate twice)!? -> shared_ptr - *  multipartite ranking - *  weights! global, per sentence from global, featuremap - *  const decl... - *  sketch: batch/iter options - *  weights.cc: why wv_? - *  --weights cmd line (for iterations): script to call again/hadoop streaming? - *  I do not need to remember features, cdec does - *  resocre hg? - *  do not use Decoder::Decode!? - *  what happens if feature not in FD? 0??? - */ - -/* - * PROBLEMS - *  cdec kbest vs 1best (no -k param) - *  FD, Weights::wv_ grow too large, see utils/weights.cc; decoder/hg.h; decoder/scfg_translator.cc; utils/fdict.cc!? - *  sparse vector instead of vector<double> for weights in Decoder? - *  PhraseModel_* features for psg!? (seem to be generated) - */ - diff --git a/dtrain/dtrain.ini b/dtrain/dtrain.ini deleted file mode 100644 index e69de29b..00000000 --- a/dtrain/dtrain.ini +++ /dev/null diff --git a/dtrain/in b/dtrain/in deleted file mode 100644 index 294d009b..00000000 --- a/dtrain/in +++ /dev/null @@ -1,2 +0,0 @@ -vorrichtung	means	[X] ||| vorrichtung ||| apparatus ||| LogP=-200 ||| 0-0 __NEXT_RULE__ [X] ||| vorrichtung ||| means ||| LogP=-101 ||| 0-0 -eintest	test	[X] ||| eintest ||| test ||| LogP=-200 ||| 0-0 __NEXT_RULE__ [X] ||| eintest ||| xxx ||| LogP=-101 ||| 0-0 diff --git a/dtrain/in.toy b/dtrain/in.toy deleted file mode 100644 index 71b736a6..00000000 --- a/dtrain/in.toy +++ /dev/null @@ -1,2 +0,0 @@ -ich sah ein kleines haus	i saw a little shell	[S] ||| [NP,1] [VP,2] ||| [1] [2] ||| logp=0 __NEXT_RULE__ [NP] ||| ich ||| i ||| logp=0 __NEXT_RULE__ [NP] ||| ein [NN,1] ||| a [1] ||| logp=0 __NEXT_RULE__ [NN] ||| [JJ,1] haus ||| [1] house ||| logp=-0.5 __NEXT_RULE__ [NN] ||| [JJ,1] haus ||| [1] shell ||| logp=-0.5 use_shell=1 __NEXT_RULE__ [JJ] ||| kleines ||| small ||| logp=-0.5 __NEXT_RULE__ [JJ] ||| kleines ||| little ||| logp=-0.9 __NEXT_RULE__ [JJ] ||| grosses ||| big ||| logp=-0.5 __NEXT_RULE__ [JJ] ||| grosses ||| large ||| logp=-1.5 __NEXT_RULE__ [VP] ||| [V,1] [NP,2] ||| [1] [2] ||| logp=0 __NEXT_RULE__ [V] ||| sah ||| saw ||| logp=0 __NEXT_RULE__ [V] ||| fand ||| found ||| logp=0 -ich fand ein grosses haus	i found a little shell	[S] ||| [NP,1] [VP,2] ||| [1] [2] ||| logp=0 __NEXT_RULE__ [NP] ||| ich ||| i ||| logp=0 __NEXT_RULE__ [NP] ||| ein [NN,1] ||| a [1] ||| logp=0 use_a=1 __NEXT_RULE__ [NN] ||| [JJ,1] haus ||| [1] house ||| logp=-1000 __NEXT_RULE__ [NN] ||| [JJ,1] haus ||| [1] shell ||| logp=-1 use_shell=1 __NEXT_RULE__ [JJ] ||| kleines ||| small ||| logp=-0.5 __NEXT_RULE__ [JJ] ||| kleines ||| little ||| logp=-0.9 __NEXT_RULE__ [JJ] ||| grosses ||| big ||| logp=-0.5 __NEXT_RULE__ [JJ] ||| grosses ||| large ||| logp=-1.5 __NEXT_RULE__ [VP] ||| [V,1] [NP,2] ||| [1] [2] ||| logp=0 __NEXT_RULE__ [V] ||| sah ||| saw ||| logp=0 __NEXT_RULE__ [V] ||| fand ||| found ||| logp=0 diff --git a/dtrain/test.sh b/dtrain/test.sh index a0ebb420..ad45bd1e 100755 --- a/dtrain/test.sh +++ b/dtrain/test.sh @@ -1,4 +1,4 @@  #!/bin/sh -./dtrain -c cdec.ini -k 4 < in.toy +./dtrain -c data/cdec.ini -k 4 < data/in.blunsom08 #< data/in.toy | 
