Diffstat (limited to 'dtrain')
-rw-r--r--   dtrain/Makefile.am          4
-rw-r--r--   dtrain/common.h            37
-rw-r--r--   dtrain/dcommon.cc         330
-rw-r--r--   dtrain/dcommon.h          163
-rw-r--r--   dtrain/dtest.cc            47
-rw-r--r--   dtrain/dtrain.cc           86
-rw-r--r--   dtrain/kbestget.h          61
-rw-r--r--   dtrain/learner.h          133
-rw-r--r--   dtrain/score.cc           166
-rw-r--r--   dtrain/score.h            111
-rwxr-xr-x   dtrain/scripts/run.sh       4
-rwxr-xr-x   dtrain/scripts/test.sh      6
-rwxr-xr-x   dtrain/test.sh              4
-rw-r--r--   dtrain/tests.cc           141
-rw-r--r--   dtrain/tests.h             26
-rw-r--r--   dtrain/util.cc             34
-rw-r--r--   dtrain/util.h              28
17 files changed, 774 insertions, 607 deletions
diff --git a/dtrain/Makefile.am b/dtrain/Makefile.am
index c3f14bb0..03e3ccf7 100644
--- a/dtrain/Makefile.am
+++ b/dtrain/Makefile.am
@@ -1,10 +1,10 @@
 # TODO I'm sure I can leave something out.
 bin_PROGRAMS = dtrain dtest
-dtrain_SOURCES = dtrain.cc dcommon.cc
+dtrain_SOURCES = dtrain.cc score.cc tests.cc util.cc
 dtrain_LDADD   = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz
 
-dtest_SOURCES = dtest.cc dcommon.cc
+dtest_SOURCES = dtest.cc score.cc util.cc
 dtest_LDADD   = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz
 
 AM_CPPFLAGS = -W -Wall -Wno-sign-compare -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval
 
diff --git a/dtrain/common.h b/dtrain/common.h
new file mode 100644
index 00000000..cf365d48
--- /dev/null
+++ b/dtrain/common.h
@@ -0,0 +1,37 @@
+#ifndef _DTRAIN_COMMON_H_
+#define _DTRAIN_COMMON_H_
+
+
+#include <sstream>
+#include <iostream>
+#include <vector>
+#include <cassert>
+#include <cmath>
+
+#include "sentence_metadata.h"
+#include "verbose.h"
+#include "viterbi.h"
+#include "kbest.h"
+#include "ff_register.h"
+#include "decoder.h"
+#include "weights.h"
+
+#include <boost/algorithm/string.hpp>
+#include <boost/program_options.hpp>
+
+#include "score.h"
+
+#define DTRAIN_DEFAULT_K 100
+#define DTRAIN_DEFAULT_N 4
+#define DTRAIN_DEFAULT_T 1
+
+#define DTRAIN_DOTOUT 100
+
+
+using namespace std;
+using namespace dtrain;
+namespace po = boost::program_options;
+
+
+#endif
+
diff --git a/dtrain/dcommon.cc b/dtrain/dcommon.cc
deleted file mode 100644
index 6657bed6..00000000
--- a/dtrain/dcommon.cc
+++ /dev/null
@@ -1,330 +0,0 @@
-#include "dcommon.h"
-
-
-
-
-/******************************************************************************
- * NGRAMS
- *
- *
- * make_ngrams
- *
- */
-typedef map<vector<WordID>, size_t> Ngrams;
-Ngrams
-make_ngrams( vector<WordID>& s, size_t N )
-{
-  Ngrams ngrams;
-  vector<WordID> ng;
-  for ( size_t i = 0; i < s.size(); i++ ) {
-    ng.clear();
-    for ( size_t j = i; j < min( i+N, s.size() ); j++ ) {
-      ng.push_back( s[j] );
-      ngrams[ng]++;
-    }
-  }
-  return ngrams;
-}
-
-
-/*
- * ngram_matches
- *
- */
-NgramCounts
-make_ngram_counts( vector<WordID> hyp, vector<WordID> ref, size_t N )
-{
-  Ngrams hyp_ngrams = make_ngrams( hyp, N );
-  Ngrams ref_ngrams = make_ngrams( ref, N );
-  NgramCounts counts( N );
-  Ngrams::iterator it;
-  Ngrams::iterator ti;
-  for ( it = hyp_ngrams.begin(); it != hyp_ngrams.end(); it++ ) {
-    ti = ref_ngrams.find( it->first );
-    if ( ti != ref_ngrams.end() ) {
-      counts.add( it->second, ti->second, it->first.size() - 1 );
-    } else {
-      counts.add( it->second, 0, it->first.size() - 1 );
-    }
-  }
-  return counts;
-}
-
-
-
-
-/******************************************************************************
- * SCORES
- *
- *
- * brevity_penaly
- *
- */
-double
-brevity_penaly( const size_t hyp_len, const size_t ref_len )
-{
-  if ( hyp_len > ref_len ) return 1;
-  return exp( 1 - (double)ref_len/(double)hyp_len );
-}
-
-
-/*
- * bleu
- * as in "BLEU: a Method for Automatic Evaluation of Machine Translation" (Papineni et al. '02)
- * page TODO
- * 0 if for N one of the counts = 0
- */
-double
-bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len,
-      size_t N, vector<float> weights  )
-{
-  if ( hyp_len == 0 || ref_len == 0 ) return 0;
-  if ( ref_len < N ) N = ref_len;
-  float N_ = (float)N;
-  if ( weights.empty() )
-  {
-    for ( size_t i = 0; i < N; i++ ) weights.push_back( 1/N_ );
-  }
-  double sum = 0;
-  for ( size_t i = 0; i < N; i++ ) {
-    if ( counts.clipped[i] == 0 || counts.sum[i] == 0 ) return 0;
-    sum += weights[i] * log( (double)counts.clipped[i] / (double)counts.sum[i] );
-  }
-  return brevity_penaly( hyp_len, ref_len ) * exp( sum );
-}
-
-
-/*
- * stupid_bleu
- * as in "ORANGE: a Method for Evaluating Automatic Evaluation Metrics for Machine Translation (Lin & Och '04)
- * page TODO
- * 0 iff no 1gram match
- */
-double
-stupid_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len,
-             size_t N, vector<float> weights  )
-{
-  if ( hyp_len == 0 || ref_len == 0 ) return 0;
-  if ( ref_len < N ) N = ref_len;
-  float N_ = (float)N;
-  if ( weights.empty() )
-  {
-    for ( size_t i = 0; i < N; i++ ) weights.push_back( 1/N_ );
-  }
-  double sum = 0;
-  float add = 0;
-  for ( size_t i = 0; i < N; i++ ) {
-    if ( i == 1 ) add = 1;
-    sum += weights[i] * log( ((double)counts.clipped[i] + add) / ((double)counts.sum[i] + add) );
-  }
-  return brevity_penaly( hyp_len, ref_len ) * exp( sum );
-}
-
-
-/*
- * smooth_bleu
- * as in "An End-to-End Discriminative Approach to Machine Translation" (Liang et al. '06)
- * page TODO
- * max. 0.9375
- */
-double
-smooth_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len,
-             const size_t N, vector<float> weights  )
-{
-  if ( hyp_len == 0 || ref_len == 0 ) return 0;
-  float N_ = (float)N;
-  if ( weights.empty() )
-  {
-    for ( size_t i = 0; i < N; i++ ) weights.push_back( 1/N_ );
-  }
-  double sum = 0;
-  float j = 1;
-  for ( size_t i = 0; i < N; i++ ) {
-    if ( counts.clipped[i] == 0 || counts.sum[i] == 0) continue;
-    sum += exp((weights[i] * log((double)counts.clipped[i]/(double)counts.sum[i]))) / pow( 2, N_-j+1 );
-    j++;
-  }
-  return brevity_penaly( hyp_len, ref_len ) * sum;
-}
-
-
-/*
- * approx_bleu
- * as in "Online Large-Margin Training for Statistical Machine Translation" (Watanabe et al. '07)
- * page TODO
- *
- */
-double
-approx_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len,
-     const size_t N, vector<float> weights )
-{
-  return bleu( counts, hyp_len, ref_len, N, weights );
-}
-
-
-
-
-/******************************************************************************
- * UTILS
- *
- *
- * register_and_convert
- *
- */
-void
-register_and_convert(const vector<string>& strs, vector<WordID>& ids)
-{
-  vector<string>::const_iterator it;
-  for ( it = strs.begin(); it < strs.end(); it++ ) {
-    ids.push_back( TD::Convert( *it ) );
-  }
-}
-
-
-/*
- * approx_equal
- *
- */
-double
-approx_equal( double x, double y )
-{
-  const double EPSILON = 1E-5;
-  if ( x == 0 ) return fabs( y ) <= EPSILON;
-  if ( y == 0 ) return fabs( x ) <= EPSILON;
-  return fabs( x - y ) / max( fabs(x), fabs(y) ) <= EPSILON;
-}
-
-
-/*
- * print_FD
- *
- */
-void
-print_FD()
-{
-  for ( size_t i = 0; i < FD::NumFeats(); i++ ) cout << FD::Convert(i)<< endl;
-}
-
-
-
-
-/******************************************************************************
- * TESTS
- *
- *
- * test_ngrams
- *
- */
-void
-test_ngrams()
-{
-  cout << "Testing ngrams..." << endl << endl;
-  size_t N = 5;
-  cout << "N = " << N << endl;
-  vector<int> a; // hyp
-  vector<int> b; // ref
-  cout << "a ";
-  for (size_t i = 1; i <= 8; i++) {
-    cout << i << " ";
-    a.push_back(i);
-  }
-  cout << endl << "b ";
-  for (size_t i = 1; i <= 4; i++) {
-    cout << i << " ";
-    b.push_back(i);
-  }
-  cout << endl << endl;
-  NgramCounts c = make_ngram_counts( a, b, N );
-  assert( c.clipped[N-1] == 0 );
-  assert( c.sum[N-1] == 4 );
-  c.print();
-  c += c;
-  cout << endl;
-  c.print();
-  cout << endl;
-}
-
-
-/*
- * test_metrics
- *
- */
-void
-test_metrics()
-{
-  cout << "Testing metrics..." << endl << endl;
-  using namespace boost::assign;
-  vector<string> a, b;
-  vector<double> expect_vanilla, expect_smooth, expect_stupid;
-  a +=              "a a a a", "a a a a", "a",   "a", "b",        "a a a a", "a a",  "a a a", "a b a"; // hyp
-  b +=              "b b b b", "a a a a", "a",   "b", "b b b b",  "a",       "a a",  "a a a", "a b b"; // ref
-  expect_vanilla += 0,         1,         1,      0,  0,          .25,       1,      1,       0;
-  expect_smooth  += 0,          .9375,     .0625, 0,   .00311169, .0441942,   .1875,  .4375,   .161587;
-  expect_stupid  += 0,         1,         1,      0,   .0497871,  .25,       1,      1,        .605707;
-  vector<string> aa, bb;
-  vector<WordID> aai, bbi;
-  double vanilla, smooth, stupid;
-  size_t N = 4;
-  cout << "N = " << N << endl << endl;
-  for ( size_t i = 0; i < a.size(); i++ ) {
-    cout << " hyp: " << a[i] << endl;
-    cout << " ref: " << b[i] << endl;
-    aa.clear(); bb.clear(); aai.clear(); bbi.clear();
-    boost::split( aa, a[i], boost::is_any_of(" ") );
-    boost::split( bb, b[i], boost::is_any_of(" ") );
-    register_and_convert( aa, aai );
-    register_and_convert( bb, bbi );
-    NgramCounts counts = make_ngram_counts( aai, bbi, N );
-    vanilla =        bleu( counts, aa.size(), bb.size(), N);
-    smooth  = smooth_bleu( counts, aa.size(), bb.size(), N);
-    stupid  = stupid_bleu( counts, aa.size(), bb.size(), N);
-    assert( approx_equal(vanilla, expect_vanilla[i]) );
-    assert( approx_equal(smooth, expect_smooth[i]) );
-    assert( approx_equal(stupid, expect_stupid[i]) );
-    cout << setw(14) << "bleu = "      << vanilla << endl;
-    cout << setw(14) << "smooth bleu = " << smooth << endl;
-    cout << setw(14) << "stupid bleu = " << stupid << endl << endl;
-  }
-  cout << endl;
-}
-
-
-/*
- * test_SetWeights
- *
- */
-void
-test_SetWeights()
-{
-  cout << "Testing Weights::SetWeight..." << endl << endl;
-  Weights weights;
-  SparseVector<double> lambdas;
-  weights.InitSparseVector( &lambdas );
-  weights.SetWeight( &lambdas, "test", 0 );
-  weights.SetWeight( &lambdas, "test1", 1 );
-  WordID fid = FD::Convert( "test2" );
-  weights.SetWeight( &lambdas, fid, 2 );
-  string fn = "weights-test";
-  cout << "FD::NumFeats() " << FD::NumFeats() << endl;
-  assert( FD::NumFeats() == 4 );
-  weights.WriteToFile( fn, true );
-  cout << endl;
-}
-
-
-/*
- * run_tests
- *
- */
-void
-run_tests()
-{
-  cout << endl;
-  test_ngrams();
-  cout << endl;
-  test_metrics();
-  cout << endl;
-  test_SetWeights();
-  exit(0);
-}
-
diff --git a/dtrain/dcommon.h b/dtrain/dcommon.h
deleted file mode 100644
index 6df841bb..00000000
--- a/dtrain/dcommon.h
+++ /dev/null
@@ -1,163 +0,0 @@
-#include <sstream>
-#include <iostream>
-#include <vector>
-#include <cassert>
-#include <cmath>
-
-#include "config.h"
-
-#include <boost/shared_ptr.hpp>
-#include <boost/algorithm/string.hpp>
-#include <boost/program_options.hpp>
-#include <boost/program_options/variables_map.hpp>
-
-#include "sentence_metadata.h"
-#include "scorer.h"
-#include "verbose.h"
-#include "viterbi.h"
-#include "hg.h"
-#include "prob.h"
-#include "kbest.h"
-#include "ff_register.h"
-#include "decoder.h"
-#include "filelib.h"
-#include "fdict.h"
-#include "weights.h"
-#include "sparse_vector.h"
-#include "sampler.h"
-
-using namespace std;
-namespace po = boost::program_options;
-
-
-
-
-struct ScorePair
-{
-  ScorePair(double modelscore, double score) : modelscore_(modelscore), score_(score) {}
-  double modelscore_, score_;
-  double GetModelScore() { return modelscore_; }
-  double GetScore() { return score_; }
-};
-typedef vector<ScorePair> Scores;
-
-
-/*
- * KBestGetter
- *
- */
-struct KBestList {
-  vector<SparseVector<double> > feats;
-  vector<vector<WordID> > sents;
-  vector<double> scores;
-};
-struct KBestGetter : public DecoderObserver
-{
-  KBestGetter( const size_t k ) : k_(k) {}
-  const size_t k_;
-  KBestList kb;
-
-  virtual void
-  NotifyTranslationForest(const SentenceMetadata& smeta, Hypergraph* hg)
-  {
-    GetKBest(smeta.GetSentenceID(), *hg);
-  }
-
-  KBestList* GetKBest() { return &kb; }
-
-  void
-  GetKBest(int sent_id, const Hypergraph& forest)
-  {
-    kb.scores.clear();
-    kb.sents.clear();
-    kb.feats.clear();
-    KBest::KBestDerivations<vector<WordID>, ESentenceTraversal> kbest( forest, k_ );
-    for ( size_t i = 0; i < k_; ++i ) {
-      const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal>::Derivation* d =
-        kbest.LazyKthBest( forest.nodes_.size() - 1, i );
-      if (!d) break;
-      kb.sents.push_back( d->yield);
-      kb.feats.push_back( d->feature_values );
-      kb.scores.push_back( d->score );
-    }
-  }
-};
-
-
-/*
- * NgramCounts
- *
- */
-struct NgramCounts
-{
-  NgramCounts( const size_t N ) : N_( N ) {
-    reset();
-  }
-  size_t N_;
-  map<size_t, size_t> clipped;
-  map<size_t, size_t> sum;
-
-  void
-  operator+=( const NgramCounts& rhs )
-  {
-    assert( N_ == rhs.N_ );
-    for ( size_t i = 0; i < N_; i++ ) {
-      this->clipped[i] += rhs.clipped.find(i)->second;
-      this->sum[i] += rhs.sum.find(i)->second;
-    }
-  }
-
-  void
-  add( size_t count, size_t ref_count, size_t i )
-  {
-    assert( i < N_ );
-    if ( count > ref_count ) {
-      clipped[i] += ref_count;
-      sum[i] += count;
-    } else {
-      clipped[i] += count;
-      sum[i] += count;
-    }
-  }
-
-  void
-  reset()
-  {
-    size_t i;
-    for ( i = 0; i < N_; i++ ) {
-      clipped[i] = 0;
-      sum[i] = 0;
-    }
-  }
-
-  void
-  print()
-  {
-    for ( size_t i = 0; i < N_; i++ ) {
-      cout << i+1 << "grams (clipped):\t" << clipped[i] << endl;
-      cout << i+1 << "grams:\t\t\t" << sum[i] << endl;
-    }
-  }
-};
-
-
-
-
-typedef map<vector<WordID>, size_t> Ngrams;
-Ngrams make_ngrams( vector<WordID>& s, size_t N );
-NgramCounts make_ngram_counts( vector<WordID> hyp, vector<WordID> ref, size_t N );
-double brevity_penaly( const size_t hyp_len, const size_t ref_len );
-double bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len, size_t N, vector<float> weights = vector<float>() );
-double stupid_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len, size_t N, vector<float> weights = vector<float>() );
-double smooth_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len, const size_t N, vector<float> weights = vector<float>() );
-double approx_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len, const size_t N, vector<float> weights = vector<float>() );
-void register_and_convert(const vector<string>& strs, vector<WordID>& ids);
-void print_FD();
-void run_tests();
-void test_SetWeights();
-#include <boost/assign/std/vector.hpp>
-#include <iomanip>
-void test_metrics();
-double approx_equal( double x, double y );
-void test_ngrams();
-
diff --git a/dtrain/dtest.cc b/dtrain/dtest.cc
index 5ae473e6..d1ff30c0 100644
--- a/dtrain/dtest.cc
+++ b/dtrain/dtest.cc
@@ -1,6 +1,6 @@
-#include "dcommon.h"
-
-
+#include "common.h"
+#include "kbestget.h"
+#include "util.h"
 
 
 /*
@@ -14,10 +14,10 @@ init(int argc, char** argv, po::variables_map* conf)
   bool q;
   po::options_description opts( "Options" );
   opts.add_options()
-    ( "decoder-config,c", po::value<string>(),                      "configuration file for cdec" )
-    ( "weights,w",        po::value<string>(),                      "weights file")
-    ( "ngrams,n",         po::value<int>(&N)->default_value(4),     "N for Ngrams (default 5)" )
-    ( "quiet,q",          po::value<bool>(&q)->default_value(true), "do not output translations" );
+    ( "decoder-config,c", po::value<string>(),                              "configuration file for cdec" )
+    ( "weights,w",        po::value<string>(),                                             "weights file" )
+    ( "ngrams,n",         po::value<int>(&N)->default_value(DTRAIN_DEFAULT_N), "N for Ngrams (default 5)" )
+    ( "quiet,q",          po::value<bool>(&q)->default_value(true),          "do not output translations" );
   po::options_description cmdline_options;
   cmdline_options.add(opts);
   po::store( parse_command_line(argc, argv, cmdline_options), *conf );
@@ -57,17 +57,17 @@ main(int argc, char** argv)
   vector<string> strs, ref_strs;
   vector<WordID> ref_ids;
   string in, psg;
-  size_t sid = 0;
-  double overall = 0.0;
+  size_t sn = 0;
+  double overall  = 0.0;
   double overall1 = 0.0;
   double overall2 = 0.0;
-  cerr << "(a dot equals 100 lines of input)" << endl;
+  cerr << "(A dot equals " << DTRAIN_DOTOUT << " lines of input.)" << endl;
   while( getline(cin, in) ) {
-    if ( (sid+1) % 100 == 0 ) {
+    if ( (sn+1) % DTRAIN_DOTOUT == 0 ) {
         cerr << ".";
-        if ( (sid+1)%1000 == 0 ) cerr << endl;
+        if ( (sn+1) % (20*DTRAIN_DOTOUT) == 0 ) cerr << endl;
     }
-    //if ( sid > 5000 ) break;
+    //if ( sn > 5000 ) break;
     strs.clear();
     boost::split( strs, in, boost::is_any_of("\t") );
     // grammar
@@ -80,25 +80,22 @@ main(int argc, char** argv)
     boost::split( ref_strs, strs[1], boost::is_any_of(" ") );
     register_and_convert( ref_strs, ref_ids );
     // scoring kbest
-    double score = 0.0;
+    double score  = 0.0;
     double score1 = 0.0;
     double score2 = 0.0;
     NgramCounts counts = make_ngram_counts( ref_ids, kb->sents[0], 4 );
-    score = smooth_bleu( counts,
-                         ref_ids.size(),
-                         kb->sents[0].size(), N );
-    score1 = stupid_bleu( counts, ref_ids.size(), kb->sents[0].size(), N) ;
-    score2 = bleu( counts, ref_ids.size(), kb->sents[0].size(), N );
-    //if ( ! quiet )
-    cout << TD::GetString( kb->sents[0] ) << endl;
+    score =  smooth_bleu( counts, ref_ids.size(), kb->sents[0].size(), N );
+    score1 = stupid_bleu( counts, ref_ids.size(), kb->sents[0].size(), N );
+    score2 =        bleu( counts, ref_ids.size(), kb->sents[0].size(), N );
+    if ( ! quiet ) cout << TD::GetString( kb->sents[0] ) << endl;
     overall += score;
     overall1 += score1;
     overall2 += score2;
-    sid += 1;
+    sn += 1;
   }
-  cerr << "Average score (smooth): " << overall/(double)(sid+1) << endl;
-  cerr << "Average score (stupid): " << overall1/(double)(sid+1) << endl;
-  cerr << "Average score (normal): " << overall2/(double)(sid+1) << endl;
+  cerr << "Average score (smooth) : " << overall/(double)(sn+1) << endl;
+  cerr << "Average score (stupid) : " << overall1/(double)(sn+1) << endl;
+  cerr << "Average score (vanilla): " << overall2/(double)(sn+1) << endl;
   cerr << endl;
 
   return 0;
diff --git a/dtrain/dtrain.cc b/dtrain/dtrain.cc
index 373458e8..16b83a70 100644
--- a/dtrain/dtrain.cc
+++ b/dtrain/dtrain.cc
@@ -1,6 +1,11 @@
-#include "dcommon.h"
+#include "common.h"
+#include "kbestget.h"
 #include "learner.h"
+#include "util.h"
+#ifdef DTRAIN_DEBUG
+#include "tests.h"
+#endif
 
 
 
 
@@ -12,20 +17,33 @@ bool
 init(int argc, char** argv, po::variables_map* conf)
 {
   po::options_description opts( "Options" );
+  size_t k, N, T;
+  // TODO scoring metric as parameter/in config
   opts.add_options()
-    ( "decoder-config,c", po::value<string>(), "configuration file for cdec" )
-    ( "kbest,k",          po::value<size_t>(), "k for kbest" )
-    ( "ngrams,n",         po::value<int>(),    "n for Ngrams" )
-    ( "filter,f",         po::value<string>(), "filter kbest list" )
-    ( "test",                                       "run tests and exit");
+    ( "decoder-config,c", po::value<string>(),                          "configuration file for cdec" )
+    ( "kbest,k",          po::value<size_t>(&k)->default_value(DTRAIN_DEFAULT_K),       "k for kbest" )
+    ( "ngrams,n",         po::value<size_t>(&N)->default_value(DTRAIN_DEFAULT_N),      "n for Ngrams" )
+    ( "filter,f",         po::value<string>(),                                    "filter kbest list" ) // FIXME
+    ( "epochs,t",         po::value<size_t>(&T)->default_value(DTRAIN_DEFAULT_T), "# of iterations T" )
+#ifndef DTRAIN_DEBUG
+    ;
+#else
+    ( "test",                                  "run tests and exit");
+#endif
   po::options_description cmdline_options;
   cmdline_options.add(opts);
   po::store( parse_command_line(argc, argv, cmdline_options), *conf );
   po::notify( *conf );
-  if ( ! (conf->count("decoder-config") || conf->count("test")) ) {
+  if ( ! conf->count("decoder-config") ) {
     cerr << cmdline_options << endl;
     return false;
   }
+  #ifdef DTRAIN_DEBUG
+  if ( ! conf->count("test") ) {
+    cerr << cmdline_options << endl;
+    return false;
+  }
+  #endif
 
   return true;
 }
@@ -40,19 +58,21 @@ main(int argc, char** argv)
   SetSilent(true);
   po::variables_map conf;
   if (!init(argc, argv, &conf)) return 1;
+#ifdef DTRAIN_DEBUG
   if ( conf.count("test") ) run_tests();
+#endif
   register_feature_functions();
   size_t k = conf["kbest"].as<size_t>();
-  ReadFile ini_rf(conf["decoder-config"].as<string>());
+  ReadFile ini_rf( conf["decoder-config"].as<string>() );
   Decoder decoder(ini_rf.stream());
-  KBestGetter observer(k);
-  size_t N = 3; // TODO as parameter/in config
+  KBestGetter observer( k );
+  size_t N = conf["ngrams"].as<size_t>();
+  size_t T = conf["epochs"].as<size_t>();
 
-  // TODO scoring metric as parameter/in config
   // for approx. bleu
-  NgramCounts global_counts(N);
-  size_t global_hyp_len = 0;
-  size_t global_ref_len = 0;
+  //NgramCounts global_counts( N );
+  //size_t global_hyp_len = 0;
+  //size_t global_ref_len = 0;
 
   Weights weights;
   SparseVector<double> lambdas;
@@ -62,20 +82,24 @@ main(int argc, char** argv)
   vector<string> strs, ref_strs;
   vector<WordID> ref_ids;
   string in, psg;
-  size_t sid = 0;
-  cerr << "(1 dot equals 100 lines of input)" << endl;
+  size_t sn = 0;
+  cerr << "(A dot equals " << DTRAIN_DOTOUT << " lines of input.)" << endl;
+
+  for ( size_t t = 0; t < T; t++ )
+  {
+
   while( getline(cin, in) ) {
-    if ( (sid+1) % 100 == 0 ) {
+    if ( (sn+1) % DTRAIN_DOTOUT == 0 ) {
         cerr << ".";
-        if ( (sid+1)%1000 == 0 ) cerr << endl;
+        if ( (sn+1) % (20*DTRAIN_DOTOUT) == 0 ) cerr << endl;
     }
-    //if ( sid > 5000 ) break;
+    //if ( sn > 5000 ) break;
     // weights
     dense_weights.clear();
    weights.InitFromVector( lambdas );
     weights.InitVector( &dense_weights );
     decoder.SetWeights( dense_weights );
-    // handling input..
+    // handling input
     strs.clear();
     boost::split( strs, in, boost::is_any_of("\t") );
     // grammar
@@ -89,11 +113,11 @@ main(int argc, char** argv)
     register_and_convert( ref_strs, ref_ids );
     // scoring kbest
     double score = 0;
-    size_t cand_len = 0;
+    //size_t cand_len = 0;
     Scores scores;
     for ( size_t i = 0; i < kb->sents.size(); i++ ) {
       NgramCounts counts = make_ngram_counts( ref_ids, kb->sents[i], N );
-      if ( i == 0) {
+      /*if ( i == 0 ) {
         global_counts += counts;
         global_hyp_len += kb->sents[i].size();
         global_ref_len += ref_ids.size();
@@ -101,24 +125,28 @@ main(int argc, char** argv)
       } else {
         cand_len = kb->sents[i].size();
       }
-      //score = bleu( global_counts,
-      //                     global_ref_len,
-       //                    global_hyp_len + cand_len, N );
+      score = bleu( global_counts,
+                    global_ref_len,
+                     global_hyp_len + cand_len, N );*/
       score = bleu ( counts, ref_ids.size(), kb->sents[i].size(), N );
       ScorePair sp( kb->scores[i], score );
       scores.push_back( sp );
-      //cout << "'" << TD::GetString( ref_ids ) << "' vs '" << TD::GetString( kb->sents[i] ) << "' SCORE=" << score << endl;
+      //cout << "'" << TD::GetString( ref_ids ) << "' vs '";
+      //cout << TD::GetString( kb->sents[i] ) << "' SCORE=" << score << endl;
      //cout << kb->feats[i] << endl;
     }
     // learner
     SofiaLearner learner;
-    learner.Init( sid, kb->feats, scores );
+    learner.Init( sn, kb->feats, scores );
     learner.Update(lambdas);
     //print_FD();
-    sid += 1; // TODO does cdec count this already?
+    sn += 1;
   }
+
+  } // outer loop
+
   cerr << endl;
-  weights.WriteToFile( "data/weights-final-normalx", true );
+  weights.WriteToFile( "data/weights-vanilla", false );
 
   return 0;
 }
diff --git a/dtrain/kbestget.h b/dtrain/kbestget.h
new file mode 100644
index 00000000..6d93d3b7
--- /dev/null
+++ b/dtrain/kbestget.h
@@ -0,0 +1,61 @@
+#ifndef _DTRAIN_KBESTGET_H_
+#define _DTRAIN_KBESTGET_H_
+
+
+namespace dtrain
+{
+
+
+/*
+ * KBestList
+ *
+ */
+struct KBestList {
+  vector<SparseVector<double> > feats;
+  vector<vector<WordID> > sents;
+  vector<double> scores;
+};
+
+
+/*
+ * KBestGetter
+ *
+ */
+struct KBestGetter : public DecoderObserver
+{
+  KBestGetter( const size_t k ) : k_(k) {}
+  const size_t k_;
+  KBestList kb;
+
+  virtual void
+  NotifyTranslationForest(const SentenceMetadata& smeta, Hypergraph* hg)
+  {
+    GetKBest(smeta.GetSentenceID(), *hg);
+  }
+
+  KBestList* GetKBest() { return &kb; }
+
+  void
+  GetKBest(int sent_id, const Hypergraph& forest)
+  {
+    kb.scores.clear();
+    kb.sents.clear();
+    kb.feats.clear();
+    KBest::KBestDerivations<vector<WordID>, ESentenceTraversal> kbest( forest, k_ );
+    for ( size_t i = 0; i < k_; ++i ) {
+      const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal>::Derivation* d =
+        kbest.LazyKthBest( forest.nodes_.size() - 1, i );
+      if (!d) break;
+      kb.sents.push_back( d->yield);
+      kb.feats.push_back( d->feature_values );
+      kb.scores.push_back( d->score );
+    }
+  }
+};
+
+
+} // namespace
+
+
+#endif
+
diff --git a/dtrain/learner.h b/dtrain/learner.h
index a953284d..038749e2 100644
--- a/dtrain/learner.h
+++ b/dtrain/learner.h
@@ -1,71 +1,96 @@
-/*class Learnerx
+#ifndef _DTRAIN_LEARNER_H_
+#define _DTRAIN_LEARNER_H_
+
+#include <string>
+#include <vector>
+#include <map>
+
+#include "sparse_vector.h"
+#include "score.h"
+
+
+namespace dtrain
+{
+
+
+class Learner
 {
   public:
-    virtual void Init(const vector<SparseVector<double> >& kbest, const Scores& scores) {};
-    virtual void Update(SparseVector<double>& lambdas);
-};*/
+    virtual void Init( const vector<SparseVector<double> >& kbest, const Scores& scores,
+                       const bool invert_score = false ) {};
+    virtual void Update( SparseVector<double>& lambdas ) {};
+};
 
-class SofiaLearner //: public Learnerx FIXME
+
+class SofiaLearner : public Learner
 {
-  // TODO bool invert_score
   public:
-  void
-  Init( const size_t sid, const vector<SparseVector<double> >& kbest, /*const*/ Scores& scores )
-  {
-    assert( kbest.size() == scores.size() );
-    ofstream o;
-    //unlink( "/tmp/sofia_ml_training_stupid" );
-    o.open( "/tmp/sofia_ml_training_normalx", ios::trunc ); // TODO randomize, filename exists
-    int fid = 0;
-    map<int,int>::iterator ff;
+    void
+    Init( const size_t sid, const vector<SparseVector<double> >& kbest, /*const FIXME*/ Scores& scores,
+          const bool invert_score = false )
+    {
+      assert( kbest.size() == scores.size() );
+      ofstream o;
+      unlink( "/tmp/sofia_ml_training" );
+      o.open( "/tmp/sofia_ml_training", ios::trunc ); // TODO randomize, filename exists
+      int fid = 0;
+      map<int,int>::iterator ff;
 
-    for ( size_t k = 0; k < kbest.size(); ++k ) {
-      map<int,double> m;
-      SparseVector<double>::const_iterator it = kbest[k].begin();
-      o << scores[k].GetScore();
-      for ( ; it != kbest[k].end(); ++it) {
-        ff = fmap.find( it->first );
-        if ( ff == fmap.end() ) {
-          fmap.insert( pair<int,int>(it->first, fid) );
-          fmap1.insert( pair<int,int>(fid, it->first) );
-          fid++;
+      double score;
+      for ( size_t k = 0; k < kbest.size(); ++k ) {
+        map<int,double> m;
+        SparseVector<double>::const_iterator it = kbest[k].begin();
+        score = scores[k].GetScore();
+        if ( invert_score ) score = -score;
+        o << score;
+        for ( ; it != kbest[k].end(); ++it ) {
+          ff = fmap.find( it->first );
+          if ( ff == fmap.end() ) {
+            fmap.insert( pair<int,int>(it->first, fid) );
+            fmap1.insert( pair<int,int>(fid, it->first) );
+            fid++;
+          }
+          m.insert( pair<int,double>(fmap[it->first], it->second) );
        }
-        m.insert(pair<int,double>(fmap[it->first], it->second));
-      }
-      map<int,double>::iterator ti = m.begin();
-      for ( ; ti != m.end(); ++ti ) {
-        o << " " << ti->first << ":" << ti->second;
+        map<int,double>::iterator ti = m.begin();
+        for ( ; ti != m.end(); ++ti ) {
+          o << " " << ti->first << ":" << ti->second;
+        }
+        o << endl;
       }
-      o << endl;
+      o.close();
     }
-    o.close();
-  }
 
-  void
-  Update(SparseVector<double>& lambdas)
-  {
-    string call = "./sofia-ml --training_file /tmp/sofia_ml_training_normalx --model_out /tmp/sofia_ml_model_normalx --loop_type stochastic --lambda 100 --dimensionality ";
-    std::stringstream out;
-    out << fmap.size();
-    call += out.str();
-    call += " &>/dev/null";
-    system ( call.c_str() );
-    ifstream i;
-    //unlink( "/tmp/sofia_ml_model_stupid" );
-    i.open( "/tmp/sofia_ml_model_normalx", ios::in );
-    string model;
-    getline( i, model );
-    vector<string> strs;
-    boost::split( strs, model, boost::is_any_of(" ") );
-    int j = 0;
-    for ( vector<string>::iterator it = strs.begin(); it != strs.end(); ++it ) {
-      lambdas.set_value(fmap1[j], atof( it->c_str() ) );
-      j++;
+    void
+    Update(SparseVector<double>& lambdas)
+    {
+      string call = "./sofia-ml --training_file /tmp/sofia_ml_training --model_out /tmp/sofia_ml_model --loop_type stochastic --lambda 100 --dimensionality ";
+      std::stringstream out;
+      out << fmap.size();
+      call += out.str();
+      call += " &>/dev/null";
+      system ( call.c_str() );
+      ifstream i;
+      unlink( "/tmp/sofia_ml_model" );
+      i.open( "/tmp/sofia_ml_model", ios::in );
+      string model;
+      getline( i, model );
+      vector<string> strs;
+      boost::split( strs, model, boost::is_any_of(" ") );
+      int j = 0;
+      for ( vector<string>::iterator it = strs.begin(); it != strs.end(); ++it ) {
+        lambdas.set_value(fmap1[j], atof( it->c_str() ) );
+        j++;
+      }
     }
-  }
 
   private:
     map<int,int> fmap;
     map<int,int> fmap1;
 };
+
+
+} // namespace
+
+#endif
+
diff --git a/dtrain/score.cc b/dtrain/score.cc
new file mode 100644
index 00000000..72e6db71
--- /dev/null
+++ b/dtrain/score.cc
@@ -0,0 +1,166 @@
+#include "score.h"
+
+
+namespace dtrain
+{
+
+
+/******************************************************************************
+ * NGRAMS
+ *
+ *
+ * make_ngrams
+ *
+ */
+typedef map<vector<WordID>, size_t> Ngrams;
+Ngrams
+make_ngrams( vector<WordID>& s, size_t N )
+{
+  Ngrams ngrams;
+  vector<WordID> ng;
+  for ( size_t i = 0; i < s.size(); i++ ) {
+    ng.clear();
+    for ( size_t j = i; j < min( i+N, s.size() ); j++ ) {
+      ng.push_back( s[j] );
+      ngrams[ng]++;
+    }
+  }
+  return ngrams;
+}
+
+
+/*
+ * ngram_matches
+ *
+ */
+NgramCounts
+make_ngram_counts( vector<WordID> hyp, vector<WordID> ref, size_t N )
+{
+  Ngrams hyp_ngrams = make_ngrams( hyp, N );
+  Ngrams ref_ngrams = make_ngrams( ref, N );
+  NgramCounts counts( N );
+  Ngrams::iterator it;
+  Ngrams::iterator ti;
+  for ( it = hyp_ngrams.begin(); it != hyp_ngrams.end(); it++ ) {
+    ti = ref_ngrams.find( it->first );
+    if ( ti != ref_ngrams.end() ) {
+      counts.add( it->second, ti->second, it->first.size() - 1 );
+    } else {
+      counts.add( it->second, 0, it->first.size() - 1 );
+    }
+  }
+  return counts;
+}
+
+
+/******************************************************************************
+ * SCORES
+ *
+ *
+ * brevity_penaly
+ *
+ */
+double
+brevity_penaly( const size_t hyp_len, const size_t ref_len )
+{
+  if ( hyp_len > ref_len ) return 1;
+  return exp( 1 - (double)ref_len/(double)hyp_len );
+}
+
+
+/*
+ * bleu
+ * as in "BLEU: a Method for Automatic Evaluation of Machine Translation" (Papineni et al. '02)
+ * page TODO
+ * 0 if for N one of the counts = 0
+ */
+double
+bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len,
+      size_t N, vector<float> weights  )
+{
+  if ( hyp_len == 0 || ref_len == 0 ) return 0;
+  if ( ref_len < N ) N = ref_len;
+  float N_ = (float)N;
+  if ( weights.empty() )
+  {
+    for ( size_t i = 0; i < N; i++ ) weights.push_back( 1/N_ );
+  }
+  double sum = 0;
+  for ( size_t i = 0; i < N; i++ ) {
+    if ( counts.clipped[i] == 0 || counts.sum[i] == 0 ) return 0;
+    sum += weights[i] * log( (double)counts.clipped[i] / (double)counts.sum[i] );
+  }
+  return brevity_penaly( hyp_len, ref_len ) * exp( sum );
+}
+
+
+/*
+ * stupid_bleu
+ * as in "ORANGE: a Method for Evaluating Automatic Evaluation Metrics for Machine Translation (Lin & Och '04)
+ * page TODO
+ * 0 iff no 1gram match
+ */
+double
+stupid_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len,
+             size_t N, vector<float> weights  )
+{
+  if ( hyp_len == 0 || ref_len == 0 ) return 0;
+  if ( ref_len < N ) N = ref_len;
+  float N_ = (float)N;
+  if ( weights.empty() )
+  {
+    for ( size_t i = 0; i < N; i++ ) weights.push_back( 1/N_ );
+  }
+  double sum = 0;
+  float add = 0;
+  for ( size_t i = 0; i < N; i++ ) {
+    if ( i == 1 ) add = 1;
+    sum += weights[i] * log( ((double)counts.clipped[i] + add) / ((double)counts.sum[i] + add) );
+  }
+  return brevity_penaly( hyp_len, ref_len ) * exp( sum );
+}
+
+
+/*
+ * smooth_bleu
+ * as in "An End-to-End Discriminative Approach to Machine Translation" (Liang et al. '06)
+ * page TODO
+ * max. 0.9375
+ */
+double
+smooth_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len,
+             const size_t N, vector<float> weights  )
+{
+  if ( hyp_len == 0 || ref_len == 0 ) return 0;
+  float N_ = (float)N;
+  if ( weights.empty() )
+  {
+    for ( size_t i = 0; i < N; i++ ) weights.push_back( 1/N_ );
+  }
+  double sum = 0;
+  float j = 1;
+  for ( size_t i = 0; i < N; i++ ) {
+    if ( counts.clipped[i] == 0 || counts.sum[i] == 0) continue;
+    sum += exp((weights[i] * log((double)counts.clipped[i]/(double)counts.sum[i]))) / pow( 2, N_-j+1 );
+    j++;
+  }
+  return brevity_penaly( hyp_len, ref_len ) * sum;
+}
+
+
+/*
+ * approx_bleu
+ * as in "Online Large-Margin Training for Statistical Machine Translation" (Watanabe et al. '07)
+ * page TODO
+ *
+ */
+double
+approx_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len,
+     const size_t N, vector<float> weights )
+{
+  return bleu( counts, hyp_len, ref_len, N, weights );
+}
+
+
+} // namespace
+
diff --git a/dtrain/score.h b/dtrain/score.h
new file mode 100644
index 00000000..e9130e18
--- /dev/null
+++ b/dtrain/score.h
@@ -0,0 +1,111 @@
+#ifndef _DTRAIN_SCORE_H_
+#define _DTRAIN_SCORE_H_
+
+
+#include <iostream>
+#include <vector>
+#include <map>
+#include <cassert>
+#include <cmath>
+
+#include "wordid.h"
+
+using namespace std;
+
+
+namespace dtrain
+{
+
+
+/*
+ * ScorePair
+ *
+ */
+struct ScorePair
+{
+  ScorePair(double modelscore, double score) : modelscore_(modelscore), score_(score) {}
+  double modelscore_, score_;
+  double GetModelScore() { return modelscore_; }
+  double GetScore() { return score_; }
+};
+
+typedef vector<ScorePair> Scores;
+
+
+/*
+ * NgramCounts
+ *
+ */
+struct NgramCounts
+{
+  NgramCounts( const size_t N ) : N_( N ) {
+    reset();
+  }
+  size_t N_;
+  map<size_t, size_t> clipped;
+  map<size_t, size_t> sum;
+
+  void
+  operator+=( const NgramCounts& rhs )
+  {
+    assert( N_ == rhs.N_ );
+    for ( size_t i = 0; i < N_; i++ ) {
+      this->clipped[i] += rhs.clipped.find(i)->second;
+      this->sum[i] += rhs.sum.find(i)->second;
+    }
+  }
+
+  void
+  add( size_t count, size_t ref_count, size_t i )
+  {
+    assert( i < N_ );
+    if ( count > ref_count ) {
+      clipped[i] += ref_count;
+      sum[i] += count;
+    } else {
+      clipped[i] += count;
+      sum[i] += count;
+    }
+  }
+
+  void
+  reset()
+  {
+    size_t i;
+    for ( i = 0; i < N_; i++ ) {
+      clipped[i] = 0;
+      sum[i] = 0;
+    }
+  }
+
+  void
+  print()
+  {
+    for ( size_t i = 0; i < N_; i++ ) {
+      cout << i+1 << "grams (clipped):\t" << clipped[i] << endl;
+      cout << i+1 << "grams:\t\t\t" << sum[i] << endl;
+    }
+  }
+};
+
+
+typedef map<vector<WordID>, size_t> Ngrams;
+Ngrams make_ngrams( vector<WordID>& s, size_t N );
+NgramCounts make_ngram_counts( vector<WordID> hyp, vector<WordID> ref, size_t N );
+
+double brevity_penaly( const size_t hyp_len, const size_t ref_len );
+double bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len, const size_t N,
+             vector<float> weights = vector<float>() );
+double stupid_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len, size_t N,
+                    vector<float> weights = vector<float>() );
+double smooth_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len, const size_t N,
+                    vector<float> weights = vector<float>() );
+double approx_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len, const size_t N,
+                    vector<float> weights = vector<float>() );
+
+
+} // namespace
+
+
+#endif
+
diff --git a/dtrain/scripts/run.sh b/dtrain/scripts/run.sh
new file mode 100755
index 00000000..f2b6d600
--- /dev/null
+++ b/dtrain/scripts/run.sh
@@ -0,0 +1,4 @@
+#!/bin/sh
+
+./dtrain -c ./data/cdec.ini -k 200 -n 3 -t 10 < ./data/in.blunsom08 #< data/in.toy
+
diff --git a/dtrain/scripts/test.sh b/dtrain/scripts/test.sh
new file mode 100755
index 00000000..3639dfe7
--- /dev/null
+++ b/dtrain/scripts/test.sh
@@ -0,0 +1,6 @@
+#!/bin/sh
+
+EXP=$1
+#head -5000
+cat ./data/in.blunsom08 | ./dtest -q false -c ./data/cdec.ini -w ./data/weights-$EXP 2> ./output/err.$EXP > ./output/out.$EXP
+
diff --git a/dtrain/test.sh b/dtrain/test.sh
deleted file mode 100755
index bc318ae7..00000000
--- a/dtrain/test.sh
+++ /dev/null
@@ -1,4 +0,0 @@
-#!/bin/sh
-
-./dtrain -c data/cdec.ini -k 200 < data/in.blunsom08 #< data/in.toy
-
diff --git a/dtrain/tests.cc b/dtrain/tests.cc
new file mode 100644
index 00000000..997eafbb
--- /dev/null
+++ b/dtrain/tests.cc
@@ -0,0 +1,141 @@
+#include "tests.h"
+
+
+namespace dtrain
+{
+
+
+/*
+ * approx_equal
+ *
+ */
+double
+approx_equal( double x, double y )
+{
+  const double EPSILON = 1E-5;
+  if ( x == 0 ) return fabs( y ) <= EPSILON;
+  if ( y == 0 ) return fabs( x ) <= EPSILON;
+  return fabs( x - y ) / max( fabs(x), fabs(y) ) <= EPSILON;
+}
+
+
+/*
+ * test_ngrams
+ *
+ */
+void
+test_ngrams()
+{
+  cout << "Testing ngrams..." << endl << endl;
+  size_t N = 5;
+  cout << "N = " << N << endl;
+  vector<int> a; // hyp
+  vector<int> b; // ref
+  cout << "a ";
+  for (size_t i = 1; i <= 8; i++) {
+    cout << i << " ";
+    a.push_back(i);
+  }
+  cout << endl << "b ";
+  for (size_t i = 1; i <= 4; i++) {
+    cout << i << " ";
+    b.push_back(i);
+  }
+  cout << endl << endl;
+  NgramCounts c = make_ngram_counts( a, b, N );
+  assert( c.clipped[N-1] == 0 );
+  assert( c.sum[N-1] == 4 );
+  c.print();
+  c += c;
+  cout << endl;
+  c.print();
+  cout << endl;
+}
+
+
+/*
+ * test_metrics
+ *
+ */
+void
+test_metrics()
+{
+  cout << "Testing metrics..." << endl << endl;
+  using namespace boost::assign;
+  vector<string> a, b;
+  vector<double> expect_vanilla, expect_smooth, expect_stupid;
+  a +=              "a a a a", "a a a a", "a",   "a", "b",        "a a a a", "a a",  "a a a", "a b a"; // hyp
+  b +=              "b b b b", "a a a a", "a",   "b", "b b b b",  "a",       "a a",  "a a a", "a b b"; // ref
+  expect_vanilla += 0,         1,         1,      0,  0,          .25,       1,      1,       0;
+  expect_smooth  += 0,          .9375,     .0625, 0,   .00311169, .0441942,   .1875,  .4375,   .161587;
+  expect_stupid  += 0,         1,         1,      0,   .0497871,  .25,       1,      1,        .605707;
+  vector<string> aa, bb;
+  vector<WordID> aai, bbi;
+  double vanilla, smooth, stupid;
+  size_t N = 4;
+  cout << "N = " << N << endl << endl;
+  for ( size_t i = 0; i < a.size(); i++ ) {
+    cout << " hyp: " << a[i] << endl;
+    cout << " ref: " << b[i] << endl;
+    aa.clear(); bb.clear(); aai.clear(); bbi.clear();
+    boost::split( aa, a[i], boost::is_any_of(" ") );
+    boost::split( bb, b[i], boost::is_any_of(" ") );
+    register_and_convert( aa, aai );
+    register_and_convert( bb, bbi );
+    NgramCounts counts = make_ngram_counts( aai, bbi, N );
+    vanilla =        bleu( counts, aa.size(), bb.size(), N);
+    smooth  = smooth_bleu( counts, aa.size(), bb.size(), N);
+    stupid  = stupid_bleu( counts, aa.size(), bb.size(), N);
+    assert( approx_equal(vanilla, expect_vanilla[i]) );
+    assert( approx_equal(smooth, expect_smooth[i]) );
+    assert( approx_equal(stupid, expect_stupid[i]) );
+    cout << setw(14) << "bleu = "      << vanilla << endl;
+    cout << setw(14) << "smooth bleu = " << smooth << endl;
+    cout << setw(14) << "stupid bleu = " << stupid << endl << endl;
+  }
+  cout << endl;
+}
+
+
+/*
+ * test_SetWeights
+ *
+ */
+void
+test_SetWeights()
+{
+  cout << "Testing Weights::SetWeight..." << endl << endl;
+  Weights weights;
+  SparseVector<double> lambdas;
+  weights.InitSparseVector( &lambdas );
+  weights.SetWeight( &lambdas, "test", 0 );
+  weights.SetWeight( &lambdas, "test1", 1 );
+  WordID fid = FD::Convert( "test2" );
+  weights.SetWeight( &lambdas, fid, 2 );
+  string fn = "weights-test";
+  cout << "FD::NumFeats() " << FD::NumFeats() << endl;
+  assert( FD::NumFeats() == 4 );
+  weights.WriteToFile( fn, true );
+  cout << endl;
+}
+
+
+/*
+ * run_tests
+ *
+ */
+void
+run_tests()
+{
+  cout << endl;
+  test_ngrams();
+  cout << endl;
+  test_metrics();
+  cout << endl;
+  test_SetWeights();
+  exit(0);
+}
+
+
+} // namespace
+
diff --git a/dtrain/tests.h b/dtrain/tests.h
new file mode 100644
index 00000000..9853e3c3
--- /dev/null
+++ b/dtrain/tests.h
@@ -0,0 +1,26 @@
+#ifndef _DTRAIN_TESTS_H_
+#define _DTRAIN_TESTS_H_
+
+#include <iomanip>
+#include <boost/assign/std/vector.hpp>
+
+#include "common.h"
+#include "util.h"
+
+
+namespace dtrain
+{
+
+
+double approx_equal( double x, double y );
+void test_ngrams();
+void test_metrics();
+void test_SetWeights();
+void run_tests();
+
+
+} // namespace
+
+
+#endif
+
diff --git a/dtrain/util.cc b/dtrain/util.cc
new file mode 100644
index 00000000..7b3bbe3d
--- /dev/null
+++ b/dtrain/util.cc
@@ -0,0 +1,34 @@
+#include "util.h"
+
+
+namespace dtrain
+{
+
+
+/*
+ * register_and_convert
+ *
+ */
+void
+register_and_convert(const vector<string>& strs, vector<WordID>& ids)
+{
+  vector<string>::const_iterator it;
+  for ( it = strs.begin(); it < strs.end(); it++ ) {
+    ids.push_back( TD::Convert( *it ) );
+  }
+}
+
+
+/*
+ * print_FD
+ *
+ */
+void
+print_FD()
+{
+  for ( size_t i = 0; i < FD::NumFeats(); i++ ) cout << FD::Convert(i)<< endl;
+}
+
+
+} // namespace
+
diff --git a/dtrain/util.h b/dtrain/util.h
new file mode 100644
index 00000000..6a548519
--- /dev/null
+++ b/dtrain/util.h
@@ -0,0 +1,28 @@
+#ifndef _DTRAIN_UTIL_H_
+#define _DTRAIN_UTIL_H_
+
+
+#include <iostream>
+#include <string>
+#include <vector>
+
+#include "fdict.h"
+#include "tdict.h"
+#include "wordid.h"
+
+using namespace std;
+
+
+namespace dtrain
+{
+
+
+void register_and_convert(const vector<string>& strs, vector<WordID>& ids);
+void print_FD();
+
+
+} // namespace
+
+
+#endif
+
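Editor's note: this commit splits the old monolithic dcommon.cc/dcommon.h into a scoring module (score.h/score.cc), helpers (util.h/util.cc), the k-best observer (kbestget.h), and tests. A minimal sketch of a driver for the resulting scoring API follows; it is not part of the commit, assumes a cdec checkout and linking against the same objects and libraries as dtrain (score.cc, util.cc, libcdec, etc.), and mirrors the last case of test_metrics() in tests.cc:

// sketch.cc -- hypothetical driver, not in the repository
#include <iostream>
#include <string>
#include <vector>

#include <boost/algorithm/string.hpp>

#include "score.h" // NgramCounts, make_ngram_counts, bleu variants (this commit)
#include "util.h"  // register_and_convert (this commit)

using namespace std;
using namespace dtrain;

int main()
{
  // hypothesis/reference pair taken from test_metrics() in tests.cc
  string hyp_str = "a b a", ref_str = "a b b";
  vector<string> hyp_toks, ref_toks;
  boost::split( hyp_toks, hyp_str, boost::is_any_of(" ") );
  boost::split( ref_toks, ref_str, boost::is_any_of(" ") );
  // map tokens to cdec word ids
  vector<WordID> hyp, ref;
  register_and_convert( hyp_toks, hyp );
  register_and_convert( ref_toks, ref );
  // clipped n-gram counts up to order N, as used by all three metrics
  size_t N = 4; // cf. DTRAIN_DEFAULT_N in common.h
  NgramCounts counts = make_ngram_counts( hyp, ref, N );
  // expected per tests.cc: vanilla 0, smooth ~.161587, stupid ~.605707
  cout << "vanilla: " <<        bleu( counts, hyp.size(), ref.size(), N ) << endl;
  cout << "smooth : " << smooth_bleu( counts, hyp.size(), ref.size(), N ) << endl;
  cout << "stupid : " << stupid_bleu( counts, hyp.size(), ref.size(), N ) << endl;
  return 0;
}

All three variants share the same clipped counts; they differ only in how zero n-gram matches are handled, which is why the vanilla score collapses to 0 while the smoothed variants do not.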
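On the learner side, SofiaLearner::Init() above serializes each k-best candidate to /tmp/sofia_ml_training as one line: the (optionally inverted) metric score first, then space-separated id:value feature pairs, with cdec's feature ids densely remapped via fmap so that Update() can pass the dimensionality to the external sofia-ml binary and map the learned weights back through fmap1. Illustrative lines (hypothetical values):

0.4375 0:0.25 1:1 3:-2.3
0.0625 0:1 2:0.5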
