From d980ecbbcd35fba23313aa715046bc0f87a23afd Mon Sep 17 00:00:00 2001
From: Patrick Simianer <p@simianer.de>
Date: Fri, 29 Jul 2011 00:48:04 +0200
Subject: first cut for sofia-ml, little change in utils/dict.h, coarse
 refactoring

---
 dtrain/dtrain.cc | 595 ++++++-------------------------------------------------
 1 file changed, 59 insertions(+), 536 deletions(-)

(limited to 'dtrain/dtrain.cc')
diff --git a/dtrain/dtrain.cc b/dtrain/dtrain.cc
index 8464a429..95fc81af 100644
--- a/dtrain/dtrain.cc
+++ b/dtrain/dtrain.cc
@@ -1,33 +1,6 @@
-#include <sstream>
-#include <iostream>
-#include <vector>
-#include <cassert>
-#include <cmath>
+#include "dcommon.h"
 
-#include "config.h"
 
-#include <boost/shared_ptr.hpp>
-#include <boost/algorithm/string.hpp>
-#include <boost/program_options.hpp>
-#include <boost/program_options/variables_map.hpp>
-
-#include "sentence_metadata.h"
-#include "scorer.h"
-#include "verbose.h"
-#include "viterbi.h"
-#include "hg.h"
-#include "prob.h"
-#include "kbest.h"
-#include "ff_register.h"
-#include "decoder.h"
-#include "filelib.h"
-#include "fdict.h"
-#include "weights.h"
-#include "sparse_vector.h"
-#include "sampler.h"
-
-using namespace std;
-namespace boostpo = boost::program_options;
 
 
 /*
@@ -35,19 +8,19 @@ namespace boostpo = boost::program_options;
  *
  */
 bool
-init(int argc, char** argv, boostpo::variables_map* conf)
+init(int argc, char** argv, po::variables_map* conf)
 {
-  boostpo::options_description opts( "Options" );
+  po::options_description opts( "Options" );
   opts.add_options()
-    ( "decoder-config,c", boostpo::value<string>(), "configuration file for cdec" )
-    ( "kbest,k",          boostpo::value<size_t>(), "k for kbest" )
-    ( "ngrams,n",         boostpo::value<int>(),    "n for Ngrams" )
-    ( "filter,f",         boostpo::value<string>(), "filter kbest list" )
+    ( "decoder-config,c", po::value<string>(), "configuration file for cdec" )
+    ( "kbest,k",          po::value<size_t>(), "k for kbest" )
+    ( "ngrams,n",         po::value<int>(),    "n for Ngrams" )
+    ( "filter,f",         po::value<string>(), "filter kbest list" )
     ( "test",                                       "run tests and exit");
-  boostpo::options_description cmdline_options;
+  po::options_description cmdline_options;
   cmdline_options.add(opts);
-  boostpo::store( parse_command_line(argc, argv, cmdline_options), *conf );
-  boostpo::notify( *conf );
+  po::store( parse_command_line(argc, argv, cmdline_options), *conf );
+  po::notify( *conf );
   if ( ! (conf->count("decoder-config") || conf->count("test")) ) {
     cerr << cmdline_options << endl;
     return false;
@@ -56,443 +29,6 @@ init(int argc, char** argv, boostpo::variables_map* conf)
 }
 
 
-/*
- * KBestGetter
- *
- */
-struct KBestList {
-  vector<SparseVector<double> > feats;
-  vector<vector<WordID> > sents;
-  vector<double> scores;
-};
-struct KBestGetter : public DecoderObserver
-{
-  KBestGetter( const size_t k ) : k_(k) {}
-  const size_t k_;
-  KBestList kb;
-
-  virtual void
-  NotifyTranslationForest(const SentenceMetadata& smeta, Hypergraph* hg)
-  {
-    GetKBest(smeta.GetSentenceID(), *hg);
-  }
-
-  KBestList* getkb() { return &kb; }
-
-  void
-  GetKBest(int sent_id, const Hypergraph& forest)
-  {
-    kb.scores.clear();
-    kb.sents.clear();
-    kb.feats.clear();
-    KBest::KBestDerivations<vector<WordID>, ESentenceTraversal> kbest( forest, k_ );
-    for ( size_t i = 0; i < k_; ++i ) {
-      const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal>::Derivation* d =
-        kbest.LazyKthBest( forest.nodes_.size() - 1, i );
-      if (!d) break;
-      kb.sents.push_back( d->yield);
-      kb.feats.push_back( d->feature_values );
-      kb.scores.push_back( d->score );
-    }
-  }
-};
-
-
-/*
- * write_training_data_for_sofia
- *
- */
-void
-sofia_write_training_data()
-{
-  // TODO
-}
-
-
-/*
- * call_sofia
- *
- */
-void
-sofia_call()
-{
-  // TODO
-}
-
-
-/*
- * sofia_model2weights
- *
- */
-void
-sofia_read_model()
-{
-  // TODO
-}
-
-
-/*
- * make_ngrams
- *
- */
-typedef map<vector<WordID>, size_t> Ngrams;
-Ngrams
-make_ngrams( vector<WordID>& s, size_t N )
-{
-  Ngrams ngrams;
-  vector<WordID> ng;
-  for ( size_t i = 0; i < s.size(); i++ ) {
-    ng.clear();
-    for ( size_t j = i; j < min( i+N, s.size() ); j++ ) {
-      ng.push_back( s[j] );
-      ngrams[ng]++;
-    }
-  }
-  return ngrams;
-}
-
-
-/*
- * NgramCounts
- *
- */
-struct NgramCounts
-{
-  NgramCounts( const size_t N ) : N_( N ) {
-    reset();
-  } 
-  size_t N_;
-  map<size_t, size_t> clipped;
-  map<size_t, size_t> sum;
-
-  void
-  operator+=( const NgramCounts& rhs )
-  {
-    assert( N_ == rhs.N_ );
-    for ( size_t i = 0; i < N_; i++ ) {
-      this->clipped[i] += rhs.clipped.find(i)->second;
-      this->sum[i] += rhs.sum.find(i)->second;
-    }
-  }
-
-  void
-  add( size_t count, size_t ref_count, size_t i )
-  {
-    assert( i < N_ );
-    if ( count > ref_count ) {
-      clipped[i] += ref_count;
-      sum[i] += count;
-    } else {
-      clipped[i] += count;
-      sum[i] += count;
-    }
-  }
-
-  void
-  reset()
-  {
-    size_t i;
-    for ( i = 0; i < N_; i++ ) {
-      clipped[i] = 0;
-      sum[i] = 0;
-    }
-  }
-
-  void
-  print()
-  {
-    for ( size_t i = 0; i < N_; i++ ) {
-      cout << i+1 << "grams (clipped):\t" << clipped[i] << endl;
-      cout << i+1 << "grams:\t\t\t" << sum[i] << endl;
-    }
-  }
-};
-
-
-/*
- * ngram_matches
- *
- */
-NgramCounts
-make_ngram_counts( vector<WordID> hyp, vector<WordID> ref, size_t N )
-{
-  Ngrams hyp_ngrams = make_ngrams( hyp, N );
-  Ngrams ref_ngrams = make_ngrams( ref, N );
-  NgramCounts counts( N );
-  Ngrams::iterator it;
-  Ngrams::iterator ti;
-  for ( it = hyp_ngrams.begin(); it != hyp_ngrams.end(); it++ ) {
-    ti = ref_ngrams.find( it->first );
-    if ( ti != ref_ngrams.end() ) {
-      counts.add( it->second, ti->second, it->first.size() - 1 );
-    } else {
-      counts.add( it->second, 0, it->first.size() - 1 );
-    }
-  }
-  return counts;
-}
-
-
-/*
- * brevity_penaly
- *
- */
-double
-brevity_penaly( const size_t hyp_len, const size_t ref_len )
-{
-  if ( hyp_len > ref_len ) return 1;
-  return exp( 1 - (double)ref_len/(double)hyp_len );
-}
-
-
-/*
- * bleu
- * as in "BLEU: a Method for Automatic Evaluation of Machine Translation" (Papineni et al. '02)
- * page TODO
- * 0 if for N one of the counts = 0
- */
-double
-bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len,
-      size_t N, vector<float> weights = vector<float>() )
-{
-  if ( hyp_len == 0 || ref_len == 0 ) return 0;
-  if ( ref_len < N ) N = ref_len;
-  float N_ = (float)N;
-  if ( weights.empty() )
-  {
-    for ( size_t i = 0; i < N; i++ ) weights.push_back( 1/N_ );
-  }
-  double sum = 0;
-  for ( size_t i = 0; i < N; i++ ) {
-    if ( counts.clipped[i] == 0 || counts.sum[i] == 0 ) return 0;
-    sum += weights[i] * log( (double)counts.clipped[i] / (double)counts.sum[i] );
-  }
-  return brevity_penaly( hyp_len, ref_len ) * exp( sum );
-}
-
-
-/*
- * stupid_bleu
- * as in "ORANGE: a Method for Evaluating Automatic Evaluation Metrics for Machine Translation (Lin & Och '04)
- * page TODO
- * 0 iff no 1gram match
- */
-double
-stupid_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len,
-             size_t N, vector<float> weights = vector<float>() )
-{
-  if ( hyp_len == 0 || ref_len == 0 ) return 0;
-  if ( ref_len < N ) N = ref_len;
-  float N_ = (float)N;
-  if ( weights.empty() )
-  {
-    for ( size_t i = 0; i < N; i++ ) weights.push_back( 1/N_ );
-  }
-  double sum = 0;
-  float add = 0;
-  for ( size_t i = 0; i < N; i++ ) {
-    if ( i == 1 ) add = 1;
-    sum += weights[i] * log( ((double)counts.clipped[i] + add) / ((double)counts.sum[i] + add) );
-  }
-  return brevity_penaly( hyp_len, ref_len ) * exp( sum );
-}
-
-
-/*
- * smooth_bleu
- * as in "An End-to-End Discriminative Approach to Machine Translation" (Liang et al. '06)
- * page TODO
- * max. 0.9375
- */
-double
-smooth_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len,
-             const size_t N, vector<float> weights = vector<float>() )
-{
-  if ( hyp_len == 0 || ref_len == 0 ) return 0;
-  float N_ = (float)N;
-  if ( weights.empty() )
-  {
-    for ( size_t i = 0; i < N; i++ ) weights.push_back( 1/N_ );
-  }
-  double sum = 0;
-  float j = 1;
-  for ( size_t i = 0; i < N; i++ ) {
-    if ( counts.clipped[i] == 0 || counts.sum[i] == 0) continue;
-    sum += exp((weights[i] * log((double)counts.clipped[i]/(double)counts.sum[i]))) / pow( 2, N_-j+1 );
-    j++;
-  }
-  return brevity_penaly( hyp_len, ref_len ) * sum;
-}
-
-
-/*
- * approx_bleu
- * as in "Online Large-Margin Training for Statistical Machine Translation" (Watanabe et al. '07)
- * page TODO
- *
- */
-double
-approx_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len,
-     const size_t N, vector<float> weights = vector<float>() )
-{
-  return bleu( counts, hyp_len, ref_len, N, weights );
-}
-
-
-/*
- * register_and_convert
- *
- */
-void
-register_and_convert(const vector<string>& strs, vector<WordID>& ids)
-{
-  vector<string>::const_iterator it;
-  for ( it = strs.begin(); it < strs.end(); it++ ) {
-    ids.push_back( TD::Convert( *it ) );
-  }
-}
-
-
-/*
- *
- *
- */
-void
-test_ngrams()
-{
-  cout << "Testing ngrams..." << endl << endl;
-  size_t N = 5;
-  cout << "N = " << N << endl;
-  vector<int> a; // hyp
-  vector<int> b; // ref
-  cout << "a ";
-  for (size_t i = 1; i <= 8; i++) {
-    cout << i << " ";
-    a.push_back(i);
-  }
-  cout << endl << "b ";
-  for (size_t i = 1; i <= 4; i++) {
-    cout << i << " ";
-    b.push_back(i);
-  }
-  cout << endl << endl;
-  NgramCounts c = make_ngram_counts( a, b, N );
-  assert( c.clipped[N-1] == 0 );
-  assert( c.sum[N-1] == 4 );
-  c.print();
-  c += c;
-  cout << endl;
-  c.print();
-  cout << endl;
-}
-
-
-/*
- *
- *
- */
-double
-approx_equal( double x, double y )
-{
-  const double EPSILON = 1E-5;
-  if ( x == 0 ) return fabs( y ) <= EPSILON;
-  if ( y == 0 ) return fabs( x ) <= EPSILON;
-  return fabs( x - y ) / max( fabs(x), fabs(y) ) <= EPSILON;
-}
-
-
-/*
- *
- *
- */
-#include <boost/assign/std/vector.hpp>
-#include <iomanip>
-void
-test_metrics()
-{
-  cout << "Testing metrics..." << endl << endl;
-  using namespace boost::assign;
-  vector<string> a, b;
-  vector<double> expect_vanilla, expect_smooth, expect_stupid;
-  a +=              "a a a a", "a a a a", "a",   "a", "b",        "a a a a", "a a",  "a a a", "a b a"; // hyp
-  b +=              "b b b b", "a a a a", "a",   "b", "b b b b",  "a",       "a a",  "a a a", "a b b"; // ref
-  expect_vanilla += 0,         1,         1,      0,  0,          .25,       1,      1,       0;
-  expect_smooth  += 0,          .9375,     .0625, 0,   .00311169, .0441942,   .1875,  .4375,   .161587;
-  expect_stupid  += 0,         1,         1,      0,   .0497871,  .25,       1,      1,        .605707;
-  vector<string> aa, bb;
-  vector<WordID> aai, bbi;
-  double vanilla, smooth, stupid;
-  size_t N = 4;
-  cout << "N = " << N << endl << endl;
-  for ( size_t i = 0; i < a.size(); i++ ) {
-    cout << " hyp: " << a[i] << endl;
-    cout << " ref: " << b[i] << endl;
-    aa.clear(); bb.clear(); aai.clear(); bbi.clear();
-    boost::split( aa, a[i], boost::is_any_of(" ") );
-    boost::split( bb, b[i], boost::is_any_of(" ") );
-    register_and_convert( aa, aai );
-    register_and_convert( bb, bbi );
-    NgramCounts counts = make_ngram_counts( aai, bbi, N );
-    vanilla =        bleu( counts, aa.size(), bb.size(), N);
-    smooth  = smooth_bleu( counts, aa.size(), bb.size(), N);
-    stupid  = stupid_bleu( counts, aa.size(), bb.size(), N);
-    assert( approx_equal(vanilla, expect_vanilla[i]) );
-    assert( approx_equal(smooth, expect_smooth[i]) );
-    assert( approx_equal(stupid, expect_stupid[i]) );
-    cout << setw(14) << "bleu = "      << vanilla << endl;
-    cout << setw(14) << "smooth bleu = " << smooth << endl;
-    cout << setw(14) << "stupid bleu = " << stupid << endl << endl;
-  }
-  cout << endl;
-}
-
-/*
- *
- *
- */
-void
-test_SetWeights()
-{
-  cout << "Testing Weights::SetWeight..." << endl << endl;
-  Weights weights;
-  SparseVector<double> lambdas;
-  weights.InitSparseVector( &lambdas );
-  weights.SetWeight( &lambdas, "test", 0 );
-  weights.SetWeight( &lambdas, "test1", 1 );
-  WordID fid = FD::Convert( "test2" );
-  weights.SetWeight( &lambdas, fid, 2 );
-  string fn = "weights-test";
-  cout << "FD::NumFeats() " << FD::NumFeats() << endl;
-  assert( FD::NumFeats() == 4 );
-  weights.WriteToFile( fn, true );
-  cout << endl;
-}
-
-
-/*
- *
- *
- */
-void
-run_tests()
-{
-  cout << endl;
-  test_ngrams();
-  cout << endl;
-  test_metrics();
-  cout << endl;
-  test_SetWeights();
-  exit(0);
-}
-
-
-void
-print_FD()
-{
-  for ( size_t i = 0; i < FD::NumFeats(); i++ ) cout << FD::Convert(i)<< endl;
-}
-
-
 /*
  * main
  *
@@ -500,8 +36,8 @@ print_FD()
 int
 main(int argc, char** argv)
 {
-  //SetSilent(true);
-  boostpo::variables_map conf;
+  SetSilent(true);
+  po::variables_map conf;
   if (!init(argc, argv, &conf)) return 1;
   if ( conf.count("test") ) run_tests(); 
   register_feature_functions();
@@ -509,7 +45,9 @@ main(int argc, char** argv)
   ReadFile ini_rf(conf["decoder-config"].as<string>());
   Decoder decoder(ini_rf.stream());
   KBestGetter observer(k);
-  
+  size_t N = 4; // TODO as parameter/in config 
+
+  // TODO scoring metric as parameter/in config 
   // for approx. bleu
   //NgramCounts global_counts;
   //size_t global_hyp_len;
@@ -523,82 +61,67 @@ main(int argc, char** argv)
   lambdas.set_value(FD::Convert("logp"), 0);
 
  
-  vector<string> strs;
+  vector<string> strs, ref_strs;
+  vector<WordID> ref_ids;
   string in, psg;
-  size_t i = 0;
+  size_t sid = 0;
+  cerr << "(1 dot equals 100 lines of input)" << endl;
   while( getline(cin, in) ) {
-    if ( !SILENT ) cerr << endl << endl << "Getting kbest for sentence #" << i << endl;
-    // why? why!?
+    //if ( !SILENT )
+    //    cerr << endl << endl << "Getting kbest for sentence #" << sid << endl;
+    if ( (sid+1) % 100 == 0 ) {
+        cerr << ".";
+        if ( (sid+1)%1000 == 0 ) cerr << endl;
+    }
+    if ( sid > 5000 ) break;
+    // weights
     dense_weights.clear();
     weights.InitFromVector( lambdas );
     weights.InitVector( &dense_weights );
     decoder.SetWeights( dense_weights );
-    //cout << "use_shell " << dense_weights[FD::Convert("use_shell")] << endl;
+    //if ( sid > 100 ) break;
+    // handling input..
     strs.clear();
     boost::split( strs, in, boost::is_any_of("\t") );
+    // grammar
     psg = boost::replace_all_copy( strs[2], " __NEXT_RULE__ ", "\n" ); psg += "\n";
-    //decoder.SetId(i);
     decoder.SetSentenceGrammar( psg );
     decoder.Decode( strs[0], &observer );
-    KBestList* kb = observer.getkb();
+    KBestList* kb = observer.GetKBest();
+    // reference
+    ref_strs.clear(); ref_ids.clear();
+    boost::split( ref_strs, strs[1], boost::is_any_of(" ") );
+    register_and_convert( ref_strs, ref_ids );
+    // scoring kbest
+    double score = 0;
+    Scores scores;
     for ( size_t i = 0; i < k; i++ ) {
-      cout << i << " ";
-      for (size_t j = 0; j < kb->sents[i].size(); ++j ) {
-        cout << TD::Convert( kb->sents[i][j] ) << " ";
-      }
-      cout << kb->scores[i];
-      cout << endl;
+      NgramCounts counts = make_ngram_counts( ref_ids, kb->sents[i], 4 );
+      score = smooth_bleu( counts,
+                           ref_ids.size(),
+                           kb->sents[i].size(), N );
+      ScorePair sp( kb->scores[i], score );
+      scores.push_back( sp );
+      //cout << "'" << TD::GetString( ref_ids ) << "' vs '" << TD::GetString( kb->sents[i] ) << "' SCORE=" << score << endl;
+      //cout << kb->feats[i] << endl;
     }
-    lambdas.set_value( FD::Convert("use_shell"), 1 );
-    lambdas.set_value( FD::Convert("use_a"), 1 );
+    //cout << "###" << endl;
+    SofiaLearner learner;
+    learner.Init( sid, kb->feats, scores );
+    learner.Update(lambdas);
+    // initializing learner
+    // TODO
+    // updating weights
+    //lambdas.set_value( FD::Convert("use_shell"), 1 );
+    //lambdas.set_value( FD::Convert("use_a"), 1 );
     //print_FD();
+    sid += 1; // TODO does cdec count this already?
   }
-  
+
   weights.WriteToFile( "weights-final", true );
+  
+  cerr << endl;
 
   return 0;
 }
 
-    // next: FMap, ->sofia, ->FMap, -> Weights
-    // learner gets all used features (binary! and dense (logprob is sum of logprobs!))
-    // only for those feats with weight > 0 after learning
-    // see decoder line 548
-
-
-/*
- * TODO
- *  iterate over training set, for t=1..T
- *  mapred impl
- *   mapper:  main
- *   reducer: average weights, global NgramCounts for approx. bleu
- *  1st cut: hadoop streaming?
- *  batch, non-batch in the mapper (what sofia gets, regenerated Kbest lists)
- *  filter kbest yes/no
- *  sofia: --eta_type explicit
- *  psg preparation source\tref\tpsg
- *  set reference for cdec?
- *  LM
- *   shared?
- *   startup?
- *  X reference(s) for *bleu!?
- *  kbest nicer (do not iterate twice)!? -> shared_ptr
- *  multipartite ranking
- *  weights! global, per sentence from global, featuremap
- *  const decl...
- *  sketch: batch/iter options
- *  weights.cc: why wv_?
- *  --weights cmd line (for iterations): script to call again/hadoop streaming?
- *  I do not need to remember features, cdec does
- *  resocre hg?
- *  do not use Decoder::Decode!?
- *  what happens if feature not in FD? 0???
- */
-
-/*
- * PROBLEMS
- *  cdec kbest vs 1best (no -k param)
- *  FD, Weights::wv_ grow too large, see utils/weights.cc; decoder/hg.h; decoder/scfg_translator.cc; utils/fdict.cc!?
- *  sparse vector instead of vector<double> for weights in Decoder?
- *  PhraseModel_* features for psg!? (seem to be generated)
- */
-
-- 
cgit v1.2.3