first cut for sofia-ml, little change in utils/dict.h, coarse refactoring

author: Patrick Simianer <p@simianer.de> 2011-07-29 00:48:04 +0200
committer: Patrick Simianer <p@simianer.de> 2011-09-23 19:13:57 +0200
commit: b732e625ffcf59da8440db577183110488f5c4b7 (patch)
tree: 863b59dea0ffd927621751da5c53298924c2ee2f /dtrain
parent: 05c41075d0018ca6142f7ba593742fbadfecdf65 (diff)
10 files changed, 704 insertions, 548 deletions
diff --git a/dtrain/Makefile.am b/dtrain/Makefile.am
index daa20cf3..c3f14bb0 100644
--- a/dtrain/Makefile.am
+++ b/dtrain/Makefile.am
@@ -1,6 +1,11 @@
-bin_PROGRAMS = dtrain
+# TODO I'm sure I can leave something out.
+bin_PROGRAMS = dtrain dtest
 
-dtrain_SOURCES = dtrain.cc
-dtrain_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz
+dtrain_SOURCES = dtrain.cc dcommon.cc
+dtrain_LDADD   = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz
+
+dtest_SOURCES = dtest.cc dcommon.cc
+dtest_LDADD   = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz
 
 AM_CPPFLAGS = -W -Wall -Wno-sign-compare -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval
+
diff --git a/dtrain/cdec.ini b/dtrain/cdec.ini
deleted file mode 100644
index 92a4a335..00000000
--- a/dtrain/cdec.ini
+++ /dev/null
@@ -1,4 +0,0 @@
-formalism=scfg
-#feature_function=KLanguageModel europarl-v6.tok.lc.s-tag.en.arpa.kenlm.v4.mma
-#k_best=2
-#add_pass_through_rules=true
diff --git a/dtrain/dcommon.cc b/dtrain/dcommon.cc
new file mode 100644
index 00000000..a6bdc92c
--- /dev/null
+++ b/dtrain/dcommon.cc
@@ -0,0 +1,311 @@
+#include "dcommon.h"
+
+
+
+/*
+ * make_ngrams
+ * Counts every n-gram of order 1..N in s, keyed by its token sequence.
+ */
+typedef map<vector<WordID>, size_t> Ngrams;
+Ngrams
+make_ngrams( vector<WordID>& s, size_t N )
+{
+  Ngrams result;
+  for ( size_t start = 0; start < s.size(); ++start ) {
+    vector<WordID> gram; // grows by one token per step, so each prefix is counted
+    const size_t stop = min( start+N, s.size() );
+    for ( size_t pos = start; pos < stop; ++pos ) {
+      gram.push_back( s[pos] );
+      ++result[gram];
+    }
+  }
+  return result;
+}
+
+
+
+
+
+/*
+ * make_ngram_counts
+ * Tallies hyp n-gram counts and their ref-clipped counts for orders 1..N.
+ */
+NgramCounts
+make_ngram_counts( vector<WordID> hyp, vector<WordID> ref, size_t N )
+{
+  Ngrams in_hyp = make_ngrams( hyp, N );
+  Ngrams in_ref = make_ngrams( ref, N );
+  NgramCounts result( N );
+  // only hypothesis n-grams are walked; n-grams seen in ref alone add nothing
+  for ( Ngrams::iterator h = in_hyp.begin(); h != in_hyp.end(); ++h ) {
+    // NgramCounts is indexed by n-gram order minus one
+    const size_t order_idx = h->first.size() - 1;
+    // an n-gram that never occurs in the reference is clipped to zero
+    size_t in_ref_count = 0;
+    Ngrams::iterator r = in_ref.find( h->first );
+    if ( r != in_ref.end() ) in_ref_count = r->second;
+    result.add( h->second, in_ref_count, order_idx );
+  }
+  return result;
+}
+
+
+/*
+ * brevity_penaly
+ * 1 if hyp is longer than ref, otherwise exp( 1 - ref_len/hyp_len )
+ */
+double
+brevity_penaly( const size_t hyp_len, const size_t ref_len )
+{
+  const double ratio = (double)ref_len/(double)hyp_len;
+  return hyp_len > ref_len ? 1 : exp( 1 - ratio );
+}
+
+
+/*
+ * bleu
+ * as in "BLEU: a Method for Automatic Evaluation of Machine Translation" (Papineni et al. '02)
+ * page TODO
+ * 0 if any of the first N orders has a zero clipped or total count
+ */
+double
+bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len,
+      size_t N, vector<float> weights  )
+{
+  if ( hyp_len == 0 || ref_len == 0 ) return 0;
+  // a reference shorter than N cannot contain the higher orders
+  N = min( N, ref_len );
+  // no weights given: weight every order uniformly
+  if ( weights.empty() && N > 0 ) weights.assign( N, 1/(float)N );
+  // weighted geometric mean of the clipped precisions, accumulated in log space
+  double log_precisions = 0;
+  for ( size_t order = 0; order < N; ++order ) {
+    if ( counts.clipped[order] == 0 || counts.sum[order] == 0 ) return 0;
+    const double precision = (double)counts.clipped[order] / (double)counts.sum[order];
+    log_precisions += weights[order] * log( precision );
+  }
+  return brevity_penaly( hyp_len, ref_len ) * exp( log_precisions );
+}
+
+
+/*
+ * stupid_bleu
+ * as in "ORANGE: a Method for Evaluating Automatic Evaluation Metrics for Machine Translation" (Lin & Och '04)
+ * page TODO
+ * 0 iff no 1gram match
+ */
+double
+stupid_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len,
+             size_t N, vector<float> weights  )
+{
+  if ( hyp_len == 0 || ref_len == 0 ) return 0;
+  // a reference shorter than N cannot contain the higher orders
+  N = min( N, ref_len );
+  // no weights given: weight every order uniformly
+  if ( weights.empty() && N > 0 ) weights.assign( N, 1/(float)N );
+  double log_precisions = 0;
+  for ( size_t order = 0; order < N; ++order ) {
+    // add-one smoothing applies to every order above unigrams
+    const float smoothing = ( order >= 1 ) ? 1 : 0;
+    const double matched = (double)counts.clipped[order] + smoothing;
+    const double total = (double)counts.sum[order] + smoothing;
+    log_precisions += weights[order] * log( matched / total );
+  }
+  return brevity_penaly( hyp_len, ref_len ) * exp( log_precisions );
+}
+
+
+/*
+ * smooth_bleu
+ * as in "An End-to-End Discriminative Approach to Machine Translation" (Liang et al. '06)
+ * page TODO
+ * max. 0.9375
+ */
+double
+smooth_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len,
+             const size_t N, vector<float> weights  )
+{
+  if ( hyp_len == 0 || ref_len == 0 ) return 0;
+  const float order_count = (float)N;
+  // no weights given: weight every order uniformly
+  if ( weights.empty() && N > 0 ) weights.assign( N, 1/order_count );
+  double total = 0;
+  float rank = 1; // advances only for orders that contribute a term
+  for ( size_t order = 0; order < N; ++order ) {
+    if ( counts.clipped[order] == 0 || counts.sum[order] == 0 ) continue;
+    const double precision = (double)counts.clipped[order]/(double)counts.sum[order];
+    // each contributing term is divided by 2^(N-rank+1)
+    total += exp( weights[order] * log( precision ) ) / pow( 2, order_count-rank+1 );
+    rank++;
+  }
+  return brevity_penaly( hyp_len, ref_len ) * total;
+}
+
+
+/*
+ * approx_bleu
+ * as in "Online Large-Margin Training for Statistical Machine Translation" (Watanabe et al. '07)
+ * page TODO
+ * placeholder: currently forwards all arguments to bleu() unchanged
+ */
+double
+approx_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len,
+     const size_t N, vector<float> weights )
+{
+  return bleu( counts, hyp_len, ref_len, N, weights );
+}
+
+
+/*
+ * register_and_convert
+ * Appends the TD::Convert() id of every string in strs to ids, in order.
+ */
+void
+register_and_convert(const vector<string>& strs, vector<WordID>& ids)
+{
+  // ids is appended to, never cleared here
+  for ( size_t pos = 0; pos < strs.size(); ++pos ) {
+    ids.push_back( TD::Convert( strs[pos] ) );
+  }
+}
+
+
+
+
+/*
+ * test_ngrams
+ * checks make_ngram_counts for hyp 1..8 against ref 1..4 with N = 5
+ */
+void
+test_ngrams()
+{
+  cout << "Testing ngrams..." << endl << endl;
+  size_t N = 5;
+  cout << "N = " << N << endl;
+  vector<int> a; // hyp
+  vector<int> b; // ref
+  cout << "a ";
+  for (size_t i = 1; i <= 8; i++) {
+    cout << i << " ";
+    a.push_back(i);
+  }
+  cout << endl << "b ";
+  for (size_t i = 1; i <= 4; i++) {
+    cout << i << " ";
+    b.push_back(i);
+  }
+  cout << endl << endl;
+  NgramCounts c = make_ngram_counts( a, b, N );
+  assert( c.clipped[N-1] == 0 ); // ref has only 4 tokens, so no 5-gram can match
+  assert( c.sum[N-1] == 4 ); // 8 hyp tokens yield 4 five-grams
+  c.print();
+  c += c; // adds the counts to themselves, i.e. doubles every entry
+  cout << endl;
+  c.print();
+  cout << endl;
+}
+
+
+/*
+ * approx_equal
+ * 1 if x and y agree within 1E-5 (absolute if either is 0, relative otherwise), else 0
+ */
+double
+approx_equal( double x, double y )
+{
+  const double tolerance = 1E-5;
+  if ( x == 0 || y == 0 ) return fabs( x == 0 ? y : x ) <= tolerance;
+  const double scale = max( fabs(x), fabs(y) );
+  return fabs( x - y ) / scale <= tolerance;
+}
+
+
+/*
+ * test_metrics
+ * compares bleu, smooth_bleu and stupid_bleu against precomputed values (N = 4)
+ */
+void
+test_metrics()
+{
+  cout << "Testing metrics..." << endl << endl;
+  using namespace boost::assign;
+  vector<string> a, b;
+  vector<double> expect_vanilla, expect_smooth, expect_stupid;
+  a +=              "a a a a", "a a a a", "a",   "a", "b",        "a a a a", "a a",  "a a a", "a b a"; // hyp
+  b +=              "b b b b", "a a a a", "a",   "b", "b b b b",  "a",       "a a",  "a a a", "a b b"; // ref
+  expect_vanilla += 0,         1,         1,      0,  0,          .25,       1,      1,       0;
+  expect_smooth  += 0,          .9375,     .0625, 0,   .00311169, .0441942,   .1875,  .4375,   .161587;
+  expect_stupid  += 0,         1,         1,      0,   .0497871,  .25,       1,      1,        .605707;
+  vector<string> aa, bb;
+  vector<WordID> aai, bbi;
+  double vanilla, smooth, stupid;
+  size_t N = 4;
+  cout << "N = " << N << endl << endl;
+  for ( size_t i = 0; i < a.size(); i++ ) {
+    cout << " hyp: " << a[i] << endl;
+    cout << " ref: " << b[i] << endl;
+    aa.clear(); bb.clear(); aai.clear(); bbi.clear();
+    boost::split( aa, a[i], boost::is_any_of(" ") );
+    boost::split( bb, b[i], boost::is_any_of(" ") );
+    register_and_convert( aa, aai );
+    register_and_convert( bb, bbi );
+    NgramCounts counts = make_ngram_counts( aai, bbi, N );
+    vanilla =        bleu( counts, aa.size(), bb.size(), N);
+    smooth  = smooth_bleu( counts, aa.size(), bb.size(), N);
+    stupid  = stupid_bleu( counts, aa.size(), bb.size(), N);
+    assert( approx_equal(vanilla, expect_vanilla[i]) );
+    assert( approx_equal(smooth, expect_smooth[i]) );
+    assert( approx_equal(stupid, expect_stupid[i]) );
+    cout << setw(14) << "bleu = "      << vanilla << endl;
+    cout << setw(14) << "smooth bleu = " << smooth << endl;
+    cout << setw(14) << "stupid bleu = " << stupid << endl << endl;
+  }
+  cout << endl;
+}
+
+/*
+ * test_SetWeights
+ * sets three weights via Weights::SetWeight, then calls WriteToFile on "weights-test"
+ */
+void
+test_SetWeights()
+{
+  cout << "Testing Weights::SetWeight..." << endl << endl;
+  Weights weights;
+  SparseVector<double> lambdas;
+  weights.InitSparseVector( &lambdas );
+  weights.SetWeight( &lambdas, "test", 0 );
+  weights.SetWeight( &lambdas, "test1", 1 );
+  WordID fid = FD::Convert( "test2" );
+  weights.SetWeight( &lambdas, fid, 2 );
+  string fn = "weights-test";
+  cout << "FD::NumFeats() " << FD::NumFeats() << endl;
+  assert( FD::NumFeats() == 4 ); // NOTE(review): presumably the 3 features above plus a reserved id 0 — confirm against FD
+  weights.WriteToFile( fn, true );
+  cout << endl;
+}
+
+
+/*
+ * run_tests
+ * runs every self-test in turn, then ends the process with exit(0)
+ */
+void
+run_tests()
+{
+  cout << endl;
+  test_ngrams();
+  cout << endl;
+  test_metrics();
+  cout << endl;
+  test_SetWeights();
+  exit(0);
+}
+
+
+void
+print_FD() // prints FD::Convert(i) for every i below FD::NumFeats(), one per line
+{
+  for ( size_t i = 0; i < FD::NumFeats(); i++ ) cout << FD::Convert(i)<< endl;
+}
+
diff --git a/dtrain/dcommon.h b/dtrain/dcommon.h
new file mode 100644
index 00000000..ff796642
--- /dev/null
+++ b/dtrain/dcommon.h
@@ -0,0 +1,230 @@
+#include <sstream>
+#include <iostream>
+#include <vector>
+#include <cassert>
+#include <cmath>
+
+#include "config.h"
+
+#include <boost/shared_ptr.hpp>
+#include <boost/algorithm/string.hpp>
+#include <boost/program_options.hpp>
+#include <boost/program_options/variables_map.hpp>
+
+#include "sentence_metadata.h"
+#include "scorer.h"
+#include "verbose.h"
+#include "viterbi.h"
+#include "hg.h"
+#include "prob.h"
+#include "kbest.h"
+#include "ff_register.h"
+#include "decoder.h"
+#include "filelib.h"
+#include "fdict.h"
+#include "weights.h"
+#include "sparse_vector.h"
+#include "sampler.h"
+
+using namespace std;
+namespace po = boost::program_options;
+
+
+struct ScorePair
+{
+  ScorePair(double modelscore, double score) : modelscore_(modelscore), score_(score) {}
+  double modelscore_, score_; // the two scores of one hypothesis, stored as given
+  double GetModelScore() const { return modelscore_; } // const: usable through const Scores&
+  double GetScore() const { return score_; }
+};
+typedef vector<ScorePair> Scores;
+
+
+/*
+ * KBestGetter
+ * DecoderObserver that keeps the k-best list of the most recently observed forest
+ */
+struct KBestList {
+  vector<SparseVector<double> > feats;   // feature_values of each derivation
+  vector<vector<WordID> > sents;         // yield of each derivation
+  vector<double> scores;                 // score of each derivation
+};
+struct KBestGetter : public DecoderObserver
+{
+  KBestGetter( const size_t k ) : k_(k) {}
+  const size_t k_; // upper bound on the number of derivations extracted
+  KBestList kb;    // cleared and refilled by every GetKBest(sent_id, forest) call
+
+  virtual void
+  NotifyTranslationForest(const SentenceMetadata& smeta, Hypergraph* hg)
+  {
+    GetKBest(smeta.GetSentenceID(), *hg);
+  }
+
+  KBestList* GetKBest() { return &kb; } // pointer to the member, not a copy
+
+  void
+  GetKBest(int sent_id, const Hypergraph& forest)
+  {
+    kb.scores.clear();
+    kb.sents.clear();
+    kb.feats.clear();
+    KBest::KBestDerivations<vector<WordID>, ESentenceTraversal> kbest( forest, k_ );
+    for ( size_t i = 0; i < k_; ++i ) {
+      const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal>::Derivation* d =
+        kbest.LazyKthBest( forest.nodes_.size() - 1, i ); // NOTE(review): presumably the last node is the goal node — confirm
+      if (!d) break; // stop at the first missing derivation; the list may be shorter than k_
+      kb.sents.push_back( d->yield);
+      kb.feats.push_back( d->feature_values );
+      kb.scores.push_back( d->score );
+    }
+  }
+};
+
+
+/*
+ * NgramCounts
+ * per-order n-gram totals (sum) and reference-clipped matches (clipped), index = order-1
+ */
+struct NgramCounts
+{
+  NgramCounts( const size_t N ) : N_( N ) {
+    reset();
+  }
+  size_t N_;
+  map<size_t, size_t> clipped;
+  map<size_t, size_t> sum;
+
+  // element-wise accumulation; both sides must cover the same number of orders
+  void
+  operator+=( const NgramCounts& rhs )
+  {
+    assert( N_ == rhs.N_ );
+    for ( size_t order = 0; order < N_; ++order ) {
+      clipped[order] += rhs.clipped.find(order)->second;
+      sum[order] += rhs.sum.find(order)->second;
+    }
+  }
+
+  // records count occurrences of one n-gram, of which at most ref_count are matches
+  void
+  add( size_t count, size_t ref_count, size_t i )
+  {
+    assert( i < N_ );
+    const size_t matched = ( count > ref_count ) ? ref_count : count;
+    clipped[i] += matched;
+    sum[i] += count;
+  }
+
+  // zeroes every order in [0, N_)
+  void
+  reset()
+  {
+    for ( size_t order = 0; order < N_; ++order ) {
+      clipped[order] = 0;
+      sum[order] = 0;
+    }
+  }
+
+  // writes both counts of every order to stdout
+  void
+  print()
+  {
+    for ( size_t order = 0; order < N_; ++order ) {
+      const size_t n = order+1;
+      cout << n << "grams (clipped):\t" << clipped[order] << endl;
+      cout << n << "grams:\t\t\t" << sum[order] << endl;
+    }
+  }
+};
+
+
+/*class Learnerx
+{
+  public:
+    virtual void Init(const vector<SparseVector<double> >& kbest, const Scores& scores) {};
+    virtual void Update(SparseVector<double>& lambdas);
+};*/
+
+class SofiaLearner //: public Learnerx FIXME
+{
+  // Bridges to the external sofia-ml binary through files in /tmp.
+  // TODO bool invert_score
+  public:
+  // Writes one "<score> <local id>:<value> ..." line per k-best entry to the training file.
+  void
+  Init( const size_t sid, const vector<SparseVector<double> >& kbest, /*const*/ Scores& scores )
+  {
+    assert( kbest.size() == scores.size() );
+    unlink( "/tmp/sofia_ml_training" ); // drop any stale file first
+    ofstream o( "/tmp/sofia_ml_training", ios::trunc ); // TODO randomize, filename exists
+    // fmap outlives this call, so new features continue after the ids already handed out
+    int fid = fmap.size();
+    for ( size_t k = 0; k < kbest.size(); ++k ) {
+      o << scores[k].GetScore();
+      SparseVector<double>::const_iterator it = kbest[k].begin();
+      for ( ; it != kbest[k].end(); ++it) {
+        map<int,int>::iterator ff = fmap.find( it->first );
+        if ( ff == fmap.end() ) {
+          ff = fmap.insert( pair<int,int>(it->first, fid) ).first;
+          fmap1.insert( pair<int,int>(fid, it->first) );
+          fid++;
+        }
+        o << " "<< ff->second << ":" << it->second;
+      }
+      o << endl;
+    }
+  } // o is flushed and closed by its destructor here
+
+  // Runs sofia-ml on the training file and copies the weights it learned into lambdas.
+  void
+  Update(SparseVector<double>& lambdas)
+  {
+    // remove any stale model so a failed run cannot feed old weights back in
+    unlink( "/tmp/sofia_ml_model" );
+    std::stringstream call;
+    call << "./sofia-ml --training_file /tmp/sofia_ml_training --model_out /tmp/sofia_ml_model --loop_type stochastic --lambda 100 --dimensionality "
+         << fmap.size()
+         << " >/dev/null 2>&1"; // "&>" is a bashism; plain sh would background the command
+    system ( call.str().c_str() );
+    ifstream i( "/tmp/sofia_ml_model", ios::in );
+    string model;
+    if ( !getline( i, model ) ) return; // no model was written: leave lambdas untouched
+    vector<string> strs;
+    boost::split( strs, model, boost::is_any_of(" ") );
+    for ( size_t j = 0; j < strs.size(); ++j ) {
+      // positions Init never assigned would make fmap1[j] silently resolve to feature 0
+      map<int,int>::const_iterator known = fmap1.find( j );
+      if ( known == fmap1.end() || strs[j].empty() ) continue;
+      lambdas.set_value( known->second, atof( strs[j].c_str() ) );
+    }
+  }
+
+  private:
+    // fmap: SparseVector feature id -> dense local id; fmap1: the inverse mapping
+    map<int,int> fmap;
+    map<int,int> fmap1;
+};
+
+typedef map<vector<WordID>, size_t> Ngrams;
+Ngrams make_ngrams( vector<WordID>& s, size_t N );
+NgramCounts make_ngram_counts( vector<WordID> hyp, vector<WordID> ref, size_t N );
+double brevity_penaly( const size_t hyp_len, const size_t ref_len );
+double bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len, size_t N, vector<float> weights = vector<float>() );
+double stupid_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len, size_t N, vector<float> weights = vector<float>() );
+double smooth_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len, const size_t N, vector<float> weights = vector<float>() );
+double approx_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len, const size_t N, vector<float> weights = vector<float>() );
+void register_and_convert(const vector<string>& strs, vector<WordID>& ids);
+
+
+
+
+void print_FD();
+void run_tests();
+void test_SetWeights();
+#include <boost/assign/std/vector.hpp>
+#include <iomanip>
+void test_metrics();
+double approx_equal( double x, double y );
+void test_ngrams();
+
diff --git a/dtrain/dtest.cc b/dtrain/dtest.cc
new file mode 100644
index 00000000..9975794f
--- /dev/null
+++ b/dtrain/dtest.cc
@@ -0,0 +1,95 @@
+#include "dcommon.h"
+
+
+
+
+/*
+ * init
+ * parses the command line into conf; prints usage and returns false if a required option is missing
+ */
+bool
+init(int argc, char** argv, po::variables_map* conf)
+{
+  po::options_description opts( "Options" );
+  opts.add_options()
+    ( "decoder-config,c", po::value<string>(),                  "configuration file for cdec" )
+    ( "weights,w",        po::value<string>(),                  "weights file")
+    ( "ngrams,n",         po::value<int>()->default_value(4),   "N for Ngrams (default 4)" );
+  po::options_description cmdline_options;
+  cmdline_options.add(opts);
+  po::store( parse_command_line(argc, argv, cmdline_options), *conf );
+  po::notify( *conf );
+  // main() reads both options unconditionally, so both have to be present
+  if ( ! (conf->count("decoder-config") && conf->count("weights")) ) {
+    cerr << cmdline_options << endl;
+    return false;
+  }
+  return true;
+}
+
+
+/*
+ * main
+ * scores the 1-best of each stdin line (source, reference, grammar; tab-separated) with smooth_bleu
+ */
+int
+main(int argc, char** argv)
+{
+  SetSilent(true);
+  po::variables_map conf;
+  if (!init(argc, argv, &conf)) return 1;
+  register_feature_functions();
+  size_t k = 1;
+  ReadFile ini_rf(conf["decoder-config"].as<string>());
+  Decoder decoder(ini_rf.stream());
+  KBestGetter observer(k);
+  size_t N = conf["ngrams"].as<int>();
+
+  Weights weights;
+  weights.InitFromFile(conf["weights"].as<string>());
+  vector<double> w;
+  weights.InitVector(&w);
+  decoder.SetWeights(w);
+
+  vector<string> strs, ref_strs;
+  vector<WordID> ref_ids;
+  string in, psg;
+  size_t sid = 0;
+  double overall = 0.0;
+  cerr << "(1 dot equals 100 lines of input)" << endl;
+  while( getline(cin, in) ) {
+    if ( (sid+1) % 100 == 0 ) {
+        cerr << ".";
+        if ( (sid+1)%1000 == 0 ) cerr << endl;
+    }
+    if ( sid > 5000 ) break;
+    strs.clear();
+    boost::split( strs, in, boost::is_any_of("\t") );
+    if ( strs.size() < 3 ) {
+      cerr << endl << "malformed input line " << sid+1 << ": expected 3 tab-separated fields" << endl;
+      return 1;
+    }
+    // grammar
+    psg = boost::replace_all_copy( strs[2], " __NEXT_RULE__ ", "\n" ); psg += "\n";
+    decoder.SetSentenceGrammar( psg );
+    decoder.Decode( strs[0], &observer );
+    KBestList* kb = observer.GetKBest();
+    // reference
+    ref_strs.clear(); ref_ids.clear();
+    boost::split( ref_strs, strs[1], boost::is_any_of(" ") );
+    register_and_convert( ref_strs, ref_ids );
+    // scoring the 1-best; a sentence without any derivation scores 0
+    double score = 0;
+    if ( !kb->sents.empty() ) {
+      NgramCounts counts = make_ngram_counts( kb->sents[0], ref_ids, N ); // hyp first, then ref
+      score = smooth_bleu( counts, kb->sents[0].size(), ref_ids.size(), N );
+    }
+    overall += score;
+    sid += 1;
+  }
+  cout << "Average score: " << ( sid > 0 ? overall/sid : 0.0 ) << endl; // sid == sentences scored
+  cerr << endl;
+
+  return 0;
+}
+
diff --git a/dtrain/dtrain.cc b/dtrain/dtrain.cc
index 8464a429..95fc81af 100644
--- a/dtrain/dtrain.cc
+++ b/dtrain/dtrain.cc
@@ -1,33 +1,6 @@
-#include <sstream>
-#include <iostream>
-#include <vector>
-#include <cassert>
-#include <cmath>
+#include "dcommon.h"
 
-#include "config.h"
 
-#include <boost/shared_ptr.hpp>
-#include <boost/algorithm/string.hpp>
-#include <boost/program_options.hpp>
-#include <boost/program_options/variables_map.hpp>
-
-#include "sentence_metadata.h"
-#include "scorer.h"
-#include "verbose.h"
-#include "viterbi.h"
-#include "hg.h"
-#include "prob.h"
-#include "kbest.h"
-#include "ff_register.h"
-#include "decoder.h"
-#include "filelib.h"
-#include "fdict.h"
-#include "weights.h"
-#include "sparse_vector.h"
-#include "sampler.h"
-
-using namespace std;
-namespace boostpo = boost::program_options;
 
 
 /*
@@ -35,19 +8,19 @@ namespace boostpo = boost::program_options;
  *
  */
 bool
-init(int argc, char** argv, boostpo::variables_map* conf)
+init(int argc, char** argv, po::variables_map* conf)
 {
-  boostpo::options_description opts( "Options" );
+  po::options_description opts( "Options" );
   opts.add_options()
-    ( "decoder-config,c", boostpo::value<string>(), "configuration file for cdec" )
-    ( "kbest,k",          boostpo::value<size_t>(), "k for kbest" )
-    ( "ngrams,n",         boostpo::value<int>(),    "n for Ngrams" )
-    ( "filter,f",         boostpo::value<string>(), "filter kbest list" )
+    ( "decoder-config,c", po::value<string>(), "configuration file for cdec" )
+    ( "kbest,k",          po::value<size_t>(), "k for kbest" )
+    ( "ngrams,n",         po::value<int>(),    "n for Ngrams" )
+    ( "filter,f",         po::value<string>(), "filter kbest list" )
     ( "test",                                       "run tests and exit");
-  boostpo::options_description cmdline_options;
+  po::options_description cmdline_options;
   cmdline_options.add(opts);
-  boostpo::store( parse_command_line(argc, argv, cmdline_options), *conf );
-  boostpo::notify( *conf );
+  po::store( parse_command_line(argc, argv, cmdline_options), *conf );
+  po::notify( *conf );
   if ( ! (conf->count("decoder-config") || conf->count("test")) ) {
     cerr << cmdline_options << endl;
     return false;
@@ -57,451 +30,14 @@ init(int argc, char** argv, boostpo::variables_map* conf)
 
 
 /*
- * KBestGetter
- *
- */
-struct KBestList {
-  vector<SparseVector<double> > feats;
-  vector<vector<WordID> > sents;
-  vector<double> scores;
-};
-struct KBestGetter : public DecoderObserver
-{
-  KBestGetter( const size_t k ) : k_(k) {}
-  const size_t k_;
-  KBestList kb;
-
-  virtual void
-  NotifyTranslationForest(const SentenceMetadata& smeta, Hypergraph* hg)
-  {
-    GetKBest(smeta.GetSentenceID(), *hg);
-  }
-
-  KBestList* getkb() { return &kb; }
-
-  void
-  GetKBest(int sent_id, const Hypergraph& forest)
-  {
-    kb.scores.clear();
-    kb.sents.clear();
-    kb.feats.clear();
-    KBest::KBestDerivations<vector<WordID>, ESentenceTraversal> kbest( forest, k_ );
-    for ( size_t i = 0; i < k_; ++i ) {
-      const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal>::Derivation* d =
-        kbest.LazyKthBest( forest.nodes_.size() - 1, i );
-      if (!d) break;
-      kb.sents.push_back( d->yield);
-      kb.feats.push_back( d->feature_values );
-      kb.scores.push_back( d->score );
-    }
-  }
-};
-
-
-/*
- * write_training_data_for_sofia
- *
- */
-void
-sofia_write_training_data()
-{
-  // TODO
-}
-
-
-/*
- * call_sofia
- *
- */
-void
-sofia_call()
-{
-  // TODO
-}
-
-
-/*
- * sofia_model2weights
- *
- */
-void
-sofia_read_model()
-{
-  // TODO
-}
-
-
-/*
- * make_ngrams
- *
- */
-typedef map<vector<WordID>, size_t> Ngrams;
-Ngrams
-make_ngrams( vector<WordID>& s, size_t N )
-{
-  Ngrams ngrams;
-  vector<WordID> ng;
-  for ( size_t i = 0; i < s.size(); i++ ) {
-    ng.clear();
-    for ( size_t j = i; j < min( i+N, s.size() ); j++ ) {
-      ng.push_back( s[j] );
-      ngrams[ng]++;
-    }
-  }
-  return ngrams;
-}
-
-
-/*
- * NgramCounts
- *
- */
-struct NgramCounts
-{
-  NgramCounts( const size_t N ) : N_( N ) {
-    reset();
-  } 
-  size_t N_;
-  map<size_t, size_t> clipped;
-  map<size_t, size_t> sum;
-
-  void
-  operator+=( const NgramCounts& rhs )
-  {
-    assert( N_ == rhs.N_ );
-    for ( size_t i = 0; i < N_; i++ ) {
-      this->clipped[i] += rhs.clipped.find(i)->second;
-      this->sum[i] += rhs.sum.find(i)->second;
-    }
-  }
-
-  void
-  add( size_t count, size_t ref_count, size_t i )
-  {
-    assert( i < N_ );
-    if ( count > ref_count ) {
-      clipped[i] += ref_count;
-      sum[i] += count;
-    } else {
-      clipped[i] += count;
-      sum[i] += count;
-    }
-  }
-
-  void
-  reset()
-  {
-    size_t i;
-    for ( i = 0; i < N_; i++ ) {
-      clipped[i] = 0;
-      sum[i] = 0;
-    }
-  }
-
-  void
-  print()
-  {
-    for ( size_t i = 0; i < N_; i++ ) {
-      cout << i+1 << "grams (clipped):\t" << clipped[i] << endl;
-      cout << i+1 << "grams:\t\t\t" << sum[i] << endl;
-    }
-  }
-};
-
-
-/*
- * ngram_matches
- *
- */
-NgramCounts
-make_ngram_counts( vector<WordID> hyp, vector<WordID> ref, size_t N )
-{
-  Ngrams hyp_ngrams = make_ngrams( hyp, N );
-  Ngrams ref_ngrams = make_ngrams( ref, N );
-  NgramCounts counts( N );
-  Ngrams::iterator it;
-  Ngrams::iterator ti;
-  for ( it = hyp_ngrams.begin(); it != hyp_ngrams.end(); it++ ) {
-    ti = ref_ngrams.find( it->first );
-    if ( ti != ref_ngrams.end() ) {
-      counts.add( it->second, ti->second, it->first.size() - 1 );
-    } else {
-      counts.add( it->second, 0, it->first.size() - 1 );
-    }
-  }
-  return counts;
-}
-
-
-/*
- * brevity_penaly
- *
- */
-double
-brevity_penaly( const size_t hyp_len, const size_t ref_len )
-{
-  if ( hyp_len > ref_len ) return 1;
-  return exp( 1 - (double)ref_len/(double)hyp_len );
-}
-
-
-/*
- * bleu
- * as in "BLEU: a Method for Automatic Evaluation of Machine Translation" (Papineni et al. '02)
- * page TODO
- * 0 if for N one of the counts = 0
- */
-double
-bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len,
-      size_t N, vector<float> weights = vector<float>() )
-{
-  if ( hyp_len == 0 || ref_len == 0 ) return 0;
-  if ( ref_len < N ) N = ref_len;
-  float N_ = (float)N;
-  if ( weights.empty() )
-  {
-    for ( size_t i = 0; i < N; i++ ) weights.push_back( 1/N_ );
-  }
-  double sum = 0;
-  for ( size_t i = 0; i < N; i++ ) {
-    if ( counts.clipped[i] == 0 || counts.sum[i] == 0 ) return 0;
-    sum += weights[i] * log( (double)counts.clipped[i] / (double)counts.sum[i] );
-  }
-  return brevity_penaly( hyp_len, ref_len ) * exp( sum );
-}
-
-
-/*
- * stupid_bleu
- * as in "ORANGE: a Method for Evaluating Automatic Evaluation Metrics for Machine Translation (Lin & Och '04)
- * page TODO
- * 0 iff no 1gram match
- */
-double
-stupid_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len,
-             size_t N, vector<float> weights = vector<float>() )
-{
-  if ( hyp_len == 0 || ref_len == 0 ) return 0;
-  if ( ref_len < N ) N = ref_len;
-  float N_ = (float)N;
-  if ( weights.empty() )
-  {
-    for ( size_t i = 0; i < N; i++ ) weights.push_back( 1/N_ );
-  }
-  double sum = 0;
-  float add = 0;
-  for ( size_t i = 0; i < N; i++ ) {
-    if ( i == 1 ) add = 1;
-    sum += weights[i] * log( ((double)counts.clipped[i] + add) / ((double)counts.sum[i] + add) );
-  }
-  return brevity_penaly( hyp_len, ref_len ) * exp( sum );
-}
-
-
-/*
- * smooth_bleu
- * as in "An End-to-End Discriminative Approach to Machine Translation" (Liang et al. '06)
- * page TODO
- * max. 0.9375
- */
-double
-smooth_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len,
-             const size_t N, vector<float> weights = vector<float>() )
-{
-  if ( hyp_len == 0 || ref_len == 0 ) return 0;
-  float N_ = (float)N;
-  if ( weights.empty() )
-  {
-    for ( size_t i = 0; i < N; i++ ) weights.push_back( 1/N_ );
-  }
-  double sum = 0;
-  float j = 1;
-  for ( size_t i = 0; i < N; i++ ) {
-    if ( counts.clipped[i] == 0 || counts.sum[i] == 0) continue;
-    sum += exp((weights[i] * log((double)counts.clipped[i]/(double)counts.sum[i]))) / pow( 2, N_-j+1 );
-    j++;
-  }
-  return brevity_penaly( hyp_len, ref_len ) * sum;
-}
-
-
-/*
- * approx_bleu
- * as in "Online Large-Margin Training for Statistical Machine Translation" (Watanabe et al. '07)
- * page TODO
- *
- */
-double
-approx_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len,
-     const size_t N, vector<float> weights = vector<float>() )
-{
-  return bleu( counts, hyp_len, ref_len, N, weights );
-}
-
-
-/*
- * register_and_convert
- *
- */
-void
-register_and_convert(const vector<string>& strs, vector<WordID>& ids)
-{
-  vector<string>::const_iterator it;
-  for ( it = strs.begin(); it < strs.end(); it++ ) {
-    ids.push_back( TD::Convert( *it ) );
-  }
-}
-
-
-/*
- *
- *
- */
-void
-test_ngrams()
-{
-  cout << "Testing ngrams..." << endl << endl;
-  size_t N = 5;
-  cout << "N = " << N << endl;
-  vector<int> a; // hyp
-  vector<int> b; // ref
-  cout << "a ";
-  for (size_t i = 1; i <= 8; i++) {
-    cout << i << " ";
-    a.push_back(i);
-  }
-  cout << endl << "b ";
-  for (size_t i = 1; i <= 4; i++) {
-    cout << i << " ";
-    b.push_back(i);
-  }
-  cout << endl << endl;
-  NgramCounts c = make_ngram_counts( a, b, N );
-  assert( c.clipped[N-1] == 0 );
-  assert( c.sum[N-1] == 4 );
-  c.print();
-  c += c;
-  cout << endl;
-  c.print();
-  cout << endl;
-}
-
-
-/*
- *
- *
- */
-double
-approx_equal( double x, double y )
-{
-  const double EPSILON = 1E-5;
-  if ( x == 0 ) return fabs( y ) <= EPSILON;
-  if ( y == 0 ) return fabs( x ) <= EPSILON;
-  return fabs( x - y ) / max( fabs(x), fabs(y) ) <= EPSILON;
-}
-
-
-/*
- *
- *
- */
-#include <boost/assign/std/vector.hpp>
-#include <iomanip>
-void
-test_metrics()
-{
-  cout << "Testing metrics..." << endl << endl;
-  using namespace boost::assign;
-  vector<string> a, b;
-  vector<double> expect_vanilla, expect_smooth, expect_stupid;
-  a +=              "a a a a", "a a a a", "a",   "a", "b",        "a a a a", "a a",  "a a a", "a b a"; // hyp
-  b +=              "b b b b", "a a a a", "a",   "b", "b b b b",  "a",       "a a",  "a a a", "a b b"; // ref
-  expect_vanilla += 0,         1,         1,      0,  0,          .25,       1,      1,       0;
-  expect_smooth  += 0,          .9375,     .0625, 0,   .00311169, .0441942,   .1875,  .4375,   .161587;
-  expect_stupid  += 0,         1,         1,      0,   .0497871,  .25,       1,      1,        .605707;
-  vector<string> aa, bb;
-  vector<WordID> aai, bbi;
-  double vanilla, smooth, stupid;
-  size_t N = 4;
-  cout << "N = " << N << endl << endl;
-  for ( size_t i = 0; i < a.size(); i++ ) {
-    cout << " hyp: " << a[i] << endl;
-    cout << " ref: " << b[i] << endl;
-    aa.clear(); bb.clear(); aai.clear(); bbi.clear();
-    boost::split( aa, a[i], boost::is_any_of(" ") );
-    boost::split( bb, b[i], boost::is_any_of(" ") );
-    register_and_convert( aa, aai );
-    register_and_convert( bb, bbi );
-    NgramCounts counts = make_ngram_counts( aai, bbi, N );
-    vanilla =        bleu( counts, aa.size(), bb.size(), N);
-    smooth  = smooth_bleu( counts, aa.size(), bb.size(), N);
-    stupid  = stupid_bleu( counts, aa.size(), bb.size(), N);
-    assert( approx_equal(vanilla, expect_vanilla[i]) );
-    assert( approx_equal(smooth, expect_smooth[i]) );
-    assert( approx_equal(stupid, expect_stupid[i]) );
-    cout << setw(14) << "bleu = "      << vanilla << endl;
-    cout << setw(14) << "smooth bleu = " << smooth << endl;
-    cout << setw(14) << "stupid bleu = " << stupid << endl << endl;
-  }
-  cout << endl;
-}
-
-/*
- *
- *
- */
-void
-test_SetWeights()
-{
-  cout << "Testing Weights::SetWeight..." << endl << endl;
-  Weights weights;
-  SparseVector<double> lambdas;
-  weights.InitSparseVector( &lambdas );
-  weights.SetWeight( &lambdas, "test", 0 );
-  weights.SetWeight( &lambdas, "test1", 1 );
-  WordID fid = FD::Convert( "test2" );
-  weights.SetWeight( &lambdas, fid, 2 );
-  string fn = "weights-test";
-  cout << "FD::NumFeats() " << FD::NumFeats() << endl;
-  assert( FD::NumFeats() == 4 );
-  weights.WriteToFile( fn, true );
-  cout << endl;
-}
-
-
-/*
- *
- *
- */
-void
-run_tests()
-{
-  cout << endl;
-  test_ngrams();
-  cout << endl;
-  test_metrics();
-  cout << endl;
-  test_SetWeights();
-  exit(0);
-}
-
-
-void
-print_FD()
-{
-  for ( size_t i = 0; i < FD::NumFeats(); i++ ) cout << FD::Convert(i)<< endl;
-}
-
-
-/*
  * main
  *
  */
 int
 main(int argc, char** argv)
 {
-  //SetSilent(true);
-  boostpo::variables_map conf;
+  SetSilent(true);
+  po::variables_map conf;
   if (!init(argc, argv, &conf)) return 1;
   if ( conf.count("test") ) run_tests(); 
   register_feature_functions();
@@ -509,7 +45,9 @@ main(int argc, char** argv)
   ReadFile ini_rf(conf["decoder-config"].as<string>());
   Decoder decoder(ini_rf.stream());
   KBestGetter observer(k);
-  
+  size_t N = 4; // TODO as parameter/in config 
+
+  // TODO scoring metric as parameter/in config 
   // for approx. bleu
   //NgramCounts global_counts;
   //size_t global_hyp_len;
@@ -523,82 +61,67 @@ main(int argc, char** argv)
   lambdas.set_value(FD::Convert("logp"), 0);
 
  
-  vector<string> strs;
+  vector<string> strs, ref_strs;
+  vector<WordID> ref_ids;
   string in, psg;
-  size_t i = 0;
+  size_t sid = 0;
+  cerr << "(1 dot equals 100 lines of input)" << endl;
   while( getline(cin, in) ) {
-    if ( !SILENT ) cerr << endl << endl << "Getting kbest for sentence #" << i << endl;
-    // why? why!?
+    //if ( !SILENT )
+    //    cerr << endl << endl << "Getting kbest for sentence #" << sid << endl;
+    if ( (sid+1) % 100 == 0 ) {
+        cerr << ".";
+        if ( (sid+1)%1000 == 0 ) cerr << endl;
+    }
+    if ( sid > 5000 ) break;
+    // weights
     dense_weights.clear();
     weights.InitFromVector( lambdas );
     weights.InitVector( &dense_weights );
     decoder.SetWeights( dense_weights );
-    //cout << "use_shell " << dense_weights[FD::Convert("use_shell")] << endl;
+    //if ( sid > 100 ) break;
+    // handling input..
     strs.clear();
     boost::split( strs, in, boost::is_any_of("\t") );
+    // grammar
     psg = boost::replace_all_copy( strs[2], " __NEXT_RULE__ ", "\n" ); psg += "\n";
-    //decoder.SetId(i);
     decoder.SetSentenceGrammar( psg );
     decoder.Decode( strs[0], &observer );
-    KBestList* kb = observer.getkb();
+    KBestList* kb = observer.GetKBest();
+    // reference
+    ref_strs.clear(); ref_ids.clear();
+    boost::split( ref_strs, strs[1], boost::is_any_of(" ") );
+    register_and_convert( ref_strs, ref_ids );
+    // scoring kbest
+    double score = 0;
+    Scores scores;
     for ( size_t i = 0; i < k; i++ ) {
-      cout << i << " ";
-      for (size_t j = 0; j < kb->sents[i].size(); ++j ) {
-        cout << TD::Convert( kb->sents[i][j] ) << " ";
-      }
-      cout << kb->scores[i];
-      cout << endl;
+      NgramCounts counts = make_ngram_counts( ref_ids, kb->sents[i], 4 );
+      score = smooth_bleu( counts,
+                           ref_ids.size(),
+                           kb->sents[i].size(), N );
+      ScorePair sp( kb->scores[i], score );
+      scores.push_back( sp );
+      //cout << "'" << TD::GetString( ref_ids ) << "' vs '" << TD::GetString( kb->sents[i] ) << "' SCORE=" << score << endl;
+      //cout << kb->feats[i] << endl;
     }
-    lambdas.set_value( FD::Convert("use_shell"), 1 );
-    lambdas.set_value( FD::Convert("use_a"), 1 );
+    //cout << "###" << endl;
+    SofiaLearner learner;
+    learner.Init( sid, kb->feats, scores );
+    learner.Update(lambdas);
+    // initializing learner
+    // TODO
+    // updating weights
+    //lambdas.set_value( FD::Convert("use_shell"), 1 );
+    //lambdas.set_value( FD::Convert("use_a"), 1 );
     //print_FD();
+    sid += 1; // TODO does cdec count this already?
   }
-  
+
   weights.WriteToFile( "weights-final", true );
+  
+  cerr << endl;
 
   return 0;
 }
 
-    // next: FMap, ->sofia, ->FMap, -> Weights
-    // learner gets all used features (binary! and dense (logprob is sum of logprobs!))
-    // only for those feats with weight > 0 after learning
-    // see decoder line 548
-
-
-/*
- * TODO
- *  iterate over training set, for t=1..T
- *  mapred impl
- *   mapper:  main
- *   reducer: average weights, global NgramCounts for approx. bleu
- *  1st cut: hadoop streaming?
- *  batch, non-batch in the mapper (what sofia gets, regenerated Kbest lists)
- *  filter kbest yes/no
- *  sofia: --eta_type explicit
- *  psg preparation source\tref\tpsg
- *  set reference for cdec?
- *  LM
- *   shared?
- *   startup?
- *  X reference(s) for *bleu!?
- *  kbest nicer (do not iterate twice)!? -> shared_ptr
- *  multipartite ranking
- *  weights! global, per sentence from global, featuremap
- *  const decl...
- *  sketch: batch/iter options
- *  weights.cc: why wv_?
- *  --weights cmd line (for iterations): script to call again/hadoop streaming?
- *  I do not need to remember features, cdec does
- *  resocre hg?
- *  do not use Decoder::Decode!?
- *  what happens if feature not in FD? 0???
- */
-
-/*
- * PROBLEMS
- *  cdec kbest vs 1best (no -k param)
- *  FD, Weights::wv_ grow too large, see utils/weights.cc; decoder/hg.h; decoder/scfg_translator.cc; utils/fdict.cc!?
- *  sparse vector instead of vector<double> for weights in Decoder?
- *  PhraseModel_* features for psg!? (seem to be generated)
- */
-
diff --git a/dtrain/dtrain.ini b/dtrain/dtrain.ini
deleted file mode 100644
index e69de29b..00000000
--- a/dtrain/dtrain.ini
+++ /dev/null
diff --git a/dtrain/in b/dtrain/in
deleted file mode 100644
index 294d009b..00000000
--- a/dtrain/in
+++ /dev/null
@@ -1,2 +0,0 @@
-vorrichtung	means	[X] ||| vorrichtung ||| apparatus ||| LogP=-200 ||| 0-0 __NEXT_RULE__ [X] ||| vorrichtung ||| means ||| LogP=-101 ||| 0-0
-eintest	test	[X] ||| eintest ||| test ||| LogP=-200 ||| 0-0 __NEXT_RULE__ [X] ||| eintest ||| xxx ||| LogP=-101 ||| 0-0
diff --git a/dtrain/in.toy b/dtrain/in.toy
deleted file mode 100644
index 71b736a6..00000000
--- a/dtrain/in.toy
+++ /dev/null
@@ -1,2 +0,0 @@
-ich sah ein kleines haus	i saw a little shell	[S] ||| [NP,1] [VP,2] ||| [1] [2] ||| logp=0 __NEXT_RULE__ [NP] ||| ich ||| i ||| logp=0 __NEXT_RULE__ [NP] ||| ein [NN,1] ||| a [1] ||| logp=0 __NEXT_RULE__ [NN] ||| [JJ,1] haus ||| [1] house ||| logp=-0.5 __NEXT_RULE__ [NN] ||| [JJ,1] haus ||| [1] shell ||| logp=-0.5 use_shell=1 __NEXT_RULE__ [JJ] ||| kleines ||| small ||| logp=-0.5 __NEXT_RULE__ [JJ] ||| kleines ||| little ||| logp=-0.9 __NEXT_RULE__ [JJ] ||| grosses ||| big ||| logp=-0.5 __NEXT_RULE__ [JJ] ||| grosses ||| large ||| logp=-1.5 __NEXT_RULE__ [VP] ||| [V,1] [NP,2] ||| [1] [2] ||| logp=0 __NEXT_RULE__ [V] ||| sah ||| saw ||| logp=0 __NEXT_RULE__ [V] ||| fand ||| found ||| logp=0
-ich fand ein grosses haus	i found a little shell	[S] ||| [NP,1] [VP,2] ||| [1] [2] ||| logp=0 __NEXT_RULE__ [NP] ||| ich ||| i ||| logp=0 __NEXT_RULE__ [NP] ||| ein [NN,1] ||| a [1] ||| logp=0 use_a=1 __NEXT_RULE__ [NN] ||| [JJ,1] haus ||| [1] house ||| logp=-1000 __NEXT_RULE__ [NN] ||| [JJ,1] haus ||| [1] shell ||| logp=-1 use_shell=1 __NEXT_RULE__ [JJ] ||| kleines ||| small ||| logp=-0.5 __NEXT_RULE__ [JJ] ||| kleines ||| little ||| logp=-0.9 __NEXT_RULE__ [JJ] ||| grosses ||| big ||| logp=-0.5 __NEXT_RULE__ [JJ] ||| grosses ||| large ||| logp=-1.5 __NEXT_RULE__ [VP] ||| [V,1] [NP,2] ||| [1] [2] ||| logp=0 __NEXT_RULE__ [V] ||| sah ||| saw ||| logp=0 __NEXT_RULE__ [V] ||| fand ||| found ||| logp=0
diff --git a/dtrain/test.sh b/dtrain/test.sh
index a0ebb420..ad45bd1e 100755
--- a/dtrain/test.sh
+++ b/dtrain/test.sh
@@ -1,4 +1,4 @@
 #!/bin/sh
 
-./dtrain -c cdec.ini -k 4 < in.toy
+./dtrain -c data/cdec.ini -k 4 < data/in.blunsom08 #< data/in.toy
author	Patrick Simianer <p@simianer.de>	2011-07-29 00:48:04 +0200
committer	Patrick Simianer <p@simianer.de>	2011-09-23 19:13:57 +0200
commit	b732e625ffcf59da8440db577183110488f5c4b7 (patch)
tree	863b59dea0ffd927621751da5c53298924c2ee2f /dtrain
parent	05c41075d0018ca6142f7ba593742fbadfecdf65 (diff)