summaryrefslogtreecommitdiff
path: root/dtrain
diff options
context:
space:
mode:
authorPatrick Simianer <p@simianer.de>2011-07-29 00:48:04 +0200
committerPatrick Simianer <p@simianer.de>2011-09-23 19:13:57 +0200
commitb732e625ffcf59da8440db577183110488f5c4b7 (patch)
tree863b59dea0ffd927621751da5c53298924c2ee2f /dtrain
parent05c41075d0018ca6142f7ba593742fbadfecdf65 (diff)
first cut for sofia-ml, little change in utils/dict.h, coarse refactoring
Diffstat (limited to 'dtrain')
-rw-r--r--dtrain/Makefile.am11
-rw-r--r--dtrain/cdec.ini4
-rw-r--r--dtrain/dcommon.cc311
-rw-r--r--dtrain/dcommon.h230
-rw-r--r--dtrain/dtest.cc95
-rw-r--r--dtrain/dtrain.cc595
-rw-r--r--dtrain/dtrain.ini0
-rw-r--r--dtrain/in2
-rw-r--r--dtrain/in.toy2
-rwxr-xr-xdtrain/test.sh2
10 files changed, 704 insertions, 548 deletions
diff --git a/dtrain/Makefile.am b/dtrain/Makefile.am
index daa20cf3..c3f14bb0 100644
--- a/dtrain/Makefile.am
+++ b/dtrain/Makefile.am
@@ -1,6 +1,11 @@
-bin_PROGRAMS = dtrain
+# TODO I'm sure I can leave something out.
+bin_PROGRAMS = dtrain dtest
-dtrain_SOURCES = dtrain.cc
-dtrain_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz
+dtrain_SOURCES = dtrain.cc dcommon.cc
+dtrain_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz
+
+dtest_SOURCES = dtest.cc dcommon.cc
+dtest_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz
AM_CPPFLAGS = -W -Wall -Wno-sign-compare -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval
+
diff --git a/dtrain/cdec.ini b/dtrain/cdec.ini
deleted file mode 100644
index 92a4a335..00000000
--- a/dtrain/cdec.ini
+++ /dev/null
@@ -1,4 +0,0 @@
-formalism=scfg
-#feature_function=KLanguageModel europarl-v6.tok.lc.s-tag.en.arpa.kenlm.v4.mma
-#k_best=2
-#add_pass_through_rules=true
diff --git a/dtrain/dcommon.cc b/dtrain/dcommon.cc
new file mode 100644
index 00000000..a6bdc92c
--- /dev/null
+++ b/dtrain/dcommon.cc
@@ -0,0 +1,311 @@
+#include "dcommon.h"
+
+
+
+/*
+ * make_ngrams
+ *
+ */
+typedef map<vector<WordID>, size_t> Ngrams;
+Ngrams
+make_ngrams( vector<WordID>& s, size_t N )
+{
+ Ngrams ngrams;
+ vector<WordID> ng;
+ for ( size_t i = 0; i < s.size(); i++ ) {
+ ng.clear();
+ for ( size_t j = i; j < min( i+N, s.size() ); j++ ) {
+ ng.push_back( s[j] );
+ ngrams[ng]++;
+ }
+ }
+ return ngrams;
+}
+
+
+
+
+
+/*
+ * ngram_matches
+ *
+ */
+NgramCounts
+make_ngram_counts( vector<WordID> hyp, vector<WordID> ref, size_t N )
+{
+ Ngrams hyp_ngrams = make_ngrams( hyp, N );
+ Ngrams ref_ngrams = make_ngrams( ref, N );
+ NgramCounts counts( N );
+ Ngrams::iterator it;
+ Ngrams::iterator ti;
+ for ( it = hyp_ngrams.begin(); it != hyp_ngrams.end(); it++ ) {
+ ti = ref_ngrams.find( it->first );
+ if ( ti != ref_ngrams.end() ) {
+ counts.add( it->second, ti->second, it->first.size() - 1 );
+ } else {
+ counts.add( it->second, 0, it->first.size() - 1 );
+ }
+ }
+ return counts;
+}
+
+
+/*
+ * brevity_penaly
+ *
+ */
+double
+brevity_penaly( const size_t hyp_len, const size_t ref_len )
+{
+ if ( hyp_len > ref_len ) return 1;
+ return exp( 1 - (double)ref_len/(double)hyp_len );
+}
+
+
+/*
+ * bleu
+ * as in "BLEU: a Method for Automatic Evaluation of Machine Translation" (Papineni et al. '02)
+ * page TODO
+ * 0 if for N one of the counts = 0
+ */
+double
+bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len,
+ size_t N, vector<float> weights )
+{
+ if ( hyp_len == 0 || ref_len == 0 ) return 0;
+ if ( ref_len < N ) N = ref_len;
+ float N_ = (float)N;
+ if ( weights.empty() )
+ {
+ for ( size_t i = 0; i < N; i++ ) weights.push_back( 1/N_ );
+ }
+ double sum = 0;
+ for ( size_t i = 0; i < N; i++ ) {
+ if ( counts.clipped[i] == 0 || counts.sum[i] == 0 ) return 0;
+ sum += weights[i] * log( (double)counts.clipped[i] / (double)counts.sum[i] );
+ }
+ return brevity_penaly( hyp_len, ref_len ) * exp( sum );
+}
+
+
+/*
+ * stupid_bleu
+ * as in "ORANGE: a Method for Evaluating Automatic Evaluation Metrics for Machine Translation (Lin & Och '04)
+ * page TODO
+ * 0 iff no 1gram match
+ */
+double
+stupid_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len,
+ size_t N, vector<float> weights )
+{
+ if ( hyp_len == 0 || ref_len == 0 ) return 0;
+ if ( ref_len < N ) N = ref_len;
+ float N_ = (float)N;
+ if ( weights.empty() )
+ {
+ for ( size_t i = 0; i < N; i++ ) weights.push_back( 1/N_ );
+ }
+ double sum = 0;
+ float add = 0;
+ for ( size_t i = 0; i < N; i++ ) {
+ if ( i == 1 ) add = 1;
+ sum += weights[i] * log( ((double)counts.clipped[i] + add) / ((double)counts.sum[i] + add) );
+ }
+ return brevity_penaly( hyp_len, ref_len ) * exp( sum );
+}
+
+
+/*
+ * smooth_bleu
+ * as in "An End-to-End Discriminative Approach to Machine Translation" (Liang et al. '06)
+ * page TODO
+ * max. 0.9375
+ */
+double
+smooth_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len,
+ const size_t N, vector<float> weights )
+{
+ if ( hyp_len == 0 || ref_len == 0 ) return 0;
+ float N_ = (float)N;
+ if ( weights.empty() )
+ {
+ for ( size_t i = 0; i < N; i++ ) weights.push_back( 1/N_ );
+ }
+ double sum = 0;
+ float j = 1;
+ for ( size_t i = 0; i < N; i++ ) {
+ if ( counts.clipped[i] == 0 || counts.sum[i] == 0) continue;
+ sum += exp((weights[i] * log((double)counts.clipped[i]/(double)counts.sum[i]))) / pow( 2, N_-j+1 );
+ j++;
+ }
+ return brevity_penaly( hyp_len, ref_len ) * sum;
+}
+
+
+/*
+ * approx_bleu
+ * as in "Online Large-Margin Training for Statistical Machine Translation" (Watanabe et al. '07)
+ * page TODO
+ *
+ */
+double
+approx_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len,
+ const size_t N, vector<float> weights )
+{
+ return bleu( counts, hyp_len, ref_len, N, weights );
+}
+
+
+/*
+ * register_and_convert
+ *
+ */
+void
+register_and_convert(const vector<string>& strs, vector<WordID>& ids)
+{
+ vector<string>::const_iterator it;
+ for ( it = strs.begin(); it < strs.end(); it++ ) {
+ ids.push_back( TD::Convert( *it ) );
+ }
+}
+
+
+
+
+/*
+ *
+ *
+ */
+void
+test_ngrams()
+{
+ cout << "Testing ngrams..." << endl << endl;
+ size_t N = 5;
+ cout << "N = " << N << endl;
+ vector<int> a; // hyp
+ vector<int> b; // ref
+ cout << "a ";
+ for (size_t i = 1; i <= 8; i++) {
+ cout << i << " ";
+ a.push_back(i);
+ }
+ cout << endl << "b ";
+ for (size_t i = 1; i <= 4; i++) {
+ cout << i << " ";
+ b.push_back(i);
+ }
+ cout << endl << endl;
+ NgramCounts c = make_ngram_counts( a, b, N );
+ assert( c.clipped[N-1] == 0 );
+ assert( c.sum[N-1] == 4 );
+ c.print();
+ c += c;
+ cout << endl;
+ c.print();
+ cout << endl;
+}
+
+
+/*
+ *
+ *
+ */
+double
+approx_equal( double x, double y )
+{
+ const double EPSILON = 1E-5;
+ if ( x == 0 ) return fabs( y ) <= EPSILON;
+ if ( y == 0 ) return fabs( x ) <= EPSILON;
+ return fabs( x - y ) / max( fabs(x), fabs(y) ) <= EPSILON;
+}
+
+
+/*
+ *
+ *
+ */
+void
+test_metrics()
+{
+ cout << "Testing metrics..." << endl << endl;
+ using namespace boost::assign;
+ vector<string> a, b;
+ vector<double> expect_vanilla, expect_smooth, expect_stupid;
+ a += "a a a a", "a a a a", "a", "a", "b", "a a a a", "a a", "a a a", "a b a"; // hyp
+ b += "b b b b", "a a a a", "a", "b", "b b b b", "a", "a a", "a a a", "a b b"; // ref
+ expect_vanilla += 0, 1, 1, 0, 0, .25, 1, 1, 0;
+ expect_smooth += 0, .9375, .0625, 0, .00311169, .0441942, .1875, .4375, .161587;
+ expect_stupid += 0, 1, 1, 0, .0497871, .25, 1, 1, .605707;
+ vector<string> aa, bb;
+ vector<WordID> aai, bbi;
+ double vanilla, smooth, stupid;
+ size_t N = 4;
+ cout << "N = " << N << endl << endl;
+ for ( size_t i = 0; i < a.size(); i++ ) {
+ cout << " hyp: " << a[i] << endl;
+ cout << " ref: " << b[i] << endl;
+ aa.clear(); bb.clear(); aai.clear(); bbi.clear();
+ boost::split( aa, a[i], boost::is_any_of(" ") );
+ boost::split( bb, b[i], boost::is_any_of(" ") );
+ register_and_convert( aa, aai );
+ register_and_convert( bb, bbi );
+ NgramCounts counts = make_ngram_counts( aai, bbi, N );
+ vanilla = bleu( counts, aa.size(), bb.size(), N);
+ smooth = smooth_bleu( counts, aa.size(), bb.size(), N);
+ stupid = stupid_bleu( counts, aa.size(), bb.size(), N);
+ assert( approx_equal(vanilla, expect_vanilla[i]) );
+ assert( approx_equal(smooth, expect_smooth[i]) );
+ assert( approx_equal(stupid, expect_stupid[i]) );
+ cout << setw(14) << "bleu = " << vanilla << endl;
+ cout << setw(14) << "smooth bleu = " << smooth << endl;
+ cout << setw(14) << "stupid bleu = " << stupid << endl << endl;
+ }
+ cout << endl;
+}
+
+/*
+ *
+ *
+ */
+void
+test_SetWeights()
+{
+ cout << "Testing Weights::SetWeight..." << endl << endl;
+ Weights weights;
+ SparseVector<double> lambdas;
+ weights.InitSparseVector( &lambdas );
+ weights.SetWeight( &lambdas, "test", 0 );
+ weights.SetWeight( &lambdas, "test1", 1 );
+ WordID fid = FD::Convert( "test2" );
+ weights.SetWeight( &lambdas, fid, 2 );
+ string fn = "weights-test";
+ cout << "FD::NumFeats() " << FD::NumFeats() << endl;
+ assert( FD::NumFeats() == 4 );
+ weights.WriteToFile( fn, true );
+ cout << endl;
+}
+
+
+/*
+ *
+ *
+ */
+void
+run_tests()
+{
+ cout << endl;
+ test_ngrams();
+ cout << endl;
+ test_metrics();
+ cout << endl;
+ test_SetWeights();
+ exit(0);
+}
+
+
+void
+print_FD()
+{
+ for ( size_t i = 0; i < FD::NumFeats(); i++ ) cout << FD::Convert(i)<< endl;
+}
+
diff --git a/dtrain/dcommon.h b/dtrain/dcommon.h
new file mode 100644
index 00000000..ff796642
--- /dev/null
+++ b/dtrain/dcommon.h
@@ -0,0 +1,230 @@
+#include <sstream>
+#include <iostream>
+#include <vector>
+#include <cassert>
+#include <cmath>
+
+#include "config.h"
+
+#include <boost/shared_ptr.hpp>
+#include <boost/algorithm/string.hpp>
+#include <boost/program_options.hpp>
+#include <boost/program_options/variables_map.hpp>
+
+#include "sentence_metadata.h"
+#include "scorer.h"
+#include "verbose.h"
+#include "viterbi.h"
+#include "hg.h"
+#include "prob.h"
+#include "kbest.h"
+#include "ff_register.h"
+#include "decoder.h"
+#include "filelib.h"
+#include "fdict.h"
+#include "weights.h"
+#include "sparse_vector.h"
+#include "sampler.h"
+
+using namespace std;
+namespace po = boost::program_options;
+
+
+struct ScorePair
+{
+ ScorePair(double modelscore, double score) : modelscore_(modelscore), score_(score) {}
+ double modelscore_, score_;
+ double GetModelScore() { return modelscore_; }
+ double GetScore() { return score_; }
+};
+typedef vector<ScorePair> Scores;
+
+
+/*
+ * KBestGetter
+ *
+ */
+struct KBestList {
+ vector<SparseVector<double> > feats;
+ vector<vector<WordID> > sents;
+ vector<double> scores;
+};
+struct KBestGetter : public DecoderObserver
+{
+ KBestGetter( const size_t k ) : k_(k) {}
+ const size_t k_;
+ KBestList kb;
+
+ virtual void
+ NotifyTranslationForest(const SentenceMetadata& smeta, Hypergraph* hg)
+ {
+ GetKBest(smeta.GetSentenceID(), *hg);
+ }
+
+ KBestList* GetKBest() { return &kb; }
+
+ void
+ GetKBest(int sent_id, const Hypergraph& forest)
+ {
+ kb.scores.clear();
+ kb.sents.clear();
+ kb.feats.clear();
+ KBest::KBestDerivations<vector<WordID>, ESentenceTraversal> kbest( forest, k_ );
+ for ( size_t i = 0; i < k_; ++i ) {
+ const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal>::Derivation* d =
+ kbest.LazyKthBest( forest.nodes_.size() - 1, i );
+ if (!d) break;
+ kb.sents.push_back( d->yield);
+ kb.feats.push_back( d->feature_values );
+ kb.scores.push_back( d->score );
+ }
+ }
+};
+
+
+/*
+ * NgramCounts
+ *
+ */
+struct NgramCounts
+{
+ NgramCounts( const size_t N ) : N_( N ) {
+ reset();
+ }
+ size_t N_;
+ map<size_t, size_t> clipped;
+ map<size_t, size_t> sum;
+
+ void
+ operator+=( const NgramCounts& rhs )
+ {
+ assert( N_ == rhs.N_ );
+ for ( size_t i = 0; i < N_; i++ ) {
+ this->clipped[i] += rhs.clipped.find(i)->second;
+ this->sum[i] += rhs.sum.find(i)->second;
+ }
+ }
+
+ void
+ add( size_t count, size_t ref_count, size_t i )
+ {
+ assert( i < N_ );
+ if ( count > ref_count ) {
+ clipped[i] += ref_count;
+ sum[i] += count;
+ } else {
+ clipped[i] += count;
+ sum[i] += count;
+ }
+ }
+
+ void
+ reset()
+ {
+ size_t i;
+ for ( i = 0; i < N_; i++ ) {
+ clipped[i] = 0;
+ sum[i] = 0;
+ }
+ }
+
+ void
+ print()
+ {
+ for ( size_t i = 0; i < N_; i++ ) {
+ cout << i+1 << "grams (clipped):\t" << clipped[i] << endl;
+ cout << i+1 << "grams:\t\t\t" << sum[i] << endl;
+ }
+ }
+};
+
+
+/*class Learnerx
+{
+ public:
+ virtual void Init(const vector<SparseVector<double> >& kbest, const Scores& scores) {};
+ virtual void Update(SparseVector<double>& lambdas);
+};*/
+
+class SofiaLearner //: public Learnerx FIXME
+{
+ // TODO bool invert_score
+ public:
+ void
+ Init( const size_t sid, const vector<SparseVector<double> >& kbest, /*const*/ Scores& scores )
+ {
+ assert( kbest.size() == scores.size() );
+ ofstream o;
+ unlink( "/tmo/sofia_ml_training" );
+ o.open( "/tmp/sofia_ml_training", ios::trunc ); // TODO randomize, filename exists
+ int fid = 0;
+ map<int,int>::iterator ff;
+ for ( size_t k = 0; k < kbest.size(); ++k ) {
+ SparseVector<double>::const_iterator it = kbest[k].begin();
+ o << scores[k].GetScore();
+ for ( ; it != kbest[k].end(); ++it) {
+ ff = fmap.find( it->first );
+ if ( ff == fmap.end() ) {
+ fmap.insert( pair<int,int>(it->first, fid) );
+ fmap1.insert( pair<int,int>(fid, it->first) );
+ fid++;
+ }
+ o << " "<< fmap[it->first] << ":" << it->second;
+ }
+ o << endl;
+ }
+ o.close();
+ }
+
+ void
+ Update(SparseVector<double>& lambdas)
+ {
+ string call = "./sofia-ml --training_file /tmp/sofia_ml_training --model_out /tmp/sofia_ml_model --loop_type stochastic --lambda 100 --dimensionality ";
+ std::stringstream out;
+ out << fmap.size();
+ call += out.str();
+ call += " &>/dev/null";
+ system ( call.c_str() );
+ ifstream i;
+ unlink( "/tmo/sofia_ml_model" );
+ i.open( "/tmp/sofia_ml_model", ios::in );
+ string model;
+ getline( i, model );
+ //cout << model << endl;
+ vector<string> strs;
+ boost::split( strs, model, boost::is_any_of(" ") );
+ int j = 0;
+ for ( vector<string>::iterator it = strs.begin(); it != strs.end(); ++it ) {
+ lambdas.set_value(fmap1[j], atof( it->c_str() ) );
+ j++;
+ }
+
+ }
+
+ private:
+ map<int,int> fmap;
+ map<int,int> fmap1;
+};
+
+typedef map<vector<WordID>, size_t> Ngrams;
+Ngrams make_ngrams( vector<WordID>& s, size_t N );
+NgramCounts make_ngram_counts( vector<WordID> hyp, vector<WordID> ref, size_t N );
+double brevity_penaly( const size_t hyp_len, const size_t ref_len );
+double bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len, size_t N, vector<float> weights = vector<float>() );
+double stupid_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len, size_t N, vector<float> weights = vector<float>() );
+double smooth_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len, const size_t N, vector<float> weights = vector<float>() );
+double approx_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len, const size_t N, vector<float> weights = vector<float>() );
+void register_and_convert(const vector<string>& strs, vector<WordID>& ids);
+
+
+
+
+void print_FD();
+void run_tests();
+void test_SetWeights();
+#include <boost/assign/std/vector.hpp>
+#include <iomanip>
+void test_metrics();
+double approx_equal( double x, double y );
+void test_ngrams();
+
diff --git a/dtrain/dtest.cc b/dtrain/dtest.cc
new file mode 100644
index 00000000..9975794f
--- /dev/null
+++ b/dtrain/dtest.cc
@@ -0,0 +1,95 @@
+#include "dcommon.h"
+
+
+
+
+/*
+ * init
+ *
+ */
+bool
+init(int argc, char** argv, po::variables_map* conf)
+{
+ int N;
+ po::options_description opts( "Options" );
+ opts.add_options()
+ ( "decoder-config,c", po::value<string>(), "configuration file for cdec" )
+ ( "weights,w", po::value<string>(), "weights file")
+ ( "ngrams,n", po::value<int>(&N)->default_value(4), "N for Ngrams (default 5)" );
+ po::options_description cmdline_options;
+ cmdline_options.add(opts);
+ po::store( parse_command_line(argc, argv, cmdline_options), *conf );
+ po::notify( *conf );
+ if ( ! (conf->count("decoder-config") || conf->count("weights")) ) {
+ cerr << cmdline_options << endl;
+ return false;
+ }
+ return true;
+}
+
+
+/*
+ * main
+ *
+ */
+int
+main(int argc, char** argv)
+{
+ SetSilent(true);
+ po::variables_map conf;
+ if (!init(argc, argv, &conf)) return 1;
+ register_feature_functions();
+ size_t k = 1;
+ ReadFile ini_rf(conf["decoder-config"].as<string>());
+ Decoder decoder(ini_rf.stream());
+ KBestGetter observer(k);
+ size_t N = conf["ngrams"].as<int>();
+
+ Weights weights;
+ weights.InitFromFile(conf["weights"].as<string>());
+ vector<double> w;
+ weights.InitVector(&w);
+ decoder.SetWeights(w);
+
+ vector<string> strs, ref_strs;
+ vector<WordID> ref_ids;
+ string in, psg;
+ size_t sid = 0;
+ double overall = 0.0;
+ cerr << "(1 dot equals 100 lines of input)" << endl;
+ while( getline(cin, in) ) {
+ if ( (sid+1) % 100 == 0 ) {
+ cerr << ".";
+ if ( (sid+1)%1000 == 0 ) cerr << endl;
+ }
+ if ( sid > 5000 ) break;
+ strs.clear();
+ boost::split( strs, in, boost::is_any_of("\t") );
+ // grammar
+ psg = boost::replace_all_copy( strs[2], " __NEXT_RULE__ ", "\n" ); psg += "\n";
+ decoder.SetSentenceGrammar( psg );
+ decoder.Decode( strs[0], &observer );
+ KBestList* kb = observer.GetKBest();
+ // reference
+ ref_strs.clear(); ref_ids.clear();
+ boost::split( ref_strs, strs[1], boost::is_any_of(" ") );
+ register_and_convert( ref_strs, ref_ids );
+ // scoring kbest
+ double score = 0;
+ Scores scores;
+ NgramCounts counts = make_ngram_counts( ref_ids, kb->sents[0], 4 );
+ score = smooth_bleu( counts,
+ ref_ids.size(),
+ kb->sents[0].size(), N );
+ ScorePair sp( kb->scores[0], score );
+ scores.push_back( sp );
+ //cout << TD::GetString( kb->sents[0] ) << endl;
+ overall += score;
+ sid += 1;
+ }
+ cout << "Average score: " << overall/(sid+1) << endl;
+ cerr << endl;
+
+ return 0;
+}
+
diff --git a/dtrain/dtrain.cc b/dtrain/dtrain.cc
index 8464a429..95fc81af 100644
--- a/dtrain/dtrain.cc
+++ b/dtrain/dtrain.cc
@@ -1,33 +1,6 @@
-#include <sstream>
-#include <iostream>
-#include <vector>
-#include <cassert>
-#include <cmath>
+#include "dcommon.h"
-#include "config.h"
-#include <boost/shared_ptr.hpp>
-#include <boost/algorithm/string.hpp>
-#include <boost/program_options.hpp>
-#include <boost/program_options/variables_map.hpp>
-
-#include "sentence_metadata.h"
-#include "scorer.h"
-#include "verbose.h"
-#include "viterbi.h"
-#include "hg.h"
-#include "prob.h"
-#include "kbest.h"
-#include "ff_register.h"
-#include "decoder.h"
-#include "filelib.h"
-#include "fdict.h"
-#include "weights.h"
-#include "sparse_vector.h"
-#include "sampler.h"
-
-using namespace std;
-namespace boostpo = boost::program_options;
/*
@@ -35,19 +8,19 @@ namespace boostpo = boost::program_options;
*
*/
bool
-init(int argc, char** argv, boostpo::variables_map* conf)
+init(int argc, char** argv, po::variables_map* conf)
{
- boostpo::options_description opts( "Options" );
+ po::options_description opts( "Options" );
opts.add_options()
- ( "decoder-config,c", boostpo::value<string>(), "configuration file for cdec" )
- ( "kbest,k", boostpo::value<size_t>(), "k for kbest" )
- ( "ngrams,n", boostpo::value<int>(), "n for Ngrams" )
- ( "filter,f", boostpo::value<string>(), "filter kbest list" )
+ ( "decoder-config,c", po::value<string>(), "configuration file for cdec" )
+ ( "kbest,k", po::value<size_t>(), "k for kbest" )
+ ( "ngrams,n", po::value<int>(), "n for Ngrams" )
+ ( "filter,f", po::value<string>(), "filter kbest list" )
( "test", "run tests and exit");
- boostpo::options_description cmdline_options;
+ po::options_description cmdline_options;
cmdline_options.add(opts);
- boostpo::store( parse_command_line(argc, argv, cmdline_options), *conf );
- boostpo::notify( *conf );
+ po::store( parse_command_line(argc, argv, cmdline_options), *conf );
+ po::notify( *conf );
if ( ! (conf->count("decoder-config") || conf->count("test")) ) {
cerr << cmdline_options << endl;
return false;
@@ -57,451 +30,14 @@ init(int argc, char** argv, boostpo::variables_map* conf)
/*
- * KBestGetter
- *
- */
-struct KBestList {
- vector<SparseVector<double> > feats;
- vector<vector<WordID> > sents;
- vector<double> scores;
-};
-struct KBestGetter : public DecoderObserver
-{
- KBestGetter( const size_t k ) : k_(k) {}
- const size_t k_;
- KBestList kb;
-
- virtual void
- NotifyTranslationForest(const SentenceMetadata& smeta, Hypergraph* hg)
- {
- GetKBest(smeta.GetSentenceID(), *hg);
- }
-
- KBestList* getkb() { return &kb; }
-
- void
- GetKBest(int sent_id, const Hypergraph& forest)
- {
- kb.scores.clear();
- kb.sents.clear();
- kb.feats.clear();
- KBest::KBestDerivations<vector<WordID>, ESentenceTraversal> kbest( forest, k_ );
- for ( size_t i = 0; i < k_; ++i ) {
- const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal>::Derivation* d =
- kbest.LazyKthBest( forest.nodes_.size() - 1, i );
- if (!d) break;
- kb.sents.push_back( d->yield);
- kb.feats.push_back( d->feature_values );
- kb.scores.push_back( d->score );
- }
- }
-};
-
-
-/*
- * write_training_data_for_sofia
- *
- */
-void
-sofia_write_training_data()
-{
- // TODO
-}
-
-
-/*
- * call_sofia
- *
- */
-void
-sofia_call()
-{
- // TODO
-}
-
-
-/*
- * sofia_model2weights
- *
- */
-void
-sofia_read_model()
-{
- // TODO
-}
-
-
-/*
- * make_ngrams
- *
- */
-typedef map<vector<WordID>, size_t> Ngrams;
-Ngrams
-make_ngrams( vector<WordID>& s, size_t N )
-{
- Ngrams ngrams;
- vector<WordID> ng;
- for ( size_t i = 0; i < s.size(); i++ ) {
- ng.clear();
- for ( size_t j = i; j < min( i+N, s.size() ); j++ ) {
- ng.push_back( s[j] );
- ngrams[ng]++;
- }
- }
- return ngrams;
-}
-
-
-/*
- * NgramCounts
- *
- */
-struct NgramCounts
-{
- NgramCounts( const size_t N ) : N_( N ) {
- reset();
- }
- size_t N_;
- map<size_t, size_t> clipped;
- map<size_t, size_t> sum;
-
- void
- operator+=( const NgramCounts& rhs )
- {
- assert( N_ == rhs.N_ );
- for ( size_t i = 0; i < N_; i++ ) {
- this->clipped[i] += rhs.clipped.find(i)->second;
- this->sum[i] += rhs.sum.find(i)->second;
- }
- }
-
- void
- add( size_t count, size_t ref_count, size_t i )
- {
- assert( i < N_ );
- if ( count > ref_count ) {
- clipped[i] += ref_count;
- sum[i] += count;
- } else {
- clipped[i] += count;
- sum[i] += count;
- }
- }
-
- void
- reset()
- {
- size_t i;
- for ( i = 0; i < N_; i++ ) {
- clipped[i] = 0;
- sum[i] = 0;
- }
- }
-
- void
- print()
- {
- for ( size_t i = 0; i < N_; i++ ) {
- cout << i+1 << "grams (clipped):\t" << clipped[i] << endl;
- cout << i+1 << "grams:\t\t\t" << sum[i] << endl;
- }
- }
-};
-
-
-/*
- * ngram_matches
- *
- */
-NgramCounts
-make_ngram_counts( vector<WordID> hyp, vector<WordID> ref, size_t N )
-{
- Ngrams hyp_ngrams = make_ngrams( hyp, N );
- Ngrams ref_ngrams = make_ngrams( ref, N );
- NgramCounts counts( N );
- Ngrams::iterator it;
- Ngrams::iterator ti;
- for ( it = hyp_ngrams.begin(); it != hyp_ngrams.end(); it++ ) {
- ti = ref_ngrams.find( it->first );
- if ( ti != ref_ngrams.end() ) {
- counts.add( it->second, ti->second, it->first.size() - 1 );
- } else {
- counts.add( it->second, 0, it->first.size() - 1 );
- }
- }
- return counts;
-}
-
-
-/*
- * brevity_penaly
- *
- */
-double
-brevity_penaly( const size_t hyp_len, const size_t ref_len )
-{
- if ( hyp_len > ref_len ) return 1;
- return exp( 1 - (double)ref_len/(double)hyp_len );
-}
-
-
-/*
- * bleu
- * as in "BLEU: a Method for Automatic Evaluation of Machine Translation" (Papineni et al. '02)
- * page TODO
- * 0 if for N one of the counts = 0
- */
-double
-bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len,
- size_t N, vector<float> weights = vector<float>() )
-{
- if ( hyp_len == 0 || ref_len == 0 ) return 0;
- if ( ref_len < N ) N = ref_len;
- float N_ = (float)N;
- if ( weights.empty() )
- {
- for ( size_t i = 0; i < N; i++ ) weights.push_back( 1/N_ );
- }
- double sum = 0;
- for ( size_t i = 0; i < N; i++ ) {
- if ( counts.clipped[i] == 0 || counts.sum[i] == 0 ) return 0;
- sum += weights[i] * log( (double)counts.clipped[i] / (double)counts.sum[i] );
- }
- return brevity_penaly( hyp_len, ref_len ) * exp( sum );
-}
-
-
-/*
- * stupid_bleu
- * as in "ORANGE: a Method for Evaluating Automatic Evaluation Metrics for Machine Translation (Lin & Och '04)
- * page TODO
- * 0 iff no 1gram match
- */
-double
-stupid_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len,
- size_t N, vector<float> weights = vector<float>() )
-{
- if ( hyp_len == 0 || ref_len == 0 ) return 0;
- if ( ref_len < N ) N = ref_len;
- float N_ = (float)N;
- if ( weights.empty() )
- {
- for ( size_t i = 0; i < N; i++ ) weights.push_back( 1/N_ );
- }
- double sum = 0;
- float add = 0;
- for ( size_t i = 0; i < N; i++ ) {
- if ( i == 1 ) add = 1;
- sum += weights[i] * log( ((double)counts.clipped[i] + add) / ((double)counts.sum[i] + add) );
- }
- return brevity_penaly( hyp_len, ref_len ) * exp( sum );
-}
-
-
-/*
- * smooth_bleu
- * as in "An End-to-End Discriminative Approach to Machine Translation" (Liang et al. '06)
- * page TODO
- * max. 0.9375
- */
-double
-smooth_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len,
- const size_t N, vector<float> weights = vector<float>() )
-{
- if ( hyp_len == 0 || ref_len == 0 ) return 0;
- float N_ = (float)N;
- if ( weights.empty() )
- {
- for ( size_t i = 0; i < N; i++ ) weights.push_back( 1/N_ );
- }
- double sum = 0;
- float j = 1;
- for ( size_t i = 0; i < N; i++ ) {
- if ( counts.clipped[i] == 0 || counts.sum[i] == 0) continue;
- sum += exp((weights[i] * log((double)counts.clipped[i]/(double)counts.sum[i]))) / pow( 2, N_-j+1 );
- j++;
- }
- return brevity_penaly( hyp_len, ref_len ) * sum;
-}
-
-
-/*
- * approx_bleu
- * as in "Online Large-Margin Training for Statistical Machine Translation" (Watanabe et al. '07)
- * page TODO
- *
- */
-double
-approx_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len,
- const size_t N, vector<float> weights = vector<float>() )
-{
- return bleu( counts, hyp_len, ref_len, N, weights );
-}
-
-
-/*
- * register_and_convert
- *
- */
-void
-register_and_convert(const vector<string>& strs, vector<WordID>& ids)
-{
- vector<string>::const_iterator it;
- for ( it = strs.begin(); it < strs.end(); it++ ) {
- ids.push_back( TD::Convert( *it ) );
- }
-}
-
-
-/*
- *
- *
- */
-void
-test_ngrams()
-{
- cout << "Testing ngrams..." << endl << endl;
- size_t N = 5;
- cout << "N = " << N << endl;
- vector<int> a; // hyp
- vector<int> b; // ref
- cout << "a ";
- for (size_t i = 1; i <= 8; i++) {
- cout << i << " ";
- a.push_back(i);
- }
- cout << endl << "b ";
- for (size_t i = 1; i <= 4; i++) {
- cout << i << " ";
- b.push_back(i);
- }
- cout << endl << endl;
- NgramCounts c = make_ngram_counts( a, b, N );
- assert( c.clipped[N-1] == 0 );
- assert( c.sum[N-1] == 4 );
- c.print();
- c += c;
- cout << endl;
- c.print();
- cout << endl;
-}
-
-
-/*
- *
- *
- */
-double
-approx_equal( double x, double y )
-{
- const double EPSILON = 1E-5;
- if ( x == 0 ) return fabs( y ) <= EPSILON;
- if ( y == 0 ) return fabs( x ) <= EPSILON;
- return fabs( x - y ) / max( fabs(x), fabs(y) ) <= EPSILON;
-}
-
-
-/*
- *
- *
- */
-#include <boost/assign/std/vector.hpp>
-#include <iomanip>
-void
-test_metrics()
-{
- cout << "Testing metrics..." << endl << endl;
- using namespace boost::assign;
- vector<string> a, b;
- vector<double> expect_vanilla, expect_smooth, expect_stupid;
- a += "a a a a", "a a a a", "a", "a", "b", "a a a a", "a a", "a a a", "a b a"; // hyp
- b += "b b b b", "a a a a", "a", "b", "b b b b", "a", "a a", "a a a", "a b b"; // ref
- expect_vanilla += 0, 1, 1, 0, 0, .25, 1, 1, 0;
- expect_smooth += 0, .9375, .0625, 0, .00311169, .0441942, .1875, .4375, .161587;
- expect_stupid += 0, 1, 1, 0, .0497871, .25, 1, 1, .605707;
- vector<string> aa, bb;
- vector<WordID> aai, bbi;
- double vanilla, smooth, stupid;
- size_t N = 4;
- cout << "N = " << N << endl << endl;
- for ( size_t i = 0; i < a.size(); i++ ) {
- cout << " hyp: " << a[i] << endl;
- cout << " ref: " << b[i] << endl;
- aa.clear(); bb.clear(); aai.clear(); bbi.clear();
- boost::split( aa, a[i], boost::is_any_of(" ") );
- boost::split( bb, b[i], boost::is_any_of(" ") );
- register_and_convert( aa, aai );
- register_and_convert( bb, bbi );
- NgramCounts counts = make_ngram_counts( aai, bbi, N );
- vanilla = bleu( counts, aa.size(), bb.size(), N);
- smooth = smooth_bleu( counts, aa.size(), bb.size(), N);
- stupid = stupid_bleu( counts, aa.size(), bb.size(), N);
- assert( approx_equal(vanilla, expect_vanilla[i]) );
- assert( approx_equal(smooth, expect_smooth[i]) );
- assert( approx_equal(stupid, expect_stupid[i]) );
- cout << setw(14) << "bleu = " << vanilla << endl;
- cout << setw(14) << "smooth bleu = " << smooth << endl;
- cout << setw(14) << "stupid bleu = " << stupid << endl << endl;
- }
- cout << endl;
-}
-
-/*
- *
- *
- */
-void
-test_SetWeights()
-{
- cout << "Testing Weights::SetWeight..." << endl << endl;
- Weights weights;
- SparseVector<double> lambdas;
- weights.InitSparseVector( &lambdas );
- weights.SetWeight( &lambdas, "test", 0 );
- weights.SetWeight( &lambdas, "test1", 1 );
- WordID fid = FD::Convert( "test2" );
- weights.SetWeight( &lambdas, fid, 2 );
- string fn = "weights-test";
- cout << "FD::NumFeats() " << FD::NumFeats() << endl;
- assert( FD::NumFeats() == 4 );
- weights.WriteToFile( fn, true );
- cout << endl;
-}
-
-
-/*
- *
- *
- */
-void
-run_tests()
-{
- cout << endl;
- test_ngrams();
- cout << endl;
- test_metrics();
- cout << endl;
- test_SetWeights();
- exit(0);
-}
-
-
-void
-print_FD()
-{
- for ( size_t i = 0; i < FD::NumFeats(); i++ ) cout << FD::Convert(i)<< endl;
-}
-
-
-/*
* main
*
*/
int
main(int argc, char** argv)
{
- //SetSilent(true);
- boostpo::variables_map conf;
+ SetSilent(true);
+ po::variables_map conf;
if (!init(argc, argv, &conf)) return 1;
if ( conf.count("test") ) run_tests();
register_feature_functions();
@@ -509,7 +45,9 @@ main(int argc, char** argv)
ReadFile ini_rf(conf["decoder-config"].as<string>());
Decoder decoder(ini_rf.stream());
KBestGetter observer(k);
-
+ size_t N = 4; // TODO as parameter/in config
+
+ // TODO scoring metric as parameter/in config
// for approx. bleu
//NgramCounts global_counts;
//size_t global_hyp_len;
@@ -523,82 +61,67 @@ main(int argc, char** argv)
lambdas.set_value(FD::Convert("logp"), 0);
- vector<string> strs;
+ vector<string> strs, ref_strs;
+ vector<WordID> ref_ids;
string in, psg;
- size_t i = 0;
+ size_t sid = 0;
+ cerr << "(1 dot equals 100 lines of input)" << endl;
while( getline(cin, in) ) {
- if ( !SILENT ) cerr << endl << endl << "Getting kbest for sentence #" << i << endl;
- // why? why!?
+ //if ( !SILENT )
+ // cerr << endl << endl << "Getting kbest for sentence #" << sid << endl;
+ if ( (sid+1) % 100 == 0 ) {
+ cerr << ".";
+ if ( (sid+1)%1000 == 0 ) cerr << endl;
+ }
+ if ( sid > 5000 ) break;
+ // weights
dense_weights.clear();
weights.InitFromVector( lambdas );
weights.InitVector( &dense_weights );
decoder.SetWeights( dense_weights );
- //cout << "use_shell " << dense_weights[FD::Convert("use_shell")] << endl;
+ //if ( sid > 100 ) break;
+ // handling input..
strs.clear();
boost::split( strs, in, boost::is_any_of("\t") );
+ // grammar
psg = boost::replace_all_copy( strs[2], " __NEXT_RULE__ ", "\n" ); psg += "\n";
- //decoder.SetId(i);
decoder.SetSentenceGrammar( psg );
decoder.Decode( strs[0], &observer );
- KBestList* kb = observer.getkb();
+ KBestList* kb = observer.GetKBest();
+ // reference
+ ref_strs.clear(); ref_ids.clear();
+ boost::split( ref_strs, strs[1], boost::is_any_of(" ") );
+ register_and_convert( ref_strs, ref_ids );
+ // scoring kbest
+ double score = 0;
+ Scores scores;
for ( size_t i = 0; i < k; i++ ) {
- cout << i << " ";
- for (size_t j = 0; j < kb->sents[i].size(); ++j ) {
- cout << TD::Convert( kb->sents[i][j] ) << " ";
- }
- cout << kb->scores[i];
- cout << endl;
+ NgramCounts counts = make_ngram_counts( ref_ids, kb->sents[i], 4 );
+ score = smooth_bleu( counts,
+ ref_ids.size(),
+ kb->sents[i].size(), N );
+ ScorePair sp( kb->scores[i], score );
+ scores.push_back( sp );
+ //cout << "'" << TD::GetString( ref_ids ) << "' vs '" << TD::GetString( kb->sents[i] ) << "' SCORE=" << score << endl;
+ //cout << kb->feats[i] << endl;
}
- lambdas.set_value( FD::Convert("use_shell"), 1 );
- lambdas.set_value( FD::Convert("use_a"), 1 );
+ //cout << "###" << endl;
+ SofiaLearner learner;
+ learner.Init( sid, kb->feats, scores );
+ learner.Update(lambdas);
+ // initializing learner
+ // TODO
+ // updating weights
+ //lambdas.set_value( FD::Convert("use_shell"), 1 );
+ //lambdas.set_value( FD::Convert("use_a"), 1 );
//print_FD();
+ sid += 1; // TODO does cdec count this already?
}
-
+
weights.WriteToFile( "weights-final", true );
+
+ cerr << endl;
return 0;
}
- // next: FMap, ->sofia, ->FMap, -> Weights
- // learner gets all used features (binary! and dense (logprob is sum of logprobs!))
- // only for those feats with weight > 0 after learning
- // see decoder line 548
-
-
-/*
- * TODO
- * iterate over training set, for t=1..T
- * mapred impl
- * mapper: main
- * reducer: average weights, global NgramCounts for approx. bleu
- * 1st cut: hadoop streaming?
- * batch, non-batch in the mapper (what sofia gets, regenerated Kbest lists)
- * filter kbest yes/no
- * sofia: --eta_type explicit
- * psg preparation source\tref\tpsg
- * set reference for cdec?
- * LM
- * shared?
- * startup?
- * X reference(s) for *bleu!?
- * kbest nicer (do not iterate twice)!? -> shared_ptr
- * multipartite ranking
- * weights! global, per sentence from global, featuremap
- * const decl...
- * sketch: batch/iter options
- * weights.cc: why wv_?
- * --weights cmd line (for iterations): script to call again/hadoop streaming?
- * I do not need to remember features, cdec does
- * resocre hg?
- * do not use Decoder::Decode!?
- * what happens if feature not in FD? 0???
- */
-
-/*
- * PROBLEMS
- * cdec kbest vs 1best (no -k param)
- * FD, Weights::wv_ grow too large, see utils/weights.cc; decoder/hg.h; decoder/scfg_translator.cc; utils/fdict.cc!?
- * sparse vector instead of vector<double> for weights in Decoder?
- * PhraseModel_* features for psg!? (seem to be generated)
- */
-
diff --git a/dtrain/dtrain.ini b/dtrain/dtrain.ini
deleted file mode 100644
index e69de29b..00000000
--- a/dtrain/dtrain.ini
+++ /dev/null
diff --git a/dtrain/in b/dtrain/in
deleted file mode 100644
index 294d009b..00000000
--- a/dtrain/in
+++ /dev/null
@@ -1,2 +0,0 @@
-vorrichtung means [X] ||| vorrichtung ||| apparatus ||| LogP=-200 ||| 0-0 __NEXT_RULE__ [X] ||| vorrichtung ||| means ||| LogP=-101 ||| 0-0
-eintest test [X] ||| eintest ||| test ||| LogP=-200 ||| 0-0 __NEXT_RULE__ [X] ||| eintest ||| xxx ||| LogP=-101 ||| 0-0
diff --git a/dtrain/in.toy b/dtrain/in.toy
deleted file mode 100644
index 71b736a6..00000000
--- a/dtrain/in.toy
+++ /dev/null
@@ -1,2 +0,0 @@
-ich sah ein kleines haus i saw a little shell [S] ||| [NP,1] [VP,2] ||| [1] [2] ||| logp=0 __NEXT_RULE__ [NP] ||| ich ||| i ||| logp=0 __NEXT_RULE__ [NP] ||| ein [NN,1] ||| a [1] ||| logp=0 __NEXT_RULE__ [NN] ||| [JJ,1] haus ||| [1] house ||| logp=-0.5 __NEXT_RULE__ [NN] ||| [JJ,1] haus ||| [1] shell ||| logp=-0.5 use_shell=1 __NEXT_RULE__ [JJ] ||| kleines ||| small ||| logp=-0.5 __NEXT_RULE__ [JJ] ||| kleines ||| little ||| logp=-0.9 __NEXT_RULE__ [JJ] ||| grosses ||| big ||| logp=-0.5 __NEXT_RULE__ [JJ] ||| grosses ||| large ||| logp=-1.5 __NEXT_RULE__ [VP] ||| [V,1] [NP,2] ||| [1] [2] ||| logp=0 __NEXT_RULE__ [V] ||| sah ||| saw ||| logp=0 __NEXT_RULE__ [V] ||| fand ||| found ||| logp=0
-ich fand ein grosses haus i found a little shell [S] ||| [NP,1] [VP,2] ||| [1] [2] ||| logp=0 __NEXT_RULE__ [NP] ||| ich ||| i ||| logp=0 __NEXT_RULE__ [NP] ||| ein [NN,1] ||| a [1] ||| logp=0 use_a=1 __NEXT_RULE__ [NN] ||| [JJ,1] haus ||| [1] house ||| logp=-1000 __NEXT_RULE__ [NN] ||| [JJ,1] haus ||| [1] shell ||| logp=-1 use_shell=1 __NEXT_RULE__ [JJ] ||| kleines ||| small ||| logp=-0.5 __NEXT_RULE__ [JJ] ||| kleines ||| little ||| logp=-0.9 __NEXT_RULE__ [JJ] ||| grosses ||| big ||| logp=-0.5 __NEXT_RULE__ [JJ] ||| grosses ||| large ||| logp=-1.5 __NEXT_RULE__ [VP] ||| [V,1] [NP,2] ||| [1] [2] ||| logp=0 __NEXT_RULE__ [V] ||| sah ||| saw ||| logp=0 __NEXT_RULE__ [V] ||| fand ||| found ||| logp=0
diff --git a/dtrain/test.sh b/dtrain/test.sh
index a0ebb420..ad45bd1e 100755
--- a/dtrain/test.sh
+++ b/dtrain/test.sh
@@ -1,4 +1,4 @@
#!/bin/sh
-./dtrain -c cdec.ini -k 4 < in.toy
+./dtrain -c data/cdec.ini -k 4 < data/in.blunsom08 #< data/in.toy