summary | refs | log | tree | commit | diff
path: root/dtrain/dtrain.cc
diff options
context:
space:
mode:
Diffstat (limited to 'dtrain/dtrain.cc')
-rw-r--r-- dtrain/dtrain.cc 526
1 files changed, 526 insertions, 0 deletions
diff --git a/dtrain/dtrain.cc b/dtrain/dtrain.cc
new file mode 100644
index 00000000..25249c7f
--- /dev/null
+++ b/dtrain/dtrain.cc
@@ -0,0 +1,526 @@
+#include <sstream>
+#include <iostream>
+#include <vector>
+#include <cassert>
+#include <cmath>
+
+#include "config.h"
+
+#include <boost/shared_ptr.hpp>
+#include <boost/algorithm/string.hpp>
+#include <boost/program_options.hpp>
+#include <boost/program_options/variables_map.hpp>
+
+#include "sentence_metadata.h"
+#include "scorer.h"
+#include "verbose.h"
+#include "viterbi.h"
+#include "hg.h"
+#include "prob.h"
+#include "kbest.h"
+#include "ff_register.h"
+#include "decoder.h"
+#include "filelib.h"
+#include "fdict.h"
+#include "weights.h"
+#include "sparse_vector.h"
+#include "sampler.h"
+
+using namespace std;
+namespace boostpo = boost::program_options;
+
+
+/*
+ * init
+ *
+ * Parses the command line into *conf.
+ *
+ * Declared options: "decoder-config,c" (string), "kbest,k" (int),
+ * "ngrams,n" (int) and "filter,f" (string).
+ *
+ * Returns true when parsing succeeded and "decoder-config" was given.
+ * Otherwise the option summary (preceded by the parser's error message when
+ * parsing itself failed) is written to stderr and false is returned.
+ */
+bool
+init(int argc, char** argv, boostpo::variables_map* conf)
+{
+  boostpo::options_description opts( "Options" );
+  opts.add_options()
+    ( "decoder-config,c", boostpo::value<string>(), "configuration file for cdec" )
+    ( "kbest,k", boostpo::value<int>(), "k for kbest" )
+    ( "ngrams,n", boostpo::value<int>(), "n for Ngrams" )
+    ( "filter,f", boostpo::value<string>(), "filter kbest list" );
+  boostpo::options_description cmdline_options;
+  cmdline_options.add(opts);
+  // Unknown options and malformed values make the parser throw; report them
+  // the same way as a missing mandatory option instead of letting the
+  // exception escape and terminate the program.
+  try {
+    boostpo::store( boostpo::parse_command_line(argc, argv, cmdline_options), *conf );
+    boostpo::notify( *conf );
+  } catch ( const boostpo::error& e ) {
+    cerr << e.what() << endl;
+    cerr << cmdline_options << endl;
+    return false;
+  }
+  if ( ! conf->count("decoder-config") ) {
+    cerr << cmdline_options << endl;
+    return false;
+  }
+  return true;
+}
+
+
+/*
+ * KBestList
+ *
+ * Parallel arrays describing a k-best list: entry i of each vector belongs
+ * to the same derivation. Filled by KBestGetter::GetKBest.
+ */
+struct KBestList {
+ vector<SparseVector<double> > feats; // feature_values of each derivation
+ vector<vector<WordID> > sents; // yield (word ids) of each derivation
+ vector<double> scores; // score of each derivation
+};
+/*
+ * KBestGetter
+ *
+ * Decoder observer: every time it is notified of a translation forest it
+ * replaces the contents of its KBestList with up to k derivations taken
+ * from that forest.
+ */
+struct KBestGetter : public DecoderObserver
+{
+  // Shorthand for the derivation extractor used by GetKBest.
+  typedef KBest::KBestDerivations<vector<WordID>, ESentenceTraversal> Derivations;
+
+  KBestGetter( const size_t k ) : k_(k) {}
+  size_t k_;     // maximum number of derivations to extract
+  KBestList kb;  // result of the most recent GetKBest call
+
+  // Observer callback; hg is dereferenced without a null check.
+  virtual void
+  NotifyTranslationForest(const SentenceMetadata& smeta, Hypergraph* hg)
+  {
+    GetKBest(smeta.GetSentenceID(), *hg);
+  }
+
+  // Returns a pointer to the internally owned list (never null).
+  KBestList* getkb() { return &kb; }
+
+  /*
+   * Clears kb and refills it with at most k_ derivations of forest, in the
+   * order LazyKthBest delivers them. sent_id is currently unused.
+   * An empty forest leaves kb empty.
+   */
+  void
+  GetKBest(int sent_id, const Hypergraph& forest)
+  {
+    kb.scores.clear();
+    kb.sents.clear();
+    kb.feats.clear();
+    // Without nodes, size() - 1 would wrap around to a huge index.
+    if ( forest.nodes_.size() == 0 ) return;
+    const size_t last_node = forest.nodes_.size() - 1;
+    Derivations kbest( forest, k_ );
+    for ( size_t i = 0; i < k_; ++i ) {
+      const Derivations::Derivation* d = kbest.LazyKthBest( last_node, i );
+      if (!d) break;  // fewer than k_ derivations available
+      kb.sents.push_back( d->yield );
+      kb.feats.push_back( d->feature_values );
+      kb.scores.push_back( d->score );
+    }
+  }
+};
+
+
+/*
+ * sofia_write_training_data
+ *
+ * Placeholder: the body is empty, so calling it has no effect yet.
+ * NOTE(review): judging by the name it is meant to write training data for
+ * sofia -- confirm once implemented.
+ */
+void
+sofia_write_training_data()
+{
+ // TODO
+}
+
+
+/*
+ * sofia_call
+ *
+ * Placeholder: the body is empty, so calling it has no effect yet.
+ * NOTE(review): judging by the name it is meant to invoke sofia -- confirm
+ * once implemented.
+ */
+void
+sofia_call()
+{
+ // TODO
+}
+
+
+/*
+ * sofia_read_model
+ *
+ * Placeholder: the body is empty, so calling it has no effect yet.
+ * NOTE(review): the previous header called this "sofia_model2weights", so it
+ * is presumably meant to turn a sofia model into weights -- confirm once
+ * implemented.
+ */
+void
+sofia_read_model()
+{
+ // TODO
+}
+
+
+/*
+ * make_ngrams
+ *
+ * Counts every contiguous n-gram of s whose length n satisfies 1 <= n <= N.
+ *
+ * s: sequence of word ids (not modified)
+ * N: maximum n-gram length
+ *
+ * Returns a map from n-gram to its number of occurrences in s. An empty s
+ * or N == 0 yields an empty map.
+ */
+typedef map<vector<WordID>, size_t> Ngrams;
+Ngrams
+make_ngrams( const vector<WordID>& s, size_t N )
+{
+  Ngrams ngrams;
+  vector<WordID> ng;
+  for ( size_t i = 0; i < s.size(); ++i ) {
+    ng.clear();
+    // Longest n-gram that can start at i; computed from the remaining
+    // length so that a very large N cannot overflow i+N.
+    const size_t remaining = s.size() - i;
+    const size_t longest = ( N < remaining ) ? N : remaining;
+    for ( size_t len = 0; len < longest; ++len ) {
+      // ng grows by one word per step, so each prefix is counted once.
+      ng.push_back( s[i + len] );
+      ++ngrams[ng];
+    }
+  }
+  return ngrams;
+}
+
+
+/*
+ * NgramCounts
+ *
+ * Per-order n-gram statistics. Index i (0 <= i < N) holds the numbers for
+ * n-grams of length i+1:
+ *   clipped[i]  counts capped at the reference count passed to add()
+ *   sum[i]      uncapped counts
+ */
+struct NgramCounts
+{
+  // Creates zeroed counters for the indices 0..N-1.
+  NgramCounts( const size_t N ) : N_( N ) {
+    reset();
+  }
+  size_t N_;
+  map<size_t, size_t> clipped;
+  map<size_t, size_t> sum;
+
+  /*
+   * Adds the counters of rhs to this object, index by index, and returns
+   * *this so that calls can be chained. Both objects must have the same N_
+   * (checked with assert). Adding an object to itself doubles it.
+   */
+  NgramCounts&
+  operator+=( const NgramCounts& rhs )
+  {
+    assert( N_ == rhs.N_ );
+    for ( size_t i = 0; i < N_; i++ ) {
+      // The maps are public, so an entry may be missing in rhs; treat a
+      // missing entry as zero instead of dereferencing end().
+      map<size_t, size_t>::const_iterator c = rhs.clipped.find( i );
+      if ( c != rhs.clipped.end() ) clipped[i] += c->second;
+      map<size_t, size_t>::const_iterator s = rhs.sum.find( i );
+      if ( s != rhs.sum.end() ) sum[i] += s->second;
+    }
+    return *this;
+  }
+
+  /*
+   * Records one n-gram type at index i (must be < N_): sum grows by count,
+   * clipped grows by the smaller of count and ref_count.
+   */
+  void
+  add( size_t count, size_t ref_count, size_t i )
+  {
+    assert( i < N_ );
+    clipped[i] += ( count > ref_count ) ? ref_count : count;
+    sum[i] += count;
+  }
+
+  // Sets all counters for the indices 0..N_-1 to zero.
+  void
+  reset()
+  {
+    for ( size_t i = 0; i < N_; i++ ) {
+      clipped[i] = 0;
+      sum[i] = 0;
+    }
+  }
+
+  // Writes both counters of every order to stdout.
+  void
+  print()
+  {
+    for ( size_t i = 0; i < N_; i++ ) {
+      cout << i+1 << "grams (clipped):\t" << clipped[i] << endl;
+      cout << i+1 << "grams:\t\t\t" << sum[i] << endl;
+    }
+  }
+};
+
+
+/*
+ * make_ngram_counts
+ *
+ * Builds the n-gram statistics of hyp against ref for n-gram lengths up to
+ * N: every distinct n-gram of hyp is added to the result together with its
+ * count in ref (0 when ref does not contain it).
+ */
+NgramCounts
+make_ngram_counts( vector<WordID> hyp, vector<WordID> ref, size_t N )
+{
+  const Ngrams in_hyp = make_ngrams( hyp, N );
+  const Ngrams in_ref = make_ngrams( ref, N );
+  NgramCounts result( N );
+  for ( Ngrams::const_iterator h = in_hyp.begin(); h != in_hyp.end(); ++h ) {
+    const Ngrams::const_iterator r = in_ref.find( h->first );
+    const size_t ref_count = ( r == in_ref.end() ) ? 0 : r->second;
+    // An n-gram of length L is stored at index L-1.
+    result.add( h->second, ref_count, h->first.size() - 1 );
+  }
+  return result;
+}
+
+
+/*
+ * brevity_penaly
+ *
+ * Brevity penalty: 1 when the hypothesis is longer than the reference,
+ * exp(1 - ref_len/hyp_len) otherwise. An empty hypothesis gets 0, which
+ * also avoids the NaN the formula would produce for 0/0.
+ */
+double
+brevity_penaly( const size_t hyp_len, const size_t ref_len )
+{
+  if ( hyp_len == 0 ) return 0;
+  if ( hyp_len > ref_len ) return 1;
+  return exp( 1 - static_cast<double>( ref_len ) / static_cast<double>( hyp_len ) );
+}
+
+
+/*
+ * bleu
+ * as in "BLEU: a Method for Automatic Evaluation of Machine Translation" (Papineni et al. '02)
+ *
+ * Brevity penalty times the exponential of the weighted sum of log
+ * precisions (clipped/sum) over the first N orders.
+ * N is lowered to ref_len when the reference is shorter than N.
+ * An empty weights vector means uniform weights 1/N.
+ * Returns 0 when either length is 0 or when any used order has a zero count.
+ */
+double
+bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len,
+      size_t N, vector<float> weights = vector<float>() )
+{
+  if ( hyp_len == 0 || ref_len == 0 ) return 0;
+  if ( N > ref_len ) N = ref_len;
+  if ( weights.empty() ) {
+    const float order = static_cast<float>( N );
+    for ( size_t n = 0; n < N; ++n ) weights.push_back( 1/order );
+  }
+  double log_score = 0;
+  for ( size_t n = 0; n < N; ++n ) {
+    if ( counts.clipped[n] == 0 || counts.sum[n] == 0 ) return 0;
+    const double precision = static_cast<double>( counts.clipped[n] )
+                             / static_cast<double>( counts.sum[n] );
+    log_score += weights[n] * log( precision );
+  }
+  return brevity_penaly( hyp_len, ref_len ) * exp( log_score );
+}
+
+
+/*
+ * stupid_bleu
+ * as in "ORANGE: a Method for Evaluating Automatic Evaluation Metrics for Machine Translation" (Lin & Och '04)
+ *
+ * Like bleu, but for every order above the first, 1 is added to both the
+ * clipped count and the total count before taking the ratio.
+ * N is lowered to ref_len when the reference is shorter than N.
+ * An empty weights vector means uniform weights 1/N.
+ * 0 iff no 1gram match
+ */
+double
+stupid_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len,
+             size_t N, vector<float> weights = vector<float>() )
+{
+  if ( hyp_len == 0 || ref_len == 0 ) return 0;
+  if ( N > ref_len ) N = ref_len;
+  if ( weights.empty() ) {
+    const float order = static_cast<float>( N );
+    for ( size_t n = 0; n < N; ++n ) weights.push_back( 1/order );
+  }
+  double log_score = 0;
+  for ( size_t n = 0; n < N; ++n ) {
+    // Unigrams are left unsmoothed; all higher orders get +1.
+    const float add = ( n == 0 ) ? 0 : 1;
+    const double matched = static_cast<double>( counts.clipped[n] ) + add;
+    const double total = static_cast<double>( counts.sum[n] ) + add;
+    log_score += weights[n] * log( matched / total );
+  }
+  return brevity_penaly( hyp_len, ref_len ) * exp( log_score );
+}
+
+
+/*
+ * smooth_bleu
+ * as in "An End-to-End Discriminative Approach to Machine Translation" (Liang et al. '06)
+ *
+ * Brevity penalty times a sum over the orders with non-zero counts; the
+ * k-th contributing order (k = 1, 2, ...) adds
+ *   exp( weight * log( clipped/sum ) ) / 2^(N-k+1).
+ * Orders with a zero count are skipped and do not advance k.
+ * Unlike bleu, N is not lowered to the reference length.
+ * An empty weights vector means uniform weights 1/N.
+ * max. 0.9375
+ */
+double
+smooth_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len,
+             const size_t N, vector<float> weights = vector<float>() )
+{
+  if ( hyp_len == 0 || ref_len == 0 ) return 0;
+  const float N_ = static_cast<float>( N );
+  if ( weights.empty() ) {
+    for ( size_t n = 0; n < N; ++n ) weights.push_back( 1/N_ );
+  }
+  double total = 0;
+  float k = 1;  // 1-based rank among the orders that contribute
+  for ( size_t n = 0; n < N; ++n ) {
+    if ( counts.clipped[n] == 0 || counts.sum[n] == 0 ) continue;
+    const double precision = static_cast<double>( counts.clipped[n] )
+                             / static_cast<double>( counts.sum[n] );
+    total += exp( weights[n] * log( precision ) ) / pow( 2, N_-k+1 );
+    k++;
+  }
+  return brevity_penaly( hyp_len, ref_len ) * total;
+}
+
+
+/*
+ * approx_bleu
+ * as in "Online Large-Margin Training for Statistical Machine Translation" (Watanabe et al. '07)
+ *
+ * Currently just forwards all arguments to bleu() and returns its result.
+ */
+double
+approx_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len,
+             const size_t N, vector<float> weights = vector<float>() )
+{
+  const double score = bleu( counts, hyp_len, ref_len, N, weights );
+  return score;
+}
+
+
+/*
+ * register_and_convert
+ *
+ * Appends TD::Convert(s) to ids for every string s of strs, in order.
+ * ids is not cleared first.
+ */
+void
+register_and_convert(const vector<string>& strs, vector<WordID>& ids)
+{
+  for ( size_t i = 0; i < strs.size(); ++i ) {
+    ids.push_back( TD::Convert( strs[i] ) );
+  }
+}
+
+
+/*
+ * test_ngrams
+ *
+ * Smoke test for make_ngram_counts and NgramCounts: hypothesis 1..8 against
+ * reference 1..4 with N = 5. Asserts the 5-gram counters, prints the counts,
+ * adds the counts to themselves and prints them again.
+ */
+void
+test_ngrams()
+{
+  const size_t N = 5;
+  vector<int> hyp;
+  vector<int> ref;
+  cout << "Testing ngrams..." << endl << endl;
+  cout << "a ";
+  for ( size_t w = 1; w <= 8; ++w ) {
+    cout << w << " ";
+    hyp.push_back( w );
+  }
+  cout << endl << "b ";
+  for ( size_t w = 1; w <= 4; ++w ) {
+    cout << w << " ";
+    ref.push_back( w );
+  }
+  cout << endl << endl;
+  NgramCounts counts = make_ngram_counts( hyp, ref, N );
+  // The reference has only 4 words, so no 5-gram can match; the hypothesis
+  // has 8 words and therefore four 5-grams.
+  assert( counts.clipped[N-1] == 0 );
+  assert( counts.sum[N-1] == 4 );
+  counts.print();
+  counts += counts;
+  cout << endl;
+  counts.print();
+}
+
+/*
+ * approx_equal
+ *
+ * Approximate comparison with tolerance 1E-5. When one argument is exactly
+ * 0 the other is compared by absolute value; otherwise the difference is
+ * taken relative to the larger magnitude.
+ * The result is a truth value returned as double (1 or 0).
+ */
+double
+approx_equal( double x, double y )
+{
+  const double tolerance = 1E-5;
+  const double abs_x = fabs( x );
+  const double abs_y = fabs( y );
+  if ( x == 0 ) return abs_y <= tolerance;
+  if ( y == 0 ) return abs_x <= tolerance;
+  const double larger = max( abs_x, abs_y );
+  return fabs( x - y ) / larger <= tolerance;
+}
+
+
+#include <boost/assign/std/vector.hpp>
+#include <iomanip>
+/*
+ * test_metrics
+ *
+ * Runs bleu, smooth_bleu and stupid_bleu with N = 4 on a fixed list of
+ * hypothesis/reference pairs, asserts that each score is approximately
+ * equal to the expected value and prints the scores.
+ */
+void
+test_metrics()
+{
+  using namespace boost::assign;
+  cout << "Testing metrics..." << endl << endl;
+  const size_t N = 4;
+  vector<string> hyps, refs;
+  hyps += "a a a a", "a a a a", "a", "a", "b", "a a a a", "a a", "a a a", "a b a";
+  refs += "b b b b", "a a a a", "a", "b", "b b b b", "a", "a a", "a a a", "a b b";
+  vector<double> want_vanilla, want_smooth, want_stupid;
+  want_vanilla += 0, 1, 1, 0, 0, .25, 1, 1, 0;
+  want_smooth += 0, .9375, .0625, 0, .00311169, .0441942, .1875, .4375, .161587;
+  want_stupid += 0, 1, 1, 0, .0497871, .25, 1, 1, .605707;
+  cout << "N = " << N << endl << endl;
+  for ( size_t t = 0; t < hyps.size(); ++t ) {
+    cout << " hyp: " << hyps[t] << endl;
+    cout << " ref: " << refs[t] << endl;
+    // Tokenise on single spaces and map the tokens to word ids.
+    vector<string> hyp_toks, ref_toks;
+    boost::split( hyp_toks, hyps[t], boost::is_any_of(" ") );
+    boost::split( ref_toks, refs[t], boost::is_any_of(" ") );
+    vector<WordID> hyp_ids, ref_ids;
+    register_and_convert( hyp_toks, hyp_ids );
+    register_and_convert( ref_toks, ref_ids );
+    NgramCounts counts = make_ngram_counts( hyp_ids, ref_ids, N );
+    const double vanilla = bleu( counts, hyp_toks.size(), ref_toks.size(), N );
+    const double smooth = smooth_bleu( counts, hyp_toks.size(), ref_toks.size(), N );
+    const double stupid = stupid_bleu( counts, hyp_toks.size(), ref_toks.size(), N );
+    assert( approx_equal(vanilla, want_vanilla[t]) );
+    assert( approx_equal(smooth, want_smooth[t]) );
+    assert( approx_equal(stupid, want_stupid[t]) );
+    cout << setw(14) << "bleu = " << vanilla << endl;
+    cout << setw(14) << "smooth bleu = " << smooth << endl;
+    cout << setw(14) << "stupid bleu = " << stupid << endl << endl;
+  }
+}
+
+
+/*
+ * main
+ *
+ * Currently only runs test_metrics(). argc and argv are unused because the
+ * code that reads the command line is commented out below. There is no
+ * explicit return statement; control simply reaches the end of main.
+ */
+int
+main(int argc, char** argv)
+{
+ // Disabled scratch code: printing n-grams of a dummy sentence.
+ /*vector<string> v;
+ for (int i = 0; i <= 10; i++) {
+ v.push_back("asdf");
+ }
+ vector<vector<string> > ng = ngrams(v, 5);
+ for (int i = 0; i < ng.size(); i++) {
+ for (int j = 0; j < ng[i].size(); j++) {
+ cout << " " << ng[i][j];
+ }
+ cout << endl;
+ }*/
+
+ // The only active statement of the program.
+ test_metrics();
+
+
+ // Disabled scratch code: accumulating n-gram counts.
+ //NgramCounts counts2 = make_ngram_counts( ref_ids, ref_ids, 4);
+ //counts += counts2;
+ //cout << counts.cNipped[1] << endl;
+
+ //size_t c, r; // c length of candidates, r of references
+ //c += cand.size();
+ //r += ref.size();
+ /*NgramMatches ngm; // for approx bleu
+ ngm.sum = 1;
+ ngm.clipped = 1;
+
+ NgramMatches x;
+ x.clipped = 1;
+ x.sum = 1;
+
+ x += ngm;
+ x += x;
+ x+= ngm;
+
+ cout << x.clipped << " " << x.sum << endl;*/
+
+
+ // Disabled driver: reads tab-separated lines from stdin, decodes field 0
+ // with the per-sentence grammar taken from field 2 and prints the words of
+ // the k-best translations.
+ /*register_feature_functions();
+ SetSilent(true);
+
+ boost::program_options::variables_map conf;
+ if (!init(argc, argv, &conf)) return 1;
+ ReadFile ini_rf(conf["decoder-config"].as<string>());
+ Decoder decoder(ini_rf.stream());
+ Weights weights;
+ SparseVector<double> lambdas;
+ weights.InitSparseVector(&lambdas);
+
+ int k = conf["kbest"].as<int>();
+
+ KBestGetter observer(k);
+ string in, psg;
+ vector<string> strs;
+ int i = 0;
+ while(getline(cin, in)) {
+ if (!SILENT) cerr << "getting kbest for sentence #" << i << endl;
+ strs.clear();
+ boost::split(strs, in, boost::is_any_of("\t"));
+ psg = boost::replace_all_copy(strs[2], " __NEXT_RULE__ ", "\n"); psg += "\n";
+ decoder.SetSentenceGrammar( psg );
+ decoder.Decode( strs[0], &observer );
+ KBestList* kb = observer.getkb();
+ // FIXME not pretty iterating twice over k
+ for (int i = 0; i < k; i++) {
+ for (int j = 0; j < kb->sents[i].size(); ++j) {
+ cout << TD::Convert(kb->sents[i][j]) << endl;
+ }
+ }
+ }
+
+ return 0;*/
+}
+
+
+/*
+ * TODO
+ * for t =1..T
+ * mapper, reducer (average, handle ngram statistics for approx bleu)
+ * 1st streaming
+ * batch, non-batch in the mapper (what sofia gets)
+ * filter yes/no
+ * sofia: --eta_type explicit
+ * psg preparation
+ * set ref?
+ * shared LM?
+ * X reference(s) for *bleu!?
+ * kbest nicer!? shared_ptr
+ * multipartite
+ * weights! global, per sentence from global, featuremap
+ * todo const
+ */