Diffstat (limited to 'dtrain/dtrain.cc')
-rw-r--r-- | dtrain/dtrain.cc | 526
1 file changed, 526 insertions, 0 deletions
diff --git a/dtrain/dtrain.cc b/dtrain/dtrain.cc
new file mode 100644
index 00000000..25249c7f
--- /dev/null
+++ b/dtrain/dtrain.cc
@@ -0,0 +1,526 @@
+#include <sstream>
+#include <iostream>
+#include <vector>
+#include <map>
+#include <algorithm>
+#include <cassert>
+#include <cmath>
+
+#include "config.h"
+
+#include <boost/shared_ptr.hpp>
+#include <boost/algorithm/string.hpp>
+#include <boost/program_options.hpp>
+#include <boost/program_options/variables_map.hpp>
+
+#include "sentence_metadata.h"
+#include "scorer.h"
+#include "verbose.h"
+#include "viterbi.h"
+#include "hg.h"
+#include "prob.h"
+#include "kbest.h"
+#include "ff_register.h"
+#include "decoder.h"
+#include "filelib.h"
+#include "fdict.h"
+#include "weights.h"
+#include "sparse_vector.h"
+#include "sampler.h"
+
+using namespace std;
+namespace boostpo = boost::program_options;
+
+
+/*
+ * init
+ *
+ */
+bool
+init(int argc, char** argv, boostpo::variables_map* conf)
+{
+  boostpo::options_description opts( "Options" );
+  opts.add_options()
+    ( "decoder-config,c", boostpo::value<string>(), "configuration file for cdec" )
+    ( "kbest,k",          boostpo::value<int>(),    "k for kbest" )
+    ( "ngrams,n",         boostpo::value<int>(),    "n for Ngrams" )
+    ( "filter,f",         boostpo::value<string>(), "filter kbest list" );
+  boostpo::options_description cmdline_options;
+  cmdline_options.add(opts);
+  boostpo::store( parse_command_line(argc, argv, cmdline_options), *conf );
+  boostpo::notify( *conf );
+  if ( ! conf->count("decoder-config") ) {
+    cerr << cmdline_options << endl;
+    return false;
+  }
+  return true;
+}
+
+
+/*
+ * KBestGetter
+ *
+ */
+struct KBestList {
+  vector<SparseVector<double> > feats;
+  vector<vector<WordID> > sents;
+  vector<double> scores;
+};
+struct KBestGetter : public DecoderObserver
+{
+  KBestGetter( const size_t k ) : k_(k) {}
+  size_t k_;
+  KBestList kb;
+
+  virtual void
+  NotifyTranslationForest(const SentenceMetadata& smeta, Hypergraph* hg)
+  {
+    GetKBest(smeta.GetSentenceID(), *hg);
+  }
+
+  KBestList* getkb() { return &kb; }
+
+  void
+  GetKBest(int sent_id, const Hypergraph& forest)
+  {
+    kb.scores.clear();
+    kb.sents.clear();
+    kb.feats.clear();
+    KBest::KBestDerivations<vector<WordID>, ESentenceTraversal> kbest( forest, k_ );
+    for ( size_t i = 0; i < k_; ++i ) {
+      const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal>::Derivation* d =
+        kbest.LazyKthBest( forest.nodes_.size() - 1, i );
+      if (!d) break;
+      kb.sents.push_back( d->yield );
+      kb.feats.push_back( d->feature_values );
+      kb.scores.push_back( d->score );
+    }
+  }
+};
+
+
+/*
+ * sofia_write_training_data
+ *
+ */
+void
+sofia_write_training_data()
+{
+  // TODO
+}
+
+
+/*
+ * sofia_call
+ *
+ */
+void
+sofia_call()
+{
+  // TODO
+}
+
+
+/*
+ * sofia_read_model
+ *
+ */
+void
+sofia_read_model()
+{
+  // TODO
+}
+
+
+/*
+ * make_ngrams
+ *
+ */
+typedef map<vector<WordID>, size_t> Ngrams;
+Ngrams
+make_ngrams( vector<WordID>& s, size_t N )
+{
+  Ngrams ngrams;
+  vector<WordID> ng;
+  for ( size_t i = 0; i < s.size(); i++ ) {
+    ng.clear();
+    for ( size_t j = i; j < min( i+N, s.size() ); j++ ) {
+      ng.push_back( s[j] );
+      ngrams[ng]++;
+    }
+  }
+  return ngrams;
+}
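For illustration, a minimal usage sketch of make_ngrams (an editorial example, not part of this commit; the integer token ids stand in for interned words):

    // All ngrams up to length N = 2 over the token sequence [7, 8, 7]:
    //   [7] -> 2,  [8] -> 1,  [7,8] -> 1,  [8,7] -> 1
    vector<WordID> s;
    s.push_back(7); s.push_back(8); s.push_back(7);
    Ngrams ng = make_ngrams( s, 2 );
    assert( ng.size() == 4 );                // four distinct ngram keys
    assert( ng[vector<WordID>(1, 7)] == 2 ); // the unigram [7] occurs twice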
+
+
+/*
+ * NgramCounts
+ *
+ */
+struct NgramCounts
+{
+  NgramCounts( const size_t N ) : N_( N ) {
+    reset();
+  }
+  size_t N_;
+  map<size_t, size_t> clipped;
+  map<size_t, size_t> sum;
+
+  NgramCounts&
+  operator+=( const NgramCounts& rhs )
+  {
+    assert( N_ == rhs.N_ );
+    for ( size_t i = 0; i < N_; i++ ) {
+      this->clipped[i] += rhs.clipped.find(i)->second;
+      this->sum[i] += rhs.sum.find(i)->second;
+    }
+    return *this;
+  }
+
+  void
+  add( size_t count, size_t ref_count, size_t i )
+  {
+    assert( i < N_ );
+    if ( count > ref_count ) {
+      clipped[i] += ref_count;
+      sum[i] += count;
+    } else {
+      clipped[i] += count;
+      sum[i] += count;
+    }
+  }
+
+  void
+  reset()
+  {
+    size_t i;
+    for ( i = 0; i < N_; i++ ) {
+      clipped[i] = 0;
+      sum[i] = 0;
+    }
+  }
+
+  void
+  print()
+  {
+    for ( size_t i = 0; i < N_; i++ ) {
+      cout << i+1 << "grams (clipped):\t" << clipped[i] << endl;
+      cout << i+1 << "grams:\t\t\t" << sum[i] << endl;
+    }
+  }
+};
+
+
+/*
+ * make_ngram_counts
+ *
+ */
+NgramCounts
+make_ngram_counts( vector<WordID> hyp, vector<WordID> ref, size_t N )
+{
+  Ngrams hyp_ngrams = make_ngrams( hyp, N );
+  Ngrams ref_ngrams = make_ngrams( ref, N );
+  NgramCounts counts( N );
+  Ngrams::iterator it;
+  Ngrams::iterator ti;
+  for ( it = hyp_ngrams.begin(); it != hyp_ngrams.end(); it++ ) {
+    ti = ref_ngrams.find( it->first );
+    if ( ti != ref_ngrams.end() ) {
+      counts.add( it->second, ti->second, it->first.size() - 1 );
+    } else {
+      counts.add( it->second, 0, it->first.size() - 1 );
+    }
+  }
+  return counts;
+}
+
+
+/*
+ * brevity_penalty
+ *
+ */
+double
+brevity_penalty( const size_t hyp_len, const size_t ref_len )
+{
+  if ( hyp_len > ref_len ) return 1;
+  return exp( 1 - (double)ref_len/(double)hyp_len );
+}
+
+
+/*
+ * bleu
+ * as in "BLEU: a Method for Automatic Evaluation of Machine Translation" (Papineni et al. '02)
+ * 0 if one of the clipped or total counts is 0 for any n <= N
+ */
+double
+bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len,
+      size_t N, vector<float> weights = vector<float>() )
+{
+  if ( hyp_len == 0 || ref_len == 0 ) return 0;
+  if ( ref_len < N ) N = ref_len;
+  float N_ = (float)N;
+  if ( weights.empty() )
+  {
+    for ( size_t i = 0; i < N; i++ ) weights.push_back( 1/N_ );
+  }
+  double sum = 0;
+  for ( size_t i = 0; i < N; i++ ) {
+    if ( counts.clipped[i] == 0 || counts.sum[i] == 0 ) return 0;
+    sum += weights[i] * log( (double)counts.clipped[i] / (double)counts.sum[i] );
+  }
+  return brevity_penalty( hyp_len, ref_len ) * exp( sum );
+}
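A worked example of the vanilla score (editorial, not part of this commit): with token ids 1 = "a" and 2 = "b", the pair below has clipped unigram precision 2/3, bigram precision 1/2, and brevity penalty 1, so BLEU = exp( 1/2 log(2/3) + 1/2 log(1/2) ) = sqrt(1/3) ≈ 0.577:

    vector<WordID> hyp, ref;
    hyp.push_back(1); hyp.push_back(2); hyp.push_back(1); // "a b a"
    ref.push_back(1); ref.push_back(2); ref.push_back(2); // "a b b"
    NgramCounts c = make_ngram_counts( hyp, ref, 2 );
    double score = bleu( c, hyp.size(), ref.size(), 2 );  // ~0.57735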
+
+
+/*
+ * stupid_bleu
+ * as in "ORANGE: a Method for Evaluating Automatic Evaluation Metrics for Machine Translation" (Lin & Och '04)
+ * 0 iff no 1gram match
+ */
+double
+stupid_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len,
+             size_t N, vector<float> weights = vector<float>() )
+{
+  if ( hyp_len == 0 || ref_len == 0 ) return 0;
+  if ( ref_len < N ) N = ref_len;
+  float N_ = (float)N;
+  if ( weights.empty() )
+  {
+    for ( size_t i = 0; i < N; i++ ) weights.push_back( 1/N_ );
+  }
+  double sum = 0;
+  float add = 0;
+  for ( size_t i = 0; i < N; i++ ) {
+    if ( i == 1 ) add = 1;
+    sum += weights[i] * log( ((double)counts.clipped[i] + add) / ((double)counts.sum[i] + add) );
+  }
+  return brevity_penalty( hyp_len, ref_len ) * exp( sum );
+}
+
+
+/*
+ * smooth_bleu
+ * as in "An End-to-End Discriminative Approach to Machine Translation" (Liang et al. '06)
+ * max. is 0.9375 (= 1/2 + 1/4 + 1/8 + 1/16, i.e. N = 4 with all precisions equal to 1)
+ */
+double
+smooth_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len,
+             const size_t N, vector<float> weights = vector<float>() )
+{
+  if ( hyp_len == 0 || ref_len == 0 ) return 0;
+  float N_ = (float)N;
+  if ( weights.empty() )
+  {
+    for ( size_t i = 0; i < N; i++ ) weights.push_back( 1/N_ );
+  }
+  double sum = 0;
+  float j = 1;
+  for ( size_t i = 0; i < N; i++ ) {
+    if ( counts.clipped[i] == 0 || counts.sum[i] == 0 ) continue;
+    sum += exp((weights[i] * log((double)counts.clipped[i]/(double)counts.sum[i]))) / pow( 2, N_-j+1 );
+    j++;
+  }
+  return brevity_penalty( hyp_len, ref_len ) * sum;
+}
+
+
+/*
+ * approx_bleu
+ * as in "Online Large-Margin Training for Statistical Machine Translation" (Watanabe et al. '07)
+ *
+ */
+double
+approx_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len,
+             const size_t N, vector<float> weights = vector<float>() )
+{
+  return bleu( counts, hyp_len, ref_len, N, weights );
+}
+
+
+/*
+ * register_and_convert
+ *
+ */
+void
+register_and_convert(const vector<string>& strs, vector<WordID>& ids)
+{
+  vector<string>::const_iterator it;
+  for ( it = strs.begin(); it != strs.end(); it++ ) {
+    ids.push_back( TD::Convert( *it ) );
+  }
+}
+
+
+void
+test_ngrams()
+{
+  cout << "Testing ngrams..." << endl << endl;
+  size_t N = 5;
+  vector<int> a; // hyp
+  vector<int> b; // ref
+  cout << "a ";
+  for (size_t i = 1; i <= 8; i++) {
+    cout << i << " ";
+    a.push_back(i);
+  }
+  cout << endl << "b ";
+  for (size_t i = 1; i <= 4; i++) {
+    cout << i << " ";
+    b.push_back(i);
+  }
+  cout << endl << endl;
+  NgramCounts c = make_ngram_counts( a, b, N );
+  assert( c.clipped[N-1] == 0 );
+  assert( c.sum[N-1] == 4 );
+  c.print();
+  c += c;
+  cout << endl;
+  c.print();
+}
+
+bool
+approx_equal( double x, double y )
+{
+  const double EPSILON = 1E-5;
+  if ( x == 0 ) return fabs(y) <= EPSILON;
+  if ( y == 0 ) return fabs(x) <= EPSILON;
+  return fabs( x - y ) / max( fabs(x), fabs(y) ) <= EPSILON;
+}
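A sketch of why the smoothed variants matter (editorial, not part of this commit): once any ngram order has zero matches, bleu() collapses to 0, while stupid_bleu() and smooth_bleu() still separate partially correct hypotheses:

    vector<WordID> hyp, ref;
    hyp.push_back(1); hyp.push_back(2); // "a b"
    ref.push_back(1); ref.push_back(3); // "a c"
    NgramCounts c = make_ngram_counts( hyp, ref, 2 );
    assert( bleu( c, 2, 2, 2 ) == 0 );                                // no 2gram match
    assert( approx_equal( stupid_bleu( c, 2, 2, 2 ), 0.5 ) );         // add-1 on the 2gram term
    assert( approx_equal( smooth_bleu( c, 2, 2, 2 ), sqrt(0.5)/4 ) ); // only the 1gram term survives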
+
+
+#include <boost/assign/std/vector.hpp>
+#include <iomanip>
+void
+test_metrics()
+{
+  cout << "Testing metrics..." << endl << endl;
+  using namespace boost::assign;
+  vector<string> a, b;
+  vector<double> expect_vanilla, expect_smooth, expect_stupid;
+  a += "a a a a", "a a a a", "a", "a", "b", "a a a a", "a a", "a a a", "a b a"; // hyp
+  b += "b b b b", "a a a a", "a", "b", "b b b b", "a", "a a", "a a a", "a b b"; // ref
+  expect_vanilla += 0, 1, 1, 0, 0, .25, 1, 1, 0;
+  expect_smooth  += 0, .9375, .0625, 0, .00311169, .0441942, .1875, .4375, .161587;
+  expect_stupid  += 0, 1, 1, 0, .0497871, .25, 1, 1, .605707;
+  vector<string> aa, bb;
+  vector<WordID> aai, bbi;
+  double vanilla, smooth, stupid;
+  size_t N = 4;
+  cout << "N = " << N << endl << endl;
+  for ( size_t i = 0; i < a.size(); i++ ) {
+    cout << " hyp: " << a[i] << endl;
+    cout << " ref: " << b[i] << endl;
+    aa.clear(); bb.clear(); aai.clear(); bbi.clear();
+    boost::split( aa, a[i], boost::is_any_of(" ") );
+    boost::split( bb, b[i], boost::is_any_of(" ") );
+    register_and_convert( aa, aai );
+    register_and_convert( bb, bbi );
+    NgramCounts counts = make_ngram_counts( aai, bbi, N );
+    vanilla = bleu( counts, aa.size(), bb.size(), N );
+    smooth  = smooth_bleu( counts, aa.size(), bb.size(), N );
+    stupid  = stupid_bleu( counts, aa.size(), bb.size(), N );
+    assert( approx_equal(vanilla, expect_vanilla[i]) );
+    assert( approx_equal(smooth, expect_smooth[i]) );
+    assert( approx_equal(stupid, expect_stupid[i]) );
+    cout << setw(14) << "bleu = " << vanilla << endl;
+    cout << setw(14) << "smooth bleu = " << smooth << endl;
+    cout << setw(14) << "stupid bleu = " << stupid << endl << endl;
+  }
+}
+
+
+/*
+ * main
+ *
+ */
+int
+main(int argc, char** argv)
+{
+  /*vector<string> v;
+  for (int i = 0; i <= 10; i++) {
+    v.push_back("asdf");
+  }
+  vector<vector<string> > ng = ngrams(v, 5);
+  for (int i = 0; i < ng.size(); i++) {
+    for (int j = 0; j < ng[i].size(); j++) {
+      cout << " " << ng[i][j];
+    }
+    cout << endl;
+  }*/
+
+  test_metrics();
+
+  //NgramCounts counts2 = make_ngram_counts( ref_ids, ref_ids, 4 );
+  //counts += counts2;
+  //cout << counts.clipped[1] << endl;
+
+  //size_t c, r; // c length of candidates, r of references
+  //c += cand.size();
+  //r += ref.size();
+  /*NgramMatches ngm; // for approx bleu
+  ngm.sum = 1;
+  ngm.clipped = 1;
+
+  NgramMatches x;
+  x.clipped = 1;
+  x.sum = 1;
+
+  x += ngm;
+  x += x;
+  x += ngm;
+
+  cout << x.clipped << " " << x.sum << endl;*/
+
+  /*register_feature_functions();
+  SetSilent(true);
+
+  boost::program_options::variables_map conf;
+  if (!init(argc, argv, &conf)) return 1;
+  ReadFile ini_rf(conf["decoder-config"].as<string>());
+  Decoder decoder(ini_rf.stream());
+  Weights weights;
+  SparseVector<double> lambdas;
+  weights.InitSparseVector(&lambdas);
+
+  int k = conf["kbest"].as<int>();
+
+  KBestGetter observer(k);
+  string in, psg;
+  vector<string> strs;
+  int i = 0;
+  while(getline(cin, in)) {
+    if (!SILENT) cerr << "getting kbest for sentence #" << i << endl;
+    strs.clear();
+    boost::split(strs, in, boost::is_any_of("\t"));
+    psg = boost::replace_all_copy(strs[2], " __NEXT_RULE__ ", "\n"); psg += "\n";
+    decoder.SetSentenceGrammar( psg );
+    decoder.Decode( strs[0], &observer );
+    KBestList* kb = observer.getkb();
+    // FIXME not pretty iterating twice over k
+    for (int i = 0; i < k; i++) {
+      for (int j = 0; j < kb->sents[i].size(); ++j) {
+        cout << TD::Convert(kb->sents[i][j]) << endl;
+      }
+    }
+  }
+
+  return 0;*/
+
+  return 0;
+}
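The commented-out decoding loop implies a tab-separated input format: field 1 is the source sentence, field 3 a per-sentence grammar whose rules are joined by " __NEXT_RULE__ " (field 2 is skipped by this code, presumably the reference). A hypothetical input line, with <TAB> standing for the tab separators and the rule strings invented for illustration:

    der mann geht<TAB>the man walks<TAB>[X] ||| der mann ||| the man ||| LogP=-0.4 __NEXT_RULE__ [X] ||| geht ||| walks ||| LogP=-0.7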
+
+
+/*
+ * TODO
+ * for t = 1..T
+ * mapper, reducer (average, handle ngram statistics for approx bleu)
+ * 1st streaming
+ * batch, non-batch in the mapper (what sofia gets)
+ * filter yes/no
+ * sofia: --eta_type explicit
+ * psg preparation
+ * set ref?
+ * shared LM?
+ * X reference(s) for *bleu!?
+ * kbest nicer!? shared_ptr
+ * multipartite
+ * weights! global, per sentence from global, featuremap
+ * todo const
+ */