author    | Patrick Simianer <p@simianer.de> | 2011-07-24 20:46:22 +0200
committer | Patrick Simianer <p@simianer.de> | 2011-09-23 19:13:57 +0200
commit    | 3da2d6bef3d6f6c0e1e0d8845003db59d5697c90 (patch)
tree      | f76ba4bf59aabc8cce9837dbfb4bbe716e50e1ec /dtrain
parent    | 8e67ebda72410a84830b679db0c46251935e7892 (diff)
added dtrain; implemented the per-sentence grammar (psg) setting in the decoder/translator; implemented per-sentence metrics (BLEU variants), ngram statistics, and the main program carcass
Diffstat (limited to 'dtrain')
-rw-r--r-- | dtrain/.dtrain.cc.swo | bin 0 -> 36864 bytes
-rw-r--r-- | dtrain/Makefile.am    |   6
-rw-r--r-- | dtrain/cdec.ini       |   3
-rwxr-xr-x | dtrain/dtrain         | bin 0 -> 1479522 bytes
-rw-r--r-- | dtrain/dtrain.cc      | 528
-rw-r--r-- | dtrain/dtrain.ini     |   0
-rw-r--r-- | dtrain/in             |   2
-rw-r--r-- | dtrain/input          |   1
-rwxr-xr-x | dtrain/test.sh        |   4
9 files changed, 544 insertions, 0 deletions
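A note on the metrics before the full diff: dtrain.cc below implements several sentence-level BLEU variants. As a sketch read off the code (notation mine: c and r are hypothesis and reference length, p_n = clipped_n / sum_n are the modified n-gram precisions, and w_n are the weights, uniform 1/N by default):

$$\mathrm{BP} = \min\big(1,\ e^{\,1 - r/c}\big), \qquad \mathrm{BLEU} = \mathrm{BP}\cdot\exp\Big(\sum_{n=1}^{N} w_n \log p_n\Big)$$

stupid_bleu adds 1 to both numerator and denominator of p_n for n >= 2; smooth_bleu instead returns $\mathrm{BP}\cdot\sum_{n=1}^{N} p_n^{w_n}/2^{N-n+1}$ over the n with nonzero counts, which is why its header notes a maximum of 0.9375 (= 15/16 for N = 4); approx_bleu currently just delegates to bleu.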
diff --git a/dtrain/.dtrain.cc.swo b/dtrain/.dtrain.cc.swo
new file mode 100644
index 00000000..ee45185d
--- /dev/null
+++ b/dtrain/.dtrain.cc.swo
Binary files differ
diff --git a/dtrain/Makefile.am b/dtrain/Makefile.am
new file mode 100644
index 00000000..daa20cf3
--- /dev/null
+++ b/dtrain/Makefile.am
@@ -0,0 +1,6 @@
+bin_PROGRAMS = dtrain
+
+dtrain_SOURCES = dtrain.cc
+dtrain_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz
+
+AM_CPPFLAGS = -W -Wall -Wno-sign-compare -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval
diff --git a/dtrain/cdec.ini b/dtrain/cdec.ini
new file mode 100644
index 00000000..596fbf71
--- /dev/null
+++ b/dtrain/cdec.ini
@@ -0,0 +1,3 @@
+formalism=scfg
+#k_best=2
+#add_pass_through_rules=true
diff --git a/dtrain/dtrain b/dtrain/dtrain
new file mode 100755
index 00000000..e200e05c
--- /dev/null
+++ b/dtrain/dtrain
Binary files differ
diff --git a/dtrain/dtrain.cc b/dtrain/dtrain.cc
new file mode 100644
index 00000000..25249c7f
--- /dev/null
+++ b/dtrain/dtrain.cc
@@ -0,0 +1,528 @@
+#include <sstream>
+#include <iostream>
+#include <vector>
+#include <map>
+#include <cassert>
+#include <cmath>
+
+#include "config.h"
+
+#include <boost/shared_ptr.hpp>
+#include <boost/algorithm/string.hpp>
+#include <boost/program_options.hpp>
+#include <boost/program_options/variables_map.hpp>
+
+#include "sentence_metadata.h"
+#include "scorer.h"
+#include "verbose.h"
+#include "viterbi.h"
+#include "hg.h"
+#include "prob.h"
+#include "kbest.h"
+#include "ff_register.h"
+#include "decoder.h"
+#include "filelib.h"
+#include "fdict.h"
+#include "weights.h"
+#include "sparse_vector.h"
+#include "sampler.h"
+
+using namespace std;
+namespace boostpo = boost::program_options;
+
+
+/*
+ * init
+ *
+ */
+bool
+init(int argc, char** argv, boostpo::variables_map* conf)
+{
+  boostpo::options_description opts( "Options" );
+  opts.add_options()
+    ( "decoder-config,c", boostpo::value<string>(), "configuration file for cdec" )
+    ( "kbest,k", boostpo::value<int>(), "k for kbest" )
+    ( "ngrams,n", boostpo::value<int>(), "n for Ngrams" )
+    ( "filter,f", boostpo::value<string>(), "filter kbest list" );
+  boostpo::options_description cmdline_options;
+  cmdline_options.add(opts);
+  boostpo::store( parse_command_line(argc, argv, cmdline_options), *conf );
+  boostpo::notify( *conf );
+  if ( ! conf->count("decoder-config") ) {
+    cerr << cmdline_options << endl;
+    return false;
+  }
+  return true;
+}
+
+
+/*
+ * KBestGetter
+ *
+ */
+struct KBestList {
+  vector<SparseVector<double> > feats;
+  vector<vector<WordID> > sents;
+  vector<double> scores;
+};
+struct KBestGetter : public DecoderObserver
+{
+  KBestGetter( const size_t k ) : k_(k) {}
+  size_t k_;
+  KBestList kb;
+
+  virtual void
+  NotifyTranslationForest(const SentenceMetadata& smeta, Hypergraph* hg)
+  {
+    GetKBest(smeta.GetSentenceID(), *hg);
+  }
+
+  KBestList* getkb() { return &kb; }
+
+  void
+  GetKBest(int sent_id, const Hypergraph& forest)
+  {
+    kb.scores.clear();
+    kb.sents.clear();
+    kb.feats.clear();
+    KBest::KBestDerivations<vector<WordID>, ESentenceTraversal> kbest( forest, k_ );
+    for ( size_t i = 0; i < k_; ++i ) {
+      const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal>::Derivation* d =
+        kbest.LazyKthBest( forest.nodes_.size() - 1, i );
+      if (!d) break;
+      kb.sents.push_back( d->yield );
+      kb.feats.push_back( d->feature_values );
+      kb.scores.push_back( d->score );
+    }
+  }
+};
+
+
+/*
+ * sofia_write_training_data
+ *
+ */
+void
+sofia_write_training_data()
+{
+  // TODO
+}
+
+
+/*
+ * sofia_call
+ *
+ */
+void
+sofia_call()
+{
+  // TODO
+}
+
+
+/*
+ * sofia_read_model
+ *
+ */
+void
+sofia_read_model()
+{
+  // TODO
+}
+
+
+/*
+ * make_ngrams
+ *
+ */
+typedef map<vector<WordID>, size_t> Ngrams;
+Ngrams
+make_ngrams( vector<WordID>& s, size_t N )
+{
+  Ngrams ngrams;
+  vector<WordID> ng;
+  for ( size_t i = 0; i < s.size(); i++ ) {
+    ng.clear();
+    for ( size_t j = i; j < min( i+N, s.size() ); j++ ) {
+      ng.push_back( s[j] );
+      ngrams[ng]++;
+    }
+  }
+  return ngrams;
+}
+
+
+/*
+ * NgramCounts
+ *
+ */
+struct NgramCounts
+{
+  NgramCounts( const size_t N ) : N_( N ) {
+    reset();
+  }
+  size_t N_;
+  map<size_t, size_t> clipped;
+  map<size_t, size_t> sum;
+
+  NgramCounts&
+  operator+=( const NgramCounts& rhs )
+  {
+    assert( N_ == rhs.N_ );
+    for ( size_t i = 0; i < N_; i++ ) {
+      this->clipped[i] += rhs.clipped.find(i)->second;
+      this->sum[i] += rhs.sum.find(i)->second;
+    }
+    return *this;
+  }
+
+  void
+  add( size_t count, size_t ref_count, size_t i )
+  {
+    assert( i < N_ );
+    if ( count > ref_count ) {
+      clipped[i] += ref_count;
+      sum[i] += count;
+    } else {
+      clipped[i] += count;
+      sum[i] += count;
+    }
+  }
+
+  void
+  reset()
+  {
+    size_t i;
+    for ( i = 0; i < N_; i++ ) {
+      clipped[i] = 0;
+      sum[i] = 0;
+    }
+  }
+
+  void
+  print()
+  {
+    for ( size_t i = 0; i < N_; i++ ) {
+      cout << i+1 << "grams (clipped):\t" << clipped[i] << endl;
+      cout << i+1 << "grams:\t\t\t" << sum[i] << endl;
+    }
+  }
+};
+
+
+/*
+ * make_ngram_counts
+ *
+ */
+NgramCounts
+make_ngram_counts( vector<WordID> hyp, vector<WordID> ref, size_t N )
+{
+  Ngrams hyp_ngrams = make_ngrams( hyp, N );
+  Ngrams ref_ngrams = make_ngrams( ref, N );
+  NgramCounts counts( N );
+  Ngrams::iterator it;
+  Ngrams::iterator ti;
+  for ( it = hyp_ngrams.begin(); it != hyp_ngrams.end(); it++ ) {
+    ti = ref_ngrams.find( it->first );
+    if ( ti != ref_ngrams.end() ) {
+      counts.add( it->second, ti->second, it->first.size() - 1 );
+    } else {
+      counts.add( it->second, 0, it->first.size() - 1 );
+    }
+  }
+  return counts;
+}
+
+
+/*
+ * brevity_penalty
+ *
+ */
+double
+brevity_penalty( const size_t hyp_len, const size_t ref_len )
+{
+  if ( hyp_len > ref_len ) return 1;
+  return exp( 1 - (double)ref_len/(double)hyp_len );
+}
+
+
+/*
+ * bleu
+ * as in "BLEU: a Method for Automatic Evaluation of Machine Translation" (Papineni et al. '02)
+ * returns 0 if, for any n <= N, the clipped count is 0
+ */
+double
+bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len,
+      size_t N, vector<float> weights = vector<float>() )
+{
+  if ( hyp_len == 0 || ref_len == 0 ) return 0;
+  if ( ref_len < N ) N = ref_len;
+  float N_ = (float)N;
+  if ( weights.empty() )
+  {
+    for ( size_t i = 0; i < N; i++ ) weights.push_back( 1/N_ );
+  }
+  double sum = 0;
+  for ( size_t i = 0; i < N; i++ ) {
+    if ( counts.clipped[i] == 0 || counts.sum[i] == 0 ) return 0;
+    sum += weights[i] * log( (double)counts.clipped[i] / (double)counts.sum[i] );
+  }
+  return brevity_penalty( hyp_len, ref_len ) * exp( sum );
+}
+
+
+/*
+ * stupid_bleu
+ * as in "ORANGE: a Method for Evaluating Automatic Evaluation Metrics for Machine Translation" (Lin & Och '04)
+ * 0 iff no 1gram match
+ */
+double
+stupid_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len,
+             size_t N, vector<float> weights = vector<float>() )
+{
+  if ( hyp_len == 0 || ref_len == 0 ) return 0;
+  if ( ref_len < N ) N = ref_len;
+  float N_ = (float)N;
+  if ( weights.empty() )
+  {
+    for ( size_t i = 0; i < N; i++ ) weights.push_back( 1/N_ );
+  }
+  double sum = 0;
+  float add = 0;
+  for ( size_t i = 0; i < N; i++ ) {
+    if ( i == 1 ) add = 1; // add-one smoothing for bigrams and up
+    sum += weights[i] * log( ((double)counts.clipped[i] + add) / ((double)counts.sum[i] + add) );
+  }
+  return brevity_penalty( hyp_len, ref_len ) * exp( sum );
+}
+
+
+/*
+ * smooth_bleu
+ * as in "An End-to-End Discriminative Approach to Machine Translation" (Liang et al. '06)
+ * max. 0.9375
+ */
+double
+smooth_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len,
+             const size_t N, vector<float> weights = vector<float>() )
+{
+  if ( hyp_len == 0 || ref_len == 0 ) return 0;
+  float N_ = (float)N;
+  if ( weights.empty() )
+  {
+    for ( size_t i = 0; i < N; i++ ) weights.push_back( 1/N_ );
+  }
+  double sum = 0;
+  float j = 1;
+  for ( size_t i = 0; i < N; i++ ) {
+    if ( counts.clipped[i] == 0 || counts.sum[i] == 0 ) continue;
+    sum += exp((weights[i] * log((double)counts.clipped[i]/(double)counts.sum[i]))) / pow( 2, N_-j+1 );
+    j++;
+  }
+  return brevity_penalty( hyp_len, ref_len ) * sum;
+}
+
+
+/*
+ * approx_bleu
+ * as in "Online Large-Margin Training for Statistical Machine Translation" (Watanabe et al. '07)
+ *
+ */
+double
+approx_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len,
+             const size_t N, vector<float> weights = vector<float>() )
+{
+  return bleu( counts, hyp_len, ref_len, N, weights );
+}
+
+
+/*
+ * register_and_convert
+ *
+ */
+void
+register_and_convert(const vector<string>& strs, vector<WordID>& ids)
+{
+  vector<string>::const_iterator it;
+  for ( it = strs.begin(); it < strs.end(); it++ ) {
+    ids.push_back( TD::Convert( *it ) );
+  }
+}
+
+
+void
+test_ngrams()
+{
+  cout << "Testing ngrams..." << endl << endl;
+  size_t N = 5;
+  vector<int> a; // hyp
+  vector<int> b; // ref
+  cout << "a ";
+  for (size_t i = 1; i <= 8; i++) {
+    cout << i << " ";
+    a.push_back(i);
+  }
+  cout << endl << "b ";
+  for (size_t i = 1; i <= 4; i++) {
+    cout << i << " ";
+    b.push_back(i);
+  }
+  cout << endl << endl;
+  NgramCounts c = make_ngram_counts( a, b, N );
+  assert( c.clipped[N-1] == 0 );
+  assert( c.sum[N-1] == 4 );
+  c.print();
+  c += c;
+  cout << endl;
+  c.print();
+}
+
+bool
+approx_equal( double x, double y )
+{
+  const double EPSILON = 1E-5;
+  if ( x == 0 ) return fabs(y) <= EPSILON;
+  if ( y == 0 ) return fabs(x) <= EPSILON;
+  return fabs( x - y ) / max( fabs(x), fabs(y) ) <= EPSILON;
+}
+
+
+#include <boost/assign/std/vector.hpp>
+#include <iomanip>
+void
+test_metrics()
+{
+  cout << "Testing metrics..." << endl << endl;
+  using namespace boost::assign;
+  vector<string> a, b;
+  vector<double> expect_vanilla, expect_smooth, expect_stupid;
+  a += "a a a a", "a a a a", "a", "a", "b", "a a a a", "a a", "a a a", "a b a"; // hyp
+  b += "b b b b", "a a a a", "a", "b", "b b b b", "a", "a a", "a a a", "a b b"; // ref
+  expect_vanilla += 0, 1, 1, 0, 0, .25, 1, 1, 0;
+  expect_smooth += 0, .9375, .0625, 0, .00311169, .0441942, .1875, .4375, .161587;
+  expect_stupid += 0, 1, 1, 0, .0497871, .25, 1, 1, .605707;
+  vector<string> aa, bb;
+  vector<WordID> aai, bbi;
+  double vanilla, smooth, stupid;
+  size_t N = 4;
+  cout << "N = " << N << endl << endl;
+  for ( size_t i = 0; i < a.size(); i++ ) {
+    cout << " hyp: " << a[i] << endl;
+    cout << " ref: " << b[i] << endl;
+    aa.clear(); bb.clear(); aai.clear(); bbi.clear();
+    boost::split( aa, a[i], boost::is_any_of(" ") );
+    boost::split( bb, b[i], boost::is_any_of(" ") );
+    register_and_convert( aa, aai );
+    register_and_convert( bb, bbi );
+    NgramCounts counts = make_ngram_counts( aai, bbi, N );
+    vanilla = bleu( counts, aa.size(), bb.size(), N );
+    smooth = smooth_bleu( counts, aa.size(), bb.size(), N );
+    stupid = stupid_bleu( counts, aa.size(), bb.size(), N );
+    assert( approx_equal(vanilla, expect_vanilla[i]) );
+    assert( approx_equal(smooth, expect_smooth[i]) );
+    assert( approx_equal(stupid, expect_stupid[i]) );
+    cout << setw(14) << "bleu = " << vanilla << endl;
+    cout << setw(14) << "smooth bleu = " << smooth << endl;
+    cout << setw(14) << "stupid bleu = " << stupid << endl << endl;
+  }
+}
+
+
+/*
+ * main
+ *
+ */
+int
+main(int argc, char** argv)
+{
+  /*vector<string> v;
+  for (int i = 0; i <= 10; i++) {
+    v.push_back("asdf");
+  }
+  vector<vector<string> > ng = ngrams(v, 5);
+  for (int i = 0; i < ng.size(); i++) {
+    for (int j = 0; j < ng[i].size(); j++) {
+      cout << " " << ng[i][j];
+    }
+    cout << endl;
+  }*/
+
+  test_metrics();
+
+
+  //NgramCounts counts2 = make_ngram_counts( ref_ids, ref_ids, 4);
+  //counts += counts2;
+  //cout << counts.clipped[1] << endl;
+
+  //size_t c, r; // c length of candidates, r of references
+  //c += cand.size();
+  //r += ref.size();
+  /*NgramMatches ngm; // for approx bleu
+  ngm.sum = 1;
+  ngm.clipped = 1;
+
+  NgramMatches x;
+  x.clipped = 1;
+  x.sum = 1;
+
+  x += ngm;
+  x += x;
+  x += ngm;
+
+  cout << x.clipped << " " << x.sum << endl;*/
+
+
+  /*register_feature_functions();
+  SetSilent(true);
+
+  boost::program_options::variables_map conf;
+  if (!init(argc, argv, &conf)) return 1;
+  ReadFile ini_rf(conf["decoder-config"].as<string>());
+  Decoder decoder(ini_rf.stream());
+  Weights weights;
+  SparseVector<double> lambdas;
+  weights.InitSparseVector(&lambdas);
+
+  int k = conf["kbest"].as<int>();
+
+  KBestGetter observer(k);
+  string in, psg;
+  vector<string> strs;
+  int i = 0;
+  while(getline(cin, in)) {
+    if (!SILENT) cerr << "getting kbest for sentence #" << i << endl;
+    strs.clear();
+    boost::split(strs, in, boost::is_any_of("\t"));
+    psg = boost::replace_all_copy(strs[2], " __NEXT_RULE__ ", "\n"); psg += "\n";
+    decoder.SetSentenceGrammar( psg );
+    decoder.Decode( strs[0], &observer );
+    KBestList* kb = observer.getkb();
+    // FIXME not pretty iterating twice over k
+    for (int i = 0; i < k; i++) {
+      for (int j = 0; j < kb->sents[i].size(); ++j) {
+        cout << TD::Convert(kb->sents[i][j]) << endl;
+      }
+    }
+  }
+
+  return 0;*/
+}
+
+
+/*
+ * TODO
+ * for t = 1..T
+ * mapper, reducer (average, handle ngram statistics for approx bleu)
+ * 1st streaming
+ * batch, non-batch in the mapper (what sofia gets)
+ * filter yes/no
+ * sofia: --eta_type explicit
+ * psg preparation
+ * set ref?
+ * shared LM?
+ * X reference(s) for *bleu!?
+ * kbest nicer!? shared_ptr
+ * multipartite
+ * weights! global, per sentence from global, featuremap
+ * todo const
+ */
diff --git a/dtrain/dtrain.ini b/dtrain/dtrain.ini
new file mode 100644
index 00000000..e69de29b
--- /dev/null
+++ b/dtrain/dtrain.ini
diff --git a/dtrain/in b/dtrain/in
new file mode 100644
index 00000000..294d009b
--- /dev/null
+++ b/dtrain/in
@@ -0,0 +1,2 @@
+vorrichtung means [X] ||| vorrichtung ||| apparatus ||| LogP=-200 ||| 0-0 __NEXT_RULE__ [X] ||| vorrichtung ||| means ||| LogP=-101 ||| 0-0
+eintest test [X] ||| eintest ||| test ||| LogP=-200 ||| 0-0 __NEXT_RULE__ [X] ||| eintest ||| xxx ||| LogP=-101 ||| 0-0
diff --git a/dtrain/input b/dtrain/input
new file mode 100644
index 00000000..ff005d22
--- /dev/null
+++ b/dtrain/input
@@ -0,0 +1 @@
+ich sah ein kleines haus @@@@@@@@ i saw a little shell @@@@@@@@ [S] ||| [NP,1] [VP,2] ||| [1] [2] ||| logp=0 __NEXT_RULE__ [NP] ||| ich ||| i ||| logp=0 __NEXT_RULE__ [NP] ||| ein [NN,1] ||| a [1] ||| logp=0 __NEXT_RULE__ [NN] ||| [JJ,1] haus ||| [1] house ||| logp=-0.5 __NEXT_RULE__ [NN] ||| [JJ,1] haus ||| [1] shell ||| logp=-0.9 __NEXT_RULE__ [JJ] ||| kleines ||| small ||| logp=-0.5 __NEXT_RULE__ [JJ] ||| kleines ||| little ||| logp=-0.9 __NEXT_RULE__ [JJ] ||| grosses ||| big ||| logp=-0.5 __NEXT_RULE__ [JJ] ||| grosses ||| large ||| logp=-1.5 __NEXT_RULE__ [VP] ||| [V,1] [NP,2] ||| [1] [2] ||| logp=0 __NEXT_RULE__ [V] ||| sah ||| saw ||| logp=0 __NEXT_RULE__ [V] ||| fand ||| found ||| logp=0 @@@@@@@@ logp
diff --git a/dtrain/test.sh b/dtrain/test.sh
new file mode 100755
index 00000000..508038ec
--- /dev/null
+++ b/dtrain/test.sh
@@ -0,0 +1,4 @@
+#!/bin/sh
+
+./dtrain -c cdec.ini -k 2 < in
+
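For completeness, a note on the stdin format the (still commented-out) decoding loop in dtrain.cc expects: each line is split on tabs; field 0 goes to the decoder as the source sentence, and field 2, after the " __NEXT_RULE__ " separators are replaced by newlines, is installed as the per-sentence grammar via SetSentenceGrammar(). The middle field is presumably the reference translation, though the loop does not read it yet; these field labels are mine, not from the source:

    <source> \t <reference> \t <rule> __NEXT_RULE__ <rule> __NEXT_RULE__ ...

dtrain/in above follows this shape, and test.sh exercises the pipeline with ./dtrain -c cdec.ini -k 2 < in.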