author     Patrick Simianer <p@simianer.de>  2011-07-31 19:24:02 +0200
committer  Patrick Simianer <p@simianer.de>  2011-09-23 19:13:57 +0200
commit     06829982fb0c03a5b0bbd95ee04de5a0019c5263 (patch)
tree       c48d2b25092ce08bc5557c0410dc1a93c8e5fa16 /dtrain
parent     d980ecbbcd35fba23313aa715046bc0f87a23afd (diff)
bugfixing, begin refactoring
Diffstat (limited to 'dtrain')
-rw-r--r--  dtrain/dcommon.cc  79
-rw-r--r--  dtrain/dcommon.h   71
-rw-r--r--  dtrain/dtest.cc    33
-rw-r--r--  dtrain/dtrain.cc   48
-rw-r--r--  dtrain/learner.h   71
-rwxr-xr-x  dtrain/test.sh      2
6 files changed, 168 insertions, 136 deletions
diff --git a/dtrain/dcommon.cc b/dtrain/dcommon.cc
index a6bdc92c..6657bed6 100644
--- a/dtrain/dcommon.cc
+++ b/dtrain/dcommon.cc
@@ -2,7 +2,11 @@
-/*
+
+/******************************************************************************
+ * NGRAMS
+ *
+ *
* make_ngrams
*
*/
@@ -23,9 +27,6 @@ make_ngrams( vector<WordID>& s, size_t N )
}
-
-
-
/*
* ngram_matches
*
@@ -50,7 +51,12 @@ make_ngram_counts( vector<WordID> hyp, vector<WordID> ref, size_t N )
}
-/*
+
+
+/******************************************************************************
+ * SCORES
+ *
+ *
* brevity_penaly
*
*/
@@ -156,7 +162,12 @@ approx_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len,
}
-/*
+
+
+/******************************************************************************
+ * UTILS
+ *
+ *
* register_and_convert
*
*/
@@ -170,11 +181,39 @@ register_and_convert(const vector<string>& strs, vector<WordID>& ids)
}
+/*
+ * approx_equal
+ *
+ */
+double
+approx_equal( double x, double y )
+{
+ const double EPSILON = 1E-5;
+ if ( x == 0 ) return fabs( y ) <= EPSILON;
+ if ( y == 0 ) return fabs( x ) <= EPSILON;
+ return fabs( x - y ) / max( fabs(x), fabs(y) ) <= EPSILON;
+}
/*
+ * print_FD
+ *
+ */
+void
+print_FD()
+{
+ for ( size_t i = 0; i < FD::NumFeats(); i++ ) cout << FD::Convert(i)<< endl;
+}
+
+
+
+
+/******************************************************************************
+ * TESTS
*
*
+ * test_ngrams
+ *
*/
void
test_ngrams()
@@ -207,21 +246,7 @@ test_ngrams()
/*
- *
- *
- */
-double
-approx_equal( double x, double y )
-{
- const double EPSILON = 1E-5;
- if ( x == 0 ) return fabs( y ) <= EPSILON;
- if ( y == 0 ) return fabs( x ) <= EPSILON;
- return fabs( x - y ) / max( fabs(x), fabs(y) ) <= EPSILON;
-}
-
-
-/*
- *
+ * test_metrics
*
*/
void
@@ -263,8 +288,9 @@ test_metrics()
cout << endl;
}
+
/*
- *
+ * test_SetWeights
*
*/
void
@@ -287,7 +313,7 @@ test_SetWeights()
/*
- *
+ * run_tests
*
*/
void
@@ -302,10 +328,3 @@ run_tests()
exit(0);
}
-
-void
-print_FD()
-{
- for ( size_t i = 0; i < FD::NumFeats(); i++ ) cout << FD::Convert(i)<< endl;
-}
-
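Editor's note: the approx_equal helper moved above compares doubles with a relative epsilon rather than ==, falling back to an absolute test near zero. A self-contained sketch of the same technique (the main and the bool return type are illustrative additions, not part of the commit):

#include <algorithm>
#include <cassert>
#include <cmath>

// Relative-epsilon comparison: values are "equal" if they differ by at
// most EPSILON relative to the larger magnitude; near zero we fall back
// to an absolute test, where a relative one would be meaningless.
bool approx_equal(double x, double y)
{
    const double EPSILON = 1e-5;
    if (x == 0) return std::fabs(y) <= EPSILON;
    if (y == 0) return std::fabs(x) <= EPSILON;
    return std::fabs(x - y) / std::max(std::fabs(x), std::fabs(y)) <= EPSILON;
}

int main()
{
    assert(approx_equal(0.135351, 0.135350)); // relative diff ~7e-6, passes
    assert(!approx_equal(0.5, 0.25));         // clearly different
    return 0;
}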
diff --git a/dtrain/dcommon.h b/dtrain/dcommon.h
index ff796642..6df841bb 100644
--- a/dtrain/dcommon.h
+++ b/dtrain/dcommon.h
@@ -30,6 +30,8 @@ using namespace std;
namespace po = boost::program_options;
+
+
struct ScorePair
{
ScorePair(double modelscore, double score) : modelscore_(modelscore), score_(score) {}
@@ -139,72 +141,7 @@ struct NgramCounts
};
-/*class Learnerx
-{
- public:
- virtual void Init(const vector<SparseVector<double> >& kbest, const Scores& scores) {};
- virtual void Update(SparseVector<double>& lambdas);
-};*/
-
-class SofiaLearner //: public Learnerx FIXME
-{
- // TODO bool invert_score
- public:
- void
- Init( const size_t sid, const vector<SparseVector<double> >& kbest, /*const*/ Scores& scores )
- {
- assert( kbest.size() == scores.size() );
- ofstream o;
- unlink( "/tmo/sofia_ml_training" );
- o.open( "/tmp/sofia_ml_training", ios::trunc ); // TODO randomize, filename exists
- int fid = 0;
- map<int,int>::iterator ff;
- for ( size_t k = 0; k < kbest.size(); ++k ) {
- SparseVector<double>::const_iterator it = kbest[k].begin();
- o << scores[k].GetScore();
- for ( ; it != kbest[k].end(); ++it) {
- ff = fmap.find( it->first );
- if ( ff == fmap.end() ) {
- fmap.insert( pair<int,int>(it->first, fid) );
- fmap1.insert( pair<int,int>(fid, it->first) );
- fid++;
- }
- o << " "<< fmap[it->first] << ":" << it->second;
- }
- o << endl;
- }
- o.close();
- }
-
- void
- Update(SparseVector<double>& lambdas)
- {
- string call = "./sofia-ml --training_file /tmp/sofia_ml_training --model_out /tmp/sofia_ml_model --loop_type stochastic --lambda 100 --dimensionality ";
- std::stringstream out;
- out << fmap.size();
- call += out.str();
- call += " &>/dev/null";
- system ( call.c_str() );
- ifstream i;
- unlink( "/tmo/sofia_ml_model" );
- i.open( "/tmp/sofia_ml_model", ios::in );
- string model;
- getline( i, model );
- //cout << model << endl;
- vector<string> strs;
- boost::split( strs, model, boost::is_any_of(" ") );
- int j = 0;
- for ( vector<string>::iterator it = strs.begin(); it != strs.end(); ++it ) {
- lambdas.set_value(fmap1[j], atof( it->c_str() ) );
- j++;
- }
-
- }
- private:
- map<int,int> fmap;
- map<int,int> fmap1;
-};
typedef map<vector<WordID>, size_t> Ngrams;
Ngrams make_ngrams( vector<WordID>& s, size_t N );
@@ -215,10 +152,6 @@ double stupid_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_
double smooth_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len, const size_t N, vector<float> weights = vector<float>() );
double approx_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len, const size_t N, vector<float> weights = vector<float>() );
void register_and_convert(const vector<string>& strs, vector<WordID>& ids);
-
-
-
-
void print_FD();
void run_tests();
void test_SetWeights();
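Editor's note: dcommon.h keeps the Ngrams typedef, a map from n-grams (vectors of word IDs) to counts. A minimal standalone sketch of n-gram extraction in that style, using plain int in place of cdec's WordID (an assumption of this sketch):

#include <iostream>
#include <map>
#include <vector>

typedef std::map<std::vector<int>, size_t> Ngrams; // n-gram -> count

// Collect all n-grams of order 1..N from sentence s.
Ngrams make_ngrams(const std::vector<int>& s, size_t N)
{
    Ngrams ngrams;
    for (size_t i = 0; i < s.size(); ++i) {
        std::vector<int> ng;
        for (size_t n = 0; n < N && i + n < s.size(); ++n) {
            ng.push_back(s[i + n]);
            ngrams[ng]++; // counts the (n+1)-gram starting at position i
        }
    }
    return ngrams;
}

int main()
{
    int ids[] = { 1, 2, 2, 3 };
    std::vector<int> sent(ids, ids + 4);
    Ngrams ng = make_ngrams(sent, 2);
    std::cout << ng[std::vector<int>(1, 2)] << std::endl; // unigram '2' occurs twice
    std::cout << ng.size() << std::endl;                  // 6 distinct n-grams
    return 0;
}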
diff --git a/dtrain/dtest.cc b/dtrain/dtest.cc
index 9975794f..5ae473e6 100644
--- a/dtrain/dtest.cc
+++ b/dtrain/dtest.cc
@@ -11,11 +11,13 @@ bool
init(int argc, char** argv, po::variables_map* conf)
{
int N;
+ bool q;
po::options_description opts( "Options" );
opts.add_options()
- ( "decoder-config,c", po::value<string>(), "configuration file for cdec" )
- ( "weights,w", po::value<string>(), "weights file")
- ( "ngrams,n", po::value<int>(&N)->default_value(4), "N for Ngrams (default 5)" );
+ ( "decoder-config,c", po::value<string>(), "configuration file for cdec" )
+ ( "weights,w", po::value<string>(), "weights file")
+ ( "ngrams,n", po::value<int>(&N)->default_value(4), "N for Ngrams (default 5)" )
+ ( "quiet,q", po::value<bool>(&q)->default_value(true), "do not output translations" );
po::options_description cmdline_options;
cmdline_options.add(opts);
po::store( parse_command_line(argc, argv, cmdline_options), *conf );
@@ -44,6 +46,7 @@ main(int argc, char** argv)
Decoder decoder(ini_rf.stream());
KBestGetter observer(k);
size_t N = conf["ngrams"].as<int>();
+ bool quiet = conf["quiet"].as<bool>();
Weights weights;
weights.InitFromFile(conf["weights"].as<string>());
@@ -56,13 +59,15 @@ main(int argc, char** argv)
string in, psg;
size_t sid = 0;
double overall = 0.0;
- cerr << "(1 dot equals 100 lines of input)" << endl;
+ double overall1 = 0.0;
+ double overall2 = 0.0;
+ cerr << "(a dot equals 100 lines of input)" << endl;
while( getline(cin, in) ) {
if ( (sid+1) % 100 == 0 ) {
cerr << ".";
if ( (sid+1)%1000 == 0 ) cerr << endl;
}
- if ( sid > 5000 ) break;
+ //if ( sid > 5000 ) break;
strs.clear();
boost::split( strs, in, boost::is_any_of("\t") );
// grammar
@@ -75,19 +80,25 @@ main(int argc, char** argv)
boost::split( ref_strs, strs[1], boost::is_any_of(" ") );
register_and_convert( ref_strs, ref_ids );
// scoring kbest
- double score = 0;
- Scores scores;
+ double score = 0.0;
+ double score1 = 0.0;
+ double score2 = 0.0;
NgramCounts counts = make_ngram_counts( ref_ids, kb->sents[0], 4 );
score = smooth_bleu( counts,
ref_ids.size(),
kb->sents[0].size(), N );
- ScorePair sp( kb->scores[0], score );
- scores.push_back( sp );
- //cout << TD::GetString( kb->sents[0] ) << endl;
+ score1 = stupid_bleu( counts, ref_ids.size(), kb->sents[0].size(), N) ;
+ score2 = bleu( counts, ref_ids.size(), kb->sents[0].size(), N );
+ //if ( ! quiet )
+ cout << TD::GetString( kb->sents[0] ) << endl;
overall += score;
+ overall1 += score1;
+ overall2 += score2;
sid += 1;
}
- cout << "Average score: " << overall/(sid+1) << endl;
+ cerr << "Average score (smooth): " << overall/(double)(sid+1) << endl;
+ cerr << "Average score (stupid): " << overall1/(double)(sid+1) << endl;
+ cerr << "Average score (normal): " << overall2/(double)(sid+1) << endl;
cerr << endl;
return 0;
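Editor's note: dtest now averages three sentence-level metrics (smooth, stupid, and plain BLEU) over the input. For orientation, a sketch of one common smoothing scheme for sentence-level BLEU, adding one to numerator and denominator above the unigram order; this is a generic formulation and dtrain's smooth_bleu may differ in detail:

#include <cmath>
#include <vector>

// Sentence-level BLEU with add-one smoothing for orders above unigram,
// so a single missing higher-order match does not zero the whole score.
// clipped[n]/total[n] hold match/attempt counts per n-gram order (0-indexed).
double smooth_bleu_sketch(const std::vector<double>& clipped,
                          const std::vector<double>& total,
                          size_t hyp_len, size_t ref_len, size_t N)
{
    if (hyp_len == 0) return 0.0;
    double log_prec = 0.0;
    for (size_t n = 0; n < N; ++n) {
        double add = (n > 0) ? 1.0 : 0.0; // smooth orders 2..N only
        log_prec += std::log((clipped[n] + add) / (total[n] + add)) / N;
    }
    // Brevity penalty for hypotheses shorter than the reference.
    double bp = (hyp_len < ref_len)
                ? std::exp(1.0 - (double)ref_len / hyp_len) : 1.0;
    return bp * std::exp(log_prec);
}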
diff --git a/dtrain/dtrain.cc b/dtrain/dtrain.cc
index 95fc81af..373458e8 100644
--- a/dtrain/dtrain.cc
+++ b/dtrain/dtrain.cc
@@ -1,4 +1,5 @@
#include "dcommon.h"
+#include "learner.h"
@@ -45,41 +46,35 @@ main(int argc, char** argv)
ReadFile ini_rf(conf["decoder-config"].as<string>());
Decoder decoder(ini_rf.stream());
KBestGetter observer(k);
- size_t N = 4; // TODO as parameter/in config
+ size_t N = 3; // TODO as parameter/in config
// TODO scoring metric as parameter/in config
// for approx. bleu
- //NgramCounts global_counts;
- //size_t global_hyp_len;
- //size_t global_ref_len;
+ NgramCounts global_counts(N);
+ size_t global_hyp_len = 0;
+ size_t global_ref_len = 0;
Weights weights;
SparseVector<double> lambdas;
weights.InitSparseVector(&lambdas);
vector<double> dense_weights;
- lambdas.set_value(FD::Convert("logp"), 0);
-
-
vector<string> strs, ref_strs;
vector<WordID> ref_ids;
string in, psg;
size_t sid = 0;
cerr << "(1 dot equals 100 lines of input)" << endl;
while( getline(cin, in) ) {
- //if ( !SILENT )
- // cerr << endl << endl << "Getting kbest for sentence #" << sid << endl;
if ( (sid+1) % 100 == 0 ) {
cerr << ".";
if ( (sid+1)%1000 == 0 ) cerr << endl;
}
- if ( sid > 5000 ) break;
+ //if ( sid > 5000 ) break;
// weights
dense_weights.clear();
weights.InitFromVector( lambdas );
weights.InitVector( &dense_weights );
decoder.SetWeights( dense_weights );
- //if ( sid > 100 ) break;
// handling input..
strs.clear();
boost::split( strs, in, boost::is_any_of("\t") );
@@ -94,33 +89,36 @@ main(int argc, char** argv)
register_and_convert( ref_strs, ref_ids );
// scoring kbest
double score = 0;
+ size_t cand_len = 0;
Scores scores;
- for ( size_t i = 0; i < k; i++ ) {
- NgramCounts counts = make_ngram_counts( ref_ids, kb->sents[i], 4 );
- score = smooth_bleu( counts,
- ref_ids.size(),
- kb->sents[i].size(), N );
+ for ( size_t i = 0; i < kb->sents.size(); i++ ) {
+ NgramCounts counts = make_ngram_counts( ref_ids, kb->sents[i], N );
+ if ( i == 0) {
+ global_counts += counts;
+ global_hyp_len += kb->sents[i].size();
+ global_ref_len += ref_ids.size();
+ cand_len = 0;
+ } else {
+ cand_len = kb->sents[i].size();
+ }
+ //score = bleu( global_counts,
+ // global_ref_len,
+ // global_hyp_len + cand_len, N );
+ score = bleu ( counts, ref_ids.size(), kb->sents[i].size(), N );
ScorePair sp( kb->scores[i], score );
scores.push_back( sp );
//cout << "'" << TD::GetString( ref_ids ) << "' vs '" << TD::GetString( kb->sents[i] ) << "' SCORE=" << score << endl;
//cout << kb->feats[i] << endl;
}
- //cout << "###" << endl;
+ // learner
SofiaLearner learner;
learner.Init( sid, kb->feats, scores );
learner.Update(lambdas);
- // initializing learner
- // TODO
- // updating weights
- //lambdas.set_value( FD::Convert("use_shell"), 1 );
- //lambdas.set_value( FD::Convert("use_a"), 1 );
//print_FD();
sid += 1; // TODO does cdec count this already?
}
-
- weights.WriteToFile( "weights-final", true );
-
cerr << endl;
+ weights.WriteToFile( "data/weights-final-normalx", true );
return 0;
}
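Editor's note: the re-enabled global_counts / global_hyp_len / global_ref_len bookkeeping in dtrain.cc is the usual setup for approximate corpus BLEU: each k-best candidate is scored as if its counts were folded into the statistics of the 1-best hypotheses selected so far. A self-contained sketch of that pattern, with all names illustrative (dtrain keeps the same data in NgramCounts and the global_* variables):

#include <cmath>
#include <cstddef>
#include <vector>

// Running corpus statistics: after each input sentence, the 1-best
// hypothesis' n-gram counts are folded into the global totals, and every
// candidate is scored as if its own counts were added on top.
struct BleuStats {
    std::vector<double> clipped, total;   // matches / attempts per order
    size_t hyp_len, ref_len;
    explicit BleuStats(size_t N)
        : clipped(N, 0.0), total(N, 0.0), hyp_len(0), ref_len(0) {}
    void add(const BleuStats& o) {
        for (size_t n = 0; n < clipped.size(); ++n) {
            clipped[n] += o.clipped[n];
            total[n]   += o.total[n];
        }
        hyp_len += o.hyp_len;
        ref_len += o.ref_len;
    }
};

// Unsmoothed corpus BLEU over accumulated statistics.
double bleu_from_stats(const BleuStats& s)
{
    if (s.hyp_len == 0) return 0.0;
    double log_prec = 0.0;
    size_t N = s.clipped.size();
    for (size_t n = 0; n < N; ++n) {
        if (s.clipped[n] == 0) return 0.0;
        log_prec += std::log(s.clipped[n] / s.total[n]) / N;
    }
    double bp = (s.hyp_len < s.ref_len)
                ? std::exp(1.0 - (double)s.ref_len / s.hyp_len) : 1.0;
    return bp * std::exp(log_prec);
}

// Score one candidate against the accumulated history.
double approx_bleu(const BleuStats& global, const BleuStats& cand)
{
    BleuStats sum = global;
    sum.add(cand);
    return bleu_from_stats(sum);
}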
diff --git a/dtrain/learner.h b/dtrain/learner.h
new file mode 100644
index 00000000..a953284d
--- /dev/null
+++ b/dtrain/learner.h
@@ -0,0 +1,71 @@
+/*class Learnerx
+{
+ public:
+ virtual void Init(const vector<SparseVector<double> >& kbest, const Scores& scores) {};
+ virtual void Update(SparseVector<double>& lambdas);
+};*/
+
+class SofiaLearner //: public Learnerx FIXME
+{
+ // TODO bool invert_score
+ public:
+ void
+ Init( const size_t sid, const vector<SparseVector<double> >& kbest, /*const*/ Scores& scores )
+ {
+ assert( kbest.size() == scores.size() );
+ ofstream o;
+ //unlink( "/tmp/sofia_ml_training_stupid" );
+ o.open( "/tmp/sofia_ml_training_normalx", ios::trunc ); // TODO randomize, filename exists
+ int fid = 0;
+ map<int,int>::iterator ff;
+
+ for ( size_t k = 0; k < kbest.size(); ++k ) {
+ map<int,double> m;
+ SparseVector<double>::const_iterator it = kbest[k].begin();
+ o << scores[k].GetScore();
+ for ( ; it != kbest[k].end(); ++it) {
+ ff = fmap.find( it->first );
+ if ( ff == fmap.end() ) {
+ fmap.insert( pair<int,int>(it->first, fid) );
+ fmap1.insert( pair<int,int>(fid, it->first) );
+ fid++;
+ }
+ m.insert(pair<int,double>(fmap[it->first], it->second));
+ }
+ map<int,double>::iterator ti = m.begin();
+ for ( ; ti != m.end(); ++ti ) {
+ o << " " << ti->first << ":" << ti->second;
+ }
+ o << endl;
+ }
+ o.close();
+ }
+
+ void
+ Update(SparseVector<double>& lambdas)
+ {
+ string call = "./sofia-ml --training_file /tmp/sofia_ml_training_normalx --model_out /tmp/sofia_ml_model_normalx --loop_type stochastic --lambda 100 --dimensionality ";
+ std::stringstream out;
+ out << fmap.size();
+ call += out.str();
+ call += " &>/dev/null";
+ system ( call.c_str() );
+ ifstream i;
+ //unlink( "/tmp/sofia_ml_model_stupid" );
+ i.open( "/tmp/sofia_ml_model_normalx", ios::in );
+ string model;
+ getline( i, model );
+ vector<string> strs;
+ boost::split( strs, model, boost::is_any_of(" ") );
+ int j = 0;
+ for ( vector<string>::iterator it = strs.begin(); it != strs.end(); ++it ) {
+ lambdas.set_value(fmap1[j], atof( it->c_str() ) );
+ j++;
+ }
+ }
+
+ private:
+ map<int,int> fmap;
+ map<int,int> fmap1;
+};
+
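Editor's note: the substantive fix in the new learner.h is the intermediate map<int,double> m: feature columns are now written in ascending id order, which SVM-light-format readers such as sofia-ml's generally assume. A standalone sketch of just that serialization step (types and the function name are illustrative; the fmap id remapping is omitted here):

#include <fstream>
#include <map>
#include <utility>
#include <vector>

// One k-best candidate: sparse (feature id -> value) pairs, possibly in
// arbitrary order. Illustrative stand-in for dtrain's SparseVector<double>.
typedef std::vector<std::pair<int, double> > SparseFeats;

// Write candidates as SVM-light-style rows: "score fid:val fid:val ...".
// Buffering each row through a std::map emits features sorted by id --
// the same ordering the fix above enforces with its local map<int,double>.
void write_training_file(const char* path,
                         const std::vector<double>& scores,
                         const std::vector<SparseFeats>& kbest)
{
    std::ofstream o(path, std::ios::trunc);
    for (size_t k = 0; k < kbest.size(); ++k) {
        std::map<int, double> row(kbest[k].begin(), kbest[k].end());
        o << scores[k];
        for (std::map<int, double>::const_iterator it = row.begin();
             it != row.end(); ++it)
            o << " " << it->first << ":" << it->second;
        o << "\n";
    }
}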
diff --git a/dtrain/test.sh b/dtrain/test.sh
index ad45bd1e..bc318ae7 100755
--- a/dtrain/test.sh
+++ b/dtrain/test.sh
@@ -1,4 +1,4 @@
#!/bin/sh
-./dtrain -c data/cdec.ini -k 4 < data/in.blunsom08 #< data/in.toy
+./dtrain -c data/cdec.ini -k 200 < data/in.blunsom08 #< data/in.toy