summary refs log tree commit diff
diff options
context:
space:
mode:
author Patrick Simianer <p@simianer.de> 2011-09-09 00:41:17 +0200
committer Patrick Simianer <p@simianer.de> 2011-09-23 19:13:58 +0200
commit 2b3b084ba45bf4e2e2dc5152afed268b616ee308 (patch)
tree f200ce047f28b3298bf0e7aa2ead4bce9bd223c1
parent 83eb31deb8a2056c098715c8cb29f2498fc213c3 (diff)
added svm, ksampler
-rw-r--r-- decoder/viterbi.h 2
-rw-r--r-- dtrain/Makefile.am 2
-rw-r--r-- dtrain/dtrain.cc 140
-rw-r--r-- dtrain/kbestget.h 6
-rwxr-xr-x dtrain/run.sh 3
-rw-r--r-- dtrain/sample.h 64
-rw-r--r-- dtrain/test/EXAMPLE/cdec.ini 2
-rw-r--r-- dtrain/test/EXAMPLE/dtrain.ini 6
-rw-r--r-- dtrain/test/toy.dtrain.ini 2
9 files changed, 134 insertions, 93 deletions
diff --git a/decoder/viterbi.h b/decoder/viterbi.h
index ac0b9a11..daee3d7a 100644
--- a/decoder/viterbi.h
+++ b/decoder/viterbi.h
@@ -25,7 +25,7 @@ typename WeightFunction::Weight Viterbi(const Hypergraph& hg,
typedef typename WeightFunction::Weight WeightType;
const int num_nodes = hg.nodes_.size();
std::vector<T> vit_result(num_nodes);
- std::vector<WeightType> vit_weight(num_nodes, WeightType::Zero());
+ std::vector<WeightType> vit_weight(num_nodes, WeightType());
for (int i = 0; i < num_nodes; ++i) {
const Hypergraph::Node& cur_node = hg.nodes_[i];
diff --git a/dtrain/Makefile.am b/dtrain/Makefile.am
index afab00f2..c08cd1ea 100644
--- a/dtrain/Makefile.am
+++ b/dtrain/Makefile.am
@@ -1,7 +1,7 @@
# TODO I'm sure I can leave something out.
bin_PROGRAMS = dtrain dtest
-dtrain_SOURCES = dtrain.cc score.cc tests.cc util.cc
+dtrain_SOURCES = dtrain.cc score.cc tests.cc util.cc sample_hg.cc
dtrain_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz -lboost_filesystem -lboost_iostreams
dtest_SOURCES = dtest.cc score.cc util.cc
diff --git a/dtrain/dtrain.cc b/dtrain/dtrain.cc
index 4554e417..d58478a8 100644
--- a/dtrain/dtrain.cc
+++ b/dtrain/dtrain.cc
@@ -3,6 +3,8 @@
#include "util.h"
#include "sample.h"
+#include "ksampler.h"
+
// boost compression
#include <boost/iostreams/device/file.hpp>
#include <boost/iostreams/filtering_stream.hpp>
@@ -11,6 +13,7 @@
//#include <boost/iostreams/filter/bzip2.hpp>
using namespace boost::iostreams;
+
#ifdef DTRAIN_DEBUG
#include "tests.h"
#endif
@@ -101,6 +104,7 @@ ostream& _prec5( ostream& out ) { return out << setprecision(5); }
int
main( int argc, char** argv )
{
+ cout << setprecision( 5 );
// handle most parameters
po::variables_map cfg;
if ( ! init(argc, argv, &cfg) ) exit(1); // something is wrong
@@ -143,7 +147,9 @@ main( int argc, char** argv )
if ( !quiet )
cout << setw(25) << "cdec cfg " << "'" << cfg["decoder_config"].as<string>() << "'" << endl;
Decoder decoder( ini_rf.stream() );
- KBestGetter observer( k, filter_type );
+ //KBestGetter observer( k, filter_type );
+ MT19937 rng;
+ KSampler observer( k, &rng );
// scoring metric/scorer
string scorer_str = cfg["scorer"].as<string>();
@@ -207,11 +213,13 @@ main( int argc, char** argv )
size_t cand_len = 0;
double overall_time = 0.;
- cout << setprecision( 5 );
-
- // for the perceptron
- double eta = 0.5; // TODO as parameter
+ // for the perceptron/SVM; TODO as params
+ double eta = 0.0005;
+ double gamma = 0.01; // -> SVM
lambdas.add_value( FD::Convert("__bias"), 0 );
+
+ // for random sampling
+ srand ( time(NULL) );
for ( size_t t = 0; t < T; t++ ) // T epochs
@@ -284,44 +292,44 @@ main( int argc, char** argv )
weights.InitVector( &dense_weights );
decoder.SetWeights( dense_weights );
- srand ( time(NULL) );
-
- switch ( t ) {
- case 0:
- // handling input
- in_split.clear();
- boost::split( in_split, in, boost::is_any_of("\t") );
- // in_split[0] is id
- //cout << in_split[0] << endl;
- // getting reference
- ref_tok.clear(); ref_ids.clear();
- boost::split( ref_tok, in_split[2], boost::is_any_of(" ") );
- register_and_convert( ref_tok, ref_ids );
- ref_ids_buf.push_back( ref_ids );
- // process and set grammar
- //grammar_buf << in_split[3] << endl;
- grammar_str = boost::replace_all_copy( in_split[3], " __NEXT__RULE__ ", "\n" ) + "\n"; // FIXME copy, __
- grammar_buf << grammar_str << DTRAIN_GRAMMAR_DELIM << endl;
- decoder.SetSentenceGrammarFromString( grammar_str );
- // decode, kbest
- src_str_buf.push_back( in_split[1] );
- decoder.Decode( in_split[1], &observer );
- break;
- default:
- // get buffered grammar
- grammar_str.clear();
- int i = 1;
- while ( true ) {
- string g;
- getline( grammar_buf_in, g );
- if ( g == DTRAIN_GRAMMAR_DELIM ) break;
- grammar_str += g+"\n";
- i += 1;
+ if ( t == 0 ) {
+ // handling input
+ in_split.clear();
+ boost::split( in_split, in, boost::is_any_of("\t") ); // in_split[0] is id
+ // getting reference
+ ref_tok.clear(); ref_ids.clear();
+ boost::split( ref_tok, in_split[2], boost::is_any_of(" ") );
+ register_and_convert( ref_tok, ref_ids );
+ ref_ids_buf.push_back( ref_ids );
+ // process and set grammar
+ bool broken_grammar = true;
+ for ( string::iterator ti = in_split[3].begin(); ti != in_split[3].end(); ti++ ) {
+ if ( !isspace(*ti) ) {
+ broken_grammar = false;
+ break;
}
- decoder.SetSentenceGrammarFromString( grammar_str );
- // decode, kbest
- decoder.Decode( src_str_buf[sid], &observer );
- break;
+ }
+ if ( broken_grammar ) continue;
+ grammar_str = boost::replace_all_copy( in_split[3], " __NEXT__RULE__ ", "\n" ) + "\n"; // FIXME copy, __
+ grammar_buf << grammar_str << DTRAIN_GRAMMAR_DELIM << endl;
+ decoder.SetSentenceGrammarFromString( grammar_str );
+ // decode, kbest
+ src_str_buf.push_back( in_split[1] );
+ decoder.Decode( in_split[1], &observer );
+ } else {
+ // get buffered grammar
+ grammar_str.clear();
+ int i = 1;
+ while ( true ) {
+ string g;
+ getline( grammar_buf_in, g );
+ if ( g == DTRAIN_GRAMMAR_DELIM ) break;
+ grammar_str += g+"\n";
+ i += 1;
+ }
+ decoder.SetSentenceGrammarFromString( grammar_str );
+ // decode, kbest
+ decoder.Decode( src_str_buf[sid], &observer );
}
// get kbest list
@@ -346,6 +354,7 @@ main( int argc, char** argv )
cand_len = kb->sents[i].size();
}
NgramCounts counts_tmp = global_counts + counts;
+ // TODO as param
score = 0.9 * scorer( counts_tmp,
global_ref_len,
global_hyp_len + cand_len, N, bleu_weights );
@@ -380,31 +389,48 @@ main( int argc, char** argv )
TrainingInstances pairs;
- sample_all(kb, pairs);
+ sample_all_rand(kb, pairs);
+ cout << pairs.size() << endl;
for ( TrainingInstances::iterator ti = pairs.begin();
ti != pairs.end(); ti++ ) {
- // perceptron
+
SparseVector<double> dv;
- if ( ti->type == -1 ) {
+ if ( ti->first_score - ti->second_score < 0 ) {
dv = ti->second - ti->first;
- } else {
- dv = ti->first - ti->second;
- }
- dv.add_value(FD::Convert("__bias"), -1);
+ //} else {
+ //dv = ti->first - ti->second;
+ //}
+ dv.add_value( FD::Convert("__bias"), -1 );
+
+ SparseVector<double> reg;
+ reg = lambdas * ( 2 * gamma );
+ dv -= reg;
lambdas += dv * eta;
- /*if ( verbose ) {
- cout << "{{ f(i) > f(j) but g(i) < g(j), so update" << endl;
- cout << " i " << TD::GetString(kb->sents[ii]) << endl;
- cout << " " << kb->feats[ii] << endl;
- cout << " j " << TD::GetString(kb->sents[jj]) << endl;
- cout << " " << kb->feats[jj] << endl;
- cout << " dv " << dv << endl;
+ if ( verbose ) {
+ cout << "{{ f("<< ti->first_rank <<") > f(" << ti->second_rank << ") but g(i)="<< ti->first_score <<" < g(j)="<< ti->second_score << " so update" << endl;
+ cout << " i " << TD::GetString(kb->sents[ti->first_rank]) << endl;
+ cout << " " << kb->feats[ti->first_rank] << endl;
+ cout << " j " << TD::GetString(kb->sents[ti->second_rank]) << endl;
+ cout << " " << kb->feats[ti->second_rank] << endl;
+ cout << " diff vec: " << dv << endl;
+ cout << " lambdas after update: " << lambdas << endl;
cout << "}}" << endl;
- }*/
+ }
+
+ } else {
+ //if ( 0 ) {
+ SparseVector<double> reg;
+ reg = lambdas * ( gamma * 2 );
+ lambdas += reg * ( -eta );
+ //}
+ }
}
+ //double l2 = lambdas.l2norm();
+ //if ( l2 ) lambdas /= lambdas.l2norm();
+
}
++sid;
diff --git a/dtrain/kbestget.h b/dtrain/kbestget.h
index ae4588c9..cf466fe4 100644
--- a/dtrain/kbestget.h
+++ b/dtrain/kbestget.h
@@ -1,8 +1,10 @@
#ifndef _DTRAIN_KBESTGET_H_
#define _DTRAIN_KBESTGET_H_
+
#include "kbest.h"
+
namespace dtrain
{
@@ -65,7 +67,7 @@ struct KBestGetter : public DecoderObserver
if (!d) break;
kb.sents.push_back( d->yield);
kb.feats.push_back( d->feature_values );
- kb.model_scores.push_back( d->score );
+ kb.model_scores.push_back( log(d->score) );
}
}
@@ -83,7 +85,7 @@ struct KBestGetter : public DecoderObserver
if (!d) break;
kb.sents.push_back( d->yield);
kb.feats.push_back( d->feature_values );
- kb.model_scores.push_back( d->score );
+ kb.model_scores.push_back( log(d->score) );
}
}
};
diff --git a/dtrain/run.sh b/dtrain/run.sh
index b2012bcf..16575c25 100755
--- a/dtrain/run.sh
+++ b/dtrain/run.sh
@@ -3,7 +3,8 @@
#INI=test/blunsom08.dtrain.ini
#INI=test/nc-wmt11/dtrain.ini
#INI=test/EXAMPLE/dtrain.ini
-INI=test/toy.dtrain.ini
+INI=test/EXAMPLE/dtrain.ruleids.ini
+#INI=test/toy.dtrain.ini
rm /tmp/dtrain-*
./dtrain -c $INI $1 $2 $3 $4
diff --git a/dtrain/sample.h b/dtrain/sample.h
index b9bc4461..b6aa9abd 100644
--- a/dtrain/sample.h
+++ b/dtrain/sample.h
@@ -1,3 +1,7 @@
+#ifndef _DTRAIN_SAMPLE_H_
+#define _DTRAIN_SAMPLE_H_
+
+
#include "kbestget.h"
@@ -7,9 +11,9 @@ namespace dtrain
struct TPair
{
- double type;
- SparseVector<double> first;
- SparseVector<double> second;
+ SparseVector<double> first, second;
+ size_t first_rank, second_rank;
+ double first_score, second_score;
};
typedef vector<TPair> TrainingInstances;
@@ -18,35 +22,43 @@ typedef vector<TPair> TrainingInstances;
void
sample_all( KBestList* kb, TrainingInstances &training )
{
- double type;
for ( size_t i = 0; i < kb->GetSize()-1; i++ ) {
- for ( size_t j = i+1; j < kb->GetSize(); j++ ) {
- if ( kb->scores[i] - kb->scores[j] < 0 ) {
- type = -1;
- } else {
- type = 1;
- }
- TPair p;
- p.type = type;
- p.first = kb->feats[i];
- p.second = kb->feats[j];
- training.push_back( p );
- }
- }
-}
-
-/*void
-sample_all_only_neg(, vector<pair<SparSparseVector<double> > pairs)
-{
-
+ for ( size_t j = i+1; j < kb->GetSize(); j++ ) {
+ TPair p;
+ p.first = kb->feats[i];
+ p.second = kb->feats[j];
+ p.first_rank = i;
+ p.second_rank = j;
+ p.first_score = kb->scores[i];
+ p.second_score = kb->scores[j];
+ training.push_back( p );
+ }
+ }
}
void
-sample_random_pos()
+sample_all_rand( KBestList* kb, TrainingInstances &training )
{
- if ( rand() % 2 ) { // sample it?
-}*/
+ srand( time(NULL) );
+ for ( size_t i = 0; i < kb->GetSize()-1; i++ ) {
+ for ( size_t j = i+1; j < kb->GetSize(); j++ ) {
+ if ( rand() % 2 ) {
+ TPair p;
+ p.first = kb->feats[i];
+ p.second = kb->feats[j];
+ p.first_rank = i;
+ p.second_rank = j;
+ p.first_score = kb->scores[i];
+ p.second_score = kb->scores[j];
+ training.push_back( p );
+ }
+ }
+ }
+}
} // namespace
+
+#endif
+
diff --git a/dtrain/test/EXAMPLE/cdec.ini b/dtrain/test/EXAMPLE/cdec.ini
index e57138b0..bda0d12c 100644
--- a/dtrain/test/EXAMPLE/cdec.ini
+++ b/dtrain/test/EXAMPLE/cdec.ini
@@ -2,6 +2,6 @@ formalism=scfg
add_pass_through_rules=true
feature_function=WordPenalty
cubepruning_pop_limit=30
-feature_function=KLanguageModel data/nc-wmt11.en.srilm.gz
+feature_function=KLanguageModel /home/pks/src/cdec/dtrain/data/nc-wmt11.en.srilm.gz
scfg_max_span_limit=15
diff --git a/dtrain/test/EXAMPLE/dtrain.ini b/dtrain/test/EXAMPLE/dtrain.ini
index ffafd0b8..7645921a 100644
--- a/dtrain/test/EXAMPLE/dtrain.ini
+++ b/dtrain/test/EXAMPLE/dtrain.ini
@@ -1,10 +1,10 @@
decoder_config=test/EXAMPLE/cdec.ini
kbest=100
ngrams=3
-epochs=8
+epochs=3
input=test/EXAMPLE/dtrain.nc-1k
-scorer=approx_bleu
+scorer=stupid_bleu
output=test/EXAMPLE/weights.gz
-stop_after=1000
+stop_after=100
wprint=Glue WordPenalty LanguageModel LanguageModel_OOV PhraseModel_0 PhraseModel_1 PhraseModel_2 PhraseModel_3 PhraseModel_4 PassThrough
diff --git a/dtrain/test/toy.dtrain.ini b/dtrain/test/toy.dtrain.ini
index 35f76281..e9ed0ce5 100644
--- a/dtrain/test/toy.dtrain.ini
+++ b/dtrain/test/toy.dtrain.ini
@@ -4,7 +4,7 @@ ngrams=1
epochs=3
input=test/toy.in
scorer=bleu
-output=toy.gz
+output=data/w/toy.gz
#stop_after=1000
wprint=logp use_shell use_house PassThrough