author    Patrick Simianer <p@simianer.de>    2011-09-08 00:06:52 +0200
committer Patrick Simianer <p@simianer.de>    2011-09-23 19:13:58 +0200
commit    0269777fc54bc554c12107bdd5498f743df2a1ce (patch)
tree      05032f88088c2154b4c0ce815bb176ac49dc9b7e
parent    bcf45fc73bd855a3003dee7a8a0b7551eeb0523b (diff)
a lot of stuff, fast_sparse_vector, perceptron, removed sofia, sample [...]
-rw-r--r--  dtrain/README                         15
-rw-r--r--  dtrain/dtrain.cc                      87
-rw-r--r--  dtrain/kbestget.h                     12
-rwxr-xr-x  dtrain/run.sh                          8
-rw-r--r--  dtrain/sample.h                       52
-rw-r--r--  dtrain/score.h                        16
-rw-r--r--  dtrain/test/EXAMPLE/cdec.ini           1
-rw-r--r--  dtrain/test/EXAMPLE/dtrain.ini         6
-rw-r--r--  dtrain/test/log_reg/bin_class.cc       4
-rw-r--r--  dtrain/test/log_reg/bin_class.h       22
-rw-r--r--  dtrain/test/log_reg/log_reg.cc        39
-rw-r--r--  dtrain/test/log_reg/log_reg.h         14
-rw-r--r--  dtrain/test/nc-wmt11/dtrain.ini        2
-rw-r--r--  dtrain/test/toy.dtrain.ini             3
-rw-r--r--  dtrain/test/toy.in                     4
-rw-r--r--  dtrain/test/toy_cdec/cdec.ini          3
-rw-r--r--  dtrain/test/toy_cdec/grammar          12
-rw-r--r--  dtrain/test/toy_cdec/in                1
-rw-r--r--  dtrain/test/toy_cdec/weights           2
-rw-r--r--  utils/fast_sparse_vector.h            64
20 files changed, 293 insertions, 74 deletions
diff --git a/dtrain/README b/dtrain/README
index 74bac6a0..b3f513be 100644
--- a/dtrain/README
+++ b/dtrain/README
@@ -1,7 +1,7 @@
NOTES
learner gets all used features (binary! and dense (logprob is sum of logprobs!))
weights: see decoder/decoder.cc line 548
- 40k sents, k=100 = ~400M mem, 1 iteration 45min
+ (40k sents, k=100 = ~400M mem, 1 iteration 45min)?
utils/weights.cc: why wv_?
FD, Weights::wv_ grow too large, see utils/weights.cc;
decoder/hg.h; decoder/scfg_translator.cc; utils/fdict.cc
@@ -15,25 +15,26 @@ TODO
GENERATED data? (multi-task, ability to learn, perfect translation in nbest, at first all modelscore 1)
CACHING (ngrams for scoring)
 hadoop PIPES implementation
- SHARED LM?
+ SHARED LM (kenlm actually does this!)?
ITERATION variants
once -> average
shuffle resulting weights
weights AVERAGING in reducer (global Ngram counts)
BATCH implementation (no update after each Kbest list)
- SOFIA --eta_type explicit
set REFERENCE for cdec (rescoring)?
MORE THAN ONE reference for BLEU?
kbest NICER (do not iterate twice)!? -> shared_ptr?
DO NOT USE Decoder::Decode (input caching as WordID)!?
sparse vector instead of vector<double> for weights in Decoder(::SetWeights)?
reactivate DTEST and tests
- non deterministic, high variance, RANDOWM RESTARTS
+ non deterministic, high variance, RANDOM RESTARTS
use separate TEST SET
 KNOWN BUGS / PROBLEMS
- does probably OVERFIT
- cdec kbest vs 1best (no -k param) fishy!
+ cdec kbest vs 1best (no -k param), rescoring? => ok(?)
+ no sparse vector in decoder => ok
+ ? ok
sh: error while loading shared libraries: libreadline.so.6: cannot open shared object file: Error 24
- PhraseModel_* features (0..99 seem to be generated, default?)
+ PhraseModel_* features (0..99 seem to be generated, why 99?)
+ flex scanner jams on malicious input, we could skip that
diff --git a/dtrain/dtrain.cc b/dtrain/dtrain.cc
index 30ced234..4554e417 100644
--- a/dtrain/dtrain.cc
+++ b/dtrain/dtrain.cc
@@ -1,7 +1,7 @@
#include "common.h"
#include "kbestget.h"
-#include "updater.h"
#include "util.h"
+#include "sample.h"
// boost compression
#include <boost/iostreams/device/file.hpp>
@@ -85,18 +85,21 @@ init(int argc, char** argv, po::variables_map* cfg)
}
+// output formatting
ostream& _nopos( ostream& out ) { return out << resetiosflags( ios::showpos ); }
ostream& _pos( ostream& out ) { return out << setiosflags( ios::showpos ); }
ostream& _prec2( ostream& out ) { return out << setprecision(2); }
ostream& _prec5( ostream& out ) { return out << setprecision(5); }
+
+
/*
- * main
+ * dtrain
*
*/
int
-main(int argc, char** argv)
+main( int argc, char** argv )
{
// handle most parameters
po::variables_map cfg;
@@ -202,11 +205,14 @@ main(int argc, char** argv)
bool next = false, stop = false;
double score = 0.;
size_t cand_len = 0;
- Scores scores;
double overall_time = 0.;
cout << setprecision( 5 );
+ // for the perceptron
+ double eta = 0.5; // TODO as parameter
+ lambdas.add_value( FD::Convert("__bias"), 0 );
+
for ( size_t t = 0; t < T; t++ ) // T epochs
{
@@ -278,12 +284,15 @@ main(int argc, char** argv)
weights.InitVector( &dense_weights );
decoder.SetWeights( dense_weights );
+ srand ( time(NULL) );
+
switch ( t ) {
case 0:
// handling input
in_split.clear();
boost::split( in_split, in, boost::is_any_of("\t") );
// in_split[0] is id
+ //cout << in_split[0] << endl;
// getting reference
ref_tok.clear(); ref_ids.clear();
boost::split( ref_tok, in_split[2], boost::is_any_of(" ") );
@@ -291,7 +300,7 @@ main(int argc, char** argv)
ref_ids_buf.push_back( ref_ids );
// process and set grammar
//grammar_buf << in_split[3] << endl;
- grammar_str = boost::replace_all_copy( in_split[3], " __NEXT__RULE__ ", "\n" ) + "\n";
+ grammar_str = boost::replace_all_copy( in_split[3], " __NEXT__RULE__ ", "\n" ) + "\n"; // FIXME copy, __
grammar_buf << grammar_str << DTRAIN_GRAMMAR_DELIM << endl;
decoder.SetSentenceGrammarFromString( grammar_str );
// decode, kbest
@@ -316,14 +325,16 @@ main(int argc, char** argv)
}
// get kbest list
- KBestList* kb = observer.GetKBest();
+ KBestList* kb;
+ //if ( ) { // TODO get from forest
+ kb = observer.GetKBest();
+ //}
// scoring kbest
- scores.clear();
if ( t > 0 ) ref_ids = ref_ids_buf[sid];
- for ( size_t i = 0; i < kb->sents.size(); i++ ) {
+ for ( size_t i = 0; i < kb->GetSize(); i++ ) {
NgramCounts counts = make_ngram_counts( ref_ids, kb->sents[i], N );
- // for approx bleu
+ // this is for approx bleu
if ( scorer_str == "approx_bleu" ) {
if ( i == 0 ) { // 'context of 1best translations'
global_counts += counts;
@@ -346,29 +357,54 @@ main(int argc, char** argv)
kb->sents[i].size(), N, bleu_weights );
}
+ kb->scores.push_back( score );
+
if ( i == 0 ) {
acc_1best_score += score;
- acc_1best_model += kb->scores[i];
+ acc_1best_model += kb->model_scores[i];
}
- // scorer score and model score
- ScorePair sp( kb->scores[i], score );
- scores.push_back( sp );
-
if ( verbose ) {
- cout << "k=" << i+1 << " '" << TD::GetString( ref_ids ) << "'[ref] vs '";
- cout << _prec5 << _nopos << TD::GetString( kb->sents[i] ) << "'[hyp]";
- cout << " [SCORE=" << score << ",model="<< kb->scores[i] << "]" << endl;
- //cout << kb->feats[i] << endl; // this is maybe too verbose
+ if ( i == 0 ) cout << "'" << TD::GetString( ref_ids ) << "' [ref]" << endl;
+ cout << _prec5 << _nopos << "[hyp " << i << "] " << "'" << TD::GetString( kb->sents[i] ) << "'";
+ cout << " [SCORE=" << score << ",model="<< kb->model_scores[i] << "]" << endl;
+ cout << kb->feats[i] << endl; // this is maybe too verbose
}
} // Nbest loop
+
if ( verbose ) cout << endl;
- // update weights; TODO other updaters
+
+ // UPDATE WEIGHTS
if ( !noup ) {
- SofiaUpdater updater;
- updater.Init( sid, kb->feats, scores );
- updater.Update( lambdas );
+
+ TrainingInstances pairs;
+
+ sample_all(kb, pairs);
+
+ for ( TrainingInstances::iterator ti = pairs.begin();
+ ti != pairs.end(); ti++ ) {
+ // perceptron
+ SparseVector<double> dv;
+ if ( ti->type == -1 ) {
+ dv = ti->second - ti->first;
+ } else {
+ dv = ti->first - ti->second;
+ }
+ dv.add_value(FD::Convert("__bias"), -1);
+ lambdas += dv * eta;
+
+ /*if ( verbose ) {
+ cout << "{{ f(i) > f(j) but g(i) < g(j), so update" << endl;
+ cout << " i " << TD::GetString(kb->sents[ii]) << endl;
+ cout << " " << kb->feats[ii] << endl;
+ cout << " j " << TD::GetString(kb->sents[jj]) << endl;
+ cout << " " << kb->feats[jj] << endl;
+ cout << " dv " << dv << endl;
+ cout << "}}" << endl;
+ }*/
+ }
+
}
++sid;
@@ -426,7 +462,7 @@ main(int argc, char** argv)
} // outer loop
- //unlink( grammar_buf_tmp_fn );
+ unlink( grammar_buf_tmp_fn );
if ( !noup ) {
if ( !quiet ) cout << endl << "writing weights file '" << cfg["output"].as<string>() << "' ...";
weights.WriteToFile( cfg["output"].as<string>(), true );
@@ -439,11 +475,6 @@ main(int argc, char** argv)
cout << _prec2 << "This took " << overall_time/60. << " min." << endl;
}
- // don't do this with many features...
- /*for ( size_t i = 0; i < FD::NumFeats(); i++ ) {
- cout << FD::Convert(i) << " " << dense_weights[i] << endl;
- }*/
-
return 0;
}
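
The update loop that replaces the SofiaUpdater above is a plain pairwise perceptron step: for each sampled pair, the weights are moved toward the feature vector of the hypothesis with the better metric score (note that, as committed, the update fires on every pair, not only on pairs the model mis-ranks). A minimal standalone sketch of the same rule, with std::map standing in for cdec's SparseVector<double>:

    #include <map>
    #include <string>

    typedef std::map<std::string, double> Feats; // stand-in for SparseVector<double>

    // One perceptron step toward `better` and away from `worse`;
    // eta is the learning rate (hard-coded to 0.5 in dtrain.cc above).
    void perceptron_update( Feats& lambdas, const Feats& better,
                            const Feats& worse, double eta )
    {
      Feats dv( better ); // dv = better - worse
      for ( Feats::const_iterator it = worse.begin(); it != worse.end(); ++it )
        dv[it->first] -= it->second;
      dv["__bias"] -= 1; // mirrors dv.add_value( FD::Convert("__bias"), -1 )
      for ( Feats::const_iterator it = dv.begin(); it != dv.end(); ++it )
        lambdas[it->first] += eta * it->second; // lambdas += dv * eta
    }
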
diff --git a/dtrain/kbestget.h b/dtrain/kbestget.h
index bb430b85..ae4588c9 100644
--- a/dtrain/kbestget.h
+++ b/dtrain/kbestget.h
@@ -14,7 +14,9 @@ namespace dtrain
struct KBestList {
vector<SparseVector<double> > feats;
vector<vector<WordID> > sents;
+ vector<double> model_scores;
vector<double> scores;
+ size_t GetSize() { return sents.size(); }
};
@@ -52,9 +54,10 @@ struct KBestGetter : public DecoderObserver
void
KBestUnique( const Hypergraph& forest )
{
- kb.scores.clear();
kb.sents.clear();
kb.feats.clear();
+ kb.model_scores.clear();
+ kb.scores.clear();
KBest::KBestDerivations<vector<WordID>, ESentenceTraversal, KBest::FilterUnique, prob_t, EdgeProb> kbest( forest, k_ );
for ( size_t i = 0; i < k_; ++i ) {
const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal, KBest::FilterUnique, prob_t, EdgeProb>::Derivation* d =
@@ -62,16 +65,17 @@ struct KBestGetter : public DecoderObserver
if (!d) break;
kb.sents.push_back( d->yield);
kb.feats.push_back( d->feature_values );
- kb.scores.push_back( d->score );
+ kb.model_scores.push_back( d->score );
}
}
void
KBestNoFilter( const Hypergraph& forest )
{
- kb.scores.clear();
kb.sents.clear();
kb.feats.clear();
+ kb.model_scores.clear();
+ kb.scores.clear();
KBest::KBestDerivations<vector<WordID>, ESentenceTraversal> kbest( forest, k_ );
for ( size_t i = 0; i < k_; ++i ) {
const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal>::Derivation* d =
@@ -79,7 +83,7 @@ struct KBestGetter : public DecoderObserver
if (!d) break;
kb.sents.push_back( d->yield);
kb.feats.push_back( d->feature_values );
- kb.scores.push_back( d->score );
+ kb.model_scores.push_back( d->score );
}
}
};
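
After this change a KBestList carries four parallel vectors indexed by k-best rank: sents, feats, the decoder's model_scores, and the metric scores pushed in by dtrain.cc after scoring; GetSize() returns their common length. A small usage sketch (dump is our name, not part of the commit, and assumes the list has already been scored):

    void dump( KBestList* kb )
    {
      for ( size_t i = 0; i < kb->GetSize(); i++ ) {
        cout << "rank " << i+1
             << " model=" << kb->model_scores[i]
             << " metric=" << kb->scores[i]
             << " '" << TD::GetString( kb->sents[i] ) << "'" << endl;
      }
    }
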
diff --git a/dtrain/run.sh b/dtrain/run.sh
index cdaea067..b2012bcf 100755
--- a/dtrain/run.sh
+++ b/dtrain/run.sh
@@ -1,8 +1,10 @@
#!/bin/sh
-INI=test/blunsom08.dtrain.ini
-#INI=test/nc-wmt11/nc-wmt11.loo.dtrain.ini
+#INI=test/blunsom08.dtrain.ini
+#INI=test/nc-wmt11/dtrain.ini
+#INI=test/EXAMPLE/dtrain.ini
+INI=test/toy.dtrain.ini
rm /tmp/dtrain-*
-./dtrain -c $INI $1 $2 $3 $4 2>/dev/null
+./dtrain -c $INI $1 $2 $3 $4
diff --git a/dtrain/sample.h b/dtrain/sample.h
new file mode 100644
index 00000000..b9bc4461
--- /dev/null
+++ b/dtrain/sample.h
@@ -0,0 +1,52 @@
+#include "kbestget.h"
+
+
+namespace dtrain
+{
+
+
+struct TPair
+{
+ double type;
+ SparseVector<double> first;
+ SparseVector<double> second;
+};
+
+typedef vector<TPair> TrainingInstances;
+
+
+void
+sample_all( KBestList* kb, TrainingInstances &training )
+{
+ double type;
+ for ( size_t i = 0; i < kb->GetSize()-1; i++ ) {
+ for ( size_t j = i+1; j < kb->GetSize(); j++ ) {
+ if ( kb->scores[i] - kb->scores[j] < 0 ) {
+ type = -1;
+ } else {
+ type = 1;
+ }
+ TPair p;
+ p.type = type;
+ p.first = kb->feats[i];
+ p.second = kb->feats[j];
+ training.push_back( p );
+ }
+ }
+}
+
+/*void
+sample_all_only_neg(, vector<pair<SparSparseVector<double> > pairs)
+{
+
+}
+
+void
+sample_random_pos()
+{
+ if ( rand() % 2 ) { // sample it?
+}*/
+
+
+} // namespace
+
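
sample_all enumerates all k*(k-1)/2 pairs of the k-best list (4950 pairs per sentence at the default kbest=100); type only records which element of a pair has the better metric score. The commented-out sample_all_only_neg stub suggests a variant that keeps just the pairs the current model mis-ranks; one plausible completion, using the same KBestList fields (our guess at the intent, not code from this commit):

    void
    sample_all_only_neg( KBestList* kb, TrainingInstances &training )
    {
      for ( size_t i = 0; i < kb->GetSize()-1; i++ ) {
        for ( size_t j = i+1; j < kb->GetSize(); j++ ) {
          bool metric_prefers_i = kb->scores[i] >= kb->scores[j];
          bool model_prefers_i = kb->model_scores[i] >= kb->model_scores[j];
          if ( metric_prefers_i == model_prefers_i ) continue; // ranked correctly, skip
          TPair p;
          p.type = metric_prefers_i ? 1 : -1;
          p.first = kb->feats[i];
          p.second = kb->feats[j];
          training.push_back( p );
        }
      }
    }
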
diff --git a/dtrain/score.h b/dtrain/score.h
index 4314157b..e88387c5 100644
--- a/dtrain/score.h
+++ b/dtrain/score.h
@@ -18,22 +18,6 @@ namespace dtrain
/*
- * ScorePair
- *
- */
-struct ScorePair
-{
- ScorePair(double modelscore, double score) : modelscore_(modelscore), score_(score) {}
- double modelscore_, score_;
- double GetModelScore() { return modelscore_; }
- double GetScore() { return score_; }
-};
-
-
-typedef vector<ScorePair> Scores;
-
-
-/*
* NgramCounts
*
*/
diff --git a/dtrain/test/EXAMPLE/cdec.ini b/dtrain/test/EXAMPLE/cdec.ini
index b6e92b5f..e57138b0 100644
--- a/dtrain/test/EXAMPLE/cdec.ini
+++ b/dtrain/test/EXAMPLE/cdec.ini
@@ -2,5 +2,6 @@ formalism=scfg
add_pass_through_rules=true
feature_function=WordPenalty
cubepruning_pop_limit=30
+feature_function=KLanguageModel data/nc-wmt11.en.srilm.gz
scfg_max_span_limit=15
diff --git a/dtrain/test/EXAMPLE/dtrain.ini b/dtrain/test/EXAMPLE/dtrain.ini
index 1467b332..ffafd0b8 100644
--- a/dtrain/test/EXAMPLE/dtrain.ini
+++ b/dtrain/test/EXAMPLE/dtrain.ini
@@ -1,10 +1,10 @@
decoder_config=test/EXAMPLE/cdec.ini
kbest=100
ngrams=3
-epochs=22
+epochs=8
input=test/EXAMPLE/dtrain.nc-1k
scorer=approx_bleu
output=test/EXAMPLE/weights.gz
-stop_after=5
-wprint=Glue WordPenalty LanguageModel LanguageModel_OOV PhraseModel_0 PhraseModel_1 PhraseModel_2 PhraseModel_3 PhraseModel_4
+stop_after=1000
+wprint=Glue WordPenalty LanguageModel LanguageModel_OOV PhraseModel_0 PhraseModel_1 PhraseModel_2 PhraseModel_3 PhraseModel_4 PassThrough
diff --git a/dtrain/test/log_reg/bin_class.cc b/dtrain/test/log_reg/bin_class.cc
new file mode 100644
index 00000000..19bcde25
--- /dev/null
+++ b/dtrain/test/log_reg/bin_class.cc
@@ -0,0 +1,4 @@
+#include "bin_class.h"
+
+Objective::~Objective() {}
+
diff --git a/dtrain/test/log_reg/bin_class.h b/dtrain/test/log_reg/bin_class.h
new file mode 100644
index 00000000..3466109a
--- /dev/null
+++ b/dtrain/test/log_reg/bin_class.h
@@ -0,0 +1,22 @@
+#ifndef _BIN_CLASS_H_
+#define _BIN_CLASS_H_
+
+#include <vector>
+#include "sparse_vector.h"
+
+struct TrainingInstance {
+ // TODO add other info? loss for MIRA-type updates?
+ SparseVector<double> x_feature_map;
+ bool y;
+};
+
+struct Objective {
+ virtual ~Objective();
+
+ // returns f(x) and f'(x)
+ virtual double ObjectiveAndGradient(const SparseVector<double>& x,
+ const std::vector<TrainingInstance>& training_instances,
+ SparseVector<double>* g) const = 0;
+};
+
+#endif
diff --git a/dtrain/test/log_reg/log_reg.cc b/dtrain/test/log_reg/log_reg.cc
new file mode 100644
index 00000000..ec2331fe
--- /dev/null
+++ b/dtrain/test/log_reg/log_reg.cc
@@ -0,0 +1,39 @@
+#include "log_reg.h"
+
+#include <vector>
+#include <cmath>
+
+#include "sparse_vector.h"
+
+using namespace std;
+
+double LogisticRegression::ObjectiveAndGradient(const SparseVector<double>& x,
+ const vector<TrainingInstance>& training_instances,
+ SparseVector<double>* g) const {
+ double cll = 0;
+ for (int i = 0; i < training_instances.size(); ++i) {
+ const double dotprod = training_instances[i].x_feature_map.dot(x); // TODO no bias, if bias, add x[0]
+ double lp_false = dotprod;
+ double lp_true = -dotprod;
+ if (0 < lp_true) {
+ lp_true += log1p(exp(-lp_true));
+ lp_false = log1p(exp(lp_false));
+ } else {
+ lp_true = log1p(exp(lp_true));
+ lp_false += log1p(exp(-lp_false));
+ }
+ lp_true *= -1;
+ lp_false *= -1;
+ if (training_instances[i].y) { // true label
+ cll -= lp_true;
+ (*g) -= training_instances[i].x_feature_map * exp(lp_false);
+ // (*g)[0] -= exp(lp_false); // bias
+ } else { // false label
+ cll -= lp_false;
+ (*g) += training_instances[i].x_feature_map * exp(lp_true);
+ // g += corpus[i].second * exp(lp_true);
+ }
+ }
+ return cll;
+}
+
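
The sign branch in ObjectiveAndGradient is the standard numerically stable softplus: computing log(1+exp(z)) naively overflows for large z, but rewriting it as z + log1p(exp(-z)) when z > 0 keeps the exponent negative. The net effect is lp_true = log sigmoid(w.x) and lp_false = log sigmoid(-w.x). The trick in isolation (helper names are ours):

    #include <cmath>

    // Stable softplus: log(1 + exp(z)). For z > 0, exp(z) may overflow,
    // but log(1 + exp(z)) = z + log1p(exp(-z)) stays in range.
    double softplus( double z )
    {
      if ( z > 0 ) return z + log1p( exp(-z) );
      return log1p( exp(z) );
    }

    // log sigmoid(z) = -softplus(-z); with z = dotprod this is lp_true
    // above, and log_sigmoid(-z) is lp_false.
    double log_sigmoid( double z ) { return -softplus( -z ); }
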
diff --git a/dtrain/test/log_reg/log_reg.h b/dtrain/test/log_reg/log_reg.h
new file mode 100644
index 00000000..ecc560b8
--- /dev/null
+++ b/dtrain/test/log_reg/log_reg.h
@@ -0,0 +1,14 @@
+#ifndef _LOG_REG_H_
+#define _LOG_REG_H_
+
+#include <vector>
+#include "sparse_vector.h"
+#include "bin_class.h"
+
+struct LogisticRegression : public Objective {
+ double ObjectiveAndGradient(const SparseVector<double>& x,
+ const std::vector<TrainingInstance>& training_instances,
+ SparseVector<double>* g) const;
+};
+
+#endif
diff --git a/dtrain/test/nc-wmt11/dtrain.ini b/dtrain/test/nc-wmt11/dtrain.ini
index 51033f2d..ddbf5da7 100644
--- a/dtrain/test/nc-wmt11/dtrain.ini
+++ b/dtrain/test/nc-wmt11/dtrain.ini
@@ -2,7 +2,7 @@ decoder_config=test/nc-wmt11/cdec.ini
kbest=100
ngrams=3
epochs=8
-input=data/nc-wmt11.loo.localf.p0.500.rule-id #nc-wmt11-de-en-dyer-cs-joshua.tok.lc.fixamp1.loo.psg.dtrain.localf.p0
+input=data/nc-wmt11.loo.localf.p0.500.rule-id
scorer=approx_bleu
output=data/w/nc-wmt11.loo.p0.weights.gz
#stop_after=100
diff --git a/dtrain/test/toy.dtrain.ini b/dtrain/test/toy.dtrain.ini
index cacb3a2c..35f76281 100644
--- a/dtrain/test/toy.dtrain.ini
+++ b/dtrain/test/toy.dtrain.ini
@@ -2,8 +2,9 @@ decoder_config=test/cdec.ini
kbest=4
ngrams=1
epochs=3
-input=data/in.toy
+input=test/toy.in
scorer=bleu
output=toy.gz
#stop_after=1000
+wprint=logp use_shell use_house PassThrough
diff --git a/dtrain/test/toy.in b/dtrain/test/toy.in
index 63f97158..989a1f77 100644
--- a/dtrain/test/toy.in
+++ b/dtrain/test/toy.in
@@ -1,2 +1,2 @@
-0 ich sah ein kleines haus i saw a little house [S] ||| [NP,1] [VP,2] ||| [1] [2] ||| logp=0 __NEXT_RULE__ [NP] ||| ich ||| i ||| logp=0 __NEXT_RULE__ [NP] ||| ein [NN,1] ||| a [1] ||| logp=0 __NEXT_RULE__ [NN] ||| [JJ,1] haus ||| [1] house ||| logp=0 use_house=1 __NEXT_RULE__ [NN] ||| [JJ,1] haus ||| [1] shell ||| logp=0 use_shell=1 __NEXT_RULE__ [JJ] ||| kleines ||| small ||| logp=0 __NEXT_RULE__ [JJ] ||| kleines ||| little ||| logp=0 __NEXT_RULE__ [JJ] ||| grosses ||| big ||| logp=0 __NEXT_RULE__ [JJ] ||| grosses ||| large ||| logp=0 __NEXT_RULE__ [VP] ||| [V,1] [NP,2] ||| [1] [2] ||| logp=0 __NEXT_RULE__ [V] ||| sah ||| saw ||| logp=0 __NEXT_RULE__ [V] ||| fand ||| found ||| logp=0
-1 ich fand ein grosses haus i found a large house [S] ||| [NP,1] [VP,2] ||| [1] [2] ||| logp=0 __NEXT_RULE__ [NP] ||| ich ||| i ||| logp=0 __NEXT_RULE__ [NP] ||| ein [NN,1] ||| a [1] ||| logp=0 __NEXT_RULE__ [NN] ||| [JJ,1] haus ||| [1] house ||| logp=0 use_house=1 __NEXT_RULE__ [NN] ||| [JJ,1] haus ||| [1] shell ||| logp=0 use_shell=1 __NEXT_RULE__ [JJ] ||| kleines ||| small ||| logp=0 __NEXT_RULE__ [JJ] ||| kleines ||| little ||| logp=0 __NEXT_RULE__ [JJ] ||| grosses ||| big ||| logp=0 __NEXT_RULE__ [JJ] ||| grosses ||| large ||| logp=0 __NEXT_RULE__ [VP] ||| [V,1] [NP,2] ||| [1] [2] ||| logp=0 __NEXT_RULE__ [V] ||| sah ||| saw ||| logp=0 __NEXT_RULE__ [V] ||| fand ||| found ||| logp=0
+0 ich sah ein kleines haus i saw a little house [S] ||| [NP,1] [VP,2] ||| [1] [2] ||| logp=0 __NEXT__RULE__ [NP] ||| ich ||| i ||| logp=0 __NEXT__RULE__ [NP] ||| ein [NN,1] ||| a [1] ||| logp=0 __NEXT__RULE__ [NN] ||| [JJ,1] haus ||| [1] house ||| logp=0 use_house=1 __NEXT__RULE__ [NN] ||| [JJ,1] haus ||| [1] shell ||| logp=0 use_shell=1 __NEXT__RULE__ [JJ] ||| kleines ||| small ||| logp=0 __NEXT__RULE__ [JJ] ||| kleines ||| little ||| logp=0 __NEXT__RULE__ [JJ] ||| grosses ||| big ||| logp=0 __NEXT__RULE__ [JJ] ||| grosses ||| large ||| logp=0 __NEXT__RULE__ [VP] ||| [V,1] [NP,2] ||| [1] [2] ||| logp=0 __NEXT__RULE__ [V] ||| sah ||| saw ||| logp=0 __NEXT__RULE__ [V] ||| fand ||| found ||| logp=0
+1 ich fand ein grosses haus i found a large house [S] ||| [NP,1] [VP,2] ||| [1] [2] ||| logp=0 __NEXT__RULE__ [NP] ||| ich ||| i ||| logp=0 __NEXT__RULE__ [NP] ||| ein [NN,1] ||| a [1] ||| logp=0 __NEXT__RULE__ [NN] ||| [JJ,1] haus ||| [1] house ||| logp=0 use_house=1 __NEXT__RULE__ [NN] ||| [JJ,1] haus ||| [1] shell ||| logp=0 use_shell=1 __NEXT__RULE__ [JJ] ||| kleines ||| small ||| logp=0 __NEXT__RULE__ [JJ] ||| kleines ||| little ||| logp=0 __NEXT__RULE__ [JJ] ||| grosses ||| big ||| logp=0 __NEXT__RULE__ [JJ] ||| grosses ||| large ||| logp=0 __NEXT__RULE__ [VP] ||| [V,1] [NP,2] ||| [1] [2] ||| logp=0 __NEXT__RULE__ [V] ||| sah ||| saw ||| logp=0 __NEXT__RULE__ [V] ||| fand ||| found ||| logp=0
diff --git a/dtrain/test/toy_cdec/cdec.ini b/dtrain/test/toy_cdec/cdec.ini
new file mode 100644
index 00000000..3a6bab68
--- /dev/null
+++ b/dtrain/test/toy_cdec/cdec.ini
@@ -0,0 +1,3 @@
+formalism=scfg
+grammar=../dtrain/test/toy_cdec/grammar
+add_pass_through_rules=true
diff --git a/dtrain/test/toy_cdec/grammar b/dtrain/test/toy_cdec/grammar
new file mode 100644
index 00000000..aeed75ef
--- /dev/null
+++ b/dtrain/test/toy_cdec/grammar
@@ -0,0 +1,12 @@
+[S] ||| [NP,1] [VP,2] ||| [1] [2] ||| logp=0
+[NP] ||| ich ||| i ||| logp=0
+[NP] ||| ein [NN,1] ||| a [1] ||| logp=0
+[NN] ||| [JJ,1] haus ||| [1] house ||| logp=0 use_house=1
+[NN] ||| [JJ,1] haus ||| [1] shell ||| logp=0 use_shell=1
+[JJ] ||| kleines ||| small ||| logp=0
+[JJ] ||| kleines ||| little ||| logp=0
+[JJ] ||| grosses ||| big ||| logp=0
+[JJ] ||| grosses ||| large ||| logp=0
+[VP] ||| [V,1] [NP,2] ||| [1] [2] ||| logp=0
+[V] ||| sah ||| saw ||| logp=0
+[V] ||| fand ||| found ||| logp=0
diff --git a/dtrain/test/toy_cdec/in b/dtrain/test/toy_cdec/in
new file mode 100644
index 00000000..e6df9275
--- /dev/null
+++ b/dtrain/test/toy_cdec/in
@@ -0,0 +1 @@
+ich sah ein kleines haus
diff --git a/dtrain/test/toy_cdec/weights b/dtrain/test/toy_cdec/weights
new file mode 100644
index 00000000..10d7ed83
--- /dev/null
+++ b/dtrain/test/toy_cdec/weights
@@ -0,0 +1,2 @@
+logp 1
+use_shell 1
diff --git a/utils/fast_sparse_vector.h b/utils/fast_sparse_vector.h
index 4aae2039..1301581a 100644
--- a/utils/fast_sparse_vector.h
+++ b/utils/fast_sparse_vector.h
@@ -7,6 +7,8 @@
// important: indexes are integers
// important: iterators may return elements in any order
+#include "config.h"
+
#include <cmath>
#include <cstring>
#include <climits>
@@ -16,6 +18,12 @@
#include <boost/static_assert.hpp>
+#if HAVE_BOOST_ARCHIVE_TEXT_OARCHIVE_HPP
+#include <boost/serialization/map.hpp>
+#endif
+
+#include "fdict.h"
+
// this is architecture dependent, it should be
// detected in some way but it's probably easiest (for me)
// to just set it
@@ -235,6 +243,13 @@ class FastSparseVector {
}
return *this;
}
+ FastSparseVector<T> erase_zeros(const T& EPSILON = 1e-4) const {
+ FastSparseVector<T> o;
+ for (const_iterator it = begin(); it != end(); ++it) {
+ if (fabs(it->second) > EPSILON) o.set_value(it->first, it->second);
+ }
+ return o;
+ }
const_iterator begin() const {
return const_iterator(*this, false);
}
@@ -327,8 +342,45 @@ class FastSparseVector {
} data_;
unsigned char local_size_;
bool is_remote_;
+
+#if HAVE_BOOST_ARCHIVE_TEXT_OARCHIVE_HPP
+ private:
+ friend class boost::serialization::access;
+ template<class Archive>
+ void save(Archive & ar, const unsigned int version) const {
+ (void) version;
+ int eff_size = size();
+ const_iterator it = this->begin();
+ if (eff_size > 0) {
+ // 0 index is reserved as empty
+ if (it->first == 0) { ++it; --eff_size; }
+ }
+ ar & eff_size;
+ while (it != this->end()) {
+ const std::pair<const std::string&, const T&> wire_pair(FD::Convert(it->first), it->second);
+ ar & wire_pair;
+ ++it;
+ }
+ }
+ template<class Archive>
+ void load(Archive & ar, const unsigned int version) {
+ (void) version;
+ this->clear();
+ int sz; ar & sz;
+ for (int i = 0; i < sz; ++i) {
+ std::pair<std::string, T> wire_pair;
+ ar & wire_pair;
+ this->set_value(FD::Convert(wire_pair.first), wire_pair.second);
+ }
+ }
+ BOOST_SERIALIZATION_SPLIT_MEMBER()
+#endif
};
+#if HAVE_BOOST_ARCHIVE_TEXT_OARCHIVE_HPP
+BOOST_CLASS_TRACKING(FastSparseVector<double>,track_never)
+#endif
+
template <typename T>
const FastSparseVector<T> operator+(const FastSparseVector<T>& x, const FastSparseVector<T>& y) {
if (x.size() > y.size()) {
@@ -344,15 +396,9 @@ const FastSparseVector<T> operator+(const FastSparseVector<T>& x, const FastSpar
template <typename T>
const FastSparseVector<T> operator-(const FastSparseVector<T>& x, const FastSparseVector<T>& y) {
- if (x.size() > y.size()) {
- FastSparseVector<T> res(x);
- res -= y;
- return res;
- } else {
- FastSparseVector<T> res(y);
- res -= x;
- return res;
- }
+ FastSparseVector<T> res(x);
+ res -= y;
+ return res;
}
template <class T>
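
Two notes on the fast_sparse_vector.h changes above. The save/load pair serializes entries by feature name (FD::Convert maps ids to strings on save and back to ids on load), so a vector written by one process can be read by another whose FD id assignment differs, which is what the Hadoop pipeline sketched in the README would need; BOOST_SERIALIZATION_SPLIT_MEMBER() just tells boost to dispatch serialize() to the separate save/load methods. And the operator- change fixes a real bug: the removed branch copied the copy-the-larger-side optimization from operator+, but subtraction is not commutative, so whenever x.size() <= y.size() the function silently returned y - x. For example, with x = {f:1} and y = {f:3, g:1}, the old code produced {f:2, g:1} instead of the correct {f:-2, g:-1}; always starting from a copy of x fixes this at the cost of the optimization.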