author     Patrick Simianer <p@simianer.de>    2011-09-08 00:06:52 +0200
committer  Patrick Simianer <p@simianer.de>    2011-09-23 19:13:58 +0200
commit     0269777fc54bc554c12107bdd5498f743df2a1ce (patch)
tree       05032f88088c2154b4c0ce815bb176ac49dc9b7e
parent     bcf45fc73bd855a3003dee7a8a0b7551eeb0523b (diff)
a lot of stuff, fast_sparse_vector, perceptron, removed sofia, sample [...]
-rw-r--r--  dtrain/README                     |  15
-rw-r--r--  dtrain/dtrain.cc                  |  87
-rw-r--r--  dtrain/kbestget.h                 |  12
-rwxr-xr-x  dtrain/run.sh                     |   8
-rw-r--r--  dtrain/sample.h                   |  52
-rw-r--r--  dtrain/score.h                    |  16
-rw-r--r--  dtrain/test/EXAMPLE/cdec.ini      |   1
-rw-r--r--  dtrain/test/EXAMPLE/dtrain.ini    |   6
-rw-r--r--  dtrain/test/log_reg/bin_class.cc  |   4
-rw-r--r--  dtrain/test/log_reg/bin_class.h   |  22
-rw-r--r--  dtrain/test/log_reg/log_reg.cc    |  39
-rw-r--r--  dtrain/test/log_reg/log_reg.h     |  14
-rw-r--r--  dtrain/test/nc-wmt11/dtrain.ini   |   2
-rw-r--r--  dtrain/test/toy.dtrain.ini        |   3
-rw-r--r--  dtrain/test/toy.in                |   4
-rw-r--r--  dtrain/test/toy_cdec/cdec.ini     |   3
-rw-r--r--  dtrain/test/toy_cdec/grammar      |  12
-rw-r--r--  dtrain/test/toy_cdec/in           |   1
-rw-r--r--  dtrain/test/toy_cdec/weights      |   2
-rw-r--r--  utils/fast_sparse_vector.h        |  64
20 files changed, 293 insertions, 74 deletions
diff --git a/dtrain/README b/dtrain/README
index 74bac6a0..b3f513be 100644
--- a/dtrain/README
+++ b/dtrain/README
@@ -1,7 +1,7 @@
 NOTES
  learner gets all used features (binary! and dense (logprob is sum of logprobs!))
  weights: see decoder/decoder.cc line 548
- 40k sents, k=100 = ~400M mem, 1 iteration 45min
+ (40k sents, k=100 = ~400M mem, 1 iteration 45min)?
  utils/weights.cc: why wv_?
  FD, Weights::wv_ grow too large, see utils/weights.cc; decoder/hg.h; decoder/scfg_translator.cc; utils/fdict.cc
@@ -15,25 +15,26 @@ TODO
  GENERATED data? (multi-task, ability to learn, perfect translation in nbest, at first all modelscore 1)
  CACHING (ngrams for scoring)
  hadoop PIPES imlementation
- SHARED LM?
+ SHARED LM (kenlm actually does this!)?
  ITERATION variants
   once -> average
   shuffle resulting weights
  weights AVERAGING in reducer (global Ngram counts)
  BATCH implementation (no update after each Kbest list)
- SOFIA --eta_type explicit
  set REFERENCE for cdec (rescoring)?
  MORE THAN ONE reference for BLEU?
  kbest NICER (do not iterate twice)!? -> shared_ptr?
  DO NOT USE Decoder::Decode (input caching as WordID)!?
  sparse vector instead of vector<double> for weights in Decoder(::SetWeights)?
  reactivate DTEST and tests
- non deterministic, high variance, RANDOWM RESTARTS
+ non deterministic, high variance, RANDOM RESTARTS
  use separate TEST SET
 KNOWN BUGS PROBLEMS
- does probably OVERFIT
- cdec kbest vs 1best (no -k param) fishy!
+ cdec kbest vs 1best (no -k param), rescoring? => ok(?)
+ no sparse vector in decoder => ok
+ ? ok
  sh: error while loading shared libraries: libreadline.so.6: cannot open shared object file: Error 24
- PhraseModel_* features (0..99 seem to be generated, default?)
+ PhraseModel_* features (0..99 seem to be generated, why 99?)
+ flex scanner jams on malicious input, we could skip that
diff --git a/dtrain/dtrain.cc b/dtrain/dtrain.cc
index 30ced234..4554e417 100644
--- a/dtrain/dtrain.cc
+++ b/dtrain/dtrain.cc
@@ -1,7 +1,7 @@
 #include "common.h"
 #include "kbestget.h"
-#include "updater.h"
 #include "util.h"
+#include "sample.h"
 
 // boost compression
 #include <boost/iostreams/device/file.hpp>
@@ -85,18 +85,21 @@ init(int argc, char** argv, po::variables_map* cfg)
 }
 
 
+// output formatting
 ostream& _nopos( ostream& out ) { return out << resetiosflags( ios::showpos ); }
 ostream& _pos( ostream& out ) { return out << setiosflags( ios::showpos ); }
 ostream& _prec2( ostream& out ) { return out << setprecision(2); }
 ostream& _prec5( ostream& out ) { return out << setprecision(5); }
+
+
 
 /*
- * main
+ * dtrain
  *
  */
 int
-main(int argc, char** argv)
+main( int argc, char** argv )
 {
   // handle most parameters
   po::variables_map cfg;
@@ -202,11 +205,14 @@ main(int argc, char** argv)
   bool next = false, stop = false;
   double score = 0.;
   size_t cand_len = 0;
-  Scores scores;
   double overall_time = 0.;
 
   cout << setprecision( 5 );
 
+  // for the perceptron
+  double eta = 0.5; // TODO as parameter
+  lambdas.add_value( FD::Convert("__bias"), 0 );
+
   for ( size_t t = 0; t < T; t++ ) // T epochs
   {
@@ -278,12 +284,15 @@ main(int argc, char** argv)
     weights.InitVector( &dense_weights );
     decoder.SetWeights( dense_weights );
 
+    srand ( time(NULL) );
+
     switch ( t ) {
       case 0:
         // handling input
         in_split.clear();
         boost::split( in_split, in, boost::is_any_of("\t") );
         // in_split[0] is id
+        //cout << in_split[0] << endl;
         // getting reference
         ref_tok.clear(); ref_ids.clear();
         boost::split( ref_tok, in_split[2], boost::is_any_of(" ") );
@@ -291,7 +300,7 @@ main(int argc, char** argv)
         ref_ids_buf.push_back( ref_ids );
         // process and set grammar
         //grammar_buf << in_split[3] << endl;
-        grammar_str = boost::replace_all_copy( in_split[3], " __NEXT__RULE__ ", "\n" ) + "\n";
+        grammar_str = boost::replace_all_copy( in_split[3], " __NEXT__RULE__ ", "\n" ) + "\n"; // FIXME copy, __
         grammar_buf << grammar_str << DTRAIN_GRAMMAR_DELIM << endl;
         decoder.SetSentenceGrammarFromString( grammar_str );
         // decode, kbest
@@ -316,14 +325,16 @@ main(int argc, char** argv)
     }
 
     // get kbest list
-    KBestList* kb = observer.GetKBest();
+    KBestList* kb;
+    //if ( ) { // TODO get from forest
+    kb = observer.GetKBest();
+    //}
 
     // scoring kbest
-    scores.clear();
     if ( t > 0 ) ref_ids = ref_ids_buf[sid];
-    for ( size_t i = 0; i < kb->sents.size(); i++ ) {
+    for ( size_t i = 0; i < kb->GetSize(); i++ ) {
       NgramCounts counts = make_ngram_counts( ref_ids, kb->sents[i], N );
-      // for approx bleu
+      // this is for approx bleu
       if ( scorer_str == "approx_bleu" ) {
         if ( i == 0 ) { // 'context of 1best translations'
           global_counts += counts;
@@ -346,29 +357,54 @@ main(int argc, char** argv)
                              kb->sents[i].size(), N, bleu_weights );
       }
 
+      kb->scores.push_back( score );
+
       if ( i == 0 ) {
         acc_1best_score += score;
-        acc_1best_model += kb->scores[i];
+        acc_1best_model += kb->model_scores[i];
       }
 
-      // scorer score and model score
-      ScorePair sp( kb->scores[i], score );
-      scores.push_back( sp );
-
       if ( verbose ) {
-        cout << "k=" << i+1 << " '" << TD::GetString( ref_ids ) << "'[ref] vs '";
-        cout << _prec5 << _nopos << TD::GetString( kb->sents[i] ) << "'[hyp]";
-        cout << " [SCORE=" << score << ",model="<< kb->scores[i] << "]" << endl;
-        //cout << kb->feats[i] << endl; // this is maybe too verbose
+        if ( i == 0 ) cout << "'" << TD::GetString( ref_ids ) << "' [ref]" << endl;
+        cout << _prec5 << _nopos << "[hyp " << i << "] " << "'" << TD::GetString( kb->sents[i] ) << "'";
+        cout << " [SCORE=" << score << ",model="<< kb->model_scores[i] << "]" << endl;
+        cout << kb->feats[i] << endl; // this is maybe too verbose
       }
     } // Nbest loop
+
     if ( verbose ) cout << endl;
 
-    // update weights; TODO other updaters
+
+    // UPDATE WEIGHTS
     if ( !noup ) {
-      SofiaUpdater updater;
-      updater.Init( sid, kb->feats, scores );
-      updater.Update( lambdas );
+
+      TrainingInstances pairs;
+
+      sample_all(kb, pairs);
+
+      for ( TrainingInstances::iterator ti = pairs.begin();
+            ti != pairs.end(); ti++ ) {
+        // perceptron
+        SparseVector<double> dv;
+        if ( ti->type == -1 ) {
+          dv = ti->second - ti->first;
+        } else {
+          dv = ti->first - ti->second;
+        }
+        dv.add_value(FD::Convert("__bias"), -1);
+        lambdas += dv * eta;
+
+        /*if ( verbose ) {
+          cout << "{{ f(i) > f(j) but g(i) < g(j), so update" << endl;
+          cout << " i " << TD::GetString(kb->sents[ii]) << endl;
+          cout << "   " << kb->feats[ii] << endl;
+          cout << " j " << TD::GetString(kb->sents[jj]) << endl;
+          cout << "   " << kb->feats[jj] << endl;
+          cout << " dv " << dv << endl;
+          cout << "}}" << endl;
+        }*/
+      }
+
     }
     ++sid;
@@ -426,7 +462,7 @@ main(int argc, char** argv)
   } // outer loop
 
-  //unlink( grammar_buf_tmp_fn );
+  unlink( grammar_buf_tmp_fn );
 
   if ( !noup ) {
     if ( !quiet ) cout << endl << "writing weights file '" << cfg["output"].as<string>() << "' ...";
     weights.WriteToFile( cfg["output"].as<string>(), true );
@@ -439,11 +475,6 @@ main(int argc, char** argv)
     cout << _prec2 << "This took " << overall_time/60. << " min." << endl;
   }
 
-  // don't do this with many features...
-  /*for ( size_t i = 0; i < FD::NumFeats(); i++ ) {
-    cout << FD::Convert(i) << " " << dense_weights[i] << endl;
-  }*/
-
   return 0;
 }
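Note: the new UPDATE WEIGHTS block is a plain pairwise perceptron. For every sampled pair of k-best entries, the weights move by eta times the feature difference, oriented toward the hypothesis the metric prefers; unlike the textbook perceptron there is no check whether the model already ranks the pair correctly. A distilled sketch of the rule, assuming the TPair/TrainingInstances types introduced in dtrain/sample.h further down (the free-standing function name is ours):

    // Distilled form of the UPDATE WEIGHTS block above.
    #include "sparse_vector.h"
    #include "fdict.h"
    #include "sample.h"  // TPair, TrainingInstances

    using namespace dtrain;

    void apply_pairwise_updates(const TrainingInstances& pairs,
                                SparseVector<double>& lambdas,
                                double eta) {
      for (TrainingInstances::const_iterator ti = pairs.begin();
           ti != pairs.end(); ++ti) {
        SparseVector<double> dv;
        if (ti->type == -1)   // the second hypothesis had the better metric score
          dv = ti->second - ti->first;
        else                  // the first hypothesis had the better metric score
          dv = ti->first - ti->second;
        dv.add_value(FD::Convert("__bias"), -1);  // constant bias feature, as above
        lambdas += dv * eta;  // eta = 0.5 is hard-coded above ("TODO as parameter")
      }
    }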
diff --git a/dtrain/kbestget.h b/dtrain/kbestget.h
index bb430b85..ae4588c9 100644
--- a/dtrain/kbestget.h
+++ b/dtrain/kbestget.h
@@ -14,7 +14,9 @@ namespace dtrain
 
 struct KBestList {
   vector<SparseVector<double> > feats;
   vector<vector<WordID> > sents;
+  vector<double> model_scores;
   vector<double> scores;
+  size_t GetSize() { return sents.size(); }
 };
@@ -52,9 +54,10 @@ struct KBestGetter : public DecoderObserver
   void
   KBestUnique( const Hypergraph& forest )
   {
-    kb.scores.clear();
     kb.sents.clear();
     kb.feats.clear();
+    kb.model_scores.clear();
+    kb.scores.clear();
     KBest::KBestDerivations<vector<WordID>, ESentenceTraversal, KBest::FilterUnique, prob_t, EdgeProb> kbest( forest, k_ );
     for ( size_t i = 0; i < k_; ++i ) {
       const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal, KBest::FilterUnique, prob_t, EdgeProb>::Derivation* d =
@@ -62,16 +65,17 @@ struct KBestGetter : public DecoderObserver
       if (!d) break;
       kb.sents.push_back( d->yield);
       kb.feats.push_back( d->feature_values );
-      kb.scores.push_back( d->score );
+      kb.model_scores.push_back( d->score );
     }
   }
 
   void
   KBestNoFilter( const Hypergraph& forest )
   {
-    kb.scores.clear();
     kb.sents.clear();
     kb.feats.clear();
+    kb.model_scores.clear();
+    kb.scores.clear();
     KBest::KBestDerivations<vector<WordID>, ESentenceTraversal> kbest( forest, k_ );
     for ( size_t i = 0; i < k_; ++i ) {
       const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal>::Derivation* d =
@@ -79,7 +83,7 @@ struct KBestGetter : public DecoderObserver
       if (!d) break;
       kb.sents.push_back( d->yield);
       kb.feats.push_back( d->feature_values );
-      kb.scores.push_back( d->score );
+      kb.model_scores.push_back( d->score );
     }
   }
 };
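Note: KBestGetter is a DecoderObserver, so the k-best list is collected as a side effect of decoding. A hedged usage sketch follows; the constructor argument is an assumption (it is not shown in this commit), while Decoder::Decode and GetKBest are used exactly as in dtrain.cc:

    // Sketch: collect a k-best list while decoding one sentence.
    #include "decoder.h"
    #include "kbestget.h"

    using namespace dtrain;

    void decode_one(Decoder& decoder, const std::string& input) {
      KBestGetter observer(100);         // k=100; ctor signature is an assumption
      decoder.Decode(input, &observer);  // decoder notifies the observer with the forest
      KBestList* kb = observer.GetKBest();
      for (size_t i = 0; i < kb->GetSize(); ++i) {
        // kb->sents[i]:        hypothesis as a WordID sequence
        // kb->feats[i]:        its feature vector
        // kb->model_scores[i]: its model score (renamed from 'scores' above)
        // kb->scores[i]:       metric score, filled in later by the caller
      }
    }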
diff --git a/dtrain/run.sh b/dtrain/run.sh
index cdaea067..b2012bcf 100755
--- a/dtrain/run.sh
+++ b/dtrain/run.sh
@@ -1,8 +1,10 @@
 #!/bin/sh
 
-INI=test/blunsom08.dtrain.ini
-#INI=test/nc-wmt11/nc-wmt11.loo.dtrain.ini
+#INI=test/blunsom08.dtrain.ini
+#INI=test/nc-wmt11/dtrain.ini
+#INI=test/EXAMPLE/dtrain.ini
+INI=test/toy.dtrain.ini
 
 rm /tmp/dtrain-*
-./dtrain -c $INI $1 $2 $3 $4 2>/dev/null
+./dtrain -c $INI $1 $2 $3 $4
diff --git a/dtrain/sample.h b/dtrain/sample.h
new file mode 100644
index 00000000..b9bc4461
--- /dev/null
+++ b/dtrain/sample.h
@@ -0,0 +1,52 @@
+#include "kbestget.h"
+
+
+namespace dtrain
+{
+
+
+struct TPair
+{
+  double type;
+  SparseVector<double> first;
+  SparseVector<double> second;
+};
+
+typedef vector<TPair> TrainingInstances;
+
+
+void
+sample_all( KBestList* kb, TrainingInstances &training )
+{
+  double type;
+  for ( size_t i = 0; i < kb->GetSize()-1; i++ ) {
+    for ( size_t j = i+1; j < kb->GetSize(); j++ ) {
+      if ( kb->scores[i] - kb->scores[j] < 0 ) {
+        type = -1;
+      } else {
+        type = 1;
+      }
+      TPair p;
+      p.type = type;
+      p.first = kb->feats[i];
+      p.second = kb->feats[j];
+      training.push_back( p );
+    }
+  }
+}
+
+/*void
+sample_all_only_neg(, vector<pair<SparSparseVector<double> > pairs)
+{
+
+}
+
+void
+sample_random_pos()
+{
+  if ( rand() % 2 ) { // sample it?
+}*/
+
+
+} // namespace
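Note: sample_all enumerates every unordered pair of the k-best list, so it yields k*(k-1)/2 training instances per sentence -- 4950 pairs for k=100 -- which is why kb->scores must be filled first and why smarter sampling (the commented-out sample_all_only_neg / sample_random_pos stubs) is left as future work. Usage, as in dtrain.cc above:

    // kb->scores must be populated before pairing.
    TrainingInstances pairs;
    sample_all(kb, pairs);   // pushes k*(k-1)/2 TPairs for a k-entry list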
diff --git a/dtrain/score.h b/dtrain/score.h
index 4314157b..e88387c5 100644
--- a/dtrain/score.h
+++ b/dtrain/score.h
@@ -18,22 +18,6 @@ namespace dtrain
 
 
 /*
- * ScorePair
- *
- */
-struct ScorePair
-{
-  ScorePair(double modelscore, double score) : modelscore_(modelscore), score_(score) {}
-  double modelscore_, score_;
-  double GetModelScore() { return modelscore_; }
-  double GetScore() { return score_; }
-};
-
-
-typedef vector<ScorePair> Scores;
-
-
-/*
  * NgramCounts
  *
  */
diff --git a/dtrain/test/EXAMPLE/cdec.ini b/dtrain/test/EXAMPLE/cdec.ini
index b6e92b5f..e57138b0 100644
--- a/dtrain/test/EXAMPLE/cdec.ini
+++ b/dtrain/test/EXAMPLE/cdec.ini
@@ -2,5 +2,6 @@ formalism=scfg
 add_pass_through_rules=true
 feature_function=WordPenalty
 cubepruning_pop_limit=30
+feature_function=KLanguageModel data/nc-wmt11.en.srilm.gz
 scfg_max_span_limit=15
diff --git a/dtrain/test/EXAMPLE/dtrain.ini b/dtrain/test/EXAMPLE/dtrain.ini
index 1467b332..ffafd0b8 100644
--- a/dtrain/test/EXAMPLE/dtrain.ini
+++ b/dtrain/test/EXAMPLE/dtrain.ini
@@ -1,10 +1,10 @@
 decoder_config=test/EXAMPLE/cdec.ini
 kbest=100
 ngrams=3
-epochs=22
+epochs=8
 input=test/EXAMPLE/dtrain.nc-1k
 scorer=approx_bleu
 output=test/EXAMPLE/weights.gz
-stop_after=5
-wprint=Glue WordPenalty LanguageModel LanguageModel_OOV PhraseModel_0 PhraseModel_1 PhraseModel_2 PhraseModel_3 PhraseModel_4
+stop_after=1000
+wprint=Glue WordPenalty LanguageModel LanguageModel_OOV PhraseModel_0 PhraseModel_1 PhraseModel_2 PhraseModel_3 PhraseModel_4 PassThrough
diff --git a/dtrain/test/log_reg/bin_class.cc b/dtrain/test/log_reg/bin_class.cc
new file mode 100644
index 00000000..19bcde25
--- /dev/null
+++ b/dtrain/test/log_reg/bin_class.cc
@@ -0,0 +1,4 @@
+#include "bin_class.h"
+
+Objective::~Objective() {}
+
diff --git a/dtrain/test/log_reg/bin_class.h b/dtrain/test/log_reg/bin_class.h
new file mode 100644
index 00000000..3466109a
--- /dev/null
+++ b/dtrain/test/log_reg/bin_class.h
@@ -0,0 +1,22 @@
+#ifndef _BIN_CLASS_H_
+#define _BIN_CLASS_H_
+
+#include <vector>
+#include "sparse_vector.h"
+
+struct TrainingInstance {
+  // TODO add other info? loss for MIRA-type updates?
+  SparseVector<double> x_feature_map;
+  bool y;
+};
+
+struct Objective {
+  virtual ~Objective();
+
+  // returns f(x) and f'(x)
+  virtual double ObjectiveAndGradient(const SparseVector<double>& x,
+                                      const std::vector<TrainingInstance>& training_instances,
+                                      SparseVector<double>* g) const = 0;
+};
+
+#endif
diff --git a/dtrain/test/log_reg/log_reg.cc b/dtrain/test/log_reg/log_reg.cc
new file mode 100644
index 00000000..ec2331fe
--- /dev/null
+++ b/dtrain/test/log_reg/log_reg.cc
@@ -0,0 +1,39 @@
+#include "log_reg.h"
+
+#include <vector>
+#include <cmath>
+
+#include "sparse_vector.h"
+
+using namespace std;
+
+double LogisticRegression::ObjectiveAndGradient(const SparseVector<double>& x,
+                                                const vector<TrainingInstance>& training_instances,
+                                                SparseVector<double>* g) const {
+  double cll = 0;
+  for (int i = 0; i < training_instances.size(); ++i) {
+    const double dotprod = training_instances[i].x_feature_map.dot(x); // TODO no bias, if bias, add x[0]
+    double lp_false = dotprod;
+    double lp_true = -dotprod;
+    if (0 < lp_true) {
+      lp_true += log1p(exp(-lp_true));
+      lp_false = log1p(exp(lp_false));
+    } else {
+      lp_true = log1p(exp(lp_true));
+      lp_false += log1p(exp(-lp_false));
+    }
+    lp_true *= -1;
+    lp_false *= -1;
+    if (training_instances[i].y) {  // true label
+      cll -= lp_true;
+      (*g) -= training_instances[i].x_feature_map * exp(lp_false);
+      // (*g)[0] -= exp(lp_false); // bias
+    } else {  // false label
+      cll -= lp_false;
+      (*g) += training_instances[i].x_feature_map * exp(lp_true);
+      // g += corpus[i].second * exp(lp_true);
+    }
+  }
+  return cll;
+}
+
diff --git a/dtrain/test/log_reg/log_reg.h b/dtrain/test/log_reg/log_reg.h
new file mode 100644
index 00000000..ecc560b8
--- /dev/null
+++ b/dtrain/test/log_reg/log_reg.h
@@ -0,0 +1,14 @@
+#ifndef _LOG_REG_H_
+#define _LOG_REG_H_
+
+#include <vector>
+#include "sparse_vector.h"
+#include "bin_class.h"
+
+struct LogisticRegression : public Objective {
+  double ObjectiveAndGradient(const SparseVector<double>& x,
+                              const std::vector<TrainingInstance>& training_instances,
+                              SparseVector<double>* g) const;
+};
+
+#endif
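Note: the branches in log_reg.cc are the standard numerically stable evaluation of the log-sigmoid. Naively, log(sigmoid(z)) = -log(1 + exp(-z)) overflows exp() for very negative z, so the code only ever exponentiates a non-positive argument. A standalone sketch of the same identity (the helper name is ours):

    #include <cmath>

    // Stable log(sigmoid(z)) = -log(1 + exp(-z)).
    // For z < 0, rewrite it as z - log(1 + exp(z)), so exp() sees only
    // non-positive arguments and never overflows.
    double log_sigmoid(double z) {
      if (z >= 0) return -log1p(exp(-z));
      return z - log1p(exp(z));
    }

    // log_reg.cc effectively computes lp_true = log_sigmoid(dotprod) and
    // lp_false = log_sigmoid(-dotprod) this way; the gradient weights
    // exp(lp_false) and exp(lp_true) then stay safely inside [0,1].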
diff --git a/dtrain/test/nc-wmt11/dtrain.ini b/dtrain/test/nc-wmt11/dtrain.ini
index 51033f2d..ddbf5da7 100644
--- a/dtrain/test/nc-wmt11/dtrain.ini
+++ b/dtrain/test/nc-wmt11/dtrain.ini
@@ -2,7 +2,7 @@ decoder_config=test/nc-wmt11/cdec.ini
 kbest=100
 ngrams=3
 epochs=8
-input=data/nc-wmt11.loo.localf.p0.500.rule-id #nc-wmt11-de-en-dyer-cs-joshua.tok.lc.fixamp1.loo.psg.dtrain.localf.p0
+input=data/nc-wmt11.loo.localf.p0.500.rule-id
 scorer=approx_bleu
 output=data/w/nc-wmt11.loo.p0.weights.gz
 #stop_after=100
diff --git a/dtrain/test/toy.dtrain.ini b/dtrain/test/toy.dtrain.ini
index cacb3a2c..35f76281 100644
--- a/dtrain/test/toy.dtrain.ini
+++ b/dtrain/test/toy.dtrain.ini
@@ -2,8 +2,9 @@ decoder_config=test/cdec.ini
 kbest=4
 ngrams=1
 epochs=3
-input=data/in.toy
+input=test/toy.in
 scorer=bleu
 output=toy.gz
 #stop_after=1000
+wprint=logp use_shell use_house PassThrough
diff --git a/dtrain/test/toy.in b/dtrain/test/toy.in
index 63f97158..989a1f77 100644
--- a/dtrain/test/toy.in
+++ b/dtrain/test/toy.in
@@ -1,2 +1,2 @@
-0 ich sah ein kleines haus i saw a little house [S] ||| [NP,1] [VP,2] ||| [1] [2] ||| logp=0 __NEXT_RULE__ [NP] ||| ich ||| i ||| logp=0 __NEXT_RULE__ [NP] ||| ein [NN,1] ||| a [1] ||| logp=0 __NEXT_RULE__ [NN] ||| [JJ,1] haus ||| [1] house ||| logp=0 use_house=1 __NEXT_RULE__ [NN] ||| [JJ,1] haus ||| [1] shell ||| logp=0 use_shell=1 __NEXT_RULE__ [JJ] ||| kleines ||| small ||| logp=0 __NEXT_RULE__ [JJ] ||| kleines ||| little ||| logp=0 __NEXT_RULE__ [JJ] ||| grosses ||| big ||| logp=0 __NEXT_RULE__ [JJ] ||| grosses ||| large ||| logp=0 __NEXT_RULE__ [VP] ||| [V,1] [NP,2] ||| [1] [2] ||| logp=0 __NEXT_RULE__ [V] ||| sah ||| saw ||| logp=0 __NEXT_RULE__ [V] ||| fand ||| found ||| logp=0
-1 ich fand ein grosses haus i found a large house [S] ||| [NP,1] [VP,2] ||| [1] [2] ||| logp=0 __NEXT_RULE__ [NP] ||| ich ||| i ||| logp=0 __NEXT_RULE__ [NP] ||| ein [NN,1] ||| a [1] ||| logp=0 __NEXT_RULE__ [NN] ||| [JJ,1] haus ||| [1] house ||| logp=0 use_house=1 __NEXT_RULE__ [NN] ||| [JJ,1] haus ||| [1] shell ||| logp=0 use_shell=1 __NEXT_RULE__ [JJ] ||| kleines ||| small ||| logp=0 __NEXT_RULE__ [JJ] ||| kleines ||| little ||| logp=0 __NEXT_RULE__ [JJ] ||| grosses ||| big ||| logp=0 __NEXT_RULE__ [JJ] ||| grosses ||| large ||| logp=0 __NEXT_RULE__ [VP] ||| [V,1] [NP,2] ||| [1] [2] ||| logp=0 __NEXT_RULE__ [V] ||| sah ||| saw ||| logp=0 __NEXT_RULE__ [V] ||| fand ||| found ||| logp=0
+0 ich sah ein kleines haus i saw a little house [S] ||| [NP,1] [VP,2] ||| [1] [2] ||| logp=0 __NEXT__RULE__ [NP] ||| ich ||| i ||| logp=0 __NEXT__RULE__ [NP] ||| ein [NN,1] ||| a [1] ||| logp=0 __NEXT__RULE__ [NN] ||| [JJ,1] haus ||| [1] house ||| logp=0 use_house=1 __NEXT__RULE__ [NN] ||| [JJ,1] haus ||| [1] shell ||| logp=0 use_shell=1 __NEXT__RULE__ [JJ] ||| kleines ||| small ||| logp=0 __NEXT__RULE__ [JJ] ||| kleines ||| little ||| logp=0 __NEXT__RULE__ [JJ] ||| grosses ||| big ||| logp=0 __NEXT__RULE__ [JJ] ||| grosses ||| large ||| logp=0 __NEXT__RULE__ [VP] ||| [V,1] [NP,2] ||| [1] [2] ||| logp=0 __NEXT__RULE__ [V] ||| sah ||| saw ||| logp=0 __NEXT__RULE__ [V] ||| fand ||| found ||| logp=0
+1 ich fand ein grosses haus i found a large house [S] ||| [NP,1] [VP,2] ||| [1] [2] ||| logp=0 __NEXT__RULE__ [NP] ||| ich ||| i ||| logp=0 __NEXT__RULE__ [NP] ||| ein [NN,1] ||| a [1] ||| logp=0 __NEXT__RULE__ [NN] ||| [JJ,1] haus ||| [1] house ||| logp=0 use_house=1 __NEXT__RULE__ [NN] ||| [JJ,1] haus ||| [1] shell ||| logp=0 use_shell=1 __NEXT__RULE__ [JJ] ||| kleines ||| small ||| logp=0 __NEXT__RULE__ [JJ] ||| kleines ||| little ||| logp=0 __NEXT__RULE__ [JJ] ||| grosses ||| big ||| logp=0 __NEXT__RULE__ [JJ] ||| grosses ||| large ||| logp=0 __NEXT__RULE__ [VP] ||| [V,1] [NP,2] ||| [1] [2] ||| logp=0 __NEXT__RULE__ [V] ||| sah ||| saw ||| logp=0 __NEXT__RULE__ [V] ||| fand ||| found ||| logp=0
diff --git a/dtrain/test/toy_cdec/cdec.ini b/dtrain/test/toy_cdec/cdec.ini
new file mode 100644
index 00000000..3a6bab68
--- /dev/null
+++ b/dtrain/test/toy_cdec/cdec.ini
@@ -0,0 +1,3 @@
+formalism=scfg
+grammar=../dtrain/test/toy_cdec/grammar
+add_pass_through_rules=true
diff --git a/dtrain/test/toy_cdec/grammar b/dtrain/test/toy_cdec/grammar
new file mode 100644
index 00000000..aeed75ef
--- /dev/null
+++ b/dtrain/test/toy_cdec/grammar
@@ -0,0 +1,12 @@
+[S] ||| [NP,1] [VP,2] ||| [1] [2] ||| logp=0
+[NP] ||| ich ||| i ||| logp=0
+[NP] ||| ein [NN,1] ||| a [1] ||| logp=0
+[NN] ||| [JJ,1] haus ||| [1] house ||| logp=0 use_house=1
+[NN] ||| [JJ,1] haus ||| [1] shell ||| logp=0 use_shell=1
+[JJ] ||| kleines ||| small ||| logp=0
+[JJ] ||| kleines ||| little ||| logp=0
+[JJ] ||| grosses ||| big ||| logp=0
+[JJ] ||| grosses ||| large ||| logp=0
+[VP] ||| [V,1] [NP,2] ||| [1] [2] ||| logp=0
+[V] ||| sah ||| saw ||| logp=0
+[V] ||| fand ||| found ||| logp=0
diff --git a/dtrain/test/toy_cdec/in b/dtrain/test/toy_cdec/in
new file mode 100644
index 00000000..e6df9275
--- /dev/null
+++ b/dtrain/test/toy_cdec/in
@@ -0,0 +1 @@
+ich sah ein kleines haus
diff --git a/dtrain/test/toy_cdec/weights b/dtrain/test/toy_cdec/weights
new file mode 100644
index 00000000..10d7ed83
--- /dev/null
+++ b/dtrain/test/toy_cdec/weights
@@ -0,0 +1,2 @@
+logp 1
+use_shell 1
diff --git a/utils/fast_sparse_vector.h b/utils/fast_sparse_vector.h
index 4aae2039..1301581a 100644
--- a/utils/fast_sparse_vector.h
+++ b/utils/fast_sparse_vector.h
@@ -7,6 +7,8 @@
 // important: indexes are integers
 // important: iterators may return elements in any order
 
+#include "config.h"
+
 #include <cmath>
 #include <cstring>
 #include <climits>
@@ -16,6 +18,12 @@
 
 #include <boost/static_assert.hpp>
 
+#if HAVE_BOOST_ARCHIVE_TEXT_OARCHIVE_HPP
+#include <boost/serialization/map.hpp>
+#endif
+
+#include "fdict.h"
+
 // this is architecture dependent, it should be
 // detected in some way but it's probably easiest (for me)
 // to just set it
@@ -235,6 +243,13 @@ class FastSparseVector {
     }
     return *this;
   }
+  FastSparseVector<T> erase_zeros(const T& EPSILON = 1e-4) const {
+    FastSparseVector<T> o;
+    for (const_iterator it = begin(); it != end(); ++it) {
+      if (fabs(it->second) > EPSILON) o.set_value(it->first, it->second);
+    }
+    return o;
+  }
   const_iterator begin() const {
     return const_iterator(*this, false);
   }
@@ -327,8 +342,45 @@ class FastSparseVector {
   } data_;
   unsigned char local_size_;
   bool is_remote_;
+
+#if HAVE_BOOST_ARCHIVE_TEXT_OARCHIVE_HPP
+ private:
+  friend class boost::serialization::access;
+  template<class Archive>
+  void save(Archive & ar, const unsigned int version) const {
+    (void) version;
+    int eff_size = size();
+    const_iterator it = this->begin();
+    if (eff_size > 0) {
+      // 0 index is reserved as empty
+      if (it->first == 0) { ++it; --eff_size; }
+    }
+    ar & eff_size;
+    while (it != this->end()) {
+      const std::pair<const std::string&, const T&> wire_pair(FD::Convert(it->first), it->second);
+      ar & wire_pair;
+      ++it;
+    }
+  }
+
+  template<class Archive>
+  void load(Archive & ar, const unsigned int version) {
+    (void) version;
+    this->clear();
+    int sz; ar & sz;
+    for (int i = 0; i < sz; ++i) {
+      std::pair<std::string, T> wire_pair;
+      ar & wire_pair;
+      this->set_value(FD::Convert(wire_pair.first), wire_pair.second);
+    }
+  }
+  BOOST_SERIALIZATION_SPLIT_MEMBER()
+#endif
 };
 
+#if HAVE_BOOST_ARCHIVE_TEXT_OARCHIVE_HPP
+BOOST_CLASS_TRACKING(FastSparseVector<double>,track_never)
+#endif
+
 template <typename T>
 const FastSparseVector<T> operator+(const FastSparseVector<T>& x, const FastSparseVector<T>& y) {
   if (x.size() > y.size()) {
@@ -344,15 +396,9 @@ const FastSparseVector<T> operator+(const FastSparseVector<T>& x, const FastSpar
 }
 
 template <typename T>
 const FastSparseVector<T> operator-(const FastSparseVector<T>& x, const FastSparseVector<T>& y) {
-  if (x.size() > y.size()) {
-    FastSparseVector<T> res(x);
-    res -= y;
-    return res;
-  } else {
-    FastSparseVector<T> res(y);
-    res -= x;
-    return res;
-  }
+  FastSparseVector<T> res(x);
+  res -= y;
+  return res;
 }
 
 template <class T>
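Note: the new save/load pair serializes features by name (via FD::Convert) rather than by integer id, so an archive written by one process can be read by another whose feature dictionary assigns different ids -- which is what distributed (hadoop) training needs. A hedged round-trip sketch using boost text archives (only compiles when HAVE_BOOST_ARCHIVE_TEXT_OARCHIVE_HPP is set):

    // Sketch: round-trip a FastSparseVector<double> through a text archive.
    #include <sstream>
    #include <boost/archive/text_oarchive.hpp>
    #include <boost/archive/text_iarchive.hpp>
    #include "fast_sparse_vector.h"
    #include "fdict.h"

    int main() {
      FastSparseVector<double> v;
      v.set_value(FD::Convert("LanguageModel"), 0.5);
      v.set_value(FD::Convert("WordPenalty"), -1.25);

      std::stringstream ss;
      { boost::archive::text_oarchive oa(ss); oa << v; }  // calls save(): id -> name

      FastSparseVector<double> w;
      { boost::archive::text_iarchive ia(ss); ia >> w; }  // calls load(): name -> id
      // w now holds the same name/value pairs under the reader's own ids.
      return 0;
    }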