From a016c260bba1a7411af21264079ce670fb0ad3a6 Mon Sep 17 00:00:00 2001
From: Patrick Simianer
Date: Tue, 27 Sep 2011 02:55:49 +0200
Subject: added basic multipartite pair sampling

---
 dtrain/dtrain.cc               |   2 ++
 dtrain/pairsampling.h          |  50 +++++++++++++++++++++++++++++++++++++++---
 dtrain/test/example/dtrain.ini |  14 +++++++-------
 dtrain/test/example/weights.gz | Bin 395 -> 0 bytes
 4 files changed, 56 insertions(+), 10 deletions(-)
 delete mode 100644 dtrain/test/example/weights.gz

(limited to 'dtrain')

diff --git a/dtrain/dtrain.cc b/dtrain/dtrain.cc
index 37430fb9..9b1bbe68 100644
--- a/dtrain/dtrain.cc
+++ b/dtrain/dtrain.cc
@@ -297,6 +297,8 @@ main(int argc, char** argv)
         sample_all_pairs(samples, pairs);
       if (pair_sampling == "rand")
         sample_rand_pairs(samples, pairs, &rng);
+      if (pair_sampling == "108010")
+        sample108010(samples, pairs);
 
       for (vector<pair<ScoredHyp,ScoredHyp> >::iterator ti = pairs.begin();
            ti != pairs.end(); ti++) {
diff --git a/dtrain/pairsampling.h b/dtrain/pairsampling.h
index 6db0c045..131e90ca 100644
--- a/dtrain/pairsampling.h
+++ b/dtrain/pairsampling.h
@@ -1,13 +1,12 @@
 #ifndef _DTRAIN_PAIRSAMPLING_H_
 #define _DTRAIN_PAIRSAMPLING_H_
 
-
 namespace dtrain
 {
 
 
 inline void
-sample_all_pairs(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> > &training)
+sample_all_pairs(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training)
 {
   for (unsigned i = 0; i < s->size()-1; i++) {
     for (unsigned j = i+1; j < s->size(); j++) {
@@ -20,7 +19,7 @@ sample_all_pairs(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> > &train
 }
 
 inline void
-sample_rand_pairs(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> > &training,
+sample_rand_pairs(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training,
                   MT19937* prng)
 {
   for (unsigned i = 0; i < s->size()-1; i++) {
@@ -35,6 +34,51 @@ sample_rand_pairs(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> > &trai
   }
 }
 
+/*
+ * Comparator for sort(): orders hypotheses by ascending score.
+ * Marked inline, like the other functions in this header, so that including
+ * the header from several translation units does not define it more than once.
+ */
+inline bool
+sort_samples_by_score(const ScoredHyp& a, const ScoredHyp& b)
+{
+  return a.score < b.score;
+}
+
+/*
+ * Multipartite 10/80/10 pair sampling.
+ * Sorts *s in place by ascending score and splits it into a leading part of
+ * sz/10 entries, a middle part, and a trailing part of sz/10 entries. Each
+ * leading entry is paired with every later entry and each middle entry with
+ * every trailing entry; a pair is appended to training only if
+ * first.rank < second.rank. Fewer than 10 entries yield no pairs.
+ */
+inline void
+sample108010(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training)
+{
+  sort(s->begin(), s->end(), sort_samples_by_score);
+  pair<ScoredHyp,ScoredHyp> p;
+  unsigned sz = s->size();
+  unsigned slice = 10;
+  // one outer part is a tenth of the list: sz/slice, not the remainder sz%slice
+  unsigned sep = sz/slice;
+  if (sep == 0) return;
+  for (unsigned i = 0; i < sep; i++) {
+    for(unsigned j = sep; j < sz; j++) {
+      p.first = (*s)[i];
+      p.second = (*s)[j];
+      if(p.first.rank < p.second.rank) training.push_back(p);
+    }
+  }
+  for (unsigned i = sep; i < sz-sep; i++) {
+    for (unsigned j = sz-sep; j < sz; j++) {
+      p.first = (*s)[i];
+      p.second = (*s)[j];
+      if(p.first.rank < p.second.rank) training.push_back(p);
+    }
+  }
+}
+
 
 } // namespace
 
diff --git a/dtrain/test/example/dtrain.ini b/dtrain/test/example/dtrain.ini
index b1b9b7bd..068074c4 100644
--- a/dtrain/test/example/dtrain.ini
+++ b/dtrain/test/example/dtrain.ini
@@ -1,12 +1,12 @@
 decoder_config=test/example/cdec.ini
 k=100
-N=4
-epochs=2
+N=3
+epochs=3
 input=test/example/nc-1k.gz
 scorer=stupid_bleu
-output=- #test/example/weights.gz
-stop_after=2
-sample_from=forest
-pair_sampling=rand
+output=/tmp/weights.gz
+stop_after=1000
+sample_from=kbest
+pair_sampling=all
 print_weights=Glue WordPenalty LanguageModel LanguageModel_OOV PhraseModel_0 PhraseModel_1 PhraseModel_2 PhraseModel_3 PhraseModel_4 PassThrough
-tmp=./
+tmp=/tmp
diff --git a/dtrain/test/example/weights.gz b/dtrain/test/example/weights.gz
deleted file mode 100644
index 7960a05a..00000000
Binary files a/dtrain/test/example/weights.gz and /dev/null differ
--
cgit v1.2.3