diff options
author | Patrick Simianer <p@simianer.de> | 2011-09-27 02:55:49 +0200 |
---|---|---|
committer | Patrick Simianer <p@simianer.de> | 2011-09-27 02:55:49 +0200 |
commit | 78fb5d2761551f4a1a4f4e8c19be88dc0348f3d9 (patch) | |
tree | bb779d3b2280c158583faae8d7f5146422052ebe | |
parent | c9d20f422029ed61dec78ac0fd557aef61dbd973 (diff) |
added basic multipartite pair sampling
-rw-r--r-- | dtrain/dtrain.cc | 2 | ||||
-rw-r--r-- | dtrain/pairsampling.h | 36 | ||||
-rw-r--r-- | dtrain/test/example/dtrain.ini | 14 | ||||
-rw-r--r-- | dtrain/test/example/weights.gz | bin | 395 -> 0 bytes |
4 files changed, 42 insertions, 10 deletions
diff --git a/dtrain/dtrain.cc b/dtrain/dtrain.cc
index 37430fb9..9b1bbe68 100644
--- a/dtrain/dtrain.cc
+++ b/dtrain/dtrain.cc
@@ -297,6 +297,8 @@ main(int argc, char** argv)
         sample_all_pairs(samples, pairs);
       if (pair_sampling == "rand")
         sample_rand_pairs(samples, pairs, &rng);
+      if (pair_sampling == "108010")
+        sample108010(samples, pairs);
 
       for (vector<pair<ScoredHyp,ScoredHyp> >::iterator ti = pairs.begin();
             ti != pairs.end(); ti++) {
diff --git a/dtrain/pairsampling.h b/dtrain/pairsampling.h
index 6db0c045..131e90ca 100644
--- a/dtrain/pairsampling.h
+++ b/dtrain/pairsampling.h
@@ -1,13 +1,12 @@
 #ifndef _DTRAIN_PAIRSAMPLING_H_
 #define _DTRAIN_PAIRSAMPLING_H_
 
-
 namespace dtrain
 {
 
 
 inline void
-sample_all_pairs(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> > &training)
+sample_all_pairs(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training)
 {
   for (unsigned i = 0; i < s->size()-1; i++) {
     for (unsigned j = i+1; j < s->size(); j++) {
@@ -20,7 +19,7 @@ sample_all_pairs(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> > &train
 }
 
 inline void
-sample_rand_pairs(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> > &training,
+sample_rand_pairs(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training,
                   MT19937* prng)
 {
   for (unsigned i = 0; i < s->size()-1; i++) {
@@ -35,6 +34,37 @@ sample_rand_pairs(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> > &trai
   }
 }
 
+bool
+sort_samples_by_score(ScoredHyp a, ScoredHyp b)
+{
+  return a.score < b.score;
+}
+
+inline void
+sample108010(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training)
+{
+  sort(s->begin(), s->end(), sort_samples_by_score);
+  pair<ScoredHyp,ScoredHyp> p;
+  unsigned sz = s->size();
+  unsigned slice = 10;
+  unsigned sep = sz%slice;
+  if (sep == 0) sep = sz/slice;
+  for (unsigned i = 0; i < sep; i++) {
+    for(unsigned j = sep; j < sz; j++) {
+      p.first = (*s)[i];
+      p.second = (*s)[j];
+      if(p.first.rank < p.second.rank) training.push_back(p);
+    }
+  }
+  for (unsigned i = sep; i < sz-sep; i++) {
+    for (unsigned j = sz-sep; j < sz; j++) {
+      p.first = (*s)[i];
+      p.second = (*s)[j];
+      if(p.first.rank < p.second.rank) training.push_back(p);
+    }
+  }
+}
+
 
 } // namespace
 
diff --git a/dtrain/test/example/dtrain.ini b/dtrain/test/example/dtrain.ini
index b1b9b7bd..068074c4 100644
--- a/dtrain/test/example/dtrain.ini
+++ b/dtrain/test/example/dtrain.ini
@@ -1,12 +1,12 @@
 decoder_config=test/example/cdec.ini
 k=100
-N=4
-epochs=2
+N=3
+epochs=3
 input=test/example/nc-1k.gz
 scorer=stupid_bleu
-output=- #test/example/weights.gz
-stop_after=2
-sample_from=forest
-pair_sampling=rand
+output=/tmp/weights.gz
+stop_after=1000
+sample_from=kbest
+pair_sampling=all
 print_weights=Glue WordPenalty LanguageModel LanguageModel_OOV PhraseModel_0 PhraseModel_1 PhraseModel_2 PhraseModel_3 PhraseModel_4 PassThrough
-tmp=./
+tmp=/tmp
diff --git a/dtrain/test/example/weights.gz b/dtrain/test/example/weights.gz
Binary files differ
deleted file mode 100644
index 7960a05a..00000000
--- a/dtrain/test/example/weights.gz
+++ /dev/null