summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPatrick Simianer <simianer@cl.uni-heidelberg.de>2012-04-26 21:39:11 +0200
committerPatrick Simianer <simianer@cl.uni-heidelberg.de>2012-04-26 21:39:11 +0200
commit01110e92e7429df7882879e026b28aa9c89c724d (patch)
treef5e03f63c8ae907696582aaa66953cc5cd911610
parent28806638345e60bd442bf5fa8e7471f9115b0296 (diff)
made pair sampling configurable
-rw-r--r--dtrain/dtrain.cc76
-rw-r--r--dtrain/dtrain.h8
-rw-r--r--dtrain/pairsampling.h17
-rw-r--r--dtrain/score.cc2
-rw-r--r--dtrain/test/example/README6
-rw-r--r--dtrain/test/example/dtrain.ini9
6 files changed, 68 insertions, 50 deletions
diff --git a/dtrain/dtrain.cc b/dtrain/dtrain.cc
index cf913765..ea5b8835 100644
--- a/dtrain/dtrain.cc
+++ b/dtrain/dtrain.cc
@@ -6,35 +6,37 @@ dtrain_init(int argc, char** argv, po::variables_map* cfg)
{
po::options_description ini("Configuration File Options");
ini.add_options()
- ("input", po::value<string>()->default_value("-"), "input file")
- ("output", po::value<string>()->default_value("-"), "output weights file, '-' for STDOUT")
- ("input_weights", po::value<string>(), "input weights file (e.g. from previous iteration)")
- ("decoder_config", po::value<string>(), "configuration file for cdec")
- ("print_weights", po::value<string>(), "weights to print on each iteration")
- ("stop_after", po::value<unsigned>()->default_value(0), "stop after X input sentences")
- ("tmp", po::value<string>()->default_value("/tmp"), "temp dir to use")
- ("keep", po::value<bool>()->zero_tokens(), "keep weights files for each iteration")
- ("hstreaming", po::value<string>(), "run in hadoop streaming mode, arg is a task id")
- ("epochs", po::value<unsigned>()->default_value(10), "# of iterations T (per shard)")
- ("k", po::value<unsigned>()->default_value(100), "how many translations to sample")
- ("sample_from", po::value<string>()->default_value("kbest"), "where to sample translations from: 'kbest', 'forest'")
- ("filter", po::value<string>()->default_value("uniq"), "filter kbest list: 'not', 'uniq'")
- ("pair_sampling", po::value<string>()->default_value("108010"), "how to sample pairs: 'all', '108010' or 'PRO'")
- ("pair_threshold", po::value<score_t>()->default_value(0), "bleu [0,1] threshold to filter pairs")
- ("N", po::value<unsigned>()->default_value(4), "N for Ngrams (BLEU)")
- ("scorer", po::value<string>()->default_value("stupid_bleu"), "scoring: bleu, stupid_, smooth_, approx_")
- ("learning_rate", po::value<weight_t>()->default_value(0.0001), "learning rate")
- ("gamma", po::value<weight_t>()->default_value(0), "gamma for SVM (0 for perceptron)")
- ("select_weights", po::value<string>()->default_value("last"), "output best, last, avg weights ('VOID' to throw away)")
- ("rescale", po::value<bool>()->zero_tokens(), "rescale weight vector after each input")
- ("l1_reg", po::value<string>()->default_value("none"), "apply l1 regularization as in 'Tsuroka et al' (2010)")
- ("l1_reg_strength", po::value<weight_t>(), "l1 regularization strength")
- ("inc_correct", po::value<bool>()->zero_tokens(), "include correctly ranked pairs into updates")
- ("fselect", po::value<weight_t>()->default_value(-1), "TODO select top x percent of features after each epoch")
+ ("input", po::value<string>()->default_value("-"), "input file")
+ ("output", po::value<string>()->default_value("-"), "output weights file, '-' for STDOUT")
+ ("input_weights", po::value<string>(), "input weights file (e.g. from previous iteration)")
+ ("decoder_config", po::value<string>(), "configuration file for cdec")
+ ("print_weights", po::value<string>(), "weights to print on each iteration")
+ ("stop_after", po::value<unsigned>()->default_value(0), "stop after X input sentences")
+ ("tmp", po::value<string>()->default_value("/tmp"), "temp dir to use")
+ ("keep", po::value<bool>()->zero_tokens(), "keep weights files for each iteration")
+ ("hstreaming", po::value<string>(), "run in hadoop streaming mode, arg is a task id")
+ ("epochs", po::value<unsigned>()->default_value(10), "# of iterations T (per shard)")
+ ("k", po::value<unsigned>()->default_value(100), "how many translations to sample")
+ ("sample_from", po::value<string>()->default_value("kbest"), "where to sample translations from: 'kbest', 'forest'")
+ ("filter", po::value<string>()->default_value("uniq"), "filter kbest list: 'not', 'uniq'")
+ ("pair_sampling", po::value<string>()->default_value("XYX"), "how to sample pairs: 'all', 'XYX' or 'PRO'")
+ ("hi_lo", po::value<float>()->default_value(0.1), "hi and lo (X) for XYX (default 0.1), <= 0.5")
+ ("pair_threshold", po::value<score_t>()->default_value(0), "bleu [0,1] threshold to filter pairs")
+ ("N", po::value<unsigned>()->default_value(4), "N for Ngrams (BLEU)")
+ ("scorer", po::value<string>()->default_value("stupid_bleu"), "scoring: bleu, stupid_, smooth_, approx_")
+ ("learning_rate", po::value<weight_t>()->default_value(0.0001), "learning rate")
+ ("gamma", po::value<weight_t>()->default_value(0), "gamma for SVM (0 for perceptron)")
+ ("select_weights", po::value<string>()->default_value("last"), "output best, last, avg weights ('VOID' to throw away)")
+ ("rescale", po::value<bool>()->zero_tokens(), "rescale weight vector after each input")
+ ("l1_reg", po::value<string>()->default_value("none"), "apply l1 regularization as in 'Tsuroka et al' (2010)")
+ ("l1_reg_strength", po::value<weight_t>(), "l1 regularization strength")
+ ("inc_correct", po::value<bool>()->zero_tokens(), "include correctly ranked pairs into updates")
+ ("fselect", po::value<weight_t>()->default_value(-1), "TODO select top x percent of features after each epoch")
+ ("approx_bleu_scale", po::value<score_t>()->default_value(0.9), "scaling for approx. BLEU")
#ifdef DTRAIN_LOCAL
- ("refs,r", po::value<string>(), "references in local mode")
+ ("refs,r", po::value<string>(), "references in local mode")
#endif
- ("noup", po::value<bool>()->zero_tokens(), "do not update weights");
+ ("noup", po::value<bool>()->zero_tokens(), "do not update weights");
po::options_description cl("Command Line Options");
cl.add_options()
("config,c", po::value<string>(), "dtrain config file")
@@ -71,11 +73,18 @@ dtrain_init(int argc, char** argv, po::variables_map* cfg)
cerr << "Wrong 'filter' param: '" << (*cfg)["filter"].as<string>() << "', use 'uniq' or 'not'." << endl;
return false;
}
- if ((*cfg)["pair_sampling"].as<string>() != "all" && (*cfg)["pair_sampling"].as<string>() != "108010" &&
+ if ((*cfg)["pair_sampling"].as<string>() != "all" && (*cfg)["pair_sampling"].as<string>() != "XYX" &&
(*cfg)["pair_sampling"].as<string>() != "PRO") {
cerr << "Wrong 'pair_sampling' param: '" << (*cfg)["pair_sampling"].as<string>() << "'." << endl;
return false;
}
+ if(cfg->count("hi_lo") && (*cfg)["pair_sampling"].as<string>() != "XYX") {
+ cerr << "Warning: hi_lo only works with pair_sampling XYX." << endl;
+ }
+ if((*cfg)["hi_lo"].as<float>() > 0.5 || (*cfg)["hi_lo"].as<float>() < 0.01) {
+ cerr << "hi_lo must lie in [0.01, 0.5]" << endl;
+ return false;
+ }
if ((*cfg)["pair_threshold"].as<score_t>() < 0) {
cerr << "The threshold must be >= 0!" << endl;
return false;
@@ -126,6 +135,7 @@ main(int argc, char** argv)
const string pair_sampling = cfg["pair_sampling"].as<string>();
const score_t pair_threshold = cfg["pair_threshold"].as<score_t>();
const string select_weights = cfg["select_weights"].as<string>();
+ const float hi_lo = cfg["hi_lo"].as<float>();
bool average = false;
if (select_weights == "avg")
average = true;
@@ -231,6 +241,8 @@ main(int argc, char** argv)
cerr << setw(25) << "learning rate " << eta << endl;
cerr << setw(25) << "gamma " << gamma << endl;
cerr << setw(25) << "pairs " << "'" << pair_sampling << "'" << endl;
+ if (pair_sampling == "XYX")
+ cerr << setw(25) << "hi lo " << "'" << hi_lo << "'" << endl;
cerr << setw(25) << "pair threshold " << pair_threshold << endl;
cerr << setw(25) << "select weights " << "'" << select_weights << "'" << endl;
if (cfg.count("l1_reg"))
@@ -400,10 +412,10 @@ main(int argc, char** argv)
vector<pair<ScoredHyp,ScoredHyp> > pairs;
if (pair_sampling == "all")
all_pairs(samples, pairs, pair_threshold);
- if (pair_sampling == "108010")
- part108010(samples, pairs, pair_threshold);
+ if (pair_sampling == "XYX")
+ partXYX(samples, pairs, pair_threshold, hi_lo);
if (pair_sampling == "PRO")
- PROsampling(samples, pairs);
+ PROsampling(samples, pairs, pair_threshold);
npairs += pairs.size();
pair_count += 2*pairs.size();
@@ -456,7 +468,7 @@ main(int argc, char** argv)
}
}
} else if (l1cumul) {
- weight_t acc_penalty = (ii+1) * l1_reg; // Note: ii is the index of the current input
+ weight_t acc_penalty = (ii+1) * l1_reg; // ii is the index of the current input
for (unsigned d = 0; d < lambdas.size(); d++) {
if (lambdas.nonzero(d)) {
weight_t v = lambdas.get(d);
diff --git a/dtrain/dtrain.h b/dtrain/dtrain.h
index ac13995a..7b03d258 100644
--- a/dtrain/dtrain.h
+++ b/dtrain/dtrain.h
@@ -13,7 +13,7 @@
#include "filelib.h"
-#define DTRAIN_LOCAL
+//#define DTRAIN_LOCAL
#define DTRAIN_DOTS 10 // after how many inputs to display a '.'
#define DTRAIN_GRAMMAR_DELIM "########EOS########"
@@ -23,13 +23,15 @@ using namespace std;
using namespace dtrain;
namespace po = boost::program_options;
-inline void register_and_convert(const vector<string>& strs, vector<WordID>& ids) {
+inline void register_and_convert(const vector<string>& strs, vector<WordID>& ids)
+{
vector<string>::const_iterator it;
for (it = strs.begin(); it < strs.end(); it++)
ids.push_back(TD::Convert(*it));
}
-inline string gettmpf(const string path, const string infix) {
+inline string gettmpf(const string path, const string infix)
+{
char fn[1024];
strcpy(fn, path.c_str());
strcat(fn, "/");
diff --git a/dtrain/pairsampling.h b/dtrain/pairsampling.h
index 93c0630a..66ca1706 100644
--- a/dtrain/pairsampling.h
+++ b/dtrain/pairsampling.h
@@ -13,7 +13,7 @@ accept_pair(score_t a, score_t b, score_t threshold)
}
inline void
-all_pairs(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training, score_t threshold)
+all_pairs(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training, score_t threshold, float _unused = 1)
{
for (unsigned i = 0; i < s->size()-1; i++) {
for (unsigned j = i+1; j < s->size(); j++) {
@@ -35,19 +35,16 @@ all_pairs(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training, sc
* cmp middle 80% to low 10%
*/
bool
-_108010_cmp_hyp_by_score(ScoredHyp a, ScoredHyp b)
+_XYX_cmp_hyp_by_score(ScoredHyp a, ScoredHyp b)
{
return a.score < b.score;
}
inline void
-part108010(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training, score_t threshold)
+partXYX(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training, score_t threshold, float hi_lo)
{
- sort(s->begin(), s->end(), _108010_cmp_hyp_by_score);
+ sort(s->begin(), s->end(), _XYX_cmp_hyp_by_score);
unsigned sz = s->size();
- unsigned slice = 10;
- unsigned sep = sz%slice;
- cout << "sep " << sep <<endl;
- if (sep == 0) sep = sz/slice;
+ unsigned sep = sz * hi_lo;
for (unsigned i = 0; i < sep; i++) {
for (unsigned j = sep; j < sz; j++) {
if ((*s)[i].rank < (*s)[j].rank) {
@@ -80,7 +77,7 @@ part108010(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training, s
* pair sampling as in
* 'Tuning as Ranking' (Hopkins & May, 2011)
* count = 5000
- * threshold = 5% BLEU
+ * threshold = 5% BLEU (0.05 for param 3)
* cut = top 50
*/
bool
@@ -90,7 +87,7 @@ _PRO_cmp_pair_by_diff(pair<ScoredHyp,ScoredHyp> a, pair<ScoredHyp,ScoredHyp> b)
return (fabs(a.first.score - a.second.score)) > (fabs(b.first.score - b.second.score));
}
inline void
-PROsampling(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training, score_t threshold=0.05)
+PROsampling(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training, score_t threshold, float _unused = 1)
{
unsigned max_count = 5000, count = 0;
bool b = false;
diff --git a/dtrain/score.cc b/dtrain/score.cc
index ec844437..d964b4da 100644
--- a/dtrain/score.cc
+++ b/dtrain/score.cc
@@ -129,7 +129,7 @@ ApproxBleuScorer::Score(vector<WordID>& hyp, vector<WordID>& ref,
ref_len = ref.size();
tmp = glob_onebest_counts + counts;
}
- return 0.9 * Bleu(tmp, hyp_len, ref_len);
+ return 0.9 * Bleu(tmp, hyp_len, ref_len); // TODO param
}
diff --git a/dtrain/test/example/README b/dtrain/test/example/README
new file mode 100644
index 00000000..e5a5de59
--- /dev/null
+++ b/dtrain/test/example/README
@@ -0,0 +1,6 @@
+Small example of input format for distributed training.
+Call dtrain from cdec/dtrain/ with ./dtrain -c test/example/dtrain.ini .
+
+For this to work, comment out '#define DTRAIN_LOCAL' in dtrain.h
+and recompile.
+
diff --git a/dtrain/test/example/dtrain.ini b/dtrain/test/example/dtrain.ini
index b59250f3..cd2c75e7 100644
--- a/dtrain/test/example/dtrain.ini
+++ b/dtrain/test/example/dtrain.ini
@@ -1,5 +1,5 @@
input=test/example/nc-wmt11.1k.gz # use '-' for STDIN
-output=weights.gz # a weights file (add .gz for gzip compression) or STDOUT '-'
+output=- # a weights file (add .gz for gzip compression) or STDOUT '-'
decoder_config=test/example/cdec.ini # config for cdec
# weights for these features will be printed on each iteration
print_weights=Glue WordPenalty LanguageModel LanguageModel_OOV PhraseModel_0 PhraseModel_1 PhraseModel_2 PhraseModel_3 PhraseModel_4 PhraseModel_5 PhraseModel_6 PassThrough
@@ -10,11 +10,12 @@ stop_after=100 # stop epoch after 100 inputs
epochs=3 # run over input 3 times
k=100 # use 100best lists
N=4 # optimize (approx) BLEU4
-scorer=stupid_bleu # use 'stupid' BLEU+1
+scorer=approx_bleu # use approx. BLEU
learning_rate=0.0001 # learning rate
gamma=0 # use SVM reg
sample_from=kbest # use kbest lists (as opposed to forest)
filter=uniq # only unique entries in kbest (surface form)
-pair_sampling=108010 # 10 vs 80 vs 10 and 80 vs 10
+pair_sampling=XYX
+hi_lo=0.1 # 10 vs 80 vs 10 and 80 vs 10
pair_threshold=0 # minimum distance in BLEU (this will still only use pairs with diff > 0)
-select_weights=last # just output last weights
+select_weights=VOID # don't output weights