From 29473017d0f0cdd6f383d253235e2f3388533d13 Mon Sep 17 00:00:00 2001
From: Patrick Simianer
Date: Tue, 12 Nov 2013 20:07:47 +0100
Subject: impl repeat param
---
training/dtrain/dtrain.cc | 78 ++++++++++++++++------------
training/dtrain/examples/standard/dtrain.ini | 6 ++-
2 files changed, 49 insertions(+), 35 deletions(-)
diff --git a/training/dtrain/dtrain.cc b/training/dtrain/dtrain.cc
index 23131810..441e2cd7 100644
--- a/training/dtrain/dtrain.cc
+++ b/training/dtrain/dtrain.cc
@@ -43,7 +43,7 @@ dtrain_init(int argc, char** argv, po::variables_map* cfg)
("max_pairs", po::value()->default_value(std::numeric_limits::max()), "max. # of pairs per Sent.")
("pclr", po::value()->default_value("no"), "use a (simple|adagrad) per-coordinate learning rate")
("batch", po::value()->zero_tokens(), "do batch optimization")
- //("repeat", po::value()->default_value(1), "repeat optimization over kbest list this number of times")
+ ("repeat", po::value()->default_value(1), "repeat optimization over kbest list this number of times")
//("test-k-best", po::value()->zero_tokens(), "check if optimization works (use repeat >= 2)")
("noup", po::value()->zero_tokens(), "do not update weights");
po::options_description cl("Command Line Options");
@@ -129,7 +129,7 @@ main(int argc, char** argv)
const float hi_lo = cfg["hi_lo"].as();
const score_t approx_bleu_d = cfg["approx_bleu_d"].as();
const unsigned max_pairs = cfg["max_pairs"].as();
- //int repeat = cfg["repeat"].as();
+ int repeat = cfg["repeat"].as();
//bool test_k_best = false;
//if (cfg.count("test-k-best")) test_k_best = true;
weight_t loss_margin = cfg["loss_margin"].as();
@@ -276,7 +276,7 @@ main(int argc, char** argv)
cerr << setw(25) << "rescale " << rescale << endl;
cerr << setw(25) << "pclr " << pclr << endl;
cerr << setw(25) << "max pairs " << max_pairs << endl;
- //cerr << setw(25) << "repeat " << repeat << endl;
+ cerr << setw(25) << "repeat " << repeat << endl;
//cerr << setw(25) << "test k-best " << test_k_best << endl;
cerr << setw(25) << "cdec cfg " << "'" << cfg["decoder_config"].as() << "'" << endl;
cerr << setw(25) << "input " << "'" << input_fn << "'" << endl;
@@ -294,23 +294,19 @@ main(int argc, char** argv)
SparseVector learning_rates;
// batch
SparseVector batch_updates;
- weight_t batch_loss;
-
- //int did_improve; // FIXME for test-k-best
+ score_t batch_loss;
for (unsigned t = 0; t < T; t++) // T epochs
{
-
+
time_t start, end;
time(&start);
score_t score_sum = 0.;
score_t model_sum(0);
- unsigned ii = 0, rank_errors = 0, margin_violations = 0, npairs = 0, f_count = 0, list_sz = 0;
+ unsigned ii = 0, rank_errors = 0, margin_violations = 0, npairs = 0, f_count = 0, list_sz = 0, kbest_loss_improve = 0;
batch_loss = 0.;
if (!quiet) cerr << "Iteration #" << t+1 << " of " << T << "." << endl;
- //did_improve = 0;
-
while(true)
{
@@ -395,8 +391,10 @@ main(int argc, char** argv)
}
}
- score_sum += (*samples)[0].score; // stats for 1best
- model_sum += (*samples)[0].model;
+ if (repeat == 1) {
+ score_sum += (*samples)[0].score; // stats for 1best
+ model_sum += (*samples)[0].model;
+ }
f_count += observer->get_f_count();
list_sz += observer->get_sz();
@@ -414,24 +412,22 @@ main(int argc, char** argv)
int cur_npairs = pairs.size();
npairs += cur_npairs;
- weight_t kbest_loss_first, kbest_loss_last = 0.0;
+ score_t kbest_loss_first, kbest_loss_last = 0.0;
-//for (int q=0; q < repeat; q++) { // repeat
+ for (int ki=0; ki < repeat; ki++) {
- weight_t kbest_loss = 0.0; // test-k-best
+ score_t kbest_loss = 0.0; // test-k-best
SparseVector lambdas_copy; // for l1 regularization
SparseVector sum_up; // for pclr
if (l1naive||l1clip||l1cumul) lambdas_copy = lambdas;
for (vector >::iterator it = pairs.begin();
it != pairs.end(); it++) {
-
- /*if (repeat > 1) {
- double x = max(0.0, -1.0 * (lambdas.dot(it->first.f) - lambdas.dot(it->second.f)));
- kbest_loss += x;
- }*/
-
score_t model_diff = it->first.model - it->second.model;
+ if (repeat > 1) {
+ model_diff = lambdas.dot(it->first.f) - lambdas.dot(it->second.f);
+ kbest_loss += max(0.0, -1.0 * model_diff);
+ }
bool rank_error = false;
score_t margin;
if (faster_perceptron) { // we only have considering misranked pairs
@@ -442,7 +438,7 @@ main(int argc, char** argv)
margin = fabs(model_diff);
if (!rank_error && margin < loss_margin) margin_violations++;
}
- if (rank_error) rank_errors++;
+ if (rank_error && ki==1) rank_errors++;
if (scale_bleu_diff) eta = it->first.score - it->second.score;
if (rank_error || margin < loss_margin) {
SparseVector diff_vec = it->first.f - it->second.f;
@@ -524,12 +520,27 @@ main(int argc, char** argv)
}
}
- //if (q==0) { kbest_loss_first = kbest_loss; }
- //if (q==repeat-1) { kbest_loss_last = kbest_loss; }
-//}//repeat
-//if((kbest_loss_first - kbest_loss_last) > 0) did_improve++;
+ if (ki==0) kbest_loss_first = kbest_loss;
+ if (ki==repeat-1) { // done
+ kbest_loss_last = kbest_loss;
+ score_t best_score = -1.;
+ score_t best_model = -std::numeric_limits::max();
+ unsigned best_idx;
+ for (unsigned i=0; i < samples->size(); i++) {
+ score_t s = lambdas.dot((*samples)[i].f);
+ if (s > best_model) {
+ best_idx = i;
+ best_model = s;
+ }
+ }
+ score_sum += (*samples)[best_idx].score;
+ model_sum += best_model;
+ }
+ } // repeat
- }
+ if ((kbest_loss_first - kbest_loss_last) >= 0) kbest_loss_improve++;
+
+ } // noup
if (rescale) lambdas /= lambdas.l2norm();
@@ -539,7 +550,6 @@ main(int argc, char** argv)
if (t == 0) in_sz = ii; // remember size of input (# lines)
- //if (repeat > 1) cout << "did improve? " << did_improve << " out of " << in_sz << endl;
if (batch) {
lambdas.plus_eq_v_times_s(batch_updates, eta);
@@ -577,14 +587,16 @@ main(int argc, char** argv)
cerr << _np << " 1best avg model score: " << model_avg;
cerr << _p << " (" << model_diff << ")" << endl;
cerr << " avg # pairs: ";
- cerr << _np << npairs/(float)in_sz;
+ cerr << _np << npairs/(float)in_sz << endl;
+ cerr << " avg # margin viol: ";
+ cerr << margin_violations/(float)in_sz << endl;
+ cerr << " avg # rank err: ";
+ cerr << rank_errors/(float)in_sz;
if (faster_perceptron) cerr << " (meaningless)";
cerr << endl;
- cerr << " avg # rank err: ";
- cerr << rank_errors/(float)in_sz << endl;
if (batch) cerr << " batch loss: " << batch_loss << endl;
- cerr << " avg # margin viol: ";
- cerr << margin_violations/(float)in_sz << endl;
+ if (repeat > 1) cerr << " k-best loss imp: " << ((float)kbest_loss_improve/in_sz)*100 << "%" << endl;
+
cerr << " non0 feature count: " << nonz << endl;
cerr << " avg list sz: " << list_sz/(float)in_sz << endl;
cerr << " avg f count: " << f_count/(float)list_sz << endl;
diff --git a/training/dtrain/examples/standard/dtrain.ini b/training/dtrain/examples/standard/dtrain.ini
index 4d096dfb..ef022469 100644
--- a/training/dtrain/examples/standard/dtrain.ini
+++ b/training/dtrain/examples/standard/dtrain.ini
@@ -11,11 +11,11 @@ print_weights=Glue WordPenalty LanguageModel LanguageModel_OOV PhraseModel_0 Phr
stop_after=10 # stop epoch after 10 inputs
# interesting stuff
-epochs=100 # run over input 3 times
+epochs=3 # run over input 3 times
k=100 # use 100best lists
N=4 # optimize (approx) BLEU4
scorer=fixed_stupid_bleu # use 'stupid' BLEU+1
-learning_rate=0.0001 # learning rate, don't care if gamma=0 (perceptron) and loss_margin=0 (not margin perceptron)
+learning_rate=0.0001 # learning rate, don't care if gamma=0 (perceptron) and loss_margin=0 (not margin perceptron)
gamma=0 # use SVM reg
sample_from=kbest # use kbest lists (as opposed to forest)
filter=uniq # only unique entries in kbest (surface form)
@@ -23,3 +23,5 @@ pair_sampling=XYX #
hi_lo=0.1 # 10 vs 80 vs 10 and 80 vs 10 here
pair_threshold=0 # minimum distance in BLEU (here: > 0)
loss_margin=0 # update if correctly ranked, but within this margin
+repeat=1 # repeat training on a kbest list 1 time
+#batch=true # batch tuning, update after accumulating over all sentences and all kbest lists
--
cgit v1.2.3