summaryrefslogtreecommitdiff
path: root/training
diff options
context:
space:
mode:
Diffstat (limited to 'training')
-rw-r--r--training/dtrain/dtrain.cc36
-rw-r--r--training/dtrain/examples/standard/dtrain.ini3
2 files changed, 28 insertions, 11 deletions
diff --git a/training/dtrain/dtrain.cc b/training/dtrain/dtrain.cc
index 34c0a54a..2d090666 100644
--- a/training/dtrain/dtrain.cc
+++ b/training/dtrain/dtrain.cc
@@ -372,7 +372,8 @@ main(int argc, char** argv)
PROsampling(samples, pairs, pair_threshold, max_pairs);
npairs += pairs.size();
- SparseVector<weight_t> lambdas_copy;
+ SparseVector<weight_t> lambdas_copy; // for l1 regularization
+ SparseVector<weight_t> sum_up; // for pclr
if (l1naive||l1clip||l1cumul) lambdas_copy = lambdas;
for (vector<pair<ScoredHyp,ScoredHyp> >::iterator it = pairs.begin();
@@ -392,20 +393,24 @@ main(int argc, char** argv)
if (rank_error || margin < loss_margin) {
SparseVector<weight_t> diff_vec = it->first.f - it->second.f;
if (pclr) {
- SparseVector<weight_t>::iterator jt = diff_vec.begin();
- for (; jt != diff_vec.end(); ++it) {
- jt->second *= max(0.0000001, eta/(eta+learning_rates[jt->first])); // FIXME
- learning_rates[jt->first]++;
- }
- lambdas += diff_vec;
- } else {
- lambdas.plus_eq_v_times_s(diff_vec, eta);
- }
+ sum_up += diff_vec;
+ } else {
+ lambdas.plus_eq_v_times_s(diff_vec, eta);
+ }
if (gamma)
lambdas.plus_eq_v_times_s(lambdas, -2*gamma*eta*(1./npairs));
}
}
+ // per-coordinate learning rate
+ if (pclr) {
+ SparseVector<weight_t>::iterator it = sum_up.begin();
+ for (; it != lambdas.end(); ++it) {
+ lambdas[it->first] += it->second * max(0.00000001, eta/(eta+learning_rates[it->first]));
+ learning_rates[it->first]++;
+ }
+ }
+
// l1 regularization
// please note that this regularization happens
// after a _sentence_ -- not after each example/pair!
@@ -413,6 +418,8 @@ main(int argc, char** argv)
SparseVector<weight_t>::iterator it = lambdas.begin();
for (; it != lambdas.end(); ++it) {
if (!lambdas_copy.get(it->first) || lambdas_copy.get(it->first)!=it->second) {
+ it->second *= max(0.0000001, eta/(eta+learning_rates[it->first])); // FIXME
+ learning_rates[it->first]++;
it->second -= sign(it->second) * l1_reg;
}
}
@@ -530,6 +537,15 @@ main(int argc, char** argv)
Weights::WriteToFile(w_fn, dense_weights, true);
}
+ WriteFile of("-");
+ ostream& o = *of.stream();
+ o << "<<<<<<<<<<<<<<<<<<<<<<<<\n";
+ for (SparseVector<weight_t>::iterator it = learning_rates.begin(); it != learning_rates.end(); ++it) {
+ if (it->second == 0) continue;
+ o << FD::Convert(it->first) << '\t' << it->second << endl;
+ }
+ o << ">>>>>>>>>>>>>>>>>>>>>>>>>\n";
+
} // outer loop
if (average) w_average /= (weight_t)T;
diff --git a/training/dtrain/examples/standard/dtrain.ini b/training/dtrain/examples/standard/dtrain.ini
index 23e94285..07350a0b 100644
--- a/training/dtrain/examples/standard/dtrain.ini
+++ b/training/dtrain/examples/standard/dtrain.ini
@@ -1,6 +1,6 @@
input=./nc-wmt11.de.gz
refs=./nc-wmt11.en.gz
-output=- # a weights file (add .gz for gzip compression) or STDOUT '-'
+output=asdf # a weights file (add .gz for gzip compression) or STDOUT '-'
select_weights=VOID # output average (over epochs) weight vector
decoder_config=./cdec.ini # config for cdec
# weights for these features will be printed on each iteration
@@ -22,3 +22,4 @@ pair_sampling=XYX #
hi_lo=0.1 # 10 vs 80 vs 10 and 80 vs 10 here
pair_threshold=0 # minimum distance in BLEU (here: > 0)
loss_margin=0 # update if correctly ranked, but within this margin
+pclr=1