diff options
Diffstat (limited to 'training')
-rw-r--r-- | training/dtrain/dtrain.cc | 36 | ||||
-rw-r--r-- | training/dtrain/examples/standard/dtrain.ini | 3 |
2 files changed, 28 insertions, 11 deletions
diff --git a/training/dtrain/dtrain.cc b/training/dtrain/dtrain.cc index 34c0a54a..2d090666 100644 --- a/training/dtrain/dtrain.cc +++ b/training/dtrain/dtrain.cc @@ -372,7 +372,8 @@ main(int argc, char** argv) PROsampling(samples, pairs, pair_threshold, max_pairs); npairs += pairs.size(); - SparseVector<weight_t> lambdas_copy; + SparseVector<weight_t> lambdas_copy; // for l1 regularization + SparseVector<weight_t> sum_up; // for pclr if (l1naive||l1clip||l1cumul) lambdas_copy = lambdas; for (vector<pair<ScoredHyp,ScoredHyp> >::iterator it = pairs.begin(); @@ -392,20 +393,24 @@ main(int argc, char** argv) if (rank_error || margin < loss_margin) { SparseVector<weight_t> diff_vec = it->first.f - it->second.f; if (pclr) { - SparseVector<weight_t>::iterator jt = diff_vec.begin(); - for (; jt != diff_vec.end(); ++it) { - jt->second *= max(0.0000001, eta/(eta+learning_rates[jt->first])); // FIXME - learning_rates[jt->first]++; - } - lambdas += diff_vec; - } else { - lambdas.plus_eq_v_times_s(diff_vec, eta); - } + sum_up += diff_vec; + } else { + lambdas.plus_eq_v_times_s(diff_vec, eta); + } if (gamma) lambdas.plus_eq_v_times_s(lambdas, -2*gamma*eta*(1./npairs)); } } + // per-coordinate learning rate + if (pclr) { + SparseVector<weight_t>::iterator it = sum_up.begin(); + for (; it != lambdas.end(); ++it) { + lambdas[it->first] += it->second * max(0.00000001, eta/(eta+learning_rates[it->first])); + learning_rates[it->first]++; + } + } + // l1 regularization // please note that this regularizations happen // after a _sentence_ -- not after each example/pair! @@ -413,6 +418,8 @@ main(int argc, char** argv) SparseVector<weight_t>::iterator it = lambdas.begin(); for (; it != lambdas.end(); ++it) { if (!lambdas_copy.get(it->first) || lambdas_copy.get(it->first)!=it->second) { + it->second *= max(0.0000001, eta/(eta+learning_rates[it->first])); // FIXME + learning_rates[it->first]++; it->second -= sign(it->second) * l1_reg; } } @@ -530,6 +537,15 @@ main(int argc, char** argv) Weights::WriteToFile(w_fn, dense_weights, true); } + WriteFile of("-"); + ostream& o = *of.stream(); + o << "<<<<<<<<<<<<<<<<<<<<<<<<\n"; + for (SparseVector<weight_t>::iterator it = learning_rates.begin(); it != learning_rates.end(); ++it) { + if (it->second == 0) continue; + o << FD::Convert(it->first) << '\t' << it->second << endl; + } + o << ">>>>>>>>>>>>>>>>>>>>>>>>>\n"; + } // outer loop if (average) w_average /= (weight_t)T; diff --git a/training/dtrain/examples/standard/dtrain.ini b/training/dtrain/examples/standard/dtrain.ini index 23e94285..07350a0b 100644 --- a/training/dtrain/examples/standard/dtrain.ini +++ b/training/dtrain/examples/standard/dtrain.ini @@ -1,6 +1,6 @@ input=./nc-wmt11.de.gz refs=./nc-wmt11.en.gz -output=- # a weights file (add .gz for gzip compression) or STDOUT '-' +output=asdf # a weights file (add .gz for gzip compression) or STDOUT '-' select_weights=VOID # output average (over epochs) weight vector decoder_config=./cdec.ini # config for cdec # weights for these features will be printed on each iteration @@ -22,3 +22,4 @@ pair_sampling=XYX # hi_lo=0.1 # 10 vs 80 vs 10 and 80 vs 10 here pair_threshold=0 # minimum distance in BLEU (here: > 0) loss_margin=0 # update if correctly ranked, but within this margin +pclr=1 |