summary | refs | log | tree | commit | diff
path: root/pro-train
diff options
context:
space:
mode:
Diffstat (limited to 'pro-train')
-rwxr-xr-x pro-train/dist-pro.pl 114
-rw-r--r-- pro-train/mr_pro_reduce.cc 84
2 files changed, 112 insertions, 86 deletions
diff --git a/pro-train/dist-pro.pl b/pro-train/dist-pro.pl
index dbfa329a..5db053de 100755
--- a/pro-train/dist-pro.pl
+++ b/pro-train/dist-pro.pl
@@ -10,6 +10,7 @@ use Getopt::Long;
use IPC::Open2;
use POSIX ":sys_wait_h";
my $QSUB_CMD = qsub_args(mert_memory());
+my $default_jobs = env_default_jobs();
my $VEST_DIR="$SCRIPT_DIR/../vest";
require "$VEST_DIR/libcall.pl";
@@ -39,10 +40,11 @@ die "Can't find $libcall" unless -e $libcall;
my $decoder = $cdec;
my $lines_per_mapper = 30;
my $iteration = 1;
-my $run_local = 0;
my $best_weights;
-my $max_iterations = 30;
-my $decode_nodes = 15; # number of decode nodes
+my $psi = 1;
+my $default_max_iter = 30;
+my $max_iterations = $default_max_iter;
+my $jobs = $default_jobs; # number of decode nodes
my $pmem = "4g";
my $disable_clean = 0;
my %seen_weights;
@@ -54,30 +56,32 @@ my $metric = "ibm_bleu";
my $dir;
my $iniFile;
my $weights;
-my $use_make; # use make to parallelize
-my $usefork;
+my $use_make = 1; # use make to parallelize
+my $useqsub = 0;
my $initial_weights;
my $pass_suffix = '';
my $cpbin=1;
# regularization strength
my $tune_regularizer = 0;
-my $reg = 1e-2;
+my $reg = 500;
+my $reg_previous = 5000;
# Process command-line options
Getopt::Long::Configure("no_auto_abbrev");
if (GetOptions(
- "decode-nodes=i" => \$decode_nodes,
+ "jobs=i" => \$jobs,
"dont-clean" => \$disable_clean,
"pass-suffix=s" => \$pass_suffix,
- "use-fork" => \$usefork,
+ "qsub" => \$useqsub,
"dry-run" => \$dryrun,
"epsilon=s" => \$epsilon,
+ "interpolate-with-weights=f" => \$psi,
"help" => \$help,
"weights=s" => \$initial_weights,
"tune-regularizer" => \$tune_regularizer,
"reg=f" => \$reg,
- "local" => \$run_local,
+ "reg-previous=f" => \$reg_previous,
"use-make=i" => \$use_make,
"max-iterations=i" => \$max_iterations,
"pmem=s" => \$pmem,
@@ -91,7 +95,18 @@ if (GetOptions(
exit;
}
-if ($usefork) { $usefork = "--use-fork"; } else { $usefork = ''; }
+die "--tune-regularizer is no longer supported with --reg-previous and --reg. Please tune manually.\n" if $tune_regularizer;
+
+if ($useqsub) {
+ $use_make = 0;
+ die "LocalEnvironment.pm does not have qsub configuration for this host. Cannot run with --qsub!\n" unless has_qsub();
+}
+
+my @missing_args = ();
+if (!defined $srcFile) { push @missing_args, "--source-file"; }
+if (!defined $refFiles) { push @missing_args, "--ref-files"; }
+if (!defined $initial_weights) { push @missing_args, "--weights"; }
+die "Please specify missing arguments: " . join (', ', @missing_args) . "\n" if (@missing_args);
if ($metric =~ /^(combi|ter)$/i) {
$lines_per_mapper = 5;
@@ -248,13 +263,10 @@ while (1){
`rm -f $dir/hgs/*.gz`;
my $decoder_cmd = "$decoder -c $iniFile --weights$pass_suffix $weightsFile -O $dir/hgs";
my $pcmd;
- if ($run_local) {
- $pcmd = "cat $srcFile |";
- } elsif ($use_make) {
- # TODO: Throw error when decode_nodes is specified along with use_make
- $pcmd = "cat $srcFile | $parallelize --use-fork -p $pmem -e $logdir -j $use_make --";
+ if ($use_make) {
+ $pcmd = "cat $srcFile | $parallelize --use-fork -p $pmem -e $logdir -j $jobs --";
} else {
- $pcmd = "cat $srcFile | $parallelize $usefork -p $pmem -e $logdir -j $decode_nodes --";
+ $pcmd = "cat $srcFile | $parallelize -p $pmem -e $logdir -j $jobs --";
}
my $cmd = "$pcmd $decoder_cmd 2> $decoderLog 1> $runFile";
print STDERR "COMMAND:\n$cmd\n";
@@ -327,10 +339,7 @@ while (1){
push @mapoutputs, "$dir/splag.$im1/$mapoutput";
$o2i{"$dir/splag.$im1/$mapoutput"} = "$dir/splag.$im1/$shard";
my $script = "$MAPPER -s $srcFile -l $metric $refs_comma_sep -w $inweights -K $dir/kbest < $dir/splag.$im1/$shard > $dir/splag.$im1/$mapoutput";
- if ($run_local) {
- print STDERR "COMMAND:\n$script\n";
- check_bash_call($script);
- } elsif ($use_make) {
+ if ($use_make) {
my $script_file = "$dir/scripts/map.$shard";
open F, ">$script_file" or die "Can't write $script_file: $!";
print F "#!/bin/bash\n";
@@ -376,12 +385,10 @@ while (1){
} else {
@dev_outs = @mapoutputs;
}
- if ($run_local) {
- print STDERR "\nCompleted extraction of training exemplars.\n";
- } elsif ($use_make) {
+ if ($use_make) {
print $mkfile "$dir/splag.$im1/map.done: @mkouts\n\ttouch $dir/splag.$im1/map.done\n\n";
close $mkfile;
- my $mcmd = "make -j $use_make -f $mkfilename";
+ my $mcmd = "make -j $jobs -f $mkfilename";
print STDERR "\nExecuting: $mcmd\n";
check_call($mcmd);
} else {
@@ -411,7 +418,7 @@ while (1){
}
print STDERR "\nRUNNING CLASSIFIER (REDUCER)\n";
print STDERR unchecked_output("date");
- $cmd="cat @dev_outs | $REDUCER -w $dir/weights.$im1 -s $reg";
+ $cmd="cat @dev_outs | $REDUCER -w $dir/weights.$im1 -C $reg -y $reg_previous --interpolate_with_weights $psi";
if ($tune_regularizer) {
$cmd .= " -T -t $dev_test_file";
}
@@ -492,7 +499,7 @@ sub write_config {
print $fh "REFS (DEV): $refFiles\n";
print $fh "EVAL METRIC: $metric\n";
print $fh "MAX ITERATIONS: $max_iterations\n";
- print $fh "DECODE NODES: $decode_nodes\n";
+ print $fh "JOBS: $jobs\n";
print $fh "HEAD NODE: $host\n";
print $fh "PMEM (DECODING): $pmem\n";
print $fh "CLEANUP: $cleanup\n";
@@ -541,16 +548,12 @@ sub enseg {
sub print_help {
my $executable = check_output("basename $0"); chomp $executable;
- print << "Help";
+ print << "Help";
Usage: $executable [options] <ini file>
$executable [options] <ini file>
- Runs a complete MERT optimization and test set decoding, using
- the decoder configuration in ini file. Note that many of the
- options have default values that are inferred automatically
- based on certain conventions. For details, refer to descriptions
- of the options --decoder, --weights, and --workdir.
+ Runs a complete PRO optimization using the ini file specified.
Required:
@@ -567,18 +570,12 @@ Required:
General options:
- --local
- Run the decoder and optimizer locally with a single thread.
-
- --decode-nodes <I>
- Number of decoder processes to run in parallel. [default=15]
-
--help
Print this message and exit.
--max-iterations <M>
Maximum number of iterations to run. If not specified, defaults
- to 10.
+ to $default_max_iter.
--metric <method>
Metric to optimize.
@@ -588,13 +585,6 @@ General options:
If the decoder is doing multi-pass decoding, the pass suffix "2",
"3", etc., is used to control what iteration of weights is set.
- --pmem <N>
- Amount of physical memory requested for parallel decoding jobs.
-
- --use-make <I>
- Use make -j <I> to run the optimizer commands (useful on large
- shared-memory machines where qsub is unavailable).
-
--workdir <dir>
Directory for intermediate and output files. If not specified, the
name is derived from the ini filename. Assuming that the ini
@@ -605,11 +595,35 @@ General options:
Regularization options:
- --tune-regularizer
- Hold out one third of the tuning data and used this to tune the
- regularization parameter.
-
--reg <F>
+ l2 regularization strength [default=500]. The greater this value,
+ the closer to zero the weights will be.
+
+ --reg-previous <F>
+ l2 penalty for moving away from the weights from the previous
+ iteration. [default=5000]. The greater this value, the closer
+ to the previous iteration's weights the next iteration's weights
+ will be.
+
+Job control options:
+
+ --jobs <I>
+ Number of decoder processes to run in parallel. [default=$default_jobs]
+
+ --qsub
+ Use qsub to run jobs in parallel (qsub must be configured in
+ environment/LocalEnvironment.pm)
+
+ --pmem <N>
+ Amount of physical memory requested for parallel decoding jobs
+ (used with qsub requests only)
+
+Deprecated options:
+
+ --interpolate-with-weights <F>
+ [deprecated] At each iteration the resulting weights are
+ interpolated with the weights from the previous iteration, with
+ this factor. [default=1.0, i.e., no effect]
Help
}
diff --git a/pro-train/mr_pro_reduce.cc b/pro-train/mr_pro_reduce.cc
index aff410a0..6362ce47 100644
--- a/pro-train/mr_pro_reduce.cc
+++ b/pro-train/mr_pro_reduce.cc
@@ -23,13 +23,14 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
po::options_description opts("Configuration options");
opts.add_options()
("weights,w", po::value<string>(), "Weights from previous iteration (used as initialization and interpolation")
- ("interpolation,p",po::value<double>()->default_value(0.9), "Output weights are p*w + (1-p)*w_prev")
- ("memory_buffers,m",po::value<unsigned>()->default_value(200), "Number of memory buffers (LBFGS)")
- ("sigma_squared,s",po::value<double>()->default_value(0.1), "Sigma squared for Gaussian prior")
- ("min_reg,r",po::value<double>()->default_value(1e-8), "When tuning (-T) regularization strength, minimum regularization strenght")
- ("max_reg,R",po::value<double>()->default_value(10.0), "When tuning (-T) regularization strength, maximum regularization strenght")
+ ("regularization_strength,C",po::value<double>()->default_value(500.0), "l2 regularization strength")
+ ("regularize_to_weights,y",po::value<double>()->default_value(5000.0), "Differences in learned weights to previous weights are penalized with an l2 penalty with this strength; 0.0 = no effect")
+ ("memory_buffers,m",po::value<unsigned>()->default_value(100), "Number of memory buffers (LBFGS)")
+ ("min_reg,r",po::value<double>()->default_value(0.01), "When tuning (-T) regularization strength, minimum regularization strenght")
+ ("max_reg,R",po::value<double>()->default_value(1e6), "When tuning (-T) regularization strength, maximum regularization strenght")
("testset,t",po::value<string>(), "Optional held-out test set")
("tune_regularizer,T", "Use the held out test set (-t) to tune the regularization strength")
+ ("interpolate_with_weights,p",po::value<double>()->default_value(1.0), "[deprecated] Output weights are p*w + (1-p)*w_prev; 1.0 = no effect")
("help,h", "Help");
po::options_description dcmdline_options;
dcmdline_options.add(opts);
@@ -95,6 +96,27 @@ void GradAdd(const SparseVector<weight_t>& v, const double scale, vector<weight_
}
}
+double ApplyRegularizationTerms(const double C,  // Adds two l2 penalties to the gradient *g and returns their sum; C scales the w_i^2 term.
+ const double T,  // scales the (w_i - prev_w_i)^2 term
+ const vector<weight_t>& weights,  // current weight vector
+ const vector<weight_t>& prev_weights,  // reference weights; may be shorter than weights
+ vector<weight_t>* g) {  // gradient, updated in place
+ assert(weights.size() == g->size());  // g is indexed in lockstep with weights below
+ double reg = 0;  // running total of both penalty terms
+ for (size_t i = 0; i < weights.size(); ++i) {  // NOTE(review): starts at 0; the loop removed from LearnParameters started at 1 -- confirm index 0 should be penalized
+ const double prev_w_i = (i < prev_weights.size() ? prev_weights[i] : 0.0);  // indices past the end of prev_weights use 0.0
+ const double& w_i = weights[i];
+ double& g_i = (*g)[i];
+ reg += C * w_i * w_i;  // penalty pulling w_i toward zero
+ g_i += 2 * C * w_i;  // derivative of C * w_i^2 with respect to w_i
+
+ const double diff_i = w_i - prev_w_i;
+ reg += T * diff_i * diff_i;  // penalty pulling w_i toward prev_w_i
+ g_i += 2 * T * diff_i;  // derivative of T * diff_i^2 with respect to w_i
+ }
+ return reg;  // penalty value only; the gradient contribution is already in *g
+}
+
double TrainingInference(const vector<weight_t>& x,
const vector<pair<bool, SparseVector<weight_t> > >& corpus,
vector<weight_t>* g = NULL) {
@@ -134,8 +156,10 @@ double TrainingInference(const vector<weight_t>& x,
// return held-out log likelihood
double LearnParameters(const vector<pair<bool, SparseVector<weight_t> > >& training,
const vector<pair<bool, SparseVector<weight_t> > >& testing,
- const double sigsq,
+ const double C,
+ const double T,
const unsigned memory_buffers,
+ const vector<weight_t>& prev_x,
vector<weight_t>* px) {
vector<weight_t>& x = *px;
vector<weight_t> vg(FD::NumFeats(), 0.0);
@@ -157,26 +181,12 @@ double LearnParameters(const vector<pair<bool, SparseVector<weight_t> > >& train
}
// handle regularizer
-#if 1
- double norm = 0;
- for (int i = 1; i < x.size(); ++i) {
- const double mean_i = 0.0;
- const double param = (x[i] - mean_i);
- norm += param * param;
- vg[i] += param / sigsq;
- }
- const double reg = norm / (2.0 * sigsq);
-#else
- double reg = 0;
-#endif
+ double reg = ApplyRegularizationTerms(C, T, x, prev_x, &vg);
cll += reg;
- cerr << cll << " (REG=" << reg << ")\tPPL=" << ppl << "\t TEST_PPL=" << tppl << "\t";
+ cerr << cll << " (REG=" << reg << ")\tPPL=" << ppl << "\t TEST_PPL=" << tppl << "\t" << endl;
try {
- vector<weight_t> old_x = x;
- do {
- opt.Optimize(cll, vg, &x);
- converged = opt.HasConverged();
- } while (!converged && x == old_x);
+ opt.Optimize(cll, vg, &x);
+ converged = opt.HasConverged();
} catch (...) {
cerr << "Exception caught, assuming convergence is close enough...\n";
converged = true;
@@ -201,13 +211,14 @@ int main(int argc, char** argv) {
}
const double min_reg = conf["min_reg"].as<double>();
const double max_reg = conf["max_reg"].as<double>();
- double sigsq = conf["sigma_squared"].as<double>(); // will be overridden if parameter is tuned
- assert(sigsq > 0.0);
+ double C = conf["regularization_strength"].as<double>(); // will be overridden if parameter is tuned
+ const double T = conf["regularize_to_weights"].as<double>();
+ assert(C > 0.0);
assert(min_reg > 0.0);
assert(max_reg > 0.0);
assert(max_reg > min_reg);
- const double psi = conf["interpolation"].as<double>();
- if (psi < 0.0 || psi > 1.0) { cerr << "Invalid interpolation weight: " << psi << endl; }
+ const double psi = conf["interpolate_with_weights"].as<double>();
+ if (psi < 0.0 || psi > 1.0) { cerr << "Invalid interpolation weight: " << psi << endl; return 1; }
ReadCorpus(&cin, &training);
if (conf.count("testset")) {
ReadFile rf(conf["testset"].as<string>());
@@ -231,14 +242,15 @@ int main(int argc, char** argv) {
vector<pair<double,double> > sp;
vector<double> smoothed;
if (tune_regularizer) {
- sigsq = min_reg;
+ C = min_reg;
const double steps = 18;
double sweep_factor = exp((log(max_reg) - log(min_reg)) / steps);
cerr << "SWEEP FACTOR: " << sweep_factor << endl;
- while(sigsq < max_reg) {
- tppl = LearnParameters(training, testing, sigsq, conf["memory_buffers"].as<unsigned>(), &x);
- sp.push_back(make_pair(sigsq, tppl));
- sigsq *= sweep_factor;
+ while(C < max_reg) {
+ cerr << "C=" << C << "\tT=" <<T << endl;
+ tppl = LearnParameters(training, testing, C, T, conf["memory_buffers"].as<unsigned>(), prev_x, &x);
+ sp.push_back(make_pair(C, tppl));
+ C *= sweep_factor;
}
smoothed.resize(sp.size(), 0);
smoothed[0] = sp[0].second;
@@ -257,16 +269,16 @@ int main(int argc, char** argv) {
best_i = i;
}
}
- sigsq = sp[best_i].first;
+ C = sp[best_i].first;
} // tune regularizer
- tppl = LearnParameters(training, testing, sigsq, conf["memory_buffers"].as<unsigned>(), &x);
+ tppl = LearnParameters(training, testing, C, T, conf["memory_buffers"].as<unsigned>(), prev_x, &x);
if (conf.count("weights")) {
for (int i = 1; i < x.size(); ++i) {
x[i] = (x[i] * psi) + prev_x[i] * (1.0 - psi);
}
}
cout.precision(15);
- cout << "# sigma^2=" << sigsq << "\theld out perplexity=";
+ cout << "# C=" << C << "\theld out perplexity=";
if (tppl) { cout << tppl << endl; } else { cout << "N/A\n"; }
if (sp.size()) {
cout << "# Parameter sweep:\n";