summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorredpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f>2010-06-28 00:23:35 +0000
committerredpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f>2010-06-28 00:23:35 +0000
commit875daff93c149ca5ed63ae0cdcf20be06b72d778 (patch)
tree27adfb34f335ab85a5301ae4e8d62e261f021d12
parentdb978c786c2b9eef4d730f68e3ae6cf6081c1d29 (diff)
fix mert to avoid fork problem, simplify parallelize
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@30 ec762483-ff6d-05da-a07a-a48fb63a330f
-rwxr-xr-xvest/dist-vest.pl39
-rwxr-xr-xvest/parallelize.pl30
2 files changed, 29 insertions, 40 deletions
diff --git a/vest/dist-vest.pl b/vest/dist-vest.pl
index 22d6478a..2b4bf86b 100755
--- a/vest/dist-vest.pl
+++ b/vest/dist-vest.pl
@@ -34,8 +34,7 @@ my $best_weights;
my $max_iterations = 15;
my $optimization_iters = 6;
my $num_rand_points = 20;
-my $mert_nodes = join(" ", grep(/^c\d\d$/, split(/\n/, `pbsnodes -a`))); # "1 1 1 1 1" fails due to file staging conflicts
-my $decode_nodes = "1 1 1 1 1 1 1 1 1 1 1 1 1 1 1"; # start 15 jobs
+my $decode_nodes = 15; # number of decode nodes
my $pmem = "3g";
my $disable_clean = 0;
my %seen_weights;
@@ -58,7 +57,7 @@ my $decoderOpt;
Getopt::Long::Configure("no_auto_abbrev");
if (GetOptions(
"decoder=s" => \$decoderOpt,
- "decode-nodes=s" => \$decode_nodes,
+ "decode-nodes=i" => \$decode_nodes,
"dont-clean" => \$disable_clean,
"dry-run" => \$dryrun,
"epsilon" => \$epsilon,
@@ -67,7 +66,6 @@ if (GetOptions(
"iteration=i" => \$iteration,
"local" => \$run_local,
"max-iterations=i" => \$max_iterations,
- "mert-nodes=s" => \$mert_nodes,
"normalize=s" => \$normalize,
"pmem=s" => \$pmem,
"ranges=s" => \$ranges,
@@ -178,6 +176,7 @@ if ($dryrun){
} else {
mkdir $dir;
mkdir "$dir/hgs";
+ mkdir "$dir/scripts";
unless (-e $initialWeights) {
print STDERR "Please specify an initial weights file with --initial-weights\n";
print_help();
@@ -231,7 +230,7 @@ while (1){
my $im1 = $iteration - 1;
my $weightsFile="$dir/weights.$im1";
my $decoder_cmd = "$decoder -c $iniFile -w $weightsFile -O $dir/hgs";
- my $pcmd = "cat $srcFile | $parallelize -p $pmem -e $logdir -n \"$decode_nodes\" -- ";
+ my $pcmd = "cat $srcFile | $parallelize -p $pmem -e $logdir -j $decode_nodes -- ";
if ($run_local) { $pcmd = "cat $srcFile |"; }
my $cmd = $pcmd . "$decoder_cmd 2> $decoderLog 1> $runFile";
print LOGFILE "COMMAND:\n$cmd\n";
@@ -312,23 +311,22 @@ while (1){
die;
}
} else {
- my $todo = "qsub $QSUB_FLAGS -S /bin/bash -N $client_name -o /dev/null -e $logdir/$client_name.ER";
- local(*QOUT, *QIN);
- open2(\*QOUT, \*QIN, $todo);
- print QIN $script;
+ my $script_file = "$dir/scripts/map.$shard";
+ open F, ">$script_file" or die "Can't write $script_file: $!";
+ print F "$script\n";
+ close F;
if ($first_shard) { print LOGFILE "$script\n"; $first_shard=0; }
- close QIN;
+
$nmappers++;
- while (my $jobid=<QOUT>){
- chomp $jobid;
- $jobid =~ s/^(\d+)(.*?)$/\1/g;
- $jobid =~ s/^Your job (\d+) .*$/\1/;
- push(@cleanupcmds, "`qdel $jobid 2> /dev/null`");
- print LOGFILE " $jobid";
- if ($joblist == "") { $joblist = $jobid; }
- else {$joblist = $joblist . "\|" . $jobid; }
- }
- close QOUT;
+ my $jobid = `qsub $QSUB_FLAGS -S /bin/bash -N $client_name -o /dev/null -e $logdir/$client_name.ER $script_file`;
+ die "qsub failed: $!" unless $? == 0;
+ chomp $jobid;
+ $jobid =~ s/^(\d+)(.*?)$/\1/g;
+ $jobid =~ s/^Your job (\d+) .*$/\1/;
+ push(@cleanupcmds, "`qdel $jobid 2> /dev/null`");
+ print LOGFILE " $jobid";
+ if ($joblist == "") { $joblist = $jobid; }
+ else {$joblist = $joblist . "\|" . $jobid; }
}
}
if ($run_local) {
@@ -484,7 +482,6 @@ sub write_config {
print $fh "EVAL METRIC: $metric\n";
print $fh "START ITERATION: $iteration\n";
print $fh "MAX ITERATIONS: $max_iterations\n";
- print $fh "MERT NODES: $mert_nodes\n";
print $fh "DECODE NODES: $decode_nodes\n";
print $fh "HEAD NODE: $host\n";
print $fh "PMEM (DECODING): $pmem\n";
diff --git a/vest/parallelize.pl b/vest/parallelize.pl
index f01232c8..8b3f56dd 100755
--- a/vest/parallelize.pl
+++ b/vest/parallelize.pl
@@ -26,7 +26,7 @@ my $errordir;
my $multiline;
my @files_to_stage;
my $verbose = 1;
-my $nodelist;
+my $numnodes;
my $user = $ENV{"USER"};
my $pmem = "2g";
@@ -40,21 +40,12 @@ if (GetOptions(
"multi-line" => \$multiline,
"file=s" => \@files_to_stage,
"verbose" => \$verbose,
- "nodelist=s" => \$nodelist,
+ "jobs=i" => \$numnodes,
"pmem=s" => \$pmem
) == 0 || @ARGV == 0){
print_help();
}
-my @nodes = grep(/^[cd]\d\d$/, split(/\n/, `pbsnodes -a`));
-if ($nodelist){
- @nodes = split(/ /, $nodelist);
-}
-
-if ($verbose){
- print STDERR "Compute nodes: @nodes\n";
-}
-
my $cmd = "";
for my $arg (@ARGV){
if ($arg=~ /\s/){
@@ -64,6 +55,8 @@ for my $arg (@ARGV){
}
}
+die "Please specify a command to parallelize\n" if $cmd eq '';
+
if ($errordir){
`mkdir -p $errordir`;
}
@@ -132,8 +125,8 @@ if (my $pid = fork){
print STDERR $script;
print STDERR "====\n";
}
- for my $node (@nodes){
- launch_job_on_node($node);
+ for (my $jobn=0; $jobn<$numnodes; $jobn++){
+ launch_job_on_node(1);
}
if ($recycle_clients) {
my $ret;
@@ -143,9 +136,8 @@ if (my $pid = fork){
#print STDERR "waitpid $pid ret = $ret \n";
if ($ret != 0) {last; } # break
$livejobs = numof_live_jobs();
- if ( $#nodes >= $livejobs ) { # a client terminated
- my $numof_nodes = scalar @nodes;
- print STDERR "num of requested nodes = $numof_nodes; num of currently live jobs = $livejobs; Client terminated - launching another.\n";
+ if ($numnodes >= $livejobs ) { # a client terminated
+ print STDERR "num of requested nodes = $numnodes; num of currently live jobs = $livejobs; Client terminated - launching another.\n";
launch_job_on_node(1); # TODO: support named nodes
} else {
sleep (60);
@@ -197,6 +189,7 @@ sub launch_job_on_node {
if ($verbose){ print STDERR "Launched client job: $jobid"; }
push(@cleanup_cmds, "`qdel $jobid 2> /dev/null`");
$jobid =~ s/^(\d+)(.*?)$/\1/g;
+ $jobid =~ s/^Your job (\d+) .*$/\1/;
print STDERR "short job id $jobid\n";
if ($joblist == "") { $joblist = $jobid; }
else {$joblist = $joblist . "\|" . $jobid; }
@@ -244,9 +237,8 @@ options:
-v, --verbose
Print diagnostic informatoin on stderr.
- -n, --nodelist
- Space-delimited list of nodes to request. There is
- one qsub command per node.
+ -j, --jobs
+ Number of jobs to use.
-p, --pmem
pmem setting for each job.