summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-08-10 20:03:53 +0000
committer redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-08-10 20:03:53 +0000
commit 963b7b96576de000a743ef377c439ea5c6787e2e (patch)
tree c44b113de22473a74b831867dd2c8fed8d4d56f4
parent 86ae2fcf6207630c03ec222131346a6fd8fee10a (diff)
support for running in multiple environments which are automatically detected
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@501 ec762483-ff6d-05da-a07a-a48fb63a330f
-rw-r--r-- configure.ac 8
-rw-r--r-- decoder/dict.cc 22
-rw-r--r-- environment/LocalConfig.pm 68
-rwxr-xr-x gi/pipeline/evaluation-pipeline.pl 27
-rw-r--r-- gi/pipeline/lticluster.config 9
-rwxr-xr-x vest/dist-vest.pl 35
-rwxr-xr-x vest/parallelize.pl 5
7 files changed, 137 insertions, 37 deletions
diff --git a/configure.ac b/configure.ac
index 54d0c6bf..e627c1cc 100644
--- a/configure.ac
+++ b/configure.ac
@@ -8,11 +8,13 @@ AC_PROG_CXX
AC_LANG_CPLUSPLUS
BOOST_REQUIRE
BOOST_PROGRAM_OPTIONS
-BOOST_REGEX
+# BOOST_REGEX
BOOST_THREADS
CPPFLAGS="$CPPFLAGS $BOOST_CPPFLAGS"
-LDFLAGS="$LDFLAGS $BOOST_PROGRAM_OPTIONS_LDFLAGS $BOOST_REGEX_LDFLAGS $BOOST_THREAD_LDFLAGS"
-LIBS="$LIBS $BOOST_PROGRAM_OPTIONS_LIBS $BOOST_REGEX_LIBS $BOOST_THREAD_LIBS"
+#LDFLAGS="$LDFLAGS $BOOST_PROGRAM_OPTIONS_LDFLAGS $BOOST_REGEX_LDFLAGS $BOOST_THREAD_LDFLAGS"
+#LIBS="$LIBS $BOOST_PROGRAM_OPTIONS_LIBS $BOOST_REGEX_LIBS $BOOST_THREAD_LIBS"
+LDFLAGS="$LDFLAGS $BOOST_PROGRAM_OPTIONS_LDFLAGS $BOOST_THREAD_LDFLAGS"
+LIBS="$LIBS $BOOST_PROGRAM_OPTIONS_LIBS $BOOST_THREAD_LIBS"
AC_CHECK_HEADER(boost/math/special_functions/digamma.hpp,
[AC_DEFINE([HAVE_BOOST_DIGAMMA], [], [flag for boost::math::digamma])])
diff --git a/decoder/dict.cc b/decoder/dict.cc
index 4a842f82..2d6986c8 100644
--- a/decoder/dict.cc
+++ b/decoder/dict.cc
@@ -2,10 +2,26 @@
#include <string>
#include <vector>
-#include <boost/regex.hpp>
-#include <boost/algorithm/string/regex.hpp>
+
+void TokenizeStringSeparator( // Split str on every occurrence of separator and append the pieces to *tokens.
+ const std::string& str, // text to split
+ const std::string& separator, // delimiter, matched literally via std::string::find
+ std::vector<std::string>* tokens) { // output vector; only appended to, never cleared here
+
+ size_t pos = 0; // start of the piece currently being extracted
+ std::string::size_type nextPos = str.find(separator, pos); // start of the next separator, or npos if none
+
+ while (nextPos != std::string::npos) {
+ tokens->push_back(str.substr(pos, nextPos - pos)); // piece in front of the separator (may be empty)
+ pos = nextPos + separator.size(); // resume just past the separator; NOTE(review): an empty separator would leave pos unchanged each pass, which looks like an endless loop - confirm callers never pass one
+ nextPos = str.find(separator, pos);
+ }
+ tokens->push_back(str.substr(pos, nextPos - pos)); // nextPos is npos here, so this takes the rest of str; an input without separators yields one token
+}
+
void Dict::AsVector(const WordID& id, std::vector<std::string>* results) const { // Fill *results with the " ||| "-separated fields of Convert(id).
- boost::algorithm::split_regex(*results, Convert(id), boost::regex("\\s\\|\\|\\|\\s"));
+ results->clear(); // TokenizeStringSeparator only appends, so start from an empty vector
+ TokenizeStringSeparator(Convert(id), " ||| ", results); // literal single spaces around |||; the removed regex accepted any whitespace character there
}
diff --git a/environment/LocalConfig.pm b/environment/LocalConfig.pm
new file mode 100644
index 00000000..e4269361
--- /dev/null
+++ b/environment/LocalConfig.pm
@@ -0,0 +1,68 @@
+package LocalConfig;
+
+use strict;
+use warnings;
+
+use base 'Exporter';
+our @EXPORT = qw( qsub_args mert_memory environment_name );
+
+use Net::Domain qw(hostname hostfqdn hostdomain domainname);
+
+my $host = domainname;
+
+# Per-environment settings, keyed by environment name. Recognized keys: HOST_REGEXP (required, matched against $host), QSubMemFlag (needed by qsub_args), MERTMem, QSubQueue, QSubExtraFlags
+my $CCONFIG = {
+ 'LTICluster' => {
+ 'HOST_REGEXP' => qr/^cluster\d+\.lti\.cs\.cmu\.edu$/,
+ 'QSubMemFlag' => '-l pmem=', # the memory amount is appended directly after the '='
+ 'QSubQueue' => '-q long',
+ },
+ 'UMIACS' => {
+ 'HOST_REGEXP' => qr/^d.*\.umiacs\.umd\.edu$/,
+ 'QSubMemFlag' => '-l pmem=',
+ 'QSubQueue' => '-q batch',
+ 'QSubExtraFlags' => '-l walltime=144:00:00',
+ },
+ 'CLSP' => {
+ 'HOST_REGEXP' => qr/\.clsp\.jhu\.edu$/, # unanchored at the start: any host under this domain matches
+ 'QSubMemFlag' => '-l mem_free=',
+ 'MERTMem' => '9G', # overrides the '2G' fallback in mert_memory
+ },
+ 'Valhalla' => {
+ 'HOST_REGEXP' => qr/^(thor|tyr)\.inf\.ed\.ac\.uk$/, # no QSubMemFlag is set here, so qsub_args dies in this environment
+ },
+};
+
+our $senvironment_name; # name of the $CCONFIG entry whose HOST_REGEXP matches $host; set by the loop below
+for my $config_key (keys %$CCONFIG) {
+ my $re = $CCONFIG->{$config_key}->{'HOST_REGEXP'};
+ die "Can't find HOST_REGEXP for $config_key" unless $re; # every entry must say which hosts it applies to
+ if ($host =~ /$re/) {
+ $senvironment_name = $config_key; # the loop does not stop here: if several entries match, whichever keys() yields last wins
+ }
+}
+
+die "NO ENVIRONMENT INFO FOR HOST: $host\nPLEASE EDIT LocalConfig.pm\n" unless $senvironment_name; # refuse to continue on an unrecognized host
+
+our %CONFIG = %{$CCONFIG->{$senvironment_name}}; # shallow copy of the detected environment's settings, read by the subs below
+print STDERR "**Environment: $senvironment_name\n"; # part of the package's top-level code, not of any sub
+
+sub environment_name { # Returns the name of the detected environment (a key of $CCONFIG); takes no arguments.
+ return $senvironment_name;
+}
+
+sub qsub_args { # Builds the environment-specific qsub command prefix for a job that needs the given amount of memory.
+ my $mem = shift @_; # memory amount as a string, e.g. 4G; appended verbatim to the memory flag
+ die "qsub_args requires a memory amount as a parameter, e.g. 4G" unless $mem;
+ my $mf = $CONFIG{'QSubMemFlag'} or die "QSubMemFlag not set for $senvironment_name"; # environments without a memory flag cannot submit jobs through this helper
+ my $cmd = "qsub -S /bin/bash ${mf}${mem}";
+ if ($CONFIG{'QSubQueue'}) { $cmd .= ' ' . $CONFIG{'QSubQueue'}; } # optional queue selection
+ if ($CONFIG{'QSubExtraFlags'}) { $cmd .= ' ' . $CONFIG{'QSubExtraFlags'}; } # optional extra flags, e.g. a walltime limit
+ return $cmd; # returned as a single string; nothing is executed here
+}
+
+sub mert_memory { # Returns the detected environment's MERTMem setting, or '2G' when it is unset or false.
+ return ($CONFIG{'MERTMem'} || '2G');
+};
+
+1;
diff --git a/gi/pipeline/evaluation-pipeline.pl b/gi/pipeline/evaluation-pipeline.pl
index 2660155f..4b4529d9 100755
--- a/gi/pipeline/evaluation-pipeline.pl
+++ b/gi/pipeline/evaluation-pipeline.pl
@@ -4,11 +4,12 @@ use Getopt::Long;
use Cwd;
my $CWD = getcwd;
-my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR; }
+my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../../environment"; }
+use LocalConfig;
my $JOBS = 15;
my $PMEM = "9G";
-my $NUM_TRANSLATIONS = 30;
+my $NUM_TRANSLATIONS = 50;
my $GOAL = "S";
# featurize_grammar may add multiple features from a single feature extractor
@@ -75,17 +76,7 @@ assert_exec($CDEC, $PARALLELIZE, $FILTER, $FEATURIZE, $DISTVEST, $FILTERBYF);
my $numtopics = 25;
-my $config = "$SCRIPT_DIR/clsp.config";
-if ((scalar @ARGV) >= 2 && ($ARGV[0] eq '-c')) {
- $config = $ARGV[1];
- shift @ARGV; shift @ARGV;
- unless (-f $config) {
- $config = "$SCRIPT_DIR/$config";
- unless (-f $config) {
- $config .= ".config";
- }
- }
-}
+my $config = "$SCRIPT_DIR/" . (lc environment_name()) . '.config';
print STDERR "CORPORA CONFIGURATION: $config\n";
open CONF, "<$config" or die "Can't read $config: $!";
my %paths;
@@ -128,8 +119,10 @@ my $gluegram;
my $oovgram;
my $usefork;
my $lmorder = 3;
+my $density;
if (GetOptions(
"backoff-grammar=s" => \$bkoffgram,
+ "density-prune=f" => \$density,
"glue-grammar=s" => \$gluegram,
"oov-grammar=s" => \$oovgram,
"data=s" => \$dataDir,
@@ -145,6 +138,10 @@ if (GetOptions(
print_help();
exit;
}
+my $DENSITY_PRUNE = '';
+if ($density) {
+ $DENSITY_PRUNE = "--density-prune $density";
+}
if ($usefork) { $usefork="--use-fork"; } else { $usefork = ''; }
my @fkeys = keys %$feat_map;
die "You must specify one or more features with -f. Known features: @fkeys\n" unless scalar @features > 0;
@@ -228,7 +225,7 @@ my $tuned_weights = mydircat($outdir, 'weights.tuned');
if (-f $tuned_weights) {
print STDERR "TUNED WEIGHTS $tuned_weights EXISTS: REUSING\n";
} else {
- my $cmd = "$DISTVEST $usefork --decode-nodes $JOBS --pmem=$PMEM --ref-files=$drefs --source-file=$dev --weights $weights $devini";
+ my $cmd = "$DISTVEST $usefork $DENSITY_PRUNE --decode-nodes $JOBS --pmem=$PMEM --ref-files=$drefs --source-file=$dev --weights $weights $devini";
print STDERR "MERT COMMAND: $cmd\n";
`rm -rf $outdir/vest 2> /dev/null`;
chdir $outdir or die "Can't chdir to $outdir: $!";
@@ -265,7 +262,7 @@ sub write_random_weights_file {
open F, ">$file" or die "Can't write $file: $!";
my @feats = (@DEFAULT_FEATS, @extras);
for my $feat (@feats) {
- my $r = rand(1.6);
+ my $r = rand(0.4) + 0.8;
my $w = $init_weights{$feat} * $r;
if ($w == 0) { $w = 0.0001; print STDERR "WARNING: $feat had no initial weight!\n"; }
print F "$feat $w\n";
diff --git a/gi/pipeline/lticluster.config b/gi/pipeline/lticluster.config
new file mode 100644
index 00000000..3e23c8cb
--- /dev/null
+++ b/gi/pipeline/lticluster.config
@@ -0,0 +1,9 @@
+# THIS FILE GIVES THE LOCATIONS OF THE CORPORA USED
+# name path aligned-corpus LM dev dev-refs test1 test1-eval.sh ...
+/home/cdyer/ws10smt-data
+btec /home/cdyer/ws10smt-data/btec/ split.zh-en.al lm/en.3gram.lm.gz devtest/devset1_2.zh devtest/devset1_2.lc.en* devtest/devset3.zh eval-devset3.sh
+zhen /home/cdyer/ws10smt-data/chinese-english corpus.zh-en.al lm/c2e.3gram.lm.gz dev_and_test/mt02.src.txt dev_and_test/mt02.ref.* dev_and_test/mt03.src.txt eval-mt03.sh
+aren /home/cdyer/ws10smt-data/arabic-english corpus.ar-en-al lm/a2e.3gram.lm.gz dev_and_test/dev.src.txt dev_and_test/dev.ref.txt.* dev_and_test/mt05.src.txt eval-mt05.sh
+uren /home/cdyer/ws10smt-data/urdu-english corpus.ur-en.al lm/u2e.en.lm.gz dev/dev.ur dev/dev.en* devtest/devtest.ur eval-devtest.sh
+nlfr /home/cdyer/ws10smt-data/dutch-french corpus.nl-fr.al
+
diff --git a/vest/dist-vest.pl b/vest/dist-vest.pl
index b66b8e9c..8f7f2053 100755
--- a/vest/dist-vest.pl
+++ b/vest/dist-vest.pl
@@ -3,13 +3,13 @@
use strict;
my @ORIG_ARGV=@ARGV;
use Cwd qw(getcwd);
-my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR; }
+my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../environment"; }
+use LocalConfig;
use Getopt::Long;
use IPC::Open2;
use strict;
use POSIX ":sys_wait_h";
-#my $QSUB_FLAGS = "-q batch -l pmem=3000mb,walltime=5:00:00";
-my $QSUB_FLAGS = "-l mem_free=9G";
+my $QSUB_CMD = qsub_args(mert_memory());
# Default settings
my $srcFile;
@@ -24,6 +24,8 @@ my $REDUCER = "$bin_dir/mr_vest_reduce";
my $parallelize = "$bin_dir/parallelize.pl";
my $sentserver = "$bin_dir/sentserver";
my $sentclient = "$bin_dir/sentclient";
+my $LocalConfig = "$SCRIPT_DIR/../environment/LocalConfig.pm";
+
my $SCORER = $FAST_SCORE;
die "Can't find $MAPPER" unless -x $MAPPER;
my $cdec = "$bin_dir/../decoder/cdec";
@@ -197,7 +199,7 @@ if ($dryrun){
} else {
-e $dir || mkdir $dir;
mkdir "$dir/hgs";
- modbin("$dir/bin",\$cdec,\$SCORER,\$MAPINPUT,\$MAPPER,\$REDUCER,\$parallelize,\$sentserver,\$sentclient) if $cpbin;
+ modbin("$dir/bin",\$LocalConfig,\$cdec,\$SCORER,\$MAPINPUT,\$MAPPER,\$REDUCER,\$parallelize,\$sentserver,\$sentclient) if $cpbin;
mkdir "$dir/scripts";
my $cmdfile="$dir/rerun-vest.sh";
open CMD,'>',$cmdfile;
@@ -276,6 +278,8 @@ while (1){
print STDERR "ERROR: Parallel decoder returned non-zero exit code $result\n";
die;
}
+ my $num_hgs = `ls $dir/hgs/*.gz | wc -l`;
+ print STDERR "HGs: $num_hgs\n";
my $dec_score = `cat $runFile | $SCORER $refs_comma_sep -l $metric`;
chomp $dec_score;
print STDERR "DECODER SCORE: $dec_score\n";
@@ -343,20 +347,21 @@ while (1){
die "ERROR: mapper returned non-zero exit code $result\n";
}
} else {
- my $script_file = "$dir/scripts/map.$shard";
- open F, ">$script_file" or die "Can't write $script_file: $!";
- print F "$script\n";
- close F;
+ my $script_file = "$dir/scripts/map.$shard";
+ open F, ">$script_file" or die "Can't write $script_file: $!";
+ print F "$script\n";
+ close F;
if ($first_shard) { print STDERR "$script\n"; $first_shard=0; }
$nmappers++;
- my $jobid = `qsub $QSUB_FLAGS -S /bin/bash -N $client_name -o /dev/null -e $logdir/$client_name.ER $script_file`;
- die "qsub failed: $!" unless $? == 0;
- chomp $jobid;
+ my $qcmd = "$QSUB_CMD -N $client_name -o /dev/null -e $logdir/$client_name.ER $script_file";
+ my $jobid = `$qcmd`;
+ die "qsub failed: $!\nCMD was: $qcmd" unless $? == 0;
+ chomp $jobid;
$jobid =~ s/^(\d+)(.*?)$/\1/g;
- $jobid =~ s/^Your job (\d+) .*$/\1/;
- push(@cleanupcmds, "`qdel $jobid 2> /dev/null`");
- print STDERR " $jobid";
+ $jobid =~ s/^Your job (\d+) .*$/\1/;
+ push(@cleanupcmds, "`qdel $jobid 2> /dev/null`");
+ print STDERR " $jobid";
if ($joblist == "") { $joblist = $jobid; }
else {$joblist = $joblist . "\|" . $jobid; }
}
@@ -368,7 +373,7 @@ while (1){
print STDERR "Waiting for mappers to complete...\n";
while ($nmappers > 0) {
sleep 5;
- my @livejobs = grep(/$joblist/, split(/\n/, `qstat`));
+ my @livejobs = grep(/$joblist/, split(/\n/, `qstat | grep -v ' C '`));
$nmappers = scalar @livejobs;
}
print STDERR "All mappers complete.\n";
diff --git a/vest/parallelize.pl b/vest/parallelize.pl
index a5a40704..daaf9b2f 100755
--- a/vest/parallelize.pl
+++ b/vest/parallelize.pl
@@ -18,6 +18,9 @@
#ANNOYANCE: if input is shorter than -j n lines, or at the very last few lines, repeatedly sleeps. time cut down to 15s from 60s
+my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../environment"; }
+use LocalConfig;
+
use File::Temp qw/ tempfile /;
use Getopt::Long;
use IPC::Open2;
@@ -303,7 +306,7 @@ sub launch_job {
push @errors,$errorfile;
push @outs,$outfile;
}
- my $todo = "qsub -l mem_free=$pmem -N $clientname -o $outfile -e $errorfile";
+ my $todo = qsub_args($pmem) . " -N $clientname -o $outfile -e $errorfile";
push @cmds,$todo;
print STDERR "Running: $todo\n";