From 2cc482a64c687ef9505449e903776a1f1e595343 Mon Sep 17 00:00:00 2001 From: redpony Date: Tue, 10 Aug 2010 20:03:53 +0000 Subject: support for running in multiple environments which are automatically detected git-svn-id: https://ws10smt.googlecode.com/svn/trunk@501 ec762483-ff6d-05da-a07a-a48fb63a330f --- configure.ac | 8 +++-- decoder/dict.cc | 22 ++++++++++-- environment/LocalConfig.pm | 68 ++++++++++++++++++++++++++++++++++++++ gi/pipeline/evaluation-pipeline.pl | 27 +++++++-------- gi/pipeline/lticluster.config | 9 +++++ vest/dist-vest.pl | 35 +++++++++++--------- vest/parallelize.pl | 5 ++- 7 files changed, 137 insertions(+), 37 deletions(-) create mode 100644 environment/LocalConfig.pm create mode 100644 gi/pipeline/lticluster.config diff --git a/configure.ac b/configure.ac index 54d0c6bf..e627c1cc 100644 --- a/configure.ac +++ b/configure.ac @@ -8,11 +8,13 @@ AC_PROG_CXX AC_LANG_CPLUSPLUS BOOST_REQUIRE BOOST_PROGRAM_OPTIONS -BOOST_REGEX +# BOOST_REGEX BOOST_THREADS CPPFLAGS="$CPPFLAGS $BOOST_CPPFLAGS" -LDFLAGS="$LDFLAGS $BOOST_PROGRAM_OPTIONS_LDFLAGS $BOOST_REGEX_LDFLAGS $BOOST_THREAD_LDFLAGS" -LIBS="$LIBS $BOOST_PROGRAM_OPTIONS_LIBS $BOOST_REGEX_LIBS $BOOST_THREAD_LIBS" +#LDFLAGS="$LDFLAGS $BOOST_PROGRAM_OPTIONS_LDFLAGS $BOOST_REGEX_LDFLAGS $BOOST_THREAD_LDFLAGS" +#LIBS="$LIBS $BOOST_PROGRAM_OPTIONS_LIBS $BOOST_REGEX_LIBS $BOOST_THREAD_LIBS" +LDFLAGS="$LDFLAGS $BOOST_PROGRAM_OPTIONS_LDFLAGS $BOOST_THREAD_LDFLAGS" +LIBS="$LIBS $BOOST_PROGRAM_OPTIONS_LIBS $BOOST_THREAD_LIBS" AC_CHECK_HEADER(boost/math/special_functions/digamma.hpp, [AC_DEFINE([HAVE_BOOST_DIGAMMA], [], [flag for boost::math::digamma])]) diff --git a/decoder/dict.cc b/decoder/dict.cc index 4a842f82..2d6986c8 100644 --- a/decoder/dict.cc +++ b/decoder/dict.cc @@ -2,10 +2,26 @@ #include <string> #include <vector> -#include <boost/regex.hpp> -#include <boost/algorithm/string/regex.hpp> + +void TokenizeStringSeparator( + const std::string& str, + const std::string& separator, + std::vector<std::string>* tokens) { + + size_t pos = 0; + std::string::size_type nextPos = 
str.find(separator, pos); + + while (nextPos != std::string::npos) { + tokens->push_back(str.substr(pos, nextPos - pos)); + pos = nextPos + separator.size(); + nextPos = str.find(separator, pos); + } + tokens->push_back(str.substr(pos, nextPos - pos)); +} + void Dict::AsVector(const WordID& id, std::vector<std::string>* results) const { - boost::algorithm::split_regex(*results, Convert(id), boost::regex("\\s\\|\\|\\|\\s")); + results->clear(); + TokenizeStringSeparator(Convert(id), " ||| ", results); } diff --git a/environment/LocalConfig.pm b/environment/LocalConfig.pm new file mode 100644 index 00000000..e4269361 --- /dev/null +++ b/environment/LocalConfig.pm @@ -0,0 +1,68 @@ +package LocalConfig; + +use strict; +use warnings; + +use base 'Exporter'; +our @EXPORT = qw( qsub_args mert_memory environment_name ); + +use Net::Domain qw(hostname hostfqdn hostdomain domainname); + +my $host = domainname; + +# keys are: HOST_REGEXP, MERTMem, QSubQueue, QSubMemFlag, QSubExtraFlags +my $CCONFIG = { + 'LTICluster' => { + 'HOST_REGEXP' => qr/^cluster\d+\.lti\.cs\.cmu\.edu$/, + 'QSubMemFlag' => '-l pmem=', + 'QSubQueue' => '-q long', + }, + 'UMIACS' => { + 'HOST_REGEXP' => qr/^d.*\.umiacs\.umd\.edu$/, + 'QSubMemFlag' => '-l pmem=', + 'QSubQueue' => '-q batch', + 'QSubExtraFlags' => '-l walltime=144:00:00', + }, + 'CLSP' => { + 'HOST_REGEXP' => qr/\.clsp\.jhu\.edu$/, + 'QSubMemFlag' => '-l mem_free=', + 'MERTMem' => '9G', + }, + 'Valhalla' => { + 'HOST_REGEXP' => qr/^(thor|tyr)\.inf\.ed\.ac\.uk$/, + }, +}; + +our $senvironment_name; +for my $config_key (keys %$CCONFIG) { + my $re = $CCONFIG->{$config_key}->{'HOST_REGEXP'}; + die "Can't find HOST_REGEXP for $config_key" unless $re; + if ($host =~ /$re/) { + $senvironment_name = $config_key; + } +} + +die "NO ENVIRONMENT INFO FOR HOST: $host\nPLEASE EDIT LocalConfig.pm\n" unless $senvironment_name; + +our %CONFIG = %{$CCONFIG->{$senvironment_name}}; +print STDERR "**Environment: $senvironment_name\n"; + +sub environment_name { + return 
$senvironment_name; +} + +sub qsub_args { + my $mem = shift @_; + die "qsub_args requires a memory amount as a parameter, e.g. 4G" unless $mem; + my $mf = $CONFIG{'QSubMemFlag'} or die "QSubMemFlag not set for $senvironment_name"; + my $cmd = "qsub -S /bin/bash ${mf}${mem}"; + if ($CONFIG{'QSubQueue'}) { $cmd .= ' ' . $CONFIG{'QSubQueue'}; } + if ($CONFIG{'QSubExtraFlags'}) { $cmd .= ' ' . $CONFIG{'QSubExtraFlags'}; } + return $cmd; +} + +sub mert_memory { + return ($CONFIG{'MERTMem'} || '2G'); +}; + +1; diff --git a/gi/pipeline/evaluation-pipeline.pl b/gi/pipeline/evaluation-pipeline.pl index 2660155f..4b4529d9 100755 --- a/gi/pipeline/evaluation-pipeline.pl +++ b/gi/pipeline/evaluation-pipeline.pl @@ -4,11 +4,12 @@ use Getopt::Long; use Cwd; my $CWD = getcwd; -my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR; } +my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../../environment"; } +use LocalConfig; my $JOBS = 15; my $PMEM = "9G"; -my $NUM_TRANSLATIONS = 30; +my $NUM_TRANSLATIONS = 50; my $GOAL = "S"; # featurize_grammar may add multiple features from a single feature extractor @@ -75,17 +76,7 @@ assert_exec($CDEC, $PARALLELIZE, $FILTER, $FEATURIZE, $DISTVEST, $FILTERBYF); my $numtopics = 25; -my $config = "$SCRIPT_DIR/clsp.config"; -if ((scalar @ARGV) >= 2 && ($ARGV[0] eq '-c')) { - $config = $ARGV[1]; - shift @ARGV; shift @ARGV; - unless (-f $config) { - $config = "$SCRIPT_DIR/$config"; - unless (-f $config) { - $config .= ".config"; - } - } -} +my $config = "$SCRIPT_DIR/" . (lc environment_name()) . 
'.config'; print STDERR "CORPORA CONFIGURATION: $config\n"; open CONF, "<$config" or die "Can't read $config: $!"; my %paths; @@ -128,8 +119,10 @@ my $gluegram; my $oovgram; my $usefork; my $lmorder = 3; +my $density; if (GetOptions( "backoff-grammar=s" => \$bkoffgram, + "density-prune=f" => \$density, "glue-grammar=s" => \$gluegram, "oov-grammar=s" => \$oovgram, "data=s" => \$dataDir, @@ -145,6 +138,10 @@ if (GetOptions( print_help(); exit; } +my $DENSITY_PRUNE = ''; +if ($density) { + $DENSITY_PRUNE = "--density-prune $density"; +} if ($usefork) { $usefork="--use-fork"; } else { $usefork = ''; } my @fkeys = keys %$feat_map; die "You must specify one or more features with -f. Known features: @fkeys\n" unless scalar @features > 0; @@ -228,7 +225,7 @@ my $tuned_weights = mydircat($outdir, 'weights.tuned'); if (-f $tuned_weights) { print STDERR "TUNED WEIGHTS $tuned_weights EXISTS: REUSING\n"; } else { - my $cmd = "$DISTVEST $usefork --decode-nodes $JOBS --pmem=$PMEM --ref-files=$drefs --source-file=$dev --weights $weights $devini"; + my $cmd = "$DISTVEST $usefork $DENSITY_PRUNE --decode-nodes $JOBS --pmem=$PMEM --ref-files=$drefs --source-file=$dev --weights $weights $devini"; print STDERR "MERT COMMAND: $cmd\n"; `rm -rf $outdir/vest 2> /dev/null`; chdir $outdir or die "Can't chdir to $outdir: $!"; @@ -265,7 +262,7 @@ sub write_random_weights_file { open F, ">$file" or die "Can't write $file: $!"; my @feats = (@DEFAULT_FEATS, @extras); for my $feat (@feats) { - my $r = rand(1.6); + my $r = rand(0.4) + 0.8; my $w = $init_weights{$feat} * $r; if ($w == 0) { $w = 0.0001; print STDERR "WARNING: $feat had no initial weight!\n"; } print F "$feat $w\n"; diff --git a/gi/pipeline/lticluster.config b/gi/pipeline/lticluster.config new file mode 100644 index 00000000..3e23c8cb --- /dev/null +++ b/gi/pipeline/lticluster.config @@ -0,0 +1,9 @@ +# THIS FILE GIVES THE LOCATIONS OF THE CORPORA USED +# name path aligned-corpus LM dev dev-refs test1 testt-eval.sh ... 
+/home/cdyer/ws10smt-data +btec /home/cdyer/ws10smt-data/btec/ split.zh-en.al lm/en.3gram.lm.gz devtest/devset1_2.zh devtest/devset1_2.lc.en* devtest/devset3.zh eval-devset3.sh +zhen /home/cdyer/ws10smt-data/chinese-english corpus.zh-en.al lm/c2e.3gram.lm.gz dev_and_test/mt02.src.txt dev_and_test/mt02.ref.* dev_and_test/mt03.src.txt eval-mt03.sh +aren /home/cdyer/ws10smt-data/arabic-english corpus.ar-en-al lm/a2e.3gram.lm.gz dev_and_test/dev.src.txt dev_and_test/dev.ref.txt.* dev_and_test/mt05.src.txt eval-mt05.sh +uren /home/cdyer/ws10smt-data/urdu-english corpus.ur-en.al lm/u2e.en.lm.gz dev/dev.ur dev/dev.en* devtest/devtest.ur eval-devtest.sh +nlfr /home/cdyer/ws10smt-data/dutch-french corpus.nl-fr.al + diff --git a/vest/dist-vest.pl b/vest/dist-vest.pl index b66b8e9c..8f7f2053 100755 --- a/vest/dist-vest.pl +++ b/vest/dist-vest.pl @@ -3,13 +3,13 @@ use strict; my @ORIG_ARGV=@ARGV; use Cwd qw(getcwd); -my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR; } +my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../environment"; } +use LocalConfig; use Getopt::Long; use IPC::Open2; use strict; use POSIX ":sys_wait_h"; -#my $QSUB_FLAGS = "-q batch -l pmem=3000mb,walltime=5:00:00"; -my $QSUB_FLAGS = "-l mem_free=9G"; +my $QSUB_CMD = qsub_args(mert_memory()); # Default settings my $srcFile; @@ -24,6 +24,8 @@ my $REDUCER = "$bin_dir/mr_vest_reduce"; my $parallelize = "$bin_dir/parallelize.pl"; my $sentserver = "$bin_dir/sentserver"; my $sentclient = "$bin_dir/sentclient"; +my $LocalConfig = "$SCRIPT_DIR/../environment/LocalConfig.pm"; + my $SCORER = $FAST_SCORE; die "Can't find $MAPPER" unless -x $MAPPER; my $cdec = "$bin_dir/../decoder/cdec"; @@ -197,7 +199,7 @@ if ($dryrun){ } else { -e $dir || mkdir $dir; mkdir "$dir/hgs"; - 
modbin("$dir/bin",\$cdec,\$SCORER,\$MAPINPUT,\$MAPPER,\$REDUCER,\$parallelize,\$sentserver,\$sentclient) if $cpbin; + modbin("$dir/bin",\$LocalConfig,\$cdec,\$SCORER,\$MAPINPUT,\$MAPPER,\$REDUCER,\$parallelize,\$sentserver,\$sentclient) if $cpbin; mkdir "$dir/scripts"; my $cmdfile="$dir/rerun-vest.sh"; open CMD,'>',$cmdfile; @@ -276,6 +278,8 @@ while (1){ print STDERR "ERROR: Parallel decoder returned non-zero exit code $result\n"; die; } + my $num_hgs = `ls $dir/hgs/*.gz | wc -l`; + print STDERR "HGs: $num_hgs\n"; my $dec_score = `cat $runFile | $SCORER $refs_comma_sep -l $metric`; chomp $dec_score; print STDERR "DECODER SCORE: $dec_score\n"; @@ -343,20 +347,21 @@ while (1){ die "ERROR: mapper returned non-zero exit code $result\n"; } } else { - my $script_file = "$dir/scripts/map.$shard"; - open F, ">$script_file" or die "Can't write $script_file: $!"; - print F "$script\n"; - close F; + my $script_file = "$dir/scripts/map.$shard"; + open F, ">$script_file" or die "Can't write $script_file: $!"; + print F "$script\n"; + close F; if ($first_shard) { print STDERR "$script\n"; $first_shard=0; } $nmappers++; - my $jobid = `qsub $QSUB_FLAGS -S /bin/bash -N $client_name -o /dev/null -e $logdir/$client_name.ER $script_file`; - die "qsub failed: $!" unless $? == 0; - chomp $jobid; + my $qcmd = "$QSUB_CMD -N $client_name -o /dev/null -e $logdir/$client_name.ER $script_file"; + my $jobid = `$qcmd`; + die "qsub failed: $!\nCMD was: $qcmd" unless $? == 0; + chomp $jobid; $jobid =~ s/^(\d+)(.*?)$/\1/g; - $jobid =~ s/^Your job (\d+) .*$/\1/; - push(@cleanupcmds, "`qdel $jobid 2> /dev/null`"); - print STDERR " $jobid"; + $jobid =~ s/^Your job (\d+) .*$/\1/; + push(@cleanupcmds, "`qdel $jobid 2> /dev/null`"); + print STDERR " $jobid"; if ($joblist == "") { $joblist = $jobid; } else {$joblist = $joblist . "\|" . 
$jobid; } } @@ -368,7 +373,7 @@ while (1){ print STDERR "Waiting for mappers to complete...\n"; while ($nmappers > 0) { sleep 5; - my @livejobs = grep(/$joblist/, split(/\n/, `qstat`)); + my @livejobs = grep(/$joblist/, split(/\n/, `qstat | grep -v ' C '`)); $nmappers = scalar @livejobs; } print STDERR "All mappers complete.\n"; diff --git a/vest/parallelize.pl b/vest/parallelize.pl index a5a40704..daaf9b2f 100755 --- a/vest/parallelize.pl +++ b/vest/parallelize.pl @@ -18,6 +18,9 @@ #ANNOYANCE: if input is shorter than -j n lines, or at the very last few lines, repeatedly sleeps. time cut down to 15s from 60s +my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../environment"; } +use LocalConfig; + use File::Temp qw/ tempfile /; use Getopt::Long; use IPC::Open2; @@ -303,7 +306,7 @@ sub launch_job { push @errors,$errorfile; push @outs,$outfile; } - my $todo = "qsub -l mem_free=$pmem -N $clientname -o $outfile -e $errorfile"; + my $todo = qsub_args($pmem) . " -N $clientname -o $outfile -e $errorfile"; push @cmds,$todo; print STDERR "Running: $todo\n"; -- cgit v1.2.3