From 9f821dd7f08da5a146e14863b3d49ae16e6739f4 Mon Sep 17 00:00:00 2001 From: Jonathan Clark Date: Thu, 10 Mar 2011 17:09:21 -0500 Subject: initial version of scons --- Makefile.am | 2 +- SConstruct | 54 ++++++++++++++++++++++ decoder/Makefile.am | 4 +- decoder/ff_wordset.h | 6 ++- environment/LocalConfig.pm | 4 ++ vest/dist-vest.pl | 112 +++++++++++++++++++-------------------------- vest/parallelize.pl | 33 +++++++------ 7 files changed, 129 insertions(+), 86 deletions(-) create mode 100644 SConstruct diff --git a/Makefile.am b/Makefile.am index a808c211..0e7b3885 100644 --- a/Makefile.am +++ b/Makefile.am @@ -5,5 +5,5 @@ SUBDIRS = utils mteval klm/util klm/lm decoder phrasinator training vest extools AUTOMAKE_OPTIONS = foreign ACLOCAL_AMFLAGS = -I m4 -AM_CPPFLAGS = -D_GLIBCXX_PARALLEL +AM_CPPFLAGS = -D_GLIBCXX_PARALLEL diff --git a/SConstruct b/SConstruct new file mode 100644 index 00000000..e61af175 --- /dev/null +++ b/SConstruct @@ -0,0 +1,54 @@ +AddOption('--prefix', dest='prefix', type='string', nargs=1, action='store', metavar='DIR', + help='installation prefix') +AddOption('--with-boost', dest='boost', type='string', nargs=1, action='store', metavar='DIR', + help='boost installation directory (if in a non-standard location)') +AddOption('--with-glc', dest='glc', type='string', nargs=1, action='store', metavar='DIR', + help='path to Global Lexical Coherence package (optional)') +AddOption('--efence', dest='efence', action='store_true', + help='use electric fence for debugging memory corruptions') + +platform = ARGUMENTS.get('OS', Platform()) +include = Split('decoder utils klm mteval .') +env = Environment(PREFIX=GetOption('prefix'), + PLATFORM = platform, +# BINDIR = bin, +# INCDIR = include, +# LIBDIR = lib, + CPPPATH = include, + LIBPATH = [], + LIBS = Split('boost_program_options boost_serialization boost_thread z'), + CCFLAGS=Split('-g -O3')) + +boost = GetOption('boost') +if boost: + print 'Using Boost at {0}'.format(boost) + env.Append(CPPPATH=boost+'/include', + LIBPATH=boost+'/lib') + +if GetOption('efence'): + env.Append(LIBS=Split('efence Segfault')) + +srcs = [] + +# TODO: Get rid of config.h + +glc = GetOption('glc') +if glc: + print 'Using Global Lexical Coherence package at {0}'.format(glc) + env.Append(CCFLAGS='-DHAVE_GLC', + CPPPATH=[glc, glc+'/cdec']) + srcs.append(glc+'/string_util.cc') + srcs.append(glc+'/feature-factory.cc') + srcs.append(glc+'/cdec/ff_glc.cc') + +for pattern in ['decoder/*.cc', 'decoder/*.c', 'klm/*/*.cc', 'utils/*.cc', 'mteval/*.cc']: + srcs.extend([ file for file in Glob(pattern) + if not 'test' in str(file) + and 'build_binary.cc' not in str(file) + and 'ngram_query.cc' not in str(file) + and 'mbr_kbest.cc' not in str(file) + and 'sri.cc' not in str(file) + and 'fast_score.cc' not in str(file) + ]) + +env.Program(target='decoder/cdec', source=srcs) diff --git a/decoder/Makefile.am b/decoder/Makefile.am index 9cf4c3c4..e1dba497 100644 --- a/decoder/Makefile.am +++ b/decoder/Makefile.am @@ -26,7 +26,7 @@ hg_test_SOURCES = hg_test.cc hg_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libcdec.a ../mteval/libmteval.a ../utils/libutils.a -lz trule_test_SOURCES = trule_test.cc trule_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libcdec.a ../mteval/libmteval.a ../utils/libutils.a -lz -AM_CPPFLAGS = -W -Wall -Wno-sign-compare $(GTEST_CPPFLAGS) -I.. -I../mteval -I../utils -I../klm +AM_CPPFLAGS = -W -Wno-sign-compare $(GTEST_CPPFLAGS) -I.. -I../mteval -I../utils -I../klm rule_lexer.cc: rule_lexer.l $(LEX) -s -CF -8 -o$@ $< @@ -82,5 +82,5 @@ libcdec_a_SOURCES = \ if GLC # Until we build GLC as a library... - libcdec_a_SOURCES += ff_glc.cc + libcdec_a_SOURCES += ff_glc.cc string_util.cc feature-factory.cc endif diff --git a/decoder/ff_wordset.h b/decoder/ff_wordset.h index 256d54bb..00e1145b 100644 --- a/decoder/ff_wordset.h +++ b/decoder/ff_wordset.h @@ -13,13 +13,12 @@ class WordSet : public FeatureFunction { public: - // we depend on the order of the initializer list // to call member constructurs in the proper order // modify this carefully! // // Usage: "WordSet -v vocab.txt [--oov]" - WordSet(const std::string& param) { + WordSet(const std::string& param) { std::string vocabFile; std::string featName; parseArgs(param, &featName, &vocabFile, &oovMode_); @@ -30,6 +29,9 @@ class WordSet : public FeatureFunction { loadVocab(vocabFile, &vocab_); } + ~WordSet() { + } + protected: virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta, diff --git a/environment/LocalConfig.pm b/environment/LocalConfig.pm index 10933f36..dd3ef761 100644 --- a/environment/LocalConfig.pm +++ b/environment/LocalConfig.pm @@ -36,6 +36,10 @@ my $CCONFIG = { 'HOST_REGEXP' => qr/^(blacklight.psc.edu|bl1.psc.teragrid.org|bl0.psc.teragrid.org)$/, 'QSubMemFlag' => '-l pmem=', }, + 'Barrow/Chicago' => { + 'HOST_REGEXP' => qr/^(barrow|chicago).lti.cs.cmu.edu$/, + 'QSubMemFlag' => '-l pmem=', + }, 'LOCAL' => { 'HOST_REGEXP' => qr/local\.net$/, 'QSubMemFlag' => '', diff --git a/vest/dist-vest.pl b/vest/dist-vest.pl index 2a56dd55..973a29ef 100755 --- a/vest/dist-vest.pl +++ b/vest/dist-vest.pl @@ -1,16 +1,18 @@ #!/usr/bin/env perl - use strict; my @ORIG_ARGV=@ARGV; use Cwd qw(getcwd); my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../environment"; } + +# Skip local config (used for distributing jobs) if we're running in local-only mode use LocalConfig; use Getopt::Long; use IPC::Open2; -use strict; use POSIX ":sys_wait_h"; my $QSUB_CMD = qsub_args(mert_memory()); +require "libcall.pl"; + # Default settings my $srcFile; my $refFiles; @@ -22,6 +24,7 @@ my $MAPINPUT = "$bin_dir/mr_vest_generate_mapper_input"; my $MAPPER = "$bin_dir/mr_vest_map"; my $REDUCER = "$bin_dir/mr_vest_reduce"; my $parallelize = "$bin_dir/parallelize.pl"; +my $libcall = "$bin_dir/libcall.pl"; my $sentserver = "$bin_dir/sentserver"; my $sentclient = "$bin_dir/sentclient"; my $LocalConfig = "$SCRIPT_DIR/../environment/LocalConfig.pm"; @@ -31,6 +34,7 @@ die "Can't find $MAPPER" unless -x $MAPPER; my $cdec = "$bin_dir/../decoder/cdec"; die "Can't find decoder in $cdec" unless -x $cdec; die "Can't find $parallelize" unless -x $parallelize; +die "Can't find $libcall" unless -e $libcall; my $decoder = $cdec; my $lines_per_mapper = 400; my $rand_directions = 15; @@ -124,7 +128,7 @@ sub enseg; sub print_help; my $nodelist; -my $host =`hostname`; chomp $host; +my $host =check_output("hostname"); chomp $host; my $bleu; my $interval_count = 0; my $logfile; @@ -142,7 +146,7 @@ unless ($dir){ $dir = "vest"; } unless ($dir =~ /^\//){ # convert relative path to absolute path - my $basedir = `pwd`; + my $basedir = check_output("pwd"); chomp $basedir; $dir = "$basedir/$dir"; } @@ -158,15 +162,18 @@ my @cleanupcmds = (); sub cleanup { print STDERR "Cleanup...\n"; - for my $pid (@childpids){ `kill $pid`; } - for my $cmd (@cleanupcmds){`$cmd`; } + for my $pid (@childpids){ unchecked_call("kill $pid"); } + for my $cmd (@cleanupcmds){ unchecked_call("$cmd"); } exit 1; }; +# Always call cleanup, no matter how we exit +*CORE::GLOBAL::exit = + sub{ cleanup(); }; $SIG{INT} = "cleanup"; $SIG{TERM} = "cleanup"; $SIG{HUP} = "cleanup"; -my $decoderBase = `basename $decoder`; chomp $decoderBase; +my $decoderBase = check_output("basename $decoder"); chomp $decoderBase; my $newIniFile = "$dir/$decoderBase.ini"; my $inputFileName = "$dir/input"; my $user = $ENV{"USER"}; @@ -181,12 +188,12 @@ use File::Basename qw(basename); sub modbin { local $_; my $bindir=shift; - `mkdir -p $bindir`; + check_call("mkdir -p $bindir"); -d $bindir || die "couldn't make bindir $bindir"; for (@_) { my $src=$$_; $$_="$bindir/".basename($src); - `cp -p $src $$_`; + check_call("cp -p $src $$_"); die "cp $src $$_ failed: $!" unless $? == 0; } } @@ -203,7 +210,7 @@ if ($dryrun){ } else { -e $dir || mkdir $dir; mkdir "$dir/hgs"; - modbin("$dir/bin",\$LocalConfig,\$cdec,\$SCORER,\$MAPINPUT,\$MAPPER,\$REDUCER,\$parallelize,\$sentserver,\$sentclient) if $cpbin; + modbin("$dir/bin",\$LocalConfig,\$cdec,\$SCORER,\$MAPINPUT,\$MAPPER,\$REDUCER,\$parallelize,\$sentserver,\$sentclient,\$libcall) if $cpbin; mkdir "$dir/scripts"; my $cmdfile="$dir/rerun-vest.sh"; open CMD,'>',$cmdfile; @@ -219,7 +226,7 @@ if ($dryrun){ print_help(); exit; } - `cp $initialWeights $dir/weights.0`; + check_call("cp $initialWeights $dir/weights.0"); die "Can't find weights.0" unless (-e "$dir/weights.0"); } write_config(*STDERR); @@ -227,7 +234,7 @@ if ($dryrun){ # Generate initial files and values -`cp $iniFile $newIniFile`; +check_call("cp $iniFile $newIniFile"); $iniFile = $newIniFile; my $newsrc = "$dir/dev.input"; @@ -259,12 +266,12 @@ while (1){ my $logdir="$dir/logs.$iteration"; my $decoderLog="$logdir/decoder.sentserver.log.$iteration"; my $scorerLog="$logdir/scorer.log.$iteration"; - `mkdir -p $logdir`; + check_call("mkdir -p $logdir"); #decode print STDERR "RUNNING DECODER AT "; - print STDERR `date`; + print STDERR unchecked_output("date"); my $im1 = $iteration - 1; my $weightsFile="$dir/weights.$im1"; my $decoder_cmd = "$decoder -c $iniFile --weights$pass_suffix $weightsFile -O $dir/hgs"; @@ -275,33 +282,28 @@ while (1){ if ($run_local) { $pcmd = "cat $srcFile |"; } elsif ($use_make) { - $pcmd = "cat $srcFile | $parallelize --use-fork -p $pmem -e $logdir -j $decode_nodes --"; + # TODO: Throw error when decode_nodes is specified along with use_make + $pcmd = "cat $srcFile | $parallelize --use-fork -p $pmem -e $logdir -j $use_make --"; } else { $pcmd = "cat $srcFile | $parallelize $usefork -p $pmem -e $logdir -j $decode_nodes --"; } my $cmd = "$pcmd $decoder_cmd 2> $decoderLog 1> $runFile"; print STDERR "COMMAND:\n$cmd\n"; - my $result = 0; - $result = system($cmd); - unless ($result == 0){ - cleanup(); - print STDERR "ERROR: Parallel decoder returned non-zero exit code $result\n"; - die; - } - my $num_hgs = `ls $dir/hgs/*.gz | wc -l`; + check_bash_call($cmd); + my $num_hgs = check_output("ls $dir/hgs/*.gz | wc -l"); print STDERR "NUMBER OF HGs: $num_hgs\n"; die "Dev set contains $devSize sentences! Decoder failure?\n" if ($devSize != $num_hgs); - my $dec_score = `cat $runFile | $SCORER $refs_comma_sep -l $metric`; + my $dec_score = check_output("cat $runFile | $SCORER $refs_comma_sep -l $metric"); chomp $dec_score; print STDERR "DECODER SCORE: $dec_score\n"; # save space - `gzip -f $runFile`; - `gzip -f $decoderLog`; + check_call("gzip -f $runFile"); + check_call("gzip -f $decoderLog"); # run optimizer print STDERR "RUNNING OPTIMIZER AT "; - print STDERR `date`; + print STDERR unchecked_output("date"); my $mergeLog="$logdir/prune-merge.log.$iteration"; my $score = 0; @@ -309,28 +311,18 @@ while (1){ my $inweights="$dir/weights.$im1"; for (my $opt_iter=1; $opt_iter<$optimization_iters; $opt_iter++) { print STDERR "\nGENERATE OPTIMIZATION STRATEGY (OPT-ITERATION $opt_iter/$optimization_iters)\n"; - print STDERR `date`; + print STDERR unchecked_output("date"); $icc++; my $nop=$noprimary?"--no_primary":""; my $targs=$oraclen ? "--decoder_translations='$runFile.gz' ".get_comma_sep_refs('-references',$refFiles):""; my $bwargs=$bleu_weight!=1 ? "--bleu_weight=$bleu_weight":""; $cmd="$MAPINPUT -w $inweights -r $dir/hgs $bwargs -s $devSize -d $rand_directions --max_similarity=$maxsim --oracle_directions=$oraclen --oracle_batch=$oracleb $targs $dirargs > $dir/agenda.$im1-$opt_iter"; print STDERR "COMMAND:\n$cmd\n"; - $result = system($cmd); - unless ($result == 0){ - cleanup(); - die "ERROR: mapinput command returned non-zero exit code $result\n"; - } - - `mkdir -p $dir/splag.$im1`; + check_call($cmd); + check_call("mkdir -p $dir/splag.$im1"); $cmd="split -a 3 -l $lines_per_mapper $dir/agenda.$im1-$opt_iter $dir/splag.$im1/mapinput."; print STDERR "COMMAND:\n$cmd\n"; - $result = system($cmd); - unless ($result == 0){ - cleanup(); - print STDERR "ERROR: split command returned non-zero exit code $result\n"; - die; - } + check_call($cmd); opendir(DIR, "$dir/splag.$im1") or die "Can't open directory: $!"; my @shards = grep { /^mapinput\./ } readdir(DIR); closedir DIR; @@ -360,11 +352,7 @@ while (1){ my $script = "$MAPPER -s $srcFile -l $metric $refs_comma_sep < $dir/splag.$im1/$shard | sort -t \$'\\t' -k 1 > $dir/splag.$im1/$mapoutput"; if ($run_local) { print STDERR "COMMAND:\n$script\n"; - $result = system($script); - unless ($result == 0){ - cleanup(); - die "ERROR: mapper returned non-zero exit code $result\n"; - } + check_bash_call($script); } elsif ($use_make) { my $script_file = "$dir/scripts/map.$shard"; open F, ">$script_file" or die "Can't write $script_file: $!"; @@ -384,13 +372,13 @@ while (1){ if ($first_shard) { print STDERR "$script\n"; $first_shard=0; } $nmappers++; - my $qcmd = "$QSUB_CMD -N $client_name -o /dev/null -e $logdir/$client_name.ER $script_file"; - my $jobid = `$qcmd`; + my $qcmd = "QSUB_CMD -N $client_name -o /dev/null -e $logdir/$client_name.ER $script_file"; + my $jobid = check_output("$qcmd"); die "qsub failed: $!\nCMD was: $qcmd" unless $? == 0; chomp $jobid; $jobid =~ s/^(\d+)(.*?)$/\1/g; $jobid =~ s/^Your job (\d+) .*$/\1/; - push(@cleanupcmds, "`qdel $jobid 2> /dev/null`"); + push(@cleanupcmds, check_output("qdel $jobid 2> /dev/null")); print STDERR " $jobid"; if ($joblist == "") { $joblist = $jobid; } else {$joblist = $joblist . "\|" . $jobid; } @@ -403,18 +391,14 @@ while (1){ close $mkfile; my $mcmd = "make -j $use_make -f $mkfilename"; print STDERR "\nExecuting: $mcmd\n"; - $result = system($mcmd); - unless ($result == 0){ - cleanup(); - die "ERROR: make command returned non-zero exit code $result\n"; - } + check_call($mcmd); } else { print STDERR "\nLaunched $nmappers mappers.\n"; sleep 8; print STDERR "Waiting for mappers to complete...\n"; while ($nmappers > 0) { sleep 5; - my @livejobs = grep(/$joblist/, split(/\n/, `qstat | grep -v ' C '`)); + my @livejobs = grep(/$joblist/, split(/\n/, check_output("qstat | grep -v ' C '"))); $nmappers = scalar @livejobs; } print STDERR "All mappers complete.\n"; @@ -430,16 +414,12 @@ while (1){ } print STDERR "Results for $tol/$til lines\n"; print STDERR "\nSORTING AND RUNNING VEST REDUCER\n"; - print STDERR `date`; + print STDERR unchecked_output("date"); $cmd="sort -t \$'\\t' -k 1 @mapoutputs | $REDUCER -l $metric > $dir/redoutput.$im1"; print STDERR "COMMAND:\n$cmd\n"; - $result = system($cmd); - unless ($result == 0){ - cleanup(); - die "ERROR: reducer command returned non-zero exit code $result\n"; - } + check_bash_call($cmd); $cmd="sort -nk3 $DIR_FLAG '-t|' $dir/redoutput.$im1 | head -1"; - my $best=`$cmd`; chomp $best; + my $best=check_bash_output("$cmd"); chomp $best; print STDERR "$best\n"; my ($oa, $x, $xscore) = split /\|/, $best; $score = $xscore; @@ -472,11 +452,11 @@ while (1){ my $v = ($ori{$k} + $axi{$k} * $x) / $norm; print W "$k $v\n"; } - `rm -rf $dir/splag.$im1`; + check_call("rm -rf $dir/splag.$im1"); $inweights = $finalFile; } $lastWeightsFile = "$dir/weights.$iteration"; - `cp $inweights $lastWeightsFile`; + check_call("cp $inweights $lastWeightsFile"); if ($icc < 2) { print STDERR "\nREACHED STOPPING CRITERION: score change too little\n"; last; @@ -520,7 +500,7 @@ sub get_lines { sub get_comma_sep_refs { my ($r,$p) = @_; - my $o = `echo $p`; + my $o = check_output("echo $p"); chomp $o; my @files = split /\s+/, $o; return "-$r " . join(" -$r ", @files); @@ -607,7 +587,7 @@ sub enseg { sub print_help { - my $executable = `basename $0`; chomp $executable; + my $executable = check_output("basename $0"); chomp $executable; print << "Help"; Usage: $executable [options] diff --git a/vest/parallelize.pl b/vest/parallelize.pl index cb5406ec..47b77c79 100755 --- a/vest/parallelize.pl +++ b/vest/parallelize.pl @@ -28,6 +28,12 @@ use IPC::Open2; use strict; use POSIX ":sys_wait_h"; +use File::Basename; +my $myDir = dirname(__FILE__); +print STDERR __FILE__." -> $myDir\n"; +push(@INC, $myDir); +require "libcall.pl"; + my $tailn=5; # +0 = concatenate all the client logs. 5 = last 5 lines my $recycle_clients; # spawn new clients when previous ones terminate my $stay_alive; # dont let server die when having zero clients @@ -76,7 +82,7 @@ sub preview_files { my @f=grep { ! ($skipempty && -z $_) } @$l; my $fn=join(' ',map {escape_shell($_)} @f); my $cmd="tail -n $n $fn"; - `$cmd`.($footer?"\nNONEMPTY FILES:\n$fn\n":""); + check_output("$cmd").($footer?"\nNONEMPTY FILES:\n$fn\n":""); } sub prefix_dirname($) { #like `dirname but if ends in / then return the whole thing @@ -105,7 +111,7 @@ sub extend_path($$;$$) { $dir=prefix_dirname($base); } my @cmd=("/bin/mkdir","-p",$dir); - system(@cmd) if $mkdir; + check_call(@cmd) if $mkdir; } return $base.$ext; } @@ -142,7 +148,7 @@ my $prog=shift; if ($no_which) { $cmd=$prog; } else { - $cmd=`which $prog`; + $cmd=check_output("which $prog"); chomp $cmd; die "$prog not found - $cmd" unless $cmd; } @@ -156,7 +162,7 @@ my $cdcmd=$no_cd ? '' : ("cd ".escape_shell($abscwd)."\n"); my $executable = $cmd; $executable =~ s/^\s*(\S+)($|\s.*)/$1/; -$executable=`basename $executable`; +$executable=check_output("basename $executable"); chomp $executable; @@ -192,10 +198,10 @@ sub launch_job_on_node; # vars -my $mydir = `dirname $0`; chomp $mydir; +my $mydir = check_output("dirname $0"); chomp $mydir; my $sentserver = "$mydir/sentserver"; my $sentclient = "$mydir/sentclient"; -my $host = `hostname`; +my $host = check_output("hostname"); chomp $host; @@ -205,7 +211,7 @@ my $port = 50300+int(rand($randp)); my $endp=$port+$tryp; sub listening_port_lines { my $quiet=$verbose?'':'2>/dev/null'; - `netstat -a -n $quiet | grep LISTENING | grep -i tcp` + return unchecked_output("netstat -a -n $quiet | grep LISTENING | grep -i tcp"); } my $netstat=&listening_port_lines; @@ -270,17 +276,14 @@ $cdcmd$sentclient $host:$port:$key $cmd # my $todo = "$sentserver -k $key $multiflag $port "; my $todo = "$sentserver -k $key $multiflag $port $stay_alive_flag "; if ($verbose){ print STDERR "Running: $todo\n"; } - my $rc = system($todo); - if ($rc){ - die "Error: sentserver returned code $rc\n"; - } + check_call($todo); } sub numof_live_jobs { if ($use_fork) { die "not implemented"; } else { - my @livejobs = grep(/$joblist/, split(/\n/, `qstat`)); + my @livejobs = grep(/$joblist/, split(/\n/, check_output("qstat"))); return ($#livejobs + 1); } } @@ -320,7 +323,7 @@ sub launch_job { } if ($joblist == "") { $joblist = $jobid; } else {$joblist = $joblist . "\|" . $jobid; } - my $cleanfn="`qdel $jobid 2> /dev/null`"; + my $cleanfn=check_output("qdel $jobid 2> /dev/null"); push(@cleanup_cmds, $cleanfn); } close QOUT; @@ -345,7 +348,7 @@ sub launch_job_fork { close $fh; my $todo = "/bin/sh $scr_name 1> $outfile 2> $errorfile"; print STDERR "EXEC: $todo\n"; - my $out = `$todo`; + my $out = check_output("$todo"); print STDERR "RES: $out\n"; unlink $scr_name or warn "Failed to remove $scr_name"; exit 0; @@ -377,7 +380,7 @@ sub cleanup { sub print_help { - my $name = `basename $0`; chomp $name; + my $name = check_output("basename $0"); chomp $name; print << "Help"; usage: $name [options] -- cgit v1.2.3