summaryrefslogtreecommitdiff
path: root/vest
diff options
context:
space:
mode:
Diffstat (limited to 'vest')
-rw-r--r--vest/Makefile.am35
-rw-r--r--vest/README.shared-mem9
-rwxr-xr-xvest/cat.pl4
-rw-r--r--vest/ces.cc87
-rw-r--r--vest/ces.h12
-rwxr-xr-xvest/dist-vest.pl718
-rw-r--r--vest/error_surface.cc43
-rw-r--r--vest/error_surface.h24
-rw-r--r--vest/libcall.pl71
-rwxr-xr-xvest/line_mediator.pl116
-rw-r--r--vest/line_optimizer.cc109
-rw-r--r--vest/line_optimizer.h46
-rw-r--r--vest/lo_test.cc235
-rw-r--r--vest/mbr_kbest.cc138
-rw-r--r--vest/mr_vest_generate_mapper_input.cc320
-rw-r--r--vest/mr_vest_map.cc110
-rw-r--r--vest/mr_vest_reduce.cc81
-rwxr-xr-xvest/parallelize.pl423
-rw-r--r--vest/sentclient.c76
-rw-r--r--vest/sentserver.c515
-rw-r--r--vest/sentserver.h6
-rwxr-xr-xvest/tac.pl8
-rw-r--r--vest/test_aer/README8
-rw-r--r--vest/test_aer/cdec.ini3
-rw-r--r--vest/test_aer/corpus.src3
-rw-r--r--vest/test_aer/grammar12
-rw-r--r--vest/test_aer/ref.03
-rw-r--r--vest/test_aer/weights13
-rw-r--r--vest/test_data/0.json.gzbin13709 -> 0 bytes
-rw-r--r--vest/test_data/1.json.gzbin204803 -> 0 bytes
-rw-r--r--vest/test_data/c2e.txt.02
-rw-r--r--vest/test_data/c2e.txt.12
-rw-r--r--vest/test_data/c2e.txt.22
-rw-r--r--vest/test_data/c2e.txt.32
-rw-r--r--vest/test_data/re.txt.05
-rw-r--r--vest/test_data/re.txt.15
-rw-r--r--vest/test_data/re.txt.25
-rw-r--r--vest/test_data/re.txt.35
-rw-r--r--vest/viterbi_envelope.cc177
-rw-r--r--vest/viterbi_envelope.h81
40 files changed, 0 insertions, 3514 deletions
diff --git a/vest/Makefile.am b/vest/Makefile.am
deleted file mode 100644
index 05fa5639..00000000
--- a/vest/Makefile.am
+++ /dev/null
@@ -1,35 +0,0 @@
-bin_PROGRAMS = \
- mr_vest_map \
- mr_vest_reduce \
- mr_vest_generate_mapper_input \
- sentserver \
- sentclient
-
-if HAVE_GTEST
-noinst_PROGRAMS = \
- lo_test
-TESTS = lo_test
-endif
-
-sentserver_SOURCES = sentserver.c
-sentserver_LDFLAGS = -all-static -pthread
-
-sentclient_SOURCES = sentclient.c
-sentclient_LDFLAGS = -all-static -pthread
-
-mr_vest_generate_mapper_input_SOURCES = mr_vest_generate_mapper_input.cc line_optimizer.cc
-mr_vest_generate_mapper_input_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz
-
-# nbest2hg_SOURCES = nbest2hg.cc
-# nbest2hg_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lfst -lz
-
-mr_vest_map_SOURCES = viterbi_envelope.cc ces.cc error_surface.cc mr_vest_map.cc line_optimizer.cc
-mr_vest_map_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz
-
-mr_vest_reduce_SOURCES = error_surface.cc ces.cc mr_vest_reduce.cc line_optimizer.cc viterbi_envelope.cc
-mr_vest_reduce_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz
-
-lo_test_SOURCES = lo_test.cc ces.cc viterbi_envelope.cc error_surface.cc line_optimizer.cc
-lo_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz
-
-AM_CPPFLAGS = -W -Wall -Wno-sign-compare $(GTEST_CPPFLAGS) -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval
diff --git a/vest/README.shared-mem b/vest/README.shared-mem
deleted file mode 100644
index 7728efc0..00000000
--- a/vest/README.shared-mem
+++ /dev/null
@@ -1,9 +0,0 @@
-If you want to run dist-vest.pl on a very large shared memory machine, do the
-following:
-
- ./dist-vest.pl --use-make I --decode-nodes J --weights weights.init --source-file=dev.src --ref-files=dev.ref.* cdec.ini
-
-This will use I jobs for doing the line search and J jobs to run the decoder. Typically, since the
-decoder must load grammars, language models, etc., J should be smaller than I, but this will depend
-on the system you are running on and the complexity of the models used for decoding.
-
diff --git a/vest/cat.pl b/vest/cat.pl
deleted file mode 100755
index 2ecba3f9..00000000
--- a/vest/cat.pl
+++ /dev/null
@@ -1,4 +0,0 @@
-#!/usr/bin/perl
-
-$|=1;
-print while(<>);
diff --git a/vest/ces.cc b/vest/ces.cc
deleted file mode 100644
index 4ae6b695..00000000
--- a/vest/ces.cc
+++ /dev/null
@@ -1,87 +0,0 @@
-#include "ces.h"
-
-#include <vector>
-#include <sstream>
-#include <boost/shared_ptr.hpp>
-
-#include "aligner.h"
-#include "lattice.h"
-#include "viterbi_envelope.h"
-#include "error_surface.h"
-
-using boost::shared_ptr;
-using namespace std;
-
-const bool minimize_segments = true; // if adjacent segments have equal scores, merge them
-
-void ComputeErrorSurface(const SentenceScorer& ss, const ViterbiEnvelope& ve, ErrorSurface* env, const ScoreType type, const Hypergraph& hg) {
- vector<WordID> prev_trans;
- const vector<shared_ptr<Segment> >& ienv = ve.GetSortedSegs();
- env->resize(ienv.size());
- ScoreP prev_score;
- int j = 0;
- for (int i = 0; i < ienv.size(); ++i) {
- const Segment& seg = *ienv[i];
- vector<WordID> trans;
- if (type == AER) {
- vector<bool> edges(hg.edges_.size(), false);
- seg.CollectEdgesUsed(&edges); // get the set of edges in the viterbi
- // alignment
- ostringstream os;
- const string* psrc = ss.GetSource();
- if (psrc == NULL) {
- cerr << "AER scoring in VEST requires source, but it is missing!\n";
- abort();
- }
- size_t pos = psrc->rfind(" ||| ");
- if (pos == string::npos) {
- cerr << "Malformed source for AER: expected |||\nINPUT: " << *psrc << endl;
- abort();
- }
- Lattice src;
- Lattice ref;
- LatticeTools::ConvertTextOrPLF(psrc->substr(0, pos), &src);
- LatticeTools::ConvertTextOrPLF(psrc->substr(pos + 5), &ref);
- AlignerTools::WriteAlignment(src, ref, hg, &os, true, 0, &edges);
- string tstr = os.str();
- TD::ConvertSentence(tstr.substr(tstr.rfind(" ||| ") + 5), &trans);
- } else {
- seg.ConstructTranslation(&trans);
- }
- // cerr << "Scoring: " << TD::GetString(trans) << endl;
- if (trans == prev_trans) {
- if (!minimize_segments) {
- assert(prev_score); // if this fails, it means
- // the decoder can generate null translations
- ErrorSegment& out = (*env)[j];
- out.delta = prev_score->GetZero();
- out.x = seg.x;
- ++j;
- }
- // cerr << "Identical translation, skipping scoring\n";
- } else {
- ScoreP score = ss.ScoreCandidate(trans);
- // cerr << "score= " << score->ComputeScore() << "\n";
- ScoreP cur_delta_p = score->GetZero();
- Score* cur_delta = cur_delta_p.get();
- // just record the score diffs
- if (!prev_score)
- prev_score = score->GetZero();
-
- score->Subtract(*prev_score, cur_delta);
- prev_trans.swap(trans);
- prev_score = score;
- if ((!minimize_segments) || (!cur_delta->IsAdditiveIdentity())) {
- ErrorSegment& out = (*env)[j];
- out.delta = cur_delta_p;
- out.x = seg.x;
- ++j;
- }
- }
- }
- // cerr << " In segments: " << ienv.size() << endl;
- // cerr << "Out segments: " << j << endl;
- assert(j > 0);
- env->resize(j);
-}
-
diff --git a/vest/ces.h b/vest/ces.h
deleted file mode 100644
index 2f098990..00000000
--- a/vest/ces.h
+++ /dev/null
@@ -1,12 +0,0 @@
-#ifndef _CES_H_
-#define _CES_H_
-
-#include "scorer.h"
-
-class ViterbiEnvelope;
-class Hypergraph;
-class ErrorSurface;
-
-void ComputeErrorSurface(const SentenceScorer& ss, const ViterbiEnvelope& ve, ErrorSurface* es, const ScoreType type, const Hypergraph& hg);
-
-#endif
diff --git a/vest/dist-vest.pl b/vest/dist-vest.pl
deleted file mode 100755
index 11e791c1..00000000
--- a/vest/dist-vest.pl
+++ /dev/null
@@ -1,718 +0,0 @@
-#!/usr/bin/env perl
-use strict;
-my @ORIG_ARGV=@ARGV;
-use Cwd qw(getcwd);
-my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../environment"; }
-
-# Skip local config (used for distributing jobs) if we're running in local-only mode
-use LocalConfig;
-use Getopt::Long;
-use IPC::Open2;
-use POSIX ":sys_wait_h";
-my $QSUB_CMD = qsub_args(mert_memory());
-
-require "libcall.pl";
-
-# Default settings
-my $srcFile;
-my $refFiles;
-my $default_jobs = env_default_jobs();
-my $bin_dir = $SCRIPT_DIR;
-die "Bin directory $bin_dir missing/inaccessible" unless -d $bin_dir;
-my $FAST_SCORE="$bin_dir/../mteval/fast_score";
-die "Can't execute $FAST_SCORE" unless -x $FAST_SCORE;
-my $MAPINPUT = "$bin_dir/mr_vest_generate_mapper_input";
-my $MAPPER = "$bin_dir/mr_vest_map";
-my $REDUCER = "$bin_dir/mr_vest_reduce";
-my $parallelize = "$bin_dir/parallelize.pl";
-my $libcall = "$bin_dir/libcall.pl";
-my $sentserver = "$bin_dir/sentserver";
-my $sentclient = "$bin_dir/sentclient";
-my $LocalConfig = "$SCRIPT_DIR/../environment/LocalConfig.pm";
-
-my $SCORER = $FAST_SCORE;
-die "Can't find $MAPPER" unless -x $MAPPER;
-my $cdec = "$bin_dir/../decoder/cdec";
-die "Can't find decoder in $cdec" unless -x $cdec;
-die "Can't find $parallelize" unless -x $parallelize;
-die "Can't find $libcall" unless -e $libcall;
-my $decoder = $cdec;
-my $lines_per_mapper = 400;
-my $rand_directions = 15;
-my $iteration = 1;
-my $best_weights;
-my $max_iterations = 15;
-my $optimization_iters = 6;
-my $jobs = $default_jobs; # number of decode nodes
-my $pmem = "9g";
-my $disable_clean = 0;
-my %seen_weights;
-my $normalize;
-my $help = 0;
-my $epsilon = 0.0001;
-my $interval = 5;
-my $dryrun = 0;
-my $last_score = -10000000;
-my $metric = "ibm_bleu";
-my $dir;
-my $iniFile;
-my $weights;
-my $initialWeights;
-my $decoderOpt;
-my $noprimary;
-my $maxsim=0;
-my $oraclen=0;
-my $oracleb=20;
-my $bleu_weight=1;
-my $use_make = 1; # use make to parallelize line search
-my $dirargs='';
-my $density_prune;
-my $useqsub;
-my $pass_suffix = '';
-my $cpbin=1;
-# Process command-line options
-Getopt::Long::Configure("no_auto_abbrev");
-if (GetOptions(
- "decoder=s" => \$decoderOpt,
- "jobs=i" => \$jobs,
- "density-prune=f" => \$density_prune,
- "dont-clean" => \$disable_clean,
- "pass-suffix=s" => \$pass_suffix,
- "dry-run" => \$dryrun,
- "epsilon=s" => \$epsilon,
- "help" => \$help,
- "interval" => \$interval,
- "qsub" => \$useqsub,
- "max-iterations=i" => \$max_iterations,
- "normalize=s" => \$normalize,
- "pmem=s" => \$pmem,
- "cpbin!" => \$cpbin,
- "rand-directions=i" => \$rand_directions,
- "random_directions=i" => \$rand_directions,
- "bleu_weight=s" => \$bleu_weight,
- "no-primary!" => \$noprimary,
- "max-similarity=s" => \$maxsim,
- "oracle-directions=i" => \$oraclen,
- "n-oracle=i" => \$oraclen,
- "oracle-batch=i" => \$oracleb,
- "directions-args=s" => \$dirargs,
- "ref-files=s" => \$refFiles,
- "metric=s" => \$metric,
- "source-file=s" => \$srcFile,
- "weights=s" => \$initialWeights,
- "workdir=s" => \$dir,
- "opt-iterations=i" => \$optimization_iters,
-) == 0 || @ARGV!=1 || $help) {
- print_help();
- exit;
-}
-
-if (defined $density_prune) {
- die "--density_prune n: n must be greater than 1.0\n" unless $density_prune > 1.0;
-}
-
-if ($useqsub) {
- $use_make = 0;
- die "LocalEnvironment.pm does not have qsub configuration for this host. Cannot run with --qsub!\n" unless has_qsub();
-}
-
-my @missing_args = ();
-if (!defined $srcFile) { push @missing_args, "--source-file"; }
-if (!defined $refFiles) { push @missing_args, "--ref-files"; }
-if (!defined $initialWeights) { push @missing_args, "--weights"; }
-die "Please specify missing arguments: " . join (', ', @missing_args) . "\n" if (@missing_args);
-
-if ($metric =~ /^(combi|ter)$/i) {
- $lines_per_mapper = 40;
-} elsif ($metric =~ /^meteor$/i) {
- $lines_per_mapper = 2000; # start up time is really high
-}
-
-($iniFile) = @ARGV;
-
-
-sub write_config;
-sub enseg;
-sub print_help;
-
-my $nodelist;
-my $host =check_output("hostname"); chomp $host;
-my $bleu;
-my $interval_count = 0;
-my $logfile;
-my $projected_score;
-
-# used in sorting scores
-my $DIR_FLAG = '-r';
-if ($metric =~ /^ter$|^aer$/i) {
- $DIR_FLAG = '';
-}
-
-my $refs_comma_sep = get_comma_sep_refs('r',$refFiles);
-
-unless ($dir){
- $dir = "vest";
-}
-unless ($dir =~ /^\//){ # convert relative path to absolute path
- my $basedir = check_output("pwd");
- chomp $basedir;
- $dir = "$basedir/$dir";
-}
-
-if ($decoderOpt){ $decoder = $decoderOpt; }
-
-
-# Initializations and helper functions
-srand;
-
-my @childpids = ();
-my @cleanupcmds = ();
-
-sub cleanup {
- print STDERR "Cleanup...\n";
- for my $pid (@childpids){ unchecked_call("kill $pid"); }
- for my $cmd (@cleanupcmds){ unchecked_call("$cmd"); }
- exit 1;
-};
-# Always call cleanup, no matter how we exit
-*CORE::GLOBAL::exit =
- sub{ cleanup(); };
-$SIG{INT} = "cleanup";
-$SIG{TERM} = "cleanup";
-$SIG{HUP} = "cleanup";
-
-my $decoderBase = check_output("basename $decoder"); chomp $decoderBase;
-my $newIniFile = "$dir/$decoderBase.ini";
-my $inputFileName = "$dir/input";
-my $user = $ENV{"USER"};
-
-
-# process ini file
--e $iniFile || die "Error: could not open $iniFile for reading\n";
-open(INI, $iniFile);
-
-use File::Basename qw(basename);
-#pass bindir, refs to vars holding bin
-sub modbin {
- local $_;
- my $bindir=shift;
- check_call("mkdir -p $bindir");
- -d $bindir || die "couldn't make bindir $bindir";
- for (@_) {
- my $src=$$_;
- $$_="$bindir/".basename($src);
- check_call("cp -p $src $$_");
- }
-}
-sub dirsize {
- opendir ISEMPTY,$_[0];
- return scalar(readdir(ISEMPTY))-1;
-}
-if ($dryrun){
- write_config(*STDERR);
- exit 0;
-} else {
- if (-e $dir && dirsize($dir)>1 && -e "$dir/hgs" ){ # allow preexisting logfile, binaries, but not dist-vest.pl outputs
- die "ERROR: working dir $dir already exists\n\n";
- } else {
- -e $dir || mkdir $dir;
- mkdir "$dir/hgs";
- modbin("$dir/bin",\$LocalConfig,\$cdec,\$SCORER,\$MAPINPUT,\$MAPPER,\$REDUCER,\$parallelize,\$sentserver,\$sentclient,\$libcall) if $cpbin;
- mkdir "$dir/scripts";
- my $cmdfile="$dir/rerun-vest.sh";
- open CMD,'>',$cmdfile;
- print CMD "cd ",&getcwd,"\n";
-# print CMD &escaped_cmdline,"\n"; #buggy - last arg is quoted.
- my $cline=&cmdline."\n";
- print CMD $cline;
- close CMD;
- print STDERR $cline;
- chmod(0755,$cmdfile);
- unless (-e $initialWeights) {
- print STDERR "Please specify an initial weights file with --initial-weights\n";
- print_help();
- exit;
- }
- check_call("cp $initialWeights $dir/weights.0");
- die "Can't find weights.0" unless (-e "$dir/weights.0");
- }
- write_config(*STDERR);
-}
-
-
-# Generate initial files and values
-check_call("cp $iniFile $newIniFile");
-$iniFile = $newIniFile;
-
-my $newsrc = "$dir/dev.input";
-enseg($srcFile, $newsrc);
-$srcFile = $newsrc;
-my $devSize = 0;
-open F, "<$srcFile" or die "Can't read $srcFile: $!";
-while(<F>) { $devSize++; }
-close F;
-
-unless($best_weights){ $best_weights = $weights; }
-unless($projected_score){ $projected_score = 0.0; }
-$seen_weights{$weights} = 1;
-
-my $random_seed = int(time / 1000);
-my $lastWeightsFile;
-my $lastPScore = 0;
-# main optimization loop
-while (1){
- print STDERR "\n\nITERATION $iteration\n==========\n";
-
- if ($iteration > $max_iterations){
- print STDERR "\nREACHED STOPPING CRITERION: Maximum iterations\n";
- last;
- }
- # iteration-specific files
- my $runFile="$dir/run.raw.$iteration";
- my $onebestFile="$dir/1best.$iteration";
- my $logdir="$dir/logs.$iteration";
- my $decoderLog="$logdir/decoder.sentserver.log.$iteration";
- my $scorerLog="$logdir/scorer.log.$iteration";
- check_call("mkdir -p $logdir");
-
-
- #decode
- print STDERR "RUNNING DECODER AT ";
- print STDERR unchecked_output("date");
- my $im1 = $iteration - 1;
- my $weightsFile="$dir/weights.$im1";
- my $decoder_cmd = "$decoder -c $iniFile --weights$pass_suffix $weightsFile -O $dir/hgs";
- my $pcmd;
- if ($use_make) {
- $pcmd = "cat $srcFile | $parallelize --use-fork -p $pmem -e $logdir -j $jobs --";
- } else {
- $pcmd = "cat $srcFile | $parallelize -p $pmem -e $logdir -j $jobs --";
- }
- my $cmd = "$pcmd $decoder_cmd 2> $decoderLog 1> $runFile";
- print STDERR "COMMAND:\n$cmd\n";
- check_bash_call($cmd);
- my $num_hgs;
- my $num_topbest;
- my $retries = 0;
- while($retries < 5) {
- $num_hgs = check_output("ls $dir/hgs/*.gz | wc -l");
- $num_topbest = check_output("wc -l < $runFile");
- print STDERR "NUMBER OF HGs: $num_hgs\n";
- print STDERR "NUMBER OF TOP-BEST HYPs: $num_topbest\n";
- if($devSize == $num_hgs && $devSize == $num_topbest) {
- last;
- } else {
- print STDERR "Incorrect number of hypergraphs or topbest. Waiting for distributed filesystem and retrying...\n";
- sleep(3);
- }
- $retries++;
- }
- die "Dev set contains $devSize sentences, but we don't have topbest and hypergraphs for all these! Decoder failure? Check $decoderLog\n" if ($devSize != $num_hgs || $devSize != $num_topbest);
- my $dec_score = check_output("cat $runFile | $SCORER $refs_comma_sep -l $metric");
- chomp $dec_score;
- print STDERR "DECODER SCORE: $dec_score\n";
-
- # save space
- check_call("gzip -f $runFile");
- check_call("gzip -f $decoderLog");
-
- # run optimizer
- print STDERR "RUNNING OPTIMIZER AT ";
- print STDERR unchecked_output("date");
- my $mergeLog="$logdir/prune-merge.log.$iteration";
-
- my $score = 0;
- my $icc = 0;
- my $inweights="$dir/weights.$im1";
- for (my $opt_iter=1; $opt_iter<$optimization_iters; $opt_iter++) {
- print STDERR "\nGENERATE OPTIMIZATION STRATEGY (OPT-ITERATION $opt_iter/$optimization_iters)\n";
- print STDERR unchecked_output("date");
- $icc++;
- my $nop=$noprimary?"--no_primary":"";
- my $targs=$oraclen ? "--decoder_translations='$runFile.gz' ".get_comma_sep_refs('-references',$refFiles):"";
- my $bwargs=$bleu_weight!=1 ? "--bleu_weight=$bleu_weight":"";
- $cmd="$MAPINPUT -w $inweights -r $dir/hgs $bwargs -s $devSize -d $rand_directions --max_similarity=$maxsim --oracle_directions=$oraclen --oracle_batch=$oracleb $targs $dirargs > $dir/agenda.$im1-$opt_iter";
- print STDERR "COMMAND:\n$cmd\n";
- check_call($cmd);
- check_call("mkdir -p $dir/splag.$im1");
- $cmd="split -a 3 -l $lines_per_mapper $dir/agenda.$im1-$opt_iter $dir/splag.$im1/mapinput.";
- print STDERR "COMMAND:\n$cmd\n";
- check_call($cmd);
- opendir(DIR, "$dir/splag.$im1") or die "Can't open directory: $!";
- my @shards = grep { /^mapinput\./ } readdir(DIR);
- closedir DIR;
- die "No shards!" unless scalar @shards > 0;
- my $joblist = "";
- my $nmappers = 0;
- my @mapoutputs = ();
- @cleanupcmds = ();
- my %o2i = ();
- my $first_shard = 1;
- my $mkfile; # only used with makefiles
- my $mkfilename;
- if ($use_make) {
- $mkfilename = "$dir/splag.$im1/domap.mk";
- open $mkfile, ">$mkfilename" or die "Couldn't write $mkfilename: $!";
- print $mkfile "all: $dir/splag.$im1/map.done\n\n";
- }
- my @mkouts = (); # only used with makefiles
- for my $shard (@shards) {
- my $mapoutput = $shard;
- my $client_name = $shard;
- $client_name =~ s/mapinput.//;
- $client_name = "vest.$client_name";
- $mapoutput =~ s/mapinput/mapoutput/;
- push @mapoutputs, "$dir/splag.$im1/$mapoutput";
- $o2i{"$dir/splag.$im1/$mapoutput"} = "$dir/splag.$im1/$shard";
- my $script = "$MAPPER -s $srcFile -l $metric $refs_comma_sep < $dir/splag.$im1/$shard | sort -t \$'\\t' -k 1 > $dir/splag.$im1/$mapoutput";
- if ($use_make) {
- my $script_file = "$dir/scripts/map.$shard";
- open F, ">$script_file" or die "Can't write $script_file: $!";
- print F "#!/bin/bash\n";
- print F "$script\n";
- close F;
- my $output = "$dir/splag.$im1/$mapoutput";
- push @mkouts, $output;
- chmod(0755, $script_file) or die "Can't chmod $script_file: $!";
- if ($first_shard) { print STDERR "$script\n"; $first_shard=0; }
- print $mkfile "$output: $dir/splag.$im1/$shard\n\t$script_file\n\n";
- } else {
- my $script_file = "$dir/scripts/map.$shard";
- open F, ">$script_file" or die "Can't write $script_file: $!";
- print F "$script\n";
- close F;
- if ($first_shard) { print STDERR "$script\n"; $first_shard=0; }
-
- $nmappers++;
- my $qcmd = "$QSUB_CMD -N $client_name -o /dev/null -e $logdir/$client_name.ER $script_file";
- my $jobid = check_output("$qcmd");
- chomp $jobid;
- $jobid =~ s/^(\d+)(.*?)$/\1/g;
- $jobid =~ s/^Your job (\d+) .*$/\1/;
- push(@cleanupcmds, "qdel $jobid 2> /dev/null");
- print STDERR " $jobid";
- if ($joblist == "") { $joblist = $jobid; }
- else {$joblist = $joblist . "\|" . $jobid; }
- }
- }
- if ($use_make) {
- print $mkfile "$dir/splag.$im1/map.done: @mkouts\n\ttouch $dir/splag.$im1/map.done\n\n";
- close $mkfile;
- my $mcmd = "make -j $jobs -f $mkfilename";
- print STDERR "\nExecuting: $mcmd\n";
- check_call($mcmd);
- } else {
- print STDERR "\nLaunched $nmappers mappers.\n";
- sleep 8;
- print STDERR "Waiting for mappers to complete...\n";
- while ($nmappers > 0) {
- sleep 5;
- my @livejobs = grep(/$joblist/, split(/\n/, unchecked_output("qstat | grep -v ' C '")));
- $nmappers = scalar @livejobs;
- }
- print STDERR "All mappers complete.\n";
- }
- my $tol = 0;
- my $til = 0;
- for my $mo (@mapoutputs) {
- my $olines = get_lines($mo);
- my $ilines = get_lines($o2i{$mo});
- $tol += $olines;
- $til += $ilines;
- die "$mo: output lines ($olines) doesn't match input lines ($ilines)" unless $olines==$ilines;
- }
- print STDERR "Results for $tol/$til lines\n";
- print STDERR "\nSORTING AND RUNNING VEST REDUCER\n";
- print STDERR unchecked_output("date");
- $cmd="sort -t \$'\\t' -k 1 @mapoutputs | $REDUCER -l $metric > $dir/redoutput.$im1";
- print STDERR "COMMAND:\n$cmd\n";
- check_bash_call($cmd);
- $cmd="sort -nk3 $DIR_FLAG '-t|' $dir/redoutput.$im1 | head -1";
- # sort returns failure even when it doesn't fail for some reason
- my $best=unchecked_output("$cmd"); chomp $best;
- print STDERR "$best\n";
- my ($oa, $x, $xscore) = split /\|/, $best;
- $score = $xscore;
- print STDERR "PROJECTED SCORE: $score\n";
- if (abs($x) < $epsilon) {
- print STDERR "\nOPTIMIZER: no score improvement: abs($x) < $epsilon\n";
- last;
- }
- my $psd = $score - $last_score;
- $last_score = $score;
- if (abs($psd) < $epsilon) {
- print STDERR "\nOPTIMIZER: no score improvement: abs($psd) < $epsilon\n";
- last;
- }
- my ($origin, $axis) = split /\s+/, $oa;
-
- my %ori = convert($origin);
- my %axi = convert($axis);
-
- my $finalFile="$dir/weights.$im1-$opt_iter";
- open W, ">$finalFile" or die "Can't write: $finalFile: $!";
- my $norm = 0;
- for my $k (sort keys %ori) {
- my $dd = $ori{$k} + $axi{$k} * $x;
- $norm += $dd * $dd;
- }
- $norm = sqrt($norm);
- $norm = 1;
- for my $k (sort keys %ori) {
- my $v = ($ori{$k} + $axi{$k} * $x) / $norm;
- print W "$k $v\n";
- }
- check_call("rm $dir/splag.$im1/*");
- $inweights = $finalFile;
- }
- $lastWeightsFile = "$dir/weights.$iteration";
- check_call("cp $inweights $lastWeightsFile");
- if ($icc < 2) {
- print STDERR "\nREACHED STOPPING CRITERION: score change too little\n";
- last;
- }
- $lastPScore = $score;
- $iteration++;
- print STDERR "\n==========\n";
-}
-
-print STDERR "\nFINAL WEIGHTS: $lastWeightsFile\n(Use -w <this file> with the decoder)\n\n";
-
-print STDOUT "$lastWeightsFile\n";
-
-exit 0;
-
-sub normalize_weights {
- my ($rfn, $rpts, $feat) = @_;
- my @feat_names = @$rfn;
- my @pts = @$rpts;
- my $z = 1.0;
- for (my $i=0; $i < scalar @feat_names; $i++) {
- if ($feat_names[$i] eq $feat) {
- $z = $pts[$i];
- last;
- }
- }
- for (my $i=0; $i < scalar @feat_names; $i++) {
- $pts[$i] /= $z;
- }
- print STDERR " NORM WEIGHTS: @pts\n";
- return @pts;
-}
-
-sub get_lines {
- my $fn = shift @_;
- open FL, "<$fn" or die "Couldn't read $fn: $!";
- my $lc = 0;
- while(<FL>) { $lc++; }
- return $lc;
-}
-
-sub get_comma_sep_refs {
- my ($r,$p) = @_;
- my $o = check_output("echo $p");
- chomp $o;
- my @files = split /\s+/, $o;
- return "-$r " . join(" -$r ", @files);
-}
-
-sub read_weights_file {
- my ($file) = @_;
- open F, "<$file" or die "Couldn't read $file: $!";
- my @r = ();
- my $pm = -1;
- while(<F>) {
- next if /^#/;
- next if /^\s*$/;
- chomp;
- if (/^(.+)\s+(.+)$/) {
- my $m = $1;
- my $w = $2;
- die "Weights out of order: $m <= $pm" unless $m > $pm;
- push @r, $w;
- } else {
- warn "Unexpected feature name in weight file: $_";
- }
- }
- close F;
- return join ' ', @r;
-}
-
-# subs
-sub write_config {
- my $fh = shift;
- my $cleanup = "yes";
- if ($disable_clean) {$cleanup = "no";}
-
- print $fh "\n";
- print $fh "DECODER: $decoder\n";
- print $fh "INI FILE: $iniFile\n";
- print $fh "WORKING DIR: $dir\n";
- print $fh "SOURCE (DEV): $srcFile\n";
- print $fh "REFS (DEV): $refFiles\n";
- print $fh "EVAL METRIC: $metric\n";
- print $fh "START ITERATION: $iteration\n";
- print $fh "MAX ITERATIONS: $max_iterations\n";
- print $fh "PARALLEL JOBS: $jobs\n";
- print $fh "HEAD NODE: $host\n";
- print $fh "PMEM (DECODING): $pmem\n";
- print $fh "CLEANUP: $cleanup\n";
- print $fh "INITIAL WEIGHTS: $initialWeights\n";
-}
-
-sub update_weights_file {
- my ($neww, $rfn, $rpts) = @_;
- my @feats = @$rfn;
- my @pts = @$rpts;
- my $num_feats = scalar @feats;
- my $num_pts = scalar @pts;
- die "$num_feats (num_feats) != $num_pts (num_pts)" unless $num_feats == $num_pts;
- open G, ">$neww" or die;
- for (my $i = 0; $i < $num_feats; $i++) {
- my $f = $feats[$i];
- my $lambda = $pts[$i];
- print G "$f $lambda\n";
- }
- close G;
-}
-
-sub enseg {
- my $src = shift;
- my $newsrc = shift;
- open(SRC, $src);
- open(NEWSRC, ">$newsrc");
- my $i=0;
- while (my $line=<SRC>){
- chomp $line;
- if ($line =~ /^\s*<seg/i) {
- if($line =~ /id="[0-9]+"/) {
- print NEWSRC "$line\n";
- } else {
- die "When using segments with pre-generated <seg> tags, you must include a zero-based id attribute";
- }
- } else {
- print NEWSRC "<seg id=\"$i\">$line</seg>\n";
- }
- $i++;
- }
- close SRC;
- close NEWSRC;
-}
-
-sub print_help {
-
- my $executable = check_output("basename $0"); chomp $executable;
- print << "Help";
-
-Usage: $executable [options] <ini file>
-
- $executable [options] <ini file>
- Runs a complete MERT optimization using the decoder configuration
- in <ini file>. Required options are --weights, --source-file, and
- --ref-files.
-
-Options:
-
- --help
- Print this message and exit.
-
- --max-iterations <M>
- Maximum number of iterations to run. If not specified, defaults
- to 10.
-
- --pass-suffix <S>
- If the decoder is doing multi-pass decoding, the pass suffix "2",
- "3", etc., is used to control what iteration of weights is set.
-
- --ref-files <files>
- Dev set ref files. This option takes only a single string argument.
- To use multiple files (including file globbing), this argument should
- be quoted.
-
- --metric <method>
- Metric to optimize.
- Example values: IBM_BLEU, NIST_BLEU, Koehn_BLEU, TER, Combi
-
- --normalize <feature-name>
- After each iteration, rescale all feature weights such that feature-
- name has a weight of 1.0.
-
- --rand-directions <num>
- MERT will attempt to optimize along all of the principle directions,
- set this parameter to explore other directions. Defaults to 5.
-
- --source-file <file>
- Dev set source file.
-
- --weights <file>
- A file specifying initial feature weights. The format is
- FeatureName_1 value1
- FeatureName_2 value2
- **All and only the weights listed in <file> will be optimized!**
-
- --workdir <dir>
- Directory for intermediate and output files. If not specified, the
- name is derived from the ini filename. Assuming that the ini
- filename begins with the decoder name and ends with ini, the default
- name of the working directory is inferred from the middle part of
- the filename. E.g. an ini file named decoder.foo.ini would have
- a default working directory name foo.
-
-Job control options:
-
- --jobs <I>
- Number of decoder processes to run in parallel. [default=$default_jobs]
-
- --qsub
- Use qsub to run jobs in parallel (qsub must be configured in
- environment/LocalEnvironment.pm)
-
- --pmem <N>
- Amount of physical memory requested for parallel decoding jobs
- (used with qsub requests only)
-
-Help
-}
-
-sub convert {
- my ($str) = @_;
- my @ps = split /;/, $str;
- my %dict = ();
- for my $p (@ps) {
- my ($k, $v) = split /=/, $p;
- $dict{$k} = $v;
- }
- return %dict;
-}
-
-
-
-sub cmdline {
- return join ' ',($0,@ORIG_ARGV);
-}
-
-#buggy: last arg gets quoted sometimes?
-my $is_shell_special=qr{[ \t\n\\><|&;"'`~*?{}$!()]};
-my $shell_escape_in_quote=qr{[\\"\$`!]};
-
-sub escape_shell {
- my ($arg)=@_;
- return undef unless defined $arg;
- if ($arg =~ /$is_shell_special/) {
- $arg =~ s/($shell_escape_in_quote)/\\$1/g;
- return "\"$arg\"";
- }
- return $arg;
-}
-
-sub escaped_shell_args {
- return map {local $_=$_;chomp;escape_shell($_)} @_;
-}
-
-sub escaped_shell_args_str {
- return join ' ',&escaped_shell_args(@_);
-}
-
-sub escaped_cmdline {
- return "$0 ".&escaped_shell_args_str(@ORIG_ARGV);
-}
diff --git a/vest/error_surface.cc b/vest/error_surface.cc
deleted file mode 100644
index 754aa8de..00000000
--- a/vest/error_surface.cc
+++ /dev/null
@@ -1,43 +0,0 @@
-#include "error_surface.h"
-
-#include <cassert>
-#include <sstream>
-
-using namespace std;
-
-ErrorSurface::~ErrorSurface() {
-}
-
-void ErrorSurface::Serialize(std::string* out) const {
- const int segments = this->size();
- ostringstream os(ios::binary);
- os.write((const char*)&segments,sizeof(segments));
- for (int i = 0; i < segments; ++i) {
- const ErrorSegment& cur = (*this)[i];
- string senc;
- cur.delta->Encode(&senc);
- assert(senc.size() < 256);
- unsigned char len = senc.size();
- os.write((const char*)&cur.x, sizeof(cur.x));
- os.write((const char*)&len, sizeof(len));
- os.write((const char*)&senc[0], len);
- }
- *out = os.str();
-}
-
-void ErrorSurface::Deserialize(ScoreType type, const std::string& in) {
- istringstream is(in, ios::binary);
- int segments;
- is.read((char*)&segments, sizeof(segments));
- this->resize(segments);
- for (int i = 0; i < segments; ++i) {
- ErrorSegment& cur = (*this)[i];
- unsigned char len;
- is.read((char*)&cur.x, sizeof(cur.x));
- is.read((char*)&len, sizeof(len));
- string senc(len, '\0'); assert(senc.size() == len);
- is.read((char*)&senc[0], len);
- cur.delta = SentenceScorer::CreateScoreFromString(type, senc);
- }
-}
-
diff --git a/vest/error_surface.h b/vest/error_surface.h
deleted file mode 100644
index ad728cfa..00000000
--- a/vest/error_surface.h
+++ /dev/null
@@ -1,24 +0,0 @@
-#ifndef _ERROR_SURFACE_H_
-#define _ERROR_SURFACE_H_
-
-#include <vector>
-#include <string>
-
-#include "scorer.h"
-
-class Score;
-
-struct ErrorSegment {
- double x;
- ScoreP delta;
- ErrorSegment() : x(0), delta() {}
-};
-
-class ErrorSurface : public std::vector<ErrorSegment> {
- public:
- ~ErrorSurface();
- void Serialize(std::string* out) const;
- void Deserialize(ScoreType type, const std::string& in);
-};
-
-#endif
diff --git a/vest/libcall.pl b/vest/libcall.pl
deleted file mode 100644
index c7d0f128..00000000
--- a/vest/libcall.pl
+++ /dev/null
@@ -1,71 +0,0 @@
-use IPC::Open3;
-use Symbol qw(gensym);
-
-$DUMMY_STDERR = gensym();
-$DUMMY_STDIN = gensym();
-
-# Run the command and ignore failures
-sub unchecked_call {
- system("@_")
-}
-
-# Run the command and return its output, if any ignoring failures
-sub unchecked_output {
- return `@_`
-}
-
-# WARNING: Do not use this for commands that will return large amounts
-# of stdout or stderr -- they might block indefinitely
-sub check_output {
- print STDERR "Executing and gathering output: @_\n";
-
- my $pid = open3($DUMMY_STDIN, \*PH, $DUMMY_STDERR, @_);
- my $proc_output = "";
- while( <PH> ) {
- $proc_output .= $_;
- }
- waitpid($pid, 0);
- # TODO: Grab signal that the process died from
- my $child_exit_status = $? >> 8;
- if($child_exit_status == 0) {
- return $proc_output;
- } else {
- print STDERR "ERROR: Execution of @_ failed.\n";
- exit(1);
- }
-}
-
-# Based on Moses' safesystem sub
-sub check_call {
- print STDERR "Executing: @_\n";
- system(@_);
- my $exitcode = $? >> 8;
- if($exitcode == 0) {
- return 0;
- } elsif ($? == -1) {
- print STDERR "ERROR: Failed to execute: @_\n $!\n";
- exit(1);
-
- } elsif ($? & 127) {
- printf STDERR "ERROR: Execution of: @_\n died with signal %d, %s coredump\n",
- ($? & 127), ($? & 128) ? 'with' : 'without';
- exit(1);
-
- } else {
- print STDERR "Failed with exit code: $exitcode\n" if $exitcode;
- exit($exitcode);
- }
-}
-
-sub check_bash_call {
- my @args = ( "bash", "-auxeo", "pipefail", "-c", "@_");
- check_call(@args);
-}
-
-sub check_bash_output {
- my @args = ( "bash", "-auxeo", "pipefail", "-c", "@_");
- return check_output(@args);
-}
-
-# perl module weirdness...
-return 1;
diff --git a/vest/line_mediator.pl b/vest/line_mediator.pl
deleted file mode 100755
index bc2bb24c..00000000
--- a/vest/line_mediator.pl
+++ /dev/null
@@ -1,116 +0,0 @@
-#!/usr/bin/perl -w
-#hooks up two processes, 2nd of which has one line of output per line of input, expected by the first, which starts off the communication
-
-# if you don't know how to fork/exec in a C program, this could be helpful under limited circumstances (would be ok to liaise with sentserver)
-
-#WARNING: because it waits for the result from command 2 after sending every line, and especially if command 1 does the same, using sentserver as command 2 won't actually buy you any real parallelism.
-
-use strict;
-use IPC::Open2;
-use POSIX qw(pipe dup2 STDIN_FILENO STDOUT_FILENO);
-
-# Diagnostics are off unless DEBUG is set; QUIET switches them off regardless.
-my $quiet = $ENV{QUIET} ? 1 : !$ENV{DEBUG};
-
-# Print the arguments, separated by single spaces, to STDERR when not quiet.
-sub info {
-  local $, = ' ';
-  unless ($quiet) {
-    print STDERR @_;
-  }
-}
-
-# Topology: CROSS wins over SNAKE, which wins over PIPE; CROSS is the default.
-my $mode = $ENV{CROSS} ? 'CROSS'
-         : $ENV{SNAKE} ? 'SNAKE'
-         : $ENV{PIPE}  ? 'PIPE'
-         :               'CROSS';
-# Connection: DIRECT wins over SERIAL and is the default ...
-my $ser = $ENV{DIRECT} ? 'DIRECT'
-        : $ENV{SERIAL} ? 'SERIAL'
-        :                'DIRECT';
-# ... except that SNAKE always runs through the SERIAL relay loop.
-$ser = 'SERIAL' if $mode eq 'SNAKE';
-info("mode: $mode\n");
-info("connection: $ser\n");
-
-
-# Split the command line at the first '--': everything up to and including
-# it is collected into @c1, whatever remains in @ARGV becomes @c2.
-my @c1;
-if (scalar @ARGV) {
- do {
- push @c1,shift
- } while scalar @ARGV && $c1[$#c1] ne '--';
-}
-# Drop the last collected element: the '--' separator, or -- when no '--'
-# was given -- the final argument, in which case @c2 is empty and the usage
-# check below dies.
-pop @c1;
-my @c2=@ARGV;
-@ARGV=();
-# Both commands are required; otherwise print the usage text and exit.
-(scalar @c1 && scalar @c2) || die qq{
-usage: $0 cmd1 args -- cmd2 args
-all options are environment variables.
-DEBUG=1 env var enables debugging output.
-CROSS=1 hooks up two processes, 2nd of which has one line of output per line of input, expected by the first, which starts off the communication. crosses stdin/stderr of cmd1 and cmd2 line by line (both must flush on newline and output. cmd1 initiates the conversation (sends the first line). default: attempts to cross stdin/stdout of c1 and c2 directly (via two unidirectional posix pipes created before fork).
-SERIAL=1: (no parallelism possible) but lines exchanged are logged if DEBUG.
-if SNAKE then stdin -> c1 -> c2 -> c1 -> stdout.
-if PIPE then stdin -> c1 -> c2 -> stdout (same as shell c1|c2, but with SERIAL you can see the intermediate in real time; you could do similar with c1 | tee /dev/fd/2 |c2.
-DIRECT=1 (default) will override SERIAL=1.
-CROSS=1 (default) will override SNAKE or PIPE.
-};
-
-# Echo both parsed commands (only shown when diagnostics are enabled).
-info("1 cmd:",@c1,"\n");
-info("2 cmd:",@c2,"\n");
-
-# Write the remaining arguments to the filehandle given as first argument,
-# with autoflush turned on for that handle so each line goes out at once.
-# The previously selected default output handle is restored afterwards, so
-# the call no longer redirects later bare print statements as a side effect.
-# Returns the result of print.
-sub lineto {
-  my $fh = shift;
-  my $previous = select($fh);
-  $| = 1;              # autoflush applies to the currently selected handle
-  my $ok = print @_;
-  select($previous);
-  return $ok;
-}
-
-# Main driver. SERIAL: this script stays alive and relays lines between the
-# two commands itself. Otherwise (DIRECT): the two commands are wired to each
-# other with POSIX pipes and this process is replaced via exec.
-if ($ser eq 'SERIAL') {
- my ($R1,$W1,$R2,$W2);
- my $c1p=open2($R1,$W1,@c1); # Open2 R W backward from Open3.
- my $c2p=open2($R2,$W2,@c2);
- if ($mode eq 'CROSS') {
- # cmd1 speaks first: forward each of its lines to cmd2, then hand cmd2's
- # one-line reply back to cmd1; stop when either side stops producing.
- while(<$R1>) {
- info("1:",$_);
- lineto($W2,$_);
- last unless defined ($_=<$R2>);
- info("1|2:",$_);
- lineto($W1,$_);
- }
- } else {
- # PIPE or SNAKE: lines originate from this script's STDIN and travel
- # cmd1 -> cmd2 (-> cmd1 again when SNAKE) before reaching STDOUT.
- my $snake=$mode eq 'SNAKE';
- while(<STDIN>) {
- info("IN:",$_);
- lineto($W1,$_);
- last unless defined ($_=<$R1>);
- info("IN|1:",$_);
- lineto($W2,$_);
- last unless defined ($_=<$R2>);
- info("IN|1|2:",$_);
- if ($snake) {
- lineto($W1,$_);
- last unless defined ($_=<$R1>);
- info("IN|1|2|1:",$_);
- }
- lineto(*STDOUT,$_);
- }
- }
-} else {
- info("DIRECT mode\n");
- # Two unidirectional pipes, created before the fork so both processes
- # inherit them: @rw1 carries cmd1 -> cmd2, @rw2 carries cmd2 -> cmd1.
- my @rw1=POSIX::pipe();
- my @rw2=POSIX::pipe();
- my $pid=undef;
- $SIG{CHLD} = sub { wait };
- # fork returns undef on failure; keep retrying once per second.
- while (not defined ($pid=fork())) {
- sleep 1;
- }
- my $pipe = $mode eq 'PIPE';
- unless ($pipe) {
- POSIX::close(STDOUT_FILENO);
- POSIX::close(STDIN_FILENO);
- }
- # NOTE(review): neither process closes the original pipe descriptors after
- # dup2, so each keeps spare copies of every pipe end open across exec --
- # this looks like it could keep a reader from ever seeing EOF; confirm.
- if ($pid) {
- # Parent becomes cmd1: stdout feeds @rw1, stdin reads @rw2 (not in PIPE mode).
- POSIX::dup2($rw1[1],STDOUT_FILENO);
- POSIX::dup2($rw2[0],STDIN_FILENO) unless $pipe;
- exec @c1;
- } else {
- # Child becomes cmd2: stdin reads @rw1, stdout feeds @rw2 (not in PIPE mode).
- POSIX::dup2($rw2[1],STDOUT_FILENO) unless $pipe;
- POSIX::dup2($rw1[0],STDIN_FILENO);
- exec @c2;
- }
- # Only reached if an exec above returned, i.e. failed to start the command.
- while (wait()!=-1) {}
-}
diff --git a/vest/line_optimizer.cc b/vest/line_optimizer.cc
deleted file mode 100644
index 7303df8d..00000000
--- a/vest/line_optimizer.cc
+++ /dev/null
@@ -1,109 +0,0 @@
-#include "line_optimizer.h"
-
-#include <limits>
-#include <algorithm>
-
-#include "sparse_vector.h"
-#include "scorer.h"
-
-using namespace std;
-
-typedef ErrorSurface::const_iterator ErrorIter;
-
-// sort by increasing x-ints
-struct IntervalComp {
- bool operator() (const ErrorIter& a, const ErrorIter& b) const {
- return a->x < b->x;
- }
-};
-
-double LineOptimizer::LineOptimize(
- const vector<ErrorSurface>& surfaces,
- const LineOptimizer::ScoreType type,
- float* best_score,
- const double epsilon) {
- // cerr << "MIN=" << MINIMIZE_SCORE << " MAX=" << MAXIMIZE_SCORE << " MINE=" << type << endl;
- vector<ErrorIter> all_ints;
- for (vector<ErrorSurface>::const_iterator i = surfaces.begin();
- i != surfaces.end(); ++i) {
- const ErrorSurface& surface = *i;
- for (ErrorIter j = surface.begin(); j != surface.end(); ++j)
- all_ints.push_back(j);
- }
- sort(all_ints.begin(), all_ints.end(), IntervalComp());
- double last_boundary = all_ints.front()->x;
- ScoreP accp = all_ints.front()->delta->GetZero();
- Score *acc=accp.get();
- float& cur_best_score = *best_score;
- cur_best_score = (type == MAXIMIZE_SCORE ?
- -numeric_limits<float>::max() : numeric_limits<float>::max());
- bool left_edge = true;
- double pos = numeric_limits<double>::quiet_NaN();
- for (vector<ErrorIter>::iterator i = all_ints.begin();
- i != all_ints.end(); ++i) {
- const ErrorSegment& seg = **i;
- assert(seg.delta);
- if (seg.x - last_boundary > epsilon) {
- float sco = acc->ComputeScore();
- if ((type == MAXIMIZE_SCORE && sco > cur_best_score) ||
- (type == MINIMIZE_SCORE && sco < cur_best_score) ) {
- cur_best_score = sco;
- if (left_edge) {
- pos = seg.x - 0.1;
- left_edge = false;
- } else {
- pos = last_boundary + (seg.x - last_boundary) / 2;
- }
- // cerr << "NEW BEST: " << pos << " (score=" << cur_best_score << ")\n";
- }
- // string xx; acc->ScoreDetails(&xx); cerr << "---- " << xx;
- // cerr << "---- s=" << sco << "\n";
- last_boundary = seg.x;
- }
- // cerr << "x-boundary=" << seg.x << "\n";
- acc->PlusEquals(*seg.delta);
- }
- float sco = acc->ComputeScore();
- if ((type == MAXIMIZE_SCORE && sco > cur_best_score) ||
- (type == MINIMIZE_SCORE && sco < cur_best_score) ) {
- cur_best_score = sco;
- if (left_edge) {
- pos = 0;
- } else {
- pos = last_boundary + 1000.0;
- }
- }
- return pos;
-}
-
-// Fills *axis with one NextNormal(0,1) draw per listed feature (in list
-// order) and then divides the vector by its L2 norm.
-void LineOptimizer::RandomUnitVector(const vector<int>& features_to_optimize,
-                                     SparseVector<double>* axis,
-                                     RandomNumberGenerator<boost::mt19937>* rng) {
-  SparseVector<double>& direction = *axis;
-  direction.clear();
-  for (vector<int>::const_iterator fid = features_to_optimize.begin();
-       fid != features_to_optimize.end(); ++fid) {
-    direction.set_value(*fid, rng->NextNormal(0.0,1.0));
-  }
-  direction /= direction.l2norm();
-}
-
-// Rebuilds *dirs: optionally one axis-aligned direction per listed feature,
-// followed by additional_random_directions directions produced by
-// RandomUnitVector. Reports the total on stderr.
-void LineOptimizer::CreateOptimizationDirections(
-     const vector<int>& features_to_optimize,
-     int additional_random_directions,
-     RandomNumberGenerator<boost::mt19937>* rng,
-     vector<SparseVector<double> >* dirs
-     , bool include_orthogonal
-  ) {
-  vector<SparseVector<double> >& axes = *dirs;
-  axes.clear();
-  if (include_orthogonal) {
-    for (vector<int>::const_iterator fid = features_to_optimize.begin();
-         fid != features_to_optimize.end(); ++fid) {
-      SparseVector<double> unit;
-      unit.set_value(*fid, 1.);
-      axes.push_back(unit);
-    }
-  }
-  // Everything from this index on is filled with random directions.
-  const size_t first_random = axes.size();
-  axes.resize(first_random + additional_random_directions);
-  for (size_t k = first_random; k < axes.size(); ++k)
-    RandomUnitVector(features_to_optimize, &axes[k], rng);
-  cerr << "Generated " << axes.size() << " total axes to optimize along.\n";
-}
diff --git a/vest/line_optimizer.h b/vest/line_optimizer.h
deleted file mode 100644
index 99a591f4..00000000
--- a/vest/line_optimizer.h
+++ /dev/null
@@ -1,46 +0,0 @@
-#ifndef LINE_OPTIMIZER_H_
-#define LINE_OPTIMIZER_H_
-
-#include <vector>
-
-#include "sparse_vector.h"
-#include "error_surface.h"
-#include "sampler.h"
-
-class Weights;
-
-// Collection of static helpers for line search over merged error surfaces;
-// the struct declares no data members.
-struct LineOptimizer {
-
- // use MINIMIZE_SCORE for things like TER, WER
- // MAXIMIZE_SCORE for things like BLEU
- enum ScoreType { MAXIMIZE_SCORE, MINIMIZE_SCORE };
-
- // merge all the error surfaces together into a global
- // error surface and find (the middle of) the best segment
- //   envs        the per-sentence error surfaces to merge
- //   type        direction of optimization (see ScoreType)
- //   best_score  out-parameter receiving the score of the chosen segment
- //   epsilon     segment boundaries closer together than this are merged
- // returns the chosen position along the search line
- static double LineOptimize(
- const std::vector<ErrorSurface>& envs,
- const LineOptimizer::ScoreType type,
- float* best_score,
- const double epsilon = 1.0/65536.0);
-
- // return a random vector of length 1 where all dimensions
- // not listed in dimensions will be 0.
- static void RandomUnitVector(const std::vector<int>& dimensions,
- SparseVector<double>* axis,
- RandomNumberGenerator<boost::mt19937>* rng);
-
- // generate a list of directions to optimize; the list will
- // contain the orthogonal vectors corresponding to the dimensions in
- // primary and then additional_random_directions directions in those
- // dimensions as well. All vectors will be length 1.
- // The orthogonal (per-dimension) vectors are only included when
- // include_primary is true; *dirs is cleared before it is filled.
- static void CreateOptimizationDirections(
- const std::vector<int>& primary,
- int additional_random_directions,
- RandomNumberGenerator<boost::mt19937>* rng,
- std::vector<SparseVector<double> >* dirs
- , bool include_primary=true
- );
-
-};
-
-#endif
diff --git a/vest/lo_test.cc b/vest/lo_test.cc
deleted file mode 100644
index f5638600..00000000
--- a/vest/lo_test.cc
+++ /dev/null
@@ -1,235 +0,0 @@
-#include <cmath>
-#include <iostream>
-#include <fstream>
-
-#include <boost/shared_ptr.hpp>
-#include <gtest/gtest.h>
-
-#include "ces.h"
-#include "fdict.h"
-#include "hg.h"
-#include "kbest.h"
-#include "hg_io.h"
-#include "filelib.h"
-#include "inside_outside.h"
-#include "viterbi.h"
-#include "viterbi_envelope.h"
-#include "line_optimizer.h"
-#include "scorer.h"
-
-using namespace std;
-using boost::shared_ptr;
-
-// Fixture for the tests below. It carries no state and needs no per-test
-// setup or teardown, so the inherited (empty) SetUp/TearDown are used.
-class OptTest : public testing::Test {
-};
-
-const char* ref11 = "australia reopens embassy in manila";
-const char* ref12 = "( afp , manila , january 2 ) australia reopened its embassy in the philippines today , which was shut down about seven weeks ago due to what was described as a specific threat of a terrorist attack .";
-const char* ref21 = "australia reopened manila embassy";
-const char* ref22 = "( agence france-presse , manila , 2nd ) - australia reopened its embassy in the philippines today . the embassy was closed seven weeks ago after what was described as a specific threat of a terrorist attack .";
-const char* ref31 = "australia to reopen embassy in manila";
-const char* ref32 = "( afp report from manila , january 2 ) australia reopened its embassy in the philippines today . seven weeks ago , the embassy was shut down due to so - called confirmed terrorist attack threats .";
-const char* ref41 = "australia to re - open its embassy to manila";
-const char* ref42 = "( afp , manila , thursday ) australia reopens its embassy to manila , which was closed for the so - called \" clear \" threat of terrorist attack 7 weeks ago .";
-
-// Sanity check: 0/0 in double arithmetic is reported as NaN by isnan.
-TEST_F(OptTest, TestCheckNaN) {
-  double numerator = 0;
-  double denominator = 0;
-  double quotient = numerator / denominator;
-  EXPECT_TRUE(isnan(quotient));
-}
-
-// Multiplies two two-segment envelopes and expects three segments in the
-// product.
-TEST_F(OptTest,TestViterbiEnvelope) {
-  typedef shared_ptr<Segment> SegPtr;
-  SegPtr first_neg(new Segment(-1, 0));
-  SegPtr first_pos(new Segment(1, 0));
-  SegPtr second_neg(new Segment(-1, 1));
-  SegPtr second_pos(new Segment(1, -1));
-
-  vector<SegPtr> first_segs;
-  first_segs.push_back(first_neg);
-  first_segs.push_back(first_pos);
-  vector<SegPtr> second_segs;
-  second_segs.push_back(second_neg);
-  second_segs.push_back(second_pos);
-
-  ViterbiEnvelope a(first_segs);
-  cerr << a << endl;
-  ViterbiEnvelope b(second_segs);
-  ViterbiEnvelope c = a;
-  c *= b;
-  cerr << a << " (*) " << b << " = " << c << endl;
-  EXPECT_EQ(3, c.size());
-}
-
-// Builds a small hypergraph from inline JSON, prints its k-best derivations,
-// computes the Viterbi envelope along feature f1, moves the weights to the
-// x of the envelope's second segment, prints the k-best list again, and
-// finally prints the translation of every segment. Output goes to stderr;
-// the test makes no gtest assertions.
-TEST_F(OptTest,TestViterbiEnvelopeInside) {
- const string json = "{\"rules\":[1,\"[X] ||| a\",2,\"[X] ||| A [1]\",3,\"[X] ||| c\",4,\"[X] ||| C [1]\",5,\"[X] ||| [1] B [2]\",6,\"[X] ||| [1] b [2]\",7,\"[X] ||| X [1]\",8,\"[X] ||| Z [1]\"],\"features\":[\"f1\",\"f2\",\"Feature_1\",\"Feature_0\",\"Model_0\",\"Model_1\",\"Model_2\",\"Model_3\",\"Model_4\",\"Model_5\",\"Model_6\",\"Model_7\"],\"edges\":[{\"tail\":[],\"feats\":[],\"rule\":1}],\"node\":{\"in_edges\":[0]},\"edges\":[{\"tail\":[0],\"feats\":[0,-0.8,1,-0.1],\"rule\":2}],\"node\":{\"in_edges\":[1]},\"edges\":[{\"tail\":[],\"feats\":[1,-1],\"rule\":3}],\"node\":{\"in_edges\":[2]},\"edges\":[{\"tail\":[2],\"feats\":[0,-0.2,1,-0.1],\"rule\":4}],\"node\":{\"in_edges\":[3]},\"edges\":[{\"tail\":[1,3],\"feats\":[0,-1.2,1,-0.2],\"rule\":5},{\"tail\":[1,3],\"feats\":[0,-0.5,1,-1.3],\"rule\":6}],\"node\":{\"in_edges\":[4,5]},\"edges\":[{\"tail\":[4],\"feats\":[0,-0.5,1,-0.8],\"rule\":7},{\"tail\":[4],\"feats\":[0,-0.7,1,-0.9],\"rule\":8}],\"node\":{\"in_edges\":[6,7]}}";
- Hypergraph hg;
- istringstream instr(json);
- HypergraphIO::ReadFromJSON(&instr, &hg);
- SparseVector<double> wts;
- wts.set_value(FD::Convert("f1"), 0.4);
- wts.set_value(FD::Convert("f2"), 1.0);
- hg.Reweight(wts);
- // NOTE(review): list and features are declared but never used below.
- vector<pair<vector<WordID>, prob_t> > list;
- std::vector<SparseVector<double> > features;
- // Dump up to 10 derivations of the last node under the initial weights.
- KBest::KBestDerivations<vector<WordID>, ESentenceTraversal> kbest(hg, 10);
- for (int i = 0; i < 10; ++i) {
- const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal>::Derivation* d =
- kbest.LazyKthBest(hg.nodes_.size() - 1, i);
- if (!d) break;
- cerr << log(d->score) << " ||| " << TD::GetString(d->yield) << " ||| " << d->feature_values << endl;
- }
- // Envelope along direction f1, starting from wts.
- SparseVector<double> dir; dir.set_value(FD::Convert("f1"), 1.0);
- ViterbiEnvelopeWeightFunction wf(wts, dir);
- ViterbiEnvelope env = Inside<ViterbiEnvelope, ViterbiEnvelopeWeightFunction>(hg, NULL, wf);
- cerr << env << endl;
- const vector<boost::shared_ptr<Segment> >& segs = env.GetSortedSegs();
- // Move the weights along dir by the x of the second segment
- // (segs[1] is read without checking that it exists).
- dir *= segs[1]->x;
- wts += dir;
- hg.Reweight(wts);
- // Dump the k-best list again under the shifted weights.
- KBest::KBestDerivations<vector<WordID>, ESentenceTraversal> kbest2(hg, 10);
- for (int i = 0; i < 10; ++i) {
- const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal>::Derivation* d =
- kbest2.LazyKthBest(hg.nodes_.size() - 1, i);
- if (!d) break;
- cerr << log(d->score) << " ||| " << TD::GetString(d->yield) << " ||| " << d->feature_values << endl;
- }
- // Print the translation associated with each envelope segment.
- for (int i = 0; i < segs.size(); ++i) {
- cerr << "seg=" << i << endl;
- vector<WordID> trans;
- segs[i]->ConstructTranslation(&trans);
- cerr << TD::GetString(trans) << endl;
- }
-}
-
-// End-to-end check on two serialized hypergraphs: builds Viterbi envelopes
-// along the first generated direction, converts them to error surfaces with
-// an ibm_bleu scorer, runs LineOptimize and expects a fixed best score.
-TEST_F(OptTest, TestS1) {
- int fPhraseModel_0 = FD::Convert("PhraseModel_0");
- int fPhraseModel_1 = FD::Convert("PhraseModel_1");
- int fPhraseModel_2 = FD::Convert("PhraseModel_2");
- int fLanguageModel = FD::Convert("LanguageModel");
- int fWordPenalty = FD::Convert("WordPenalty");
- int fPassThrough = FD::Convert("PassThrough");
- // Starting point of the line search.
- SparseVector<double> wts;
- wts.set_value(fWordPenalty, 4.25);
- wts.set_value(fLanguageModel, -1.1165);
- wts.set_value(fPhraseModel_0, -0.96);
- wts.set_value(fPhraseModel_1, -0.65);
- wts.set_value(fPhraseModel_2, -0.77);
- wts.set_value(fPassThrough, -10.0);
-
- // Features handed to the direction generator (PassThrough is left out).
- vector<int> to_optimize;
- to_optimize.push_back(fWordPenalty);
- to_optimize.push_back(fLanguageModel);
- to_optimize.push_back(fPhraseModel_0);
- to_optimize.push_back(fPhraseModel_1);
- to_optimize.push_back(fPhraseModel_2);
-
- // Paths are relative, so the test expects ./test_data in the working directory.
- Hypergraph hg;
- ReadFile rf("./test_data/0.json.gz");
- HypergraphIO::ReadFromJSON(rf.stream(), &hg);
- hg.Reweight(wts);
-
- Hypergraph hg2;
- ReadFile rf2("./test_data/1.json.gz");
- HypergraphIO::ReadFromJSON(rf2.stream(), &hg2);
- hg2.Reweight(wts);
-
- // Four references for each of the two sentences.
- vector<vector<WordID> > refs1(4);
- TD::ConvertSentence(ref11, &refs1[0]);
- TD::ConvertSentence(ref21, &refs1[1]);
- TD::ConvertSentence(ref31, &refs1[2]);
- TD::ConvertSentence(ref41, &refs1[3]);
- vector<vector<WordID> > refs2(4);
- TD::ConvertSentence(ref12, &refs2[0]);
- TD::ConvertSentence(ref22, &refs2[1]);
- TD::ConvertSentence(ref32, &refs2[2]);
- TD::ConvertSentence(ref42, &refs2[3]);
- ScoreType type = ScoreTypeFromString("ibm_bleu");
- ScorerP scorer1 = SentenceScorer::CreateSentenceScorer(type, refs1);
- ScorerP scorer2 = SentenceScorer::CreateSentenceScorer(type, refs2);
- vector<ViterbiEnvelope> envs(2);
-
- RandomNumberGenerator<boost::mt19937> rng;
-
- vector<SparseVector<double> > axes; // directions to search
- LineOptimizer::CreateOptimizationDirections(
- to_optimize,
- 10,
- &rng,
- &axes);
- // One axis per listed feature plus the 10 random ones requested above.
- assert(axes.size() == 10 + to_optimize.size());
- for (int i = 0; i < axes.size(); ++i)
- cerr << axes[i] << endl;
- // Only the first generated direction is actually searched.
- const SparseVector<double>& axis = axes[0];
-
- cerr << "Computing Viterbi envelope using inside algorithm...\n";
- cerr << "axis: " << axis << endl;
- clock_t t_start=clock();
- ViterbiEnvelopeWeightFunction wf(wts, axis); // wts = starting point, axis = search direction
- envs[0] = Inside<ViterbiEnvelope, ViterbiEnvelopeWeightFunction>(hg, NULL, wf);
- envs[1] = Inside<ViterbiEnvelope, ViterbiEnvelopeWeightFunction>(hg2, NULL, wf);
-
- vector<ErrorSurface> es(2);
- ComputeErrorSurface(*scorer1, envs[0], &es[0], IBM_BLEU, hg);
- ComputeErrorSurface(*scorer2, envs[1], &es[1], IBM_BLEU, hg2);
- cerr << envs[0].size() << " " << envs[1].size() << endl;
- cerr << es[0].size() << " " << es[1].size() << endl;
- envs.clear();
- clock_t t_env=clock();
- float score;
- double m = LineOptimizer::LineOptimize(es, LineOptimizer::MAXIMIZE_SCORE, &score);
- clock_t t_opt=clock();
- cerr << "line optimizer returned: " << m << " (SCORE=" << score << ")\n";
- EXPECT_FLOAT_EQ(0.48719698, score);
- // res = wts + m * axis: the weights at the optimum found on the line.
- SparseVector<double> res = axis;
- res *= m;
- res += wts;
- cerr << "res: " << res << endl;
- // NOTE(review): clock() ticks are divided by 1000.0 rather than
- // CLOCKS_PER_SEC, so the unit of the printed timings depends on the platform.
- cerr << "ENVELOPE PROCESSING=" << (static_cast<double>(t_env - t_start) / 1000.0) << endl;
- cerr << " LINE OPTIMIZATION=" << (static_cast<double>(t_opt - t_env) / 1000.0) << endl;
- // Print the Viterbi translations under the optimized weights.
- hg.Reweight(res);
- hg2.Reweight(res);
- vector<WordID> t1,t2;
- ViterbiESentence(hg, &t1);
- ViterbiESentence(hg2, &t2);
- cerr << TD::GetString(t1) << endl;
- cerr << TD::GetString(t2) << endl;
-}
-
-// Builds a hypergraph from inline JSON, reweights it with a single
-// PassThrough weight, prints the k-best derivations, then computes the
-// Viterbi envelope along the Glue feature and the corresponding ibm_bleu
-// error surface. The test makes no gtest assertions; it only has to run
-// through.
-TEST_F(OptTest,TestZeroOrigin) {
- const string json = "{\"rules\":[1,\"[X7] ||| blA ||| without ||| LHSProb=3.92173 LexE2F=2.90799 LexF2E=1.85003 GenerativeProb=10.5381 RulePenalty=1 XFE=2.77259 XEF=0.441833 LabelledEF=2.63906 LabelledFE=4.96981 LogRuleCount=0.693147\",2,\"[X7] ||| blA ||| except ||| LHSProb=4.92173 LexE2F=3.90799 LexF2E=1.85003 GenerativeProb=11.5381 RulePenalty=1 XFE=2.77259 XEF=1.44183 LabelledEF=2.63906 LabelledFE=4.96981 LogRuleCount=1.69315\",3,\"[S] ||| [X7,1] ||| [1] ||| GlueTop=1\",4,\"[X28] ||| EnwAn ||| title ||| LHSProb=3.96802 LexE2F=2.22462 LexF2E=1.83258 GenerativeProb=10.0863 RulePenalty=1 XFE=0 XEF=1.20397 LabelledEF=1.20397 LabelledFE=-1.98341e-08 LogRuleCount=1.09861\",5,\"[X0] ||| EnwAn ||| funny ||| LHSProb=3.98479 LexE2F=1.79176 LexF2E=3.21888 GenerativeProb=11.1681 RulePenalty=1 XFE=0 XEF=2.30259 LabelledEF=2.30259 LabelledFE=0 LogRuleCount=0 SingletonRule=1\",6,\"[X8] ||| [X7,1] EnwAn ||| entitled [1] ||| LHSProb=3.82533 LexE2F=3.21888 LexF2E=2.52573 GenerativeProb=11.3276 RulePenalty=1 XFE=1.20397 XEF=1.20397 LabelledEF=2.30259 LabelledFE=2.30259 LogRuleCount=0 SingletonRule=1\",7,\"[S] ||| [S,1] [X28,2] ||| [1] [2] ||| Glue=1\",8,\"[S] ||| [S,1] [X0,2] ||| [1] [2] ||| Glue=1\",9,\"[S] ||| [X8,1] ||| [1] ||| GlueTop=1\",10,\"[Goal] ||| [S,1] ||| 
[1]\"],\"features\":[\"PassThrough\",\"Glue\",\"GlueTop\",\"LanguageModel\",\"WordPenalty\",\"LHSProb\",\"LexE2F\",\"LexF2E\",\"GenerativeProb\",\"RulePenalty\",\"XFE\",\"XEF\",\"LabelledEF\",\"LabelledFE\",\"LogRuleCount\",\"SingletonRule\"],\"edges\":[{\"tail\":[],\"spans\":[0,1,-1,-1],\"feats\":[5,3.92173,6,2.90799,7,1.85003,8,10.5381,9,1,10,2.77259,11,0.441833,12,2.63906,13,4.96981,14,0.693147],\"rule\":1},{\"tail\":[],\"spans\":[0,1,-1,-1],\"feats\":[5,4.92173,6,3.90799,7,1.85003,8,11.5381,9,1,10,2.77259,11,1.44183,12,2.63906,13,4.96981,14,1.69315],\"rule\":2}],\"node\":{\"in_edges\":[0,1],\"cat\":\"X7\"},\"edges\":[{\"tail\":[0],\"spans\":[0,1,-1,-1],\"feats\":[2,1],\"rule\":3}],\"node\":{\"in_edges\":[2],\"cat\":\"S\"},\"edges\":[{\"tail\":[],\"spans\":[1,2,-1,-1],\"feats\":[5,3.96802,6,2.22462,7,1.83258,8,10.0863,9,1,11,1.20397,12,1.20397,13,-1.98341e-08,14,1.09861],\"rule\":4}],\"node\":{\"in_edges\":[3],\"cat\":\"X28\"},\"edges\":[{\"tail\":[],\"spans\":[1,2,-1,-1],\"feats\":[5,3.98479,6,1.79176,7,3.21888,8,11.1681,9,1,11,2.30259,12,2.30259,15,1],\"rule\":5}],\"node\":{\"in_edges\":[4],\"cat\":\"X0\"},\"edges\":[{\"tail\":[0],\"spans\":[0,2,-1,-1],\"feats\":[5,3.82533,6,3.21888,7,2.52573,8,11.3276,9,1,10,1.20397,11,1.20397,12,2.30259,13,2.30259,15,1],\"rule\":6}],\"node\":{\"in_edges\":[5],\"cat\":\"X8\"},\"edges\":[{\"tail\":[1,2],\"spans\":[0,2,-1,-1],\"feats\":[1,1],\"rule\":7},{\"tail\":[1,3],\"spans\":[0,2,-1,-1],\"feats\":[1,1],\"rule\":8},{\"tail\":[4],\"spans\":[0,2,-1,-1],\"feats\":[2,1],\"rule\":9}],\"node\":{\"in_edges\":[6,7,8],\"cat\":\"S\"},\"edges\":[{\"tail\":[5],\"spans\":[0,2,-1,-1],\"feats\":[],\"rule\":10}],\"node\":{\"in_edges\":[9],\"cat\":\"Goal\"}}";
- Hypergraph hg;
- istringstream instr(json);
- HypergraphIO::ReadFromJSON(&instr, &hg);
- // The only weight set is PassThrough.
- SparseVector<double> wts;
- wts.set_value(FD::Convert("PassThrough"), -0.929201533002898);
- hg.Reweight(wts);
-
- // NOTE(review): list and features are declared but never used below.
- vector<pair<vector<WordID>, prob_t> > list;
- std::vector<SparseVector<double> > features;
- // Dump up to 10 derivations of the last node.
- KBest::KBestDerivations<vector<WordID>, ESentenceTraversal> kbest(hg, 10);
- for (int i = 0; i < 10; ++i) {
- const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal>::Derivation* d =
- kbest.LazyKthBest(hg.nodes_.size() - 1, i);
- if (!d) break;
- cerr << log(d->score) << " ||| " << TD::GetString(d->yield) << " ||| " << d->feature_values << endl;
- }
-
- SparseVector<double> axis; axis.set_value(FD::Convert("Glue"),1.0);
- ViterbiEnvelopeWeightFunction wf(wts, axis); // wts = starting point, axis = search direction
- vector<ViterbiEnvelope> envs(1);
- envs[0] = Inside<ViterbiEnvelope, ViterbiEnvelopeWeightFunction>(hg, NULL, wf);
-
- // Four references for the single sentence.
- ScoreType type = ScoreTypeFromString("ibm_bleu");
- vector<vector<WordID> > mr(4);
- TD::ConvertSentence("untitled", &mr[0]);
- TD::ConvertSentence("with no title", &mr[1]);
- TD::ConvertSentence("without a title", &mr[2]);
- TD::ConvertSentence("without title", &mr[3]);
- ScorerP scorer1 = SentenceScorer::CreateSentenceScorer(type, mr);
- vector<ErrorSurface> es(1);
- ComputeErrorSurface(*scorer1, envs[0], &es[0], IBM_BLEU, hg);
-}
-
-// Test runner entry point: lets gtest consume its command-line flags, then
-// runs every registered test and returns the overall result.
-int main(int argc, char **argv) {
-  testing::InitGoogleTest(&argc, argv);
-  const int result = RUN_ALL_TESTS();
-  return result;
-}
-
diff --git a/vest/mbr_kbest.cc b/vest/mbr_kbest.cc
deleted file mode 100644
index 2867b36b..00000000
--- a/vest/mbr_kbest.cc
+++ /dev/null
@@ -1,138 +0,0 @@
-#include <iostream>
-#include <vector>
-
-#include <boost/program_options.hpp>
-
-#include "prob.h"
-#include "tdict.h"
-#include "scorer.h"
-#include "filelib.h"
-#include "stringlib.h"
-
-using namespace std;
-
-namespace po = boost::program_options;
-
-// Parses the command line into *conf. Prints the option summary to stderr
-// and exits with status 1 when --help/-h is given.
-//   argc, argv  the program's command line
-//   conf        out: parsed option values (defaults applied)
-void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
-  po::options_description opts("Configuration options");
-  opts.add_options()
-        ("scale,a",po::value<double>()->default_value(1.0), "Posterior scaling factor (alpha)")
-        ("loss_function,l",po::value<string>()->default_value("bleu"), "Loss function")
-        ("input,i",po::value<string>()->default_value("-"), "File to read k-best lists from")
-        ("output_list,L", "Show reranked list as output")
-        ("help,h", "Help");
-  po::options_description dcmdline_options;
-  dcmdline_options.add(opts);
-  po::store(po::parse_command_line(argc, argv, dcmdline_options), *conf);
-  // Finalize the stored values, as the library expects after store().
-  po::notify(*conf);
-  if (conf->count("help")) {
-    cerr << dcmdline_options << endl;
-    exit(1);
-  }
-}
-
-// Orders (hypothesis, loss) pairs by ascending loss.
-// The call operator is a template so that the pairs are compared in place,
-// whatever their exact second type: with a fixed pair<..., double> parameter
-// every comparison of pair<..., prob_t> elements had to build two converted
-// temporaries, copying both word vectors. The values are still compared as
-// doubles, exactly as before.
-struct LossComparer {
-  template <class Entry>
-  bool operator()(const Entry& a, const Entry& b) const {
-    return static_cast<double>(a.second) < static_cast<double>(b.second);
-  }
-};
-
-// Reads the next group of consecutive k-best lines that share a sentence id.
-// Each line has the form "id ||| hypothesis ||| ... ||| score"; the score is
-// the text after the last " ||| " and is stored as a log value.
-//
-//   in       stream to read from
-//   sent_id  out: id of the group returned in *list
-//   list     out: (hypothesis, score) entries of that group
-// Returns false once no further entries are available. Aborts on a line
-// with fewer than two separators.
-//
-// The first line of the following group has to be read to detect the end of
-// the current one; it is kept in function-local statics until the next call,
-// so the function serves one input stream at a time.
-bool ReadKBestList(istream* in, string* sent_id, vector<pair<vector<WordID>, prob_t> >* list) {
-  static string cache_id;
-  static pair<vector<WordID>, prob_t> cache_pair;
-  // Explicit flag instead of testing for a non-empty hypothesis, so that a
-  // look-ahead entry whose hypothesis is empty is not silently dropped.
-  static bool cache_pending = false;
-  list->clear();
-  string cur_id;
-  if (cache_pending) {
-    list->push_back(cache_pair);
-    cur_id = cache_id;
-    // Report the id here as well: if this entry is the whole group, the
-    // loop below never gets to set it and the previous id would leak out.
-    *sent_id = cur_id;
-    cache_pair.first.clear();
-    cache_pending = false;
-  }
-  string line;
-  string tstr;
-  while (getline(*in, line)) {
-    if (line.empty()) continue;
-    const size_t p1 = line.find(" ||| ");
-    if (p1 == string::npos) { cerr << "Bad format: " << line << endl; abort(); }
-    // Searching from p1 + 4 lets the second separator share the blank that
-    // ends the first one, which is how an empty hypothesis field shows up.
-    const size_t p2 = line.find(" ||| ", p1 + 4);
-    if (p2 == string::npos) { cerr << "Bad format: " << line << endl; abort(); }
-    const size_t p3 = line.rfind(" ||| ");
-    cache_id = line.substr(0, p1);
-    const size_t hyp_begin = p1 + 5;
-    if (p2 > hyp_begin)
-      tstr = line.substr(hyp_begin, p2 - hyp_begin);
-    else
-      tstr.clear();  // empty hypothesis; avoids a wrapped-around length
-    const double val = strtod(line.substr(p3 + 5).c_str(), NULL);
-    TD::ConvertSentence(tstr, &cache_pair.first);
-    cache_pair.second.logeq(val);
-    if (cur_id.empty()) cur_id = cache_id;
-    if (cur_id != cache_id) {
-      // First line of the next group: keep it for the next call.
-      cache_pending = true;
-      break;
-    }
-    list->push_back(cache_pair);
-    *sent_id = cur_id;
-    cache_pair.first.clear();
-  }
-  return !list->empty();
-}
-
-// For every k-best list in the input, picks the hypothesis with the lowest
-// accumulated posterior-weighted loss against the other hypotheses and
-// prints it, or, with --output_list, prints the whole list re-sorted by
-// that loss.
-int main(int argc, char** argv) {
-  po::variables_map conf;
-  InitCommandLine(argc, argv, &conf);
-  const string metric = conf["loss_function"].as<string>();
-  const bool output_list = conf.count("output_list") > 0;
-  const string file = conf["input"].as<string>();
-  const double mbr_scale = conf["scale"].as<double>();
-  cerr << "Posterior scaling factor (alpha) = " << mbr_scale << endl;
-
-  ScoreType type = ScoreTypeFromString(metric);
-  vector<pair<vector<WordID>, prob_t> > kbest;
-  ReadFile rf(file);
-  string sent_id;
-  while(ReadKBestList(rf.stream(), &sent_id, &kbest)) {
-    const int n = kbest.size();
-
-    // Scaled scores relative to the first entry, and their sum.
-    vector<prob_t> joints(n);
-    const prob_t max_score = pow(kbest.front().second, mbr_scale);
-    prob_t marginal = prob_t::Zero();
-    for (int k = 0; k < n; ++k) {
-      joints[k] = pow(kbest[k].second, mbr_scale) / max_score;
-      marginal += joints[k];
-    }
-
-    int mbr_idx = -1;
-    vector<double> mbr_scores(output_list ? kbest.size() : 0);
-    double mbr_loss = numeric_limits<double>::max();
-    for (int cand = 0; cand < n; ++cand) {
-      // The candidate serves as the single reference for all the others.
-      vector<vector<WordID> > refs(1, kbest[cand].first);
-      ScorerP scorer = SentenceScorer::CreateSentenceScorer(type, refs);
-      double wl_acc = 0;
-      for (int other = 0; other < n; ++other) {
-        if (other == cand) continue;
-        ScoreP s = scorer->ScoreCandidate(kbest[other].first);
-        double loss = 1.0 - s->ComputeScore();
-        if (type == TER || type == AER) loss = 1.0 - loss;
-        double weighted_loss = loss * (joints[other] / marginal);
-        wl_acc += weighted_loss;
-        // Already worse than the best so far; the exact total is only
-        // needed when the full list is printed.
-        if ((!output_list) && wl_acc > mbr_loss) break;
-      }
-      if (output_list) mbr_scores[cand] = wl_acc;
-      if (wl_acc < mbr_loss) {
-        mbr_loss = wl_acc;
-        mbr_idx = cand;
-      }
-    }
-
-    cerr << "MBR Best idx: " << mbr_idx << endl;
-    if (!output_list) {
-      cout << TD::GetString(kbest[mbr_idx].first) << endl;
-      continue;
-    }
-    for (int k = 0; k < n; ++k)
-      kbest[k].second.logeq(mbr_scores[k]);
-    sort(kbest.begin(), kbest.end(), LossComparer());
-    for (int k = 0; k < n; ++k)
-      cout << sent_id << " ||| "
-           << TD::GetString(kbest[k].first) << " ||| "
-           << log(kbest[k].second) << endl;
-  }
-  return 0;
-}
-
diff --git a/vest/mr_vest_generate_mapper_input.cc b/vest/mr_vest_generate_mapper_input.cc
deleted file mode 100644
index 0c094fd5..00000000
--- a/vest/mr_vest_generate_mapper_input.cc
+++ /dev/null
@@ -1,320 +0,0 @@
-//TODO: debug segfault when references supplied, null shared_ptr when oracle
-#include <iostream>
-#include <vector>
-#include <sstream>
-
-#include <boost/program_options.hpp>
-#include <boost/program_options/variables_map.hpp>
-
-#include "sampler.h"
-#include "filelib.h"
-#include "weights.h"
-#include "line_optimizer.h"
-#include "hg.h"
-#include "hg_io.h"
-#include "scorer.h"
-#include "oracle_bleu.h"
-#include "ff_bleu.h"
-
-const bool DEBUG_ORACLE=true;
-
-//TODO: decide on cdec_ff ffs, or just bleumodel - if just bleumodel, then do existing features on serialized hypergraphs remain? weights (origin) is passed to oracle_bleu.h:ComputeOracle
-//void register_feature_functions();
-//FFRegistry ff_registry;
-// File-local helpers that set up the feature-function registry for this tool.
-namespace {
-// Empties ff_registry and then registers a factory for BLEUModel only.
-void init_bleumodel() {
- ff_registry.clear();
- ff_registry.Register(new FFFactory<BLEUModel>);
-}
-
-// Helper type whose constructor runs init_bleumodel(); its only instance
-// (below) is commented out, so nothing is registered at static-init time.
-struct init_ff {
- init_ff() {
- init_bleumodel();
- }
-};
-//init_ff reg; // order of initialization? ff_registry may not be init yet. call in Run() instead.
-}
-
-using namespace std;
-namespace po = boost::program_options;
-
-typedef SparseVector<double> Dir;
-typedef Dir Point;
-
-// Removes near-duplicate directions from dirs in place.
-//   dirs     - candidate directions; shrunk to the surviving ones on return.
-//   min_dist - two directions are merged when their Tanimoto coefficient
-//              exceeds 1-min_dist; a value <= 0 disables filtering.
-//   log      - optional stream for diagnostics (may be NULL).
-//   avg      - when true the survivor becomes the average of the merged pair.
-//   verbose  - when true every pairwise similarity is logged.
-void compress_similar(vector<Dir> &dirs,double min_dist,ostream *log=&cerr,bool avg=true,bool verbose=true) {
-  if (min_dist<=0) return;
-  const double max_s=1.-min_dist;
-  if (log&&verbose) *log<<"max allowed S="<<max_s<<endl;
-  unsigned N=dirs.size();
-  for (unsigned i=0;i<N;++i) {
-    unsigned j=i+1;
-    while (j<N) {
-      const double s=dirs[i].tanimoto_coef(dirs[j]);
-      if (log&&verbose) *log<<"S["<<i<<","<<j<<"]="<<s<<' ';
-      if (s<=max_s) {
-        ++j;
-        continue;
-      }
-      if (log) *log << "Collapsing similar directions (T="<<s<<" > "<<max_s<<"). dirs["<<i<<"]="<<dirs[i]<<" dirs["<<j<<"]="<<dirs[j]<<endl;
-      if (avg) {
-        dirs[i]+=dirs[j];
-        dirs[i]/=2.;
-        if (log) *log<<" averaged="<<dirs[i];
-      }
-      if (log) *log<<endl;
-      // Move the last live direction into slot j and shrink the live range.
-      // j is deliberately NOT advanced: the element just swapped in has not
-      // been compared against dirs[i] yet.
-      swap(dirs[j],dirs[--N]);
-    }
-    if (log&&verbose) *log<<endl;
-  }
-  dirs.resize(N);
-}
-
-// Command-line driver state for generating mapper input: it collects primary,
-// random and oracle-based search directions and prints one line per
-// (sentence, direction) pair.
-struct oracle_directions {
-  MT19937 rng;
-  OracleBleu oracle;
-  vector<Dir> directions;
-
-  bool start_random;     // --oracle_start_random
-  bool include_primary;  // false when --no_primary is given
-  bool old_to_hope;      // false when --no_old_to_hope is given
-  bool fear_to_hope;     // --fear_to_hope
-  unsigned n_random;     // --random_directions
-
-  // Appends the primary (optional) and n_random random directions over fids.
-  void AddPrimaryAndRandomDirections() {
-    LineOptimizer::CreateOptimizationDirections(
-      fids,n_random,&rng,&directions,include_primary);
-  }
-
-  // Writes "<forest file> <sentence id> <origin> <direction>" to stdout for
-  // every sentence and every direction.
-  void Print() {
-    for (unsigned i = 0; i < dev_set_size; ++i)
-      for (unsigned j = 0; j < directions.size(); ++j) {
-        cout << forest_file(i) <<" " << i<<" ";
-        print(cout,origin,"=",";");
-        cout<<" ";
-        print(cout,directions[j],"=",";");
-        cout<<"\n";
-      }
-  }
-
-  // Registers the oracle's options plus this tool's own options on opts.
-  void AddOptions(po::options_description *opts) {
-    oracle.AddOptions(opts);
-    opts->add_options()
-      ("dev_set_size,s",po::value<unsigned>(&dev_set_size),"[REQD] Development set size (# of parallel sentences)")
-      ("forest_repository,r",po::value<string>(&forest_repository),"[REQD] Path to forest repository")
-      ("weights,w",po::value<string>(&weights_file),"[REQD] Current feature weights file")
-      ("optimize_feature,o",po::value<vector<string> >(), "Feature to optimize (if none specified, all weights listed in the weights file will be optimized)")
-      ("random_directions,d",po::value<unsigned>(&n_random)->default_value(10),"Number of random directions to run the line optimizer in")
-      ("no_primary,n","don't use the primary (orthogonal each feature alone) directions")
-      ("oracle_directions,O",po::value<unsigned>(&n_oracle)->default_value(0),"read the forests and choose this many directions based on heading toward a hope max (bleu+modelscore) translation.")
-      ("oracle_start_random",po::bool_switch(&start_random),"sample random subsets of dev set for ALL oracle directions, not just those after a sequential run through it")
-      ("oracle_batch,b",po::value<unsigned>(&oracle_batch)->default_value(10),"to produce each oracle direction, sum the 'gradient' over this many sentences")
-      ("max_similarity,m",po::value<double>(&max_similarity)->default_value(0),"remove directions that are too similar (Tanimoto coeff. less than (1-this)). 0 means don't filter, 1 means only 1 direction allowed?")
-      ("fear_to_hope,f",po::bool_switch(&fear_to_hope),"for each of the oracle_directions, also include a direction from fear to hope (as well as origin to hope)")
-      ("no_old_to_hope","don't emit the usual old -> hope oracle")
-      ("decoder_translations",po::value<string>(&decoder_translations_file)->default_value(""),"one per line decoder 1best translations for computing document BLEU vs. sentences-seen-so-far BLEU")
-      ;
-  }
-
-  // Parses argv into *conf. Prints the option summary and exits(0) on --help;
-  // prints the first problem found plus the summary and exits(1) when a
-  // required option is missing.
-  void InitCommandLine(int argc, char *argv[], po::variables_map *conf) {
-    po::options_description opts("Configuration options");
-    AddOptions(&opts);
-    opts.add_options()("help,h", "Help");
-
-    po::options_description dcmdline_options;
-    dcmdline_options.add(opts);
-    po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
-    po::notify(*conf);
-
-    // Honour --help before validating, so "-h" alone is not reported as an error.
-    if (conf->count("help")) {
-      cout << dcmdline_options << endl;
-      exit(0);
-    }
-
-    const char *problem = 0;
-    if (conf->count("dev_set_size") == 0)
-      problem = "Please specify the size of the development set using -s N\n";
-    else if (conf->count("weights") == 0)
-      problem = "Please specify the starting-point weights using -w <weightfile.txt>\n";
-    else if (conf->count("forest_repository") == 0)
-      problem = "Please specify the forest repository location using -r <DIR>\n";
-    else if (n_oracle && oracle.refs.empty())
-      problem = "Specify references when using oracle directions\n";
-    if (!problem) return;
-
-    cerr << problem;
-    cerr << dcmdline_options << endl;
-    exit(1);
-  }
-
-  // Program body: parse options, register BLEUModel, configure, run.
-  int main(int argc, char *argv[]) {
-    po::variables_map conf;
-    InitCommandLine(argc,argv,&conf);
-    init_bleumodel();
-    UseConf(conf);
-    Run();
-    return 0;
-  }
-  bool verbose() const { return oracle.verbose; }
-
-  // Builds all directions, collapses similar ones and prints the result.
-  void Run() {
-//    register_feature_functions();
-    AddPrimaryAndRandomDirections();
-    AddOracleDirections();
-    compress_similar(directions,max_similarity,&cerr,true,verbose());
-    Print();
-  }
-
-
-  Point origin; // old weights that gave model 1best.
-  vector<string> optimize_features;
-
-  // Copies the flag-style options out of conf and then calls Init().
-  void UseConf(po::variables_map const& conf) {
-    oracle.UseConf(conf);
-    include_primary=!conf.count("no_primary");
-    old_to_hope=!conf.count("no_old_to_hope");
-
-    if (conf.count("optimize_feature") > 0)
-      optimize_features=conf["optimize_feature"].as<vector<string> >();
-    Init();
-  }
-
-  string weights_file;
-  double max_similarity;
-  unsigned n_oracle, oracle_batch;
-  string forest_repository;
-  unsigned dev_set_size;
-  vector<Oracle> oracles;
-  vector<int> fids;
-
-  // Path of the serialized forest for sentence i: <repo>/<i>.json.gz
-  string forest_file(unsigned i) const {
-    ostringstream o;
-    o << forest_repository << '/' << i << ".json.gz";
-    return o.str();
-  }
-
-  // start_random gets a defined value here because Init() no longer resets it;
-  // option parsing overwrites it afterwards.
-  oracle_directions() { start_random=false; }
-
-  Sentences model_hyps;
-
-  vector<ScoreP> model_scores;
-  bool have_doc;
-
-  // Loads the optional decoder translations (accumulating their scores into
-  // the document score), reads the weights file and fills origin and fids.
-  void Init() {
-    have_doc=!decoder_translations_file.empty();
-    if (have_doc) {
-      model_hyps.Load(decoder_translations_file);
-      if (verbose()) model_hyps.Print(cerr,5);
-      model_scores.resize(model_hyps.size());
-      if (dev_set_size!=model_hyps.size()) {
-        cerr<<"You supplied decoder_translations with a different number of lines ("<<model_hyps.size()<<") than dev_set_size ("<<dev_set_size<<")"<<endl;
-        abort();
-      }
-      cerr << "Scoring model translations " << model_hyps << endl;
-      for (unsigned i=0;i<model_hyps.size();++i) {
-        //TODO: what is scoreCcand? without clipping? do without for consistency w/ oracle
-        model_scores[i]=oracle.ds[i]->ScoreCandidate(model_hyps[i]);
-        assert(model_scores[i]);
-        if (verbose()) cerr<<"Before model["<<i<<"]: "<<ds().ScoreDetails()<<endl;
-        if (verbose()) cerr<<"model["<<i<<"]: "<<model_scores[i]->ScoreDetails()<<endl;
-        oracle.doc_score->PlusEquals(*model_scores[i]);
-        if (verbose()) cerr<<"After model["<<i<<"]: "<<ds().ScoreDetails()<<endl;
-      }
-      //TODO: compute doc bleu stats for each sentence, then when getting oracle temporarily exclude stats for that sentence (skip regular score updating)
-    }
-    // NOTE: start_random is intentionally left alone here. It used to be reset
-    // to false, which made the --oracle_start_random switch a no-op.
-    cerr << "Forest repo: " << forest_repository << endl;
-    assert(DirectoryExists(forest_repository));
-    vector<string> features;
-    vector<weight_t> dorigin;
-    Weights::InitFromFile(weights_file, &dorigin, &features);
-    if (optimize_features.size())
-      features=optimize_features;
-    Weights::InitSparseVector(dorigin, &origin);
-    fids.clear();
-    AddFeatureIds(features);
-    oracles.resize(dev_set_size);
-  }
-
-  // Appends the feature ids of the given feature names to fids.
-  void AddFeatureIds(vector<string> const& features) {
-    // Keep the destination offset separate from the source index so that
-    // appending to a non-empty fids converts every name and stays in bounds.
-    const size_t base = fids.size();
-    fids.resize(base+features.size());
-    for (size_t k = 0; k < features.size(); ++k)
-      fids[base+k] = FD::Convert(features[k]);
-  }
-
-
-  std::string decoder_translations_file; // one per line
-  //TODO: is it worthwhile to get a complete document bleu first? would take a list of 1best translations one per line from the decoders, rather than loading all the forests (expensive). translations are in run.raw.N.gz - new arg
-
-  // Adds scale times sentence i's model score to the document score.
-  void adjust_doc(unsigned i,double scale=1.) {
-    oracle.doc_score->PlusEquals(*model_scores[i],scale);
-  }
-
-  Score &ds() {
-    return *oracle.doc_score;
-  }
-
-  // Returns the cached oracle for sentence i, computing it from the forest
-  // file on first use. While computing, sentence i's own model score is
-  // temporarily removed from the document score when one is available.
-  Oracle const& ComputeOracle(unsigned i) {
-    Oracle &o=oracles[i];
-    if (o.is_null()) {
-      if (have_doc) {
-        if (verbose()) cerr<<"Before removing i="<<i<<" "<<ds().ScoreDetails()<<"\n";
-        adjust_doc(i,-1);
-      }
-      ReadFile rf(forest_file(i));
-      Hypergraph hg;
-      {
-        Timer t("Loading forest from JSON "+forest_file(i));
-        HypergraphIO::ReadFromJSON(rf.stream(), &hg);
-      }
-      if (verbose()) cerr<<"Before oracle["<<i<<"]: "<<ds().ScoreDetails()<<endl;
-      o=oracle.ComputeOracle(oracle.MakeMetadata(hg,i),&hg,origin);
-      if (verbose()) {
-        cerr << o;
-        ScoreP hopesc=oracle.GetScore(o.hope.sentence,i);
-        oracle.doc_score->PlusEquals(*hopesc,1);
-        cerr<<"With hope: "<<ds().ScoreDetails()<<endl;
-        oracle.doc_score->PlusEquals(*hopesc,-1);
-        cerr<<"Without hope: "<<ds().ScoreDetails()<<endl;
-        cerr<<" oracle="<<oracle.GetScore(o.hope.sentence,i)->ScoreDetails()<<endl
-            <<" model="<<oracle.GetScore(o.model.sentence,i)->ScoreDetails()<<endl;
-        if (have_doc)
-          cerr<<" doc (should = model): "<<model_scores[i]->ScoreDetails()<<endl;
-      }
-      if (have_doc) {
-        adjust_doc(i,1);
-      } else
-        oracle.IncludeLastScore();
-    }
-    return o;
-  }
-
-  // if start_random is true, immediately sample w/ replacement from src sentences; otherwise, consume them sequentially until exhausted, then random. oracle vectors are summed
-  void AddOracleDirections() {
-    MT19937::IntRNG rsg=rng.inclusive(0,dev_set_size-1);
-    unsigned b=0;
-    for(unsigned i=0;i<n_oracle;++i) {
-      Dir o2hope;
-      Dir fear2hope;
-      for (unsigned j=0;j<oracle_batch;++j,++b) {
-        Oracle const& o=ComputeOracle((start_random||b>=dev_set_size) ? rsg() : b);
-
-        if (old_to_hope)
-          o2hope+=o.ModelHopeGradient();
-        if (fear_to_hope)
-          fear2hope+=o.FearHopeGradient();
-      }
-      // Each emitted direction is the batch average of the summed gradients.
-      double N=(double)oracle_batch;
-      if (old_to_hope) {
-        o2hope/=N;
-        directions.push_back(o2hope);
-      }
-      if (fear_to_hope) {
-        fear2hope/=N;
-        directions.push_back(fear2hope);
-      }
-    }
-  }
-};
-
-// Program entry point: all work is delegated to oracle_directions::main.
-int main(int argc, char** argv) {
- oracle_directions od;
- return od.main(argc,argv);
-}
diff --git a/vest/mr_vest_map.cc b/vest/mr_vest_map.cc
deleted file mode 100644
index 71dda6d7..00000000
--- a/vest/mr_vest_map.cc
+++ /dev/null
@@ -1,110 +0,0 @@
-#include <sstream>
-#include <iostream>
-#include <fstream>
-#include <vector>
-
-#include <boost/program_options.hpp>
-#include <boost/program_options/variables_map.hpp>
-
-#include "ces.h"
-#include "filelib.h"
-#include "stringlib.h"
-#include "sparse_vector.h"
-#include "scorer.h"
-#include "viterbi_envelope.h"
-#include "inside_outside.h"
-#include "error_surface.h"
-#include "b64tools.h"
-#include "hg_io.h"
-
-using namespace std;
-namespace po = boost::program_options;
-
-// Parses the mapper's command line into *conf. When no --reference is given
-// or --help is requested, the option summary goes to stderr and the process
-// exits with status 1.
-void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
-  po::options_description opts("Configuration options");
-  opts.add_options()
-    ("reference,r",po::value<vector<string> >(), "[REQD] Reference translation (tokenized text)")
-    ("source,s",po::value<string>(), "Source file (ignored, except for AER)")
-    ("loss_function,l",po::value<string>()->default_value("ibm_bleu"), "Loss function being optimized")
-    ("input,i",po::value<string>()->default_value("-"), "Input file to map (- is STDIN)")
-    ("help,h", "Help");
-  po::options_description dcmdline_options;
-  dcmdline_options.add(opts);
-  po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
-
-  const bool missing_reference = (conf->count("reference") == 0);
-  if (missing_reference)
-    cerr << "Please specify one or more references using -r <REF.TXT>\n";
-  if (!missing_reference && !conf->count("help"))
-    return;
-  cerr << dcmdline_options << endl;
-  exit(1);
-}
-
-// Parses a string of ';'-separated "name=value" fields into *v, setting one
-// entry per field (name converted with FD::Convert, value with atof).
-// Returns false when there are no fields or a field does not split into
-// exactly two parts on '='; entries set before the failure remain in *v.
-bool ReadSparseVectorString(const string& s, SparseVector<double>* v) {
-#if 0
- // this should work, but untested.
- std::istringstream i(s);
- i>>*v;
-#else
- vector<string> fields;
- Tokenize(s, ';', &fields);
- if (fields.empty()) return false;
- for (int i = 0; i < fields.size(); ++i) {
- // NOTE(review): pair starts with two empty elements; the size check below
- // presumably relies on Tokenize replacing the contents - confirm.
- vector<string> pair(2);
- Tokenize(fields[i], '=', &pair);
- if (pair.size() != 2) {
- cerr << "Error parsing vector string: " << fields[i] << endl;
- return false;
- }
- v->set_value(FD::Convert(pair[0]), atof(pair[1].c_str()));
- }
- return true;
-#endif
-}
-
-// Mapper: for every input line "<forest file> <sentence id> <origin> <axis>"
-// it loads the forest (reusing the previous one when the file is unchanged),
-// computes the error surface along origin + x*axis and writes
-// "M <origin> <axis>\t<base64 surface>" to stdout.
-int main(int argc, char** argv) {
-  po::variables_map conf;
-  InitCommandLine(argc, argv, &conf);
-  const string loss_function = conf["loss_function"].as<string>();
-  ScoreType type = ScoreTypeFromString(loss_function);
-  // --source has no default value, so only read it when it was supplied;
-  // calling as<string>() on an absent option throws.
-  // NOTE(review): assumes DocScorer treats an empty source name as "none" - confirm.
-  const string source_file = conf.count("source") ? conf["source"].as<string>() : string();
-  DocScorer ds(type, conf["reference"].as<vector<string> >(), source_file);
-  cerr << "Loaded " << ds.size() << " references for scoring with " << loss_function << endl;
-  Hypergraph hg;
-  string last_file;
-  ReadFile in_read(conf["input"].as<string>());
-  istream &in=*in_read.stream();
-  while(in) {
-    string line;
-    getline(in, line);
-    if (line.empty()) continue;
-    istringstream is(line);
-    int sent_id;
-    string file, s_origin, s_axis;
-    // path-to-file (JSON) sent_ed starting-point search-direction
-    if (!(is >> file >> sent_id >> s_origin >> s_axis)) {
-      cerr << "Malformed mapper input line: " << line << endl;
-      abort();
-    }
-    // The parsing calls must not live inside assert(): they would be compiled
-    // out under NDEBUG, leaving origin and axis empty.
-    SparseVector<double> origin;
-    if (!ReadSparseVectorString(s_origin, &origin)) {
-      cerr << "Could not parse origin vector: " << s_origin << endl;
-      abort();
-    }
-    SparseVector<double> axis;
-    if (!ReadSparseVectorString(s_axis, &axis)) {
-      cerr << "Could not parse axis vector: " << s_axis << endl;
-      abort();
-    }
-    // cerr << "File: " << file << "\nAxis: " << axis << "\n X: " << origin << endl;
-    if (last_file != file) {
-      last_file = file;
-      ReadFile rf(file);
-      HypergraphIO::ReadFromJSON(rf.stream(), &hg);
-    }
-    ViterbiEnvelopeWeightFunction wf(origin, axis);
-    ViterbiEnvelope ve = Inside<ViterbiEnvelope, ViterbiEnvelopeWeightFunction>(hg, NULL, wf);
-    ErrorSurface es;
-    ComputeErrorSurface(*ds[sent_id], ve, &es, type, hg);
-    //cerr << "Viterbi envelope has " << ve.size() << " segments\n";
-    // cerr << "Error surface has " << es.size() << " segments\n";
-    string val;
-    es.Serialize(&val);
-    cout << 'M' << ' ' << s_origin << ' ' << s_axis << '\t';
-    B64::b64encode(val.c_str(), val.size(), &cout);
-    // endl already flushes, so each record is emitted immediately.
-    cout << endl;
-  }
-  return 0;
-}
diff --git a/vest/mr_vest_reduce.cc b/vest/mr_vest_reduce.cc
deleted file mode 100644
index 3df52020..00000000
--- a/vest/mr_vest_reduce.cc
+++ /dev/null
@@ -1,81 +0,0 @@
-#include <sstream>
-#include <iostream>
-#include <fstream>
-#include <vector>
-
-#include <boost/program_options.hpp>
-#include <boost/program_options/variables_map.hpp>
-
-#include "sparse_vector.h"
-#include "error_surface.h"
-#include "line_optimizer.h"
-#include "b64tools.h"
-
-using namespace std;
-namespace po = boost::program_options;
-
-// Parses the reducer's command line into *conf. When --loss_function is
-// missing or --help is given, the option summary goes to stderr and the
-// process exits with status 1.
-void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
-  po::options_description opts("Configuration options");
-  opts.add_options()
-    ("loss_function,l",po::value<string>(), "Loss function being optimized")
-    ("help,h", "Help");
-  po::options_description dcmdline_options;
-  dcmdline_options.add(opts);
-  po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
-
-  const bool have_loss = conf->count("loss_function") != 0;
-  if (have_loss && !conf->count("help"))
-    return;
-  cerr << dcmdline_options << endl;
-  exit(1);
-}
-
-// Reducer: reads "<2-char prefix><key>\t<base64 error surface>" lines from
-// stdin, groups consecutive lines sharing a key, line-optimizes each group
-// and prints "<key>|<x>|<score>".
-int main(int argc, char** argv) {
-  po::variables_map conf;
-  InitCommandLine(argc, argv, &conf);
-  const string loss_function = conf["loss_function"].as<string>();
-  ScoreType type = ScoreTypeFromString(loss_function);
-  // TER and AER are minimized; every other loss is maximized.
-  const bool lower_is_better = (type == TER || type == AER);
-  LineOptimizer::ScoreType opt_type =
-      lower_is_better ? LineOptimizer::MINIMIZE_SCORE : LineOptimizer::MAXIMIZE_SCORE;
-
-  string current_key;
-  vector<ErrorSurface> surfaces;
-  string line;
-  while (getline(cin, line)) {
-    if (line.empty()) continue;
-    const size_t tab = line.find("\t");
-    assert(string::npos != tab);
-    assert(tab > 2);
-    // The key is everything between the two-character prefix and the tab.
-    const string key = line.substr(2, tab - 2);
-    string val = line.substr(tab + 1);
-    if (key != current_key) {
-      // A new key closes the previous group (if there was one).
-      if (!current_key.empty()) {
-        float score;
-        double x = LineOptimizer::LineOptimize(surfaces, opt_type, &score);
-        cout << current_key << "|" << x << "|" << score << endl;
-      }
-      current_key = key;
-      surfaces.clear();
-    }
-    if (val.size() % 4 != 0) {
-      cerr << "B64 encoding error 1! Skipping.\n";
-      continue;
-    }
-    string encoded(val.size() / 4 * 3, '\0');
-    const bool decoded_ok = B64::b64decode(reinterpret_cast<const unsigned char*>(&val[0]), val.size(), &encoded[0], encoded.size());
-    if (!decoded_ok) {
-      cerr << "B64 encoding error 2! Skipping.\n";
-      continue;
-    }
-    surfaces.push_back(ErrorSurface());
-    surfaces.back().Deserialize(type, encoded);
-  }
-  // Flush the final group.
-  if (!surfaces.empty()) {
-    // cerr << "ESV=" << esv.size() << endl;
-    // for (int i = 0; i < esv.size(); ++i) { cerr << esv[i].size() << endl; }
-    float score;
-    double x = LineOptimizer::LineOptimize(surfaces, opt_type, &score);
-    cout << current_key << "|" << x << "|" << score << endl;
-  }
-  return 0;
-}
diff --git a/vest/parallelize.pl b/vest/parallelize.pl
deleted file mode 100755
index 7d0365cc..00000000
--- a/vest/parallelize.pl
+++ /dev/null
@@ -1,423 +0,0 @@
-#!/usr/bin/env perl
-
-# Author: Adam Lopez
-#
-# This script takes a command that processes input
-# from stdin one-line-at-time, and parallelizes it
-# on the cluster using David Chiang's sentserver/
-# sentclient architecture.
-#
-# Prerequisites: the command *must* read each line
-# without waiting for subsequent lines of input
-# (for instance, a command which must read all lines
-# of input before processing will not work) and
-# return it to the output *without* buffering
-# multiple lines.
-
-#TODO: if -j 1, run immediately, not via sentserver? possible differences in environment might make debugging harder
-
-#ANNOYANCE: if input is shorter than -j n lines, or at the very last few lines, repeatedly sleeps. time cut down to 15s from 60s
-
-my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../environment"; }
-use LocalConfig;
-
-use Cwd qw/ abs_path cwd getcwd /;
-use File::Temp qw/ tempfile /;
-use Getopt::Long;
-use IPC::Open2;
-use strict;
-use POSIX ":sys_wait_h";
-
-use File::Basename;
-my $myDir = dirname(__FILE__);
-print STDERR __FILE__." -> $myDir\n";
-push(@INC, $myDir);
-require "libcall.pl";
-
-my $tailn=5; # +0 = concatenate all the client logs. 5 = last 5 lines
-my $recycle_clients; # spawn new clients when previous ones terminate
-my $stay_alive; # dont let server die when having zero clients
-my $joblist = "";
-my $errordir="";
-my $multiline;
-my @files_to_stage;
-my $numnodes = 8;
-my $user = $ENV{"USER"};
-my $pmem = "9g";
-my $basep=50300;
-my $randp=300;
-my $tryp=50;
-my $no_which;
-my $no_cd;
-
-my $DEBUG=$ENV{DEBUG};
-print STDERR "DEBUG=$DEBUG output enabled.\n" if $DEBUG;
-my $verbose = 1;
-# Print the arguments followed by a newline on STDERR when $verbose is set.
-sub verbose {
-  return unless $verbose;
-  print STDERR @_,"\n";
-}
-# Print a "DEBUG: file(line): ..." message on STDERR when $DEBUG is set; the
-# file and line are those of the caller.
-sub debug {
-  return unless $DEBUG;
-  my (undef, $caller_file, $caller_line) = caller;
-  print STDERR "DEBUG: $caller_file($caller_line): ",join(' ',@_),"\n";
-}
-my $is_shell_special=qr.[ \t\n\\><|&;"'`~*?{}$!()].;
-my $shell_escape_in_quote=qr.[\\"\$`!].;
-# Quote one argument for use on a shell command line.
-# Returns undef for undef, '""' for the empty string, the argument in double
-# quotes (with \ " $ ` ! backslash-escaped) when it contains a shell-special
-# character, and the argument unchanged otherwise.
-sub escape_shell {
-  my ($arg)=@_;
-  return undef unless defined $arg;
-  # Compare against the empty string explicitly: a plain truth test would
-  # also turn the argument "0" into "".
-  return '""' if $arg eq '';
-  if ($arg =~ /$is_shell_special/) {
-    $arg =~ s/($shell_escape_in_quote)/\\$1/g;
-    return "\"$arg\"";
-  }
-  return $arg;
-}
-# Return the last $n lines (default $tailn) of each file in the array ref $l.
-#   $skipempty - when true, zero-size files are left out
-#   $footer    - when true, append a "NONEMPTY FILES:" list of the files shown
-sub preview_files {
-  my ($l,$skipempty,$footer,$n)=@_;
-  $n=$tailn unless defined $n;
-  my @f=grep { ! ($skipempty && -z $_) } @$l;
-  my $fn=join(' ',map {escape_shell($_)} @f);
-  # With no file operands tail would read standard input, so only run it when
-  # there is at least one file to show.
-  my $shown=@f ? unchecked_output("tail -n $n $fn") : '';
-  return $shown.($footer?"\nNONEMPTY FILES:\n$fn\n":"");
-}
-# Like dirname, except that a path ending in "/" is returned unchanged.
-# A path without any "/" is also returned unchanged.
-sub prefix_dirname($) {
-  my ($path)=@_;
-  return $path if $path =~ m#/$#;
-  # Strip the whole last component. The pattern needs "*": without it only a
-  # one-character final component was ever removed.
-  (my $dir=$path) =~ s#/[^/]*$##;
-  # "/name" lives directly under the root directory.
-  return '/' if $dir eq '' && $path =~ m#^/#;
-  return $dir ? $dir : '';
-}
-# Return the argument with a trailing "/" appended unless it already has one.
-sub ensure_final_slash($) {
-  my ($path)=@_;
-  return $path if $path =~ m#/$#;
-  return $path."/";
-}
-# Return $base with $ext appended.
-#   $mkdir     - when true (and $base is not already a directory) the target
-#                directory is created with "mkdir -p"
-#   $baseisdir - when true $base itself is the directory and gets a trailing
-#                "/"; otherwise the directory is prefix_dirname($base)
-sub extend_path($$;$$) {
- my ($base,$ext,$mkdir,$baseisdir)=@_;
- if (-d $base) {
- # existing directory: $ext goes inside it
- $base.="/";
- } else {
- my $dir;
- if ($baseisdir) {
- $dir=$base;
- $base.='/' unless $base =~ /\/$/;
- } else {
- $dir=prefix_dirname($base);
- }
- my @cmd=("/bin/mkdir","-p",$dir);
- check_call(@cmd) if $mkdir;
- }
- return $base.$ext;
-}
-
-my $abscwd=abs_path(&getcwd);
-sub print_help;
-
-my $use_fork;
-my @pids;
-
-# Process command-line options
-unless (GetOptions(
- "stay-alive" => \$stay_alive,
- "recycle-clients" => \$recycle_clients,
- "error-dir=s" => \$errordir,
- "multi-line" => \$multiline,
- "file=s" => \@files_to_stage,
- "use-fork" => \$use_fork,
- "verbose" => \$verbose,
- "jobs=i" => \$numnodes,
- "pmem=s" => \$pmem,
- "baseport=i" => \$basep,
-# "iport=i" => \$randp, #for short name -i
- "no-which!" => \$no_which,
- "no-cd!" => \$no_cd,
- "tailn=s" => \$tailn,
-) && scalar @ARGV){
- print_help();
- die "bad options.";
-}
-
-my $cmd = "";
-my $prog=shift;
-if ($no_which) {
- $cmd=$prog;
-} else {
- $cmd=check_output("which $prog");
- chomp $cmd;
- die "$prog not found - $cmd" unless $cmd;
-}
-#$cmd=abs_path($cmd);
-for my $arg (@ARGV) {
- $cmd .= " ".escape_shell($arg);
-}
-die "Please specify a command to parallelize\n" if $cmd eq '';
-
-my $cdcmd=$no_cd ? '' : ("cd ".escape_shell($abscwd)."\n");
-
-my $executable = $cmd;
-$executable =~ s/^\s*(\S+)($|\s.*)/$1/;
-$executable=check_output("basename $executable");
-chomp $executable;
-
-
-print STDERR "Parallelizing ($numnodes ways): $cmd\n\n";
-
-# create -e dir and save .sh
-# When no --error-dir was given a temporary directory (removed at exit) is
-# used. The command being parallelized is saved there as <executable>.sh.
-use File::Temp qw/tempdir/;
-unless ($errordir) {
-  $errordir=tempdir("$executable.XXXXXX",CLEANUP=>1);
-}
-if ($errordir) {
-  my $scriptfile=extend_path("$errordir/","$executable.sh",1,1);
-  -d $errordir || die "should have created -e dir $errordir";
-  # "or" (not "||") so that the die belongs to open() and not to $scriptfile.
-  open(SF,">",$scriptfile) or die "Cannot write $scriptfile: $!";
-  print SF "$cdcmd$cmd\n";
-  close SF;
-  chmod 0755,$scriptfile;
-  $errordir=abs_path($errordir);
-  &verbose("-e dir: $errordir");
-}
-
-# set cleanup handler
-my @cleanup_cmds;
-sub cleanup;
-sub cleanup_and_die;
-$SIG{INT} = "cleanup_and_die";
-$SIG{TERM} = "cleanup_and_die";
-$SIG{HUP} = "cleanup_and_die";
-
-# other subs:
-sub numof_live_jobs;
-sub launch_job_on_node;
-
-
-# vars
-my $mydir = check_output("dirname $0"); chomp $mydir;
-my $sentserver = "$mydir/sentserver";
-my $sentclient = "$mydir/sentclient";
-my $host = check_output("hostname");
-chomp $host;
-
-
-# find open port
-# Start at the configurable base port (--baseport, default 50300) plus a
-# random offset, then probe at most $tryp consecutive ports.
-srand;
-my $port = $basep+int(rand($randp));
-my $endp=$port+$tryp;
-# Return the netstat lines describing listening TCP sockets.
-sub listening_port_lines {
-  my $quiet=$verbose?'':'2>/dev/null';
-  # Match "LISTEN" so that both state spellings, LISTEN and LISTENING, count.
-  return unchecked_output("netstat -a -n $quiet | grep LISTEN | grep -i tcp");
-}
-my $netstat=&listening_port_lines;
-
-if ($verbose){ print STDERR "Testing port $port...";}
-
-# A port counts as busy when its number appears in either the initial
-# snapshot or a fresh netstat listing.
-while ($netstat=~/$port/ || listening_port_lines()=~/$port/){
-  if ($verbose){ print STDERR "port is busy\n";}
-  $port++;
-  if ($port > $endp){
-    die "Unable to find open port\n";
-  }
-  if ($verbose){ print STDERR "Testing port $port... "; }
-}
-if ($verbose){
-  print STDERR "port $port is available\n";
-}
-
-# Random key handed to both sentserver (-k) and the clients (host:port:key).
-my $key = int(rand()*1000000);
-
-my $multiflag = "";
-if ($multiline){ $multiflag = "-m"; print STDERR "expecting multiline output.\n"; }
-my $stay_alive_flag = "";
-if ($stay_alive){ $stay_alive_flag = "--stay-alive"; print STDERR "staying alive while no clients are connected.\n"; }
-
-my $node_count = 0;
-my $script = "";
-# fork == one thread runs the sentserver, while the
-# other spawns the sentclient commands.
-my $pid = fork;
-if ($pid == 0) { # child
- sleep 8; # give other thread time to start sentserver
- # Script each client job runs: optional cd, then sentclient wrapping $cmd.
- $script = "$cdcmd$sentclient $host:$port:$key $cmd";
-
- if ($verbose){
- print STDERR "Client script:\n====\n";
- print STDERR $script;
- print STDERR "====\n";
- }
- for (my $jobn=0; $jobn<$numnodes; $jobn++){
- launch_job();
- }
- if ($recycle_clients) {
- my $ret;
- my $livejobs;
- while (1) {
- # NOTE(review): $pid is 0 in this branch, so this is waitpid(0, WNOHANG) - confirm intent.
- $ret = waitpid($pid, WNOHANG);
- #print STDERR "waitpid $pid ret = $ret \n";
- last if ($ret != 0);
- $livejobs = numof_live_jobs();
- # NOTE(review): ">=" also launches a job when exactly $numnodes are live - confirm.
- if ($numnodes >= $livejobs ) { # a client terminated, OR # lines of input was less than -j
- print STDERR "num of requested nodes = $numnodes; num of currently live jobs = $livejobs; Client terminated - launching another.\n";
- launch_job();
- } else {
- sleep 15;
- }
- }
- }
- print STDERR "CHILD PROCESSES SPAWNED ... WAITING\n";
- # @pids is filled by launch_job_fork (the --use-fork path).
- for my $p (@pids) {
- waitpid($p, 0);
- }
-} else {
-# my $todo = "$sentserver -k $key $multiflag $port ";
- # parent: run sentserver in the foreground, then clean up and exit
- my $todo = "$sentserver -k $key $multiflag $port $stay_alive_flag ";
- if ($verbose){ print STDERR "Running: $todo\n"; }
- check_call($todo);
- print STDERR "Call to $sentserver returned.\n";
- cleanup();
- exit(0);
-}
-
-# Return the number of qstat output lines that match $joblist (the
-# "|"-separated job ids collected by launch_job). Dies under --use-fork,
-# where this is not implemented.
-sub numof_live_jobs {
- if ($use_fork) {
- die "not implemented";
- } else {
- # We can probably continue decoding if the qstat error is only temporary
- # NOTE(review): $joblist is used as a regex; while it is still empty the
- # pattern is empty too - confirm that case cannot occur here.
- my @livejobs = grep(/$joblist/, split(/\n/, unchecked_output("qstat")));
- return ($#livejobs + 1);
- }
-}
-my (@errors,@outs,@cmds);
-
-# Submit one client job through qsub (or delegate to launch_job_fork under
-# --use-fork). The client script is fed to qsub on its stdin; every job id
-# qsub prints is added to $joblist and gets a "qdel" cleanup command.
-sub launch_job {
-  if ($use_fork) { return launch_job_fork(); }
-  my $errorfile = "/dev/null";
-  my $outfile = "/dev/null";
-  $node_count++;
-  # Job name: first four characters of the executable plus a running number.
-  my $clientname = $executable;
-  $clientname =~ s/^(.{4}).*$/$1/;
-  $clientname = "$clientname.$node_count";
-  if ($errordir){
-    $errorfile = "$errordir/$clientname.ER";
-    $outfile = "$errordir/$clientname.OU";
-    push @errors,$errorfile;
-    push @outs,$outfile;
-  }
-  my $todo = qsub_args($pmem) . " -N $clientname -o $outfile -e $errorfile";
-  push @cmds,$todo;
-
-  print STDERR "Running: $todo\n";
-  local(*QOUT, *QIN);
-  open2(\*QOUT, \*QIN, $todo) or die "Failed to open2: $!";
-  print QIN $script;
-  close QIN;
-  while (my $jobid=<QOUT>){
-    chomp $jobid;
-    if ($verbose){ print STDERR "Launched client job: $jobid"; }
-    # Reduce qsub's reply to the bare job id ($1, not \1, on the replacement side).
-    $jobid =~ s/^(\d+)(.*?)$/$1/g;
-    $jobid =~ s/^Your job (\d+) .*$/$1/;
-    print STDERR " short job id $jobid\n";
-    if ($verbose){
-      print STDERR "cd: $abscwd\n";
-      print STDERR "cmd: $cmd\n";
-    }
-    # String comparison: numeric == would treat any non-numeric text as 0.
-    if ($joblist eq "") { $joblist = $jobid; }
-    else { $joblist .= "|" . $jobid; }
-    my $cleanfn="qdel $jobid 2> /dev/null";
-    push(@cleanup_cmds, $cleanfn);
-  }
-  close QOUT;
-}
-
-# --use-fork variant of launch_job: fork a child that writes the client
-# script to a temporary file and runs it with bash, sending stdout/stderr to
-# the per-client .OU/.ER files (or /dev/null without an error dir). The
-# parent records the child's pid in @pids.
-sub launch_job_fork {
- my $errorfile = "/dev/null";
- my $outfile = "/dev/null";
- $node_count++;
- my $clientname = $executable;
- $clientname =~ s/^(.{4}).*$/$1/;
- $clientname = "$clientname.$node_count";
- if ($errordir){
- $errorfile = "$errordir/$clientname.ER";
- $outfile = "$errordir/$clientname.OU";
- push @errors,$errorfile;
- push @outs,$outfile;
- }
- my $pid = fork;
- if ($pid == 0) {
- # child: run the script, remove the temporary file, exit
- my ($fh, $scr_name) = get_temp_script();
- print $fh $script;
- close $fh;
- my $todo = "/bin/bash -xeo pipefail $scr_name 1> $outfile 2> $errorfile";
- print STDERR "EXEC: $todo\n";
- my $out = check_output("$todo");
- unlink $scr_name or warn "Failed to remove $scr_name";
- exit 0;
- } else {
- push @pids, $pid;
- }
-}
-
-# Create a temporary "workXXXX.sh" file; returns (filehandle, filename).
-sub get_temp_script {
-  my @handle_and_name = tempfile( "workXXXX", SUFFIX => '.sh');
-  return @handle_and_name[0,1];
-}
-
-# Signal handler (installed for INT, TERM and HUP): run cleanup, then die.
-sub cleanup_and_die {
- cleanup();
- die "\n";
-}
-
-# Run every queued cleanup command, then print a preview of the clients'
-# non-empty output and error files and the command that was parallelized.
-sub cleanup {
-  print STDERR "Cleaning up...\n";
-  # Use a private loop variable so the file-level $cmd is never aliased.
-  for my $cleanup_cmd (@cleanup_cmds){
-    print STDERR " Cleanup command: $cleanup_cmd\n";
-    # These are shell commands ("qdel <id> 2> /dev/null"), so hand them to the
-    # shell; eval would try to compile them as Perl and silently do nothing.
-    system($cleanup_cmd);
-  }
-  print STDERR "outputs:\n",preview_files(\@outs,1),"\n";
-  print STDERR "errors:\n",preview_files(\@errors,1),"\n";
-  print STDERR "cmd:\n",$cmd,"\n";
-  print STDERR " cat $errordir/*.ER\nfor logs.\n";
-  print STDERR "Cleanup finished.\n";
-}
-
-# Print the usage summary to STDOUT; the script's basename is used as its name.
-sub print_help
-{
- my $name = check_output("basename $0"); chomp $name;
- print << "Help";
-
-usage: $name [options]
-
- Automatic black-box parallelization of commands.
-
-options:
-
- --use-fork
- Instead of using qsub, use fork.
-
- -e, --error-dir <dir>
- Retain output files from jobs in <dir>, rather
- than silently deleting them.
-
- -m, --multi-line
- Expect that command may produce multiple output
- lines for a single input line. $name makes a
- reasonable attempt to obtain all output before
- processing additional inputs. However, use of this
- option is inherently unsafe.
-
- -v, --verbose
- Print diagnostic information on stderr.
-
- -j, --jobs
- Number of jobs to use.
-
- -p, --pmem
- pmem setting for each job.
-
-Help
-}
diff --git a/vest/sentclient.c b/vest/sentclient.c
deleted file mode 100644
index 91d994ab..00000000
--- a/vest/sentclient.c
+++ /dev/null
@@ -1,76 +0,0 @@
-/* Copyright (c) 2001 by David Chiang. All rights reserved.*/
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <sys/socket.h>
-#include <sys/types.h>
-#include <netinet/in.h>
-#include <netdb.h>
-#include <string.h>
-
-#include "sentserver.h"
-
-/* Connect to a sentserver at host[:port[:key]], make the connection this
- * process's stdin and stdout, send the key (if any) followed by a newline,
- * and exec the given command. Returns non-zero when any step fails. */
-int main (int argc, char *argv[]) {
-  int sock, port;
-  char *s, *key;
-  struct hostent *hp;
-  struct sockaddr_in server;
-  int errors = 0;
-
-  if (argc < 3) {
-    fprintf(stderr, "Usage: sentclient host[:port[:key]] command [args ...]\n");
-    exit(1);
-  }
-
-  /* Split argv[1] in place into host, port and key. */
-  s = strchr(argv[1], ':');
-  key = NULL;
-
-  if (s == NULL) {
-    port = DEFAULT_PORT;
-  } else {
-    *s = '\0';
-    s+=1;
-    /* dumb hack */
-    key = strchr(s, ':');
-    if (key != NULL){
-      *key = '\0';
-      key += 1;
-    }
-    port = atoi(s);
-  }
-
-  hp = gethostbyname(argv[1]);
-  if (hp == NULL) {
-    fprintf(stderr, "unknown host %s\n", argv[1]);
-    exit(1);
-  }
-  /* Never copy more address bytes than sin_addr can hold. */
-  if (hp->h_length < 0 || (size_t)hp->h_length > sizeof(server.sin_addr)) {
-    fprintf(stderr, "unsupported address type for host %s\n", argv[1]);
-    exit(1);
-  }
-
-  memset(&server, 0, sizeof(server));
-  memcpy(&server.sin_addr, hp->h_addr, hp->h_length);
-  server.sin_family = hp->h_addrtype;
-  server.sin_port = htons(port);
-
-  sock = socket(AF_INET, SOCK_STREAM, 0);
-  if (sock < 0) {
-    perror("socket()");
-    exit(1);
-  }
-
-  while (connect(sock, (struct sockaddr *)&server, sizeof(server)) < 0) {
-    perror("connect()");
-    sleep(1);
-    errors++;
-    if (errors > 5)
-      exit(1);
-    /* After a failed connect() the socket state is unspecified, so retry on
-     * a fresh socket. */
-    close(sock);
-    sock = socket(AF_INET, SOCK_STREAM, 0);
-    if (sock < 0) {
-      perror("socket()");
-      exit(1);
-    }
-  }
-
-  /* dup2 closes the old stdin/stdout itself. */
-  if (dup2(sock, 0) < 0 || dup2(sock, 1) < 0) {
-    perror("dup2()");
-    exit(1);
-  }
-  /* Do not leak the original descriptor into the exec'd command. */
-  if (sock > 1)
-    close(sock);
-
-  if (key != NULL){
-    if (write(1, key, strlen(key)) < 0 || write(1, "\n", 1) < 0) {
-      perror("write()");
-      exit(1);
-    }
-  }
-
-  execvp(argv[2], argv+2);
-  /* execvp only returns on failure. */
-  perror("execvp()");
-  return 1;
-}
diff --git a/vest/sentserver.c b/vest/sentserver.c
deleted file mode 100644
index c20b4fa6..00000000
--- a/vest/sentserver.c
+++ /dev/null
@@ -1,515 +0,0 @@
-/* Copyright (c) 2001 by David Chiang. All rights reserved.*/
-
-#include <string.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <fcntl.h>
-#include <stdio.h>
-#include <sys/socket.h>
-#include <sys/types.h>
-#include <sys/time.h>
-#include <netinet/in.h>
-#include <sched.h>
-#include <pthread.h>
-#include <errno.h>
-
-#include "sentserver.h"
-
-#define MAX_CLIENTS 64
-
-struct clientinfo {
- int s;
- struct sockaddr_in sin;
-};
-
-struct line {
- int id;
- char *s;
- int status;
- struct line *next;
-} *head, **ptail;
-
-int n_sent = 0, n_received=0, n_flushed=0;
-
-#define STATUS_RUNNING 0
-#define STATUS_ABORTED 1
-#define STATUS_FINISHED 2
-
-pthread_mutex_t queue_mutex = PTHREAD_MUTEX_INITIALIZER;
-pthread_mutex_t clients_mutex = PTHREAD_MUTEX_INITIALIZER;
-pthread_mutex_t input_mutex = PTHREAD_MUTEX_INITIALIZER;
-
-int n_clients = 0;
-int s;
-int expect_multiline_output = 0;
-int log_mutex = 0;
-int stay_alive = 0; /* dont panic and die with zero clients */
-
-void queue_finish(struct line *node, char *s, int fid);
-char * read_line(int fd, int multiline);
-void done (int code);
-
-struct line * queue_get(int fid) {
- struct line *cur;
- char *s, *synch;
-
- if (log_mutex) fprintf(stderr, "Getting for data for fid %d\n", fid);
- if (log_mutex) fprintf(stderr, "Locking queue mutex (%d)\n", fid);
- pthread_mutex_lock(&queue_mutex);
-
- /* First, check for aborted sentences. */
-
- if (log_mutex) fprintf(stderr, " Checking queue for aborted jobs (fid %d)\n", fid);
- for (cur = head; cur != NULL; cur = cur->next) {
- if (cur->status == STATUS_ABORTED) {
- cur->status = STATUS_RUNNING;
-
- if (log_mutex) fprintf(stderr, "Unlocking queue mutex (%d)\n", fid);
- pthread_mutex_unlock(&queue_mutex);
-
- return cur;
- }
- }
- if (log_mutex) fprintf(stderr, "Unlocking queue mutex (%d)\n", fid);
- pthread_mutex_unlock(&queue_mutex);
-
- /* Otherwise, read a new one. */
- if (log_mutex) fprintf(stderr, "Locking input mutex (%d)\n", fid);
- if (log_mutex) fprintf(stderr, " Reading input for new data (fid %d)\n", fid);
- pthread_mutex_lock(&input_mutex);
- s = read_line(0,0);
-
- while (s) {
- if (log_mutex) fprintf(stderr, "Locking queue mutex (%d)\n", fid);
- pthread_mutex_lock(&queue_mutex);
- if (log_mutex) fprintf(stderr, "Unlocking input mutex (%d)\n", fid);
- pthread_mutex_unlock(&input_mutex);
-
- cur = malloc(sizeof (struct line));
- cur->id = n_sent;
- cur->s = s;
- cur->next = NULL;
-
- *ptail = cur;
- ptail = &cur->next;
-
- n_sent++;
-
- if (strcmp(s,"===SYNCH===\n")==0){
- fprintf(stderr, "Received ===SYNCH=== signal (fid %d)\n", fid);
- // Note: queue_finish calls free(cur->s).
- // Therefore we need to create a new string here.
- synch = malloc((strlen("===SYNCH===\n")+2) * sizeof (char));
- synch = strcpy(synch, s);
-
- if (log_mutex) fprintf(stderr, "Unlocking queue mutex (%d)\n", fid);
- pthread_mutex_unlock(&queue_mutex);
- queue_finish(cur, synch, fid); /* handles its own lock */
-
- if (log_mutex) fprintf(stderr, "Locking input mutex (%d)\n", fid);
- if (log_mutex) fprintf(stderr, " Reading input for new data (fid %d)\n", fid);
- pthread_mutex_lock(&input_mutex);
-
- s = read_line(0,0);
- } else {
- if (log_mutex) fprintf(stderr, " Received new data %d (fid %d)\n", cur->id, fid);
- cur->status = STATUS_RUNNING;
- if (log_mutex) fprintf(stderr, "Unlocking queue mutex (%d)\n", fid);
- pthread_mutex_unlock(&queue_mutex);
- return cur;
- }
- }
-
- if (log_mutex) fprintf(stderr, "Unlocking input mutex (%d)\n", fid);
- pthread_mutex_unlock(&input_mutex);
- /* Only way to reach this point: no more output */
-
- if (log_mutex) fprintf(stderr, "Locking queue mutex (%d)\n", fid);
- pthread_mutex_lock(&queue_mutex);
- if (head == NULL) {
- fprintf(stderr, "Reached end of file. Exiting.\n");
- done(0);
- } else
- ptail = NULL; /* This serves as a signal that there is no more input */
- if (log_mutex) fprintf(stderr, "Unlocking queue mutex (%d)\n", fid);
- pthread_mutex_unlock(&queue_mutex);
-
- return NULL;
-}
-
-void queue_panic() {
- struct line *next;
- while (head && head->status == STATUS_FINISHED) {
- /* Write out finished sentences */
- if (head->status == STATUS_FINISHED) {
- fputs(head->s, stdout);
- fflush(stdout);
- }
- /* Write out blank line for unfinished sentences */
- if (head->status == STATUS_ABORTED) {
- fputs("\n", stdout);
- fflush(stdout);
- }
- /* By defition, there cannot be any RUNNING sentences, since
- function is only called when n_clients == 0 */
- free(head->s);
- next = head->next;
- free(head);
- head = next;
- n_flushed++;
- }
- fclose(stdout);
- fprintf(stderr, "All clients died. Panicking, flushing completed sentences and exiting.\n");
- done(1);
-}
-
-void queue_abort(struct line *node, int fid) {
- if (log_mutex) fprintf(stderr, "Locking queue mutex (%d)\n", fid);
- pthread_mutex_lock(&queue_mutex);
- node->status = STATUS_ABORTED;
- if (n_clients == 0) {
- if (stay_alive) {
- fprintf(stderr, "Warning! No live clients detected! Staying alive, will retry soon.\n");
- } else {
- queue_panic();
- }
- }
- if (log_mutex) fprintf(stderr, "Unlocking queue mutex (%d)\n", fid);
- pthread_mutex_unlock(&queue_mutex);
-}
-
-
-void queue_print() {
- struct line *cur;
-
- fprintf(stderr, " Queue\n");
-
- for (cur = head; cur != NULL; cur = cur->next) {
- switch(cur->status) {
- case STATUS_RUNNING:
- fprintf(stderr, " %d running ", cur->id); break;
- case STATUS_ABORTED:
- fprintf(stderr, " %d aborted ", cur->id); break;
- case STATUS_FINISHED:
- fprintf(stderr, " %d finished ", cur->id); break;
-
- }
- fprintf(stderr, "\n");
- //fprintf(stderr, cur->s);
- }
-}
-
-void queue_finish(struct line *node, char *s, int fid) {
- struct line *next;
- if (log_mutex) fprintf(stderr, "Locking queue mutex (%d)\n", fid);
- pthread_mutex_lock(&queue_mutex);
-
- free(node->s);
- node->s = s;
- node->status = STATUS_FINISHED;
- n_received++;
-
- /* Flush out finished nodes */
- while (head && head->status == STATUS_FINISHED) {
-
- if (log_mutex) fprintf(stderr, " Flushing finished node %d\n", head->id);
-
- fputs(head->s, stdout);
- fflush(stdout);
- if (log_mutex) fprintf(stderr, " Flushed node %d\n", head->id);
- free(head->s);
-
- next = head->next;
- free(head);
-
- head = next;
-
- n_flushed++;
-
- if (head == NULL) { /* empty queue */
- if (ptail == NULL) { /* This can only happen if set in queue_get as signal that there is no more input. */
- fprintf(stderr, "All sentences finished. Exiting.\n");
- done(0);
- } else /* ptail pointed at something which was just popped off the stack -- reset to head*/
- ptail = &head;
- }
- }
-
- if (log_mutex) fprintf(stderr, " Flushing output %d\n", head->id);
- fflush(stdout);
- fprintf(stderr, "%d sentences sent, %d sentences finished, %d sentences flushed\n", n_sent, n_received, n_flushed);
-
- if (log_mutex) fprintf(stderr, "Unlocking queue mutex (%d)\n", fid);
- pthread_mutex_unlock(&queue_mutex);
-
-}
-
-char * read_line(int fd, int multiline) {
- int size = 80;
- char errorbuf[100];
- char *s = malloc(size+2);
- int result, errors=0;
- int i = 0;
-
- result = read(fd, s+i, 1);
-
- while (1) {
- if (result < 0) {
- perror("read()");
- sprintf(errorbuf, "Error code: %d\n", errno);
- fprintf(stderr, errorbuf);
- errors++;
- if (errors > 5) {
- free(s);
- return NULL;
- } else {
- sleep(1); /* retry after delay */
- }
- } else if (result == 0) {
- break;
- } else if (multiline==0 && s[i] == '\n') {
- break;
- } else {
- if (s[i] == '\n'){
- /* if we've reached this point,
- then multiline must be 1, and we're
- going to poll the fd for an additional
- line of data. The basic design is to
- run a select on the filedescriptor fd.
- Select will return under two conditions:
- if there is data on the fd, or if a
- timeout is reached. We'll select on this
- fd. If select returns because there's data
- ready, keep going; else assume there's no
- more and return the data we already have.
- */
-
- fd_set set;
- FD_ZERO(&set);
- FD_SET(fd, &set);
-
- struct timeval timeout;
- timeout.tv_sec = 3; // number of seconds for timeout
- timeout.tv_usec = 0;
-
- int ready = select(FD_SETSIZE, &set, NULL, NULL, &timeout);
- if (ready<1){
- break; // no more data, stop looping
- }
- }
- i++;
-
- if (i == size) {
- size = size*2;
- s = realloc(s, size+2);
- }
- }
-
- result = read(fd, s+i, 1);
- }
-
- if (result == 0 && i == 0) { /* end of file */
- free(s);
- return NULL;
- }
-
- s[i] = '\n';
- s[i+1] = '\0';
-
- return s;
-}
-
-void * new_client(void *arg) {
- struct clientinfo *client = (struct clientinfo *)arg;
- struct line *cur;
- int result;
- char *s;
- char errorbuf[100];
-
- pthread_mutex_lock(&clients_mutex);
- n_clients++;
- pthread_mutex_unlock(&clients_mutex);
-
- fprintf(stderr, "Client connected (%d connected)\n", n_clients);
-
- for (;;) {
-
- cur = queue_get(client->s);
-
- if (cur) {
- /* fprintf(stderr, "Sending to client: %s", cur->s); */
- fprintf(stderr, "Sending data %d to client (fid %d)\n", cur->id, client->s);
- result = write(client->s, cur->s, strlen(cur->s));
- if (result < strlen(cur->s)){
- perror("write()");
- sprintf(errorbuf, "Error code: %d\n", errno);
- fprintf(stderr, errorbuf);
-
- pthread_mutex_lock(&clients_mutex);
- n_clients--;
- pthread_mutex_unlock(&clients_mutex);
-
- fprintf(stderr, "Client died (%d connected)\n", n_clients);
- queue_abort(cur, client->s);
-
- close(client->s);
- free(client);
-
- pthread_exit(NULL);
- }
- } else {
- close(client->s);
- pthread_mutex_lock(&clients_mutex);
- n_clients--;
- pthread_mutex_unlock(&clients_mutex);
- fprintf(stderr, "Client dismissed (%d connected)\n", n_clients);
- pthread_exit(NULL);
- }
-
- s = read_line(client->s,expect_multiline_output);
- if (s) {
- /* fprintf(stderr, "Client (fid %d) returned: %s", client->s, s); */
- fprintf(stderr, "Client (fid %d) returned data %d\n", client->s, cur->id);
-// queue_print();
- queue_finish(cur, s, client->s);
- } else {
- pthread_mutex_lock(&clients_mutex);
- n_clients--;
- pthread_mutex_unlock(&clients_mutex);
-
- fprintf(stderr, "Client died (%d connected)\n", n_clients);
- queue_abort(cur, client->s);
-
- close(client->s);
- free(client);
-
- pthread_exit(NULL);
- }
-
- }
- return 0;
-}
-
-void done (int code) {
- close(s);
- exit(code);
-}
-
-
-
-int main (int argc, char *argv[]) {
- struct sockaddr_in sin, from;
- int g;
- socklen_t len;
- struct clientinfo *client;
- int port;
- int opt;
- int errors = 0;
- int argi;
- char *key = NULL, *client_key;
- int use_key = 0;
- /* the key stuff here doesn't provide any
- real measure of security, it's mainly to keep
- jobs from bumping into each other. */
-
- pthread_t tid;
- port = DEFAULT_PORT;
-
- for (argi=1; argi < argc; argi++){
- if (strcmp(argv[argi], "-m")==0){
- expect_multiline_output = 1;
- } else if (strcmp(argv[argi], "-k")==0){
- argi++;
- if (argi == argc){
- fprintf(stderr, "Key must be specified after -k\n");
- exit(1);
- }
- key = argv[argi];
- use_key = 1;
- } else if (strcmp(argv[argi], "--stay-alive")==0){
- stay_alive = 1; /* dont panic and die with zero clients */
- } else {
- port = atoi(argv[argi]);
- }
- }
-
- /* Initialize data structures */
- head = NULL;
- ptail = &head;
-
- /* Set up listener */
- s = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
- opt = 1;
- setsockopt(s, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt));
-
- sin.sin_family = AF_INET;
- sin.sin_addr.s_addr = htonl(INADDR_ANY);
- sin.sin_port = htons(port);
- while (bind(s, (struct sockaddr *) &sin, sizeof(sin)) < 0) {
- perror("bind()");
- sleep(1);
- errors++;
- if (errors > 100)
- exit(1);
- }
-
- len = sizeof(sin);
- getsockname(s, (struct sockaddr *) &sin, &len);
-
- fprintf(stderr, "Listening on port %hu\n", ntohs(sin.sin_port));
-
- while (listen(s, MAX_CLIENTS) < 0) {
- perror("listen()");
- sleep(1);
- errors++;
- if (errors > 100)
- exit(1);
- }
-
- for (;;) {
- len = sizeof(from);
- g = accept(s, (struct sockaddr *)&from, &len);
- if (g < 0) {
- perror("accept()");
- sleep(1);
- continue;
- }
- client = malloc(sizeof(struct clientinfo));
- client->s = g;
- bcopy(&from, &client->sin, len);
-
- if (use_key){
- fd_set set;
- FD_ZERO(&set);
- FD_SET(client->s, &set);
-
- struct timeval timeout;
- timeout.tv_sec = 3; // number of seconds for timeout
- timeout.tv_usec = 0;
-
- int ready = select(FD_SETSIZE, &set, NULL, NULL, &timeout);
- if (ready<1){
- fprintf(stderr, "Prospective client failed to respond with correct key.\n");
- close(client->s);
- free(client);
- } else {
- client_key = read_line(client->s,0);
- client_key[strlen(client_key)-1]='\0'; /* chop trailing newline */
- if (strcmp(key, client_key)==0){
- pthread_create(&tid, NULL, new_client, client);
- } else {
- fprintf(stderr, "Prospective client failed to respond with correct key.\n");
- close(client->s);
- free(client);
- }
- free(client_key);
- }
- } else {
- pthread_create(&tid, NULL, new_client, client);
- }
- }
-
-}
-
-
-
diff --git a/vest/sentserver.h b/vest/sentserver.h
deleted file mode 100644
index cd17a546..00000000
--- a/vest/sentserver.h
+++ /dev/null
@@ -1,6 +0,0 @@
-#ifndef SENTSERVER_H
-#define SENTSERVER_H
-
-#define DEFAULT_PORT 50000
-
-#endif
diff --git a/vest/tac.pl b/vest/tac.pl
deleted file mode 100755
index 9fb525c1..00000000
--- a/vest/tac.pl
+++ /dev/null
@@ -1,8 +0,0 @@
-#!/usr/bin/perl
-
-while(<>) {
- chomp;
- $|=1;
- print (scalar reverse($_));
- print "\n";
-}
diff --git a/vest/test_aer/README b/vest/test_aer/README
deleted file mode 100644
index 819b2e32..00000000
--- a/vest/test_aer/README
+++ /dev/null
@@ -1,8 +0,0 @@
-To run the test:
-
-../dist-vest.pl --local --metric aer cdec.ini --source-file corpus.src --ref-files=ref.0 --weights weights
-
-This will optimize the parameters of the tiny lexical translation model
-so as to minimize the AER of the Viterbi alignment on the development
-set in corpus.src according to the reference alignments in ref.0.
-
diff --git a/vest/test_aer/cdec.ini b/vest/test_aer/cdec.ini
deleted file mode 100644
index 08187848..00000000
--- a/vest/test_aer/cdec.ini
+++ /dev/null
@@ -1,3 +0,0 @@
-formalism=lextrans
-grammar=grammar
-aligner=true
diff --git a/vest/test_aer/corpus.src b/vest/test_aer/corpus.src
deleted file mode 100644
index 31b23971..00000000
--- a/vest/test_aer/corpus.src
+++ /dev/null
@@ -1,3 +0,0 @@
-el gato negro ||| the black cat
-el gato ||| the cat
-el libro ||| the book
diff --git a/vest/test_aer/grammar b/vest/test_aer/grammar
deleted file mode 100644
index 9d857824..00000000
--- a/vest/test_aer/grammar
+++ /dev/null
@@ -1,12 +0,0 @@
-el ||| cat ||| F1=1
-el ||| the ||| F2=1
-el ||| black ||| F3=1
-el ||| book ||| F11=1
-gato ||| cat ||| F4=1 NN=1
-gato ||| black ||| F5=1
-gato ||| the ||| F6=1
-negro ||| the ||| F7=1
-negro ||| cat ||| F8=1
-negro ||| black ||| F9=1
-libro ||| the ||| F10=1
-libro ||| book ||| F12=1 NN=1
diff --git a/vest/test_aer/ref.0 b/vest/test_aer/ref.0
deleted file mode 100644
index 734a9c5b..00000000
--- a/vest/test_aer/ref.0
+++ /dev/null
@@ -1,3 +0,0 @@
-0-0 1-2 2-1
-0-0 1-1
-0-0 1-1
diff --git a/vest/test_aer/weights b/vest/test_aer/weights
deleted file mode 100644
index afc9282e..00000000
--- a/vest/test_aer/weights
+++ /dev/null
@@ -1,13 +0,0 @@
-F1 0.1
-F2 -.5980815
-F3 0.24235
-F4 0.625
-F5 0.4514
-F6 0.112316
-F7 -0.123415
-F8 -0.25390285
-F9 -0.23852
-F10 0.646
-F11 0.413141
-F12 0.343216
-NN -0.1215
diff --git a/vest/test_data/0.json.gz b/vest/test_data/0.json.gz
deleted file mode 100644
index 30f8dd77..00000000
--- a/vest/test_data/0.json.gz
+++ /dev/null
Binary files differ
diff --git a/vest/test_data/1.json.gz b/vest/test_data/1.json.gz
deleted file mode 100644
index c82cc179..00000000
--- a/vest/test_data/1.json.gz
+++ /dev/null
Binary files differ
diff --git a/vest/test_data/c2e.txt.0 b/vest/test_data/c2e.txt.0
deleted file mode 100644
index 12c4abe9..00000000
--- a/vest/test_data/c2e.txt.0
+++ /dev/null
@@ -1,2 +0,0 @@
-australia reopens embassy in manila
-( afp , manila , january 2 ) australia reopened its embassy in the philippines today , which was shut down about seven weeks ago due to what was described as a specific threat of a terrorist attack .
diff --git a/vest/test_data/c2e.txt.1 b/vest/test_data/c2e.txt.1
deleted file mode 100644
index 4ac12df1..00000000
--- a/vest/test_data/c2e.txt.1
+++ /dev/null
@@ -1,2 +0,0 @@
-australia reopened manila embassy
-( agence france-presse , manila , 2nd ) - australia reopened its embassy in the philippines today . the embassy was closed seven weeks ago after what was described as a specific threat of a terrorist attack .
diff --git a/vest/test_data/c2e.txt.2 b/vest/test_data/c2e.txt.2
deleted file mode 100644
index 2f67b72f..00000000
--- a/vest/test_data/c2e.txt.2
+++ /dev/null
@@ -1,2 +0,0 @@
-australia to reopen embassy in manila
-( afp report from manila , january 2 ) australia reopened its embassy in the philippines today . seven weeks ago , the embassy was shut down due to so-called confirmed terrorist attack threats .
diff --git a/vest/test_data/c2e.txt.3 b/vest/test_data/c2e.txt.3
deleted file mode 100644
index 5483cef6..00000000
--- a/vest/test_data/c2e.txt.3
+++ /dev/null
@@ -1,2 +0,0 @@
-australia to re - open its embassy to manila
-( afp , manila , thursday ) australia reopens its embassy to manila , which was closed for the so-called " clear " threat of terrorist attack 7 weeks ago .
diff --git a/vest/test_data/re.txt.0 b/vest/test_data/re.txt.0
deleted file mode 100644
index 86eff087..00000000
--- a/vest/test_data/re.txt.0
+++ /dev/null
@@ -1,5 +0,0 @@
-erdogan states turkey to reject any pressures to urge it to recognize cyprus
-ankara 12 - 1 ( afp ) - turkish prime minister recep tayyip erdogan announced today , wednesday , that ankara will reject any pressure by the european union to urge it to recognize cyprus . this comes two weeks before the summit of european union state and government heads who will decide whether or nor membership negotiations with ankara should be opened .
-erdogan told " ntv " television station that " the european union cannot address us by imposing new conditions on us with regard to cyprus .
-we will discuss this dossier in the course of membership negotiations . "
-he added " let me be clear , i cannot sidestep turkey , this is something we cannot accept . "
diff --git a/vest/test_data/re.txt.1 b/vest/test_data/re.txt.1
deleted file mode 100644
index 2140f198..00000000
--- a/vest/test_data/re.txt.1
+++ /dev/null
@@ -1,5 +0,0 @@
-erdogan confirms turkey will resist any pressure to recognize cyprus
-ankara 12 - 1 ( afp ) - the turkish head of government , recep tayyip erdogan , announced today ( wednesday ) that ankara would resist any pressure the european union might exercise in order to force it into recognizing cyprus . this comes two weeks before a summit of european union heads of state and government , who will decide whether or not to open membership negotiations with ankara .
-erdogan said to the ntv television channel : " the european union cannot engage with us through imposing new conditions on us with regard to cyprus .
-we shall discuss this issue in the course of the membership negotiations . "
-he added : " let me be clear - i cannot confine turkey . this is something we do not accept . "
diff --git a/vest/test_data/re.txt.2 b/vest/test_data/re.txt.2
deleted file mode 100644
index 94e46286..00000000
--- a/vest/test_data/re.txt.2
+++ /dev/null
@@ -1,5 +0,0 @@
-erdogan confirms that turkey will reject any pressures to encourage it to recognize cyprus
-ankara , 12 / 1 ( afp ) - the turkish prime minister recep tayyip erdogan declared today , wednesday , that ankara will reject any pressures that the european union may apply on it to encourage to recognize cyprus . this comes two weeks before a summit of the heads of countries and governments of the european union , who will decide on whether or not to start negotiations on joining with ankara .
-erdogan told the ntv television station that " it is not possible for the european union to talk to us by imposing new conditions on us regarding cyprus .
-we shall discuss this dossier during the negotiations on joining . "
-and he added , " let me be clear . turkey's arm should not be twisted ; this is something we cannot accept . "
diff --git a/vest/test_data/re.txt.3 b/vest/test_data/re.txt.3
deleted file mode 100644
index f87c3308..00000000
--- a/vest/test_data/re.txt.3
+++ /dev/null
@@ -1,5 +0,0 @@
-erdogan stresses that turkey will reject all pressures to force it to recognize cyprus
-ankara 12 - 1 ( afp ) - turkish prime minister recep tayyip erdogan announced today , wednesday , that ankara would refuse all pressures applied on it by the european union to force it to recognize cyprus . that came two weeks before the summit of the presidents and prime ministers of the european union , who would decide on whether to open negotiations on joining with ankara or not .
-erdogan said to " ntv " tv station that the " european union can not communicate with us by imposing on us new conditions related to cyprus .
-we will discuss this file during the negotiations on joining . "
-he added , " let me be clear . turkey's arm should not be twisted . this is unacceptable to us . "
diff --git a/vest/viterbi_envelope.cc b/vest/viterbi_envelope.cc
deleted file mode 100644
index 9fcf75a0..00000000
--- a/vest/viterbi_envelope.cc
+++ /dev/null
@@ -1,177 +0,0 @@
-#include "viterbi_envelope.h"
-
-#include <cassert>
-#include <limits>
-
-using namespace std;
-using boost::shared_ptr;
-
-ostream& operator<<(ostream& os, const ViterbiEnvelope& env) {
- os << '<';
- const vector<shared_ptr<Segment> >& segs = env.GetSortedSegs();
- for (int i = 0; i < segs.size(); ++i)
- os << (i==0 ? "" : "|") << "x=" << segs[i]->x << ",b=" << segs[i]->b << ",m=" << segs[i]->m << ",p1=" << segs[i]->p1 << ",p2=" << segs[i]->p2;
- return os << '>';
-}
-
-ViterbiEnvelope::ViterbiEnvelope(int i) {
- if (i == 0) {
- // do nothing - <>
- } else if (i == 1) {
- segs.push_back(shared_ptr<Segment>(new Segment(0, 0, 0, shared_ptr<Segment>(), shared_ptr<Segment>())));
- assert(this->IsMultiplicativeIdentity());
- } else {
- cerr << "Only can create ViterbiEnvelope semiring 0 and 1 with this constructor!\n";
- abort();
- }
-}
-
-struct SlopeCompare {
- bool operator() (const shared_ptr<Segment>& a, const shared_ptr<Segment>& b) const {
- return a->m < b->m;
- }
-};
-
-const ViterbiEnvelope& ViterbiEnvelope::operator+=(const ViterbiEnvelope& other) {
- if (!other.is_sorted) other.Sort();
- if (segs.empty()) {
- segs = other.segs;
- return *this;
- }
- is_sorted = false;
- int j = segs.size();
- segs.resize(segs.size() + other.segs.size());
- for (int i = 0; i < other.segs.size(); ++i)
- segs[j++] = other.segs[i];
- assert(j == segs.size());
- return *this;
-}
-
-void ViterbiEnvelope::Sort() const {
- sort(segs.begin(), segs.end(), SlopeCompare());
- const int k = segs.size();
- int j = 0;
- for (int i = 0; i < k; ++i) {
- Segment l = *segs[i];
- l.x = kMinusInfinity;
- // cerr << "m=" << l.m << endl;
- if (0 < j) {
- if (segs[j-1]->m == l.m) { // lines are parallel
- if (l.b <= segs[j-1]->b) continue;
- --j;
- }
- while(0 < j) {
- l.x = (l.b - segs[j-1]->b) / (segs[j-1]->m - l.m);
- if (segs[j-1]->x < l.x) break;
- --j;
- }
- if (0 == j) l.x = kMinusInfinity;
- }
- *segs[j++] = l;
- }
- segs.resize(j);
- is_sorted = true;
-}
-
-const ViterbiEnvelope& ViterbiEnvelope::operator*=(const ViterbiEnvelope& other) {
- if (other.IsMultiplicativeIdentity()) { return *this; }
- if (this->IsMultiplicativeIdentity()) { (*this) = other; return *this; }
-
- if (!is_sorted) Sort();
- if (!other.is_sorted) other.Sort();
-
- if (this->IsEdgeEnvelope()) {
-// if (other.size() > 1)
-// cerr << *this << " (TIMES) " << other << endl;
- shared_ptr<Segment> edge_parent = segs[0];
- const double& edge_b = edge_parent->b;
- const double& edge_m = edge_parent->m;
- segs.clear();
- for (int i = 0; i < other.segs.size(); ++i) {
- const Segment& seg = *other.segs[i];
- const double m = seg.m + edge_m;
- const double b = seg.b + edge_b;
- const double& x = seg.x; // x's don't change with *
- segs.push_back(shared_ptr<Segment>(new Segment(x, m, b, edge_parent, other.segs[i])));
- assert(segs.back()->p1->edge);
- }
-// if (other.size() > 1)
-// cerr << " = " << *this << endl;
- } else {
- vector<shared_ptr<Segment> > new_segs;
- int this_i = 0;
- int other_i = 0;
- const int this_size = segs.size();
- const int other_size = other.segs.size();
- double cur_x = kMinusInfinity; // moves from left to right across the
- // real numbers, stopping for all inter-
- // sections
- double this_next_val = (1 < this_size ? segs[1]->x : kPlusInfinity);
- double other_next_val = (1 < other_size ? other.segs[1]->x : kPlusInfinity);
- while (this_i < this_size && other_i < other_size) {
- const Segment& this_seg = *segs[this_i];
- const Segment& other_seg= *other.segs[other_i];
- const double m = this_seg.m + other_seg.m;
- const double b = this_seg.b + other_seg.b;
-
- new_segs.push_back(shared_ptr<Segment>(new Segment(cur_x, m, b, segs[this_i], other.segs[other_i])));
- int comp = 0;
- if (this_next_val < other_next_val) comp = -1; else
- if (this_next_val > other_next_val) comp = 1;
- if (0 == comp) { // the next values are equal, advance both indices
- ++this_i;
- ++other_i;
- cur_x = this_next_val; // could be other_next_val (they're equal!)
- this_next_val = (this_i+1 < this_size ? segs[this_i+1]->x : kPlusInfinity);
- other_next_val = (other_i+1 < other_size ? other.segs[other_i+1]->x : kPlusInfinity);
- } else { // advance the i with the lower x, update cur_x
- if (-1 == comp) {
- ++this_i;
- cur_x = this_next_val;
- this_next_val = (this_i+1 < this_size ? segs[this_i+1]->x : kPlusInfinity);
- } else {
- ++other_i;
- cur_x = other_next_val;
- other_next_val = (other_i+1 < other_size ? other.segs[other_i+1]->x : kPlusInfinity);
- }
- }
- }
- segs.swap(new_segs);
- }
- //cerr << "Multiply: result=" << (*this) << endl;
- return *this;
-}
-
-// recursively construct translation
-void Segment::ConstructTranslation(vector<WordID>* trans) const {
- const Segment* cur = this;
- vector<vector<WordID> > ant_trans;
- while(!cur->edge) {
- ant_trans.resize(ant_trans.size() + 1);
- cur->p2->ConstructTranslation(&ant_trans.back());
- cur = cur->p1.get();
- }
- size_t ant_size = ant_trans.size();
- vector<const vector<WordID>*> pants(ant_size);
- assert(ant_size == cur->edge->tail_nodes_.size());
- --ant_size;
- for (int i = 0; i < pants.size(); ++i) pants[ant_size - i] = &ant_trans[i];
- cur->edge->rule_->ESubstitute(pants, trans);
-}
-
-void Segment::CollectEdgesUsed(std::vector<bool>* edges_used) const {
- if (edge) {
- assert(edge->id_ < edges_used->size());
- (*edges_used)[edge->id_] = true;
- }
- if (p1) p1->CollectEdgesUsed(edges_used);
- if (p2) p2->CollectEdgesUsed(edges_used);
-}
-
-ViterbiEnvelope ViterbiEnvelopeWeightFunction::operator()(const Hypergraph::Edge& e) const {
- const double m = direction.dot(e.feature_values_);
- const double b = origin.dot(e.feature_values_);
- Segment* seg = new Segment(m, b, e);
- return ViterbiEnvelope(1, seg);
-}
-
diff --git a/vest/viterbi_envelope.h b/vest/viterbi_envelope.h
deleted file mode 100644
index 60ad82d8..00000000
--- a/vest/viterbi_envelope.h
+++ /dev/null
@@ -1,81 +0,0 @@
-#ifndef _VITERBI_ENVELOPE_H_
-#define _VITERBI_ENVELOPE_H_
-
-#include <vector>
-#include <iostream>
-#include <boost/shared_ptr.hpp>
-
-#include "hg.h"
-#include "sparse_vector.h"
-
-static const double kMinusInfinity = -std::numeric_limits<double>::infinity();
-static const double kPlusInfinity = std::numeric_limits<double>::infinity();
-
-struct Segment {
- Segment() : x(), m(), b(), edge() {}
- Segment(double _m, double _b) :
- x(kMinusInfinity), m(_m), b(_b), edge() {}
- Segment(double _x, double _m, double _b, const boost::shared_ptr<Segment>& p1_, const boost::shared_ptr<Segment>& p2_) :
- x(_x), m(_m), b(_b), p1(p1_), p2(p2_), edge() {}
- Segment(double _m, double _b, const Hypergraph::Edge& edge) :
- x(kMinusInfinity), m(_m), b(_b), edge(&edge) {}
-
- double x; // x intersection with previous segment in env, or -inf if none
- double m; // this line's slope
- double b; // intercept with y-axis
-
- // we keep a pointer to the "parents" of this segment so we can reconstruct
- // the Viterbi translation corresponding to this segment
- boost::shared_ptr<Segment> p1;
- boost::shared_ptr<Segment> p2;
-
- // only Segments created from an edge using the ViterbiEnvelopeWeightFunction
- // have rules
- // TRulePtr rule;
- const Hypergraph::Edge* edge;
-
- // recursively recover the Viterbi translation that will result from setting
- // the weights to origin + axis * x, where x is any value from this->x up
- // until the next largest x in the containing ViterbiEnvelope
- void ConstructTranslation(std::vector<WordID>* trans) const;
- void CollectEdgesUsed(std::vector<bool>* edges_used) const;
-};
-
-// this is the semiring value type,
-// it defines constructors for 0, 1, and the operations + and *
-struct ViterbiEnvelope {
- // create semiring zero
- ViterbiEnvelope() : is_sorted(true) {} // zero
- // for debugging:
- ViterbiEnvelope(const std::vector<boost::shared_ptr<Segment> >& s) : segs(s) { Sort(); }
- // create semiring 1 or 0
- explicit ViterbiEnvelope(int i);
- ViterbiEnvelope(int n, Segment* seg) : is_sorted(true), segs(n, boost::shared_ptr<Segment>(seg)) {}
- const ViterbiEnvelope& operator+=(const ViterbiEnvelope& other);
- const ViterbiEnvelope& operator*=(const ViterbiEnvelope& other);
- bool IsMultiplicativeIdentity() const {
- return size() == 1 && (segs[0]->b == 0.0 && segs[0]->m == 0.0) && (!segs[0]->edge) && (!segs[0]->p1) && (!segs[0]->p2); }
- const std::vector<boost::shared_ptr<Segment> >& GetSortedSegs() const {
- if (!is_sorted) Sort();
- return segs;
- }
- size_t size() const { return segs.size(); }
-
- private:
- bool IsEdgeEnvelope() const {
- return segs.size() == 1 && segs[0]->edge; }
- void Sort() const;
- mutable bool is_sorted;
- mutable std::vector<boost::shared_ptr<Segment> > segs;
-};
-std::ostream& operator<<(std::ostream& os, const ViterbiEnvelope& env);
-
-struct ViterbiEnvelopeWeightFunction {
- ViterbiEnvelopeWeightFunction(const SparseVector<double>& ori,
- const SparseVector<double>& dir) : origin(ori), direction(dir) {}
- ViterbiEnvelope operator()(const Hypergraph::Edge& e) const;
- const SparseVector<double> origin;
- const SparseVector<double> direction;
-};
-
-#endif