diff options
| author | Michael Denkowski <michael.j.denkowski@gmail.com> | 2012-12-22 16:01:23 -0500 | 
|---|---|---|
| committer | Michael Denkowski <michael.j.denkowski@gmail.com> | 2012-12-22 16:01:23 -0500 | 
| commit | 778a4cec55f82bcc66b3f52de7cc871e8daaeb92 (patch) | |
| tree | 2a5bccaa85965855104c4e8ac3738b2e1c77f164 /dpmert | |
| parent | 57fff9eea5ba0e71fb958fdb4f32d17f2fe31108 (diff) | |
| parent | d21491daa5e50b4456c7c5f9c2e51d25afd2a757 (diff) | |
Merge branch 'master' of git://github.com/redpony/cdec
Diffstat (limited to 'dpmert')
39 files changed, 0 insertions, 3296 deletions
| diff --git a/dpmert/Makefile.am b/dpmert/Makefile.am deleted file mode 100644 index 00768271..00000000 --- a/dpmert/Makefile.am +++ /dev/null @@ -1,33 +0,0 @@ -bin_PROGRAMS = \ -  mr_dpmert_map \ -  mr_dpmert_reduce \ -  mr_dpmert_generate_mapper_input \ -  sentserver \ -  sentclient - -noinst_PROGRAMS = \ -  lo_test -TESTS = lo_test - -sentserver_SOURCES = sentserver.c -sentserver_LDFLAGS = -pthread - -sentclient_SOURCES = sentclient.c -sentclient_LDFLAGS = -pthread - -mr_dpmert_generate_mapper_input_SOURCES = mr_dpmert_generate_mapper_input.cc line_optimizer.cc -mr_dpmert_generate_mapper_input_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz - -# nbest2hg_SOURCES = nbest2hg.cc -# nbest2hg_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lfst -lz - -mr_dpmert_map_SOURCES = mert_geometry.cc ces.cc error_surface.cc mr_dpmert_map.cc line_optimizer.cc -mr_dpmert_map_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz - -mr_dpmert_reduce_SOURCES = error_surface.cc ces.cc mr_dpmert_reduce.cc line_optimizer.cc mert_geometry.cc -mr_dpmert_reduce_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz - -lo_test_SOURCES = lo_test.cc ces.cc mert_geometry.cc error_surface.cc line_optimizer.cc -lo_test_LDADD = $(BOOST_UNIT_TEST_FRAMEWORK_LDFLAGS) $(BOOST_UNIT_TEST_FRAMEWORK_LIBS) $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz - -AM_CPPFLAGS = -DBOOST_TEST_DYN_LINK -W -Wall -Wno-sign-compare $(GTEST_CPPFLAGS) -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval diff --git a/dpmert/README.shared-mem b/dpmert/README.shared-mem deleted file mode 100644 index 7728efc0..00000000 --- a/dpmert/README.shared-mem +++ /dev/null @@ -1,9 +0,0 @@ -If you want to run dist-vest.pl on a very large shared memory machine, do the -following: - -  ./dist-vest.pl --use-make I --decode-nodes J --weights weights.init --source-file=dev.src --ref-files=dev.ref.* cdec.ini - -This will use I jobs for doing the line search and J jobs to run the decoder. Typically, since the -decoder must load grammars, language models, etc., J should be smaller than I, but this will depend -on the system you are running on and the complexity of the models used for decoding. - diff --git a/dpmert/ces.cc b/dpmert/ces.cc deleted file mode 100644 index 157b2d17..00000000 --- a/dpmert/ces.cc +++ /dev/null @@ -1,90 +0,0 @@ -#include "ces.h" - -#include <vector> -#include <sstream> -#include <boost/shared_ptr.hpp> - -// TODO, if AER is to be optimized again, we will need this -// #include "aligner.h" -#include "lattice.h" -#include "mert_geometry.h" -#include "error_surface.h" -#include "ns.h" - -using namespace std; - -const bool minimize_segments = true;    // if adjacent segments have equal scores, merge them - -void ComputeErrorSurface(const SegmentEvaluator& ss, -                         const ConvexHull& ve, -                         ErrorSurface* env, -                         const EvaluationMetric* metric, -                         const Hypergraph& hg) { -  vector<WordID> prev_trans; -  const vector<boost::shared_ptr<MERTPoint> >& ienv = ve.GetSortedSegs(); -  env->resize(ienv.size()); -  SufficientStats prev_score; // defaults to 0 -  int j = 0; -  for (unsigned i = 0; i < ienv.size(); ++i) { -    const MERTPoint& seg = *ienv[i]; -    vector<WordID> trans; -#if 0 -    if (type == AER) { -      vector<bool> edges(hg.edges_.size(), false); -      seg.CollectEdgesUsed(&edges);  // get the set of edges in the viterbi -                                     // alignment -      ostringstream os; -      const string* psrc = ss.GetSource(); -      if (psrc == NULL) { -        cerr << "AER scoring in VEST requires source, but it is missing!\n"; -        abort(); -      } -      size_t pos = psrc->rfind(" ||| "); -      if (pos == string::npos) { -        cerr << "Malformed source for AER: expected |||\nINPUT: " << *psrc << endl; -        abort(); -      } -      Lattice src; -      Lattice ref; -      LatticeTools::ConvertTextOrPLF(psrc->substr(0, pos), &src); -      LatticeTools::ConvertTextOrPLF(psrc->substr(pos + 5), &ref); -      AlignerTools::WriteAlignment(src, ref, hg, &os, true, 0, &edges); -      string tstr = os.str(); -      TD::ConvertSentence(tstr.substr(tstr.rfind(" ||| ") + 5), &trans); -    } else { -#endif -      seg.ConstructTranslation(&trans); -    //} -    //cerr << "Scoring: " << TD::GetString(trans) << endl; -    if (trans == prev_trans) { -      if (!minimize_segments) { -        ErrorSegment& out = (*env)[j]; -        out.delta.fields.clear(); -        out.x = seg.x; -	++j; -      } -      //cerr << "Identical translation, skipping scoring\n"; -    } else { -      SufficientStats score; -      ss.Evaluate(trans, &score); -      // cerr << "score= " << score->ComputeScore() << "\n"; -      //string x1; score.Encode(&x1); cerr << "STATS: " << x1 << endl; -      const SufficientStats delta = score - prev_score; -      //string x2; delta.Encode(&x2); cerr << "DELTA: " << x2 << endl; -      //string xx; delta.Encode(&xx); cerr << xx << endl; -      prev_trans.swap(trans); -      prev_score = score; -      if ((!minimize_segments) || (!delta.IsAdditiveIdentity())) { -        ErrorSegment& out = (*env)[j]; -        out.delta = delta; -        out.x = seg.x; -        ++j; -      } -    } -  } -  // cerr << " In segments: " << ienv.size() << endl; -  // cerr << "Out segments: " << j << endl; -  assert(j > 0); -  env->resize(j); -} - diff --git a/dpmert/ces.h b/dpmert/ces.h deleted file mode 100644 index e4fa2080..00000000 --- a/dpmert/ces.h +++ /dev/null @@ -1,16 +0,0 @@ -#ifndef _CES_H_ -#define _CES_H_ - -class ConvexHull; -class Hypergraph; -class SegmentEvaluator; -class ErrorSurface; -class EvaluationMetric; - -void ComputeErrorSurface(const SegmentEvaluator& ss, -                         const ConvexHull& convex_hull, -                         ErrorSurface* es, -                         const EvaluationMetric* metric, -                         const Hypergraph& hg); - -#endif diff --git a/dpmert/decode-and-evaluate.pl b/dpmert/decode-and-evaluate.pl deleted file mode 100755 index fe765d00..00000000 --- a/dpmert/decode-and-evaluate.pl +++ /dev/null @@ -1,246 +0,0 @@ -#!/usr/bin/env perl -use strict; -my @ORIG_ARGV=@ARGV; -use Cwd qw(getcwd); -my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../environment"; } - -# Skip local config (used for distributing jobs) if we're running in local-only mode -use LocalConfig; -use Getopt::Long; -use File::Basename qw(basename); -my $QSUB_CMD = qsub_args(mert_memory()); - -require "libcall.pl"; - -# Default settings -my $default_jobs = env_default_jobs(); -my $bin_dir = $SCRIPT_DIR; -die "Bin directory $bin_dir missing/inaccessible" unless -d $bin_dir; -my $FAST_SCORE="$bin_dir/../mteval/fast_score"; -die "Can't execute $FAST_SCORE" unless -x $FAST_SCORE; -my $parallelize = "$bin_dir/parallelize.pl"; -my $libcall = "$bin_dir/libcall.pl"; -my $sentserver = "$bin_dir/sentserver"; -my $sentclient = "$bin_dir/sentclient"; -my $LocalConfig = "$SCRIPT_DIR/../environment/LocalConfig.pm"; - -my $SCORER = $FAST_SCORE; -my $cdec = "$bin_dir/../decoder/cdec"; -die "Can't find decoder in $cdec" unless -x $cdec; -die "Can't find $parallelize" unless -x $parallelize; -die "Can't find $libcall" unless -e $libcall; -my $decoder = $cdec; -my $jobs = $default_jobs;   # number of decode nodes -my $pmem = "9g"; -my $help = 0; -my $config; -my $test_set; -my $weights; -my $use_make = 1; -my $useqsub; -my $cpbin=1; -# Process command-line options -if (GetOptions( -	"jobs=i" => \$jobs, -	"help" => \$help, -	"qsub" => \$useqsub, -	"input=s" => \$test_set, -        "config=s" => \$config, -	"weights=s" => \$weights, -) == 0 || @ARGV!=0 || $help) { -	print_help(); -	exit; -} - -if ($useqsub) { -  $use_make = 0; -  die "LocalEnvironment.pm does not have qsub configuration for this host. Cannot run with --qsub!\n" unless has_qsub(); -} - -my @missing_args = (); - -if (!defined $test_set) { push @missing_args, "--input"; } -if (!defined $config) { push @missing_args, "--config"; } -if (!defined $weights) { push @missing_args, "--weights"; } -die "Please specify missing arguments: " . join (', ', @missing_args) . "\nUse --help for more information.\n" if (@missing_args); - -my @tf = localtime(time); -my $tname = basename($test_set); -$tname =~ s/\.(sgm|sgml|xml)$//i; -my $dir = "eval.$tname." . sprintf('%d%02d%02d-%02d%02d%02d', 1900+$tf[5], $tf[4], $tf[3], $tf[2], $tf[1], $tf[0]); - -my $time = unchecked_output("date"); - -check_call("mkdir -p $dir"); - -split_devset($test_set, "$dir/test.input.raw", "$dir/test.refs"); -my $refs = "-r $dir/test.refs"; -my $newsrc = "$dir/test.input"; -enseg("$dir/test.input.raw", $newsrc); -my $src_file = $newsrc; -open F, "<$src_file" or die "Can't read $src_file: $!"; close F; - -my $test_trans="$dir/test.trans"; -my $logdir="$dir/logs"; -my $decoderLog="$logdir/decoder.sentserver.log"; -check_call("mkdir -p $logdir"); - -#decode -print STDERR "RUNNING DECODER AT "; -print STDERR unchecked_output("date"); -my $decoder_cmd = "$decoder -c $config --weights $weights"; -my $pcmd; -if ($use_make) { -	$pcmd = "cat $src_file | $parallelize --workdir $dir --use-fork -p $pmem -e $logdir -j $jobs --"; -} else { -	$pcmd = "cat $src_file | $parallelize --workdir $dir -p $pmem -e $logdir -j $jobs --"; -} -my $cmd = "$pcmd $decoder_cmd 2> $decoderLog 1> $test_trans"; -check_bash_call($cmd); -print STDERR "DECODER COMPLETED AT "; -print STDERR unchecked_output("date"); -print STDERR "\nOUTPUT: $test_trans\n\n"; -my $bleu = check_output("cat $test_trans | $SCORER $refs -m ibm_bleu"); -chomp $bleu; -print STDERR "BLEU: $bleu\n"; -my $ter = check_output("cat $test_trans | $SCORER $refs -m ter"); -chomp $ter; -print STDERR " TER: $ter\n"; -open TR, ">$dir/test.scores" or die "Can't write $dir/test.scores: $!"; -print TR <<EOT; -### SCORE REPORT ############################################################# -        OUTPUT=$test_trans -  SCRIPT INPUT=$test_set - DECODER INPUT=$src_file -    REFERENCES=$dir/test.refs ------------------------------------------------------------------------------- -          BLEU=$bleu -           TER=$ter -############################################################################## -EOT -close TR; -my $sr = unchecked_output("cat $dir/test.scores"); -print STDERR "\n\n$sr\n(A copy of this report can be found in $dir/test.scores)\n\n"; -exit 0; - -sub enseg { -	my $src = shift; -	my $newsrc = shift; -	open(SRC, $src); -	open(NEWSRC, ">$newsrc"); -	my $i=0; -	while (my $line=<SRC>){ -		chomp $line; -		if ($line =~ /^\s*<seg/i) { -		    if($line =~ /id="[0-9]+"/) { -			print NEWSRC "$line\n"; -		    } else { -			die "When using segments with pre-generated <seg> tags, you must include a zero-based id attribute"; -		    } -		} else { -			print NEWSRC "<seg id=\"$i\">$line</seg>\n"; -		} -		$i++; -	} -	close SRC; -	close NEWSRC; -} - -sub print_help { -	my $executable = basename($0); chomp $executable; -	print << "Help"; - -Usage: $executable [options] <ini file> - -	$executable --config cdec.ini --weights weights.txt [--jobs N] [--qsub] <testset.in-ref> - -Options: - -	--help -		Print this message and exit. - -	--config <file> -		A path to the cdec.ini file. - -	--weights <file> -		A file specifying feature weights. - -	--dir <dir> -		Directory for intermediate and output files. - -Job control options: - -	--jobs <I> -		Number of decoder processes to run in parallel. [default=$default_jobs] - -	--qsub -		Use qsub to run jobs in parallel (qsub must be configured in -		environment/LocalEnvironment.pm) - -	--pmem <N> -		Amount of physical memory requested for parallel decoding jobs -		(used with qsub requests only) - -Help -} - -sub convert { -  my ($str) = @_; -  my @ps = split /;/, $str; -  my %dict = (); -  for my $p (@ps) { -    my ($k, $v) = split /=/, $p; -    $dict{$k} = $v; -  } -  return %dict; -} - - - -sub cmdline { -    return join ' ',($0,@ORIG_ARGV); -} - -#buggy: last arg gets quoted sometimes? -my $is_shell_special=qr{[ \t\n\\><|&;"'`~*?{}$!()]}; -my $shell_escape_in_quote=qr{[\\"\$`!]}; - -sub escape_shell { -    my ($arg)=@_; -    return undef unless defined $arg; -    if ($arg =~ /$is_shell_special/) { -        $arg =~ s/($shell_escape_in_quote)/\\$1/g; -        return "\"$arg\""; -    } -    return $arg; -} - -sub escaped_shell_args { -    return map {local $_=$_;chomp;escape_shell($_)} @_; -} - -sub escaped_shell_args_str { -    return join ' ',&escaped_shell_args(@_); -} - -sub escaped_cmdline { -    return "$0 ".&escaped_shell_args_str(@ORIG_ARGV); -} - -sub split_devset { -  my ($infile, $outsrc, $outref) = @_; -  open F, "<$infile" or die "Can't read $infile: $!"; -  open S, ">$outsrc" or die "Can't write $outsrc: $!"; -  open R, ">$outref" or die "Can't write $outref: $!"; -  while(<F>) { -    chomp; -    my ($src, @refs) = split /\s*\|\|\|\s*/; -    die "Malformed devset line: $_\n" unless scalar @refs > 0; -    print S "$src\n"; -    print R join(' ||| ', @refs) . "\n"; -  } -  close R; -  close S; -  close F; -} - diff --git a/dpmert/divide_refs.py b/dpmert/divide_refs.py deleted file mode 100755 index b478f918..00000000 --- a/dpmert/divide_refs.py +++ /dev/null @@ -1,15 +0,0 @@ -#!/usr/bin/env python -import sys - -(numRefs, outPrefix) = sys.argv[1:] -numRefs = int(numRefs) - -outs = [open(outPrefix+str(i), "w") for i in range(numRefs)] - -i = 0 -for line in sys.stdin: -  outs[i].write(line) -  i = (i + 1) % numRefs - -for out in outs: -  out.close() diff --git a/dpmert/dpmert.pl b/dpmert/dpmert.pl deleted file mode 100755 index c4f98870..00000000 --- a/dpmert/dpmert.pl +++ /dev/null @@ -1,617 +0,0 @@ -#!/usr/bin/env perl -use strict; -my @ORIG_ARGV=@ARGV; -use Cwd qw(getcwd); -my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../environment"; } - -# Skip local config (used for distributing jobs) if we're running in local-only mode -use LocalConfig; -use Getopt::Long; -use File::Basename qw(basename); -require "libcall.pl"; - -my $QSUB_CMD = qsub_args(mert_memory()); - -# Default settings -my $srcFile;  # deprecated -my $refFiles; # deprecated -my $default_jobs = env_default_jobs(); -my $bin_dir = $SCRIPT_DIR; -die "Bin directory $bin_dir missing/inaccessible" unless -d $bin_dir; -my $FAST_SCORE="$bin_dir/../mteval/fast_score"; -die "Can't execute $FAST_SCORE" unless -x $FAST_SCORE; -my $MAPINPUT = "$bin_dir/mr_dpmert_generate_mapper_input"; -my $MAPPER = "$bin_dir/mr_dpmert_map"; -my $REDUCER = "$bin_dir/mr_dpmert_reduce"; -my $parallelize = "$bin_dir/parallelize.pl"; -my $libcall = "$bin_dir/libcall.pl"; -my $sentserver = "$bin_dir/sentserver"; -my $sentclient = "$bin_dir/sentclient"; -my $LocalConfig = "$SCRIPT_DIR/../environment/LocalConfig.pm"; - -my $SCORER = $FAST_SCORE; -die "Can't find $MAPPER" unless -x $MAPPER; -my $cdec = "$bin_dir/../decoder/cdec"; -die "Can't find decoder in $cdec" unless -x $cdec; -die "Can't find $parallelize" unless -x $parallelize; -die "Can't find $libcall" unless -e $libcall; -my $decoder = $cdec; -my $lines_per_mapper = 200; -my $rand_directions = 15; -my $iteration = 1; -my $best_weights; -my $max_iterations = 15; -my $optimization_iters = 6; -my $jobs = $default_jobs;   # number of decode nodes -my $pmem = "9g"; -my $disable_clean = 0; -my %seen_weights; -my $help = 0; -my $epsilon = 0.0001; -my $last_score = -10000000; -my $metric = "ibm_bleu"; -my $dir; -my $iniFile; -my $weights; -my $initialWeights; -my $bleu_weight=1; -my $use_make = 1;  # use make to parallelize line search -my $useqsub; -my $pass_suffix = ''; -my $devset; -# Process command-line options -if (GetOptions( -	"config=s" => \$iniFile, -	"weights=s" => \$initialWeights, -        "devset=s" => \$devset, -	"jobs=i" => \$jobs, -	"pass-suffix=s" => \$pass_suffix, -	"help" => \$help, -	"qsub" => \$useqsub, -	"iterations=i" => \$max_iterations, -	"pmem=s" => \$pmem, -	"random-directions=i" => \$rand_directions, -	"metric=s" => \$metric, -	"source-file=s" => \$srcFile, -	"output-dir=s" => \$dir, -) == 0 || @ARGV!=0 || $help) { -	print_help(); -	exit; -} - -if ($useqsub) { -  $use_make = 0; -  die "LocalEnvironment.pm does not have qsub configuration for this host. Cannot run with --qsub!\n" unless has_qsub(); -} - -my @missing_args = (); -if (defined $srcFile || defined $refFiles) { -  die <<EOT; - -  The options --ref-files and --source-file are no longer supported. -  Please specify the input file and its reference translations with -  --devset FILE - -EOT -} - -if (!defined $iniFile) { push @missing_args, "--config"; } -if (!defined $devset) { push @missing_args, "--devset"; } -if (!defined $initialWeights) { push @missing_args, "--weights"; } -die "Please specify missing arguments: " . join (', ', @missing_args) . "\nUse --help for more information.\n" if (@missing_args); - -if ($metric =~ /^(combi|ter)$/i) { -  $lines_per_mapper = 40; -} elsif ($metric =~ /^meteor$/i) { -  $lines_per_mapper = 2000;   # start up time is really high for METEOR -} - - -my $nodelist; -my $host =check_output("hostname"); chomp $host; -my $bleu; -my $interval_count = 0; -my $logfile; -my $projected_score; - -# used in sorting scores -my $DIR_FLAG = '-r'; -if ($metric =~ /^ter$|^aer$/i) { -  $DIR_FLAG = ''; -} - -unless ($dir){ -	$dir = "dpmert"; -} -unless ($dir =~ /^\//){  # convert relative path to absolute path -	my $basedir = check_output("pwd"); -	chomp $basedir; -	$dir = "$basedir/$dir"; -} - - -# Initializations and helper functions -srand; - -my @childpids = (); -my @cleanupcmds = (); - -sub cleanup { -	print STDERR "Cleanup...\n"; -	for my $pid (@childpids){ unchecked_call("kill $pid"); } -	for my $cmd (@cleanupcmds){ unchecked_call("$cmd"); } -	exit 1; -}; -# Always call cleanup, no matter how we exit -*CORE::GLOBAL::exit = sub{ cleanup(); };  -$SIG{INT} = "cleanup"; -$SIG{TERM} = "cleanup"; -$SIG{HUP} = "cleanup"; - -my $decoderBase = basename($decoder); chomp $decoderBase; -my $newIniFile = "$dir/$decoderBase.ini"; -my $inputFileName = "$dir/input"; -my $user = $ENV{"USER"}; - -# process ini file --e $iniFile || die "Error: could not open $iniFile for reading\n"; - -sub dirsize { -    opendir ISEMPTY,$_[0]; -    return scalar(readdir(ISEMPTY))-1; -} -if (-e $dir) { -	# allow preexisting logfile, binaries, but not dist-dpmert.pl outputs -	die "ERROR: output directory $dir already exists (remove or use --output-dir dir)\n\n"; -} else { -	mkdir "$dir" or die "Can't mkdir $dir: $!"; -	mkdir "$dir/hgs" or die; -	mkdir "$dir/scripts" or die; -	print STDERR <<EOT; -	DECODER:          $decoder -	INI FILE:         $iniFile -	WORKING DIR:      $dir -	DEVSET:           $devset -	EVAL METRIC:      $metric -	MAX ITERATIONS:   $max_iterations -	PARALLEL JOBS:    $jobs -	HEAD NODE:        $host -	PMEM (DECODING):  $pmem -	INITIAL WEIGHTS:  $initialWeights -EOT -} - -# Generate initial files and values -check_call("cp $iniFile $newIniFile"); -check_call("cp $initialWeights $dir/weights.0"); -$iniFile = $newIniFile; - -split_devset($devset, "$dir/dev.input.raw", "$dir/dev.refs"); -my $refs = "-r $dir/dev.refs"; -my $newsrc = "$dir/dev.input"; -enseg("$dir/dev.input.raw", $newsrc); -$srcFile = $newsrc; -my $devSize = 0; -open F, "<$srcFile" or die "Can't read $srcFile: $!"; -while(<F>) { $devSize++; } -close F; - -unless($best_weights){ $best_weights = $weights; } -unless($projected_score){ $projected_score = 0.0; } -$seen_weights{$weights} = 1; - -my $random_seed = int(time / 1000); -my $lastWeightsFile; -my $lastPScore = 0; -# main optimization loop -while (1){ -	print STDERR "\n\nITERATION $iteration\n==========\n"; - -	if ($iteration > $max_iterations){ -		print STDERR "\nREACHED STOPPING CRITERION: Maximum iterations\n"; -		last; -	} -	# iteration-specific files -	my $runFile="$dir/run.raw.$iteration"; -	my $onebestFile="$dir/1best.$iteration"; -	my $logdir="$dir/logs.$iteration"; -	my $decoderLog="$logdir/decoder.sentserver.log.$iteration"; -	my $scorerLog="$logdir/scorer.log.$iteration"; -	check_call("mkdir -p $logdir"); - - -	#decode -	print STDERR "RUNNING DECODER AT "; -	print STDERR unchecked_output("date"); -	my $im1 = $iteration - 1; -	my $weightsFile="$dir/weights.$im1"; -	my $decoder_cmd = "$decoder -c $iniFile --weights$pass_suffix $weightsFile -O $dir/hgs"; -	my $pcmd; -	if ($use_make) { -		$pcmd = "cat $srcFile | $parallelize --workdir $dir --use-fork -p $pmem -e $logdir -j $jobs --"; -	} else { -		$pcmd = "cat $srcFile | $parallelize --workdir $dir -p $pmem -e $logdir -j $jobs --"; -	} -	my $cmd = "$pcmd $decoder_cmd 2> $decoderLog 1> $runFile"; -	print STDERR "COMMAND:\n$cmd\n"; -	check_bash_call($cmd); -        my $num_hgs; -        my $num_topbest; -        my $retries = 0; -	while($retries < 5) { -	    $num_hgs = check_output("ls $dir/hgs/*.gz | wc -l"); -	    $num_topbest = check_output("wc -l < $runFile"); -	    print STDERR "NUMBER OF HGs: $num_hgs\n"; -	    print STDERR "NUMBER OF TOP-BEST HYPs: $num_topbest\n"; -	    if($devSize == $num_hgs && $devSize == $num_topbest) { -		last; -	    } else { -		print STDERR "Incorrect number of hypergraphs or topbest. Waiting for distributed filesystem and retrying...\n"; -		sleep(3); -	    } -	    $retries++; -	} -	die "Dev set contains $devSize sentences, but we don't have topbest and hypergraphs for all these! Decoder failure? Check $decoderLog\n" if ($devSize != $num_hgs || $devSize != $num_topbest); -	my $dec_score = check_output("cat $runFile | $SCORER $refs -m $metric"); -	chomp $dec_score; -	print STDERR "DECODER SCORE: $dec_score\n"; - -	# save space -	check_call("gzip -f $runFile"); -	check_call("gzip -f $decoderLog"); - -	# run optimizer -	print STDERR "RUNNING OPTIMIZER AT "; -	print STDERR unchecked_output("date"); -	my $mergeLog="$logdir/prune-merge.log.$iteration"; - -	my $score = 0; -	my $icc = 0; -	my $inweights="$dir/weights.$im1"; -	for (my $opt_iter=1; $opt_iter<$optimization_iters; $opt_iter++) { -		print STDERR "\nGENERATE OPTIMIZATION STRATEGY (OPT-ITERATION $opt_iter/$optimization_iters)\n"; -		print STDERR unchecked_output("date"); -		$icc++; -		$cmd="$MAPINPUT -w $inweights -r $dir/hgs -s $devSize -d $rand_directions > $dir/agenda.$im1-$opt_iter"; -		print STDERR "COMMAND:\n$cmd\n"; -		check_call($cmd); -		check_call("mkdir -p $dir/splag.$im1"); -		$cmd="split -a 3 -l $lines_per_mapper $dir/agenda.$im1-$opt_iter $dir/splag.$im1/mapinput."; -		print STDERR "COMMAND:\n$cmd\n"; -		check_call($cmd); -		opendir(DIR, "$dir/splag.$im1") or die "Can't open directory: $!"; -		my @shards = grep { /^mapinput\./ } readdir(DIR); -		closedir DIR; -		die "No shards!" unless scalar @shards > 0; -		my $joblist = ""; -		my $nmappers = 0; -		my @mapoutputs = (); -		@cleanupcmds = (); -		my %o2i = (); -		my $first_shard = 1; -		my $mkfile; # only used with makefiles -		my $mkfilename; -		if ($use_make) { -			$mkfilename = "$dir/splag.$im1/domap.mk"; -			open $mkfile, ">$mkfilename" or die "Couldn't write $mkfilename: $!"; -			print $mkfile "all: $dir/splag.$im1/map.done\n\n"; -		} -		my @mkouts = ();  # only used with makefiles -		for my $shard (@shards) { -			my $mapoutput = $shard; -			my $client_name = $shard; -			$client_name =~ s/mapinput.//; -			$client_name = "dpmert.$client_name"; -			$mapoutput =~ s/mapinput/mapoutput/; -			push @mapoutputs, "$dir/splag.$im1/$mapoutput"; -			$o2i{"$dir/splag.$im1/$mapoutput"} = "$dir/splag.$im1/$shard"; -			my $script = "$MAPPER -s $srcFile -m $metric $refs < $dir/splag.$im1/$shard | sort -t \$'\\t' -k 1 > $dir/splag.$im1/$mapoutput"; -			if ($use_make) { -				my $script_file = "$dir/scripts/map.$shard"; -				open F, ">$script_file" or die "Can't write $script_file: $!"; -				print F "#!/bin/bash\n"; -				print F "$script\n"; -				close F; -				my $output = "$dir/splag.$im1/$mapoutput"; -				push @mkouts, $output; -				chmod(0755, $script_file) or die "Can't chmod $script_file: $!"; -				if ($first_shard) { print STDERR "$script\n"; $first_shard=0; } -				print $mkfile "$output: $dir/splag.$im1/$shard\n\t$script_file\n\n"; -			} else { -				my $script_file = "$dir/scripts/map.$shard"; -				open F, ">$script_file" or die "Can't write $script_file: $!"; -				print F "$script\n"; -				close F; -				if ($first_shard) { print STDERR "$script\n"; $first_shard=0; } - -				$nmappers++; -				my $qcmd = "$QSUB_CMD -N $client_name -o /dev/null -e $logdir/$client_name.ER $script_file"; -				my $jobid = check_output("$qcmd"); -				chomp $jobid; -				$jobid =~ s/^(\d+)(.*?)$/\1/g; -				$jobid =~ s/^Your job (\d+) .*$/\1/; -		 	 	push(@cleanupcmds, "qdel $jobid 2> /dev/null"); -				print STDERR " $jobid"; -				if ($joblist == "") { $joblist = $jobid; } -				else {$joblist = $joblist . "\|" . $jobid; } -			} -		} -		if ($use_make) { -			print $mkfile "$dir/splag.$im1/map.done: @mkouts\n\ttouch $dir/splag.$im1/map.done\n\n"; -			close $mkfile; -			my $mcmd = "make -j $jobs -f $mkfilename"; -			print STDERR "\nExecuting: $mcmd\n"; -			check_call($mcmd); -		} else { -			print STDERR "\nLaunched $nmappers mappers.\n"; -      			sleep 8; -			print STDERR "Waiting for mappers to complete...\n"; -			while ($nmappers > 0) { -			  sleep 5; -			  my @livejobs = grep(/$joblist/, split(/\n/, unchecked_output("qstat | grep -v ' C '"))); -			  $nmappers = scalar @livejobs; -			} -			print STDERR "All mappers complete.\n"; -		} -		my $tol = 0; -		my $til = 0; -		for my $mo (@mapoutputs) { -		  my $olines = get_lines($mo); -		  my $ilines = get_lines($o2i{$mo}); -		  $tol += $olines; -		  $til += $ilines; -		  die "$mo: output lines ($olines) doesn't match input lines ($ilines)" unless $olines==$ilines; -		} -		print STDERR "Results for $tol/$til lines\n"; -		print STDERR "\nSORTING AND RUNNING VEST REDUCER\n"; -		print STDERR unchecked_output("date"); -		$cmd="sort -t \$'\\t' -k 1 @mapoutputs | $REDUCER -m $metric > $dir/redoutput.$im1"; -		print STDERR "COMMAND:\n$cmd\n"; -		check_bash_call($cmd); -		$cmd="sort -nk3 $DIR_FLAG '-t|' $dir/redoutput.$im1 | head -1"; -		# sort returns failure even when it doesn't fail for some reason -		my $best=unchecked_output("$cmd"); chomp $best; -		print STDERR "$best\n"; -		my ($oa, $x, $xscore) = split /\|/, $best; -		$score = $xscore; -		print STDERR "PROJECTED SCORE: $score\n"; -		if (abs($x) < $epsilon) { -			print STDERR "\nOPTIMIZER: no score improvement: abs($x) < $epsilon\n"; -			last; -		} -                my $psd = $score - $last_score; -                $last_score = $score; -		if (abs($psd) < $epsilon) { -			print STDERR "\nOPTIMIZER: no score improvement: abs($psd) < $epsilon\n"; -			last; -		} -		my ($origin, $axis) = split /\s+/, $oa; - -		my %ori = convert($origin); -		my %axi = convert($axis); - -		my $finalFile="$dir/weights.$im1-$opt_iter"; -		open W, ">$finalFile" or die "Can't write: $finalFile: $!"; -                my $norm = 0; -		for my $k (sort keys %ori) { -			my $dd = $ori{$k} + $axi{$k} * $x; -                        $norm += $dd * $dd; -		} -                $norm = sqrt($norm); -		$norm = 1; -		for my $k (sort keys %ori) { -			my $v = ($ori{$k} + $axi{$k} * $x) / $norm; -			print W "$k $v\n"; -		} -		check_call("rm $dir/splag.$im1/*"); -		$inweights = $finalFile; -	} -	$lastWeightsFile = "$dir/weights.$iteration"; -	check_call("cp $inweights $lastWeightsFile"); -	if ($icc < 2) { -		print STDERR "\nREACHED STOPPING CRITERION: score change too little\n"; -		last; -	} -	$lastPScore = $score; -	$iteration++; -	print STDERR "\n==========\n"; -} - -check_call("cp $lastWeightsFile $dir/weights.final"); -print STDERR "\nFINAL WEIGHTS: $dir/weights.final\n(Use -w <this file> with the decoder)\n\n"; -print STDOUT "$dir/weights.final\n"; -exit 0; - - -sub get_lines { -  my $fn = shift @_; -  open FL, "<$fn" or die "Couldn't read $fn: $!"; -  my $lc = 0; -  while(<FL>) { $lc++; } -  return $lc; -} - -sub read_weights_file { -  my ($file) = @_; -  open F, "<$file" or die "Couldn't read $file: $!"; -  my @r = (); -  my $pm = -1; -  while(<F>) { -    next if /^#/; -    next if /^\s*$/; -    chomp; -    if (/^(.+)\s+(.+)$/) { -      my $m = $1; -      my $w = $2; -      die "Weights out of order: $m <= $pm" unless $m > $pm; -      push @r, $w; -    } else { -      warn "Unexpected feature name in weight file: $_"; -    } -  } -  close F; -  return join ' ', @r; -} - -sub update_weights_file { -  my ($neww, $rfn, $rpts) = @_; -  my @feats = @$rfn; -  my @pts = @$rpts; -  my $num_feats = scalar @feats; -  my $num_pts = scalar @pts; -  die "$num_feats (num_feats) != $num_pts (num_pts)" unless $num_feats == $num_pts; -  open G, ">$neww" or die; -  for (my $i = 0; $i < $num_feats; $i++) { -    my $f = $feats[$i]; -    my $lambda = $pts[$i]; -    print G "$f $lambda\n"; -  } -  close G; -} - -sub enseg { -	my $src = shift; -	my $newsrc = shift; -	open(SRC, $src); -	open(NEWSRC, ">$newsrc"); -	my $i=0; -	while (my $line=<SRC>){ -		chomp $line; -		if ($line =~ /^\s*<seg/i) { -		    if($line =~ /id="[0-9]+"/) { -			print NEWSRC "$line\n"; -		    } else { -			die "When using segments with pre-generated <seg> tags, you must include a zero-based id attribute"; -		    } -		} else { -			print NEWSRC "<seg id=\"$i\">$line</seg>\n"; -		} -		$i++; -	} -	close SRC; -	close NEWSRC; -} - -sub print_help { - -	my $executable = basename($0); chomp $executable; -	print << "Help"; - -Usage: $executable [options] <ini file> - -	$executable [options] -		Runs a complete MERT optimization. Required options are --weights, -		--devset, and --config. - -Options: - -	--config <file>   [-c <file>] -		The decoder configuration file. - -	--devset <file>   [-d <file>] -		The source *and* references for the development set. - -	--weights <file>  [-w <file>] -		A file specifying initial feature weights.  The format is -		FeatureName_1 value1 -		FeatureName_2 value2 -		**All and only the weights listed in <file> will be optimized!** - -	--metric <name> -		Metric to optimize. -		Example values: IBM_BLEU, NIST_BLEU, Koehn_BLEU, TER, Combi - -	--iterations <M> -		Maximum number of iterations to run.  If not specified, defaults -		to 10. - -	--pass-suffix <S> -		If the decoder is doing multi-pass decoding, the pass suffix "2", -		"3", etc., is used to control what iteration of weights is set. - -	--rand-directions <num> -		MERT will attempt to optimize along all of the principle directions, -		set this parameter to explore other directions. Defaults to 5. - -	--output-dir <dir> -		Directory for intermediate and output files. - -	--help -		Print this message and exit. - -Job control options: - -	--jobs <I> -		Number of decoder processes to run in parallel. [default=$default_jobs] - -	--qsub -		Use qsub to run jobs in parallel (qsub must be configured in -		environment/LocalEnvironment.pm) - -	--pmem <N> -		Amount of physical memory requested for parallel decoding jobs -		(used with qsub requests only) - -Help -} - -sub convert { -  my ($str) = @_; -  my @ps = split /;/, $str; -  my %dict = (); -  for my $p (@ps) { -    my ($k, $v) = split /=/, $p; -    $dict{$k} = $v; -  } -  return %dict; -} - - - -sub cmdline { -    return join ' ',($0,@ORIG_ARGV); -} - -#buggy: last arg gets quoted sometimes? -my $is_shell_special=qr{[ \t\n\\><|&;"'`~*?{}$!()]}; -my $shell_escape_in_quote=qr{[\\"\$`!]}; - -sub escape_shell { -    my ($arg)=@_; -    return undef unless defined $arg; -    if ($arg =~ /$is_shell_special/) { -        $arg =~ s/($shell_escape_in_quote)/\\$1/g; -        return "\"$arg\""; -    } -    return $arg; -} - -sub escaped_shell_args { -    return map {local $_=$_;chomp;escape_shell($_)} @_; -} - -sub escaped_shell_args_str { -    return join ' ',&escaped_shell_args(@_); -} - -sub escaped_cmdline { -    return "$0 ".&escaped_shell_args_str(@ORIG_ARGV); -} - -sub split_devset { -  my ($infile, $outsrc, $outref) = @_; -  open F, "<$infile" or die "Can't read $infile: $!"; -  open S, ">$outsrc" or die "Can't write $outsrc: $!"; -  open R, ">$outref" or die "Can't write $outref: $!"; -  while(<F>) { -    chomp; -    my ($src, @refs) = split /\s*\|\|\|\s*/; -    die "Malformed devset line: $_\n" unless scalar @refs > 0; -    print S "$src\n"; -    print R join(' ||| ', @refs) . "\n"; -  } -  close R; -  close S; -  close F; -} - diff --git a/dpmert/error_surface.cc b/dpmert/error_surface.cc deleted file mode 100644 index 515b67f8..00000000 --- a/dpmert/error_surface.cc +++ /dev/null @@ -1,42 +0,0 @@ -#include "error_surface.h" - -#include <cassert> -#include <sstream> - -using namespace std; - -ErrorSurface::~ErrorSurface() {} - -void ErrorSurface::Serialize(std::string* out) const { -  const int segments = this->size(); -  ostringstream os(ios::binary); -  os.write((const char*)&segments,sizeof(segments)); -  for (int i = 0; i < segments; ++i) { -    const ErrorSegment& cur = (*this)[i]; -    string senc; -    cur.delta.Encode(&senc); -    assert(senc.size() < 1024); -    unsigned char len = senc.size(); -    os.write((const char*)&cur.x, sizeof(cur.x)); -    os.write((const char*)&len, sizeof(len)); -    os.write((const char*)&senc[0], len); -  } -  *out = os.str(); -} - -void ErrorSurface::Deserialize(const std::string& in) { -  istringstream is(in, ios::binary); -  int segments; -  is.read((char*)&segments, sizeof(segments)); -  this->resize(segments); -  for (int i = 0; i < segments; ++i) { -    ErrorSegment& cur = (*this)[i]; -    unsigned char len; -    is.read((char*)&cur.x, sizeof(cur.x)); -    is.read((char*)&len, sizeof(len)); -    string senc(len, '\0'); assert(senc.size() == len); -    is.read((char*)&senc[0], len); -    cur.delta = SufficientStats(senc); -  } -} - diff --git a/dpmert/error_surface.h b/dpmert/error_surface.h deleted file mode 100644 index bb65847b..00000000 --- a/dpmert/error_surface.h +++ /dev/null @@ -1,24 +0,0 @@ -#ifndef _ERROR_SURFACE_H_ -#define _ERROR_SURFACE_H_ - -#include <vector> -#include <string> - -#include "ns.h" - -class Score; - -struct ErrorSegment { -  double x; -  SufficientStats delta; -  ErrorSegment() : x(0), delta() {} -}; - -class ErrorSurface : public std::vector<ErrorSegment> { - public: -  ~ErrorSurface(); -  void Serialize(std::string* out) const; -  void Deserialize(const std::string& in); -}; - -#endif diff --git a/dpmert/libcall.pl b/dpmert/libcall.pl deleted file mode 100644 index c7d0f128..00000000 --- a/dpmert/libcall.pl +++ /dev/null @@ -1,71 +0,0 @@ -use IPC::Open3; -use Symbol qw(gensym); - -$DUMMY_STDERR = gensym(); -$DUMMY_STDIN = gensym(); - -# Run the command and ignore failures -sub unchecked_call { -    system("@_") -} - -# Run the command and return its output, if any ignoring failures -sub unchecked_output { -    return `@_` -} - -# WARNING: Do not use this for commands that will return large amounts -# of stdout or stderr -- they might block indefinitely -sub check_output { -    print STDERR "Executing and gathering output: @_\n"; - -    my $pid = open3($DUMMY_STDIN, \*PH, $DUMMY_STDERR, @_); -    my $proc_output = ""; -    while( <PH> ) { -	$proc_output .= $_; -    } -    waitpid($pid, 0); -    # TODO: Grab signal that the process died from -    my $child_exit_status = $? >> 8; -    if($child_exit_status == 0) { -	return $proc_output; -    } else { -	print STDERR "ERROR: Execution of @_ failed.\n"; -	exit(1); -    } -} - -# Based on Moses' safesystem sub -sub check_call { -    print STDERR "Executing: @_\n"; -    system(@_); -    my $exitcode = $? >> 8; -    if($exitcode == 0) { -	return 0; -    } elsif ($? == -1) { -	print STDERR "ERROR: Failed to execute: @_\n  $!\n"; -	exit(1); - -    } elsif ($? & 127) { -      printf STDERR "ERROR: Execution of: @_\n  died with signal %d, %s coredump\n", -      ($? & 127),  ($? & 128) ? 'with' : 'without'; -      exit(1); - -    } else { -	print STDERR "Failed with exit code: $exitcode\n" if $exitcode; -	exit($exitcode); -    } -} - -sub check_bash_call { -    my @args = ( "bash", "-auxeo", "pipefail", "-c", "@_"); -    check_call(@args); -} - -sub check_bash_output { -    my @args = ( "bash", "-auxeo", "pipefail", "-c", "@_"); -    return check_output(@args); -} - -# perl module weirdness... -return 1; diff --git a/dpmert/line_mediator.pl b/dpmert/line_mediator.pl deleted file mode 100755 index bc2bb24c..00000000 --- a/dpmert/line_mediator.pl +++ /dev/null @@ -1,116 +0,0 @@ -#!/usr/bin/perl -w -#hooks up two processes, 2nd of which has one line of output per line of input, expected by the first, which starts off the communication - -# if you don't know how to fork/exec in a C program, this could be helpful under limited cirmustances (would be ok to liaise with sentserver) - -#WARNING: because it waits for the result from command 2 after sending every line, and especially if command 1 does the same, using sentserver as command 2 won't actually buy you any real parallelism. - -use strict; -use IPC::Open2; -use POSIX qw(pipe dup2 STDIN_FILENO STDOUT_FILENO); - -my $quiet=!$ENV{DEBUG}; -$quiet=1 if $ENV{QUIET}; -sub info { -    local $,=' '; -    print STDERR @_ unless $quiet; -} - -my $mode='CROSS'; -my $ser='DIRECT'; -$mode='PIPE' if $ENV{PIPE}; -$mode='SNAKE' if $ENV{SNAKE}; -$mode='CROSS' if $ENV{CROSS}; -$ser='SERIAL' if $ENV{SERIAL}; -$ser='DIRECT' if $ENV{DIRECT}; -$ser='SERIAL' if $mode eq 'SNAKE'; -info("mode: $mode\n"); -info("connection: $ser\n"); - - -my @c1; -if (scalar @ARGV) { -    do { -        push @c1,shift -    } while scalar @ARGV && $c1[$#c1] ne '--'; -} -pop @c1; -my @c2=@ARGV; -@ARGV=(); -(scalar @c1 && scalar @c2) || die qq{ -usage: $0 cmd1 args -- cmd2 args -all options are environment variables. -DEBUG=1 env var enables debugging output. -CROSS=1 hooks up two processes, 2nd of which has one line of output per line of input, expected by the first, which starts off the communication.  crosses stdin/stderr of cmd1 and cmd2 line by line (both must flush on newline and output.  cmd1 initiates the conversation (sends the first line).    default: attempts to cross stdin/stdout of c1 and c2 directly (via two unidirectional posix pipes created before fork). -SERIAL=1: (no parallelism possible) but lines exchanged are logged if DEBUG. -if SNAKE then stdin -> c1 -> c2 -> c1 -> stdout. -if PIPE then stdin -> c1 -> c2 -> stdout (same as shell c1|c2, but with SERIAL you can see the intermediate in real time; you could do similar with c1 | tee /dev/fd/2 |c2. -DIRECT=1 (default) will override SERIAL=1. -CROSS=1 (default) will override SNAKE or PIPE. -}; - -info("1 cmd:",@c1,"\n"); -info("2 cmd:",@c2,"\n"); - -sub lineto { -    select $_[0]; -    $|=1; -    shift; -    print @_; -} - -if ($ser eq 'SERIAL') { -    my ($R1,$W1,$R2,$W2); -    my $c1p=open2($R1,$W1,@c1); # Open2 R W backward from Open3. -    my $c2p=open2($R2,$W2,@c2); -    if ($mode eq 'CROSS') { -        while(<$R1>) { -            info("1:",$_); -            lineto($W2,$_); -            last unless defined ($_=<$R2>); -            info("1|2:",$_); -            lineto($W1,$_); -        } -    } else { -        my $snake=$mode eq 'SNAKE'; -        while(<STDIN>) { -            info("IN:",$_); -            lineto($W1,$_); -            last unless defined ($_=<$R1>); -            info("IN|1:",$_); -            lineto($W2,$_); -            last unless defined ($_=<$R2>); -            info("IN|1|2:",$_); -            if ($snake) { -                lineto($W1,$_); -                last unless defined ($_=<$R1>); -                info("IN|1|2|1:",$_); -            } -            lineto(*STDOUT,$_); -        } -    } -} else { -    info("DIRECT mode\n"); -    my @rw1=POSIX::pipe(); -    my @rw2=POSIX::pipe(); -    my $pid=undef; -    $SIG{CHLD} = sub { wait }; -    while (not defined ($pid=fork())) { -        sleep 1; -    } -    my $pipe = $mode eq 'PIPE'; -    unless ($pipe) { -        POSIX::close(STDOUT_FILENO); -        POSIX::close(STDIN_FILENO); -    } -    if ($pid) { -        POSIX::dup2($rw1[1],STDOUT_FILENO); -        POSIX::dup2($rw2[0],STDIN_FILENO) unless $pipe; -        exec @c1; -    } else { -        POSIX::dup2($rw2[1],STDOUT_FILENO) unless $pipe; -        POSIX::dup2($rw1[0],STDIN_FILENO); -        exec @c2; -    } -    while (wait()!=-1) {} -} diff --git a/dpmert/line_optimizer.cc b/dpmert/line_optimizer.cc deleted file mode 100644 index 9cf33502..00000000 --- a/dpmert/line_optimizer.cc +++ /dev/null @@ -1,114 +0,0 @@ -#include "line_optimizer.h" - -#include <limits> -#include <algorithm> - -#include "sparse_vector.h" -#include "ns.h" - -using namespace std; - -typedef ErrorSurface::const_iterator ErrorIter; - -// sort by increasing x-ints -struct IntervalComp { -  bool operator() (const ErrorIter& a, const ErrorIter& b) const { -    return a->x < b->x; -  } -}; - -double LineOptimizer::LineOptimize( -    const EvaluationMetric* metric, -    const vector<ErrorSurface>& surfaces, -    const LineOptimizer::ScoreType type, -    float* best_score, -    const double epsilon) { -  // cerr << "MIN=" << MINIMIZE_SCORE << " MAX=" << MAXIMIZE_SCORE << "  MINE=" << type << endl; -  vector<ErrorIter> all_ints; -  for (vector<ErrorSurface>::const_iterator i = surfaces.begin(); -       i != surfaces.end(); ++i) { -    const ErrorSurface& surface = *i; -    for (ErrorIter j = surface.begin(); j != surface.end(); ++j) -      all_ints.push_back(j); -  } -  sort(all_ints.begin(), all_ints.end(), IntervalComp()); -  double last_boundary = all_ints.front()->x; -  SufficientStats acc; -  float& cur_best_score = *best_score; -  cur_best_score = (type == MAXIMIZE_SCORE ? -    -numeric_limits<float>::max() : numeric_limits<float>::max()); -  bool left_edge = true; -  double pos = numeric_limits<double>::quiet_NaN(); -  for (vector<ErrorIter>::iterator i = all_ints.begin(); -       i != all_ints.end(); ++i) { -    const ErrorSegment& seg = **i; -    if (seg.x - last_boundary > epsilon) { -      float sco = metric->ComputeScore(acc); -      if ((type == MAXIMIZE_SCORE && sco > cur_best_score) || -          (type == MINIMIZE_SCORE && sco < cur_best_score) ) { -        cur_best_score = sco; -	if (left_edge) { -	  pos = seg.x - 0.1; -	  left_edge = false; -	} else { -	  pos = last_boundary + (seg.x - last_boundary) / 2; -	} -	//cerr << "NEW BEST: " << pos << "  (score=" << cur_best_score << ")\n"; -      } -      // string xx = metric->DetailedScore(acc); cerr << "---- " << xx; -#undef SHOW_ERROR_SURFACES -#ifdef SHOW_ERROR_SURFACES -      cerr << "x=" << seg.x << "\ts=" << sco << "\n"; -#endif -      last_boundary = seg.x; -    } -    // cerr << "x-boundary=" << seg.x << "\n"; -    //string x2; acc.Encode(&x2); cerr << "   ACC: " << x2 << endl; -    //string x1; seg.delta.Encode(&x1); cerr << " DELTA: " << x1 << endl; -    acc += seg.delta; -  } -  float sco = metric->ComputeScore(acc); -  if ((type == MAXIMIZE_SCORE && sco > cur_best_score) || -      (type == MINIMIZE_SCORE && sco < cur_best_score) ) { -    cur_best_score = sco; -    if (left_edge) { -      pos = 0; -    } else { -      pos = last_boundary + 1000.0; -    } -  } -  return pos; -} - -void LineOptimizer::RandomUnitVector(const vector<int>& features_to_optimize, -                                     SparseVector<double>* axis, -                                     RandomNumberGenerator<boost::mt19937>* rng) { -  axis->clear(); -  for (int i = 0; i < features_to_optimize.size(); ++i) -    axis->set_value(features_to_optimize[i], rng->NextNormal(0.0,1.0)); -  (*axis) /= axis->l2norm(); -} - -void LineOptimizer::CreateOptimizationDirections( -     const vector<int>& features_to_optimize, -     int additional_random_directions, -     RandomNumberGenerator<boost::mt19937>* rng, -     vector<SparseVector<double> >* dirs -     , bool include_orthogonal -  ) { -  dirs->clear(); -  typedef SparseVector<double> Dir; -  vector<Dir> &out=*dirs; -  int i=0; -  if (include_orthogonal) -    for (;i<features_to_optimize.size();++i) { -      Dir d; -      d.set_value(features_to_optimize[i],1.); -      out.push_back(d); -    } -  out.resize(i+additional_random_directions); -  for (;i<out.size();++i) -     RandomUnitVector(features_to_optimize, &out[i], rng); -  cerr << "Generated " << out.size() << " total axes to optimize along.\n"; -} - diff --git a/dpmert/line_optimizer.h b/dpmert/line_optimizer.h deleted file mode 100644 index 83819f41..00000000 --- a/dpmert/line_optimizer.h +++ /dev/null @@ -1,48 +0,0 @@ -#ifndef LINE_OPTIMIZER_H_ -#define LINE_OPTIMIZER_H_ - -#include <vector> - -#include "sparse_vector.h" -#include "error_surface.h" -#include "sampler.h" - -class EvaluationMetric; -class Weights; - -struct LineOptimizer { - -  // use MINIMIZE_SCORE for things like TER, WER -  // MAXIMIZE_SCORE for things like BLEU -  enum ScoreType { MAXIMIZE_SCORE, MINIMIZE_SCORE }; - -  // merge all the error surfaces together into a global -  // error surface and find (the middle of) the best segment -  static double LineOptimize( -     const EvaluationMetric* metric, -     const std::vector<ErrorSurface>& envs, -     const LineOptimizer::ScoreType type, -     float* best_score, -     const double epsilon = 1.0/65536.0); - -  // return a random vector of length 1 where all dimensions -  // not listed in dimensions will be 0. -  static void RandomUnitVector(const std::vector<int>& dimensions, -                               SparseVector<double>* axis, -                               RandomNumberGenerator<boost::mt19937>* rng); - -  // generate a list of directions to optimize; the list will -  // contain the orthogonal vectors corresponding to the dimensions in -  // primary and then additional_random_directions directions in those -  // dimensions as well.  All vectors will be length 1. -  static void CreateOptimizationDirections( -     const std::vector<int>& primary, -     int additional_random_directions, -     RandomNumberGenerator<boost::mt19937>* rng, -     std::vector<SparseVector<double> >* dirs -     , bool include_primary=true -    ); - -}; - -#endif diff --git a/dpmert/lo_test.cc b/dpmert/lo_test.cc deleted file mode 100644 index 95a08d3d..00000000 --- a/dpmert/lo_test.cc +++ /dev/null @@ -1,229 +0,0 @@ -#define BOOST_TEST_MODULE LineOptimizerTest -#include <boost/test/unit_test.hpp> -#include <boost/test/floating_point_comparison.hpp> - -#include <cmath> -#include <iostream> -#include <fstream> - -#include <boost/shared_ptr.hpp> - -#include "ns.h" -#include "ns_docscorer.h" -#include "ces.h" -#include "fdict.h" -#include "hg.h" -#include "kbest.h" -#include "hg_io.h" -#include "filelib.h" -#include "inside_outside.h" -#include "viterbi.h" -#include "mert_geometry.h" -#include "line_optimizer.h" - -using namespace std; - -const char* ref11 = "australia reopens embassy in manila"; -const char* ref12 = "( afp , manila , january 2 ) australia reopened its embassy in the philippines today , which was shut down about seven weeks ago due to what was described as a specific threat of a terrorist attack ."; -const char* ref21 = "australia reopened manila embassy"; -const char* ref22 = "( agence france-presse , manila , 2nd ) - australia reopened its embassy in the philippines today . the embassy was closed seven weeks ago after what was described as a specific threat of a terrorist attack ."; -const char* ref31 = "australia to reopen embassy in manila"; -const char* ref32 = "( afp report from manila , january 2 ) australia reopened its embassy in the philippines today . seven weeks ago , the embassy was shut down due to so - called confirmed terrorist attack threats ."; -const char* ref41 = "australia to re - open its embassy to manila"; -const char* ref42 = "( afp , manila , thursday ) australia reopens its embassy to manila , which was closed for the so - called \" clear \" threat of terrorist attack 7 weeks ago ."; - -BOOST_AUTO_TEST_CASE( TestCheckNaN) { -  double x = 0; -  double y = 0; -  double z = x / y; -  BOOST_CHECK_EQUAL(true, std::isnan(z)); -} - -BOOST_AUTO_TEST_CASE(TestConvexHull) { -  boost::shared_ptr<MERTPoint> a1(new MERTPoint(-1, 0)); -  boost::shared_ptr<MERTPoint> b1(new MERTPoint(1, 0)); -  boost::shared_ptr<MERTPoint> a2(new MERTPoint(-1, 1)); -  boost::shared_ptr<MERTPoint> b2(new MERTPoint(1, -1)); -  vector<boost::shared_ptr<MERTPoint> > sa; sa.push_back(a1); sa.push_back(b1); -  vector<boost::shared_ptr<MERTPoint> > sb; sb.push_back(a2); sb.push_back(b2); -  ConvexHull a(sa); -  cerr << a << endl; -  ConvexHull b(sb); -  ConvexHull c = a; -  c *= b; -  cerr << a << " (*) " << b << " = " << c << endl; -  BOOST_CHECK_EQUAL(3, c.size()); -} - -BOOST_AUTO_TEST_CASE(TestConvexHullInside) { -  const string json = "{\"rules\":[1,\"[X] ||| a\",2,\"[X] ||| A [1]\",3,\"[X] ||| c\",4,\"[X] ||| C [1]\",5,\"[X] ||| [1] B [2]\",6,\"[X] ||| [1] b [2]\",7,\"[X] ||| X [1]\",8,\"[X] ||| Z [1]\"],\"features\":[\"f1\",\"f2\",\"Feature_1\",\"Feature_0\",\"Model_0\",\"Model_1\",\"Model_2\",\"Model_3\",\"Model_4\",\"Model_5\",\"Model_6\",\"Model_7\"],\"edges\":[{\"tail\":[],\"feats\":[],\"rule\":1}],\"node\":{\"in_edges\":[0]},\"edges\":[{\"tail\":[0],\"feats\":[0,-0.8,1,-0.1],\"rule\":2}],\"node\":{\"in_edges\":[1]},\"edges\":[{\"tail\":[],\"feats\":[1,-1],\"rule\":3}],\"node\":{\"in_edges\":[2]},\"edges\":[{\"tail\":[2],\"feats\":[0,-0.2,1,-0.1],\"rule\":4}],\"node\":{\"in_edges\":[3]},\"edges\":[{\"tail\":[1,3],\"feats\":[0,-1.2,1,-0.2],\"rule\":5},{\"tail\":[1,3],\"feats\":[0,-0.5,1,-1.3],\"rule\":6}],\"node\":{\"in_edges\":[4,5]},\"edges\":[{\"tail\":[4],\"feats\":[0,-0.5,1,-0.8],\"rule\":7},{\"tail\":[4],\"feats\":[0,-0.7,1,-0.9],\"rule\":8}],\"node\":{\"in_edges\":[6,7]}}"; -  Hypergraph hg; -  istringstream instr(json); -  HypergraphIO::ReadFromJSON(&instr, &hg); -  SparseVector<double> wts; -  wts.set_value(FD::Convert("f1"), 0.4); -  wts.set_value(FD::Convert("f2"), 1.0); -  hg.Reweight(wts); -  vector<pair<vector<WordID>, prob_t> > list; -  std::vector<SparseVector<double> > features; -  KBest::KBestDerivations<vector<WordID>, ESentenceTraversal> kbest(hg, 10); -  for (int i = 0; i < 10; ++i) { -    const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal>::Derivation* d = -      kbest.LazyKthBest(hg.nodes_.size() - 1, i); -    if (!d) break; -    cerr << log(d->score) << " ||| " << TD::GetString(d->yield) << " ||| " << d->feature_values << endl; -  } -  SparseVector<double> dir; dir.set_value(FD::Convert("f1"), 1.0); -  ConvexHullWeightFunction wf(wts, dir); -  ConvexHull env = Inside<ConvexHull, ConvexHullWeightFunction>(hg, NULL, wf); -  cerr << env << endl; -  const vector<boost::shared_ptr<MERTPoint> >& segs = env.GetSortedSegs(); -  dir *= segs[1]->x; -  wts += dir; -  hg.Reweight(wts); -  KBest::KBestDerivations<vector<WordID>, ESentenceTraversal> kbest2(hg, 10); -  for (int i = 0; i < 10; ++i) { -    const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal>::Derivation* d = -      kbest2.LazyKthBest(hg.nodes_.size() - 1, i); -    if (!d) break; -    cerr << log(d->score) << " ||| " << TD::GetString(d->yield) << " ||| " << d->feature_values << endl; -  } -  for (unsigned i = 0; i < segs.size(); ++i) { -    cerr << "seg=" << i << endl; -    vector<WordID> trans; -    segs[i]->ConstructTranslation(&trans); -    cerr << TD::GetString(trans) << endl; -  } -} - -BOOST_AUTO_TEST_CASE( TestS1) { -  int fPhraseModel_0 = FD::Convert("PhraseModel_0"); -  int fPhraseModel_1 = FD::Convert("PhraseModel_1"); -  int fPhraseModel_2 = FD::Convert("PhraseModel_2"); -  int fLanguageModel = FD::Convert("LanguageModel"); -  int fWordPenalty = FD::Convert("WordPenalty"); -  int fPassThrough = FD::Convert("PassThrough"); -  SparseVector<double> wts; -  wts.set_value(fWordPenalty, 4.25); -  wts.set_value(fLanguageModel, -1.1165); -  wts.set_value(fPhraseModel_0, -0.96); -  wts.set_value(fPhraseModel_1, -0.65); -  wts.set_value(fPhraseModel_2, -0.77); -  wts.set_value(fPassThrough, -10.0); - -  vector<int> to_optimize; -  to_optimize.push_back(fWordPenalty); -  to_optimize.push_back(fLanguageModel); -  to_optimize.push_back(fPhraseModel_0); -  to_optimize.push_back(fPhraseModel_1); -  to_optimize.push_back(fPhraseModel_2); - -  std::string path(boost::unit_test::framework::master_test_suite().argc == 2 ? boost::unit_test::framework::master_test_suite().argv[1] : "test_data"); - -  Hypergraph hg; -  ReadFile rf(path + "/0.json.gz"); -  HypergraphIO::ReadFromJSON(rf.stream(), &hg); -  hg.Reweight(wts); - -  Hypergraph hg2; -  ReadFile rf2(path + "/1.json.gz"); -  HypergraphIO::ReadFromJSON(rf2.stream(), &hg2); -  hg2.Reweight(wts); - -  vector<vector<WordID> > refs1(4); -  TD::ConvertSentence(ref11, &refs1[0]); -  TD::ConvertSentence(ref21, &refs1[1]); -  TD::ConvertSentence(ref31, &refs1[2]); -  TD::ConvertSentence(ref41, &refs1[3]); -  vector<vector<WordID> > refs2(4); -  TD::ConvertSentence(ref12, &refs2[0]); -  TD::ConvertSentence(ref22, &refs2[1]); -  TD::ConvertSentence(ref32, &refs2[2]); -  TD::ConvertSentence(ref42, &refs2[3]); -  vector<ConvexHull> envs(2); - -  RandomNumberGenerator<boost::mt19937> rng; - -  vector<SparseVector<double> > axes; // directions to search -  LineOptimizer::CreateOptimizationDirections( -     to_optimize, -     10, -     &rng, -     &axes); -  assert(axes.size() == 10 + to_optimize.size()); -  for (unsigned i = 0; i < axes.size(); ++i) -    cerr << axes[i] << endl; -  const SparseVector<double>& axis = axes[0]; - -  cerr << "Computing Viterbi envelope using inside algorithm...\n"; -  cerr << "axis: " << axis << endl; -  clock_t t_start=clock(); -  ConvexHullWeightFunction wf(wts, axis);  // wts = starting point, axis = search direction -  envs[0] = Inside<ConvexHull, ConvexHullWeightFunction>(hg, NULL, wf); -  envs[1] = Inside<ConvexHull, ConvexHullWeightFunction>(hg2, NULL, wf); - -  vector<ErrorSurface> es(2); -  EvaluationMetric* metric = EvaluationMetric::Instance("IBM_BLEU"); -  boost::shared_ptr<SegmentEvaluator> scorer1 = metric->CreateSegmentEvaluator(refs1); -  boost::shared_ptr<SegmentEvaluator> scorer2 = metric->CreateSegmentEvaluator(refs2); -  ComputeErrorSurface(*scorer1, envs[0], &es[0], metric, hg); -  ComputeErrorSurface(*scorer2, envs[1], &es[1], metric, hg2); -  cerr << envs[0].size() << " " << envs[1].size() << endl; -  cerr << es[0].size() << " " << es[1].size() << endl; -  envs.clear(); -  clock_t t_env=clock(); -  float score; -  double m = LineOptimizer::LineOptimize(metric,es, LineOptimizer::MAXIMIZE_SCORE, &score); -  clock_t t_opt=clock(); -  cerr << "line optimizer returned: " << m << " (SCORE=" << score << ")\n"; -  BOOST_CHECK_CLOSE(0.48719698, score, 1e-5); -  SparseVector<double> res = axis; -  res *= m; -  res += wts; -  cerr << "res: " << res << endl; -  cerr << "ENVELOPE PROCESSING=" << (static_cast<double>(t_env - t_start) / 1000.0) << endl; -  cerr << "  LINE OPTIMIZATION=" << (static_cast<double>(t_opt - t_env) / 1000.0) << endl; -  hg.Reweight(res); -  hg2.Reweight(res); -  vector<WordID> t1,t2; -  ViterbiESentence(hg, &t1); -  ViterbiESentence(hg2, &t2); -  cerr << TD::GetString(t1) << endl; -  cerr << TD::GetString(t2) << endl; -} - -BOOST_AUTO_TEST_CASE(TestZeroOrigin) { -  const string json = "{\"rules\":[1,\"[X7] ||| blA ||| without ||| LHSProb=3.92173 LexE2F=2.90799 LexF2E=1.85003 GenerativeProb=10.5381 RulePenalty=1 XFE=2.77259 XEF=0.441833 LabelledEF=2.63906 LabelledFE=4.96981 LogRuleCount=0.693147\",2,\"[X7] ||| blA ||| except ||| LHSProb=4.92173 LexE2F=3.90799 LexF2E=1.85003 GenerativeProb=11.5381 RulePenalty=1 XFE=2.77259 XEF=1.44183 LabelledEF=2.63906 LabelledFE=4.96981 LogRuleCount=1.69315\",3,\"[S] ||| [X7,1] ||| [1] ||| GlueTop=1\",4,\"[X28] ||| EnwAn ||| title ||| LHSProb=3.96802 LexE2F=2.22462 LexF2E=1.83258 GenerativeProb=10.0863 RulePenalty=1 XFE=0 XEF=1.20397 LabelledEF=1.20397 LabelledFE=-1.98341e-08 LogRuleCount=1.09861\",5,\"[X0] ||| EnwAn ||| funny ||| LHSProb=3.98479 LexE2F=1.79176 LexF2E=3.21888 GenerativeProb=11.1681 RulePenalty=1 XFE=0 XEF=2.30259 LabelledEF=2.30259 LabelledFE=0 LogRuleCount=0 SingletonRule=1\",6,\"[X8] ||| [X7,1] EnwAn ||| entitled [1] ||| LHSProb=3.82533 LexE2F=3.21888 LexF2E=2.52573 GenerativeProb=11.3276 RulePenalty=1 XFE=1.20397 XEF=1.20397 LabelledEF=2.30259 LabelledFE=2.30259 LogRuleCount=0 SingletonRule=1\",7,\"[S] ||| [S,1] [X28,2] ||| [1] [2] ||| Glue=1\",8,\"[S] ||| [S,1] [X0,2] ||| [1] [2] ||| Glue=1\",9,\"[S] ||| [X8,1] ||| [1] ||| GlueTop=1\",10,\"[Goal] ||| [S,1] ||| [1]\"],\"features\":[\"PassThrough\",\"Glue\",\"GlueTop\",\"LanguageModel\",\"WordPenalty\",\"LHSProb\",\"LexE2F\",\"LexF2E\",\"GenerativeProb\",\"RulePenalty\",\"XFE\",\"XEF\",\"LabelledEF\",\"LabelledFE\",\"LogRuleCount\",\"SingletonRule\"],\"edges\":[{\"tail\":[],\"spans\":[0,1,-1,-1],\"feats\":[5,3.92173,6,2.90799,7,1.85003,8,10.5381,9,1,10,2.77259,11,0.441833,12,2.63906,13,4.96981,14,0.693147],\"rule\":1},{\"tail\":[],\"spans\":[0,1,-1,-1],\"feats\":[5,4.92173,6,3.90799,7,1.85003,8,11.5381,9,1,10,2.77259,11,1.44183,12,2.63906,13,4.96981,14,1.69315],\"rule\":2}],\"node\":{\"in_edges\":[0,1],\"cat\":\"X7\"},\"edges\":[{\"tail\":[0],\"spans\":[0,1,-1,-1],\"feats\":[2,1],\"rule\":3}],\"node\":{\"in_edges\":[2],\"cat\":\"S\"},\"edges\":[{\"tail\":[],\"spans\":[1,2,-1,-1],\"feats\":[5,3.96802,6,2.22462,7,1.83258,8,10.0863,9,1,11,1.20397,12,1.20397,13,-1.98341e-08,14,1.09861],\"rule\":4}],\"node\":{\"in_edges\":[3],\"cat\":\"X28\"},\"edges\":[{\"tail\":[],\"spans\":[1,2,-1,-1],\"feats\":[5,3.98479,6,1.79176,7,3.21888,8,11.1681,9,1,11,2.30259,12,2.30259,15,1],\"rule\":5}],\"node\":{\"in_edges\":[4],\"cat\":\"X0\"},\"edges\":[{\"tail\":[0],\"spans\":[0,2,-1,-1],\"feats\":[5,3.82533,6,3.21888,7,2.52573,8,11.3276,9,1,10,1.20397,11,1.20397,12,2.30259,13,2.30259,15,1],\"rule\":6}],\"node\":{\"in_edges\":[5],\"cat\":\"X8\"},\"edges\":[{\"tail\":[1,2],\"spans\":[0,2,-1,-1],\"feats\":[1,1],\"rule\":7},{\"tail\":[1,3],\"spans\":[0,2,-1,-1],\"feats\":[1,1],\"rule\":8},{\"tail\":[4],\"spans\":[0,2,-1,-1],\"feats\":[2,1],\"rule\":9}],\"node\":{\"in_edges\":[6,7,8],\"cat\":\"S\"},\"edges\":[{\"tail\":[5],\"spans\":[0,2,-1,-1],\"feats\":[],\"rule\":10}],\"node\":{\"in_edges\":[9],\"cat\":\"Goal\"}}"; -  Hypergraph hg; -  istringstream instr(json); -  HypergraphIO::ReadFromJSON(&instr, &hg); -  SparseVector<double> wts; -  wts.set_value(FD::Convert("PassThrough"), -0.929201533002898); -  hg.Reweight(wts); - -  vector<pair<vector<WordID>, prob_t> > list; -  std::vector<SparseVector<double> > features; -  KBest::KBestDerivations<vector<WordID>, ESentenceTraversal> kbest(hg, 10); -  for (int i = 0; i < 10; ++i) { -    const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal>::Derivation* d = -      kbest.LazyKthBest(hg.nodes_.size() - 1, i); -    if (!d) break; -    cerr << log(d->score) << " ||| " << TD::GetString(d->yield) << " ||| " << d->feature_values << endl; -  } -  -  SparseVector<double> axis; axis.set_value(FD::Convert("Glue"),1.0); -  ConvexHullWeightFunction wf(wts, axis);  // wts = starting point, axis = search direction -  vector<ConvexHull> envs(1); -  envs[0] = Inside<ConvexHull, ConvexHullWeightFunction>(hg, NULL, wf); - -  vector<vector<WordID> > mr(4); -  TD::ConvertSentence("untitled", &mr[0]); -  TD::ConvertSentence("with no title", &mr[1]); -  TD::ConvertSentence("without a title", &mr[2]); -  TD::ConvertSentence("without title", &mr[3]); -  EvaluationMetric* metric = EvaluationMetric::Instance("IBM_BLEU"); -  boost::shared_ptr<SegmentEvaluator> scorer1 = metric->CreateSegmentEvaluator(mr); -  vector<ErrorSurface> es(1); -  ComputeErrorSurface(*scorer1, envs[0], &es[0], metric, hg); -} - diff --git a/dpmert/mert_geometry.cc b/dpmert/mert_geometry.cc deleted file mode 100644 index d6973658..00000000 --- a/dpmert/mert_geometry.cc +++ /dev/null @@ -1,185 +0,0 @@ -#include "mert_geometry.h" - -#include <cassert> -#include <limits> - -using namespace std; - -ConvexHull::ConvexHull(int i) { -  if (i == 0) { -    // do nothing - <> -  } else if (i == 1) { -    points.push_back(boost::shared_ptr<MERTPoint>(new MERTPoint(0, 0, 0, boost::shared_ptr<MERTPoint>(), boost::shared_ptr<MERTPoint>()))); -    assert(this->IsMultiplicativeIdentity()); -  } else { -    cerr << "Only can create ConvexHull semiring 0 and 1 with this constructor!\n"; -    abort(); -  } -} - -const ConvexHull ConvexHullWeightFunction::operator()(const Hypergraph::Edge& e) const { -  const double m = direction.dot(e.feature_values_); -  const double b = origin.dot(e.feature_values_); -  MERTPoint* point = new MERTPoint(m, b, e); -  return ConvexHull(1, point); -} - -ostream& operator<<(ostream& os, const ConvexHull& env) { -  os << '<'; -  const vector<boost::shared_ptr<MERTPoint> >& points = env.GetSortedSegs(); -  for (int i = 0; i < points.size(); ++i) -    os << (i==0 ? "" : "|") << "x=" << points[i]->x << ",b=" << points[i]->b << ",m=" << points[i]->m << ",p1=" << points[i]->p1 << ",p2=" << points[i]->p2; -  return os << '>'; -} - -#define ORIGINAL_MERT_IMPLEMENTATION 1 -#ifdef ORIGINAL_MERT_IMPLEMENTATION - -struct SlopeCompare { -  bool operator() (const boost::shared_ptr<MERTPoint>& a, const boost::shared_ptr<MERTPoint>& b) const { -    return a->m < b->m; -  } -}; - -const ConvexHull& ConvexHull::operator+=(const ConvexHull& other) { -  if (!other.is_sorted) other.Sort(); -  if (points.empty()) { -    points = other.points; -    return *this; -  } -  is_sorted = false; -  int j = points.size(); -  points.resize(points.size() + other.points.size()); -  for (int i = 0; i < other.points.size(); ++i) -    points[j++] = other.points[i]; -  assert(j == points.size()); -  return *this; -} - -void ConvexHull::Sort() const { -  sort(points.begin(), points.end(), SlopeCompare()); -  const int k = points.size(); -  int j = 0; -  for (int i = 0; i < k; ++i) { -    MERTPoint l = *points[i]; -    l.x = kMinusInfinity; -    // cerr << "m=" << l.m << endl; -    if (0 < j) { -      if (points[j-1]->m == l.m) {   // lines are parallel -        if (l.b <= points[j-1]->b) continue; -        --j; -      } -      while(0 < j) { -        l.x = (l.b - points[j-1]->b) / (points[j-1]->m - l.m); -        if (points[j-1]->x < l.x) break; -        --j; -      } -      if (0 == j) l.x = kMinusInfinity; -    } -    *points[j++] = l; -  } -  points.resize(j); -  is_sorted = true; -} - -const ConvexHull& ConvexHull::operator*=(const ConvexHull& other) { -  if (other.IsMultiplicativeIdentity()) { return *this; } -  if (this->IsMultiplicativeIdentity()) { (*this) = other; return *this; } - -  if (!is_sorted) Sort(); -  if (!other.is_sorted) other.Sort(); - -  if (this->IsEdgeEnvelope()) { -//    if (other.size() > 1) -//      cerr << *this << " (TIMES) " << other << endl; -    boost::shared_ptr<MERTPoint> edge_parent = points[0]; -    const double& edge_b = edge_parent->b; -    const double& edge_m = edge_parent->m; -    points.clear(); -    for (int i = 0; i < other.points.size(); ++i) { -      const MERTPoint& p = *other.points[i]; -      const double m = p.m + edge_m; -      const double b = p.b + edge_b; -      const double& x = p.x;       // x's don't change with * -      points.push_back(boost::shared_ptr<MERTPoint>(new MERTPoint(x, m, b, edge_parent, other.points[i]))); -      assert(points.back()->p1->edge); -    } -//    if (other.size() > 1) -//      cerr << " = " << *this << endl; -  } else { -    vector<boost::shared_ptr<MERTPoint> > new_points; -    int this_i = 0; -    int other_i = 0; -    const int this_size  = points.size(); -    const int other_size = other.points.size(); -    double cur_x = kMinusInfinity;   // moves from left to right across the -                                     // real numbers, stopping for all inter- -                                     // sections -    double this_next_val  = (1 < this_size  ? points[1]->x       : kPlusInfinity); -    double other_next_val = (1 < other_size ? other.points[1]->x : kPlusInfinity); -    while (this_i < this_size && other_i < other_size) { -      const MERTPoint& this_point = *points[this_i]; -      const MERTPoint& other_point= *other.points[other_i]; -      const double m = this_point.m + other_point.m; -      const double b = this_point.b + other_point.b; -  -      new_points.push_back(boost::shared_ptr<MERTPoint>(new MERTPoint(cur_x, m, b, points[this_i], other.points[other_i]))); -      int comp = 0; -      if (this_next_val < other_next_val) comp = -1; else -        if (this_next_val > other_next_val) comp = 1; -      if (0 == comp) {  // the next values are equal, advance both indices -        ++this_i; -	++other_i; -        cur_x = this_next_val;  // could be other_next_val (they're equal!) -        this_next_val  = (this_i+1  < this_size  ? points[this_i+1]->x        : kPlusInfinity); -        other_next_val = (other_i+1 < other_size ? other.points[other_i+1]->x : kPlusInfinity); -      } else {  // advance the i with the lower x, update cur_x -        if (-1 == comp) { -          ++this_i; -          cur_x = this_next_val; -          this_next_val =  (this_i+1  < this_size  ? points[this_i+1]->x        : kPlusInfinity); -        } else { -          ++other_i; -          cur_x = other_next_val; -          other_next_val = (other_i+1 < other_size ? other.points[other_i+1]->x : kPlusInfinity); -        } -      } -    } -    points.swap(new_points); -  } -  //cerr << "Multiply: result=" << (*this) << endl; -  return *this; -} - -// recursively construct translation -void MERTPoint::ConstructTranslation(vector<WordID>* trans) const { -  const MERTPoint* cur = this; -  vector<vector<WordID> > ant_trans; -  while(!cur->edge) { -    ant_trans.resize(ant_trans.size() + 1); -    cur->p2->ConstructTranslation(&ant_trans.back()); -    cur = cur->p1.get(); -  } -  size_t ant_size = ant_trans.size(); -  vector<const vector<WordID>*> pants(ant_size); -  assert(ant_size == cur->edge->tail_nodes_.size()); -  --ant_size; -  for (int i = 0; i < pants.size(); ++i) pants[ant_size - i] = &ant_trans[i]; -  cur->edge->rule_->ESubstitute(pants, trans); -} - -void MERTPoint::CollectEdgesUsed(std::vector<bool>* edges_used) const { -  if (edge) { -    assert(edge->id_ < edges_used->size()); -    (*edges_used)[edge->id_] = true; -  } -  if (p1) p1->CollectEdgesUsed(edges_used); -  if (p2) p2->CollectEdgesUsed(edges_used); -} - -#else - -// THIS IS THE NEW FASTER IMPLEMENTATION OF THE MERT SEMIRING OPERATIONS - -#endif - diff --git a/dpmert/mert_geometry.h b/dpmert/mert_geometry.h deleted file mode 100644 index a8b6959e..00000000 --- a/dpmert/mert_geometry.h +++ /dev/null @@ -1,81 +0,0 @@ -#ifndef _MERT_GEOMETRY_H_ -#define _MERT_GEOMETRY_H_ - -#include <vector> -#include <iostream> -#include <boost/shared_ptr.hpp> - -#include "hg.h" -#include "sparse_vector.h" - -static const double kMinusInfinity = -std::numeric_limits<double>::infinity(); -static const double kPlusInfinity = std::numeric_limits<double>::infinity(); - -struct MERTPoint { -  MERTPoint() : x(), m(), b(), edge() {} -  MERTPoint(double _m, double _b) : -    x(kMinusInfinity), m(_m), b(_b), edge() {} -  MERTPoint(double _x, double _m, double _b, const boost::shared_ptr<MERTPoint>& p1_, const boost::shared_ptr<MERTPoint>& p2_) : -    x(_x), m(_m), b(_b), p1(p1_), p2(p2_), edge() {} -  MERTPoint(double _m, double _b, const Hypergraph::Edge& edge) : -    x(kMinusInfinity), m(_m), b(_b), edge(&edge) {} - -  double x;                   // x intersection with previous segment in env, or -inf if none -  double m;                   // this line's slope -  double b;                   // intercept with y-axis - -  // we keep a pointer to the "parents" of this segment so we can reconstruct -  // the Viterbi translation corresponding to this segment -  boost::shared_ptr<MERTPoint> p1; -  boost::shared_ptr<MERTPoint> p2; - -  // only MERTPoints created from an edge using the ConvexHullWeightFunction -  // have rules -  // TRulePtr rule; -  const Hypergraph::Edge* edge; - -  // recursively recover the Viterbi translation that will result from setting -  // the weights to origin + axis * x, where x is any value from this->x up -  // until the next largest x in the containing ConvexHull -  void ConstructTranslation(std::vector<WordID>* trans) const; -  void CollectEdgesUsed(std::vector<bool>* edges_used) const; -}; - -// this is the semiring value type, -// it defines constructors for 0, 1, and the operations + and * -struct ConvexHull { -  // create semiring zero -  ConvexHull() : is_sorted(true) {}  // zero -  // for debugging: -  ConvexHull(const std::vector<boost::shared_ptr<MERTPoint> >& s) : points(s) { Sort(); } -  // create semiring 1 or 0 -  explicit ConvexHull(int i); -  ConvexHull(int n, MERTPoint* point) : is_sorted(true), points(n, boost::shared_ptr<MERTPoint>(point)) {} -  const ConvexHull& operator+=(const ConvexHull& other); -  const ConvexHull& operator*=(const ConvexHull& other); -  bool IsMultiplicativeIdentity() const { -    return size() == 1 && (points[0]->b == 0.0 && points[0]->m == 0.0) && (!points[0]->edge) && (!points[0]->p1) && (!points[0]->p2); } -  const std::vector<boost::shared_ptr<MERTPoint> >& GetSortedSegs() const { -    if (!is_sorted) Sort(); -    return points; -  } -  size_t size() const { return points.size(); } - - private: -  bool IsEdgeEnvelope() const { -    return points.size() == 1 && points[0]->edge; } -  void Sort() const; -  mutable bool is_sorted; -  mutable std::vector<boost::shared_ptr<MERTPoint> > points; -}; -std::ostream& operator<<(std::ostream& os, const ConvexHull& env); - -struct ConvexHullWeightFunction { -  ConvexHullWeightFunction(const SparseVector<double>& ori, -                           const SparseVector<double>& dir) : origin(ori), direction(dir) {} -  const ConvexHull operator()(const Hypergraph::Edge& e) const; -  const SparseVector<double> origin; -  const SparseVector<double> direction; -}; - -#endif diff --git a/dpmert/mr_dpmert_generate_mapper_input.cc b/dpmert/mr_dpmert_generate_mapper_input.cc deleted file mode 100644 index 199cd23a..00000000 --- a/dpmert/mr_dpmert_generate_mapper_input.cc +++ /dev/null @@ -1,81 +0,0 @@ -#include <iostream> -#include <vector> - -#include <boost/program_options.hpp> -#include <boost/program_options/variables_map.hpp> - -#include "filelib.h" -#include "weights.h" -#include "line_optimizer.h" - -using namespace std; -namespace po = boost::program_options; - -void InitCommandLine(int argc, char** argv, po::variables_map* conf) { -  po::options_description opts("Configuration options"); -  opts.add_options() -        ("dev_set_size,s",po::value<unsigned>(),"[REQD] Development set size (# of parallel sentences)") -        ("forest_repository,r",po::value<string>(),"[REQD] Path to forest repository") -        ("weights,w",po::value<string>(),"[REQD] Current feature weights file") -        ("optimize_feature,o",po::value<vector<string> >(), "Feature to optimize (if none specified, all weights listed in the weights file will be optimized)") -        ("random_directions,d",po::value<unsigned int>()->default_value(20),"Number of random directions to run the line optimizer in") -        ("help,h", "Help"); -  po::options_description dcmdline_options; -  dcmdline_options.add(opts); -  po::store(parse_command_line(argc, argv, dcmdline_options), *conf); -  bool flag = false; -  if (conf->count("dev_set_size") == 0) { -    cerr << "Please specify the size of the development set using -d N\n"; -    flag = true; -  } -  if (conf->count("weights") == 0) { -    cerr << "Please specify the starting-point weights using -w <weightfile.txt>\n"; -    flag = true; -  } -  if (conf->count("forest_repository") == 0) { -    cerr << "Please specify the forest repository location using -r <DIR>\n"; -    flag = true; -  } -  if (flag || conf->count("help")) { -    cerr << dcmdline_options << endl; -    exit(1); -  } -} - -int main(int argc, char** argv) { -  RandomNumberGenerator<boost::mt19937> rng; -  po::variables_map conf; -  InitCommandLine(argc, argv, &conf); -  vector<string> features; -  SparseVector<weight_t> origin; -  vector<weight_t> w; -  Weights::InitFromFile(conf["weights"].as<string>(), &w, &features); -  Weights::InitSparseVector(w, &origin); -  const string forest_repository = conf["forest_repository"].as<string>(); -  if (!DirectoryExists(forest_repository)) { -    cerr << "Forest repository directory " << forest_repository << " not found!\n"; -    return 1; -  } -  if (conf.count("optimize_feature") > 0) -    features=conf["optimize_feature"].as<vector<string> >(); -  vector<SparseVector<weight_t> > directions; -  vector<int> fids(features.size()); -  for (unsigned i = 0; i < features.size(); ++i) -    fids[i] = FD::Convert(features[i]); -  LineOptimizer::CreateOptimizationDirections( -     fids, -     conf["random_directions"].as<unsigned int>(), -     &rng, -     &directions); -  unsigned dev_set_size = conf["dev_set_size"].as<unsigned>(); -  for (unsigned i = 0; i < dev_set_size; ++i) { -    for (unsigned j = 0; j < directions.size(); ++j) { -      cout << forest_repository << '/' << i << ".json.gz " << i << ' '; -      print(cout, origin, "=", ";"); -      cout << ' '; -      print(cout, directions[j], "=", ";"); -      cout << endl; -    } -  } -  return 0; -} diff --git a/dpmert/mr_dpmert_map.cc b/dpmert/mr_dpmert_map.cc deleted file mode 100644 index d1efcf96..00000000 --- a/dpmert/mr_dpmert_map.cc +++ /dev/null @@ -1,112 +0,0 @@ -#include <sstream> -#include <iostream> -#include <fstream> -#include <vector> - -#include <boost/program_options.hpp> -#include <boost/program_options/variables_map.hpp> - -#include "ns.h" -#include "ns_docscorer.h" -#include "ces.h" -#include "filelib.h" -#include "stringlib.h" -#include "sparse_vector.h" -#include "mert_geometry.h" -#include "inside_outside.h" -#include "error_surface.h" -#include "b64tools.h" -#include "hg_io.h" - -using namespace std; -namespace po = boost::program_options; - -void InitCommandLine(int argc, char** argv, po::variables_map* conf) { -  po::options_description opts("Configuration options"); -  opts.add_options() -        ("reference,r",po::value<vector<string> >(), "[REQD] Reference translation (tokenized text)") -        ("source,s",po::value<string>(), "Source file (ignored, except for AER)") -        ("evaluation_metric,m",po::value<string>()->default_value("ibm_bleu"), "Evaluation metric being optimized") -        ("input,i",po::value<string>()->default_value("-"), "Input file to map (- is STDIN)") -        ("help,h", "Help"); -  po::options_description dcmdline_options; -  dcmdline_options.add(opts); -  po::store(parse_command_line(argc, argv, dcmdline_options), *conf); -  bool flag = false; -  if (!conf->count("reference")) { -    cerr << "Please specify one or more references using -r <REF.TXT>\n"; -    flag = true; -  } -  if (flag || conf->count("help")) { -    cerr << dcmdline_options << endl; -    exit(1); -  } -} - -bool ReadSparseVectorString(const string& s, SparseVector<double>* v) { -#if 0 -  // this should work, but untested. -  std::istringstream i(s); -  i>>*v; -#else -  vector<string> fields; -  Tokenize(s, ';', &fields); -  if (fields.empty()) return false; -  for (unsigned i = 0; i < fields.size(); ++i) { -    vector<string> pair(2); -    Tokenize(fields[i], '=', &pair); -    if (pair.size() != 2) { -      cerr << "Error parsing vector string: " << fields[i] << endl; -      return false; -    } -    v->set_value(FD::Convert(pair[0]), atof(pair[1].c_str())); -  } -  return true; -#endif -} - -int main(int argc, char** argv) { -  po::variables_map conf; -  InitCommandLine(argc, argv, &conf); -  const string evaluation_metric = conf["evaluation_metric"].as<string>(); -  EvaluationMetric* metric = EvaluationMetric::Instance(evaluation_metric); -  DocumentScorer ds(metric, conf["reference"].as<vector<string> >()); -  cerr << "Loaded " << ds.size() << " references for scoring with " << evaluation_metric << endl; -  Hypergraph hg; -  string last_file; -  ReadFile in_read(conf["input"].as<string>()); -  istream &in=*in_read.stream(); -  while(in) { -    string line; -    getline(in, line); -    if (line.empty()) continue; -    istringstream is(line); -    int sent_id; -    string file, s_origin, s_direction; -    // path-to-file (JSON) sent_ed starting-point search-direction -    is >> file >> sent_id >> s_origin >> s_direction; -    SparseVector<double> origin; -    ReadSparseVectorString(s_origin, &origin); -    SparseVector<double> direction; -    ReadSparseVectorString(s_direction, &direction); -    // cerr << "File: " << file << "\nDir: " << direction << "\n   X: " << origin << endl; -    if (last_file != file) { -      last_file = file; -      ReadFile rf(file); -      HypergraphIO::ReadFromJSON(rf.stream(), &hg); -    } -    const ConvexHullWeightFunction wf(origin, direction); -    const ConvexHull hull = Inside<ConvexHull, ConvexHullWeightFunction>(hg, NULL, wf); - -    ErrorSurface es; -    ComputeErrorSurface(*ds[sent_id], hull, &es, metric, hg); -    //cerr << "Viterbi envelope has " << ve.size() << " segments\n"; -    // cerr << "Error surface has " << es.size() << " segments\n"; -    string val; -    es.Serialize(&val); -    cout << 'M' << ' ' << s_origin << ' ' << s_direction << '\t'; -    B64::b64encode(val.c_str(), val.size(), &cout); -    cout << endl << flush; -  } -  return 0; -} diff --git a/dpmert/mr_dpmert_reduce.cc b/dpmert/mr_dpmert_reduce.cc deleted file mode 100644 index 31512a03..00000000 --- a/dpmert/mr_dpmert_reduce.cc +++ /dev/null @@ -1,77 +0,0 @@ -#include <sstream> -#include <iostream> -#include <fstream> -#include <vector> - -#include <boost/program_options.hpp> -#include <boost/program_options/variables_map.hpp> - -#include "sparse_vector.h" -#include "error_surface.h" -#include "line_optimizer.h" -#include "b64tools.h" -#include "stringlib.h" - -using namespace std; -namespace po = boost::program_options; - -void InitCommandLine(int argc, char** argv, po::variables_map* conf) { -  po::options_description opts("Configuration options"); -  opts.add_options() -        ("evaluation_metric,m",po::value<string>(), "Evaluation metric (IBM_BLEU, etc.)") -        ("help,h", "Help"); -  po::options_description dcmdline_options; -  dcmdline_options.add(opts); -  po::store(parse_command_line(argc, argv, dcmdline_options), *conf); -  bool flag = conf->count("evaluation_metric") == 0; -  if (flag || conf->count("help")) { -    cerr << dcmdline_options << endl; -    exit(1); -  } -} - -int main(int argc, char** argv) { -  po::variables_map conf; -  InitCommandLine(argc, argv, &conf); -  const string evaluation_metric = conf["evaluation_metric"].as<string>(); -  EvaluationMetric* metric = EvaluationMetric::Instance(evaluation_metric); -  LineOptimizer::ScoreType opt_type = LineOptimizer::MAXIMIZE_SCORE; -  if (metric->IsErrorMetric()) -    opt_type = LineOptimizer::MINIMIZE_SCORE; - -  vector<ErrorSurface> esv; -  string last_key, line, key, val; -  while(getline(cin, line)) { -    size_t ks = line.find("\t"); -    assert(string::npos != ks); -    assert(ks > 2); -    key = line.substr(2, ks - 2); -    val = line.substr(ks + 1); -    if (key != last_key) { -      if (!last_key.empty()) { -	float score; -        double x = LineOptimizer::LineOptimize(metric, esv, opt_type, &score); -	cout << last_key << "|" << x << "|" << score << endl; -      } -      last_key.swap(key); -      esv.clear(); -    } -    if (val.size() % 4 != 0) { -      cerr << "B64 encoding error 1! Skipping.\n"; -      continue; -    } -    string encoded(val.size() / 4 * 3, '\0'); -    if (!B64::b64decode(reinterpret_cast<const unsigned char*>(&val[0]), val.size(), &encoded[0], encoded.size())) { -      cerr << "B64 encoding error 2! Skipping.\n"; -      continue; -    } -    esv.push_back(ErrorSurface()); -    esv.back().Deserialize(encoded); -  } -  if (!esv.empty()) { -    float score; -    double x = LineOptimizer::LineOptimize(metric, esv, opt_type, &score); -    cout << last_key << "|" << x << "|" << score << endl; -  } -  return 0; -} diff --git a/dpmert/parallelize.pl b/dpmert/parallelize.pl deleted file mode 100755 index d2ebaeea..00000000 --- a/dpmert/parallelize.pl +++ /dev/null @@ -1,423 +0,0 @@ -#!/usr/bin/env perl - -# Author: Adam Lopez -# -# This script takes a command that processes input -# from stdin one-line-at-time, and parallelizes it -# on the cluster using David Chiang's sentserver/ -# sentclient architecture. -# -# Prerequisites: the command *must* read each line -# without waiting for subsequent lines of input -# (for instance, a command which must read all lines -# of input before processing will not work) and -# return it to the output *without* buffering -# multiple lines. - -#TODO: if -j 1, run immediately, not via sentserver?  possible differences in environment might make debugging harder - -#ANNOYANCE: if input is shorter than -j n lines, or at the very last few lines, repeatedly sleeps.  time cut down to 15s from 60s - -my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../environment"; } -use LocalConfig; - -use Cwd qw/ abs_path cwd getcwd /;  -use File::Temp qw/ tempfile /; -use Getopt::Long; -use IPC::Open2; -use strict; -use POSIX ":sys_wait_h"; - -use File::Basename; -my $myDir = dirname(__FILE__); -print STDERR __FILE__." -> $myDir\n"; -push(@INC, $myDir); -require "libcall.pl"; - -my $tailn=5; # +0 = concatenate all the client logs.  5 = last 5 lines -my $recycle_clients;    # spawn new clients when previous ones terminate -my $stay_alive;      # dont let server die when having zero clients -my $joblist = ""; -my $errordir=""; -my $multiline; -my $workdir = '.'; -my $numnodes = 8; -my $user = $ENV{"USER"}; -my $pmem = "9g"; -my $basep=50300; -my $randp=300; -my $tryp=50; -my $no_which; -my $no_cd; - -my $DEBUG=$ENV{DEBUG}; -print STDERR "DEBUG=$DEBUG output enabled.\n" if $DEBUG; -my $verbose = 1; -sub verbose { -    if ($verbose) { -        print STDERR @_,"\n"; -    } -} -sub debug { -    if ($DEBUG) { -        my ($package, $filename, $line) = caller; -        print STDERR "DEBUG: $filename($line): ",join(' ',@_),"\n"; -    } -} -my $is_shell_special=qr.[ \t\n\\><|&;"'`~*?{}$!()].; -my $shell_escape_in_quote=qr.[\\"\$`!].; -sub escape_shell { -    my ($arg)=@_; -    return undef unless defined $arg; -    return '""' unless $arg; -    if ($arg =~ /$is_shell_special/) { -        $arg =~ s/($shell_escape_in_quote)/\\$1/g; -        return "\"$arg\""; -    } -    return $arg; -} -sub preview_files { -    my ($l,$skipempty,$footer,$n)=@_; -    $n=$tailn unless defined $n; -    my @f=grep { ! ($skipempty && -z $_) } @$l; -    my $fn=join(' ',map {escape_shell($_)} @f); -    my $cmd="tail -n $n $fn"; -    unchecked_output("$cmd").($footer?"\nNONEMPTY FILES:\n$fn\n":""); -} -sub prefix_dirname($) { -    #like `dirname but if ends in / then return the whole thing -    local ($_)=@_; -    if (/\/$/) { -        $_; -    } else { -        s#/[^/]$##; -        $_ ? $_ : ''; -    } -} -sub ensure_final_slash($) { -    local ($_)=@_; -    m#/$# ? $_ : ($_."/"); -} -sub extend_path($$;$$) { -    my ($base,$ext,$mkdir,$baseisdir)=@_; -    if (-d $base) { -        $base.="/"; -    } else { -        my $dir; -        if ($baseisdir) { -            $dir=$base; -            $base.='/' unless $base =~ /\/$/; -        } else { -            $dir=prefix_dirname($base); -        } -        my @cmd=("/bin/mkdir","-p",$dir); -        check_call(@cmd) if $mkdir; -    } -    return $base.$ext; -} - -my $abscwd=abs_path(&getcwd); -sub print_help; - -my $use_fork; -my @pids; - -# Process command-line options -unless (GetOptions( -      "stay-alive" => \$stay_alive, -      "recycle-clients" => \$recycle_clients, -      "error-dir=s" => \$errordir, -      "multi-line" => \$multiline, -      "workdir=s" => \$workdir, -      "use-fork" => \$use_fork, -      "verbose" => \$verbose, -      "jobs=i" => \$numnodes, -      "pmem=s" => \$pmem, -        "baseport=i" => \$basep, -#       "iport=i" => \$randp, #for short name -i -        "no-which!" => \$no_which, -            "no-cd!" => \$no_cd, -            "tailn=s" => \$tailn, -) && scalar @ARGV){ -  print_help(); -    die "bad options."; -} - -my $cmd = ""; -my $prog=shift; -if ($no_which) { -    $cmd=$prog; -} else { -    $cmd=check_output("which $prog"); -    chomp $cmd; -    die "$prog not found - $cmd" unless $cmd; -} -#$cmd=abs_path($cmd); -for my $arg (@ARGV) { -    $cmd .= " ".escape_shell($arg); -} -die "Please specify a command to parallelize\n" if $cmd eq ''; - -my $cdcmd=$no_cd ? '' : ("cd ".escape_shell($abscwd)."\n"); - -my $executable = $cmd; -$executable =~ s/^\s*(\S+)($|\s.*)/$1/; -$executable=check_output("basename $executable"); -chomp $executable; - - -print STDERR "Parallelizing ($numnodes ways): $cmd\n\n"; - -# create -e dir and save .sh -use File::Temp qw/tempdir/; -unless ($errordir) { -    $errordir=tempdir("$executable.XXXXXX",CLEANUP=>1); -} -if ($errordir) { -    my $scriptfile=extend_path("$errordir/","$executable.sh",1,1); -    -d $errordir || die "should have created -e dir $errordir"; -    open SF,">",$scriptfile || die; -    print SF "$cdcmd$cmd\n"; -    close SF; -    chmod 0755,$scriptfile; -    $errordir=abs_path($errordir); -    &verbose("-e dir: $errordir"); -} - -# set cleanup handler -my @cleanup_cmds; -sub cleanup; -sub cleanup_and_die; -$SIG{INT} = "cleanup_and_die"; -$SIG{TERM} = "cleanup_and_die"; -$SIG{HUP} = "cleanup_and_die"; - -# other subs: -sub numof_live_jobs; -sub launch_job_on_node; - - -# vars -my $mydir = check_output("dirname $0"); chomp $mydir; -my $sentserver = "$mydir/sentserver"; -my $sentclient = "$mydir/sentclient"; -my $host = check_output("hostname"); -chomp $host; - - -# find open port -srand; -my $port = 50300+int(rand($randp)); -my $endp=$port+$tryp; -sub listening_port_lines { -    my $quiet=$verbose?'':'2>/dev/null'; -    return unchecked_output("netstat -a -n $quiet | grep LISTENING | grep -i tcp"); -} -my $netstat=&listening_port_lines; - -if ($verbose){ print STDERR "Testing port $port...";} - -while ($netstat=~/$port/ || &listening_port_lines=~/$port/){ -  if ($verbose){ print STDERR "port is busy\n";} -  $port++; -  if ($port > $endp){ -    die "Unable to find open port\n"; -  } -  if ($verbose){ print STDERR "Testing port $port... "; } -} -if ($verbose){ -  print STDERR "port $port is available\n"; -} - -my $key = int(rand()*1000000); - -my $multiflag = ""; -if ($multiline){ $multiflag = "-m"; print STDERR "expecting multiline output.\n"; } -my $stay_alive_flag = ""; -if ($stay_alive){ $stay_alive_flag = "--stay-alive"; print STDERR "staying alive while no clients are connected.\n"; } - -my $node_count = 0; -my $script = ""; -# fork == one thread runs the sentserver, while the -# other spawns the sentclient commands. -my $pid = fork; -if ($pid == 0) { # child -  sleep 8; # give other thread time to start sentserver -  $script = "$cdcmd$sentclient $host:$port:$key $cmd"; - -  if ($verbose){ -    print STDERR "Client script:\n====\n"; -    print STDERR $script; -    print STDERR "====\n"; -  } -  for (my $jobn=0; $jobn<$numnodes; $jobn++){ -    launch_job(); -  } -  if ($recycle_clients) { -    my $ret; -    my $livejobs; -    while (1) { -      $ret = waitpid($pid, WNOHANG); -      #print STDERR "waitpid $pid ret = $ret \n"; -      last if ($ret != 0); -      $livejobs = numof_live_jobs(); -      if ($numnodes >= $livejobs ) {  # a client terminated, OR # lines of input was less than -j -        print STDERR "num of requested nodes = $numnodes; num of currently live jobs = $livejobs; Client terminated - launching another.\n"; -        launch_job(); -      } else { -        sleep 15; -      } -    } -  } -  print STDERR "CHILD PROCESSES SPAWNED ... WAITING\n"; -  for my $p (@pids) { -    waitpid($p, 0); -  } -} else { -#  my $todo = "$sentserver -k $key $multiflag $port "; -  my $todo = "$sentserver -k $key $multiflag $port $stay_alive_flag "; -  if ($verbose){ print STDERR "Running: $todo\n"; } -  check_call($todo); -  print STDERR "Call to $sentserver returned.\n"; -  cleanup(); -  exit(0); -} - -sub numof_live_jobs { -  if ($use_fork) { -    die "not implemented"; -  } else { -    # We can probably continue decoding if the qstat error is only temporary -    my @livejobs = grep(/$joblist/, split(/\n/, unchecked_output("qstat"))); -    return ($#livejobs + 1); -  } -} -my (@errors,@outs,@cmds); - -sub launch_job { -    if ($use_fork) { return launch_job_fork(); } -    my $errorfile = "/dev/null"; -    my $outfile = "/dev/null"; -    $node_count++; -    my $clientname = $executable; -    $clientname =~ s/^(.{4}).*$/$1/; -    $clientname = "$clientname.$node_count"; -    if ($errordir){ -      $errorfile = "$errordir/$clientname.ER"; -      $outfile = "$errordir/$clientname.OU"; -      push @errors,$errorfile; -      push @outs,$outfile; -    } -    my $todo = qsub_args($pmem) . " -N $clientname -o $outfile -e $errorfile"; -    push @cmds,$todo; - -    print STDERR "Running: $todo\n"; -    local(*QOUT, *QIN); -    open2(\*QOUT, \*QIN, $todo) or die "Failed to open2: $!"; -    print QIN $script; -    close QIN; -    while (my $jobid=<QOUT>){ -      chomp $jobid; -      if ($verbose){ print STDERR "Launched client job: $jobid"; } -      $jobid =~ s/^(\d+)(.*?)$/\1/g; -            $jobid =~ s/^Your job (\d+) .*$/\1/; -      print STDERR " short job id $jobid\n"; -            if ($verbose){ -                print STDERR "cd: $abscwd\n"; -                print STDERR "cmd: $cmd\n"; -            } -      if ($joblist == "") { $joblist = $jobid; } -      else {$joblist = $joblist . "\|" . $jobid; } -      my $cleanfn="qdel $jobid 2> /dev/null"; -      push(@cleanup_cmds, $cleanfn); -    } -    close QOUT; -} - -sub launch_job_fork { -  my $errorfile = "/dev/null"; -  my $outfile = "/dev/null"; -  $node_count++; -  my $clientname = $executable; -  $clientname =~ s/^(.{4}).*$/$1/; -  $clientname = "$clientname.$node_count"; -  if ($errordir){ -    $errorfile = "$errordir/$clientname.ER"; -    $outfile = "$errordir/$clientname.OU"; -    push @errors,$errorfile; -    push @outs,$outfile; -  } -  my $pid = fork; -  if ($pid == 0) { -    my ($fh, $scr_name) = get_temp_script(); -    print $fh $script; -    close $fh; -    my $todo = "/bin/bash -xeo pipefail $scr_name 1> $outfile 2> $errorfile"; -    print STDERR "EXEC: $todo\n"; -    my $out = check_output("$todo"); -    unlink $scr_name or warn "Failed to remove $scr_name"; -    exit 0; -  } else { -    push @pids, $pid; -  } -} - -sub get_temp_script { -  my ($fh, $filename) = tempfile( "$workdir/workXXXX", SUFFIX => '.sh'); -  return ($fh, $filename); -} - -sub cleanup_and_die { -  cleanup(); -  die "\n"; -} - -sub cleanup { -  print STDERR "Cleaning up...\n"; -  for $cmd (@cleanup_cmds){ -    print STDERR "  Cleanup command: $cmd\n"; -    eval $cmd; -  } -  print STDERR "outputs:\n",preview_files(\@outs,1),"\n"; -  print STDERR "errors:\n",preview_files(\@errors,1),"\n"; -  print STDERR "cmd:\n",$cmd,"\n"; -  print STDERR " cat $errordir/*.ER\nfor logs.\n"; -  print STDERR "Cleanup finished.\n"; -} - -sub print_help -{ -  my $name = check_output("basename $0"); chomp $name; -  print << "Help"; - -usage: $name [options] - -  Automatic black-box parallelization of commands. - -options: - -  --use-fork -    Instead of using qsub, use fork. - -  -e, --error-dir <dir> -    Retain output files from jobs in <dir>, rather -    than silently deleting them. - -  -m, --multi-line -    Expect that command may produce multiple output -    lines for a single input line.  $name makes a -    reasonable attempt to obtain all output before -    processing additional inputs.  However, use of this -    option is inherently unsafe. - -  -v, --verbose -    Print diagnostic informatoin on stderr. - -  -j, --jobs -    Number of jobs to use. - -  -p, --pmem -    pmem setting for each job. - -Help -} diff --git a/dpmert/sentclient.c b/dpmert/sentclient.c deleted file mode 100644 index 91d994ab..00000000 --- a/dpmert/sentclient.c +++ /dev/null @@ -1,76 +0,0 @@ -/* Copyright (c) 2001 by David Chiang. All rights reserved.*/ - -#include <stdio.h> -#include <stdlib.h> -#include <unistd.h> -#include <sys/socket.h> -#include <sys/types.h> -#include <netinet/in.h> -#include <netdb.h> -#include <string.h> - -#include "sentserver.h" - -int main (int argc, char *argv[]) { -  int sock, port; -  char *s, *key; -  struct hostent *hp; -  struct sockaddr_in server; -  int errors = 0; - -  if (argc < 3) { -    fprintf(stderr, "Usage: sentclient host[:port[:key]] command [args ...]\n"); -    exit(1); -  } - -  s = strchr(argv[1], ':'); -  key = NULL; - -  if (s == NULL) { -    port = DEFAULT_PORT; -  } else { -    *s = '\0'; -    s+=1; -	/* dumb hack */ -	key = strchr(s, ':'); -	if (key != NULL){ -		*key = '\0'; -		key += 1; -	} -    port = atoi(s); -  } - -  sock = socket(AF_INET, SOCK_STREAM, 0); - -  hp = gethostbyname(argv[1]); -  if (hp == NULL) { -    fprintf(stderr, "unknown host %s\n", argv[1]); -    exit(1); -  } - -  bzero((char *)&server, sizeof(server)); -  bcopy(hp->h_addr, (char *)&server.sin_addr, hp->h_length); -  server.sin_family = hp->h_addrtype; -  server.sin_port = htons(port); - -  while (connect(sock, (struct sockaddr *)&server, sizeof(server)) < 0) { -    perror("connect()"); -    sleep(1); -    errors++; -    if (errors > 5) -      exit(1); -  } - -  close(0); -  close(1); -  dup2(sock, 0); -  dup2(sock, 1); - -  if (key != NULL){ -	write(1, key, strlen(key)); -	write(1, "\n", 1); -  } - -  execvp(argv[2], argv+2); -  return 0; -} diff --git a/dpmert/sentserver.c b/dpmert/sentserver.c deleted file mode 100644 index c20b4fa6..00000000 --- a/dpmert/sentserver.c +++ /dev/null @@ -1,515 +0,0 @@ -/* Copyright (c) 2001 by David Chiang. All rights reserved.*/ - -#include <string.h> -#include <stdlib.h> -#include <unistd.h> -#include <fcntl.h> -#include <stdio.h> -#include <sys/socket.h> -#include <sys/types.h> -#include <sys/time.h> -#include <netinet/in.h> -#include <sched.h> -#include <pthread.h> -#include <errno.h> - -#include "sentserver.h" - -#define MAX_CLIENTS 64 - -struct clientinfo { -  int s; -  struct sockaddr_in sin; -}; - -struct line { -  int id; -  char *s; -  int status; -  struct line *next; -} *head, **ptail; - -int n_sent = 0, n_received=0, n_flushed=0; - -#define STATUS_RUNNING 0 -#define STATUS_ABORTED 1 -#define STATUS_FINISHED 2 - -pthread_mutex_t queue_mutex = PTHREAD_MUTEX_INITIALIZER; -pthread_mutex_t clients_mutex = PTHREAD_MUTEX_INITIALIZER; -pthread_mutex_t input_mutex = PTHREAD_MUTEX_INITIALIZER; - -int n_clients = 0; -int s; -int expect_multiline_output = 0; -int log_mutex = 0; -int stay_alive = 0;		/* dont panic and die with zero clients */ - -void queue_finish(struct line *node, char *s, int fid); -char * read_line(int fd, int multiline); -void done (int code); - -struct line * queue_get(int fid) { -	struct line *cur; -	char *s, *synch; - -	if (log_mutex) fprintf(stderr, "Getting for data for fid %d\n", fid); -	if (log_mutex) fprintf(stderr, "Locking queue mutex (%d)\n", fid); -	pthread_mutex_lock(&queue_mutex); - -	/* First, check for aborted sentences. */ - -	if (log_mutex) fprintf(stderr, "  Checking queue for aborted jobs (fid %d)\n", fid); -	for (cur = head; cur != NULL; cur = cur->next) { -		if (cur->status == STATUS_ABORTED) { -			cur->status = STATUS_RUNNING; - -			if (log_mutex) fprintf(stderr, "Unlocking queue mutex (%d)\n", fid); -			pthread_mutex_unlock(&queue_mutex); - -			return cur; -		} -	} -	if (log_mutex) fprintf(stderr, "Unlocking queue mutex (%d)\n", fid); -	pthread_mutex_unlock(&queue_mutex); - -	/* Otherwise, read a new one. */ -	if (log_mutex) fprintf(stderr, "Locking input mutex (%d)\n", fid); -	if (log_mutex) fprintf(stderr, "  Reading input for new data (fid %d)\n", fid); -	pthread_mutex_lock(&input_mutex); -	s = read_line(0,0); - -	while (s) { -		if (log_mutex) fprintf(stderr, "Locking queue mutex (%d)\n", fid); -		pthread_mutex_lock(&queue_mutex); -		if (log_mutex) fprintf(stderr, "Unlocking input mutex (%d)\n", fid); -		pthread_mutex_unlock(&input_mutex); - -		cur = malloc(sizeof (struct line)); -		cur->id = n_sent; -		cur->s = s; -		cur->next = NULL; - -		*ptail = cur; -		ptail = &cur->next; - -		n_sent++; - -		if (strcmp(s,"===SYNCH===\n")==0){ -			fprintf(stderr, "Received ===SYNCH=== signal (fid %d)\n", fid); -			// Note: queue_finish calls free(cur->s). -			// Therefore we need to create a new string here. -			synch = malloc((strlen("===SYNCH===\n")+2) * sizeof (char)); -			synch = strcpy(synch, s); - -			if (log_mutex) fprintf(stderr, "Unlocking queue mutex (%d)\n", fid); -			pthread_mutex_unlock(&queue_mutex); -			queue_finish(cur, synch, fid); /* handles its own lock */ - -			if (log_mutex) fprintf(stderr, "Locking input mutex (%d)\n", fid); -			if (log_mutex) fprintf(stderr, "  Reading input for new data (fid %d)\n", fid); -			pthread_mutex_lock(&input_mutex); - -			s = read_line(0,0); -		} else { -			if (log_mutex) fprintf(stderr, "  Received new data %d (fid %d)\n", cur->id, fid); -			cur->status = STATUS_RUNNING; -			if (log_mutex) fprintf(stderr, "Unlocking queue mutex (%d)\n", fid); -			pthread_mutex_unlock(&queue_mutex); -			return cur; -		} -	} - -	if (log_mutex) fprintf(stderr, "Unlocking input mutex (%d)\n", fid); -	pthread_mutex_unlock(&input_mutex); -	/* Only way to reach this point: no more output */ - -	if (log_mutex) fprintf(stderr, "Locking queue mutex (%d)\n", fid); -	pthread_mutex_lock(&queue_mutex); -	if (head == NULL) { -		fprintf(stderr, "Reached end of file. Exiting.\n"); -		done(0); -	} else -		ptail = NULL; /* This serves as a signal that there is no more input */ -	if (log_mutex) fprintf(stderr, "Unlocking queue mutex (%d)\n", fid); -	pthread_mutex_unlock(&queue_mutex); - -	return NULL; -} - -void queue_panic() { -	struct line *next; -	while (head && head->status == STATUS_FINISHED) { -		/* Write out finished sentences */ -		if (head->status == STATUS_FINISHED) { -			fputs(head->s, stdout); -			fflush(stdout); -		} -		/* Write out blank line for unfinished sentences */ -		if (head->status == STATUS_ABORTED) { -			fputs("\n", stdout); -			fflush(stdout); -		} -		/* By defition, there cannot be any RUNNING sentences, since -		function is only called when n_clients == 0 */ -		free(head->s); -		next = head->next; -		free(head); -		head = next; -		n_flushed++; -	} -	fclose(stdout); -	fprintf(stderr, "All clients died. Panicking, flushing completed sentences and exiting.\n"); -	done(1); -} - -void queue_abort(struct line *node, int fid) { -	if (log_mutex) fprintf(stderr, "Locking queue mutex (%d)\n", fid); -	pthread_mutex_lock(&queue_mutex); -	node->status = STATUS_ABORTED; -	if (n_clients == 0) { -		if (stay_alive) { -			fprintf(stderr, "Warning! No live clients detected! Staying alive, will retry soon.\n"); -		} else { -			queue_panic(); -		} -	} -	if (log_mutex) fprintf(stderr, "Unlocking queue mutex (%d)\n", fid); -	pthread_mutex_unlock(&queue_mutex); -} - - -void queue_print() { -  struct line *cur; - -  fprintf(stderr, "  Queue\n"); - -  for (cur = head; cur != NULL; cur = cur->next) { -    switch(cur->status) { -    case STATUS_RUNNING: -      fprintf(stderr, "    %d running  ", cur->id); break; -    case STATUS_ABORTED: -      fprintf(stderr, "    %d aborted  ", cur->id); break; -    case STATUS_FINISHED: -      fprintf(stderr, "    %d finished ", cur->id); break; - -    } -	fprintf(stderr, "\n"); -    //fprintf(stderr, cur->s); -  } -} - -void queue_finish(struct line *node, char *s, int fid) { -  struct line *next; -  if (log_mutex) fprintf(stderr, "Locking queue mutex (%d)\n", fid); -  pthread_mutex_lock(&queue_mutex); - -  free(node->s); -  node->s = s; -  node->status = STATUS_FINISHED; -  n_received++; - -  /* Flush out finished nodes */ -  while (head && head->status == STATUS_FINISHED) { - -    if (log_mutex) fprintf(stderr, "  Flushing finished node %d\n", head->id); - -    fputs(head->s, stdout); -    fflush(stdout); -    if (log_mutex) fprintf(stderr, "  Flushed node %d\n", head->id); -    free(head->s); - -    next = head->next; -    free(head); - -    head = next; - -    n_flushed++; - -    if (head == NULL) { /* empty queue */ -      if (ptail == NULL) { /* This can only happen if set in queue_get as signal that there is no more input. */ -        fprintf(stderr, "All sentences finished. Exiting.\n"); -        done(0); -      } else /* ptail pointed at something which was just popped off the stack -- reset to head*/ -        ptail = &head; -    } -  } - -  if (log_mutex) fprintf(stderr, "  Flushing output %d\n", head->id); -  fflush(stdout); -  fprintf(stderr, "%d sentences sent, %d sentences finished, %d sentences flushed\n", n_sent, n_received, n_flushed); - -  if (log_mutex) fprintf(stderr, "Unlocking queue mutex (%d)\n", fid); -  pthread_mutex_unlock(&queue_mutex); - -} - -char * read_line(int fd, int multiline) { -  int size = 80; -  char errorbuf[100]; -  char *s = malloc(size+2); -  int result, errors=0; -  int i = 0; - -  result = read(fd, s+i, 1); - -  while (1) { -    if (result < 0) { -      perror("read()"); -      sprintf(errorbuf, "Error code: %d\n", errno); -      fprintf(stderr, errorbuf); -      errors++; -      if (errors > 5) { -	free(s); -	return NULL; -      } else { -	sleep(1); /* retry after delay */ -      } -    } else if (result == 0) { -      break; -    } else if (multiline==0 && s[i] == '\n') { -      break; -    } else { -      if (s[i] == '\n'){ -	/* if we've reached this point, -	   then multiline must be 1, and we're -	   going to poll the fd for an additional -	   line of data.  The basic design is to -	   run a select on the filedescriptor fd. -	   Select will return under two conditions: -	   if there is data on the fd, or if a -	   timeout is reached.  We'll select on this -	   fd.  If select returns because there's data -	   ready, keep going; else assume there's no -	   more and return the data we already have. -	*/ - -	fd_set set; -	FD_ZERO(&set); -	FD_SET(fd, &set); - -	struct timeval timeout; -	timeout.tv_sec = 3; // number of seconds for timeout -	timeout.tv_usec = 0; - -	int ready = select(FD_SETSIZE, &set, NULL, NULL, &timeout); -	if (ready<1){ -	  break; // no more data, stop looping -	} -      } -      i++; - -      if (i == size) { -	size = size*2; -	s = realloc(s, size+2); -      } -    } - -    result = read(fd, s+i, 1); -  } - -  if (result == 0 && i == 0) { /* end of file */ -    free(s); -    return NULL; -  } - -  s[i] = '\n'; -  s[i+1] = '\0'; - -  return s; -} - -void * new_client(void *arg) { -  struct clientinfo *client = (struct clientinfo *)arg; -  struct line *cur; -  int result; -  char *s; -  char errorbuf[100]; - -  pthread_mutex_lock(&clients_mutex); -  n_clients++; -  pthread_mutex_unlock(&clients_mutex); - -  fprintf(stderr, "Client connected (%d connected)\n", n_clients); - -  for (;;) { - -    cur = queue_get(client->s); - -    if (cur) { -      /* fprintf(stderr, "Sending to client: %s", cur->s); */ -      fprintf(stderr, "Sending data %d to client (fid %d)\n", cur->id, client->s); -      result = write(client->s, cur->s, strlen(cur->s)); -      if (result < strlen(cur->s)){ -        perror("write()"); -        sprintf(errorbuf, "Error code: %d\n", errno); -        fprintf(stderr, errorbuf); - -        pthread_mutex_lock(&clients_mutex); -        n_clients--; -        pthread_mutex_unlock(&clients_mutex); - -        fprintf(stderr, "Client died (%d connected)\n", n_clients); -        queue_abort(cur, client->s); - -        close(client->s); -        free(client); - -        pthread_exit(NULL); -      } -    } else { -      close(client->s); -      pthread_mutex_lock(&clients_mutex); -      n_clients--; -      pthread_mutex_unlock(&clients_mutex); -      fprintf(stderr, "Client dismissed (%d connected)\n", n_clients); -      pthread_exit(NULL); -    } - -    s = read_line(client->s,expect_multiline_output); -    if (s) { -      /* fprintf(stderr, "Client (fid %d) returned: %s", client->s, s); */ -      fprintf(stderr, "Client (fid %d) returned data %d\n", client->s, cur->id); -//      queue_print(); -      queue_finish(cur, s, client->s); -    } else { -      pthread_mutex_lock(&clients_mutex); -      n_clients--; -      pthread_mutex_unlock(&clients_mutex); - -      fprintf(stderr, "Client died (%d connected)\n", n_clients); -      queue_abort(cur, client->s); - -      close(client->s); -      free(client); - -      pthread_exit(NULL); -    } - -  } -  return 0; -} - -void done (int code) { -  close(s); -  exit(code); -} - - - -int main (int argc, char *argv[]) { -  struct sockaddr_in sin, from; -  int g; -  socklen_t len; -  struct clientinfo *client; -  int port; -  int opt; -  int errors = 0; -  int argi; -  char *key = NULL, *client_key; -  int use_key = 0; -  /* the key stuff here doesn't provide any -  real measure of security, it's mainly to keep -  jobs from bumping into each other.  */ - -  pthread_t tid; -  port = DEFAULT_PORT; - -  for (argi=1; argi < argc; argi++){ -    if (strcmp(argv[argi], "-m")==0){ -      expect_multiline_output = 1; -    } else if (strcmp(argv[argi], "-k")==0){ -      argi++; -      if (argi == argc){ -      	fprintf(stderr, "Key must be specified after -k\n"); -      	exit(1); -      } -      key = argv[argi]; -      use_key = 1; -    } else if (strcmp(argv[argi], "--stay-alive")==0){ -      stay_alive = 1;    /* dont panic and die with zero clients */ -    } else { -      port = atoi(argv[argi]); -    } -  } - -  /* Initialize data structures */ -  head = NULL; -  ptail = &head; - -  /* Set up listener */ -  s = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); -  opt = 1; -  setsockopt(s, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt)); - -  sin.sin_family = AF_INET; -  sin.sin_addr.s_addr = htonl(INADDR_ANY); -  sin.sin_port = htons(port); -  while (bind(s, (struct sockaddr *) &sin, sizeof(sin)) < 0) { -	perror("bind()"); -	sleep(1); -	errors++; -	if (errors > 100) -	  exit(1); -  } - -  len = sizeof(sin); -  getsockname(s, (struct sockaddr *) &sin, &len); - -  fprintf(stderr, "Listening on port %hu\n", ntohs(sin.sin_port)); - -  while (listen(s, MAX_CLIENTS) < 0) { -	perror("listen()"); -	sleep(1); -	errors++; -	if (errors > 100) -	  exit(1); -  } - -  for (;;) { -    len = sizeof(from); -    g = accept(s, (struct sockaddr *)&from, &len); -    if (g < 0) { -      perror("accept()"); -      sleep(1); -      continue; -    } -    client = malloc(sizeof(struct clientinfo)); -    client->s = g; -    bcopy(&from, &client->sin, len); - -	if (use_key){ -		fd_set set; -		FD_ZERO(&set); -		FD_SET(client->s, &set); - -		struct timeval timeout; -		timeout.tv_sec = 3; // number of seconds for timeout -		timeout.tv_usec = 0; - -		int ready = select(FD_SETSIZE, &set, NULL, NULL, &timeout); -		if (ready<1){ -			fprintf(stderr, "Prospective client failed to respond with correct key.\n"); -			close(client->s); -			free(client); -		} else { -			client_key = read_line(client->s,0); -			client_key[strlen(client_key)-1]='\0'; /* chop trailing newline */ -			if (strcmp(key, client_key)==0){ -				pthread_create(&tid, NULL, new_client, client); -			} else { -				fprintf(stderr, "Prospective client failed to respond with correct key.\n"); -				close(client->s); -				free(client); -			} -			free(client_key); -		} -	} else { -		pthread_create(&tid, NULL, new_client, client); -	} -  } - -} - - - diff --git a/dpmert/sentserver.h b/dpmert/sentserver.h deleted file mode 100644 index cd17a546..00000000 --- a/dpmert/sentserver.h +++ /dev/null @@ -1,6 +0,0 @@ -#ifndef SENTSERVER_H -#define SENTSERVER_H - -#define DEFAULT_PORT 50000 - -#endif diff --git a/dpmert/test_aer/README b/dpmert/test_aer/README deleted file mode 100644 index 819b2e32..00000000 --- a/dpmert/test_aer/README +++ /dev/null @@ -1,8 +0,0 @@ -To run the test: - -../dist-vest.pl --local --metric aer cdec.ini --source-file corpus.src --ref-files=ref.0 --weights weights - -This will optimize the parameters of the tiny lexical translation model -so as to minimize the AER of the Viterbi alignment on the development -set in corpus.src according to the reference alignments in ref.0. - diff --git a/dpmert/test_aer/cdec.ini b/dpmert/test_aer/cdec.ini deleted file mode 100644 index 08187848..00000000 --- a/dpmert/test_aer/cdec.ini +++ /dev/null @@ -1,3 +0,0 @@ -formalism=lextrans -grammar=grammar -aligner=true diff --git a/dpmert/test_aer/corpus.src b/dpmert/test_aer/corpus.src deleted file mode 100644 index 31b23971..00000000 --- a/dpmert/test_aer/corpus.src +++ /dev/null @@ -1,3 +0,0 @@ -el gato negro ||| the black cat -el gato ||| the cat -el libro ||| the book diff --git a/dpmert/test_aer/grammar b/dpmert/test_aer/grammar deleted file mode 100644 index 9d857824..00000000 --- a/dpmert/test_aer/grammar +++ /dev/null @@ -1,12 +0,0 @@ -el ||| cat ||| F1=1 -el ||| the ||| F2=1 -el ||| black ||| F3=1 -el ||| book ||| F11=1 -gato ||| cat ||| F4=1 NN=1 -gato ||| black ||| F5=1 -gato ||| the ||| F6=1 -negro ||| the ||| F7=1 -negro ||| cat ||| F8=1 -negro ||| black ||| F9=1 -libro ||| the ||| F10=1 -libro ||| book ||| F12=1 NN=1 diff --git a/dpmert/test_aer/ref.0 b/dpmert/test_aer/ref.0 deleted file mode 100644 index 734a9c5b..00000000 --- a/dpmert/test_aer/ref.0 +++ /dev/null @@ -1,3 +0,0 @@ -0-0 1-2 2-1 -0-0 1-1 -0-0 1-1 diff --git a/dpmert/test_aer/weights b/dpmert/test_aer/weights deleted file mode 100644 index afc9282e..00000000 --- a/dpmert/test_aer/weights +++ /dev/null @@ -1,13 +0,0 @@ -F1 0.1 -F2 -.5980815 -F3 0.24235 -F4 0.625 -F5 0.4514 -F6 0.112316 -F7 -0.123415 -F8 -0.25390285 -F9 -0.23852 -F10 0.646 -F11 0.413141 -F12 0.343216 -NN -0.1215 diff --git a/dpmert/test_data/0.json.gz b/dpmert/test_data/0.json.gzBinary files differ deleted file mode 100644 index 30f8dd77..00000000 --- a/dpmert/test_data/0.json.gz +++ /dev/null diff --git a/dpmert/test_data/1.json.gz b/dpmert/test_data/1.json.gzBinary files differ deleted file mode 100644 index c82cc179..00000000 --- a/dpmert/test_data/1.json.gz +++ /dev/null diff --git a/dpmert/test_data/c2e.txt.0 b/dpmert/test_data/c2e.txt.0 deleted file mode 100644 index 12c4abe9..00000000 --- a/dpmert/test_data/c2e.txt.0 +++ /dev/null @@ -1,2 +0,0 @@ -australia reopens embassy in manila -( afp , manila , january 2 ) australia reopened its embassy in the philippines today , which was shut down about seven weeks ago due to what was described as a specific threat of a terrorist attack . diff --git a/dpmert/test_data/c2e.txt.1 b/dpmert/test_data/c2e.txt.1 deleted file mode 100644 index 4ac12df1..00000000 --- a/dpmert/test_data/c2e.txt.1 +++ /dev/null @@ -1,2 +0,0 @@ -australia reopened manila embassy -( agence france-presse , manila , 2nd ) - australia reopened its embassy in the philippines today . the embassy was closed seven weeks ago after what was described as a specific threat of a terrorist attack . diff --git a/dpmert/test_data/c2e.txt.2 b/dpmert/test_data/c2e.txt.2 deleted file mode 100644 index 2f67b72f..00000000 --- a/dpmert/test_data/c2e.txt.2 +++ /dev/null @@ -1,2 +0,0 @@ -australia to reopen embassy in manila -( afp report from manila , january 2 ) australia reopened its embassy in the philippines today . seven weeks ago , the embassy was shut down due to so-called confirmed terrorist attack threats . diff --git a/dpmert/test_data/c2e.txt.3 b/dpmert/test_data/c2e.txt.3 deleted file mode 100644 index 5483cef6..00000000 --- a/dpmert/test_data/c2e.txt.3 +++ /dev/null @@ -1,2 +0,0 @@ -australia to re - open its embassy to manila -( afp , manila , thursday ) australia reopens its embassy to manila , which was closed for the so-called " clear " threat of terrorist attack 7 weeks ago . diff --git a/dpmert/test_data/re.txt.0 b/dpmert/test_data/re.txt.0 deleted file mode 100644 index 86eff087..00000000 --- a/dpmert/test_data/re.txt.0 +++ /dev/null @@ -1,5 +0,0 @@ -erdogan states turkey to reject any pressures to urge it to recognize cyprus -ankara 12 - 1 ( afp ) - turkish prime minister recep tayyip erdogan announced today , wednesday , that ankara will reject any pressure by the european union to urge it to recognize cyprus . this comes two weeks before the summit of european union state and government heads who will decide whether or nor membership negotiations with ankara should be opened . -erdogan told " ntv " television station that " the european union cannot address us by imposing new conditions on us with regard to cyprus . -we will discuss this dossier in the course of membership negotiations . " -he added " let me be clear , i cannot sidestep turkey , this is something we cannot accept . " diff --git a/dpmert/test_data/re.txt.1 b/dpmert/test_data/re.txt.1 deleted file mode 100644 index 2140f198..00000000 --- a/dpmert/test_data/re.txt.1 +++ /dev/null @@ -1,5 +0,0 @@ -erdogan confirms turkey will resist any pressure to recognize cyprus -ankara 12 - 1 ( afp ) - the turkish head of government , recep tayyip erdogan , announced today ( wednesday ) that ankara would resist any pressure the european union might exercise in order to force it into recognizing cyprus . this comes two weeks before a summit of european union heads of state and government , who will decide whether or not to open membership negotiations with ankara . -erdogan said to the ntv television channel : " the european union cannot engage with us through imposing new conditions on us with regard to cyprus . -we shall discuss this issue in the course of the membership negotiations . " -he added : " let me be clear - i cannot confine turkey . this is something we do not accept . " diff --git a/dpmert/test_data/re.txt.2 b/dpmert/test_data/re.txt.2 deleted file mode 100644 index 94e46286..00000000 --- a/dpmert/test_data/re.txt.2 +++ /dev/null @@ -1,5 +0,0 @@ -erdogan confirms that turkey will reject any pressures to encourage it to recognize cyprus -ankara , 12 / 1 ( afp ) - the turkish prime minister recep tayyip erdogan declared today , wednesday , that ankara will reject any pressures that the european union may apply on it to encourage to recognize cyprus . this comes two weeks before a summit of the heads of countries and governments of the european union , who will decide on whether or not to start negotiations on joining with ankara . -erdogan told the ntv television station that " it is not possible for the european union to talk to us by imposing new conditions on us regarding cyprus . -we shall discuss this dossier during the negotiations on joining . " -and he added , " let me be clear . turkey's arm should not be twisted ; this is something we cannot accept . " diff --git a/dpmert/test_data/re.txt.3 b/dpmert/test_data/re.txt.3 deleted file mode 100644 index f87c3308..00000000 --- a/dpmert/test_data/re.txt.3 +++ /dev/null @@ -1,5 +0,0 @@ -erdogan stresses that turkey will reject all pressures to force it to recognize cyprus -ankara 12 - 1 ( afp ) - turkish prime minister recep tayyip erdogan announced today , wednesday , that ankara would refuse all pressures applied on it by the european union to force it to recognize cyprus . that came two weeks before the summit of the presidents and prime ministers of the european union , who would decide on whether to open negotiations on joining with ankara or not . -erdogan said to " ntv " tv station that the " european union can not communicate with us by imposing on us new conditions related to cyprus . -we will discuss this file during the negotiations on joining . " -he added , " let me be clear . turkey's arm should not be twisted . this is unacceptable to us . " | 
