diff options
Diffstat (limited to 'training/dpmert')
32 files changed, 1945 insertions, 0 deletions
| diff --git a/training/dpmert/Makefile.am b/training/dpmert/Makefile.am new file mode 100644 index 00000000..b85bb275 --- /dev/null +++ b/training/dpmert/Makefile.am @@ -0,0 +1,27 @@ +bin_PROGRAMS = \ +  mr_dpmert_map \ +  mr_dpmert_reduce \ +  mr_dpmert_generate_mapper_input + +noinst_PROGRAMS = \ +  lo_test +TESTS = lo_test + +mr_dpmert_generate_mapper_input_SOURCES = mr_dpmert_generate_mapper_input.cc line_optimizer.cc +mr_dpmert_generate_mapper_input_LDADD = ../../decoder/libcdec.a ../../mteval/libmteval.a ../../utils/libutils.a + +# nbest2hg_SOURCES = nbest2hg.cc +# nbest2hg_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lfst + +mr_dpmert_map_SOURCES = mert_geometry.cc ces.cc error_surface.cc mr_dpmert_map.cc line_optimizer.cc ces.h error_surface.h line_optimizer.h mert_geometry.h +mr_dpmert_map_LDADD = ../../decoder/libcdec.a ../../mteval/libmteval.a ../../utils/libutils.a + +mr_dpmert_reduce_SOURCES = error_surface.cc ces.cc mr_dpmert_reduce.cc line_optimizer.cc mert_geometry.cc ces.h error_surface.h line_optimizer.h mert_geometry.h +mr_dpmert_reduce_LDADD = ../../decoder/libcdec.a ../../mteval/libmteval.a ../../utils/libutils.a + +lo_test_SOURCES = lo_test.cc ces.cc mert_geometry.cc error_surface.cc line_optimizer.cc ces.h error_surface.h line_optimizer.h mert_geometry.h +lo_test_LDADD = $(BOOST_UNIT_TEST_FRAMEWORK_LDFLAGS) $(BOOST_UNIT_TEST_FRAMEWORK_LIBS) ../../decoder/libcdec.a ../../mteval/libmteval.a ../../utils/libutils.a + +EXTRA_DIST = test_data dpmert.pl + +AM_CPPFLAGS = -DTEST_DATA=\"$(top_srcdir)/training/dpmert/test_data\" -DBOOST_TEST_DYN_LINK -W -Wall -Wno-sign-compare -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval diff --git a/training/dpmert/ces.cc b/training/dpmert/ces.cc new file mode 100644 index 00000000..157b2d17 --- /dev/null +++ b/training/dpmert/ces.cc @@ -0,0 +1,90 @@ +#include "ces.h" + +#include <vector> +#include <sstream> +#include 
<boost/shared_ptr.hpp> + +// TODO, if AER is to be optimized again, we will need this +// #include "aligner.h" +#include "lattice.h" +#include "mert_geometry.h" +#include "error_surface.h" +#include "ns.h" + +using namespace std; + +const bool minimize_segments = true;    // if adjacent segments have equal scores, merge them + +void ComputeErrorSurface(const SegmentEvaluator& ss, +                         const ConvexHull& ve, +                         ErrorSurface* env, +                         const EvaluationMetric* metric, +                         const Hypergraph& hg) { +  vector<WordID> prev_trans; +  const vector<boost::shared_ptr<MERTPoint> >& ienv = ve.GetSortedSegs(); +  env->resize(ienv.size()); +  SufficientStats prev_score; // defaults to 0 +  int j = 0; +  for (unsigned i = 0; i < ienv.size(); ++i) { +    const MERTPoint& seg = *ienv[i]; +    vector<WordID> trans; +#if 0 +    if (type == AER) { +      vector<bool> edges(hg.edges_.size(), false); +      seg.CollectEdgesUsed(&edges);  // get the set of edges in the viterbi +                                     // alignment +      ostringstream os; +      const string* psrc = ss.GetSource(); +      if (psrc == NULL) { +        cerr << "AER scoring in VEST requires source, but it is missing!\n"; +        abort(); +      } +      size_t pos = psrc->rfind(" ||| "); +      if (pos == string::npos) { +        cerr << "Malformed source for AER: expected |||\nINPUT: " << *psrc << endl; +        abort(); +      } +      Lattice src; +      Lattice ref; +      LatticeTools::ConvertTextOrPLF(psrc->substr(0, pos), &src); +      LatticeTools::ConvertTextOrPLF(psrc->substr(pos + 5), &ref); +      AlignerTools::WriteAlignment(src, ref, hg, &os, true, 0, &edges); +      string tstr = os.str(); +      TD::ConvertSentence(tstr.substr(tstr.rfind(" ||| ") + 5), &trans); +    } else { +#endif +      seg.ConstructTranslation(&trans); +    //} +    //cerr << "Scoring: " << TD::GetString(trans) << endl; +    if (trans == 
prev_trans) { +      if (!minimize_segments) { +        ErrorSegment& out = (*env)[j]; +        out.delta.fields.clear(); +        out.x = seg.x; +	++j; +      } +      //cerr << "Identical translation, skipping scoring\n"; +    } else { +      SufficientStats score; +      ss.Evaluate(trans, &score); +      // cerr << "score= " << score->ComputeScore() << "\n"; +      //string x1; score.Encode(&x1); cerr << "STATS: " << x1 << endl; +      const SufficientStats delta = score - prev_score; +      //string x2; delta.Encode(&x2); cerr << "DELTA: " << x2 << endl; +      //string xx; delta.Encode(&xx); cerr << xx << endl; +      prev_trans.swap(trans); +      prev_score = score; +      if ((!minimize_segments) || (!delta.IsAdditiveIdentity())) { +        ErrorSegment& out = (*env)[j]; +        out.delta = delta; +        out.x = seg.x; +        ++j; +      } +    } +  } +  // cerr << " In segments: " << ienv.size() << endl; +  // cerr << "Out segments: " << j << endl; +  assert(j > 0); +  env->resize(j); +} + diff --git a/training/dpmert/ces.h b/training/dpmert/ces.h new file mode 100644 index 00000000..e4fa2080 --- /dev/null +++ b/training/dpmert/ces.h @@ -0,0 +1,16 @@ +#ifndef _CES_H_ +#define _CES_H_ + +class ConvexHull; +class Hypergraph; +class SegmentEvaluator; +class ErrorSurface; +class EvaluationMetric; + +void ComputeErrorSurface(const SegmentEvaluator& ss, +                         const ConvexHull& convex_hull, +                         ErrorSurface* es, +                         const EvaluationMetric* metric, +                         const Hypergraph& hg); + +#endif diff --git a/training/dpmert/divide_refs.py b/training/dpmert/divide_refs.py new file mode 100755 index 00000000..b478f918 --- /dev/null +++ b/training/dpmert/divide_refs.py @@ -0,0 +1,15 @@ +#!/usr/bin/env python +import sys + +(numRefs, outPrefix) = sys.argv[1:] +numRefs = int(numRefs) + +outs = [open(outPrefix+str(i), "w") for i in range(numRefs)] + +i = 0 +for line in sys.stdin: +  
outs[i].write(line) +  i = (i + 1) % numRefs + +for out in outs: +  out.close() diff --git a/training/dpmert/dpmert.pl b/training/dpmert/dpmert.pl new file mode 100755 index 00000000..559420f5 --- /dev/null +++ b/training/dpmert/dpmert.pl @@ -0,0 +1,618 @@ +#!/usr/bin/env perl +use strict; +my @ORIG_ARGV=@ARGV; +use Cwd qw(getcwd); +my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../../environment", "$SCRIPT_DIR/../utils"; } + +# Skip local config (used for distributing jobs) if we're running in local-only mode +use LocalConfig; +use Getopt::Long; +use File::Basename qw(basename); +require "libcall.pl"; + +my $QSUB_CMD = qsub_args(mert_memory()); + +# Default settings +my $srcFile;  # deprecated +my $refFiles; # deprecated +my $default_jobs = env_default_jobs(); +my $bin_dir = $SCRIPT_DIR; +my $util_dir = "$SCRIPT_DIR/../utils"; +die "Bin directory $bin_dir missing/inaccessible" unless -d $bin_dir; +my $FAST_SCORE="$bin_dir/../../mteval/fast_score"; +die "Can't execute $FAST_SCORE" unless -x $FAST_SCORE; +my $MAPINPUT = "$bin_dir/mr_dpmert_generate_mapper_input"; +my $MAPPER = "$bin_dir/mr_dpmert_map"; +my $REDUCER = "$bin_dir/mr_dpmert_reduce"; +my $parallelize = "$util_dir/parallelize.pl"; +my $libcall = "$util_dir/libcall.pl"; +my $sentserver = "$util_dir/sentserver"; +my $sentclient = "$util_dir/sentclient"; +my $LocalConfig = "$SCRIPT_DIR/../../environment/LocalConfig.pm"; + +my $SCORER = $FAST_SCORE; +die "Can't find $MAPPER" unless -x $MAPPER; +my $cdec = "$bin_dir/../../decoder/cdec"; +die "Can't find decoder in $cdec" unless -x $cdec; +die "Can't find $parallelize" unless -x $parallelize; +die "Can't find $libcall" unless -e $libcall; +my $decoder = $cdec; +my $lines_per_mapper = 200; +my $rand_directions = 15; +my $iteration = 1; +my $best_weights; +my $max_iterations = 15; +my $optimization_iters = 6; +my $jobs = $default_jobs;   # number of decode nodes +my 
$pmem = "9g"; +my $disable_clean = 0; +my %seen_weights; +my $help = 0; +my $epsilon = 0.0001; +my $last_score = -10000000; +my $metric = "ibm_bleu"; +my $dir; +my $iniFile; +my $weights; +my $initialWeights; +my $bleu_weight=1; +my $use_make = 1;  # use make to parallelize line search +my $useqsub; +my $pass_suffix = ''; +my $devset; +# Process command-line options +if (GetOptions( +	"config=s" => \$iniFile, +	"weights=s" => \$initialWeights, +        "devset=s" => \$devset, +	"jobs=i" => \$jobs, +	"pass-suffix=s" => \$pass_suffix, +	"help" => \$help, +	"qsub" => \$useqsub, +	"iterations=i" => \$max_iterations, +	"pmem=s" => \$pmem, +	"random-directions=i" => \$rand_directions, +	"metric=s" => \$metric, +	"source-file=s" => \$srcFile, +	"output-dir=s" => \$dir, +) == 0 || @ARGV!=0 || $help) { +	print_help(); +	exit; +} + +if ($useqsub) { +  $use_make = 0; +  die "LocalEnvironment.pm does not have qsub configuration for this host. Cannot run with --qsub!\n" unless has_qsub(); +} + +my @missing_args = (); +if (defined $srcFile || defined $refFiles) { +  die <<EOT; + +  The options --ref-files and --source-file are no longer supported. +  Please specify the input file and its reference translations with +  --devset FILE + +EOT +} + +if (!defined $iniFile) { push @missing_args, "--config"; } +if (!defined $devset) { push @missing_args, "--devset"; } +if (!defined $initialWeights) { push @missing_args, "--weights"; } +die "Please specify missing arguments: " . join (', ', @missing_args) . 
"\nUse --help for more information.\n" if (@missing_args); + +if ($metric =~ /^(combi|ter)$/i) { +  $lines_per_mapper = 40; +} elsif ($metric =~ /^meteor$/i) { +  $lines_per_mapper = 2000;   # start up time is really high for METEOR +} + + +my $nodelist; +my $host =check_output("hostname"); chomp $host; +my $bleu; +my $interval_count = 0; +my $logfile; +my $projected_score; + +# used in sorting scores +my $DIR_FLAG = '-r'; +if ($metric =~ /^ter$|^aer$/i) { +  $DIR_FLAG = ''; +} + +unless ($dir){ +	$dir = "dpmert"; +} +unless ($dir =~ /^\//){  # convert relative path to absolute path +	my $basedir = check_output("pwd"); +	chomp $basedir; +	$dir = "$basedir/$dir"; +} + + +# Initializations and helper functions +srand; + +my @childpids = (); +my @cleanupcmds = (); + +sub cleanup { +	print STDERR "Cleanup...\n"; +	for my $pid (@childpids){ unchecked_call("kill $pid"); } +	for my $cmd (@cleanupcmds){ unchecked_call("$cmd"); } +	exit 1; +}; +# Always call cleanup, no matter how we exit +*CORE::GLOBAL::exit = sub{ cleanup(); };  +$SIG{INT} = "cleanup"; +$SIG{TERM} = "cleanup"; +$SIG{HUP} = "cleanup"; + +my $decoderBase = basename($decoder); chomp $decoderBase; +my $newIniFile = "$dir/$decoderBase.ini"; +my $inputFileName = "$dir/input"; +my $user = $ENV{"USER"}; + +# process ini file +-e $iniFile || die "Error: could not open $iniFile for reading\n"; + +sub dirsize { +    opendir ISEMPTY,$_[0]; +    return scalar(readdir(ISEMPTY))-1; +} +if (-e $dir) { +	# allow preexisting logfile, binaries, but not dist-dpmert.pl outputs +	die "ERROR: output directory $dir already exists (remove or use --output-dir dir)\n\n"; +} else { +	mkdir "$dir" or die "Can't mkdir $dir: $!"; +	mkdir "$dir/hgs" or die; +	mkdir "$dir/scripts" or die; +	print STDERR <<EOT; +	DECODER:          $decoder +	INI FILE:         $iniFile +	WORKING DIR:      $dir +	DEVSET:           $devset +	EVAL METRIC:      $metric +	MAX ITERATIONS:   $max_iterations +	PARALLEL JOBS:    $jobs +	HEAD NODE:        $host +	
PMEM (DECODING):  $pmem +	INITIAL WEIGHTS:  $initialWeights +EOT +} + +# Generate initial files and values +check_call("cp $iniFile $newIniFile"); +check_call("cp $initialWeights $dir/weights.0"); +$iniFile = $newIniFile; + +split_devset($devset, "$dir/dev.input.raw", "$dir/dev.refs"); +my $refs = "-r $dir/dev.refs"; +my $newsrc = "$dir/dev.input"; +enseg("$dir/dev.input.raw", $newsrc); +$srcFile = $newsrc; +my $devSize = 0; +open F, "<$srcFile" or die "Can't read $srcFile: $!"; +while(<F>) { $devSize++; } +close F; + +unless($best_weights){ $best_weights = $weights; } +unless($projected_score){ $projected_score = 0.0; } +$seen_weights{$weights} = 1; + +my $random_seed = int(time / 1000); +my $lastWeightsFile; +my $lastPScore = 0; +# main optimization loop +while (1){ +	print STDERR "\n\nITERATION $iteration\n==========\n"; + +	if ($iteration > $max_iterations){ +		print STDERR "\nREACHED STOPPING CRITERION: Maximum iterations\n"; +		last; +	} +	# iteration-specific files +	my $runFile="$dir/run.raw.$iteration"; +	my $onebestFile="$dir/1best.$iteration"; +	my $logdir="$dir/logs.$iteration"; +	my $decoderLog="$logdir/decoder.sentserver.log.$iteration"; +	my $scorerLog="$logdir/scorer.log.$iteration"; +	check_call("mkdir -p $logdir"); + + +	#decode +	print STDERR "RUNNING DECODER AT "; +	print STDERR unchecked_output("date"); +	my $im1 = $iteration - 1; +	my $weightsFile="$dir/weights.$im1"; +	my $decoder_cmd = "$decoder -c $iniFile --weights$pass_suffix $weightsFile -O $dir/hgs"; +	my $pcmd; +	if ($use_make) { +		$pcmd = "cat $srcFile | $parallelize --workdir $dir --use-fork -p $pmem -e $logdir -j $jobs --"; +	} else { +		$pcmd = "cat $srcFile | $parallelize --workdir $dir -p $pmem -e $logdir -j $jobs --"; +	} +	my $cmd = "$pcmd $decoder_cmd 2> $decoderLog 1> $runFile"; +	print STDERR "COMMAND:\n$cmd\n"; +	check_bash_call($cmd); +        my $num_hgs; +        my $num_topbest; +        my $retries = 0; +	while($retries < 5) { +	    $num_hgs = check_output("ls 
$dir/hgs/*.gz | wc -l"); +	    $num_topbest = check_output("wc -l < $runFile"); +	    print STDERR "NUMBER OF HGs: $num_hgs\n"; +	    print STDERR "NUMBER OF TOP-BEST HYPs: $num_topbest\n"; +	    if($devSize == $num_hgs && $devSize == $num_topbest) { +		last; +	    } else { +		print STDERR "Incorrect number of hypergraphs or topbest. Waiting for distributed filesystem and retrying...\n"; +		sleep(3); +	    } +	    $retries++; +	} +	die "Dev set contains $devSize sentences, but we don't have topbest and hypergraphs for all these! Decoder failure? Check $decoderLog\n" if ($devSize != $num_hgs || $devSize != $num_topbest); +	my $dec_score = check_output("cat $runFile | $SCORER $refs -m $metric"); +	chomp $dec_score; +	print STDERR "DECODER SCORE: $dec_score\n"; + +	# save space +	check_call("gzip -f $runFile"); +	check_call("gzip -f $decoderLog"); + +	# run optimizer +	print STDERR "RUNNING OPTIMIZER AT "; +	print STDERR unchecked_output("date"); +	my $mergeLog="$logdir/prune-merge.log.$iteration"; + +	my $score = 0; +	my $icc = 0; +	my $inweights="$dir/weights.$im1"; +	for (my $opt_iter=1; $opt_iter<$optimization_iters; $opt_iter++) { +		print STDERR "\nGENERATE OPTIMIZATION STRATEGY (OPT-ITERATION $opt_iter/$optimization_iters)\n"; +		print STDERR unchecked_output("date"); +		$icc++; +		$cmd="$MAPINPUT -w $inweights -r $dir/hgs -s $devSize -d $rand_directions > $dir/agenda.$im1-$opt_iter"; +		print STDERR "COMMAND:\n$cmd\n"; +		check_call($cmd); +		check_call("mkdir -p $dir/splag.$im1"); +		$cmd="split -a 3 -l $lines_per_mapper $dir/agenda.$im1-$opt_iter $dir/splag.$im1/mapinput."; +		print STDERR "COMMAND:\n$cmd\n"; +		check_call($cmd); +		opendir(DIR, "$dir/splag.$im1") or die "Can't open directory: $!"; +		my @shards = grep { /^mapinput\./ } readdir(DIR); +		closedir DIR; +		die "No shards!" 
unless scalar @shards > 0; +		my $joblist = ""; +		my $nmappers = 0; +		my @mapoutputs = (); +		@cleanupcmds = (); +		my %o2i = (); +		my $first_shard = 1; +		my $mkfile; # only used with makefiles +		my $mkfilename; +		if ($use_make) { +			$mkfilename = "$dir/splag.$im1/domap.mk"; +			open $mkfile, ">$mkfilename" or die "Couldn't write $mkfilename: $!"; +			print $mkfile "all: $dir/splag.$im1/map.done\n\n"; +		} +		my @mkouts = ();  # only used with makefiles +		for my $shard (@shards) { +			my $mapoutput = $shard; +			my $client_name = $shard; +			$client_name =~ s/mapinput.//; +			$client_name = "dpmert.$client_name"; +			$mapoutput =~ s/mapinput/mapoutput/; +			push @mapoutputs, "$dir/splag.$im1/$mapoutput"; +			$o2i{"$dir/splag.$im1/$mapoutput"} = "$dir/splag.$im1/$shard"; +			my $script = "$MAPPER -s $srcFile -m $metric $refs < $dir/splag.$im1/$shard | sort -t \$'\\t' -k 1 > $dir/splag.$im1/$mapoutput"; +			if ($use_make) { +				my $script_file = "$dir/scripts/map.$shard"; +				open F, ">$script_file" or die "Can't write $script_file: $!"; +				print F "#!/bin/bash\n"; +				print F "$script\n"; +				close F; +				my $output = "$dir/splag.$im1/$mapoutput"; +				push @mkouts, $output; +				chmod(0755, $script_file) or die "Can't chmod $script_file: $!"; +				if ($first_shard) { print STDERR "$script\n"; $first_shard=0; } +				print $mkfile "$output: $dir/splag.$im1/$shard\n\t$script_file\n\n"; +			} else { +				my $script_file = "$dir/scripts/map.$shard"; +				open F, ">$script_file" or die "Can't write $script_file: $!"; +				print F "$script\n"; +				close F; +				if ($first_shard) { print STDERR "$script\n"; $first_shard=0; } + +				$nmappers++; +				my $qcmd = "$QSUB_CMD -N $client_name -o /dev/null -e $logdir/$client_name.ER $script_file"; +				my $jobid = check_output("$qcmd"); +				chomp $jobid; +				$jobid =~ s/^(\d+)(.*?)$/\1/g; +				$jobid =~ s/^Your job (\d+) .*$/\1/; +		 	 	push(@cleanupcmds, "qdel $jobid 2> /dev/null"); +				print STDERR " $jobid"; 
+				if ($joblist == "") { $joblist = $jobid; } +				else {$joblist = $joblist . "\|" . $jobid; } +			} +		} +		if ($use_make) { +			print $mkfile "$dir/splag.$im1/map.done: @mkouts\n\ttouch $dir/splag.$im1/map.done\n\n"; +			close $mkfile; +			my $mcmd = "make -j $jobs -f $mkfilename"; +			print STDERR "\nExecuting: $mcmd\n"; +			check_call($mcmd); +		} else { +			print STDERR "\nLaunched $nmappers mappers.\n"; +      			sleep 8; +			print STDERR "Waiting for mappers to complete...\n"; +			while ($nmappers > 0) { +			  sleep 5; +			  my @livejobs = grep(/$joblist/, split(/\n/, unchecked_output("qstat | grep -v ' C '"))); +			  $nmappers = scalar @livejobs; +			} +			print STDERR "All mappers complete.\n"; +		} +		my $tol = 0; +		my $til = 0; +		for my $mo (@mapoutputs) { +		  my $olines = get_lines($mo); +		  my $ilines = get_lines($o2i{$mo}); +		  $tol += $olines; +		  $til += $ilines; +		  die "$mo: output lines ($olines) doesn't match input lines ($ilines)" unless $olines==$ilines; +		} +		print STDERR "Results for $tol/$til lines\n"; +		print STDERR "\nSORTING AND RUNNING VEST REDUCER\n"; +		print STDERR unchecked_output("date"); +		$cmd="sort -t \$'\\t' -k 1 @mapoutputs | $REDUCER -m $metric > $dir/redoutput.$im1"; +		print STDERR "COMMAND:\n$cmd\n"; +		check_bash_call($cmd); +		$cmd="sort -nk3 $DIR_FLAG '-t|' $dir/redoutput.$im1 | head -1"; +		# sort returns failure even when it doesn't fail for some reason +		my $best=unchecked_output("$cmd"); chomp $best; +		print STDERR "$best\n"; +		my ($oa, $x, $xscore) = split /\|/, $best; +		$score = $xscore; +		print STDERR "PROJECTED SCORE: $score\n"; +		if (abs($x) < $epsilon) { +			print STDERR "\nOPTIMIZER: no score improvement: abs($x) < $epsilon\n"; +			last; +		} +                my $psd = $score - $last_score; +                $last_score = $score; +		if (abs($psd) < $epsilon) { +			print STDERR "\nOPTIMIZER: no score improvement: abs($psd) < $epsilon\n"; +			last; +		} +		my ($origin, $axis) = split /\s+/, 
$oa; + +		my %ori = convert($origin); +		my %axi = convert($axis); + +		my $finalFile="$dir/weights.$im1-$opt_iter"; +		open W, ">$finalFile" or die "Can't write: $finalFile: $!"; +                my $norm = 0; +		for my $k (sort keys %ori) { +			my $dd = $ori{$k} + $axi{$k} * $x; +                        $norm += $dd * $dd; +		} +                $norm = sqrt($norm); +		$norm = 1; +		for my $k (sort keys %ori) { +			my $v = ($ori{$k} + $axi{$k} * $x) / $norm; +			print W "$k $v\n"; +		} +		check_call("rm $dir/splag.$im1/*"); +		$inweights = $finalFile; +	} +	$lastWeightsFile = "$dir/weights.$iteration"; +	check_call("cp $inweights $lastWeightsFile"); +	if ($icc < 2) { +		print STDERR "\nREACHED STOPPING CRITERION: score change too little\n"; +		last; +	} +	$lastPScore = $score; +	$iteration++; +	print STDERR "\n==========\n"; +} + +check_call("cp $lastWeightsFile $dir/weights.final"); +print STDERR "\nFINAL WEIGHTS: $dir/weights.final\n(Use -w <this file> with the decoder)\n\n"; +print STDOUT "$dir/weights.final\n"; +exit 0; + + +sub get_lines { +  my $fn = shift @_; +  open FL, "<$fn" or die "Couldn't read $fn: $!"; +  my $lc = 0; +  while(<FL>) { $lc++; } +  return $lc; +} + +sub read_weights_file { +  my ($file) = @_; +  open F, "<$file" or die "Couldn't read $file: $!"; +  my @r = (); +  my $pm = -1; +  while(<F>) { +    next if /^#/; +    next if /^\s*$/; +    chomp; +    if (/^(.+)\s+(.+)$/) { +      my $m = $1; +      my $w = $2; +      die "Weights out of order: $m <= $pm" unless $m > $pm; +      push @r, $w; +    } else { +      warn "Unexpected feature name in weight file: $_"; +    } +  } +  close F; +  return join ' ', @r; +} + +sub update_weights_file { +  my ($neww, $rfn, $rpts) = @_; +  my @feats = @$rfn; +  my @pts = @$rpts; +  my $num_feats = scalar @feats; +  my $num_pts = scalar @pts; +  die "$num_feats (num_feats) != $num_pts (num_pts)" unless $num_feats == $num_pts; +  open G, ">$neww" or die; +  for (my $i = 0; $i < $num_feats; $i++) { +    my 
$f = $feats[$i]; +    my $lambda = $pts[$i]; +    print G "$f $lambda\n"; +  } +  close G; +} + +sub enseg { +	my $src = shift; +	my $newsrc = shift; +	open(SRC, $src); +	open(NEWSRC, ">$newsrc"); +	my $i=0; +	while (my $line=<SRC>){ +		chomp $line; +		if ($line =~ /^\s*<seg/i) { +		    if($line =~ /id="[0-9]+"/) { +			print NEWSRC "$line\n"; +		    } else { +			die "When using segments with pre-generated <seg> tags, you must include a zero-based id attribute"; +		    } +		} else { +			print NEWSRC "<seg id=\"$i\">$line</seg>\n"; +		} +		$i++; +	} +	close SRC; +	close NEWSRC; +} + +sub print_help { + +	my $executable = basename($0); chomp $executable; +	print << "Help"; + +Usage: $executable [options] <ini file> + +	$executable [options] +		Runs a complete MERT optimization. Required options are --weights, +		--devset, and --config. + +Options: + +	--config <file>   [-c <file>] +		The decoder configuration file. + +	--devset <file>   [-d <file>] +		The source *and* references for the development set. + +	--weights <file>  [-w <file>] +		A file specifying initial feature weights.  The format is +		FeatureName_1 value1 +		FeatureName_2 value2 +		**All and only the weights listed in <file> will be optimized!** + +	--metric <name> +		Metric to optimize. +		Example values: IBM_BLEU, NIST_BLEU, Koehn_BLEU, TER, Combi + +	--iterations <M> +		Maximum number of iterations to run.  If not specified, defaults +		to 10. + +	--pass-suffix <S> +		If the decoder is doing multi-pass decoding, the pass suffix "2", +		"3", etc., is used to control what iteration of weights is set. + +	--rand-directions <num> +		MERT will attempt to optimize along all of the principle directions, +		set this parameter to explore other directions. Defaults to 5. + +	--output-dir <dir> +		Directory for intermediate and output files. + +	--help +		Print this message and exit. + +Job control options: + +	--jobs <I> +		Number of decoder processes to run in parallel. 
[default=$default_jobs] + +	--qsub +		Use qsub to run jobs in parallel (qsub must be configured in +		environment/LocalEnvironment.pm) + +	--pmem <N> +		Amount of physical memory requested for parallel decoding jobs +		(used with qsub requests only) + +Help +} + +sub convert { +  my ($str) = @_; +  my @ps = split /;/, $str; +  my %dict = (); +  for my $p (@ps) { +    my ($k, $v) = split /=/, $p; +    $dict{$k} = $v; +  } +  return %dict; +} + + + +sub cmdline { +    return join ' ',($0,@ORIG_ARGV); +} + +#buggy: last arg gets quoted sometimes? +my $is_shell_special=qr{[ \t\n\\><|&;"'`~*?{}$!()]}; +my $shell_escape_in_quote=qr{[\\"\$`!]}; + +sub escape_shell { +    my ($arg)=@_; +    return undef unless defined $arg; +    if ($arg =~ /$is_shell_special/) { +        $arg =~ s/($shell_escape_in_quote)/\\$1/g; +        return "\"$arg\""; +    } +    return $arg; +} + +sub escaped_shell_args { +    return map {local $_=$_;chomp;escape_shell($_)} @_; +} + +sub escaped_shell_args_str { +    return join ' ',&escaped_shell_args(@_); +} + +sub escaped_cmdline { +    return "$0 ".&escaped_shell_args_str(@ORIG_ARGV); +} + +sub split_devset { +  my ($infile, $outsrc, $outref) = @_; +  open F, "<$infile" or die "Can't read $infile: $!"; +  open S, ">$outsrc" or die "Can't write $outsrc: $!"; +  open R, ">$outref" or die "Can't write $outref: $!"; +  while(<F>) { +    chomp; +    my ($src, @refs) = split /\s*\|\|\|\s*/; +    die "Malformed devset line: $_\n" unless scalar @refs > 0; +    print S "$src\n"; +    print R join(' ||| ', @refs) . 
"\n"; +  } +  close R; +  close S; +  close F; +} + diff --git a/training/dpmert/error_surface.cc b/training/dpmert/error_surface.cc new file mode 100644 index 00000000..515b67f8 --- /dev/null +++ b/training/dpmert/error_surface.cc @@ -0,0 +1,42 @@ +#include "error_surface.h" + +#include <cassert> +#include <sstream> + +using namespace std; + +ErrorSurface::~ErrorSurface() {} + +void ErrorSurface::Serialize(std::string* out) const { +  const int segments = this->size(); +  ostringstream os(ios::binary); +  os.write((const char*)&segments,sizeof(segments)); +  for (int i = 0; i < segments; ++i) { +    const ErrorSegment& cur = (*this)[i]; +    string senc; +    cur.delta.Encode(&senc); +    assert(senc.size() < 1024); +    unsigned char len = senc.size(); +    os.write((const char*)&cur.x, sizeof(cur.x)); +    os.write((const char*)&len, sizeof(len)); +    os.write((const char*)&senc[0], len); +  } +  *out = os.str(); +} + +void ErrorSurface::Deserialize(const std::string& in) { +  istringstream is(in, ios::binary); +  int segments; +  is.read((char*)&segments, sizeof(segments)); +  this->resize(segments); +  for (int i = 0; i < segments; ++i) { +    ErrorSegment& cur = (*this)[i]; +    unsigned char len; +    is.read((char*)&cur.x, sizeof(cur.x)); +    is.read((char*)&len, sizeof(len)); +    string senc(len, '\0'); assert(senc.size() == len); +    is.read((char*)&senc[0], len); +    cur.delta = SufficientStats(senc); +  } +} + diff --git a/training/dpmert/error_surface.h b/training/dpmert/error_surface.h new file mode 100644 index 00000000..bb65847b --- /dev/null +++ b/training/dpmert/error_surface.h @@ -0,0 +1,24 @@ +#ifndef _ERROR_SURFACE_H_ +#define _ERROR_SURFACE_H_ + +#include <vector> +#include <string> + +#include "ns.h" + +class Score; + +struct ErrorSegment { +  double x; +  SufficientStats delta; +  ErrorSegment() : x(0), delta() {} +}; + +class ErrorSurface : public std::vector<ErrorSegment> { + public: +  ~ErrorSurface(); +  void Serialize(std::string* 
out) const; +  void Deserialize(const std::string& in); +}; + +#endif diff --git a/training/dpmert/line_mediator.pl b/training/dpmert/line_mediator.pl new file mode 100755 index 00000000..bc2bb24c --- /dev/null +++ b/training/dpmert/line_mediator.pl @@ -0,0 +1,116 @@ +#!/usr/bin/perl -w +#hooks up two processes, 2nd of which has one line of output per line of input, expected by the first, which starts off the communication + +# if you don't know how to fork/exec in a C program, this could be helpful under limited cirmustances (would be ok to liaise with sentserver) + +#WARNING: because it waits for the result from command 2 after sending every line, and especially if command 1 does the same, using sentserver as command 2 won't actually buy you any real parallelism. + +use strict; +use IPC::Open2; +use POSIX qw(pipe dup2 STDIN_FILENO STDOUT_FILENO); + +my $quiet=!$ENV{DEBUG}; +$quiet=1 if $ENV{QUIET}; +sub info { +    local $,=' '; +    print STDERR @_ unless $quiet; +} + +my $mode='CROSS'; +my $ser='DIRECT'; +$mode='PIPE' if $ENV{PIPE}; +$mode='SNAKE' if $ENV{SNAKE}; +$mode='CROSS' if $ENV{CROSS}; +$ser='SERIAL' if $ENV{SERIAL}; +$ser='DIRECT' if $ENV{DIRECT}; +$ser='SERIAL' if $mode eq 'SNAKE'; +info("mode: $mode\n"); +info("connection: $ser\n"); + + +my @c1; +if (scalar @ARGV) { +    do { +        push @c1,shift +    } while scalar @ARGV && $c1[$#c1] ne '--'; +} +pop @c1; +my @c2=@ARGV; +@ARGV=(); +(scalar @c1 && scalar @c2) || die qq{ +usage: $0 cmd1 args -- cmd2 args +all options are environment variables. +DEBUG=1 env var enables debugging output. +CROSS=1 hooks up two processes, 2nd of which has one line of output per line of input, expected by the first, which starts off the communication.  crosses stdin/stderr of cmd1 and cmd2 line by line (both must flush on newline and output.  cmd1 initiates the conversation (sends the first line).    default: attempts to cross stdin/stdout of c1 and c2 directly (via two unidirectional posix pipes created before fork). 
+SERIAL=1: (no parallelism possible) but lines exchanged are logged if DEBUG. +if SNAKE then stdin -> c1 -> c2 -> c1 -> stdout. +if PIPE then stdin -> c1 -> c2 -> stdout (same as shell c1|c2, but with SERIAL you can see the intermediate in real time; you could do similar with c1 | tee /dev/fd/2 |c2. +DIRECT=1 (default) will override SERIAL=1. +CROSS=1 (default) will override SNAKE or PIPE. +}; + +info("1 cmd:",@c1,"\n"); +info("2 cmd:",@c2,"\n"); + +sub lineto { +    select $_[0]; +    $|=1; +    shift; +    print @_; +} + +if ($ser eq 'SERIAL') { +    my ($R1,$W1,$R2,$W2); +    my $c1p=open2($R1,$W1,@c1); # Open2 R W backward from Open3. +    my $c2p=open2($R2,$W2,@c2); +    if ($mode eq 'CROSS') { +        while(<$R1>) { +            info("1:",$_); +            lineto($W2,$_); +            last unless defined ($_=<$R2>); +            info("1|2:",$_); +            lineto($W1,$_); +        } +    } else { +        my $snake=$mode eq 'SNAKE'; +        while(<STDIN>) { +            info("IN:",$_); +            lineto($W1,$_); +            last unless defined ($_=<$R1>); +            info("IN|1:",$_); +            lineto($W2,$_); +            last unless defined ($_=<$R2>); +            info("IN|1|2:",$_); +            if ($snake) { +                lineto($W1,$_); +                last unless defined ($_=<$R1>); +                info("IN|1|2|1:",$_); +            } +            lineto(*STDOUT,$_); +        } +    } +} else { +    info("DIRECT mode\n"); +    my @rw1=POSIX::pipe(); +    my @rw2=POSIX::pipe(); +    my $pid=undef; +    $SIG{CHLD} = sub { wait }; +    while (not defined ($pid=fork())) { +        sleep 1; +    } +    my $pipe = $mode eq 'PIPE'; +    unless ($pipe) { +        POSIX::close(STDOUT_FILENO); +        POSIX::close(STDIN_FILENO); +    } +    if ($pid) { +        POSIX::dup2($rw1[1],STDOUT_FILENO); +        POSIX::dup2($rw2[0],STDIN_FILENO) unless $pipe; +        exec @c1; +    } else { +        POSIX::dup2($rw2[1],STDOUT_FILENO) unless $pipe; +   
     POSIX::dup2($rw1[0],STDIN_FILENO); +        exec @c2; +    } +    while (wait()!=-1) {} +} diff --git a/training/dpmert/line_optimizer.cc b/training/dpmert/line_optimizer.cc new file mode 100644 index 00000000..9cf33502 --- /dev/null +++ b/training/dpmert/line_optimizer.cc @@ -0,0 +1,114 @@ +#include "line_optimizer.h" + +#include <limits> +#include <algorithm> + +#include "sparse_vector.h" +#include "ns.h" + +using namespace std; + +typedef ErrorSurface::const_iterator ErrorIter; + +// sort by increasing x-ints +struct IntervalComp { +  bool operator() (const ErrorIter& a, const ErrorIter& b) const { +    return a->x < b->x; +  } +}; + +double LineOptimizer::LineOptimize( +    const EvaluationMetric* metric, +    const vector<ErrorSurface>& surfaces, +    const LineOptimizer::ScoreType type, +    float* best_score, +    const double epsilon) { +  // cerr << "MIN=" << MINIMIZE_SCORE << " MAX=" << MAXIMIZE_SCORE << "  MINE=" << type << endl; +  vector<ErrorIter> all_ints; +  for (vector<ErrorSurface>::const_iterator i = surfaces.begin(); +       i != surfaces.end(); ++i) { +    const ErrorSurface& surface = *i; +    for (ErrorIter j = surface.begin(); j != surface.end(); ++j) +      all_ints.push_back(j); +  } +  sort(all_ints.begin(), all_ints.end(), IntervalComp()); +  double last_boundary = all_ints.front()->x; +  SufficientStats acc; +  float& cur_best_score = *best_score; +  cur_best_score = (type == MAXIMIZE_SCORE ? 
+    -numeric_limits<float>::max() : numeric_limits<float>::max()); +  bool left_edge = true; +  double pos = numeric_limits<double>::quiet_NaN(); +  for (vector<ErrorIter>::iterator i = all_ints.begin(); +       i != all_ints.end(); ++i) { +    const ErrorSegment& seg = **i; +    if (seg.x - last_boundary > epsilon) { +      float sco = metric->ComputeScore(acc); +      if ((type == MAXIMIZE_SCORE && sco > cur_best_score) || +          (type == MINIMIZE_SCORE && sco < cur_best_score) ) { +        cur_best_score = sco; +	if (left_edge) { +	  pos = seg.x - 0.1; +	  left_edge = false; +	} else { +	  pos = last_boundary + (seg.x - last_boundary) / 2; +	} +	//cerr << "NEW BEST: " << pos << "  (score=" << cur_best_score << ")\n"; +      } +      // string xx = metric->DetailedScore(acc); cerr << "---- " << xx; +#undef SHOW_ERROR_SURFACES +#ifdef SHOW_ERROR_SURFACES +      cerr << "x=" << seg.x << "\ts=" << sco << "\n"; +#endif +      last_boundary = seg.x; +    } +    // cerr << "x-boundary=" << seg.x << "\n"; +    //string x2; acc.Encode(&x2); cerr << "   ACC: " << x2 << endl; +    //string x1; seg.delta.Encode(&x1); cerr << " DELTA: " << x1 << endl; +    acc += seg.delta; +  } +  float sco = metric->ComputeScore(acc); +  if ((type == MAXIMIZE_SCORE && sco > cur_best_score) || +      (type == MINIMIZE_SCORE && sco < cur_best_score) ) { +    cur_best_score = sco; +    if (left_edge) { +      pos = 0; +    } else { +      pos = last_boundary + 1000.0; +    } +  } +  return pos; +} + +void LineOptimizer::RandomUnitVector(const vector<int>& features_to_optimize, +                                     SparseVector<double>* axis, +                                     RandomNumberGenerator<boost::mt19937>* rng) { +  axis->clear(); +  for (int i = 0; i < features_to_optimize.size(); ++i) +    axis->set_value(features_to_optimize[i], rng->NextNormal(0.0,1.0)); +  (*axis) /= axis->l2norm(); +} + +void LineOptimizer::CreateOptimizationDirections( +     const vector<int>& 
// Collection of static helpers for the line-search step: generating search
// directions and picking the best point along one direction.
struct LineOptimizer {

  // use MINIMIZE_SCORE for things like TER, WER
  // MAXIMIZE_SCORE for things like BLEU
  enum ScoreType { MAXIMIZE_SCORE, MINIMIZE_SCORE };

  // merge all the error surfaces together into a global
  // error surface and find (the middle of) the best segment.
  // Returns the chosen position along the search line and stores the score
  // of that segment in *best_score.  Segment boundaries that are no more than
  // epsilon past the previously evaluated boundary are not scored separately.
  static double LineOptimize(
     const EvaluationMetric* metric,
     const std::vector<ErrorSurface>& envs,
     const LineOptimizer::ScoreType type,
     float* best_score,
     const double epsilon = 1.0/65536.0);

  // return a random vector of length 1 where all dimensions
  // not listed in dimensions will be 0.
  // The previous contents of *axis are discarded.
  static void RandomUnitVector(const std::vector<int>& dimensions,
                               SparseVector<double>* axis,
                               RandomNumberGenerator<boost::mt19937>* rng);

  // generate a list of directions to optimize.  *dirs is cleared first.
  // If include_primary is true (the default) the list starts with one
  // axis-aligned vector (value 1 in a single dimension) per entry of primary;
  // it is then followed by additional_random_directions random unit vectors
  // over those same dimensions.  All vectors will be length 1.
  static void CreateOptimizationDirections(
     const std::vector<int>& primary,
     int additional_random_directions,
     RandomNumberGenerator<boost::mt19937>* rng,
     std::vector<SparseVector<double> >* dirs
     , bool include_primary=true
    );

};
the embassy was closed seven weeks ago after what was described as a specific threat of a terrorist attack ."; +const char* ref31 = "australia to reopen embassy in manila"; +const char* ref32 = "( afp report from manila , january 2 ) australia reopened its embassy in the philippines today . seven weeks ago , the embassy was shut down due to so - called confirmed terrorist attack threats ."; +const char* ref41 = "australia to re - open its embassy to manila"; +const char* ref42 = "( afp , manila , thursday ) australia reopens its embassy to manila , which was closed for the so - called \" clear \" threat of terrorist attack 7 weeks ago ."; + +BOOST_AUTO_TEST_CASE( TestCheckNaN) { +  double x = 0; +  double y = 0; +  double z = x / y; +  BOOST_CHECK_EQUAL(true, std::isnan(z)); +} + +BOOST_AUTO_TEST_CASE(TestConvexHull) { +  boost::shared_ptr<MERTPoint> a1(new MERTPoint(-1, 0)); +  boost::shared_ptr<MERTPoint> b1(new MERTPoint(1, 0)); +  boost::shared_ptr<MERTPoint> a2(new MERTPoint(-1, 1)); +  boost::shared_ptr<MERTPoint> b2(new MERTPoint(1, -1)); +  vector<boost::shared_ptr<MERTPoint> > sa; sa.push_back(a1); sa.push_back(b1); +  vector<boost::shared_ptr<MERTPoint> > sb; sb.push_back(a2); sb.push_back(b2); +  ConvexHull a(sa); +  cerr << a << endl; +  ConvexHull b(sb); +  ConvexHull c = a; +  c *= b; +  cerr << a << " (*) " << b << " = " << c << endl; +  BOOST_CHECK_EQUAL(3, c.size()); +} + +BOOST_AUTO_TEST_CASE(TestConvexHullInside) { +  const string json = "{\"rules\":[1,\"[X] ||| a\",2,\"[X] ||| A [1]\",3,\"[X] ||| c\",4,\"[X] ||| C [1]\",5,\"[X] ||| [1] B [2]\",6,\"[X] ||| [1] b [2]\",7,\"[X] ||| X [1]\",8,\"[X] ||| Z 
[1]\"],\"features\":[\"f1\",\"f2\",\"Feature_1\",\"Feature_0\",\"Model_0\",\"Model_1\",\"Model_2\",\"Model_3\",\"Model_4\",\"Model_5\",\"Model_6\",\"Model_7\"],\"edges\":[{\"tail\":[],\"feats\":[],\"rule\":1}],\"node\":{\"in_edges\":[0]},\"edges\":[{\"tail\":[0],\"feats\":[0,-0.8,1,-0.1],\"rule\":2}],\"node\":{\"in_edges\":[1]},\"edges\":[{\"tail\":[],\"feats\":[1,-1],\"rule\":3}],\"node\":{\"in_edges\":[2]},\"edges\":[{\"tail\":[2],\"feats\":[0,-0.2,1,-0.1],\"rule\":4}],\"node\":{\"in_edges\":[3]},\"edges\":[{\"tail\":[1,3],\"feats\":[0,-1.2,1,-0.2],\"rule\":5},{\"tail\":[1,3],\"feats\":[0,-0.5,1,-1.3],\"rule\":6}],\"node\":{\"in_edges\":[4,5]},\"edges\":[{\"tail\":[4],\"feats\":[0,-0.5,1,-0.8],\"rule\":7},{\"tail\":[4],\"feats\":[0,-0.7,1,-0.9],\"rule\":8}],\"node\":{\"in_edges\":[6,7]}}"; +  Hypergraph hg; +  istringstream instr(json); +  HypergraphIO::ReadFromJSON(&instr, &hg); +  SparseVector<double> wts; +  wts.set_value(FD::Convert("f1"), 0.4); +  wts.set_value(FD::Convert("f2"), 1.0); +  hg.Reweight(wts); +  vector<pair<vector<WordID>, prob_t> > list; +  std::vector<SparseVector<double> > features; +  KBest::KBestDerivations<vector<WordID>, ESentenceTraversal> kbest(hg, 10); +  for (int i = 0; i < 10; ++i) { +    const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal>::Derivation* d = +      kbest.LazyKthBest(hg.nodes_.size() - 1, i); +    if (!d) break; +    cerr << log(d->score) << " ||| " << TD::GetString(d->yield) << " ||| " << d->feature_values << endl; +  } +  SparseVector<double> dir; dir.set_value(FD::Convert("f1"), 1.0); +  ConvexHullWeightFunction wf(wts, dir); +  ConvexHull env = Inside<ConvexHull, ConvexHullWeightFunction>(hg, NULL, wf); +  cerr << env << endl; +  const vector<boost::shared_ptr<MERTPoint> >& segs = env.GetSortedSegs(); +  dir *= segs[1]->x; +  wts += dir; +  hg.Reweight(wts); +  KBest::KBestDerivations<vector<WordID>, ESentenceTraversal> kbest2(hg, 10); +  for (int i = 0; i < 10; ++i) { +    const 
KBest::KBestDerivations<vector<WordID>, ESentenceTraversal>::Derivation* d = +      kbest2.LazyKthBest(hg.nodes_.size() - 1, i); +    if (!d) break; +    cerr << log(d->score) << " ||| " << TD::GetString(d->yield) << " ||| " << d->feature_values << endl; +  } +  for (unsigned i = 0; i < segs.size(); ++i) { +    cerr << "seg=" << i << endl; +    vector<WordID> trans; +    segs[i]->ConstructTranslation(&trans); +    cerr << TD::GetString(trans) << endl; +  } +} + +BOOST_AUTO_TEST_CASE( TestS1) { +  int fPhraseModel_0 = FD::Convert("PhraseModel_0"); +  int fPhraseModel_1 = FD::Convert("PhraseModel_1"); +  int fPhraseModel_2 = FD::Convert("PhraseModel_2"); +  int fLanguageModel = FD::Convert("LanguageModel"); +  int fWordPenalty = FD::Convert("WordPenalty"); +  int fPassThrough = FD::Convert("PassThrough"); +  SparseVector<double> wts; +  wts.set_value(fWordPenalty, 4.25); +  wts.set_value(fLanguageModel, -1.1165); +  wts.set_value(fPhraseModel_0, -0.96); +  wts.set_value(fPhraseModel_1, -0.65); +  wts.set_value(fPhraseModel_2, -0.77); +  wts.set_value(fPassThrough, -10.0); + +  vector<int> to_optimize; +  to_optimize.push_back(fWordPenalty); +  to_optimize.push_back(fLanguageModel); +  to_optimize.push_back(fPhraseModel_0); +  to_optimize.push_back(fPhraseModel_1); +  to_optimize.push_back(fPhraseModel_2); + +  std::string path(boost::unit_test::framework::master_test_suite().argc == 2 ? 
boost::unit_test::framework::master_test_suite().argv[1] : TEST_DATA); + +  Hypergraph hg; +  ReadFile rf(path + "/0.json.gz"); +  HypergraphIO::ReadFromJSON(rf.stream(), &hg); +  hg.Reweight(wts); + +  Hypergraph hg2; +  ReadFile rf2(path + "/1.json.gz"); +  HypergraphIO::ReadFromJSON(rf2.stream(), &hg2); +  hg2.Reweight(wts); + +  vector<vector<WordID> > refs1(4); +  TD::ConvertSentence(ref11, &refs1[0]); +  TD::ConvertSentence(ref21, &refs1[1]); +  TD::ConvertSentence(ref31, &refs1[2]); +  TD::ConvertSentence(ref41, &refs1[3]); +  vector<vector<WordID> > refs2(4); +  TD::ConvertSentence(ref12, &refs2[0]); +  TD::ConvertSentence(ref22, &refs2[1]); +  TD::ConvertSentence(ref32, &refs2[2]); +  TD::ConvertSentence(ref42, &refs2[3]); +  vector<ConvexHull> envs(2); + +  RandomNumberGenerator<boost::mt19937> rng; + +  vector<SparseVector<double> > axes; // directions to search +  LineOptimizer::CreateOptimizationDirections( +     to_optimize, +     10, +     &rng, +     &axes); +  assert(axes.size() == 10 + to_optimize.size()); +  for (unsigned i = 0; i < axes.size(); ++i) +    cerr << axes[i] << endl; +  const SparseVector<double>& axis = axes[0]; + +  cerr << "Computing Viterbi envelope using inside algorithm...\n"; +  cerr << "axis: " << axis << endl; +  clock_t t_start=clock(); +  ConvexHullWeightFunction wf(wts, axis);  // wts = starting point, axis = search direction +  envs[0] = Inside<ConvexHull, ConvexHullWeightFunction>(hg, NULL, wf); +  envs[1] = Inside<ConvexHull, ConvexHullWeightFunction>(hg2, NULL, wf); + +  vector<ErrorSurface> es(2); +  EvaluationMetric* metric = EvaluationMetric::Instance("IBM_BLEU"); +  boost::shared_ptr<SegmentEvaluator> scorer1 = metric->CreateSegmentEvaluator(refs1); +  boost::shared_ptr<SegmentEvaluator> scorer2 = metric->CreateSegmentEvaluator(refs2); +  ComputeErrorSurface(*scorer1, envs[0], &es[0], metric, hg); +  ComputeErrorSurface(*scorer2, envs[1], &es[1], metric, hg2); +  cerr << envs[0].size() << " " << envs[1].size() << 
endl; +  cerr << es[0].size() << " " << es[1].size() << endl; +  envs.clear(); +  clock_t t_env=clock(); +  float score; +  double m = LineOptimizer::LineOptimize(metric,es, LineOptimizer::MAXIMIZE_SCORE, &score); +  clock_t t_opt=clock(); +  cerr << "line optimizer returned: " << m << " (SCORE=" << score << ")\n"; +  BOOST_CHECK_CLOSE(0.48719698, score, 1e-5); +  SparseVector<double> res = axis; +  res *= m; +  res += wts; +  cerr << "res: " << res << endl; +  cerr << "ENVELOPE PROCESSING=" << (static_cast<double>(t_env - t_start) / 1000.0) << endl; +  cerr << "  LINE OPTIMIZATION=" << (static_cast<double>(t_opt - t_env) / 1000.0) << endl; +  hg.Reweight(res); +  hg2.Reweight(res); +  vector<WordID> t1,t2; +  ViterbiESentence(hg, &t1); +  ViterbiESentence(hg2, &t2); +  cerr << TD::GetString(t1) << endl; +  cerr << TD::GetString(t2) << endl; +} + +BOOST_AUTO_TEST_CASE(TestZeroOrigin) { +  const string json = "{\"rules\":[1,\"[X7] ||| blA ||| without ||| LHSProb=3.92173 LexE2F=2.90799 LexF2E=1.85003 GenerativeProb=10.5381 RulePenalty=1 XFE=2.77259 XEF=0.441833 LabelledEF=2.63906 LabelledFE=4.96981 LogRuleCount=0.693147\",2,\"[X7] ||| blA ||| except ||| LHSProb=4.92173 LexE2F=3.90799 LexF2E=1.85003 GenerativeProb=11.5381 RulePenalty=1 XFE=2.77259 XEF=1.44183 LabelledEF=2.63906 LabelledFE=4.96981 LogRuleCount=1.69315\",3,\"[S] ||| [X7,1] ||| [1] ||| GlueTop=1\",4,\"[X28] ||| EnwAn ||| title ||| LHSProb=3.96802 LexE2F=2.22462 LexF2E=1.83258 GenerativeProb=10.0863 RulePenalty=1 XFE=0 XEF=1.20397 LabelledEF=1.20397 LabelledFE=-1.98341e-08 LogRuleCount=1.09861\",5,\"[X0] ||| EnwAn ||| funny ||| LHSProb=3.98479 LexE2F=1.79176 LexF2E=3.21888 GenerativeProb=11.1681 RulePenalty=1 XFE=0 XEF=2.30259 LabelledEF=2.30259 LabelledFE=0 LogRuleCount=0 SingletonRule=1\",6,\"[X8] ||| [X7,1] EnwAn ||| entitled [1] ||| LHSProb=3.82533 LexE2F=3.21888 LexF2E=2.52573 GenerativeProb=11.3276 RulePenalty=1 XFE=1.20397 XEF=1.20397 LabelledEF=2.30259 LabelledFE=2.30259 LogRuleCount=0 
SingletonRule=1\",7,\"[S] ||| [S,1] [X28,2] ||| [1] [2] ||| Glue=1\",8,\"[S] ||| [S,1] [X0,2] ||| [1] [2] ||| Glue=1\",9,\"[S] ||| [X8,1] ||| [1] ||| GlueTop=1\",10,\"[Goal] ||| [S,1] ||| [1]\"],\"features\":[\"PassThrough\",\"Glue\",\"GlueTop\",\"LanguageModel\",\"WordPenalty\",\"LHSProb\",\"LexE2F\",\"LexF2E\",\"GenerativeProb\",\"RulePenalty\",\"XFE\",\"XEF\",\"LabelledEF\",\"LabelledFE\",\"LogRuleCount\",\"SingletonRule\"],\"edges\":[{\"tail\":[],\"spans\":[0,1,-1,-1],\"feats\":[5,3.92173,6,2.90799,7,1.85003,8,10.5381,9,1,10,2.77259,11,0.441833,12,2.63906,13,4.96981,14,0.693147],\"rule\":1},{\"tail\":[],\"spans\":[0,1,-1,-1],\"feats\":[5,4.92173,6,3.90799,7,1.85003,8,11.5381,9,1,10,2.77259,11,1.44183,12,2.63906,13,4.96981,14,1.69315],\"rule\":2}],\"node\":{\"in_edges\":[0,1],\"cat\":\"X7\"},\"edges\":[{\"tail\":[0],\"spans\":[0,1,-1,-1],\"feats\":[2,1],\"rule\":3}],\"node\":{\"in_edges\":[2],\"cat\":\"S\"},\"edges\":[{\"tail\":[],\"spans\":[1,2,-1,-1],\"feats\":[5,3.96802,6,2.22462,7,1.83258,8,10.0863,9,1,11,1.20397,12,1.20397,13,-1.98341e-08,14,1.09861],\"rule\":4}],\"node\":{\"in_edges\":[3],\"cat\":\"X28\"},\"edges\":[{\"tail\":[],\"spans\":[1,2,-1,-1],\"feats\":[5,3.98479,6,1.79176,7,3.21888,8,11.1681,9,1,11,2.30259,12,2.30259,15,1],\"rule\":5}],\"node\":{\"in_edges\":[4],\"cat\":\"X0\"},\"edges\":[{\"tail\":[0],\"spans\":[0,2,-1,-1],\"feats\":[5,3.82533,6,3.21888,7,2.52573,8,11.3276,9,1,10,1.20397,11,1.20397,12,2.30259,13,2.30259,15,1],\"rule\":6}],\"node\":{\"in_edges\":[5],\"cat\":\"X8\"},\"edges\":[{\"tail\":[1,2],\"spans\":[0,2,-1,-1],\"feats\":[1,1],\"rule\":7},{\"tail\":[1,3],\"spans\":[0,2,-1,-1],\"feats\":[1,1],\"rule\":8},{\"tail\":[4],\"spans\":[0,2,-1,-1],\"feats\":[2,1],\"rule\":9}],\"node\":{\"in_edges\":[6,7,8],\"cat\":\"S\"},\"edges\":[{\"tail\":[5],\"spans\":[0,2,-1,-1],\"feats\":[],\"rule\":10}],\"node\":{\"in_edges\":[9],\"cat\":\"Goal\"}}"; +  Hypergraph hg; +  istringstream instr(json); +  HypergraphIO::ReadFromJSON(&instr, &hg); +  
SparseVector<double> wts; +  wts.set_value(FD::Convert("PassThrough"), -0.929201533002898); +  hg.Reweight(wts); + +  vector<pair<vector<WordID>, prob_t> > list; +  std::vector<SparseVector<double> > features; +  KBest::KBestDerivations<vector<WordID>, ESentenceTraversal> kbest(hg, 10); +  for (int i = 0; i < 10; ++i) { +    const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal>::Derivation* d = +      kbest.LazyKthBest(hg.nodes_.size() - 1, i); +    if (!d) break; +    cerr << log(d->score) << " ||| " << TD::GetString(d->yield) << " ||| " << d->feature_values << endl; +  } +  +  SparseVector<double> axis; axis.set_value(FD::Convert("Glue"),1.0); +  ConvexHullWeightFunction wf(wts, axis);  // wts = starting point, axis = search direction +  vector<ConvexHull> envs(1); +  envs[0] = Inside<ConvexHull, ConvexHullWeightFunction>(hg, NULL, wf); + +  vector<vector<WordID> > mr(4); +  TD::ConvertSentence("untitled", &mr[0]); +  TD::ConvertSentence("with no title", &mr[1]); +  TD::ConvertSentence("without a title", &mr[2]); +  TD::ConvertSentence("without title", &mr[3]); +  EvaluationMetric* metric = EvaluationMetric::Instance("IBM_BLEU"); +  boost::shared_ptr<SegmentEvaluator> scorer1 = metric->CreateSegmentEvaluator(mr); +  vector<ErrorSurface> es(1); +  ComputeErrorSurface(*scorer1, envs[0], &es[0], metric, hg); +} + diff --git a/training/dpmert/mert_geometry.cc b/training/dpmert/mert_geometry.cc new file mode 100644 index 00000000..d6973658 --- /dev/null +++ b/training/dpmert/mert_geometry.cc @@ -0,0 +1,185 @@ +#include "mert_geometry.h" + +#include <cassert> +#include <limits> + +using namespace std; + +ConvexHull::ConvexHull(int i) { +  if (i == 0) { +    // do nothing - <> +  } else if (i == 1) { +    points.push_back(boost::shared_ptr<MERTPoint>(new MERTPoint(0, 0, 0, boost::shared_ptr<MERTPoint>(), boost::shared_ptr<MERTPoint>()))); +    assert(this->IsMultiplicativeIdentity()); +  } else { +    cerr << "Only can create ConvexHull semiring 0 and 1 with 
this constructor!\n"; +    abort(); +  } +} + +const ConvexHull ConvexHullWeightFunction::operator()(const Hypergraph::Edge& e) const { +  const double m = direction.dot(e.feature_values_); +  const double b = origin.dot(e.feature_values_); +  MERTPoint* point = new MERTPoint(m, b, e); +  return ConvexHull(1, point); +} + +ostream& operator<<(ostream& os, const ConvexHull& env) { +  os << '<'; +  const vector<boost::shared_ptr<MERTPoint> >& points = env.GetSortedSegs(); +  for (int i = 0; i < points.size(); ++i) +    os << (i==0 ? "" : "|") << "x=" << points[i]->x << ",b=" << points[i]->b << ",m=" << points[i]->m << ",p1=" << points[i]->p1 << ",p2=" << points[i]->p2; +  return os << '>'; +} + +#define ORIGINAL_MERT_IMPLEMENTATION 1 +#ifdef ORIGINAL_MERT_IMPLEMENTATION + +struct SlopeCompare { +  bool operator() (const boost::shared_ptr<MERTPoint>& a, const boost::shared_ptr<MERTPoint>& b) const { +    return a->m < b->m; +  } +}; + +const ConvexHull& ConvexHull::operator+=(const ConvexHull& other) { +  if (!other.is_sorted) other.Sort(); +  if (points.empty()) { +    points = other.points; +    return *this; +  } +  is_sorted = false; +  int j = points.size(); +  points.resize(points.size() + other.points.size()); +  for (int i = 0; i < other.points.size(); ++i) +    points[j++] = other.points[i]; +  assert(j == points.size()); +  return *this; +} + +void ConvexHull::Sort() const { +  sort(points.begin(), points.end(), SlopeCompare()); +  const int k = points.size(); +  int j = 0; +  for (int i = 0; i < k; ++i) { +    MERTPoint l = *points[i]; +    l.x = kMinusInfinity; +    // cerr << "m=" << l.m << endl; +    if (0 < j) { +      if (points[j-1]->m == l.m) {   // lines are parallel +        if (l.b <= points[j-1]->b) continue; +        --j; +      } +      while(0 < j) { +        l.x = (l.b - points[j-1]->b) / (points[j-1]->m - l.m); +        if (points[j-1]->x < l.x) break; +        --j; +      } +      if (0 == j) l.x = kMinusInfinity; +    } +    *points[j++] = 
// Semiring multiplication: combine this hull with `other` in place and return
// *this.  Every resulting point is the sum of one line from each operand
// (slopes added, intercepts added) and records both operands as its parents
// (p1 from this hull, p2 from `other`).
//
// Three cases:
//   1. either operand is the multiplicative identity -> result is the other
//      operand unchanged;
//   2. this hull is a single point created from a hypergraph edge -> every
//      point of `other` is shifted by that edge's line, keeping its x;
//   3. otherwise -> the two sorted point lists are swept left to right over
//      the union of their x boundaries.
const ConvexHull& ConvexHull::operator*=(const ConvexHull& other) {
  if (other.IsMultiplicativeIdentity()) { return *this; }
  if (this->IsMultiplicativeIdentity()) { (*this) = other; return *this; }

  // both operands must have valid x boundaries before they are combined
  if (!is_sorted) Sort();
  if (!other.is_sorted) other.Sort();

  if (this->IsEdgeEnvelope()) {
//    if (other.size() > 1)
//      cerr << *this << " (TIMES) " << other << endl;
    // edge_parent keeps the original point alive after points.clear(), so the
    // two references below stay valid for the whole loop
    boost::shared_ptr<MERTPoint> edge_parent = points[0];
    const double& edge_b = edge_parent->b;
    const double& edge_m = edge_parent->m;
    points.clear();
    for (int i = 0; i < other.points.size(); ++i) {
      const MERTPoint& p = *other.points[i];
      const double m = p.m + edge_m;
      const double b = p.b + edge_b;
      const double& x = p.x;       // x's don't change with *
      points.push_back(boost::shared_ptr<MERTPoint>(new MERTPoint(x, m, b, edge_parent, other.points[i])));
      // p1 of every new point is the edge point, which carries the edge
      assert(points.back()->p1->edge);
    }
//    if (other.size() > 1)
//      cerr << " = " << *this << endl;
  } else {
    vector<boost::shared_ptr<MERTPoint> > new_points;
    int this_i = 0;
    int other_i = 0;
    const int this_size  = points.size();
    const int other_size = other.points.size();
    double cur_x = kMinusInfinity;   // moves from left to right across the
                                     // real numbers, stopping for all inter-
                                     // sections
    // x at which each operand switches to its next point (+inf if there is
    // no next point)
    double this_next_val  = (1 < this_size  ? points[1]->x       : kPlusInfinity);
    double other_next_val = (1 < other_size ? other.points[1]->x : kPlusInfinity);
    while (this_i < this_size && other_i < other_size) {
      const MERTPoint& this_point = *points[this_i];
      const MERTPoint& other_point= *other.points[other_i];
      const double m = this_point.m + other_point.m;
      const double b = this_point.b + other_point.b;

      // emit the combined line for the interval that starts at cur_x
      new_points.push_back(boost::shared_ptr<MERTPoint>(new MERTPoint(cur_x, m, b, points[this_i], other.points[other_i])));
      int comp = 0;
      if (this_next_val < other_next_val) comp = -1; else
        if (this_next_val > other_next_val) comp = 1;
      if (0 == comp) {  // the next values are equal, advance both indices
        ++this_i;
        ++other_i;
        cur_x = this_next_val;  // could be other_next_val (they're equal!)
        this_next_val  = (this_i+1  < this_size  ? points[this_i+1]->x        : kPlusInfinity);
        other_next_val = (other_i+1 < other_size ? other.points[other_i+1]->x : kPlusInfinity);
      } else {  // advance the i with the lower x, update cur_x
        if (-1 == comp) {
          ++this_i;
          cur_x = this_next_val;
          this_next_val =  (this_i+1  < this_size  ? points[this_i+1]->x        : kPlusInfinity);
        } else {
          ++other_i;
          cur_x = other_next_val;
          other_next_val = (other_i+1 < other_size ? other.points[other_i+1]->x : kPlusInfinity);
        }
      }
    }
    // replace this hull's points with the merged list
    points.swap(new_points);
  }
  //cerr << "Multiply: result=" << (*this) << endl;
  return *this;
}
b(_b), p1(p1_), p2(p2_), edge() {} +  MERTPoint(double _m, double _b, const Hypergraph::Edge& edge) : +    x(kMinusInfinity), m(_m), b(_b), edge(&edge) {} + +  double x;                   // x intersection with previous segment in env, or -inf if none +  double m;                   // this line's slope +  double b;                   // intercept with y-axis + +  // we keep a pointer to the "parents" of this segment so we can reconstruct +  // the Viterbi translation corresponding to this segment +  boost::shared_ptr<MERTPoint> p1; +  boost::shared_ptr<MERTPoint> p2; + +  // only MERTPoints created from an edge using the ConvexHullWeightFunction +  // have rules +  // TRulePtr rule; +  const Hypergraph::Edge* edge; + +  // recursively recover the Viterbi translation that will result from setting +  // the weights to origin + axis * x, where x is any value from this->x up +  // until the next largest x in the containing ConvexHull +  void ConstructTranslation(std::vector<WordID>* trans) const; +  void CollectEdgesUsed(std::vector<bool>* edges_used) const; +}; + +// this is the semiring value type, +// it defines constructors for 0, 1, and the operations + and * +struct ConvexHull { +  // create semiring zero +  ConvexHull() : is_sorted(true) {}  // zero +  // for debugging: +  ConvexHull(const std::vector<boost::shared_ptr<MERTPoint> >& s) : points(s) { Sort(); } +  // create semiring 1 or 0 +  explicit ConvexHull(int i); +  ConvexHull(int n, MERTPoint* point) : is_sorted(true), points(n, boost::shared_ptr<MERTPoint>(point)) {} +  const ConvexHull& operator+=(const ConvexHull& other); +  const ConvexHull& operator*=(const ConvexHull& other); +  bool IsMultiplicativeIdentity() const { +    return size() == 1 && (points[0]->b == 0.0 && points[0]->m == 0.0) && (!points[0]->edge) && (!points[0]->p1) && (!points[0]->p2); } +  const std::vector<boost::shared_ptr<MERTPoint> >& GetSortedSegs() const { +    if (!is_sorted) Sort(); +    return points; +  } +  size_t size() 
const { return points.size(); } + + private: +  bool IsEdgeEnvelope() const { +    return points.size() == 1 && points[0]->edge; } +  void Sort() const; +  mutable bool is_sorted; +  mutable std::vector<boost::shared_ptr<MERTPoint> > points; +}; +std::ostream& operator<<(std::ostream& os, const ConvexHull& env); + +struct ConvexHullWeightFunction { +  ConvexHullWeightFunction(const SparseVector<double>& ori, +                           const SparseVector<double>& dir) : origin(ori), direction(dir) {} +  const ConvexHull operator()(const Hypergraph::Edge& e) const; +  const SparseVector<double> origin; +  const SparseVector<double> direction; +}; + +#endif diff --git a/training/dpmert/mr_dpmert_generate_mapper_input.cc b/training/dpmert/mr_dpmert_generate_mapper_input.cc new file mode 100644 index 00000000..199cd23a --- /dev/null +++ b/training/dpmert/mr_dpmert_generate_mapper_input.cc @@ -0,0 +1,81 @@ +#include <iostream> +#include <vector> + +#include <boost/program_options.hpp> +#include <boost/program_options/variables_map.hpp> + +#include "filelib.h" +#include "weights.h" +#include "line_optimizer.h" + +using namespace std; +namespace po = boost::program_options; + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { +  po::options_description opts("Configuration options"); +  opts.add_options() +        ("dev_set_size,s",po::value<unsigned>(),"[REQD] Development set size (# of parallel sentences)") +        ("forest_repository,r",po::value<string>(),"[REQD] Path to forest repository") +        ("weights,w",po::value<string>(),"[REQD] Current feature weights file") +        ("optimize_feature,o",po::value<vector<string> >(), "Feature to optimize (if none specified, all weights listed in the weights file will be optimized)") +        ("random_directions,d",po::value<unsigned int>()->default_value(20),"Number of random directions to run the line optimizer in") +        ("help,h", "Help"); +  po::options_description dcmdline_options; +  
dcmdline_options.add(opts); +  po::store(parse_command_line(argc, argv, dcmdline_options), *conf); +  bool flag = false;  // set when any required option is missing +  if (conf->count("dev_set_size") == 0) { +    cerr << "Please specify the size of the development set using -s N\n";  // -s is dev_set_size; -d is random_directions +    flag = true; +  } +  if (conf->count("weights") == 0) { +    cerr << "Please specify the starting-point weights using -w <weightfile.txt>\n"; +    flag = true; +  } +  if (conf->count("forest_repository") == 0) { +    cerr << "Please specify the forest repository location using -r <DIR>\n"; +    flag = true; +  } +  if (flag || conf->count("help")) { +    cerr << dcmdline_options << endl; +    exit(1); +  } +} + +int main(int argc, char** argv) { +  RandomNumberGenerator<boost::mt19937> rng; +  po::variables_map conf; +  InitCommandLine(argc, argv, &conf); +  vector<string> features; +  SparseVector<weight_t> origin; +  vector<weight_t> w; +  Weights::InitFromFile(conf["weights"].as<string>(), &w, &features); +  Weights::InitSparseVector(w, &origin); +  const string forest_repository = conf["forest_repository"].as<string>(); +  if (!DirectoryExists(forest_repository)) { +    cerr << "Forest repository directory " << forest_repository << " not found!\n"; +    return 1; +  } +  if (conf.count("optimize_feature") > 0) +    features=conf["optimize_feature"].as<vector<string> >(); +  vector<SparseVector<weight_t> > directions; +  vector<int> fids(features.size()); +  for (unsigned i = 0; i < features.size(); ++i) +    fids[i] = FD::Convert(features[i]); +  LineOptimizer::CreateOptimizationDirections( +     fids, +     conf["random_directions"].as<unsigned int>(), +     &rng, +     &directions); +  unsigned dev_set_size = conf["dev_set_size"].as<unsigned>(); +  for (unsigned i = 0; i < dev_set_size; ++i) { +    for (unsigned j = 0; j < directions.size(); ++j) { +      cout << forest_repository << '/' << i << ".json.gz " << i << ' '; +      print(cout, origin, "=", ";"); +      cout << ' '; +      print(cout, directions[j], "=", 
";"); +      cout << endl; +    } +  } +  return 0; +} diff --git a/training/dpmert/mr_dpmert_map.cc b/training/dpmert/mr_dpmert_map.cc new file mode 100644 index 00000000..d1efcf96 --- /dev/null +++ b/training/dpmert/mr_dpmert_map.cc @@ -0,0 +1,112 @@ +#include <sstream> +#include <iostream> +#include <fstream> +#include <vector> + +#include <boost/program_options.hpp> +#include <boost/program_options/variables_map.hpp> + +#include "ns.h" +#include "ns_docscorer.h" +#include "ces.h" +#include "filelib.h" +#include "stringlib.h" +#include "sparse_vector.h" +#include "mert_geometry.h" +#include "inside_outside.h" +#include "error_surface.h" +#include "b64tools.h" +#include "hg_io.h" + +using namespace std; +namespace po = boost::program_options; + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { +  po::options_description opts("Configuration options"); +  opts.add_options() +        ("reference,r",po::value<vector<string> >(), "[REQD] Reference translation (tokenized text)") +        ("source,s",po::value<string>(), "Source file (ignored, except for AER)") +        ("evaluation_metric,m",po::value<string>()->default_value("ibm_bleu"), "Evaluation metric being optimized") +        ("input,i",po::value<string>()->default_value("-"), "Input file to map (- is STDIN)") +        ("help,h", "Help"); +  po::options_description dcmdline_options; +  dcmdline_options.add(opts); +  po::store(parse_command_line(argc, argv, dcmdline_options), *conf); +  bool flag = false; +  if (!conf->count("reference")) { +    cerr << "Please specify one or more references using -r <REF.TXT>\n"; +    flag = true; +  } +  if (flag || conf->count("help")) { +    cerr << dcmdline_options << endl; +    exit(1); +  } +} + +bool ReadSparseVectorString(const string& s, SparseVector<double>* v) { +#if 0 +  // this should work, but untested. 
+  std::istringstream i(s); +  i>>*v; +#else +  vector<string> fields; +  Tokenize(s, ';', &fields); +  if (fields.empty()) return false; +  for (unsigned i = 0; i < fields.size(); ++i) { +    vector<string> pair(2); +    Tokenize(fields[i], '=', &pair); +    if (pair.size() != 2) { +      cerr << "Error parsing vector string: " << fields[i] << endl; +      return false; +    } +    v->set_value(FD::Convert(pair[0]), atof(pair[1].c_str())); +  } +  return true; +#endif +} + +int main(int argc, char** argv) { +  po::variables_map conf; +  InitCommandLine(argc, argv, &conf); +  const string evaluation_metric = conf["evaluation_metric"].as<string>(); +  EvaluationMetric* metric = EvaluationMetric::Instance(evaluation_metric); +  DocumentScorer ds(metric, conf["reference"].as<vector<string> >()); +  cerr << "Loaded " << ds.size() << " references for scoring with " << evaluation_metric << endl; +  Hypergraph hg; +  string last_file; +  ReadFile in_read(conf["input"].as<string>()); +  istream &in=*in_read.stream(); +  while(in) { +    string line; +    getline(in, line); +    if (line.empty()) continue; +    istringstream is(line); +    int sent_id; +    string file, s_origin, s_direction; +    // path-to-file (JSON) sent_id starting-point search-direction +    is >> file >> sent_id >> s_origin >> s_direction; +    SparseVector<double> origin; +    ReadSparseVectorString(s_origin, &origin);  // NOTE(review): return value (false on parse failure) is not checked +    SparseVector<double> direction; +    ReadSparseVectorString(s_direction, &direction);  // NOTE(review): return value is not checked here either +    // cerr << "File: " << file << "\nDir: " << direction << "\n   X: " << origin << endl; +    if (last_file != file) {  // re-read the forest only when the file name differs from the previous input line +      last_file = file; +      ReadFile rf(file); +      HypergraphIO::ReadFromJSON(rf.stream(), &hg); +    } +    const ConvexHullWeightFunction wf(origin, direction); +    const ConvexHull hull = Inside<ConvexHull, ConvexHullWeightFunction>(hg, NULL, wf); + +    ErrorSurface es; +    ComputeErrorSurface(*ds[sent_id], hull, &es, metric, hg); +    //cerr << "Viterbi envelope 
has " << ve.size() << " segments\n"; +    // cerr << "Error surface has " << es.size() << " segments\n"; +    string val; +    es.Serialize(&val); +    cout << 'M' << ' ' << s_origin << ' ' << s_direction << '\t'; +    B64::b64encode(val.c_str(), val.size(), &cout); +    cout << endl << flush; +  } +  return 0; +} diff --git a/training/dpmert/mr_dpmert_reduce.cc b/training/dpmert/mr_dpmert_reduce.cc new file mode 100644 index 00000000..31512a03 --- /dev/null +++ b/training/dpmert/mr_dpmert_reduce.cc @@ -0,0 +1,77 @@ +#include <sstream> +#include <iostream> +#include <fstream> +#include <vector> + +#include <boost/program_options.hpp> +#include <boost/program_options/variables_map.hpp> + +#include "sparse_vector.h" +#include "error_surface.h" +#include "line_optimizer.h" +#include "b64tools.h" +#include "stringlib.h" + +using namespace std; +namespace po = boost::program_options; + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { +  po::options_description opts("Configuration options"); +  opts.add_options() +        ("evaluation_metric,m",po::value<string>(), "Evaluation metric (IBM_BLEU, etc.)") +        ("help,h", "Help"); +  po::options_description dcmdline_options; +  dcmdline_options.add(opts); +  po::store(parse_command_line(argc, argv, dcmdline_options), *conf); +  bool flag = conf->count("evaluation_metric") == 0; +  if (flag || conf->count("help")) { +    cerr << dcmdline_options << endl; +    exit(1); +  } +} + +int main(int argc, char** argv) { +  po::variables_map conf; +  InitCommandLine(argc, argv, &conf); +  const string evaluation_metric = conf["evaluation_metric"].as<string>(); +  EvaluationMetric* metric = EvaluationMetric::Instance(evaluation_metric); +  LineOptimizer::ScoreType opt_type = LineOptimizer::MAXIMIZE_SCORE; +  if (metric->IsErrorMetric()) +    opt_type = LineOptimizer::MINIMIZE_SCORE; + +  vector<ErrorSurface> esv; +  string last_key, line, key, val; +  while(getline(cin, line)) { +    size_t ks = 
line.find("\t"); +    assert(string::npos != ks); +    assert(ks > 2); +    key = line.substr(2, ks - 2); +    val = line.substr(ks + 1); +    if (key != last_key) { +      if (!last_key.empty()) { +	float score; +        double x = LineOptimizer::LineOptimize(metric, esv, opt_type, &score); +	cout << last_key << "|" << x << "|" << score << endl; +      } +      last_key.swap(key); +      esv.clear(); +    } +    if (val.size() % 4 != 0) { +      cerr << "B64 encoding error 1! Skipping.\n"; +      continue; +    } +    string encoded(val.size() / 4 * 3, '\0'); +    if (!B64::b64decode(reinterpret_cast<const unsigned char*>(&val[0]), val.size(), &encoded[0], encoded.size())) { +      cerr << "B64 encoding error 2! Skipping.\n"; +      continue; +    } +    esv.push_back(ErrorSurface()); +    esv.back().Deserialize(encoded); +  } +  if (!esv.empty()) { +    float score; +    double x = LineOptimizer::LineOptimize(metric, esv, opt_type, &score); +    cout << last_key << "|" << x << "|" << score << endl; +  } +  return 0; +} diff --git a/training/dpmert/test_aer/README b/training/dpmert/test_aer/README new file mode 100644 index 00000000..819b2e32 --- /dev/null +++ b/training/dpmert/test_aer/README @@ -0,0 +1,8 @@ +To run the test: + +../dpmert.pl --local --metric aer cdec.ini --source-file corpus.src --ref-files=ref.0 --weights weights + +This will optimize the parameters of the tiny lexical translation model +so as to minimize the AER of the Viterbi alignment on the development +set in corpus.src according to the reference alignments in ref.0. 
+ diff --git a/training/dpmert/test_aer/cdec.ini b/training/dpmert/test_aer/cdec.ini new file mode 100644 index 00000000..08187848 --- /dev/null +++ b/training/dpmert/test_aer/cdec.ini @@ -0,0 +1,3 @@ +formalism=lextrans +grammar=grammar +aligner=true diff --git a/training/dpmert/test_aer/corpus.src b/training/dpmert/test_aer/corpus.src new file mode 100644 index 00000000..31b23971 --- /dev/null +++ b/training/dpmert/test_aer/corpus.src @@ -0,0 +1,3 @@ +el gato negro ||| the black cat +el gato ||| the cat +el libro ||| the book diff --git a/training/dpmert/test_aer/grammar b/training/dpmert/test_aer/grammar new file mode 100644 index 00000000..9d857824 --- /dev/null +++ b/training/dpmert/test_aer/grammar @@ -0,0 +1,12 @@ +el ||| cat ||| F1=1 +el ||| the ||| F2=1 +el ||| black ||| F3=1 +el ||| book ||| F11=1 +gato ||| cat ||| F4=1 NN=1 +gato ||| black ||| F5=1 +gato ||| the ||| F6=1 +negro ||| the ||| F7=1 +negro ||| cat ||| F8=1 +negro ||| black ||| F9=1 +libro ||| the ||| F10=1 +libro ||| book ||| F12=1 NN=1 diff --git a/training/dpmert/test_aer/ref.0 b/training/dpmert/test_aer/ref.0 new file mode 100644 index 00000000..734a9c5b --- /dev/null +++ b/training/dpmert/test_aer/ref.0 @@ -0,0 +1,3 @@ +0-0 1-2 2-1 +0-0 1-1 +0-0 1-1 diff --git a/training/dpmert/test_aer/weights b/training/dpmert/test_aer/weights new file mode 100644 index 00000000..afc9282e --- /dev/null +++ b/training/dpmert/test_aer/weights @@ -0,0 +1,13 @@ +F1 0.1 +F2 -.5980815 +F3 0.24235 +F4 0.625 +F5 0.4514 +F6 0.112316 +F7 -0.123415 +F8 -0.25390285 +F9 -0.23852 +F10 0.646 +F11 0.413141 +F12 0.343216 +NN -0.1215 diff --git a/training/dpmert/test_data/0.json.gz b/training/dpmert/test_data/0.json.gzBinary files differ new file mode 100644 index 00000000..30f8dd77 --- /dev/null +++ b/training/dpmert/test_data/0.json.gz diff --git a/training/dpmert/test_data/1.json.gz b/training/dpmert/test_data/1.json.gzBinary files differ new file mode 100644 index 00000000..c82cc179 --- /dev/null +++ 
b/training/dpmert/test_data/1.json.gz diff --git a/training/dpmert/test_data/c2e.txt.0 b/training/dpmert/test_data/c2e.txt.0 new file mode 100644 index 00000000..12c4abe9 --- /dev/null +++ b/training/dpmert/test_data/c2e.txt.0 @@ -0,0 +1,2 @@ +australia reopens embassy in manila +( afp , manila , january 2 ) australia reopened its embassy in the philippines today , which was shut down about seven weeks ago due to what was described as a specific threat of a terrorist attack . diff --git a/training/dpmert/test_data/c2e.txt.1 b/training/dpmert/test_data/c2e.txt.1 new file mode 100644 index 00000000..4ac12df1 --- /dev/null +++ b/training/dpmert/test_data/c2e.txt.1 @@ -0,0 +1,2 @@ +australia reopened manila embassy +( agence france-presse , manila , 2nd ) - australia reopened its embassy in the philippines today . the embassy was closed seven weeks ago after what was described as a specific threat of a terrorist attack . diff --git a/training/dpmert/test_data/c2e.txt.2 b/training/dpmert/test_data/c2e.txt.2 new file mode 100644 index 00000000..2f67b72f --- /dev/null +++ b/training/dpmert/test_data/c2e.txt.2 @@ -0,0 +1,2 @@ +australia to reopen embassy in manila +( afp report from manila , january 2 ) australia reopened its embassy in the philippines today . seven weeks ago , the embassy was shut down due to so-called confirmed terrorist attack threats . diff --git a/training/dpmert/test_data/c2e.txt.3 b/training/dpmert/test_data/c2e.txt.3 new file mode 100644 index 00000000..5483cef6 --- /dev/null +++ b/training/dpmert/test_data/c2e.txt.3 @@ -0,0 +1,2 @@ +australia to re - open its embassy to manila +( afp , manila , thursday ) australia reopens its embassy to manila , which was closed for the so-called " clear " threat of terrorist attack 7 weeks ago . 
diff --git a/training/dpmert/test_data/re.txt.0 b/training/dpmert/test_data/re.txt.0 new file mode 100644 index 00000000..86eff087 --- /dev/null +++ b/training/dpmert/test_data/re.txt.0 @@ -0,0 +1,5 @@ +erdogan states turkey to reject any pressures to urge it to recognize cyprus +ankara 12 - 1 ( afp ) - turkish prime minister recep tayyip erdogan announced today , wednesday , that ankara will reject any pressure by the european union to urge it to recognize cyprus . this comes two weeks before the summit of european union state and government heads who will decide whether or nor membership negotiations with ankara should be opened . +erdogan told " ntv " television station that " the european union cannot address us by imposing new conditions on us with regard to cyprus . +we will discuss this dossier in the course of membership negotiations . " +he added " let me be clear , i cannot sidestep turkey , this is something we cannot accept . " diff --git a/training/dpmert/test_data/re.txt.1 b/training/dpmert/test_data/re.txt.1 new file mode 100644 index 00000000..2140f198 --- /dev/null +++ b/training/dpmert/test_data/re.txt.1 @@ -0,0 +1,5 @@ +erdogan confirms turkey will resist any pressure to recognize cyprus +ankara 12 - 1 ( afp ) - the turkish head of government , recep tayyip erdogan , announced today ( wednesday ) that ankara would resist any pressure the european union might exercise in order to force it into recognizing cyprus . this comes two weeks before a summit of european union heads of state and government , who will decide whether or not to open membership negotiations with ankara . +erdogan said to the ntv television channel : " the european union cannot engage with us through imposing new conditions on us with regard to cyprus . +we shall discuss this issue in the course of the membership negotiations . " +he added : " let me be clear - i cannot confine turkey . this is something we do not accept . 
" diff --git a/training/dpmert/test_data/re.txt.2 b/training/dpmert/test_data/re.txt.2 new file mode 100644 index 00000000..94e46286 --- /dev/null +++ b/training/dpmert/test_data/re.txt.2 @@ -0,0 +1,5 @@ +erdogan confirms that turkey will reject any pressures to encourage it to recognize cyprus +ankara , 12 / 1 ( afp ) - the turkish prime minister recep tayyip erdogan declared today , wednesday , that ankara will reject any pressures that the european union may apply on it to encourage to recognize cyprus . this comes two weeks before a summit of the heads of countries and governments of the european union , who will decide on whether or not to start negotiations on joining with ankara . +erdogan told the ntv television station that " it is not possible for the european union to talk to us by imposing new conditions on us regarding cyprus . +we shall discuss this dossier during the negotiations on joining . " +and he added , " let me be clear . turkey's arm should not be twisted ; this is something we cannot accept . " diff --git a/training/dpmert/test_data/re.txt.3 b/training/dpmert/test_data/re.txt.3 new file mode 100644 index 00000000..f87c3308 --- /dev/null +++ b/training/dpmert/test_data/re.txt.3 @@ -0,0 +1,5 @@ +erdogan stresses that turkey will reject all pressures to force it to recognize cyprus +ankara 12 - 1 ( afp ) - turkish prime minister recep tayyip erdogan announced today , wednesday , that ankara would refuse all pressures applied on it by the european union to force it to recognize cyprus . that came two weeks before the summit of the presidents and prime ministers of the european union , who would decide on whether to open negotiations on joining with ankara or not . +erdogan said to " ntv " tv station that the " european union can not communicate with us by imposing on us new conditions related to cyprus . +we will discuss this file during the negotiations on joining . " +he added , " let me be clear . turkey's arm should not be twisted . 
this is unacceptable to us . " | 
