diff options
| author | redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-06-22 05:12:27 +0000 |
|---|---|---|
| committer | redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-06-22 05:12:27 +0000 |
| commit | 0172721855098ca02b207231a654dffa5e4eb1c9 (patch) | |
| tree | 8069c3a62e2d72bd64a2cdeee9724b2679c8a56b /vest | |
| parent | 37728b8be4d0b3df9da81fdda2198ff55b4b2d91 (diff) | |
initial checkin
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@2 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'vest')
39 files changed, 3431 insertions, 0 deletions
| diff --git a/vest/Makefile.am b/vest/Makefile.am new file mode 100644 index 00000000..6815ae75 --- /dev/null +++ b/vest/Makefile.am @@ -0,0 +1,35 @@ +bin_PROGRAMS = \ +  mbr_kbest \ +  mr_vest_map \ +  mr_vest_reduce \ +  mr_vest_generate_mapper_input \ +  fast_score + +if HAVE_GTEST +noinst_PROGRAMS = \ +  scorer_test \ +  lo_test +endif + +mbr_kbest_SOURCES = mbr_kbest.cc ter.cc comb_scorer.cc aer_scorer.cc scorer.cc viterbi_envelope.cc +mbr_kbest_LDADD = $(top_srcdir)/decoder/libcdec.a -lz + +fast_score_SOURCES = fast_score.cc ter.cc comb_scorer.cc aer_scorer.cc scorer.cc viterbi_envelope.cc +fast_score_LDADD = $(top_srcdir)/decoder/libcdec.a -lz + +mr_vest_generate_mapper_input_SOURCES = mr_vest_generate_mapper_input.cc line_optimizer.cc +mr_vest_generate_mapper_input_LDADD = $(top_srcdir)/decoder/libcdec.a -lz + +mr_vest_map_SOURCES = viterbi_envelope.cc error_surface.cc aer_scorer.cc mr_vest_map.cc scorer.cc ter.cc comb_scorer.cc line_optimizer.cc +mr_vest_map_LDADD = $(top_srcdir)/decoder/libcdec.a -lz + +mr_vest_reduce_SOURCES = error_surface.cc aer_scorer.cc mr_vest_reduce.cc scorer.cc ter.cc comb_scorer.cc line_optimizer.cc viterbi_envelope.cc +mr_vest_reduce_LDADD = $(top_srcdir)/decoder/libcdec.a -lz + +scorer_test_SOURCES = aer_scorer.cc scorer_test.cc scorer.cc ter.cc comb_scorer.cc viterbi_envelope.cc +scorer_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(top_srcdir)/decoder/libcdec.a -lz + +lo_test_SOURCES = lo_test.cc scorer.cc ter.cc aer_scorer.cc comb_scorer.cc viterbi_envelope.cc error_surface.cc line_optimizer.cc +lo_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(top_srcdir)/decoder/libcdec.a -lz + +AM_CPPFLAGS = -W -Wall -Wno-sign-compare $(GTEST_CPPFLAGS) -I$(top_srcdir)/decoder diff --git a/vest/aer_scorer.cc b/vest/aer_scorer.cc new file mode 100644 index 00000000..9c8a783a --- /dev/null +++ b/vest/aer_scorer.cc @@ -0,0 +1,118 @@ +#include "aer_scorer.h" + +#include <cmath> +#include <cassert> +#include <sstream> + +#include "tdict.h" 
+#include "aligner.h" + +using namespace std; + +class AERScore : public Score { +  friend class AERScorer; + public: +  AERScore() : num_matches(), num_predicted(), num_in_ref() {} +  AERScore(int m, int p, int r) : +    num_matches(m), num_predicted(p), num_in_ref(r) {} +  virtual void PlusEquals(const Score& delta) { +    const AERScore& other = static_cast<const AERScore&>(delta); +    num_matches   += other.num_matches; +    num_predicted += other.num_predicted; +    num_in_ref    += other.num_in_ref; +  } +  virtual Score* GetZero() const { +    return new AERScore; +  } +  virtual void Subtract(const Score& rhs, Score* out) const { +    AERScore* res = static_cast<AERScore*>(out); +    const AERScore& other = static_cast<const AERScore&>(rhs); +    res->num_matches   = num_matches   - other.num_matches; +    res->num_predicted = num_predicted - other.num_predicted; +    res->num_in_ref    = num_in_ref    - other.num_in_ref; +  } +  float Precision() const { +    return static_cast<float>(num_matches) / num_predicted; +  } +  float Recall() const { +    return static_cast<float>(num_matches) / num_in_ref; +  } +  virtual float ComputeScore() const { +    const float prec = Precision(); +    const float rec = Recall(); +    const float f = (2.0 * prec * rec) / (rec + prec); +    if (isnan(f)) return 1.0f; +    return 1.0f - f; +  } +  virtual bool IsAdditiveIdentity() const { +    return (num_matches == 0) && (num_predicted == 0) && (num_in_ref == 0); +  } +  virtual void ScoreDetails(std::string* out) const { +    ostringstream os; +    os << "AER=" << (ComputeScore() * 100.0) +       << " F=" << (100 - ComputeScore() * 100.0) +       << " P=" << (Precision() * 100.0) << " R=" << (Recall() * 100.0) +       << " [" << num_matches << " " << num_predicted << " " << num_in_ref << "]"; +    *out = os.str(); +  } +  virtual void Encode(std::string*out) const { +    out->resize(sizeof(int) * 3); +    *(int *)&(*out)[sizeof(int) * 0] = num_matches; +    *(int 
*)&(*out)[sizeof(int) * 1] = num_predicted; +    *(int *)&(*out)[sizeof(int) * 2] = num_in_ref; +  } + private: +  int num_matches; +  int num_predicted; +  int num_in_ref; +}; + +AERScorer::AERScorer(const vector<vector<WordID> >& refs, const string& src) : src_(src) { +  if (refs.size() != 1) { +    cerr << "AERScorer can only take a single reference!\n"; +    abort(); +  } +  ref_ = AlignerTools::ReadPharaohAlignmentGrid(TD::GetString(refs.front())); +} + +static inline bool Safe(const Array2D<bool>& a, int i, int j) { +  if (i >= 0 && j >= 0 && i < a.width() && j < a.height()) +    return a(i,j); +  else +    return false; +} + +Score* AERScorer::ScoreCandidate(const vector<WordID>& shyp) const { +  boost::shared_ptr<Array2D<bool> > hyp = +    AlignerTools::ReadPharaohAlignmentGrid(TD::GetString(shyp)); + +  int m = 0; +  int r = 0; +  int p = 0; +  int i_len = ref_->width(); +  int j_len = ref_->height(); +  for (int i = 0; i < i_len; ++i) { +    for (int j = 0; j < j_len; ++j) { +      if ((*ref_)(i,j)) { +        ++r; +        if (Safe(*hyp, i, j)) ++m; +      } +    } +  } +  for (int i = 0; i < hyp->width(); ++i) +    for (int j = 0; j < hyp->height(); ++j) +      if ((*hyp)(i,j)) ++p; + +  return new AERScore(m,p,r); +} + +Score* AERScorer::ScoreFromString(const string& in) { +  AERScore* res = new AERScore; +  res->num_matches   = *(const int *)&in[sizeof(int) * 0]; +  res->num_predicted = *(const int *)&in[sizeof(int) * 1]; +  res->num_in_ref    = *(const int *)&in[sizeof(int) * 2]; +  return res; +} + +const std::string* AERScorer::GetSource() const { return &src_; } + diff --git a/vest/aer_scorer.h b/vest/aer_scorer.h new file mode 100644 index 00000000..a0afea3b --- /dev/null +++ b/vest/aer_scorer.h @@ -0,0 +1,22 @@ +#ifndef _AER_SCORER_ +#define _AER_SCORER_ + +#include <boost/shared_ptr.hpp> + +#include "scorer.h" +#include "array2d.h" + +class AERScorer : public SentenceScorer { + public: +  // when constructing alignment strings from a 
hypergraph, the source +  // is necessary. +  AERScorer(const std::vector<std::vector<WordID> >& refs, const std::string& src = ""); +  Score* ScoreCandidate(const std::vector<WordID>& hyp) const; +  static Score* ScoreFromString(const std::string& in); +  const std::string* GetSource() const; + private: +  std::string src_; +  boost::shared_ptr<Array2D<bool> > ref_; +}; + +#endif diff --git a/vest/comb_scorer.cc b/vest/comb_scorer.cc new file mode 100644 index 00000000..7b2187f4 --- /dev/null +++ b/vest/comb_scorer.cc @@ -0,0 +1,81 @@ +#include "comb_scorer.h" + +#include <cstdio> + +using namespace std; + +class BLEUTERCombinationScore : public Score { +  friend class BLEUTERCombinationScorer; + public: +  ~BLEUTERCombinationScore(); +  float ComputeScore() const { +    return (bleu->ComputeScore() - ter->ComputeScore()) / 2.0f; +  } +  void ScoreDetails(string* details) const { +    char buf[160]; +    sprintf(buf, "Combi = %.2f, BLEU = %.2f, TER = %.2f",  +      ComputeScore()*100.0f, bleu->ComputeScore()*100.0f, ter->ComputeScore()*100.0f); +    *details = buf; +  } +  void PlusEquals(const Score& delta) { +    bleu->PlusEquals(*static_cast<const BLEUTERCombinationScore&>(delta).bleu); +    ter->PlusEquals(*static_cast<const BLEUTERCombinationScore&>(delta).ter); +  } +  Score* GetZero() const { +    BLEUTERCombinationScore* res = new BLEUTERCombinationScore; +    res->bleu = bleu->GetZero(); +    res->ter = ter->GetZero(); +    return res; +  } +  void Subtract(const Score& rhs, Score* res) const { +    bleu->Subtract(*static_cast<const BLEUTERCombinationScore&>(rhs).bleu, +                   static_cast<BLEUTERCombinationScore*>(res)->bleu); +    ter->Subtract(*static_cast<const BLEUTERCombinationScore&>(rhs).ter, +                   static_cast<BLEUTERCombinationScore*>(res)->ter); +  } +  void Encode(std::string* out) const { +    string bs, ts; +    bleu->Encode(&bs); +    ter->Encode(&ts); +    out->clear(); +    (*out) += static_cast<char>(bs.size()); + 
   (*out) += bs; +    (*out) += ts; +  } +  bool IsAdditiveIdentity() const { +    return bleu->IsAdditiveIdentity() && ter->IsAdditiveIdentity(); +  } + private: +  Score* bleu; +  Score* ter; +}; + +BLEUTERCombinationScore::~BLEUTERCombinationScore() { +  delete bleu; +  delete ter; +} + +BLEUTERCombinationScorer::BLEUTERCombinationScorer(const vector<vector<WordID> >& refs) { +  bleu_ = SentenceScorer::CreateSentenceScorer(IBM_BLEU, refs); +  ter_ = SentenceScorer::CreateSentenceScorer(TER, refs); +} + +BLEUTERCombinationScorer::~BLEUTERCombinationScorer() { +  delete bleu_; +  delete ter_; +} + +Score* BLEUTERCombinationScorer::ScoreCandidate(const std::vector<WordID>& hyp) const { +  BLEUTERCombinationScore* res = new BLEUTERCombinationScore; +  res->bleu = bleu_->ScoreCandidate(hyp); +  res->ter = ter_->ScoreCandidate(hyp); +  return res; +} + +Score* BLEUTERCombinationScorer::ScoreFromString(const std::string& in) { +  int bss = in[0]; +  BLEUTERCombinationScore* r = new BLEUTERCombinationScore; +  r->bleu = SentenceScorer::CreateScoreFromString(IBM_BLEU, in.substr(1, bss)); +  r->ter = SentenceScorer::CreateScoreFromString(TER, in.substr(1 + bss)); +  return r; +} diff --git a/vest/comb_scorer.h b/vest/comb_scorer.h new file mode 100644 index 00000000..70b1ec75 --- /dev/null +++ b/vest/comb_scorer.h @@ -0,0 +1,17 @@ +#ifndef _COMB_SCORER_ +#define _COMB_SCORER_ + +#include "scorer.h" + +class BLEUTERCombinationScorer : public SentenceScorer { + public: +  BLEUTERCombinationScorer(const std::vector<std::vector<WordID> >& refs); +  ~BLEUTERCombinationScorer(); +  Score* ScoreCandidate(const std::vector<WordID>& hyp) const; +  static Score* ScoreFromString(const std::string& in); + private: +  SentenceScorer* bleu_; +  SentenceScorer* ter_; +}; + +#endif diff --git a/vest/dist-vest.pl b/vest/dist-vest.pl new file mode 100755 index 00000000..dca2f06b --- /dev/null +++ b/vest/dist-vest.pl @@ -0,0 +1,619 @@ +#!/usr/bin/env perl + +use strict; +my $SCRIPT_DIR; 
BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR; } +use Getopt::Long; +use IPC::Open2; +use strict; +use POSIX ":sys_wait_h"; + +# Default settings +my $srcFile; +my $refFiles; +my $bin_dir = $SCRIPT_DIR; +die "Bin directory $bin_dir missing/inaccessible" unless -d $bin_dir; +my $FAST_SCORE="$bin_dir/fast_score"; +die "Can't execute $FAST_SCORE" unless -x $FAST_SCORE; +my $MAPINPUT = "$bin_dir/mr_vest_generate_mapper_input"; +my $MAPPER = "$bin_dir/mr_vest_map"; +my $REDUCER = "$bin_dir/mr_vest_reduce"; +my $SCORER = $FAST_SCORE; +die "Can't find $MAPPER" unless -x $MAPPER; +my $cdec = "$bin_dir/../decoder/cdec"; +die "Can't find decoder in $cdec" unless -x $cdec; +my $decoder = $cdec; +my $DISCARD_FORESTS = 0; +my $lines_per_mapper = 400; +my $rand_directions = 15; +my $iteration = 1; +my $run_local = 0; +my $best_weights; +my $max_iterations = 15; +my $optimization_iters = 6; +my $num_rand_points = 20; +my $mert_nodes = join(" ", grep(/^c\d\d$/, split(/\n/, `pbsnodes -a`))); # "1 1 1 1 1" fails due to file staging conflicts +my $decode_nodes = "1 1 1 1 1 1 1 1 1 1 1 1 1 1 1";  # start 15 jobs +my $pmem = "3g"; +my $disable_clean = 0; +my %seen_weights; +my $normalize; +my $help = 0; +my $epsilon = 0.0001; +my $interval = 5; +my $dryrun = 0; +my $ranges; +my $last_score = -10000000; +my $restart = 0; +my $metric = "ibm_bleu"; +my $dir; +my $iniFile; +my $weights; +my $initialWeights; +my $decoderOpt; + +# Process command-line options +Getopt::Long::Configure("no_auto_abbrev"); +if (GetOptions( +	"decoder=s" => \$decoderOpt, +	"decode-nodes=s" => \$decode_nodes, +	"dont-clean" => \$disable_clean, +	"dry-run" => \$dryrun, +	"epsilon" => \$epsilon, +	"help" => \$help, +	"interval" => \$interval, +	"iteration=i" => \$iteration, +	"local" => \$run_local, +	"max-iterations=i" => \$max_iterations, +	"mert-nodes=s" => \$mert_nodes, +	"normalize=s" => \$normalize, +	"pmem=s" => \$pmem, +	"ranges=s" => 
\$ranges, +	"rand-directions=i" => \$rand_directions, +	"ref-files=s" => \$refFiles, +	"metric=s" => \$metric, +	"restart" => \$restart, +	"source-file=s" => \$srcFile, +	"weights=s" => \$initialWeights, +	"workdir=s" => \$dir +) == 0 || @ARGV!=1 || $help) { +	print_help(); +	exit; +} + +if ($metric =~ /^(combi|ter)$/i) { +  $lines_per_mapper = 40; +} + +($iniFile) = @ARGV; + + +sub write_config; +sub enseg; +sub print_help; + +my $nodelist; +my $host =`hostname`; chomp $host; +my $bleu; +my $interval_count = 0; +my $logfile; +my $projected_score; + +# used in sorting scores +my $DIR_FLAG = '-r'; +if ($metric =~ /^ter$|^aer$/i) { +  $DIR_FLAG = ''; +} + +my $refs_comma_sep = get_comma_sep_refs($refFiles); + +unless ($dir){  +	$dir = "vest"; +} +unless ($dir =~ /^\//){  # convert relative path to absolute path +	my $basedir = `pwd`; +	chomp $basedir; +	$dir = "$basedir/$dir"; +} +if ($restart){ +	$iniFile = `ls $dir/*.ini`; chomp $iniFile; +	unless (-e $iniFile){ +		die "ERROR: Could not find ini file in $dir to restart\n"; +	} +	$logfile = "$dir/mert.log"; +	open(LOGFILE, ">>$logfile"); +	print LOGFILE "RESTARTING STOPPED OPTIMIZATION\n\n"; + +	# figure out best weights so far and iteration number +	open(LOG, "$dir/mert.log"); +	my $wi = 0; +	while (my $line = <LOG>){ +		chomp $line; +		if ($line =~ /ITERATION (\d+)/) { +			$iteration = $1; +		} +	} + +	$iteration = $wi + 1; +} + +if ($decoderOpt){ $decoder = $decoderOpt; } + + +# Initializations and helper functions +srand; + +my @childpids = (); +my @cleanupcmds = (); + +sub cleanup { +	print STDERR "Cleanup...\n"; +	for my $pid (@childpids){ `kill $pid`; } +	for my $cmd (@cleanupcmds){`$cmd`; } +	exit 1; +}; +$SIG{INT} = "cleanup"; +$SIG{TERM} = "cleanup";  +$SIG{HUP} = "cleanup"; + +my $decoderBase = `basename $decoder`; chomp $decoderBase; +my $newIniFile = "$dir/$decoderBase.ini"; +my $parallelize = '/chomes/redpony/svn-trunk/sa-utils/parallelize.pl'; +my $inputFileName = "$dir/input"; +my $user = 
$ENV{"USER"}; + +# process ini file +-e $iniFile || die "Error: could not open $iniFile for reading\n"; +open(INI, $iniFile); + +if ($dryrun){ +	write_config(*STDERR); +	exit 0; +} else { +	if (-e $dir){ +		unless($restart){ +			die "ERROR: working dir $dir already exists\n\n"; +		} +	} else { +		mkdir $dir; +		mkdir "$dir/hgs"; +		unless (-e $initialWeights) { +			print STDERR "Please specify an initial weights file with --initial-weights\n"; +			print_help(); +			exit; +		} +		`cp $initialWeights $dir/weights.0`; +		die "Can't find weights.0" unless (-e "$dir/weights.0"); +	} +	unless($restart){ +		$logfile = "$dir/mert.log"; +		open(LOGFILE, ">$logfile"); +	} +	write_config(*LOGFILE); +} + + +# Generate initial files and values +unless ($restart){ `cp $iniFile $newIniFile`; } +$iniFile = $newIniFile; + +my $newsrc = "$dir/dev.input"; +unless($restart){ enseg($srcFile, $newsrc); } +$srcFile = $newsrc; +my $devSize = 0; +open F, "<$srcFile" or die "Can't read $srcFile: $!"; +while(<F>) { $devSize++; } +close F; + +unless($best_weights){ $best_weights = $weights; } +unless($projected_score){ $projected_score = 0.0; } +$seen_weights{$weights} = 1; + +my $random_seed = int(time / 1000); +my $lastWeightsFile; +my $lastPScore = 0; +# main optimization loop +while (1){ +	print LOGFILE "\n\nITERATION $iteration\n==========\n"; + +	# iteration-specific files +	my $runFile="$dir/run.raw.$iteration"; +	my $onebestFile="$dir/1best.$iteration"; +	my $logdir="$dir/logs.$iteration"; +	my $decoderLog="$logdir/decoder.sentserver.log.$iteration"; +	my $scorerLog="$logdir/scorer.log.$iteration"; +	`mkdir -p $logdir`; + +	#decode +	print LOGFILE "DECODE\n"; +	print LOGFILE `date`; +	my $im1 = $iteration - 1; +	my $weightsFile="$dir/weights.$im1"; +	my $decoder_cmd = "$decoder -c $iniFile -w $weightsFile -O $dir/hgs"; +	my $pcmd = "cat $srcFile | $parallelize -p $pmem -e $logdir -n \"$decode_nodes\" -- "; +        if ($run_local) { $pcmd = "cat $srcFile |"; } +        my $cmd = $pcmd 
. "$decoder_cmd 2> $decoderLog 1> $runFile"; +	print LOGFILE "COMMAND:\n$cmd\n"; +	my $result = 0; +	$result = system($cmd); +	unless ($result == 0){ +		cleanup(); +		print LOGFILE "ERROR: Parallel decoder returned non-zero exit code $result\n"; +		die; +	} +	my $dec_score = `cat $runFile | $SCORER $refs_comma_sep -l $metric`; +	chomp $dec_score; +	print LOGFILE "DECODER SCORE: $dec_score\n"; + +	# save space +	`gzip $runFile`; +	`gzip $decoderLog`; + +	if ($iteration > $max_iterations){ +		print LOGFILE "\nREACHED STOPPING CRITERION: Maximum iterations\n"; +		last; +	} +	 +	# run optimizer +	print LOGFILE "\nUNION FORESTS\n"; +	print LOGFILE `date`; +	my $mergeLog="$logdir/prune-merge.log.$iteration"; +	if ($DISCARD_FORESTS) { +		`rm -f $dir/hgs/*gz`; +	} + +	my $score = 0; +	my $icc = 0; +	my $inweights="$dir/weights.$im1"; +	for (my $opt_iter=1; $opt_iter<$optimization_iters; $opt_iter++) { +		print LOGFILE "\nGENERATE OPTIMIZATION STRATEGY (OPT-ITERATION $opt_iter/$optimization_iters)\n"; +		print LOGFILE `date`; +		$icc++; +		$cmd="$MAPINPUT -w $inweights -r $dir/hgs -s $devSize -d $rand_directions > $dir/agenda.$im1-$opt_iter"; +		print LOGFILE "COMMAND:\n$cmd\n"; +		$result = system($cmd); +		unless ($result == 0){ +			cleanup(); +			print LOGFILE "ERROR: mapinput command returned non-zero exit code $result\n"; +			die; +		} + +	        `mkdir $dir/splag.$im1`; +		$cmd="split -a 3 -l $lines_per_mapper $dir/agenda.$im1-$opt_iter $dir/splag.$im1/mapinput."; +		print LOGFILE "COMMAND:\n$cmd\n"; +		$result = system($cmd); +		unless ($result == 0){ +			cleanup(); +			print LOGFILE "ERROR: split command returned non-zero exit code $result\n"; +			die; +		} +		opendir(DIR, "$dir/splag.$im1") or die "Can't open directory: $!"; +		my @shards = grep { /^mapinput\./ } readdir(DIR); +		closedir DIR; +		die "No shards!" 
unless scalar @shards > 0; +		my $joblist = ""; +		my $nmappers = 0; +		my @mapoutputs = (); +		@cleanupcmds = (); +		my %o2i = (); +		my $first_shard = 1; +		for my $shard (@shards) { +			my $mapoutput = $shard; +			my $client_name = $shard; +			$client_name =~ s/mapinput.//; +			$client_name = "vest.$client_name"; +			$mapoutput =~ s/mapinput/mapoutput/; +			push @mapoutputs, "$dir/splag.$im1/$mapoutput"; +			$o2i{"$dir/splag.$im1/$mapoutput"} = "$dir/splag.$im1/$shard"; +			my $script = "$MAPPER -s $srcFile -l $metric $refs_comma_sep < $dir/splag.$im1/$shard | sort -t \$'\\t' -k 1 > $dir/splag.$im1/$mapoutput"; +			if ($run_local) { +				print LOGFILE "COMMAND:\n$script\n"; +				$result = system($script); +				unless ($result == 0){ +					cleanup(); +					print LOGFILE "ERROR: mapper returned non-zero exit code $result\n"; +					die; +				} +			} else { +				my $todo = "qsub -q batch -l pmem=3000mb,walltime=5:00:00 -N $client_name -o /dev/null -e $logdir/$client_name.ER"; +				local(*QOUT, *QIN); +				open2(\*QOUT, \*QIN, $todo); +				print QIN $script; +				if ($first_shard) { print LOGFILE "$script\n"; $first_shard=0; } +				close QIN; +				$nmappers++; +				while (my $jobid=<QOUT>){ +	  				chomp $jobid; +		  			push(@cleanupcmds, "`qdel $jobid 2> /dev/null`"); +					$jobid =~ s/^(\d+)(.*?)$/\1/g; +					print STDERR "short job id $jobid\n"; +					if ($joblist == "") { $joblist = $jobid; } +					else {$joblist = $joblist . "\|" . 
$jobid; } +				} +				close QOUT; +			} +		} +		if ($run_local) { +		} else { +			print LOGFILE "Launched $nmappers mappers.\n"; +			print LOGFILE "Waiting for mappers to complete...\n"; +			while ($nmappers > 0) { +			  sleep 5; +			  my @livejobs = grep(/$joblist/, split(/\n/, `qstat`)); +			  $nmappers = scalar @livejobs; +			} +			print LOGFILE "All mappers complete.\n"; +		} +		my $tol = 0; +		my $til = 0; +		for my $mo (@mapoutputs) { +		  my $olines = get_lines($mo); +		  my $ilines = get_lines($o2i{$mo}); +		  $tol += $olines; +		  $til += $ilines; +		  die "$mo: output lines ($olines) doesn't match input lines ($ilines)" unless $olines==$ilines; +		} +		print LOGFILE "Results for $tol/$til lines\n"; +		print LOGFILE "\nSORTING AND RUNNING FMERT REDUCER\n"; +		print LOGFILE `date`; +		$cmd="sort -t \$'\\t' -k 1 @mapoutputs | $REDUCER -l $metric > $dir/redoutput.$im1"; +		print LOGFILE "COMMAND:\n$cmd\n"; +		$result = system($cmd); +		unless ($result == 0){ +			cleanup(); +			print LOGFILE "ERROR: reducer command returned non-zero exit code $result\n"; +			die; +		} +		$cmd="sort -nk3 $DIR_FLAG '-t|' $dir/redoutput.$im1 | head -1"; +		my $best=`$cmd`; chomp $best; +		print LOGFILE "$best\n"; +		my ($oa, $x, $xscore) = split /\|/, $best; +		$score = $xscore; +		print LOGFILE "PROJECTED SCORE: $score\n"; +		if (abs($x) < $epsilon) { +			print LOGFILE "\nOPTIMIZER: no score improvement: abs($x) < $epsilon\n"; +			last; +		} +                my $psd = $score - $last_score; +                $last_score = $score; +		if (abs($psd) < $epsilon) { +			print LOGFILE "\nOPTIMIZER: no score improvement: abs($psd) < $epsilon\n"; +			last; +		} +		my ($origin, $axis) = split /\s+/, $oa; + +		my %ori = convert($origin); +		my %axi = convert($axis); + +		my $finalFile="$dir/weights.$im1-$opt_iter"; +		open W, ">$finalFile" or die "Can't write: $finalFile: $!"; +                my $norm = 0; +		for my $k (sort keys %ori) { +			my $dd = $ori{$k} + $axi{$k} * $x; +              
          $norm += $dd * $dd; +		} +                $norm = sqrt($norm); +		$norm = 1; +		for my $k (sort keys %ori) { +			my $v = ($ori{$k} + $axi{$k} * $x) / $norm; +			print W "$k $v\n"; +		} +		`rm -rf $dir/splag.$im1`; +		$inweights = $finalFile; +	} +	$lastWeightsFile = "$dir/weights.$iteration"; +	`cp $inweights $lastWeightsFile`; +	if ($icc < 2) { +		print LOGFILE "\nREACHED STOPPING CRITERION: score change too little\n"; +		last; +	} +	$lastPScore = $score; +	$iteration++; +	print LOGFILE "\n==========\n"; +} + +print LOGFILE "\nFINAL WEIGHTS: $dir/$lastWeightsFile\n(Use -w <this file> with hiero)\n\n"; + +sub normalize_weights { +  my ($rfn, $rpts, $feat) = @_; +  my @feat_names = @$rfn; +  my @pts = @$rpts; +  my $z = 1.0; +  for (my $i=0; $i < scalar @feat_names; $i++) { +    if ($feat_names[$i] eq $feat) { +      $z = $pts[$i]; +      last; +    } +  } +  for (my $i=0; $i < scalar @feat_names; $i++) { +    $pts[$i] /= $z; +  } +  print LOGFILE " NORM WEIGHTS: @pts\n"; +  return @pts; +} + +sub get_lines { +  my $fn = shift @_; +  open FL, "<$fn" or die "Couldn't read $fn: $!"; +  my $lc = 0; +  while(<FL>) { $lc++; } +  return $lc; +} + +sub get_comma_sep_refs { +  my ($p) = @_; +  my $o = `echo $p`; +  chomp $o; +  my @files = split /\s+/, $o; +  return "-r " . 
join(' -r ', @files); +} + +sub read_weights_file { +  my ($file) = @_; +  open F, "<$file" or die "Couldn't read $file: $!"; +  my @r = (); +  my $pm = -1; +  while(<F>) { +    next if /^#/; +    next if /^\s*$/; +    chomp; +    if (/^(.+)\s+(.+)$/) { +      my $m = $1; +      my $w = $2; +      die "Weights out of order: $m <= $pm" unless $m > $pm; +      push @r, $w; +    } else { +      warn "Unexpected feature name in weight file: $_"; +    } +  } +  close F; +  return join ' ', @r; +} + +# subs +sub write_config { +	my $fh = shift; +	my $cleanup = "yes"; +	if ($disable_clean) {$cleanup = "no";} + +	print $fh "\n"; +	print $fh "DECODER:          $decoder\n"; +	print $fh "INI FILE:         $iniFile\n"; +	print $fh "WORKING DIR:      $dir\n"; +	print $fh "SOURCE (DEV):     $srcFile\n"; +	print $fh "REFS (DEV):       $refFiles\n"; +	print $fh "EVAL METRIC:      $metric\n"; +	print $fh "START ITERATION:  $iteration\n"; +	print $fh "MAX ITERATIONS:   $max_iterations\n"; +	print $fh "MERT NODES:       $mert_nodes\n"; +	print $fh "DECODE NODES:     $decode_nodes\n"; +	print $fh "HEAD NODE:        $host\n"; +	print $fh "PMEM (DECODING):  $pmem\n"; +	print $fh "CLEANUP:          $cleanup\n"; +	print $fh "INITIAL WEIGHTS:  $initialWeights\n"; + +	if ($restart){ +		print $fh "PROJECTED BLEU:   $projected_score\n"; +		print $fh "BEST WEIGHTS:     $best_weights\n"; +	} +} + +sub update_weights_file { +  my ($neww, $rfn, $rpts) = @_; +  my @feats = @$rfn; +  my @pts = @$rpts; +  my $num_feats = scalar @feats; +  my $num_pts = scalar @pts; +  die "$num_feats (num_feats) != $num_pts (num_pts)" unless $num_feats == $num_pts; +  open G, ">$neww" or die; +  for (my $i = 0; $i < $num_feats; $i++) { +    my $f = $feats[$i]; +    my $lambda = $pts[$i]; +    print G "$f $lambda\n"; +  } +  close G; +} + +sub enseg { +	my $src = shift; +	my $newsrc = shift; +	open(SRC, $src); +	open(NEWSRC, ">$newsrc"); +	my $i=0; +	while (my $line=<SRC>){ +		chomp $line; +		print NEWSRC "<seg 
id=\"$i\">$line</seg>\n"; +		$i++; +	} +	close SRC; +	close NEWSRC; +} + +sub print_help { + +	my $executable = `basename $0`; chomp $executable; +    print << "Help"; + +Usage: $executable [options] <ini file> +       $executable --restart <work dir> + +	$executable [options] <ini file>  +		Runs a complete MERT optimization and test set decoding, using  +		the decoder configuration in ini file.  Note that many of the +		options have default values that are inferred automatically  +		based on certain conventions.  For details, refer to descriptions +		of the options --decoder, --weights, and --workdir.  + +	$executable --restart <work dir> +		Continues an optimization run that was stopped for some reason, +		using configuration information found in the working directory +		left behind by the stopped run. + +Options: + +	--local +		Run the decoder and optimizer locally. + +	--decoder <decoder path> +		Decoder binary to use. + +	--help +		Print this message and exit. + +	--iteration <I>  +		Starting iteration number.  If not specified, defaults to 1. + +	--max-iterations <M>  +		Maximum number of iterations to run.  If not specified, defaults +		to 10. + +	--pmem <N> +		Amount of physical memory requested for parallel decoding jobs. + +	--ref-files <files>  +		Dev set ref files.  This option takes only a single string argument.  +		To use multiple files (including file globbing), this argument should  +		be quoted. + +	--metric <method> +		Metric to optimize. +		Example values: IBM_BLEU, NIST_BLEU, Koehn_BLEU, TER, Combi + +	--normalize <feature-name> +		After each iteration, rescale all feature weights such that feature- +		name has a weight of 1.0. + +	--rand-directions <num> +		MERT will attempt to optimize along all of the principle directions, +		set this parameter to explore other directions. Defaults to 5. + +	--source-file <file>  +		Dev set source file. + +	--weights <file>  +		A file specifying initial feature weights.  
The format is +		FeatureName_1 value1 +		FeatureName_2 value2 + +	--workdir <dir>  +		Directory for intermediate and output files.  If not specified, the +		name is derived from the ini filename.  Assuming that the ini  +		filename begins with the decoder name and ends with ini, the default +		name of the working directory is inferred from the middle part of  +		the filename.  E.g. an ini file named decoder.foo.ini would have +		a default working directory name foo. + +Help +} + +sub convert { +  my ($str) = @_; +  my @ps = split /;/, $str; +  my %dict = (); +  for my $p (@ps) { +    my ($k, $v) = split /=/, $p; +    $dict{$k} = $v; +  } +  return %dict; +} + + diff --git a/vest/error_surface.cc b/vest/error_surface.cc new file mode 100644 index 00000000..4e0af35c --- /dev/null +++ b/vest/error_surface.cc @@ -0,0 +1,46 @@ +#include "error_surface.h" + +#include <cassert> +#include <sstream> + +using namespace std; + +ErrorSurface::~ErrorSurface() { +  for (ErrorSurface::iterator i = begin(); i != end(); ++i) +    //delete i->delta; +    ; +} + +void ErrorSurface::Serialize(std::string* out) const { +  const int segments = this->size(); +  ostringstream os(ios::binary); +  os.write((const char*)&segments,sizeof(segments)); +  for (int i = 0; i < segments; ++i) { +    const ErrorSegment& cur = (*this)[i]; +    string senc; +    cur.delta->Encode(&senc); +    assert(senc.size() < 256); +    unsigned char len = senc.size(); +    os.write((const char*)&cur.x, sizeof(cur.x)); +    os.write((const char*)&len, sizeof(len)); +    os.write((const char*)&senc[0], len); +  } +  *out = os.str(); +} + +void ErrorSurface::Deserialize(ScoreType type, const std::string& in) { +  istringstream is(in, ios::binary);  +  int segments; +  is.read((char*)&segments, sizeof(segments)); +  this->resize(segments); +  for (int i = 0; i < segments; ++i) { +    ErrorSegment& cur = (*this)[i]; +    unsigned char len; +    is.read((char*)&cur.x, sizeof(cur.x)); +    is.read((char*)&len, 
sizeof(len)); +    string senc(len, '\0'); assert(senc.size() == len); +    is.read((char*)&senc[0], len); +    cur.delta = SentenceScorer::CreateScoreFromString(type, senc); +  } +} + diff --git a/vest/error_surface.h b/vest/error_surface.h new file mode 100644 index 00000000..a8734f54 --- /dev/null +++ b/vest/error_surface.h @@ -0,0 +1,24 @@ +#ifndef _ERROR_SURFACE_H_ +#define _ERROR_SURFACE_H_ + +#include <vector> +#include <string> + +#include "scorer.h" + +class Score; + +struct ErrorSegment { +  double x; +  Score* delta; +  ErrorSegment() : x(0), delta(NULL) {} +}; + +class ErrorSurface : public std::vector<ErrorSegment> { + public: +  ~ErrorSurface(); +  void Serialize(std::string* out) const; +  void Deserialize(ScoreType type, const std::string& in); +}; + +#endif diff --git a/vest/fast_score.cc b/vest/fast_score.cc new file mode 100644 index 00000000..cf743b4f --- /dev/null +++ b/vest/fast_score.cc @@ -0,0 +1,74 @@ +#include <iostream> +#include <vector> + +#include <boost/program_options.hpp> +#include <boost/program_options/variables_map.hpp> + +#include "filelib.h" +#include "tdict.h" +#include "scorer.h" + +using namespace std; +namespace po = boost::program_options; + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { +  po::options_description opts("Configuration options"); +  opts.add_options() +        ("reference,r",po::value<vector<string> >(), "[REQD] Reference translation(s) (tokenized text file)") +        ("loss_function,l",po::value<string>()->default_value("ibm_bleu"), "Scoring metric (ibm_bleu, nist_bleu, koehn_bleu, ter, combi)") +        ("in_file,i", po::value<string>()->default_value("-"), "Input file") +        ("help,h", "Help"); +  po::options_description dcmdline_options; +  dcmdline_options.add(opts); +  po::store(parse_command_line(argc, argv, dcmdline_options), *conf); +  bool flag = false; +  if (!conf->count("reference")) { +    cerr << "Please specify one or more references using -r <REF1.TXT> -r 
<REF2.TXT> ...\n"; +    flag = true; +  } +  if (flag || conf->count("help")) { +    cerr << dcmdline_options << endl; +    exit(1); +  } +} + +int main(int argc, char** argv) { +  po::variables_map conf; +  InitCommandLine(argc, argv, &conf); +  const string loss_function = conf["loss_function"].as<string>(); +  ScoreType type = ScoreTypeFromString(loss_function); +  DocScorer ds(type, conf["reference"].as<vector<string> >(), ""); +  cerr << "Loaded " << ds.size() << " references for scoring with " << loss_function << endl; + +  ReadFile rf(conf["in_file"].as<string>()); +  Score* acc = NULL; +  istream& in = *rf.stream(); +  int lc = 0; +  while(in) { +    string line; +    getline(in, line); +    if (line.empty()) continue; +    vector<WordID> sent; +    TD::ConvertSentence(line, &sent); +    Score* sentscore = ds[lc]->ScoreCandidate(sent); +    if (!acc) { acc = sentscore->GetZero(); } +    acc->PlusEquals(*sentscore); +    delete sentscore; +    ++lc; +  } +  assert(lc > 0); +  if (lc > ds.size()) { +    cerr << "Too many (" << lc << ") translations in input, expected " << ds.size() << endl; +    return 1; +  } +  if (lc != ds.size()) +    cerr << "Fewer sentences in hyp (" << lc << ") than refs (" +         << ds.size() << "): scoring partial set!\n"; +  float score = acc->ComputeScore(); +  string details; +  acc->ScoreDetails(&details); +  delete acc; +  cerr << details << endl; +  cout << score << endl; +  return 0; +} diff --git a/vest/line_optimizer.cc b/vest/line_optimizer.cc new file mode 100644 index 00000000..b6410c35 --- /dev/null +++ b/vest/line_optimizer.cc @@ -0,0 +1,103 @@ +#include "line_optimizer.h" + +#include <limits> +#include <algorithm> + +#include "sparse_vector.h" +#include "scorer.h" + +using namespace std; + +typedef ErrorSurface::const_iterator ErrorIter; + +// sort by increasing x-ints +struct IntervalComp { +  bool operator() (const ErrorIter& a, const ErrorIter& b) const { +    return a->x < b->x; +  } +};	 + +double 
LineOptimizer::LineOptimize( +    const vector<ErrorSurface>& surfaces, +    const LineOptimizer::ScoreType type, +    float* best_score, +    const double epsilon) { +  // cerr << "MIN=" << MINIMIZE_SCORE << " MAX=" << MAXIMIZE_SCORE << "  MINE=" << type << endl; +  vector<ErrorIter> all_ints; +  for (vector<ErrorSurface>::const_iterator i = surfaces.begin(); +       i != surfaces.end(); ++i) { +    const ErrorSurface& surface = *i; +    for (ErrorIter j = surface.begin(); j != surface.end(); ++j) +      all_ints.push_back(j); +  } +  sort(all_ints.begin(), all_ints.end(), IntervalComp()); +  double last_boundary = all_ints.front()->x; +  Score* acc = all_ints.front()->delta->GetZero(); +  float& cur_best_score = *best_score; +  cur_best_score = (type == MAXIMIZE_SCORE ? +    -numeric_limits<float>::max() : numeric_limits<float>::max()); +  bool left_edge = true; +  double pos = numeric_limits<double>::quiet_NaN(); +  for (vector<ErrorIter>::iterator i = all_ints.begin(); +       i != all_ints.end(); ++i) { +    const ErrorSegment& seg = **i; +    assert(seg.delta); +    if (seg.x - last_boundary > epsilon) { +      float sco = acc->ComputeScore(); +      if ((type == MAXIMIZE_SCORE && sco > cur_best_score) || +          (type == MINIMIZE_SCORE && sco < cur_best_score) ) { +        cur_best_score = sco; +	if (left_edge) { +	  pos = seg.x - 0.1; +	  left_edge = false; +	} else { +	  pos = last_boundary + (seg.x - last_boundary) / 2; +	} +	// cerr << "NEW BEST: " << pos << "  (score=" << cur_best_score << ")\n"; +      } +      // string xx; acc->ScoreDetails(&xx); cerr << "---- " << xx; +      // cerr << "---- s=" << sco << "\n"; +      last_boundary = seg.x; +    } +    // cerr << "x-boundary=" << seg.x << "\n"; +    acc->PlusEquals(*seg.delta); +  } +  float sco = acc->ComputeScore(); +  if ((type == MAXIMIZE_SCORE && sco > cur_best_score) || +      (type == MINIMIZE_SCORE && sco < cur_best_score) ) { +    cur_best_score = sco; +    if (left_edge) { +      pos = 
0; +    } else { +      pos = last_boundary + 1000.0; +    } +  } +  delete acc; +  return pos; +} + +void LineOptimizer::RandomUnitVector(const vector<int>& features_to_optimize, +                                     SparseVector<double>* axis, +                                     RandomNumberGenerator<boost::mt19937>* rng) { +  axis->clear(); +  for (int i = 0; i < features_to_optimize.size(); ++i) +    axis->set_value(features_to_optimize[i], rng->next() - 0.5); +  (*axis) /= axis->l2norm(); +} + +void LineOptimizer::CreateOptimizationDirections( +     const vector<int>& features_to_optimize, +     int additional_random_directions, +     RandomNumberGenerator<boost::mt19937>* rng, +     vector<SparseVector<double> >* dirs) { +  const int num_directions = features_to_optimize.size() + additional_random_directions; +  dirs->resize(num_directions); +  for (int i = 0; i < num_directions; ++i) { +    SparseVector<double>& axis = (*dirs)[i]; +    if (i < features_to_optimize.size()) +      axis.set_value(features_to_optimize[i], 1.0); +    else +      RandomUnitVector(features_to_optimize, &axis, rng); +  } +  cerr << "Generated " << num_directions << " total axes to optimize along.\n"; +} diff --git a/vest/line_optimizer.h b/vest/line_optimizer.h new file mode 100644 index 00000000..43164360 --- /dev/null +++ b/vest/line_optimizer.h @@ -0,0 +1,44 @@ +#ifndef LINE_OPTIMIZER_H_ +#define LINE_OPTIMIZER_H_ + +#include <vector> + +#include "error_surface.h" +#include "sampler.h" + +template <typename T> class SparseVector; +class Weights; + +struct LineOptimizer { + +  // use MINIMIZE_SCORE for things like TER, WER +  // MAXIMIZE_SCORE for things like BLEU +  enum ScoreType { MAXIMIZE_SCORE, MINIMIZE_SCORE }; + +  // merge all the error surfaces together into a global +  // error surface and find (the middle of) the best segment +  static double LineOptimize( +     const std::vector<ErrorSurface>& envs, +     const LineOptimizer::ScoreType type, +     float* best_score, 
+     const double epsilon = 1.0/65536.0); + +  // return a random vector of length 1 where all dimensions +  // not listed in dimensions will be 0. +  static void RandomUnitVector(const std::vector<int>& dimensions, +                               SparseVector<double>* axis, +                               RandomNumberGenerator<boost::mt19937>* rng); + +  // generate a list of directions to optimize; the list will +  // contain the orthogonal vectors corresponding to the dimensions in +  // primary and then additional_random_directions directions in those +  // dimensions as well.  All vectors will be length 1. +  static void CreateOptimizationDirections( +     const std::vector<int>& primary, +     int additional_random_directions, +     RandomNumberGenerator<boost::mt19937>* rng, +     std::vector<SparseVector<double> >* dirs); + +}; + +#endif diff --git a/vest/lo_test.cc b/vest/lo_test.cc new file mode 100644 index 00000000..f937daac --- /dev/null +++ b/vest/lo_test.cc @@ -0,0 +1,201 @@ +#include <cmath> +#include <iostream> +#include <fstream> + +#include <boost/shared_ptr.hpp> +#include <gtest/gtest.h> + +#include "fdict.h" +#include "hg.h" +#include "kbest.h" +#include "hg_io.h" +#include "filelib.h" +#include "inside_outside.h" +#include "viterbi.h" +#include "viterbi_envelope.h" +#include "line_optimizer.h" +#include "scorer.h" + +using namespace std; +using boost::shared_ptr; + +class OptTest : public testing::Test { + protected: +   virtual void SetUp() { } +   virtual void TearDown() { } +}; + +const char* ref11 = "australia reopens embassy in manila"; +const char* ref12 = "( afp , manila , january 2 ) australia reopened its embassy in the philippines today , which was shut down about seven weeks ago due to what was described as a specific threat of a terrorist attack ."; +const char* ref21 = "australia reopened manila embassy"; +const char* ref22 = "( agence france-presse , manila , 2nd ) - australia reopened its embassy in the philippines today . 
the embassy was closed seven weeks ago after what was described as a specific threat of a terrorist attack ."; +const char* ref31 = "australia to reopen embassy in manila"; +const char* ref32 = "( afp report from manila , january 2 ) australia reopened its embassy in the philippines today . seven weeks ago , the embassy was shut down due to so - called confirmed terrorist attack threats ."; +const char* ref41 = "australia to re - open its embassy to manila"; +const char* ref42 = "( afp , manila , thursday ) australia reopens its embassy to manila , which was closed for the so - called \" clear \" threat of terrorist attack 7 weeks ago ."; + +TEST_F(OptTest, TestCheckNaN) { +  double x = 0; +  double y = 0; +  double z = x / y; +  EXPECT_EQ(true, isnan(z)); +} + +TEST_F(OptTest,TestViterbiEnvelope) { +  shared_ptr<Segment> a1(new Segment(-1, 0)); +  shared_ptr<Segment> b1(new Segment(1, 0)); +  shared_ptr<Segment> a2(new Segment(-1, 1)); +  shared_ptr<Segment> b2(new Segment(1, -1)); +  vector<shared_ptr<Segment> > sa; sa.push_back(a1); sa.push_back(b1); +  vector<shared_ptr<Segment> > sb; sb.push_back(a2); sb.push_back(b2); +  ViterbiEnvelope a(sa); +  cerr << a << endl; +  ViterbiEnvelope b(sb); +  ViterbiEnvelope c = a; +  c *= b; +  cerr << a << " (*) " << b << " = " << c << endl; +  EXPECT_EQ(3, c.size()); +} + +TEST_F(OptTest,TestViterbiEnvelopeInside) { +  const string json = "{\"rules\":[1,\"[X] ||| a\",2,\"[X] ||| A [1]\",3,\"[X] ||| c\",4,\"[X] ||| C [1]\",5,\"[X] ||| [1] B [2]\",6,\"[X] ||| [1] b [2]\",7,\"[X] ||| X [1]\",8,\"[X] ||| Z 
[1]\"],\"features\":[\"f1\",\"f2\",\"Feature_1\",\"Feature_0\",\"Model_0\",\"Model_1\",\"Model_2\",\"Model_3\",\"Model_4\",\"Model_5\",\"Model_6\",\"Model_7\"],\"edges\":[{\"tail\":[],\"feats\":[],\"rule\":1}],\"node\":{\"in_edges\":[0]},\"edges\":[{\"tail\":[0],\"feats\":[0,-0.8,1,-0.1],\"rule\":2}],\"node\":{\"in_edges\":[1]},\"edges\":[{\"tail\":[],\"feats\":[1,-1],\"rule\":3}],\"node\":{\"in_edges\":[2]},\"edges\":[{\"tail\":[2],\"feats\":[0,-0.2,1,-0.1],\"rule\":4}],\"node\":{\"in_edges\":[3]},\"edges\":[{\"tail\":[1,3],\"feats\":[0,-1.2,1,-0.2],\"rule\":5},{\"tail\":[1,3],\"feats\":[0,-0.5,1,-1.3],\"rule\":6}],\"node\":{\"in_edges\":[4,5]},\"edges\":[{\"tail\":[4],\"feats\":[0,-0.5,1,-0.8],\"rule\":7},{\"tail\":[4],\"feats\":[0,-0.7,1,-0.9],\"rule\":8}],\"node\":{\"in_edges\":[6,7]}}"; +  Hypergraph hg; +  istringstream instr(json); +  HypergraphIO::ReadFromJSON(&instr, &hg); +  SparseVector<double> wts; +  wts.set_value(FD::Convert("f1"), 0.4); +  wts.set_value(FD::Convert("f2"), 1.0); +  hg.Reweight(wts); +  vector<pair<vector<WordID>, prob_t> > list; +  std::vector<SparseVector<double> > features; +  KBest::KBestDerivations<vector<WordID>, ESentenceTraversal> kbest(hg, 10); +  for (int i = 0; i < 10; ++i) { +    const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal>::Derivation* d = +      kbest.LazyKthBest(hg.nodes_.size() - 1, i); +    if (!d) break; +    cerr << log(d->score) << " ||| " << TD::GetString(d->yield) << " ||| " << d->feature_values << endl; +  } +  SparseVector<double> dir; dir.set_value(FD::Convert("f1"), 1.0); +  ViterbiEnvelopeWeightFunction wf(wts, dir); +  ViterbiEnvelope env = Inside<ViterbiEnvelope, ViterbiEnvelopeWeightFunction>(hg, NULL, wf); +  cerr << env << endl; +  const vector<boost::shared_ptr<Segment> >& segs = env.GetSortedSegs(); +  dir *= segs[1]->x; +  wts += dir; +  hg.Reweight(wts); +  KBest::KBestDerivations<vector<WordID>, ESentenceTraversal> kbest2(hg, 10); +  for (int i = 0; i < 10; ++i) { +    const 
KBest::KBestDerivations<vector<WordID>, ESentenceTraversal>::Derivation* d = +      kbest2.LazyKthBest(hg.nodes_.size() - 1, i); +    if (!d) break; +    cerr << log(d->score) << " ||| " << TD::GetString(d->yield) << " ||| " << d->feature_values << endl; +  } +  for (int i = 0; i < segs.size(); ++i) { +    cerr << "seg=" << i << endl; +    vector<WordID> trans; +    segs[i]->ConstructTranslation(&trans); +    cerr << TD::GetString(trans) << endl; +  } +} + +TEST_F(OptTest, TestS1) {  +  int fPhraseModel_0 = FD::Convert("PhraseModel_0"); +  int fPhraseModel_1 = FD::Convert("PhraseModel_1"); +  int fPhraseModel_2 = FD::Convert("PhraseModel_2"); +  int fLanguageModel = FD::Convert("LanguageModel"); +  int fWordPenalty = FD::Convert("WordPenalty"); +  int fPassThrough = FD::Convert("PassThrough"); +  SparseVector<double> wts; +  wts.set_value(fWordPenalty, 4.25); +  wts.set_value(fLanguageModel, -1.1165); +  wts.set_value(fPhraseModel_0, -0.96); +  wts.set_value(fPhraseModel_1, -0.65); +  wts.set_value(fPhraseModel_2, -0.77); +  wts.set_value(fPassThrough, -10.0); + +  vector<int> to_optimize; +  to_optimize.push_back(fWordPenalty); +  to_optimize.push_back(fLanguageModel); +  to_optimize.push_back(fPhraseModel_0); +  to_optimize.push_back(fPhraseModel_1); +  to_optimize.push_back(fPhraseModel_2); + +  Hypergraph hg; +  ReadFile rf("./test_data/0.json.gz"); +  HypergraphIO::ReadFromJSON(rf.stream(), &hg); +  hg.Reweight(wts); + +  Hypergraph hg2; +  ReadFile rf2("./test_data/1.json.gz"); +  HypergraphIO::ReadFromJSON(rf2.stream(), &hg2); +  hg2.Reweight(wts); + +  vector<vector<WordID> > refs1(4); +  TD::ConvertSentence(ref11, &refs1[0]); +  TD::ConvertSentence(ref21, &refs1[1]); +  TD::ConvertSentence(ref31, &refs1[2]); +  TD::ConvertSentence(ref41, &refs1[3]); +  vector<vector<WordID> > refs2(4); +  TD::ConvertSentence(ref12, &refs2[0]); +  TD::ConvertSentence(ref22, &refs2[1]); +  TD::ConvertSentence(ref32, &refs2[2]); +  TD::ConvertSentence(ref42, &refs2[3]); +  
ScoreType type = ScoreTypeFromString("ibm_bleu"); +  SentenceScorer* scorer1 = SentenceScorer::CreateSentenceScorer(type, refs1); +  SentenceScorer* scorer2 = SentenceScorer::CreateSentenceScorer(type, refs2); +  vector<ViterbiEnvelope> envs(2); +   +  RandomNumberGenerator<boost::mt19937> rng; + +  vector<SparseVector<double> > axes; +  LineOptimizer::CreateOptimizationDirections( +     to_optimize, +     10, +     &rng, +     &axes); +  assert(axes.size() == 10 + to_optimize.size()); +  for (int i = 0; i < axes.size(); ++i) +    cerr << axes[i] << endl; +  const SparseVector<double>& axis = axes[0]; + +  cerr << "Computing Viterbi envelope using inside algorithm...\n"; +  cerr << "axis: " << axis << endl; +  clock_t t_start=clock(); +  ViterbiEnvelopeWeightFunction wf(wts, axis); +  envs[0] = Inside<ViterbiEnvelope, ViterbiEnvelopeWeightFunction>(hg, NULL, wf); +  envs[1] = Inside<ViterbiEnvelope, ViterbiEnvelopeWeightFunction>(hg2, NULL, wf); + +  vector<ErrorSurface> es(2); +  scorer1->ComputeErrorSurface(envs[0], &es[0], IBM_BLEU, hg); +  scorer2->ComputeErrorSurface(envs[1], &es[1], IBM_BLEU, hg2); +  cerr << envs[0].size() << " " << envs[1].size() << endl; +  cerr << es[0].size() << " " << es[1].size() << endl; +  envs.clear(); +  clock_t t_env=clock(); +  float score; +  double m = LineOptimizer::LineOptimize(es, LineOptimizer::MAXIMIZE_SCORE, &score); +  clock_t t_opt=clock(); +  cerr << "line optimizer returned: " << m << " (SCORE=" << score << ")\n"; +  EXPECT_FLOAT_EQ(0.48719698, score); +  SparseVector<double> res = axis; +  res *= m; +  res += wts; +  cerr << "res: " << res << endl; +  cerr << "ENVELOPE PROCESSING=" << (static_cast<double>(t_env - t_start) / 1000.0) << endl; +  cerr << "  LINE OPTIMIZATION=" << (static_cast<double>(t_opt - t_env) / 1000.0) << endl; +  hg.Reweight(res); +  hg2.Reweight(res); +  vector<WordID> t1,t2; +  ViterbiESentence(hg, &t1); +  ViterbiESentence(hg2, &t2); +  cerr << TD::GetString(t1) << endl; +  cerr << 
TD::GetString(t2) << endl; +  delete scorer1; +  delete scorer2; +} + +int main(int argc, char **argv) { +  testing::InitGoogleTest(&argc, argv); +  return RUN_ALL_TESTS(); +} + diff --git a/vest/mbr_kbest.cc b/vest/mbr_kbest.cc new file mode 100644 index 00000000..5d70b4e2 --- /dev/null +++ b/vest/mbr_kbest.cc @@ -0,0 +1,140 @@ +#include <iostream> +#include <vector> + +#include <boost/program_options.hpp> + +#include "prob.h" +#include "tdict.h" +#include "scorer.h" +#include "filelib.h" +#include "stringlib.h" + +using namespace std; + +namespace po = boost::program_options; + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { +  po::options_description opts("Configuration options"); +  opts.add_options() +        ("scale,a",po::value<double>()->default_value(1.0), "Posterior scaling factor (alpha)") +        ("loss_function,l",po::value<string>()->default_value("bleu"), "Loss function") +        ("input,i",po::value<string>()->default_value("-"), "File to read k-best lists from") +        ("output_list,L", "Show reranked list as output") +        ("help,h", "Help"); +  po::options_description dcmdline_options; +  dcmdline_options.add(opts); +  po::store(parse_command_line(argc, argv, dcmdline_options), *conf); +  bool flag = false; +  if (flag || conf->count("help")) { +    cerr << dcmdline_options << endl; +    exit(1); +  } +} + +struct LossComparer { +  bool operator()(const pair<vector<WordID>, double>& a, const pair<vector<WordID>, double>& b) const { +    return a.second < b.second; +  } +}; + +bool ReadKBestList(istream* in, string* sent_id, vector<pair<vector<WordID>, prob_t> >* list) { +  static string cache_id; +  static pair<vector<WordID>, prob_t> cache_pair; +  list->clear(); +  string cur_id; +  if (cache_pair.first.size() > 0) { +    list->push_back(cache_pair); +    cur_id = cache_id; +    cache_pair.first.clear(); +  } +  string line; +  string tstr; +  while(*in) { +    getline(*in, line); +    if (line.empty()) continue; 
+    size_t p1 = line.find(" ||| "); +    if (p1 == string::npos) { cerr << "Bad format: " << line << endl; abort(); } +    size_t p2 = line.find(" ||| ", p1 + 4); +    if (p2 == string::npos) { cerr << "Bad format: " << line << endl; abort(); } +    size_t p3 = line.rfind(" ||| "); +    cache_id = line.substr(0, p1); +    tstr = line.substr(p1 + 5, p2 - p1 - 5); +    double val = strtod(line.substr(p3 + 5).c_str(), NULL); +    TD::ConvertSentence(tstr, &cache_pair.first); +    cache_pair.second.logeq(val); +    if (cur_id.empty()) cur_id = cache_id; +    if (cur_id == cache_id) { +      list->push_back(cache_pair); +      *sent_id = cur_id; +      cache_pair.first.clear(); +    } else { break; } +  } +  return !list->empty(); +} + +int main(int argc, char** argv) { +  po::variables_map conf; +  InitCommandLine(argc, argv, &conf); +  const string metric = conf["loss_function"].as<string>(); +  const bool output_list = conf.count("output_list") > 0; +  const string file = conf["input"].as<string>(); +  const double mbr_scale = conf["scale"].as<double>(); +  cerr << "Posterior scaling factor (alpha) = " << mbr_scale << endl; + +  ScoreType type = ScoreTypeFromString(metric); +  vector<pair<vector<WordID>, prob_t> > list; +  ReadFile rf(file); +  string sent_id; +  while(ReadKBestList(rf.stream(), &sent_id, &list)) { +    vector<prob_t> joints(list.size()); +    const prob_t max_score = pow(list.front().second, mbr_scale); +    prob_t marginal = prob_t::Zero(); +    for (int i = 0 ; i < list.size(); ++i) { +      const prob_t joint = pow(list[i].second, mbr_scale) / max_score; +      joints[i] = joint; +      // cerr << "list[" << i << "] joint=" << log(joint) << endl; +      marginal += joint; +    } +    int mbr_idx = -1; +    vector<double> mbr_scores(output_list ? 
list.size() : 0); +    double mbr_loss = numeric_limits<double>::max(); +    for (int i = 0 ; i < list.size(); ++i) { +      vector<vector<WordID> > refs(1, list[i].first); +      //cerr << i << ": " << list[i].second <<"\t" << TD::GetString(list[i].first) << endl; +      SentenceScorer* scorer = SentenceScorer::CreateSentenceScorer(type, refs); +      double wl_acc = 0; +      for (int j = 0; j < list.size(); ++j) { +        if (i != j) { +          Score* s = scorer->ScoreCandidate(list[j].first); +          double loss = 1.0 - s->ComputeScore(); +          if (type == TER || type == AER) loss = 1.0 - loss; +          delete s; +          double weighted_loss = loss * (joints[j] / marginal); +          wl_acc += weighted_loss; +          if ((!output_list) && wl_acc > mbr_loss) break; +        } +      } +      if (output_list) mbr_scores[i] = wl_acc; +      if (wl_acc < mbr_loss) { +        mbr_loss = wl_acc; +        mbr_idx = i; +      } +      delete scorer; +    } +    // cerr << "ML translation: " << TD::GetString(list[0].first) << endl; +    cerr << "MBR Best idx: " << mbr_idx << endl; +    if (output_list) { +      for (int i = 0; i < list.size(); ++i) +        list[i].second.logeq(mbr_scores[i]); +      sort(list.begin(), list.end(), LossComparer()); +      for (int i = 0; i < list.size(); ++i) +        cout << sent_id << " ||| " +             << TD::GetString(list[i].first) << " ||| " +             << log(list[i].second) << endl; +    } else { +      cout << TD::GetString(list[mbr_idx].first) << endl; +    } +  } +  return 0; +} + diff --git a/vest/mr_vest_generate_mapper_input.cc b/vest/mr_vest_generate_mapper_input.cc new file mode 100644 index 00000000..c96a61e4 --- /dev/null +++ b/vest/mr_vest_generate_mapper_input.cc @@ -0,0 +1,72 @@ +#include <iostream> +#include <vector> + +#include <boost/program_options.hpp> +#include <boost/program_options/variables_map.hpp> + +#include "filelib.h" +#include "weights.h" +#include "line_optimizer.h" + +using 
namespace std; +namespace po = boost::program_options; + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { +  po::options_description opts("Configuration options"); +  opts.add_options() +        ("dev_set_size,s",po::value<unsigned int>(),"[REQD] Development set size (# of parallel sentences)") +        ("forest_repository,r",po::value<string>(),"[REQD] Path to forest repository") +        ("weights,w",po::value<string>(),"[REQD] Current feature weights file") +        ("optimize_feature,o",po::value<vector<string> >(), "Feature to optimize (if none specified, all weights listed in the weights file will be optimized)") +        ("random_directions,d",po::value<unsigned int>()->default_value(20),"Number of random directions to run the line optimizer in") +        ("help,h", "Help"); +  po::options_description dcmdline_options; +  dcmdline_options.add(opts); +  po::store(parse_command_line(argc, argv, dcmdline_options), *conf); +  bool flag = false; +  if (conf->count("dev_set_size") == 0) { +    cerr << "Please specify the size of the development set using -d N\n"; +    flag = true; +  } +  if (conf->count("weights") == 0) { +    cerr << "Please specify the starting-point weights using -w <weightfile.txt>\n"; +    flag = true; +  } +  if (conf->count("forest_repository") == 0) { +    cerr << "Please specify the forest repository location using -r <DIR>\n"; +    flag = true; +  } +  if (flag || conf->count("help")) { +    cerr << dcmdline_options << endl; +    exit(1); +  } +} + +int main(int argc, char** argv) { +  RandomNumberGenerator<boost::mt19937> rng; +  po::variables_map conf; +  InitCommandLine(argc, argv, &conf); +  Weights weights; +  vector<string> features; +  weights.InitFromFile(conf["weights"].as<string>(), &features); +  const string forest_repository = conf["forest_repository"].as<string>(); +  assert(DirectoryExists(forest_repository)); +  SparseVector<double> origin; +  weights.InitSparseVector(&origin); +  if 
(conf.count("optimize_feature") > 0) +    features=conf["optimize_feature"].as<vector<string> >(); +  vector<SparseVector<double> > axes; +  vector<int> fids(features.size()); +  for (int i = 0; i < features.size(); ++i) +    fids[i] = FD::Convert(features[i]); +  LineOptimizer::CreateOptimizationDirections( +     fids, +     conf["random_directions"].as<unsigned int>(), +     &rng, +     &axes); +  int dev_set_size = conf["dev_set_size"].as<unsigned int>(); +  for (int i = 0; i < dev_set_size; ++i) +    for (int j = 0; j < axes.size(); ++j) +      cout << forest_repository << '/' << i << ".json.gz " << i << ' ' << origin << ' ' << axes[j] << endl; +  return 0; +} diff --git a/vest/mr_vest_map.cc b/vest/mr_vest_map.cc new file mode 100644 index 00000000..d1ba159f --- /dev/null +++ b/vest/mr_vest_map.cc @@ -0,0 +1,99 @@ +#include <sstream> +#include <iostream> +#include <fstream> +#include <vector> + +#include <boost/program_options.hpp> +#include <boost/program_options/variables_map.hpp> + +#include "filelib.h" +#include "stringlib.h" +#include "sparse_vector.h" +#include "scorer.h" +#include "viterbi_envelope.h" +#include "inside_outside.h" +#include "error_surface.h" +#include "hg.h" +#include "hg_io.h" + +using namespace std; +namespace po = boost::program_options; + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { +  po::options_description opts("Configuration options"); +  opts.add_options() +        ("reference,r",po::value<vector<string> >(), "[REQD] Reference translation (tokenized text)") +        ("source,s",po::value<string>(), "Source file (ignored, except for AER)") +        ("loss_function,l",po::value<string>()->default_value("ibm_bleu"), "Loss function being optimized") +        ("help,h", "Help"); +  po::options_description dcmdline_options; +  dcmdline_options.add(opts); +  po::store(parse_command_line(argc, argv, dcmdline_options), *conf); +  bool flag = false; +  if (!conf->count("reference")) { +    cerr << "Please 
specify one or more references using -r <REF.TXT>\n"; +    flag = true; +  } +  if (flag || conf->count("help")) { +    cerr << dcmdline_options << endl; +    exit(1); +  } +} + +bool ReadSparseVectorString(const string& s, SparseVector<double>* v) { +  vector<string> fields; +  Tokenize(s, ';', &fields); +  if (fields.empty()) return false; +  for (int i = 0; i < fields.size(); ++i) { +    vector<string> pair(2); +    Tokenize(fields[i], '=', &pair); +    if (pair.size() != 2) { +      cerr << "Error parsing vector string: " << fields[i] << endl; +      return false; +    } +    v->set_value(FD::Convert(pair[0]), atof(pair[1].c_str())); +  } +  return true; +} + +int main(int argc, char** argv) { +  po::variables_map conf; +  InitCommandLine(argc, argv, &conf); +  const string loss_function = conf["loss_function"].as<string>(); +  ScoreType type = ScoreTypeFromString(loss_function); +  DocScorer ds(type, conf["reference"].as<vector<string> >(), conf["source"].as<string>()); +  cerr << "Loaded " << ds.size() << " references for scoring with " << loss_function << endl; +  Hypergraph hg; +  string last_file; +  while(cin) { +    string line; +    getline(cin, line); +    if (line.empty()) continue; +    istringstream is(line); +    int sent_id; +    string file, s_origin, s_axis; +    is >> file >> sent_id >> s_origin >> s_axis; +    SparseVector<double> origin; +    assert(ReadSparseVectorString(s_origin, &origin)); +    SparseVector<double> axis; +    assert(ReadSparseVectorString(s_axis, &axis)); +    // cerr << "File: " << file << "\nAxis: " << axis << "\n   X: " << origin << endl; +    if (last_file != file) { +      last_file = file; +      ReadFile rf(file); +      HypergraphIO::ReadFromJSON(rf.stream(), &hg); +    } +    ViterbiEnvelopeWeightFunction wf(origin, axis); +    ViterbiEnvelope ve = Inside<ViterbiEnvelope, ViterbiEnvelopeWeightFunction>(hg, NULL, wf); +    ErrorSurface es; +    ds[sent_id]->ComputeErrorSurface(ve, &es, type, hg); +    //cerr << 
"Viterbi envelope has " << ve.size() << " segments\n"; +    // cerr << "Error surface has " << es.size() << " segments\n"; +    string val; +    es.Serialize(&val); +    cout << 'M' << ' ' << s_origin << ' ' << s_axis << '\t'; +    B64::b64encode(val.c_str(), val.size(), &cout); +    cout << endl; +  } +  return 0; +} diff --git a/vest/mr_vest_reduce.cc b/vest/mr_vest_reduce.cc new file mode 100644 index 00000000..5efcc19a --- /dev/null +++ b/vest/mr_vest_reduce.cc @@ -0,0 +1,81 @@ +#include <sstream> +#include <iostream> +#include <fstream> +#include <vector> + +#include <boost/program_options.hpp> +#include <boost/program_options/variables_map.hpp> + +#include "sparse_vector.h" +#include "error_surface.h" +#include "line_optimizer.h" +#include "hg_io.h" + +using namespace std; +namespace po = boost::program_options; + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { +  po::options_description opts("Configuration options"); +  opts.add_options() +        ("loss_function,l",po::value<string>(), "Loss function being optimized") +        ("help,h", "Help"); +  po::options_description dcmdline_options; +  dcmdline_options.add(opts); +  po::store(parse_command_line(argc, argv, dcmdline_options), *conf); +  bool flag = conf->count("loss_function") == 0; +  if (flag || conf->count("help")) { +    cerr << dcmdline_options << endl; +    exit(1); +  } +} + +int main(int argc, char** argv) { +  po::variables_map conf; +  InitCommandLine(argc, argv, &conf); +  const string loss_function = conf["loss_function"].as<string>(); +  ScoreType type = ScoreTypeFromString(loss_function); +  LineOptimizer::ScoreType opt_type = LineOptimizer::MAXIMIZE_SCORE; +  if (type == TER || type == AER) { +    opt_type = LineOptimizer::MINIMIZE_SCORE; +  } +  string last_key; +  vector<ErrorSurface> esv; +  while(cin) { +    string line; +    getline(cin, line); +    if (line.empty()) continue; +    size_t ks = line.find("\t"); +    assert(string::npos != ks); +    assert(ks 
> 2); +    string key = line.substr(2, ks - 2); +    string val = line.substr(ks + 1); +    if (key != last_key) { +      if (!last_key.empty()) { +	float score; +        double x = LineOptimizer::LineOptimize(esv, opt_type, &score); +	cout << last_key << "|" << x << "|" << score << endl; +      } +      last_key = key; +      esv.clear(); +    } +    if (val.size() % 4 != 0) { +      cerr << "B64 encoding error 1! Skipping.\n"; +      continue; +    } +    string encoded(val.size() / 4 * 3, '\0'); +    if (!B64::b64decode(reinterpret_cast<const unsigned char*>(&val[0]), val.size(), &encoded[0], encoded.size())) { +      cerr << "B64 encoding error 2! Skipping.\n"; +      continue; +    } +    esv.push_back(ErrorSurface()); +    esv.back().Deserialize(type, encoded); +  } +  if (!esv.empty()) { +    // cerr << "ESV=" << esv.size() << endl; +    // for (int i = 0; i < esv.size(); ++i) { cerr << esv[i].size() << endl; } +    float score; +    double x = LineOptimizer::LineOptimize(esv, opt_type, &score); +    cout << last_key << "|" << x << "|" << score << endl; +  } +  return 0; +} diff --git a/vest/scorer.cc b/vest/scorer.cc new file mode 100644 index 00000000..53a17917 --- /dev/null +++ b/vest/scorer.cc @@ -0,0 +1,534 @@ +#include "scorer.h" + +#include <map> +#include <sstream> +#include <iostream> +#include <fstream> +#include <cstdio> +#include <valarray> + +#include <boost/shared_ptr.hpp> + +#include "filelib.h" +#include "aligner.h" +#include "viterbi_envelope.h" +#include "error_surface.h" +#include "ter.h" +#include "aer_scorer.h" +#include "comb_scorer.h" +#include "tdict.h" +#include "stringlib.h" +#include "lattice.h" + +using boost::shared_ptr; +using namespace std; + +const bool minimize_segments = true;    // if adjacent segments have equal scores, merge them + +ScoreType ScoreTypeFromString(const string& st) { +  const string sl = LowercaseString(st); +  if (sl == "ser") +    return SER; +  if (sl == "ter") +    return TER; +  if (sl == "aer") +    
return AER; +  if (sl == "bleu" || sl == "ibm_bleu") +    return IBM_BLEU; +  if (sl == "nist_bleu") +    return NIST_BLEU; +  if (sl == "koehn_bleu") +    return Koehn_BLEU; +  if (sl == "combi") +    return BLEU_minus_TER_over_2; +  cerr << "Don't understand score type '" << sl << "', defaulting to ibm_bleu.\n"; +  return IBM_BLEU; +} + +Score::~Score() {} +SentenceScorer::~SentenceScorer() {} +const std::string* SentenceScorer::GetSource() const { return NULL; } + +class SERScore : public Score { +  friend class SERScorer; + public: +  SERScore() : correct(0), total(0) {} +  float ComputeScore() const { +    return static_cast<float>(correct) / static_cast<float>(total); +  } +  void ScoreDetails(string* details) const { +    ostringstream os; +    os << "SER= " << ComputeScore() << " (" << correct << '/' << total << ')'; +    *details = os.str(); +  } +  void PlusEquals(const Score& delta) { +    correct += static_cast<const SERScore&>(delta).correct; +    total += static_cast<const SERScore&>(delta).total; +  } +  Score* GetZero() const { return new SERScore; } +  void Subtract(const Score& rhs, Score* res) const { +    SERScore* r = static_cast<SERScore*>(res); +    r->correct = correct - static_cast<const SERScore&>(rhs).correct; +    r->total = total - static_cast<const SERScore&>(rhs).total; +  } +  void Encode(string* out) const { +    assert(!"not implemented"); +  } +  bool IsAdditiveIdentity() const { +    return (total == 0 && correct == 0);  // correct is always 0 <= n <= total +  } + private: +  int correct, total; +}; + +class SERScorer : public SentenceScorer { + public: +  SERScorer(const vector<vector<WordID> >& references) : refs_(references) {} +  Score* ScoreCandidate(const vector<WordID>& hyp) const { +    SERScore* res = new SERScore; +    res->total = 1; +    for (int i = 0; i < refs_.size(); ++i) +      if (refs_[i] == hyp) res->correct = 1; +    return res; +  } +  static Score* ScoreFromString(const string& data) { +    assert(!"Not 
implemented"); +  } + private: +  vector<vector<WordID> > refs_; +}; + +class BLEUScore : public Score { +  friend class BLEUScorerBase; + public: +  BLEUScore(int n) : correct_ngram_hit_counts(0,n), hyp_ngram_counts(0,n) { +    ref_len = 0; +    hyp_len = 0; } +  float ComputeScore() const; +  void ScoreDetails(string* details) const; +  void PlusEquals(const Score& delta); +  Score* GetZero() const; +  void Subtract(const Score& rhs, Score* res) const; +  void Encode(string* out) const; +  bool IsAdditiveIdentity() const { +    if (fabs(ref_len) > 0.1f || hyp_len != 0) return false; +    for (int i = 0; i < correct_ngram_hit_counts.size(); ++i) +      if (hyp_ngram_counts[i] != 0 || +        correct_ngram_hit_counts[i] != 0) return false; +    return true; +  } + private: +  float ComputeScore(vector<float>* precs, float* bp) const; +  valarray<int> correct_ngram_hit_counts; +  valarray<int> hyp_ngram_counts; +  float ref_len; +  int hyp_len; +}; + +class BLEUScorerBase : public SentenceScorer { + public: +  BLEUScorerBase(const vector<vector<WordID> >& references, +             int n +             ); +  Score* ScoreCandidate(const vector<WordID>& hyp) const; +  static Score* ScoreFromString(const string& in); + + protected: +  virtual float ComputeRefLength(const vector<WordID>& hyp) const = 0; + private: +  struct NGramCompare { +    int operator() (const vector<WordID>& a, const vector<WordID>& b) { +      size_t as = a.size(); +      size_t bs = b.size(); +      const size_t s = (as < bs ? 
as : bs); +      for (size_t i = 0; i < s; ++i) { +         int d = a[i] - b[i]; +         if (d < 0) return true; +	 if (d > 0) return false; +      } +      return as < bs; +    } +  }; +  typedef map<vector<WordID>, pair<int,int>, NGramCompare> NGramCountMap; +  void CountRef(const vector<WordID>& ref) { +    NGramCountMap tc; +    vector<WordID> ngram(n_); +    int s = ref.size(); +    for (int j=0; j<s; ++j) { +      int remaining = s-j; +      int k = (n_ < remaining ? n_ : remaining); +      ngram.clear(); +      for (int i=1; i<=k; ++i) { +	ngram.push_back(ref[j + i - 1]); +        tc[ngram].first++; +      } +    } +    for (NGramCountMap::iterator i = tc.begin(); i != tc.end(); ++i) { +      pair<int,int>& p = ngrams_[i->first]; +      if (p.first < i->second.first) +        p = i->second; +    } +  } + +  void ComputeNgramStats(const vector<WordID>& sent, +       valarray<int>* correct, +       valarray<int>* hyp) const { +    assert(correct->size() == n_); +    assert(hyp->size() == n_); +    vector<WordID> ngram(n_); +    (*correct) *= 0; +    (*hyp) *= 0; +    int s = sent.size(); +    for (int j=0; j<s; ++j) { +      int remaining = s-j; +      int k = (n_ < remaining ? n_ : remaining); +      ngram.clear(); +      for (int i=1; i<=k; ++i) { +	ngram.push_back(sent[j + i - 1]); +        pair<int,int>& p = ngrams_[ngram]; +        if (p.second < p.first) { +          ++p.second; +          (*correct)[i-1]++; +        } +	// if the 1 gram isn't found, don't try to match don't need to match any 2- 3- .. 
grams: +	if (!p.first) { +	  for (; i<=k; ++i) +	    (*hyp)[i-1]++; +	} else { +          (*hyp)[i-1]++; +        } +      } +    } +  } + +  mutable NGramCountMap ngrams_; +  int n_; +  vector<int> lengths_; +}; + +Score* BLEUScorerBase::ScoreFromString(const string& in) { +  istringstream is(in); +  int n; +  is >> n; +  BLEUScore* r = new BLEUScore(n); +  is >> r->ref_len >> r->hyp_len; + +  for (int i = 0; i < n; ++i) { +    is >> r->correct_ngram_hit_counts[i]; +    is >> r->hyp_ngram_counts[i]; +  } +  return r; +} + +class IBM_BLEUScorer : public BLEUScorerBase { + public: +    IBM_BLEUScorer(const vector<vector<WordID> >& references, +		   int n=4) : BLEUScorerBase(references, n), lengths_(references.size()) { +   for (int i=0; i < references.size(); ++i) +     lengths_[i] = references[i].size(); + }    + protected: +  float ComputeRefLength(const vector<WordID>& hyp) const { +    if (lengths_.size() == 1) return lengths_[0]; +    int bestd = 2000000; +    int hl = hyp.size(); +    int bl = -1; +    for (vector<int>::const_iterator ci = lengths_.begin(); ci != lengths_.end(); ++ci) { +      int cl = *ci; +      if (abs(cl - hl) < bestd) { +        bestd = abs(cl - hl); +        bl = cl; +      } +    } +    return bl; +  } + private: +  vector<int> lengths_; +}; + +class NIST_BLEUScorer : public BLEUScorerBase { + public: +    NIST_BLEUScorer(const vector<vector<WordID> >& references, +                    int n=4) : BLEUScorerBase(references, n), +		    shortest_(references[0].size()) { +   for (int i=1; i < references.size(); ++i) +     if (references[i].size() < shortest_) +       shortest_ = references[i].size(); + } + protected: +  float ComputeRefLength(const vector<WordID>& hyp) const { +    return shortest_; +  } + private: +  float shortest_; +}; + +class Koehn_BLEUScorer : public BLEUScorerBase { + public: +    Koehn_BLEUScorer(const vector<vector<WordID> >& references, +                     int n=4) : BLEUScorerBase(references, n), +               
      avg_(0) { +   for (int i=0; i < references.size(); ++i) +     avg_ += references[i].size(); +   avg_ /= references.size(); + } + protected: +  float ComputeRefLength(const vector<WordID>& hyp) const { +    return avg_; +  } + private: +  float avg_; +}; + +SentenceScorer* SentenceScorer::CreateSentenceScorer(const ScoreType type, +      const vector<vector<WordID> >& refs, +      const string& src) { +  switch (type) { +    case IBM_BLEU: return new IBM_BLEUScorer(refs, 4); +    case NIST_BLEU: return new NIST_BLEUScorer(refs, 4); +    case Koehn_BLEU: return new Koehn_BLEUScorer(refs, 4); +    case AER: return new AERScorer(refs, src); +    case TER: return new TERScorer(refs); +    case SER: return new SERScorer(refs); +    case BLEU_minus_TER_over_2: return new BLEUTERCombinationScorer(refs); +    default: +      assert(!"Not implemented!"); +  } +} + +Score* SentenceScorer::CreateScoreFromString(const ScoreType type, const string& in) { +  switch (type) { +    case IBM_BLEU: +    case NIST_BLEU: +    case Koehn_BLEU: +      return BLEUScorerBase::ScoreFromString(in); +    case TER: +      return TERScorer::ScoreFromString(in); +    case AER: +      return AERScorer::ScoreFromString(in); +    case SER: +      return SERScorer::ScoreFromString(in); +    case BLEU_minus_TER_over_2: +      return BLEUTERCombinationScorer::ScoreFromString(in); +    default: +      assert(!"Not implemented!"); +  } +} + +void SentenceScorer::ComputeErrorSurface(const ViterbiEnvelope& ve, ErrorSurface* env, const ScoreType type, const Hypergraph& hg) const { +  vector<WordID> prev_trans; +  const vector<shared_ptr<Segment> >& ienv = ve.GetSortedSegs(); +  env->resize(ienv.size()); +  Score* prev_score = NULL; +  int j = 0; +  for (int i = 0; i < ienv.size(); ++i) { +    const Segment& seg = *ienv[i]; +    vector<WordID> trans; +    if (type == AER) { +      vector<bool> edges(hg.edges_.size(), false); +      seg.CollectEdgesUsed(&edges);  // get the set of edges in the viterbi + 
                                    // alignment +      ostringstream os; +      const string* psrc = this->GetSource(); +      if (psrc == NULL) { +        cerr << "AER scoring in VEST requires source, but it is missing!\n"; +        abort(); +      } +      size_t pos = psrc->rfind(" ||| "); +      if (pos == string::npos) { +        cerr << "Malformed source for AER: expected |||\nINPUT: " << *psrc << endl; +        abort(); +      } +      Lattice src; +      Lattice ref; +      LatticeTools::ConvertTextOrPLF(psrc->substr(0, pos), &src); +      LatticeTools::ConvertTextOrPLF(psrc->substr(pos + 5), &ref); +      AlignerTools::WriteAlignment(src, ref, hg, &os, true, &edges); +      string tstr = os.str(); +      TD::ConvertSentence(tstr.substr(tstr.rfind(" ||| ") + 5), &trans); +    } else { +      seg.ConstructTranslation(&trans); +    } +    // cerr << "Scoring: " << TD::GetString(trans) << endl; +    if (trans == prev_trans) { +      if (!minimize_segments) { +        assert(prev_score); // if this fails, it means +	                    // the decoder can generate null translations +        ErrorSegment& out = (*env)[j]; +        out.delta = prev_score->GetZero(); +        out.x = seg.x; +	++j; +      } +      // cerr << "Identical translation, skipping scoring\n"; +    } else { +      Score* score = ScoreCandidate(trans); +      // cerr << "score= " << score->ComputeScore() << "\n"; +      Score* cur_delta = score->GetZero(); +      // just record the score diffs +      if (!prev_score) +        prev_score = score->GetZero(); + +      score->Subtract(*prev_score, cur_delta); +      delete prev_score; +      prev_trans.swap(trans); +      prev_score = score; +      if ((!minimize_segments) || (!cur_delta->IsAdditiveIdentity())) { +        ErrorSegment& out = (*env)[j]; +	out.delta = cur_delta; +        out.x = seg.x; +	++j; +      } +    } +  } +  delete prev_score; +  // cerr << " In segments: " << ienv.size() << endl; +  // cerr << "Out segments: " << j << 
endl; +  assert(j > 0); +  env->resize(j); +} + +void BLEUScore::ScoreDetails(string* details) const { +  char buf[2000]; +  vector<float> precs(4); +  float bp; +  float bleu = ComputeScore(&precs, &bp); +  sprintf(buf, "BLEU = %.2f, %.1f|%.1f|%.1f|%.1f (brev=%.3f)", +       bleu*100.0, +       precs[0]*100.0, +       precs[1]*100.0, +       precs[2]*100.0, +       precs[3]*100.0, +       bp); +  *details = buf; +} + +float BLEUScore::ComputeScore(vector<float>* precs, float* bp) const { +  float log_bleu = 0; +  if (precs) precs->clear(); +  int count = 0; +  for (int i = 0; i < hyp_ngram_counts.size(); ++i) { +    if (hyp_ngram_counts[i] > 0) { +      float lprec = log(correct_ngram_hit_counts[i]) - log(hyp_ngram_counts[i]); +      if (precs) precs->push_back(exp(lprec)); +      log_bleu += lprec; +      ++count; +    } +  } +  log_bleu /= static_cast<float>(count); +  float lbp = 0.0; +  if (hyp_len < ref_len) +    lbp = (hyp_len - ref_len) / hyp_len; +  log_bleu += lbp; +  if (bp) *bp = exp(lbp); +  return exp(log_bleu); +} + +float BLEUScore::ComputeScore() const { +  return ComputeScore(NULL, NULL); +} + +void BLEUScore::Subtract(const Score& rhs, Score* res) const { +  const BLEUScore& d = static_cast<const BLEUScore&>(rhs); +  BLEUScore* o = static_cast<BLEUScore*>(res); +  o->ref_len = ref_len - d.ref_len; +  o->hyp_len = hyp_len - d.hyp_len; +  o->correct_ngram_hit_counts = correct_ngram_hit_counts - d.correct_ngram_hit_counts; +  o->hyp_ngram_counts = hyp_ngram_counts - d.hyp_ngram_counts; +} + +void BLEUScore::PlusEquals(const Score& delta) { +  const BLEUScore& d = static_cast<const BLEUScore&>(delta); +  correct_ngram_hit_counts += d.correct_ngram_hit_counts; +  hyp_ngram_counts += d.hyp_ngram_counts; +  ref_len += d.ref_len; +  hyp_len += d.hyp_len; +} + +Score* BLEUScore::GetZero() const { +  return new BLEUScore(hyp_ngram_counts.size()); +} + +void BLEUScore::Encode(string* out) const { +  ostringstream os; +  const int n = 
correct_ngram_hit_counts.size(); +  os << n << ' ' << ref_len << ' ' << hyp_len; +  for (int i = 0; i < n; ++i) +    os << ' ' << correct_ngram_hit_counts[i] << ' ' << hyp_ngram_counts[i]; +  *out = os.str(); +} + +BLEUScorerBase::BLEUScorerBase(const vector<vector<WordID> >& references, +                       int n) : n_(n) { +  for (vector<vector<WordID> >::const_iterator ci = references.begin(); +       ci != references.end(); ++ci) { +    lengths_.push_back(ci->size()); +    CountRef(*ci); +  } +} +  +Score* BLEUScorerBase::ScoreCandidate(const vector<WordID>& hyp) const { +  BLEUScore* bs = new BLEUScore(n_); +  for (NGramCountMap::iterator i=ngrams_.begin(); i != ngrams_.end(); ++i) +    i->second.second = 0; +  ComputeNgramStats(hyp, &bs->correct_ngram_hit_counts, &bs->hyp_ngram_counts); +  bs->ref_len = ComputeRefLength(hyp); +  bs->hyp_len = hyp.size(); +  return bs; +} + +DocScorer::~DocScorer() { +  for (int i=0; i < scorers_.size(); ++i) +    delete scorers_[i]; +} + +DocScorer::DocScorer( +      const ScoreType type, +      const vector<string>& ref_files, +      const string& src_file) { +  // TODO stop using valarray, start using ReadFile +  cerr << "Loading references (" << ref_files.size() << " files)\n"; +  shared_ptr<ReadFile> srcrf; +  if (type == AER && src_file.size() > 0) { +    cerr << "  (source=" << src_file << ")\n"; +    srcrf.reset(new ReadFile(src_file)); +  } +  valarray<ifstream> ifs(ref_files.size()); +  for (int i=0; i < ref_files.size(); ++i) { +     ifs[i].open(ref_files[i].c_str()); +     assert(ifs[i].good()); +  } +  char buf[64000]; +  bool expect_eof = false; +  while (!ifs[0].eof()) { +    vector<vector<WordID> > refs(ref_files.size()); +    for (int i=0; i < ref_files.size(); ++i) { +      if (ifs[i].eof()) break; +      ifs[i].getline(buf, 64000); +      refs[i].clear(); +      if (strlen(buf) == 0) { +        if (ifs[i].eof()) { +	  if (!expect_eof) { +	    assert(i == 0); +	    expect_eof = true; +	  }  +          
break; +	} +      } else { +        TD::ConvertSentence(buf, &refs[i]); +        assert(!refs[i].empty()); +      } +      assert(!expect_eof); +    } +    if (!expect_eof) { +      string src_line; +      if (srcrf) { +        getline(*srcrf->stream(), src_line); +        map<string,string> dummy; +        ProcessAndStripSGML(&src_line, &dummy); +      } +      scorers_.push_back(SentenceScorer::CreateSentenceScorer(type, refs, src_line)); +    } +  } +  cerr << "Loaded reference translations for " << scorers_.size() << " sentences.\n"; +} + diff --git a/vest/scorer.h b/vest/scorer.h new file mode 100644 index 00000000..83d4db4c --- /dev/null +++ b/vest/scorer.h @@ -0,0 +1,55 @@ +#ifndef SCORER_H_ +#define SCORER_H_ + +#include <vector> +#include <string> + +#include "wordid.h" + +class ViterbiEnvelope; +class ErrorSurface; +class Hypergraph;  // needed for alignment + +enum ScoreType { IBM_BLEU, NIST_BLEU, Koehn_BLEU, TER, BLEU_minus_TER_over_2, SER, AER }; +ScoreType ScoreTypeFromString(const std::string& st); + +class Score { + public: +  virtual ~Score(); +  virtual float ComputeScore() const = 0; +  virtual void ScoreDetails(std::string* details) const = 0; +  virtual void PlusEquals(const Score& rhs) = 0; +  virtual void Subtract(const Score& rhs, Score* res) const = 0; +  virtual Score* GetZero() const = 0; +  virtual bool IsAdditiveIdentity() const = 0; // returns true if adding this delta +                                      // to another score results in no score change +				      // under any circumstances +  virtual void Encode(std::string* out) const = 0; +}; + +class SentenceScorer { + public: +  virtual ~SentenceScorer(); +  void ComputeErrorSurface(const ViterbiEnvelope& ve, ErrorSurface* es, const ScoreType type, const Hypergraph& hg) const; +  virtual Score* ScoreCandidate(const std::vector<WordID>& hyp) const = 0; +  virtual const std::string* GetSource() const; +  static Score* CreateScoreFromString(const ScoreType type, const std::string& 
in); +  static SentenceScorer* CreateSentenceScorer(const ScoreType type, +    const std::vector<std::vector<WordID> >& refs, +    const std::string& src = ""); +}; + +class DocScorer { + public: +  ~DocScorer(); +  DocScorer( +    const ScoreType type, +    const std::vector<std::string>& ref_files, +    const std::string& src_file = ""); +  int size() const { return scorers_.size(); } +  const SentenceScorer* operator[](size_t i) const { return scorers_[i]; } + private: +  std::vector<SentenceScorer*> scorers_; +}; + +#endif diff --git a/vest/scorer_test.cc b/vest/scorer_test.cc new file mode 100644 index 00000000..74875ab8 --- /dev/null +++ b/vest/scorer_test.cc @@ -0,0 +1,203 @@ +#include <iostream> +#include <fstream> +#include <valarray> +#include <gtest/gtest.h> + +#include "tdict.h" +#include "scorer.h" +#include "aer_scorer.h" + +using namespace std; + +class ScorerTest : public testing::Test { + protected: +   virtual void SetUp() { +     refs0.resize(4); +     refs1.resize(4); +     TD::ConvertSentence("export of high-tech products in guangdong in first two months this year reached 3.76 billion us dollars", &refs0[0]); +     TD::ConvertSentence("guangdong's export of new high technology products amounts to us $ 3.76 billion in first two months of this year", &refs0[1]); +     TD::ConvertSentence("guangdong exports us $ 3.76 billion worth of high technology products in the first two months of this year", &refs0[2]); +     TD::ConvertSentence("in the first 2 months this year , the export volume of new hi-tech products in guangdong province reached 3.76 billion us dollars .", &refs0[3]); +     TD::ConvertSentence("xinhua news agency , guangzhou , march 16 ( reporter chen ji ) the latest statistics show that from january through february this year , the export of high-tech products in guangdong province reached 3.76 billion us dollars , up 34.8 \% over the same period last year and accounted for 25.5 \% of the total export in the province .", &refs1[0]); +   
  TD::ConvertSentence("xinhua news agency , guangzhou , march 16 ( reporter : chen ji ) -- latest statistic indicates that guangdong's export of new high technology products amounts to us $ 3.76 billion , up 34.8 \% over corresponding period and accounts for 25.5 \% of the total exports of the province .", &refs1[1]); +     TD::ConvertSentence("xinhua news agency report of march 16 from guangzhou ( by staff reporter chen ji ) - latest statistics indicate guangdong province exported us $ 3.76 billion worth of high technology products , up 34.8 percent from the same period last year , which account for 25.5 percent of the total exports of the province .", &refs1[2]); +     TD::ConvertSentence("guangdong , march 16 , ( xinhua ) -- ( chen ji reports ) as the newest statistics shows , in january and feberuary this year , the export volume of new hi-tech products in guangdong province reached 3.76 billion us dollars , up 34.8 \% than last year , making up 25.5 \% of the province's total .", &refs1[3]); +     TD::ConvertSentence("one guangdong province will next export us $ 3.76 high-tech product two months first this year 3.76 billion us dollars", &hyp1); +     TD::ConvertSentence("xinhua news agency , guangzhou , 16th of march ( reporter chen ) -- latest statistics suggest that guangdong exports new advanced technology product totals $ 3.76 million , 34.8 percent last corresponding period and accounts for 25.5 percent of the total export province .", &hyp2); +   } + +   virtual void TearDown() { } + +   vector<vector<WordID> > refs0; +   vector<vector<WordID> > refs1; +   vector<WordID> hyp1; +   vector<WordID> hyp2; +}; + +TEST_F(ScorerTest, TestCreateFromFiles) { +  vector<string> files; +  files.push_back("test_data/re.txt.0"); +  files.push_back("test_data/re.txt.1"); +  files.push_back("test_data/re.txt.2"); +  files.push_back("test_data/re.txt.3"); +  DocScorer ds(IBM_BLEU, files); +} + +TEST_F(ScorerTest, TestBLEUScorer) { +  SentenceScorer* s1 = 
SentenceScorer::CreateSentenceScorer(IBM_BLEU, refs0); +  SentenceScorer* s2 = SentenceScorer::CreateSentenceScorer(IBM_BLEU, refs1); +  Score* b1 = s1->ScoreCandidate(hyp1); +  EXPECT_FLOAT_EQ(0.23185077, b1->ComputeScore()); +  Score* b2 = s2->ScoreCandidate(hyp2); +  EXPECT_FLOAT_EQ(0.38101241, b2->ComputeScore()); +  b1->PlusEquals(*b2); +  EXPECT_FLOAT_EQ(0.348854, b1->ComputeScore()); +  EXPECT_FALSE(b1->IsAdditiveIdentity()); +  string details; +  b1->ScoreDetails(&details); +  EXPECT_EQ("BLEU = 34.89, 81.5|50.8|29.5|18.6 (brev=0.898)", details); +  cerr << details << endl; +  string enc; +  b1->Encode(&enc); +  Score* b3 = SentenceScorer::CreateScoreFromString(IBM_BLEU, enc); +  details.clear(); +  cerr << "Encoded BLEU score size: " << enc.size() << endl; +  b3->ScoreDetails(&details); +  cerr << details << endl; +  EXPECT_FALSE(b3->IsAdditiveIdentity()); +  EXPECT_EQ("BLEU = 34.89, 81.5|50.8|29.5|18.6 (brev=0.898)", details); +  Score* bz = b3->GetZero(); +  EXPECT_TRUE(bz->IsAdditiveIdentity()); +  delete bz; +  delete b1; +  delete s1; +  delete b2; +  delete s2; +} + +TEST_F(ScorerTest, TestTERScorer) { +  SentenceScorer* s1 = SentenceScorer::CreateSentenceScorer(TER, refs0); +  SentenceScorer* s2 = SentenceScorer::CreateSentenceScorer(TER, refs1); +  string details; +  Score* t1 = s1->ScoreCandidate(hyp1); +  t1->ScoreDetails(&details); +  cerr << "DETAILS: " << details << endl; +  cerr << t1->ComputeScore() << endl; +  Score* t2 = s2->ScoreCandidate(hyp2); +  t2->ScoreDetails(&details); +  cerr << "DETAILS: " << details << endl; +  cerr << t2->ComputeScore() << endl; +  t1->PlusEquals(*t2); +  cerr << t1->ComputeScore() << endl; +  t1->ScoreDetails(&details); +  cerr << "DETAILS: " << details << endl; +  EXPECT_EQ("TER = 44.16,   4|  8| 16|  6 (len=77)", details); +  string enc; +  t1->Encode(&enc); +  Score* t3 = SentenceScorer::CreateScoreFromString(TER, enc); +  details.clear(); +  t3->ScoreDetails(&details); +  EXPECT_EQ("TER = 44.16,   4|  8| 
16|  6 (len=77)", details); +  EXPECT_FALSE(t3->IsAdditiveIdentity()); +  Score* tz = t3->GetZero(); +  EXPECT_TRUE(tz->IsAdditiveIdentity()); +  delete tz; +  delete t3; +  delete t1; +  delete s1; +  delete t2; +  delete s2; +} + +TEST_F(ScorerTest, TestTERScorerSimple) { +  vector<vector<WordID> > ref(1); +  TD::ConvertSentence("1 2 3 A B", &ref[0]); +  vector<WordID> hyp; +  TD::ConvertSentence("A B 1 2 3", &hyp); +  SentenceScorer* s1 = SentenceScorer::CreateSentenceScorer(TER, ref); +  string details; +  Score* t1 = s1->ScoreCandidate(hyp); +  t1->ScoreDetails(&details); +  cerr << "DETAILS: " << details << endl; +  delete t1; +  delete s1; +} + +TEST_F(ScorerTest, TestSERScorerSimple) { +  vector<vector<WordID> > ref(1); +  TD::ConvertSentence("A B C D", &ref[0]); +  vector<WordID> hyp1; +  TD::ConvertSentence("A B C", &hyp1); +  vector<WordID> hyp2; +  TD::ConvertSentence("A B C D", &hyp2); +  SentenceScorer* s1 = SentenceScorer::CreateSentenceScorer(SER, ref); +  string details; +  Score* t1 = s1->ScoreCandidate(hyp1); +  t1->ScoreDetails(&details); +  cerr << "DETAILS: " << details << endl; +  Score* t2 = s1->ScoreCandidate(hyp2); +  t2->ScoreDetails(&details); +  cerr << "DETAILS: " << details << endl; +  t2->PlusEquals(*t1); +  t2->ScoreDetails(&details); +  cerr << "DETAILS: " << details << endl; +  delete t1; +  delete t2; +  delete s1; +} + +TEST_F(ScorerTest, TestCombiScorer) { +  SentenceScorer* s1 = SentenceScorer::CreateSentenceScorer(BLEU_minus_TER_over_2, refs0); +  string details; +  Score* t1 = s1->ScoreCandidate(hyp1); +  t1->ScoreDetails(&details); +  cerr << "DETAILS: " << details << endl; +  cerr << t1->ComputeScore() << endl; +  string enc; +  t1->Encode(&enc); +  Score* t2 = SentenceScorer::CreateScoreFromString(BLEU_minus_TER_over_2, enc); +  details.clear(); +  t2->ScoreDetails(&details); +  cerr << "DETAILS: " << details << endl; +  Score* cz = t2->GetZero(); +  EXPECT_FALSE(t2->IsAdditiveIdentity()); +  
EXPECT_TRUE(cz->IsAdditiveIdentity()); +  cz->PlusEquals(*t2); +  EXPECT_FALSE(cz->IsAdditiveIdentity()); +  string d2; +  cz->ScoreDetails(&d2); +  EXPECT_EQ(d2, details);  +  delete cz; +  delete t2; +  delete t1; +} + +TEST_F(ScorerTest, AERTest) { +  vector<vector<WordID> > refs0(1); +  TD::ConvertSentence("0-0 2-1 1-2 3-3", &refs0[0]); + +  vector<WordID> hyp; +  TD::ConvertSentence("0-0 1-1", &hyp); +  AERScorer* as = new AERScorer(refs0); +  Score* x = as->ScoreCandidate(hyp); +  string details; +  x->ScoreDetails(&details); +  cerr << details << endl; +  string enc; +  x->Encode(&enc); +  delete x; +  delete as; +  cerr << "ENC size: " << enc.size() << endl; +  Score* y = SentenceScorer::CreateScoreFromString(AER, enc); +  string d2; +  y->ScoreDetails(&d2); +  cerr << d2 << endl; +  delete y; +  EXPECT_EQ(d2, details); +} + +int main(int argc, char **argv) { +  testing::InitGoogleTest(&argc, argv); +  return RUN_ALL_TESTS(); +} + diff --git a/vest/ter.cc b/vest/ter.cc new file mode 100644 index 00000000..ef66f3b7 --- /dev/null +++ b/vest/ter.cc @@ -0,0 +1,518 @@ +#include "ter.h" + +#include <cstdio> +#include <cassert> +#include <iostream> +#include <limits> +#include <sstream> +#include <tr1/unordered_map> +#include <set> +#include <valarray> +#include <boost/functional/hash.hpp> + +#include "tdict.h" + +const bool ter_use_average_ref_len = true; +const int ter_short_circuit_long_sentences = -1; + +using namespace std; +using namespace std::tr1; + +struct COSTS { +  static const float substitution; +  static const float deletion; +  static const float insertion; +  static const float shift; +}; +const float COSTS::substitution = 1.0f; +const float COSTS::deletion = 1.0f; +const float COSTS::insertion = 1.0f; +const float COSTS::shift = 1.0f; + +static const int MAX_SHIFT_SIZE = 10; +static const int MAX_SHIFT_DIST = 50; + +struct Shift { +  unsigned int d_; +  Shift() : d_() {} +  Shift(int b, int e, int m) : d_() { +    begin(b); +    end(e); +    
moveto(m); +  } +  inline int begin() const { +    return d_ & 0x3ff; +  } +  inline int end() const { +    return (d_ >> 10) & 0x3ff; +  } +  inline int moveto() const { +    int m = (d_ >> 20) & 0x7ff; +    if (m > 1024) { m -= 1024; m *= -1; } +    return m; +  } +  inline void begin(int b) { +    d_ &= 0xfffffc00u; +    d_ |= (b & 0x3ff); +  } +  inline void end(int e) { +    d_ &= 0xfff003ffu; +    d_ |= (e & 0x3ff) << 10; +  } +  inline void moveto(int m) { +    bool neg = (m < 0); +    if (neg) { m *= -1; m += 1024; } +    d_ &= 0xfffff; +    d_ |= (m & 0x7ff) << 20; +  } +}; + +class TERScorerImpl { + + public: +  enum TransType { MATCH, SUBSTITUTION, INSERTION, DELETION }; + +  explicit TERScorerImpl(const vector<WordID>& ref) : ref_(ref) { +    for (int i = 0; i < ref.size(); ++i) +      rwexists_.insert(ref[i]); +  } + +  float Calculate(const vector<WordID>& hyp, int* subs, int* ins, int* dels, int* shifts) const { +    return CalculateAllShifts(hyp, subs, ins, dels, shifts); +  } + +  inline int GetRefLength() const { +    return ref_.size(); +  } + + private: +  vector<WordID> ref_; +  set<WordID> rwexists_; + +  typedef unordered_map<vector<WordID>, set<int>, boost::hash<vector<WordID> > > NgramToIntsMap; +  mutable NgramToIntsMap nmap_; +  +  static float MinimumEditDistance( +      const vector<WordID>& hyp, +      const vector<WordID>& ref, +      vector<TransType>* path) { +    vector<vector<TransType> > bmat(hyp.size() + 1, vector<TransType>(ref.size() + 1, MATCH)); +    vector<vector<float> > cmat(hyp.size() + 1, vector<float>(ref.size() + 1, 0)); +    for (int i = 0; i <= hyp.size(); ++i) +      cmat[i][0] = i; +    for (int j = 0; j <= ref.size(); ++j) +      cmat[0][j] = j; +    for (int i = 1; i <= hyp.size(); ++i) { +      const WordID& hw = hyp[i-1]; +      for (int j = 1; j <= ref.size(); ++j) { +        const WordID& rw = ref[j-1]; +	float& cur_c = cmat[i][j]; +	TransType& cur_b = bmat[i][j]; + +        if (rw == hw) { +          cur_c 
= cmat[i-1][j-1]; +          cur_b = MATCH; +        } else { +          cur_c = cmat[i-1][j-1] + COSTS::substitution; +          cur_b = SUBSTITUTION; +        } +	float cwoi = cmat[i-1][j]; +        if (cur_c > cwoi + COSTS::insertion) { +          cur_c = cwoi + COSTS::insertion; +          cur_b = INSERTION; +        } +        float cwod = cmat[i][j-1]; +        if (cur_c > cwod + COSTS::deletion) { +          cur_c = cwod + COSTS::deletion; +          cur_b = DELETION; +        } +      } +    } +     +    // trace back along the best path and record the transition types +    path->clear(); +    int i = hyp.size(); +    int j = ref.size(); +    while (i > 0 || j > 0) { +      if (j == 0) { +        --i; +        path->push_back(INSERTION); +      } else if (i == 0) { +        --j; +        path->push_back(DELETION); +      } else { +        TransType t = bmat[i][j]; +        path->push_back(t); +        switch (t) { +          case SUBSTITUTION: +          case MATCH: +            --i; --j; break; +          case INSERTION: +            --i; break; +          case DELETION: +            --j; break; +        } +      } +    } +    reverse(path->begin(), path->end()); +    return cmat[hyp.size()][ref.size()]; +  } + +  void BuildWordMatches(const vector<WordID>& hyp, NgramToIntsMap* nmap) const { +    nmap->clear(); +    set<WordID> exists_both; +    for (int i = 0; i < hyp.size(); ++i) +      if (rwexists_.find(hyp[i]) != rwexists_.end()) +        exists_both.insert(hyp[i]); +    for (int start=0; start<ref_.size(); ++start) { +      if (exists_both.find(ref_[start]) == exists_both.end()) continue; +      vector<WordID> cp; +      int mlen = min(MAX_SHIFT_SIZE, static_cast<int>(ref_.size() - start)); +      for (int len=0; len<mlen; ++len) { +        if (len && exists_both.find(ref_[start + len]) == exists_both.end()) break; +        cp.push_back(ref_[start + len]); +	(*nmap)[cp].insert(start); +      } +    } +  } + +  static void PerformShift(const 
vector<WordID>& in, +    int start, int end, int moveto, vector<WordID>* out) { +    // cerr << "ps: " << start << " " << end << " " << moveto << endl; +    out->clear(); +    if (moveto == -1) { +      for (int i = start; i <= end; ++i) +       out->push_back(in[i]); +      for (int i = 0; i < start; ++i) +       out->push_back(in[i]); +      for (int i = end+1; i < in.size(); ++i) +       out->push_back(in[i]); +    } else if (moveto < start) { +      for (int i = 0; i <= moveto; ++i) +       out->push_back(in[i]); +      for (int i = start; i <= end; ++i) +       out->push_back(in[i]); +      for (int i = moveto+1; i < start; ++i) +       out->push_back(in[i]); +      for (int i = end+1; i < in.size(); ++i) +       out->push_back(in[i]); +    } else if (moveto > end) { +      for (int i = 0; i < start; ++i) +       out->push_back(in[i]); +      for (int i = end+1; i <= moveto; ++i) +       out->push_back(in[i]); +      for (int i = start; i <= end; ++i) +       out->push_back(in[i]); +      for (int i = moveto+1; i < in.size(); ++i) +       out->push_back(in[i]); +    } else { +      for (int i = 0; i < start; ++i) +       out->push_back(in[i]); +      for (int i = end+1; (i < in.size()) && (i <= end + (moveto - start)); ++i) +       out->push_back(in[i]); +      for (int i = start; i <= end; ++i) +       out->push_back(in[i]); +      for (int i = (end + (moveto - start))+1; i < in.size(); ++i) +       out->push_back(in[i]); +    } +    if (out->size() != in.size()) { +      cerr << "ps: " << start << " " << end << " " << moveto << endl; +      cerr << "in=" << TD::GetString(in) << endl; +      cerr << "out=" << TD::GetString(*out) << endl; +    } +    assert(out->size() == in.size());  +    // cerr << "ps: " << TD::GetString(*out) << endl; +  } + +  void GetAllPossibleShifts(const vector<WordID>& hyp, +      const vector<int>& ralign, +      const vector<bool>& herr, +      const vector<bool>& rerr, +      const int min_size, +      vector<vector<Shift> >* 
shifts) const { +    for (int start = 0; start < hyp.size(); ++start) { +      vector<WordID> cp(1, hyp[start]); +      NgramToIntsMap::iterator niter = nmap_.find(cp); +      if (niter == nmap_.end()) continue; +      bool ok = false; +      int moveto; +      for (set<int>::iterator i = niter->second.begin(); i != niter->second.end(); ++i) { +        moveto = *i; +        int rm = ralign[moveto]; +        ok = (start != rm && +              (rm - start) < MAX_SHIFT_DIST && +              (start - rm - 1) < MAX_SHIFT_DIST); +        if (ok) break; +      } +      if (!ok) continue; +      cp.clear(); +      for (int end = start + min_size - 1; +           ok && end < hyp.size() && end < (start + MAX_SHIFT_SIZE); ++end) { +        cp.push_back(hyp[end]); +	vector<Shift>& sshifts = (*shifts)[end - start]; +        ok = false; +        NgramToIntsMap::iterator niter = nmap_.find(cp); +        if (niter == nmap_.end()) break; +        bool any_herr = false; +        for (int i = start; i <= end && !any_herr; ++i) +          any_herr = herr[i]; +        if (!any_herr) { +          ok = true; +          continue; +        } +        for (set<int>::iterator mi = niter->second.begin(); +             mi != niter->second.end(); ++mi) { +          int moveto = *mi; +	  int rm = ralign[moveto]; +	  if (! 
((rm != start) && +	        ((rm < start) || (rm > end)) && +		(rm - start <= MAX_SHIFT_DIST) && +		((start - rm - 1) <= MAX_SHIFT_DIST))) continue; +          ok = true; +	  bool any_rerr = false; +	  for (int i = 0; (i <= end - start) && (!any_rerr); ++i) +            any_rerr = rerr[moveto+i]; +	  if (!any_rerr) continue; +	  for (int roff = 0; roff <= (end - start); ++roff) { +	    int rmr = ralign[moveto+roff]; +	    if ((start != rmr) && ((roff == 0) || (rmr != ralign[moveto]))) +	      sshifts.push_back(Shift(start, end, moveto + roff)); +	  } +        } +      } +    } +  } + +  bool CalculateBestShift(const vector<WordID>& cur, +                          const vector<WordID>& hyp, +                          float curerr, +                          const vector<TransType>& path, +                          vector<WordID>* new_hyp, +                          float* newerr, +                          vector<TransType>* new_path) const { +    vector<bool> herr, rerr; +    vector<int> ralign; +    int hpos = -1; +    for (int i = 0; i < path.size(); ++i) { +      switch (path[i]) { +        case MATCH: +	  ++hpos; +	  herr.push_back(false); +	  rerr.push_back(false); +	  ralign.push_back(hpos); +          break; +        case SUBSTITUTION: +	  ++hpos; +	  herr.push_back(true); +	  rerr.push_back(true); +	  ralign.push_back(hpos); +          break; +        case INSERTION: +	  ++hpos; +	  herr.push_back(true); +          break; +	case DELETION: +	  rerr.push_back(true); +	  ralign.push_back(hpos); +          break; +      } +    } +#if 0 +    cerr << "RALIGN: "; +    for (int i = 0; i < rerr.size(); ++i) +      cerr << ralign[i] << " "; +    cerr << endl; +    cerr << "RERR: "; +    for (int i = 0; i < rerr.size(); ++i) +      cerr << (bool)rerr[i] << " "; +    cerr << endl; +    cerr << "HERR: "; +    for (int i = 0; i < herr.size(); ++i) +      cerr << (bool)herr[i] << " "; +    cerr << endl; +#endif + +    vector<vector<Shift> > shifts(MAX_SHIFT_SIZE + 1); +   
 GetAllPossibleShifts(cur, ralign, herr, rerr, 1, &shifts); +    float cur_best_shift_cost = 0; +    *newerr = curerr; +    vector<TransType> cur_best_path; +    vector<WordID> cur_best_hyp; +     +    bool res = false; +    for (int i = shifts.size() - 1; i >=0; --i) { +      float curfix = curerr - (cur_best_shift_cost + *newerr); +      float maxfix = 2.0f * (1 + i) - COSTS::shift; +      if ((curfix > maxfix) || ((cur_best_shift_cost == 0) && (curfix == maxfix))) break; +      for (int j = 0; j < shifts[i].size(); ++j) { +        const Shift& s = shifts[i][j]; +	curfix = curerr - (cur_best_shift_cost + *newerr); +	maxfix = 2.0f * (1 + i) - COSTS::shift;  // TODO remove? +        if ((curfix > maxfix) || ((cur_best_shift_cost == 0) && (curfix == maxfix))) continue; +	vector<WordID> shifted(cur.size()); +	PerformShift(cur, s.begin(), s.end(), ralign[s.moveto()], &shifted); +	vector<TransType> try_path; +	float try_cost = MinimumEditDistance(shifted, ref_, &try_path); +	float gain = (*newerr + cur_best_shift_cost) - (try_cost + COSTS::shift); +	if (gain > 0.0f || ((cur_best_shift_cost == 0.0f) && (gain == 0.0f))) { +	  *newerr = try_cost; +	  cur_best_shift_cost = COSTS::shift; +	  new_path->swap(try_path); +	  new_hyp->swap(shifted); +	  res = true; +	  // cerr << "Found better shift " << s.begin() << "..." 
<< s.end() << " moveto " << s.moveto() << endl; +	} +      } +    } + +    return res; +  } + +  static void GetPathStats(const vector<TransType>& path, int* subs, int* ins, int* dels) { +    *subs = *ins = *dels = 0; +    for (int i = 0; i < path.size(); ++i) { +      switch (path[i]) { +        case SUBSTITUTION: +	  ++(*subs); +        case MATCH: +          break; +        case INSERTION: +          ++(*ins); break; +	case DELETION: +          ++(*dels); break; +      } +    } +  } + +  float CalculateAllShifts(const vector<WordID>& hyp, +      int* subs, int* ins, int* dels, int* shifts) const { +    BuildWordMatches(hyp, &nmap_); +    vector<TransType> path; +    float med_cost = MinimumEditDistance(hyp, ref_, &path); +    float edits = 0; +    vector<WordID> cur = hyp; +    *shifts = 0; +    if (ter_short_circuit_long_sentences < 0 || +        ref_.size() < ter_short_circuit_long_sentences) { +      while (true) { +        vector<WordID> new_hyp; +        vector<TransType> new_path; +        float new_med_cost; +        if (!CalculateBestShift(cur, hyp, med_cost, path, &new_hyp, &new_med_cost, &new_path)) +          break; +        edits += COSTS::shift; +        ++(*shifts); +        med_cost = new_med_cost; +        path.swap(new_path); +        cur.swap(new_hyp); +      } +    } +    GetPathStats(path, subs, ins, dels); +    return med_cost + edits; +  } +}; + +class TERScore : public Score { +  friend class TERScorer; + + public: +  static const unsigned kINSERTIONS = 0; +  static const unsigned kDELETIONS = 1; +  static const unsigned kSUBSTITUTIONS = 2; +  static const unsigned kSHIFTS = 3; +  static const unsigned kREF_WORDCOUNT = 4; +  static const unsigned kDUMMY_LAST_ENTRY = 5; + + TERScore() : stats(0,kDUMMY_LAST_ENTRY) {} +  float ComputeScore() const { +    float edits = static_cast<float>(stats[kINSERTIONS] + stats[kDELETIONS] + stats[kSUBSTITUTIONS] + stats[kSHIFTS]); +    return edits / static_cast<float>(stats[kREF_WORDCOUNT]); +  } +  void 
ScoreDetails(string* details) const; +  void PlusEquals(const Score& delta) { +    stats += static_cast<const TERScore&>(delta).stats; +  } +  Score* GetZero() const { +    return new TERScore; +  } +  void Subtract(const Score& rhs, Score* res) const { +    static_cast<TERScore*>(res)->stats = stats - static_cast<const TERScore&>(rhs).stats; +  } +  void Encode(std::string* out) const { +    ostringstream os; +    os << stats[kINSERTIONS] << ' ' +       << stats[kDELETIONS] << ' ' +       << stats[kSUBSTITUTIONS] << ' ' +       << stats[kSHIFTS] << ' ' +       << stats[kREF_WORDCOUNT]; +    *out = os.str(); +  } +  bool IsAdditiveIdentity() const { +    for (int i = 0; i < kDUMMY_LAST_ENTRY; ++i) +      if (stats[i] != 0) return false; +    return true; +  } + private: +  valarray<int> stats; +}; + +Score* TERScorer::ScoreFromString(const std::string& data) { +  istringstream is(data); +  TERScore* r = new TERScore; +  is >> r->stats[TERScore::kINSERTIONS] +     >> r->stats[TERScore::kDELETIONS] +     >> r->stats[TERScore::kSUBSTITUTIONS] +     >> r->stats[TERScore::kSHIFTS] +     >> r->stats[TERScore::kREF_WORDCOUNT]; +  return r; +} + +void TERScore::ScoreDetails(std::string* details) const { +  char buf[200]; +  sprintf(buf, "TER = %.2f, %3d|%3d|%3d|%3d (len=%d)", +     ComputeScore() * 100.0f,   +     stats[kINSERTIONS], +     stats[kDELETIONS], +     stats[kSUBSTITUTIONS], +     stats[kSHIFTS], +     stats[kREF_WORDCOUNT]); +  *details = buf; +} + +TERScorer::~TERScorer() { +  for (vector<TERScorerImpl*>::iterator i = impl_.begin(); i != impl_.end(); ++i) +    delete *i; +} + +TERScorer::TERScorer(const vector<vector<WordID> >& refs) : impl_(refs.size()) { +  for (int i = 0; i < refs.size(); ++i) +    impl_[i] = new TERScorerImpl(refs[i]); +} + +Score* TERScorer::ScoreCandidate(const std::vector<WordID>& hyp) const { +  float best_score = numeric_limits<float>::max(); +  TERScore* res = new TERScore; +  int avg_len = 0; +  for (int i = 0; i < impl_.size(); 
++i) +    avg_len += impl_[i]->GetRefLength(); +  avg_len /= impl_.size(); +  for (int i = 0; i < impl_.size(); ++i) { +    int subs, ins, dels, shifts; +    float score = impl_[i]->Calculate(hyp, &subs, &ins, &dels, &shifts); +    // cerr << "Component TER cost: " << score << endl; +    if (score < best_score) { +      res->stats[TERScore::kINSERTIONS] = ins; +      res->stats[TERScore::kDELETIONS] = dels; +      res->stats[TERScore::kSUBSTITUTIONS] = subs; +      res->stats[TERScore::kSHIFTS] = shifts; +      if (ter_use_average_ref_len) { +        res->stats[TERScore::kREF_WORDCOUNT] = avg_len; +      } else { +        res->stats[TERScore::kREF_WORDCOUNT] = impl_[i]->GetRefLength(); +      } + +      best_score = score; +    } +  } +  return res; +} diff --git a/vest/ter.h b/vest/ter.h new file mode 100644 index 00000000..fe4ba36c --- /dev/null +++ b/vest/ter.h @@ -0,0 +1,18 @@ +#ifndef _TER_H_ +#define _TER_H_ + +#include "scorer.h" + +class TERScorerImpl; + +class TERScorer : public SentenceScorer { + public: +  TERScorer(const std::vector<std::vector<WordID> >& references); +  ~TERScorer(); +  Score* ScoreCandidate(const std::vector<WordID>& hyp) const; +  static Score* ScoreFromString(const std::string& data); + private: +  std::vector<TERScorerImpl*> impl_; +}; + +#endif diff --git a/vest/test_aer/README b/vest/test_aer/README new file mode 100644 index 00000000..819b2e32 --- /dev/null +++ b/vest/test_aer/README @@ -0,0 +1,8 @@ +To run the test: + +../dist-vest.pl --local --metric aer cdec.ini --source-file corpus.src --ref-files=ref.0 --weights weights + +This will optimize the parameters of the tiny lexical translation model +so as to minimize the AER of the Viterbi alignment on the development +set in corpus.src according to the reference alignments in ref.0. 
+ diff --git a/vest/test_aer/cdec.ini b/vest/test_aer/cdec.ini new file mode 100644 index 00000000..08187848 --- /dev/null +++ b/vest/test_aer/cdec.ini @@ -0,0 +1,3 @@ +formalism=lextrans +grammar=grammar +aligner=true diff --git a/vest/test_aer/corpus.src b/vest/test_aer/corpus.src new file mode 100644 index 00000000..31b23971 --- /dev/null +++ b/vest/test_aer/corpus.src @@ -0,0 +1,3 @@ +el gato negro ||| the black cat +el gato ||| the cat +el libro ||| the book diff --git a/vest/test_aer/grammar b/vest/test_aer/grammar new file mode 100644 index 00000000..9d857824 --- /dev/null +++ b/vest/test_aer/grammar @@ -0,0 +1,12 @@ +el ||| cat ||| F1=1 +el ||| the ||| F2=1 +el ||| black ||| F3=1 +el ||| book ||| F11=1 +gato ||| cat ||| F4=1 NN=1 +gato ||| black ||| F5=1 +gato ||| the ||| F6=1 +negro ||| the ||| F7=1 +negro ||| cat ||| F8=1 +negro ||| black ||| F9=1 +libro ||| the ||| F10=1 +libro ||| book ||| F12=1 NN=1 diff --git a/vest/test_aer/ref.0 b/vest/test_aer/ref.0 new file mode 100644 index 00000000..734a9c5b --- /dev/null +++ b/vest/test_aer/ref.0 @@ -0,0 +1,3 @@ +0-0 1-2 2-1 +0-0 1-1 +0-0 1-1 diff --git a/vest/test_aer/weights b/vest/test_aer/weights new file mode 100644 index 00000000..afc9282e --- /dev/null +++ b/vest/test_aer/weights @@ -0,0 +1,13 @@ +F1 0.1 +F2 -.5980815 +F3 0.24235 +F4 0.625 +F5 0.4514 +F6 0.112316 +F7 -0.123415 +F8 -0.25390285 +F9 -0.23852 +F10 0.646 +F11 0.413141 +F12 0.343216 +NN -0.1215 diff --git a/vest/test_data/0.json.gz b/vest/test_data/0.json.gzBinary files differ new file mode 100644 index 00000000..30f8dd77 --- /dev/null +++ b/vest/test_data/0.json.gz diff --git a/vest/test_data/1.json.gz b/vest/test_data/1.json.gzBinary files differ new file mode 100644 index 00000000..c82cc179 --- /dev/null +++ b/vest/test_data/1.json.gz diff --git a/vest/test_data/c2e.txt.0 b/vest/test_data/c2e.txt.0 new file mode 100644 index 00000000..12c4abe9 --- /dev/null +++ b/vest/test_data/c2e.txt.0 @@ -0,0 +1,2 @@ +australia reopens embassy in manila 
+( afp , manila , january 2 ) australia reopened its embassy in the philippines today , which was shut down about seven weeks ago due to what was described as a specific threat of a terrorist attack . diff --git a/vest/test_data/c2e.txt.1 b/vest/test_data/c2e.txt.1 new file mode 100644 index 00000000..4ac12df1 --- /dev/null +++ b/vest/test_data/c2e.txt.1 @@ -0,0 +1,2 @@ +australia reopened manila embassy +( agence france-presse , manila , 2nd ) - australia reopened its embassy in the philippines today . the embassy was closed seven weeks ago after what was described as a specific threat of a terrorist attack . diff --git a/vest/test_data/c2e.txt.2 b/vest/test_data/c2e.txt.2 new file mode 100644 index 00000000..2f67b72f --- /dev/null +++ b/vest/test_data/c2e.txt.2 @@ -0,0 +1,2 @@ +australia to reopen embassy in manila +( afp report from manila , january 2 ) australia reopened its embassy in the philippines today . seven weeks ago , the embassy was shut down due to so-called confirmed terrorist attack threats . diff --git a/vest/test_data/c2e.txt.3 b/vest/test_data/c2e.txt.3 new file mode 100644 index 00000000..5483cef6 --- /dev/null +++ b/vest/test_data/c2e.txt.3 @@ -0,0 +1,2 @@ +australia to re - open its embassy to manila +( afp , manila , thursday ) australia reopens its embassy to manila , which was closed for the so-called " clear " threat of terrorist attack 7 weeks ago . diff --git a/vest/test_data/re.txt.0 b/vest/test_data/re.txt.0 new file mode 100644 index 00000000..86eff087 --- /dev/null +++ b/vest/test_data/re.txt.0 @@ -0,0 +1,5 @@ +erdogan states turkey to reject any pressures to urge it to recognize cyprus +ankara 12 - 1 ( afp ) - turkish prime minister recep tayyip erdogan announced today , wednesday , that ankara will reject any pressure by the european union to urge it to recognize cyprus . 
this comes two weeks before the summit of european union state and government heads who will decide whether or nor membership negotiations with ankara should be opened . +erdogan told " ntv " television station that " the european union cannot address us by imposing new conditions on us with regard to cyprus . +we will discuss this dossier in the course of membership negotiations . " +he added " let me be clear , i cannot sidestep turkey , this is something we cannot accept . " diff --git a/vest/test_data/re.txt.1 b/vest/test_data/re.txt.1 new file mode 100644 index 00000000..2140f198 --- /dev/null +++ b/vest/test_data/re.txt.1 @@ -0,0 +1,5 @@ +erdogan confirms turkey will resist any pressure to recognize cyprus +ankara 12 - 1 ( afp ) - the turkish head of government , recep tayyip erdogan , announced today ( wednesday ) that ankara would resist any pressure the european union might exercise in order to force it into recognizing cyprus . this comes two weeks before a summit of european union heads of state and government , who will decide whether or not to open membership negotiations with ankara . +erdogan said to the ntv television channel : " the european union cannot engage with us through imposing new conditions on us with regard to cyprus . +we shall discuss this issue in the course of the membership negotiations . " +he added : " let me be clear - i cannot confine turkey . this is something we do not accept . " diff --git a/vest/test_data/re.txt.2 b/vest/test_data/re.txt.2 new file mode 100644 index 00000000..94e46286 --- /dev/null +++ b/vest/test_data/re.txt.2 @@ -0,0 +1,5 @@ +erdogan confirms that turkey will reject any pressures to encourage it to recognize cyprus +ankara , 12 / 1 ( afp ) - the turkish prime minister recep tayyip erdogan declared today , wednesday , that ankara will reject any pressures that the european union may apply on it to encourage to recognize cyprus . 
this comes two weeks before a summit of the heads of countries and governments of the european union , who will decide on whether or not to start negotiations on joining with ankara . +erdogan told the ntv television station that " it is not possible for the european union to talk to us by imposing new conditions on us regarding cyprus . +we shall discuss this dossier during the negotiations on joining . " +and he added , " let me be clear . turkey's arm should not be twisted ; this is something we cannot accept . " diff --git a/vest/test_data/re.txt.3 b/vest/test_data/re.txt.3 new file mode 100644 index 00000000..f87c3308 --- /dev/null +++ b/vest/test_data/re.txt.3 @@ -0,0 +1,5 @@ +erdogan stresses that turkey will reject all pressures to force it to recognize cyprus +ankara 12 - 1 ( afp ) - turkish prime minister recep tayyip erdogan announced today , wednesday , that ankara would refuse all pressures applied on it by the european union to force it to recognize cyprus . that came two weeks before the summit of the presidents and prime ministers of the european union , who would decide on whether to open negotiations on joining with ankara or not . +erdogan said to " ntv " tv station that the " european union can not communicate with us by imposing on us new conditions related to cyprus . +we will discuss this file during the negotiations on joining . " +he added , " let me be clear . turkey's arm should not be twisted . this is unacceptable to us . " diff --git a/vest/viterbi_envelope.cc b/vest/viterbi_envelope.cc new file mode 100644 index 00000000..5c24c018 --- /dev/null +++ b/vest/viterbi_envelope.cc @@ -0,0 +1,176 @@ +#include "viterbi_envelope.h" + +#include <cassert> +#include <limits> + +using namespace std; +using boost::shared_ptr; + +ostream& operator<<(ostream& os, const ViterbiEnvelope& env) { +  os << '<'; +  const vector<shared_ptr<Segment> >& segs = env.GetSortedSegs(); +  for (int i = 0; i < segs.size(); ++i) +    os << (i==0 ? 
"" : "|") << "x=" << segs[i]->x << ",b=" << segs[i]->b << ",m=" << segs[i]->m << ",p1=" << segs[i]->p1 << ",p2=" << segs[i]->p2; +  return os << '>'; +} + +ViterbiEnvelope::ViterbiEnvelope(int i) { +  if (i == 0) { +    // do nothing - <> +  } else if (i == 1) { +    segs.push_back(shared_ptr<Segment>(new Segment(0, 0, 0, shared_ptr<Segment>(), shared_ptr<Segment>()))); +    assert(this->IsMultiplicativeIdentity()); +  } else { +    cerr << "Only can create ViterbiEnvelope semiring 0 and 1 with this constructor!\n"; +    abort(); +  } +} + +struct SlopeCompare { +  bool operator() (const shared_ptr<Segment>& a, const shared_ptr<Segment>& b) const { +    return a->m < b->m; +  } +}; + +const ViterbiEnvelope& ViterbiEnvelope::operator+=(const ViterbiEnvelope& other) { +  if (!other.is_sorted) other.Sort(); +  if (segs.empty()) { +    segs = other.segs; +    return *this; +  } +  is_sorted = false; +  int j = segs.size(); +  segs.resize(segs.size() + other.segs.size()); +  for (int i = 0; i < other.segs.size(); ++i) +    segs[j++] = other.segs[i]; +  assert(j == segs.size()); +  return *this; +} + +void ViterbiEnvelope::Sort() const { +  sort(segs.begin(), segs.end(), SlopeCompare()); +  const int k = segs.size(); +  int j = 0; +  for (int i = 0; i < k; ++i) { +    Segment l = *segs[i]; +    l.x = kMinusInfinity; +    // cerr << "m=" << l.m << endl; +    if (0 < j) { +      if (segs[j-1]->m == l.m) {   // lines are parallel +        if (l.b <= segs[j-1]->b) continue; +        --j; +      } +      while(0 < j) { +        l.x = (l.b - segs[j-1]->b) / (segs[j-1]->m - l.m); +        if (segs[j-1]->x < l.x) break; +        --j; +      } +      if (0 == j) l.x = kMinusInfinity; +    } +    *segs[j++] = l; +  } +  segs.resize(j); +  is_sorted = true; +} + +const ViterbiEnvelope& ViterbiEnvelope::operator*=(const ViterbiEnvelope& other) { +  if (other.IsMultiplicativeIdentity()) { return *this; } +  if (this->IsMultiplicativeIdentity()) { (*this) = other; return *this; } + +  
if (!is_sorted) Sort(); +  if (!other.is_sorted) other.Sort(); + +  if (this->IsEdgeEnvelope()) { +//    if (other.size() > 1) +//      cerr << *this << " (TIMES) " << other << endl; +    shared_ptr<Segment> edge_parent = segs[0]; +    const double& edge_b = edge_parent->b; +    const double& edge_m = edge_parent->m; +    segs.clear(); +    for (int i = 0; i < other.segs.size(); ++i) { +      const Segment& seg = *other.segs[i]; +      const double m = seg.m + edge_m; +      const double b = seg.b + edge_b; +      const double& x = seg.x;       // x's don't change with * +      segs.push_back(shared_ptr<Segment>(new Segment(x, m, b, edge_parent, other.segs[i]))); +      assert(segs.back()->p1->edge); +    } +//    if (other.size() > 1) +//      cerr << " = " << *this << endl; +  } else { +    vector<shared_ptr<Segment> > new_segs; +    int this_i = 0; +    int other_i = 0; +    const int this_size  = segs.size(); +    const int other_size = other.segs.size(); +    double cur_x = kMinusInfinity;   // moves from left to right across the +                                     // real numbers, stopping for all inter- +                                     // sections +    double this_next_val  = (1 < this_size  ? segs[1]->x       : kPlusInfinity); +    double other_next_val = (1 < other_size ? 
other.segs[1]->x : kPlusInfinity); +    while (this_i < this_size && other_i < other_size) { +      const Segment& this_seg = *segs[this_i]; +      const Segment& other_seg= *other.segs[other_i]; +      const double m = this_seg.m + other_seg.m; +      const double b = this_seg.b + other_seg.b; +  +      new_segs.push_back(shared_ptr<Segment>(new Segment(cur_x, m, b, segs[this_i], other.segs[other_i]))); +      int comp = 0; +      if (this_next_val < other_next_val) comp = -1; else +        if (this_next_val > other_next_val) comp = 1; +      if (0 == comp) {  // the next values are equal, advance both indices +        ++this_i; +	++other_i; +        cur_x = this_next_val;  // could be other_next_val (they're equal!) +        this_next_val  = (this_i+1  < this_size  ? segs[this_i+1]->x        : kPlusInfinity); +        other_next_val = (other_i+1 < other_size ? other.segs[other_i+1]->x : kPlusInfinity); +      } else {  // advance the i with the lower x, update cur_x +        if (-1 == comp) { +          ++this_i; +          cur_x = this_next_val; +          this_next_val =  (this_i+1  < this_size  ? segs[this_i+1]->x        : kPlusInfinity); +        } else { +          ++other_i; +          cur_x = other_next_val; +          other_next_val = (other_i+1 < other_size ? 
other.segs[other_i+1]->x : kPlusInfinity); +        } +      } +    } +    segs.swap(new_segs); +  } +  //cerr << "Multiply: result=" << (*this) << endl; +  return *this; +} + +// recursively construct translation +void Segment::ConstructTranslation(vector<WordID>* trans) const { +  const Segment* cur = this; +  vector<vector<WordID> > ant_trans; +  while(!cur->edge) { +    ant_trans.resize(ant_trans.size() + 1); +    cur->p2->ConstructTranslation(&ant_trans.back()); +    cur = cur->p1.get(); +  } +  size_t ant_size = ant_trans.size(); +  vector<const vector<WordID>*> pants(ant_size); +  --ant_size; +  for (int i = 0; i < pants.size(); ++i) pants[ant_size - i] = &ant_trans[i]; +  cur->edge->rule_->ESubstitute(pants, trans); +} + +void Segment::CollectEdgesUsed(std::vector<bool>* edges_used) const { +  if (edge) { +    assert(edge->id_ < edges_used->size()); +    (*edges_used)[edge->id_] = true; +  } +  if (p1) p1->CollectEdgesUsed(edges_used); +  if (p2) p2->CollectEdgesUsed(edges_used); +} + +ViterbiEnvelope ViterbiEnvelopeWeightFunction::operator()(const Hypergraph::Edge& e) const { +  const double m = direction.dot(e.feature_values_); +  const double b = origin.dot(e.feature_values_); +  Segment* seg = new Segment(m, b, e); +  return ViterbiEnvelope(1, seg); +} + diff --git a/vest/viterbi_envelope.h b/vest/viterbi_envelope.h new file mode 100644 index 00000000..1689a00e --- /dev/null +++ b/vest/viterbi_envelope.h @@ -0,0 +1,81 @@ +#ifndef _VITERBI_ENVELOPE_H_ +#define _VITERBI_ENVELOPE_H_ + +#include <vector> +#include <iostream> +#include <boost/shared_ptr.hpp> + +#include "hg.h" +#include "sparse_vector.h" + +static const double kMinusInfinity = -std::numeric_limits<double>::infinity(); +static const double kPlusInfinity = std::numeric_limits<double>::infinity(); + +struct Segment { +  Segment() : x(), m(), b(), edge() {} +  Segment(double _m, double _b) : +    x(kMinusInfinity), m(_m), b(_b), edge() {} +  Segment(double _x, double _m, double _b, const 
boost::shared_ptr<Segment>& p1_, const boost::shared_ptr<Segment>& p2_) : +    x(_x), m(_m), b(_b), p1(p1_), p2(p2_), edge() {} +  Segment(double _m, double _b, const Hypergraph::Edge& edge) : +    x(kMinusInfinity), m(_m), b(_b), edge(&edge) {} + +  double x;                   // x intersection with previous segment in env, or -inf if none +  double m;                   // this line's slope +  double b;                   // intercept with y-axis + +  // we keep a pointer to the "parents" of this segment so we can reconstruct +  // the Viterbi translation corresponding to this segment +  boost::shared_ptr<Segment> p1; +  boost::shared_ptr<Segment> p2; + +  // only Segments created from an edge using the ViterbiEnvelopeWeightFunction +  // have rules +  // TRulePtr rule; +  const Hypergraph::Edge* edge; + +  // recursively recover the Viterbi translation that will result from setting +  // the weights to origin + axis * x, where x is any value from this->x up +  // until the next largest x in the containing ViterbiEnvelope +  void ConstructTranslation(std::vector<WordID>* trans) const; +  void CollectEdgesUsed(std::vector<bool>* edges_used) const; +}; + +// this is the semiring value type, +// it defines constructors for 0, 1, and the operations + and * +struct ViterbiEnvelope { +  // create semiring zero +  ViterbiEnvelope() : is_sorted(true) {}  // zero +  // for debugging: +  ViterbiEnvelope(const std::vector<boost::shared_ptr<Segment> >& s) : segs(s) { Sort(); } +  // create semiring 1 or 0 +  explicit ViterbiEnvelope(int i); +  ViterbiEnvelope(int n, Segment* seg) : is_sorted(true), segs(n, boost::shared_ptr<Segment>(seg)) {} +  const ViterbiEnvelope& operator+=(const ViterbiEnvelope& other); +  const ViterbiEnvelope& operator*=(const ViterbiEnvelope& other); +  bool IsMultiplicativeIdentity() const { +    return size() == 1 && (segs[0]->b == 0.0 && segs[0]->m == 0.0) && (!segs[0]->edge); } +  const std::vector<boost::shared_ptr<Segment> >& GetSortedSegs() 
const { +    if (!is_sorted) Sort(); +    return segs; +  } +  size_t size() const { return segs.size(); } + + private: +  bool IsEdgeEnvelope() const { +    return segs.size() == 1 && segs[0]->edge; } +  void Sort() const; +  mutable bool is_sorted; +  mutable std::vector<boost::shared_ptr<Segment> > segs; +}; +std::ostream& operator<<(std::ostream& os, const ViterbiEnvelope& env); + +struct ViterbiEnvelopeWeightFunction { +  ViterbiEnvelopeWeightFunction(const SparseVector<double>& ori, +                                const SparseVector<double>& dir) : origin(ori), direction(dir) {} +  ViterbiEnvelope operator()(const Hypergraph::Edge& e) const; +  const SparseVector<double> origin; +  const SparseVector<double> direction; +}; + +#endif | 
