summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPatrick Simianer <simianer@cl.uni-heidelberg.de>2012-06-26 14:47:46 +0200
committerPatrick Simianer <simianer@cl.uni-heidelberg.de>2012-06-26 14:47:46 +0200
commit3c03e716a8638163a820dbecf652c772e6d651f2 (patch)
tree255e2aa075b9a4c2a2fb1c077e899afdc5654c19
parentd04c516536db996e5fe5b94b8e5dea4ce2e04b4a (diff)
parente1b85b4f02045c62400b9c755883e7fae14557f6 (diff)
Merge remote-tracking branch 'upstream/master'
-rw-r--r--Makefile.am20
-rw-r--r--configure.ac2
-rw-r--r--minrisk/Makefile.am6
-rwxr-xr-xminrisk/minrisk.pl540
-rwxr-xr-xminrisk/minrisk_generate_input.pl18
-rw-r--r--minrisk/minrisk_optimize.cc141
-rw-r--r--training/Makefile.am3
-rw-r--r--training/risk.cc45
-rw-r--r--training/risk.h26
9 files changed, 798 insertions, 3 deletions
diff --git a/Makefile.am b/Makefile.am
index ea5e2b3f..4df72cff 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -1,7 +1,25 @@
# warning - the subdirectories in the following list should
# be kept in topologically sorted order. Also, DO NOT introduce
# cyclic dependencies between these directories!
-SUBDIRS = utils mteval klm/util klm/lm decoder phrasinator training training/liblbfgs mira dtrain dpmert pro-train rampion extools gi/pf gi/markov_al rst_parser
+SUBDIRS = \
+ utils \
+ mteval \
+ klm/util \
+ klm/lm \
+ decoder \
+ phrasinator \
+ training \
+ training/liblbfgs \
+ mira \
+ dtrain \
+ dpmert \
+ pro-train \
+ rampion \
+ minrisk \
+ extools \
+ gi/pf \
+ gi/markov_al \
+ rst_parser
#gi/pyp-topics/src gi/clda/src gi/posterior-regularisation/prjava
diff --git a/configure.ac b/configure.ac
index e22eddbd..eb2f4aaa 100644
--- a/configure.ac
+++ b/configure.ac
@@ -132,4 +132,4 @@ fi
CPPFLAGS="-DPIC -fPIC $CPPFLAGS -DHAVE_CONFIG_H"
-AC_OUTPUT(Makefile rst_parser/Makefile utils/Makefile mteval/Makefile extools/Makefile decoder/Makefile phrasinator/Makefile training/Makefile training/liblbfgs/Makefile dpmert/Makefile pro-train/Makefile rampion/Makefile klm/util/Makefile klm/lm/Makefile mira/Makefile dtrain/Makefile gi/pyp-topics/src/Makefile gi/clda/src/Makefile gi/pf/Makefile gi/markov_al/Makefile)
+AC_OUTPUT(Makefile rst_parser/Makefile utils/Makefile mteval/Makefile extools/Makefile decoder/Makefile phrasinator/Makefile training/Makefile training/liblbfgs/Makefile dpmert/Makefile pro-train/Makefile rampion/Makefile minrisk/Makefile klm/util/Makefile klm/lm/Makefile mira/Makefile dtrain/Makefile gi/pyp-topics/src/Makefile gi/clda/src/Makefile gi/pf/Makefile gi/markov_al/Makefile)
diff --git a/minrisk/Makefile.am b/minrisk/Makefile.am
new file mode 100644
index 00000000..a24f047c
--- /dev/null
+++ b/minrisk/Makefile.am
@@ -0,0 +1,6 @@
+bin_PROGRAMS = minrisk_optimize
+
+minrisk_optimize_SOURCES = minrisk_optimize.cc
+minrisk_optimize_LDADD = $(top_srcdir)/training/libtraining.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a $(top_srcdir)/training/liblbfgs/liblbfgs.a -lz
+
+AM_CPPFLAGS = -W -Wall $(GTEST_CPPFLAGS) -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval -I$(top_srcdir)/training
diff --git a/minrisk/minrisk.pl b/minrisk/minrisk.pl
new file mode 100755
index 00000000..99893a66
--- /dev/null
+++ b/minrisk/minrisk.pl
@@ -0,0 +1,540 @@
+#!/usr/bin/env perl
+use strict;
+my @ORIG_ARGV=@ARGV;
+use Cwd qw(getcwd);
+my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../environment"; }
+
+# Skip local config (used for distributing jobs) if we're running in local-only mode
+use LocalConfig;
+use Getopt::Long;
+use IPC::Open2;
+use POSIX ":sys_wait_h";
+my $QSUB_CMD = qsub_args(mert_memory());
+my $default_jobs = env_default_jobs();
+
+my $VEST_DIR="$SCRIPT_DIR/../dpmert";
+require "$VEST_DIR/libcall.pl";
+
+# Default settings
+my $srcFile;
+my $refFiles;
+my $bin_dir = $SCRIPT_DIR;
+die "Bin directory $bin_dir missing/inaccessible" unless -d $bin_dir;
+my $FAST_SCORE="$bin_dir/../mteval/fast_score";
+die "Can't execute $FAST_SCORE" unless -x $FAST_SCORE;
+my $MAPINPUT = "$bin_dir/minrisk_generate_input.pl";
+my $MAPPER = "$bin_dir/minrisk_optimize";
+my $parallelize = "$VEST_DIR/parallelize.pl";
+my $libcall = "$VEST_DIR/libcall.pl";
+my $sentserver = "$VEST_DIR/sentserver";
+my $sentclient = "$VEST_DIR/sentclient";
+my $LocalConfig = "$SCRIPT_DIR/../environment/LocalConfig.pm";
+
+my $SCORER = $FAST_SCORE;
+die "Can't find $MAPPER" unless -x $MAPPER;
+my $cdec = "$bin_dir/../decoder/cdec";
+die "Can't find decoder in $cdec" unless -x $cdec;
+die "Can't find $parallelize" unless -x $parallelize;
+die "Can't find $libcall" unless -e $libcall;
+my $decoder = $cdec;
+my $lines_per_mapper = 30;
+my $iteration = 1;
+my $best_weights;
+my $psi = 1;
+my $default_max_iter = 30;
+my $max_iterations = $default_max_iter;
+my $jobs = $default_jobs; # number of decode nodes
+my $pmem = "4g";
+my $disable_clean = 0;
+my %seen_weights;
+my $help = 0;
+my $epsilon = 0.0001;
+my $dryrun = 0;
+my $last_score = -10000000;
+my $metric = "ibm_bleu";
+my $dir;
+my $iniFile;
+my $weights;
+my $use_make = 1; # use make to parallelize
+my $useqsub = 0;
+my $initial_weights;
+my $pass_suffix = '';
+my $cpbin=1;
+
+# regularization strength
+my $tune_regularizer = 0;
+my $reg = 500;
+my $reg_previous = 5000;
+my $dont_accum = 0;
+
+# Process command-line options
+Getopt::Long::Configure("no_auto_abbrev");
+if (GetOptions(
+ "jobs=i" => \$jobs,
+ "dont-clean" => \$disable_clean,
+ "dont-accumulate" => \$dont_accum,
+ "pass-suffix=s" => \$pass_suffix,
+ "qsub" => \$useqsub,
+ "dry-run" => \$dryrun,
+ "epsilon=s" => \$epsilon,
+ "help" => \$help,
+ "weights=s" => \$initial_weights,
+ "reg=f" => \$reg,
+ "use-make=i" => \$use_make,
+ "max-iterations=i" => \$max_iterations,
+ "pmem=s" => \$pmem,
+ "cpbin!" => \$cpbin,
+ "ref-files=s" => \$refFiles,
+ "metric=s" => \$metric,
+ "source-file=s" => \$srcFile,
+ "workdir=s" => \$dir,
+) == 0 || @ARGV!=1 || $help) {
+ print_help();
+ exit;
+}
+
+die "--tune-regularizer is no longer supported with --reg-previous and --reg. Please tune manually.\n" if $tune_regularizer;
+
+# --qsub replaces the local --use-fork mode of the parallelizer, so it is
+# only allowed when the host has a qsub configuration. has_qsub() is not
+# defined in this file; presumably it comes from the LocalConfig module
+# loaded above.
+if ($useqsub) {
+    $use_make = 0;
+    die "LocalConfig.pm does not have qsub configuration for this host. Cannot run with --qsub!\n" unless has_qsub();
+}
+
+my @missing_args = ();
+if (!defined $srcFile) { push @missing_args, "--source-file"; }
+if (!defined $refFiles) { push @missing_args, "--ref-files"; }
+if (!defined $initial_weights) { push @missing_args, "--weights"; }
+die "Please specify missing arguments: " . join (', ', @missing_args) . "\n" if (@missing_args);
+
+if ($metric =~ /^(combi|ter)$/i) {
+ $lines_per_mapper = 5;
+}
+
+($iniFile) = @ARGV;
+
+
+sub write_config;
+sub enseg;
+sub print_help;
+
+my $nodelist;
+my $host =check_output("hostname"); chomp $host;
+my $bleu;
+my $interval_count = 0;
+my $logfile;
+my $projected_score;
+
+# used in sorting scores
+my $DIR_FLAG = '-r';
+if ($metric =~ /^ter$|^aer$/i) {
+ $DIR_FLAG = '';
+}
+
+my $refs_comma_sep = get_comma_sep_refs('r',$refFiles);
+
+# Resolve the working directory: use the built-in default name when --workdir
+# was not given, then anchor a relative path at the current directory.
+# NOTE(review): the default name "rampion" looks inherited from another
+# trainer -- confirm it is the intended default for minrisk.
+$dir ||= "rampion";
+if ($dir !~ /^\//) { # convert relative path to absolute path
+    my $cwd = check_output("pwd");
+    chomp $cwd;
+    $dir = "$cwd/$dir";
+}
+
+
+# Initializations and helper functions
+srand;
+
+my @childpids = ();
+my @cleanupcmds = ();
+
+# Kill every process recorded in @childpids, run every command recorded in
+# @cleanupcmds, then terminate the script with exit status 1.
+sub cleanup {
+    print STDERR "Cleanup...\n";
+    foreach my $child (@childpids) {
+        unchecked_call("kill $child");
+    }
+    foreach my $command (@cleanupcmds) {
+        unchecked_call("$command");
+    }
+    exit 1;
+}
+# Always call cleanup, no matter how we exit
+*CORE::GLOBAL::exit =
+ sub{ cleanup(); };
+$SIG{INT} = "cleanup";
+$SIG{TERM} = "cleanup";
+$SIG{HUP} = "cleanup";
+
+my $decoderBase = check_output("basename $decoder"); chomp $decoderBase;
+my $newIniFile = "$dir/$decoderBase.ini";
+my $inputFileName = "$dir/input";
+my $user = $ENV{"USER"};
+# Make sure the decoder ini file exists. The script only copies it into the
+# working directory later and never reads it itself, so no file handle is
+# opened here.
+-e $iniFile || die "Error: could not open $iniFile for reading\n";
+
+use File::Basename qw(basename);
+# modbin(BINDIR, \$path1, \$path2, ...)
+# Creates BINDIR, copies every referenced file into it (cp -p) and rewrites
+# each referenced variable so that it points at the copy.
+sub modbin {
+    my ($bindir, @path_refs) = @_;
+    check_call("mkdir -p $bindir");
+    die "couldn't make bindir $bindir" unless -d $bindir;
+    foreach my $path_ref (@path_refs) {
+        my $original = $$path_ref;
+        $$path_ref = "$bindir/" . basename($original);
+        check_call("cp -p $original $$path_ref");
+    }
+}
+# Return the number of entries readdir reports for directory $_[0], minus
+# one. The "." and ".." entries are included in the count, so an empty
+# directory yields 1. Returns -1 when the directory cannot be opened.
+sub dirsize {
+    my ($path) = @_;
+    opendir(my $dh, $path) or return -1;
+    # readdir must be called in list context to obtain every entry; in
+    # scalar context it returns only the next entry name, not a count.
+    my @entries = readdir($dh);
+    closedir($dh);
+    return scalar(@entries) - 1;
+}
+my @allweights;
+if ($dryrun){
+ write_config(*STDERR);
+ exit 0;
+} else {
+ if (-e $dir && dirsize($dir)>1 && -e "$dir/hgs" ){ # allow preexisting logfile, binaries, but not dist-pro.pl outputs
+ die "ERROR: working dir $dir already exists\n\n";
+ } else {
+ -e $dir || mkdir $dir;
+ mkdir "$dir/hgs";
+ modbin("$dir/bin",\$LocalConfig,\$cdec,\$SCORER,\$MAPINPUT,\$MAPPER,\$parallelize,\$sentserver,\$sentclient,\$libcall) if $cpbin;
+ mkdir "$dir/scripts";
+ my $cmdfile="$dir/rerun-pro.sh";
+ open CMD,'>',$cmdfile;
+ print CMD "cd ",&getcwd,"\n";
+# print CMD &escaped_cmdline,"\n"; #buggy - last arg is quoted.
+ my $cline=&cmdline."\n";
+ print CMD $cline;
+ close CMD;
+ print STDERR $cline;
+ chmod(0755,$cmdfile);
+ check_call("cp $initial_weights $dir/weights.0");
+ die "Can't find weights.0" unless (-e "$dir/weights.0");
+ }
+ write_config(*STDERR);
+}
+
+
+# Generate initial files and values
+check_call("cp $iniFile $newIniFile");
+$iniFile = $newIniFile;
+
+my $newsrc = "$dir/dev.input";
+enseg($srcFile, $newsrc);
+$srcFile = $newsrc;
+my $devSize = 0;
+open F, "<$srcFile" or die "Can't read $srcFile: $!";
+while(<F>) { $devSize++; }
+close F;
+
+unless($best_weights){ $best_weights = $weights; }
+unless($projected_score){ $projected_score = 0.0; }
+$seen_weights{$weights} = 1;
+my $kbest = "$dir/kbest";
+if ($dont_accum) {
+ $kbest = '';
+} else {
+ check_call("mkdir -p $kbest");
+ $kbest = "--kbest_repository $kbest";
+}
+
+my $random_seed = int(time / 1000);
+my $lastWeightsFile;
+my $lastPScore = 0;
+# main optimization loop
+# Each pass decodes the dev set with the weights of the previous iteration,
+# checks that every sentence produced a hypergraph and a top-best line,
+# scores the top-best output, and runs the optimizer to write
+# weights.$iteration. The loop ends once $max_iterations passes have run.
+while (1){
+    print STDERR "\n\nITERATION $iteration\n==========\n";
+
+    if ($iteration > $max_iterations){
+        print STDERR "\nREACHED STOPPING CRITERION: Maximum iterations\n";
+        last;
+    }
+    # iteration-specific files
+    my $runFile="$dir/run.raw.$iteration";
+    my $logdir="$dir/logs.$iteration";
+    my $decoderLog="$logdir/decoder.sentserver.log.$iteration";
+    check_call("mkdir -p $logdir");
+
+    #decode
+    print STDERR "RUNNING DECODER AT ";
+    print STDERR unchecked_output("date");
+    my $im1 = $iteration - 1;
+    my $weightsFile="$dir/weights.$im1";
+    push @allweights, "-w $dir/weights.$im1";
+    `rm -f $dir/hgs/*.gz`;
+    my $decoder_cmd = "$decoder -c $iniFile --weights$pass_suffix $weightsFile -O $dir/hgs";
+    my $pcmd;
+    if ($use_make) {
+        $pcmd = "cat $srcFile | $parallelize --use-fork -p $pmem -e $logdir -j $jobs --";
+    } else {
+        $pcmd = "cat $srcFile | $parallelize -p $pmem -e $logdir -j $jobs --";
+    }
+    my $cmd = "$pcmd $decoder_cmd 2> $decoderLog 1> $runFile";
+    print STDERR "COMMAND:\n$cmd\n";
+    check_bash_call($cmd);
+
+    # The decoder output may show up late on a distributed filesystem, so
+    # the counts are re-read a few times before giving up.
+    my $num_hgs;
+    my $num_topbest;
+    my $retries = 0;
+    while($retries < 5) {
+        $num_hgs = check_output("ls $dir/hgs/*.gz | wc -l");
+        $num_topbest = check_output("wc -l < $runFile");
+        print STDERR "NUMBER OF HGs: $num_hgs\n";
+        print STDERR "NUMBER OF TOP-BEST HYPs: $num_topbest\n";
+        if($devSize == $num_hgs && $devSize == $num_topbest) {
+            last;
+        } else {
+            print STDERR "Incorrect number of hypergraphs or topbest. Waiting for distributed filesystem and retrying...\n";
+            sleep(3);
+        }
+        $retries++;
+    }
+    die "Dev set contains $devSize sentences, but we don't have topbest and hypergraphs for all these! Decoder failure? Check $decoderLog\n" if ($devSize != $num_hgs || $devSize != $num_topbest);
+    my $dec_score = check_output("cat $runFile | $SCORER $refs_comma_sep -m $metric");
+    chomp $dec_score;
+    print STDERR "DECODER SCORE: $dec_score\n";
+
+    # save space
+    check_call("gzip -f $runFile");
+    check_call("gzip -f $decoderLog");
+
+    # run optimizer
+    print STDERR "RUNNING OPTIMIZER AT ";
+    print STDERR unchecked_output("date");
+    print STDERR " - GENERATE TRAINING EXEMPLARS\n";
+
+    my $outweights="$dir/weights.$iteration";
+    $cmd="$MAPINPUT $dir/hgs > $dir/agenda.$im1";
+    print STDERR "COMMAND:\n$cmd\n";
+    check_call($cmd);
+    $cmd="$MAPPER $refs_comma_sep -m $metric -i $dir/agenda.$im1 $kbest -w $weightsFile > $outweights";
+    check_call($cmd);
+    $lastWeightsFile = $outweights;
+    $iteration++;
+    # Drop this iteration's hypergraphs. The path is anchored at $dir, like
+    # the removal before decoding, so that nothing relative to the current
+    # directory is touched.
+    `rm -f $dir/hgs/*.gz`;
+    print STDERR "\n==========\n";
+}
+
+print STDERR "\nFINAL WEIGHTS: $lastWeightsFile\n(Use -w <this file> with the decoder)\n\n";
+
+print STDOUT "$lastWeightsFile\n";
+
+exit 0;
+
+# Return the number of lines in file $fn; dies if the file cannot be read.
+sub get_lines {
+    my ($fn) = @_;
+    open(my $fh, '<', $fn) or die "Couldn't read $fn: $!";
+    my $lc = 0;
+    while (my $line = <$fh>) { $lc++; }
+    close($fh);
+    return $lc;
+}
+
+# get_comma_sep_refs(FLAG, PATTERN)
+# Passes PATTERN to the shell via "echo" and returns the whitespace-separated
+# words of the output as one string in which every word is preceded by
+# "-FLAG". Despite the name, the pieces are joined with spaces, not commas.
+sub get_comma_sep_refs {
+    my ($flag, $pattern) = @_;
+    my $expanded = check_output("echo $pattern");
+    chomp $expanded;
+    my $prefix = "-$flag ";
+    return $prefix . join(" $prefix", split(/\s+/, $expanded));
+}
+
+# Read a weights file of "<name> <weight>" lines and return the weights, in
+# file order, joined by single spaces. Lines starting with '#' and blank
+# lines are skipped; lines that do not match the expected shape only
+# trigger a warning.
+sub read_weights_file {
+    my ($file) = @_;
+    # A lexical handle keeps this sub from clobbering the bareword handle F
+    # that the main program also uses.
+    open(my $fh, '<', $file) or die "Couldn't read $file: $!";
+    my @r = ();
+    my $pm = -1;
+    while (my $line = <$fh>) {
+        next if $line =~ /^#/;
+        next if $line =~ /^\s*$/;
+        chomp $line;
+        if ($line =~ /^(.+)\s+(.+)$/) {
+            my $m = $1;
+            my $w = $2;
+            # NOTE(review): $pm is never updated, so this only rejects names
+            # that compare numerically <= -1 -- confirm whether an ordering
+            # check is still wanted here.
+            die "Weights out of order: $m <= $pm" unless $m > $pm;
+            push @r, $w;
+        } else {
+            warn "Unexpected feature name in weight file: $line";
+        }
+    }
+    close($fh);
+    return join ' ', @r;
+}
+
+# subs
+# Print a summary of the run configuration (decoder, files, metric, job
+# settings) to the file handle passed as the only argument.
+sub write_config {
+    my ($fh) = @_;
+    my $cleanup = $disable_clean ? "no" : "yes";
+
+    print {$fh} join('',
+        "\n",
+        "DECODER: $decoder\n",
+        "INI FILE: $iniFile\n",
+        "WORKING DIR: $dir\n",
+        "SOURCE (DEV): $srcFile\n",
+        "REFS (DEV): $refFiles\n",
+        "EVAL METRIC: $metric\n",
+        "MAX ITERATIONS: $max_iterations\n",
+        "JOBS: $jobs\n",
+        "HEAD NODE: $host\n",
+        "PMEM (DECODING): $pmem\n",
+        "CLEANUP: $cleanup\n",
+    );
+}
+
+# update_weights_file(FILE, \@feature_names, \@weights)
+# Writes one "<name> <weight>" line per feature to FILE, replacing its
+# previous content. Dies when the two lists differ in length or when the
+# file cannot be written.
+sub update_weights_file {
+    my ($neww, $rfn, $rpts) = @_;
+    my @feats = @$rfn;
+    my @pts = @$rpts;
+    my $num_feats = scalar @feats;
+    my $num_pts = scalar @pts;
+    die "$num_feats (num_feats) != $num_pts (num_pts)" unless $num_feats == $num_pts;
+    open(my $out, '>', $neww) or die "Can't write $neww: $!";
+    for my $i (0 .. $num_feats - 1) {
+        print {$out} "$feats[$i] $pts[$i]\n";
+    }
+    # Buffered write errors only surface when the handle is closed.
+    close($out) or die "Can't finish writing $neww: $!";
+}
+
+# enseg(SRC, NEWSRC)
+# Copies the dev set SRC to NEWSRC with every line wrapped in a
+# <seg id="N"> element, N being the zero-based line number. Lines that
+# already start with a <seg tag are copied unchanged but must carry a
+# numeric id attribute. Dies on I/O errors and on an empty input.
+sub enseg {
+    my ($src, $newsrc) = @_;
+    # Checking both opens keeps an unreadable source from being reported
+    # as an empty dev set further down.
+    open(my $in, '<', $src) or die "Can't read $src: $!";
+    open(my $out, '>', $newsrc) or die "Can't write $newsrc: $!";
+    my $i=0;
+    while (my $line = <$in>) {
+        chomp $line;
+        if ($line =~ /^\s*<seg/i) {
+            if ($line =~ /id="[0-9]+"/) {
+                print {$out} "$line\n";
+            } else {
+                die "When using segments with pre-generated <seg> tags, you must include a zero-based id attribute";
+            }
+        } else {
+            print {$out} "<seg id=\"$i\">$line</seg>\n";
+        }
+        $i++;
+    }
+    close($in);
+    close($out) or die "Can't finish writing $newsrc: $!";
+    die "Empty dev set!" if ($i == 0);
+}
+
+# Print the usage message to STDOUT. The text interpolates the script name
+# and the default values for --max-iterations and --jobs.
+sub print_help {
+
+    # basename() from File::Basename (already loaded by this script) avoids
+    # spawning a shell just to strip the directory part of $0.
+    my $executable = basename($0);
+    print <<"Help";
+
+Usage: $executable [options] <ini file>
+
+ $executable [options] <ini file>
+ Runs a complete minimum-risk optimization using the ini file specified.
+
+Required:
+
+ --ref-files <files>
+ Dev set ref files. This option takes only a single string argument.
+ To use multiple files (including file globbing), this argument should
+ be quoted.
+
+ --source-file <file>
+ Dev set source file.
+
+ --weights <file>
+ Initial weights file (use empty file to start from 0)
+
+General options:
+
+ --help
+ Print this message and exit.
+
+ --dont-accumulate
+ Don't accumulate k-best lists from multiple iterations.
+
+ --max-iterations <M>
+ Maximum number of iterations to run. If not specified, defaults
+ to $default_max_iter.
+
+ --metric <method>
+ Metric to optimize.
+ Example values: IBM_BLEU, NIST_BLEU, Koehn_BLEU, TER, Combi
+
+ --pass-suffix <S>
+ If the decoder is doing multi-pass decoding, the pass suffix "2",
+ "3", etc., is used to control what iteration of weights is set.
+
+ --workdir <dir>
+ Directory for intermediate and output files. If not specified, a
+ default directory is created under the current working directory.
+
+Regularization options:
+
+ --reg <F>
+ l2 regularization strength [default=500]. The greater this value,
+ the closer to zero the weights will be.
+
+Job control options:
+
+ --jobs <I>
+ Number of decoder processes to run in parallel. [default=$default_jobs]
+
+ --qsub
+ Use qsub to run jobs in parallel (qsub must be configured in
+ environment/LocalConfig.pm)
+
+ --pmem <N>
+ Amount of physical memory requested for parallel decoding jobs
+ (used with qsub requests only)
+
+Help
+}
+
+# Turn a "k1=v1;k2=v2;..." string into a hash and return it as a key/value
+# list. A later pair with the same key replaces an earlier one.
+sub convert {
+    my ($str) = @_;
+    my %dict;
+    foreach my $pair (split /;/, $str) {
+        my ($key, $value) = split /=/, $pair;
+        $dict{$key} = $value;
+    }
+    return %dict;
+}
+
+
+# Return the script name and its original arguments joined by single
+# spaces, without any shell quoting.
+sub cmdline {
+    my @words = ($0, @ORIG_ARGV);
+    return join(' ', @words);
+}
+
+# Patterns used by escape_shell. They are filled in inside a BEGIN block
+# because the main program calls exit before run-time execution reaches
+# this part of the file; a plain run-time assignment here would leave both
+# patterns undefined whenever escape_shell is called from the main program.
+my ($is_shell_special, $shell_escape_in_quote);
+BEGIN {
+    # Characters that force an argument to be quoted. The dollar sign is
+    # written as \$ so that it is always taken literally.
+    $is_shell_special = qr{[ \t\n\\><|&;"'`~*?{}\$!()]};
+    # Characters that still need a backslash inside double quotes.
+    $shell_escape_in_quote = qr{[\\"\$`!]};
+}
+
+# Return $arg quoted for the shell: arguments containing a shell-special
+# character are wrapped in double quotes with the characters that stay
+# special inside double quotes backslash-escaped; all other arguments are
+# returned unchanged. Returns undef for an undefined argument.
+sub escape_shell {
+    my ($arg)=@_;
+    return undef unless defined $arg;
+    if ($arg =~ /$is_shell_special/) {
+        $arg =~ s/($shell_escape_in_quote)/\\$1/g;
+        return "\"$arg\"";
+    }
+    return $arg;
+}
+
+# Return a shell-escaped copy of every argument; a trailing newline is
+# removed from each copy first and the caller's values are left untouched.
+sub escaped_shell_args {
+    my @escaped;
+    foreach my $raw (@_) {
+        my $arg = $raw;
+        chomp $arg;
+        push @escaped, escape_shell($arg);
+    }
+    return @escaped;
+}
+
+# Shell-escape every argument and join the results with single spaces.
+sub escaped_shell_args_str {
+    my @escaped = escaped_shell_args(@_);
+    return join(' ', @escaped);
+}
+
+# Return the script name followed by its original arguments, with the
+# arguments shell-escaped.
+sub escaped_cmdline {
+    my $escaped_args = escaped_shell_args_str(@ORIG_ARGV);
+    return join(' ', $0, $escaped_args);
+}
diff --git a/minrisk/minrisk_generate_input.pl b/minrisk/minrisk_generate_input.pl
new file mode 100755
index 00000000..b30fc4fd
--- /dev/null
+++ b/minrisk/minrisk_generate_input.pl
@@ -0,0 +1,18 @@
+#!/usr/bin/perl -w
+use strict;
+
+# Usage: minrisk_generate_input.pl HG_DIR
+# Prints one "<path> <id>" line for every *.gz file found in HG_DIR, where
+# <id> is the file name without its trailing ".json.gz" or ".gz".
+die "Usage: $0 HG_DIR\n" unless scalar @ARGV == 1;
+my $d = shift @ARGV;
+die "Can't find directory $d" unless -d $d;
+
+opendir(my $dh, $d) or die "Can't read $d: $!";
+my @hgs = grep { /\.gz$/ } readdir($dh);
+closedir($dh);
+
+for my $hg (@hgs) {
+    # Anchored at the end so that only the extension is removed, never a
+    # ".gz" that happens to occur earlier in the file name.
+    (my $id = $hg) =~ s/(\.json)?\.gz$//;
+    print "$d/$hg $id\n";
+}
+
diff --git a/minrisk/minrisk_optimize.cc b/minrisk/minrisk_optimize.cc
new file mode 100644
index 00000000..5096acc1
--- /dev/null
+++ b/minrisk/minrisk_optimize.cc
@@ -0,0 +1,141 @@
+#include <sstream>
+#include <iostream>
+#include <vector>
+#include <limits>
+
+#include <boost/program_options.hpp>
+#include <boost/program_options/variables_map.hpp>
+
+#include "liblbfgs/lbfgs++.h"
+#include "filelib.h"
+#include "stringlib.h"
+#include "weights.h"
+#include "hg_io.h"
+#include "kbest.h"
+#include "viterbi.h"
+#include "ns.h"
+#include "ns_docscorer.h"
+#include "candidate_set.h"
+#include "risk.h"
+
+using namespace std;
+namespace po = boost::program_options;
+
+// Parses the command line into *conf. Prints the option summary to stderr
+// and exits with status 1 when --help is given or when one of the required
+// options (--reference, --weights) is missing.
+void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
+  po::options_description opts("Configuration options");
+  opts.add_options()
+        ("reference,r",po::value<vector<string> >(), "[REQD] Reference translation (tokenized text)")
+        ("weights,w",po::value<string>(), "[REQD] Weights files from current iterations")
+        ("input,i",po::value<string>()->default_value("-"), "Input file to map (- is STDIN)")
+        ("evaluation_metric,m",po::value<string>()->default_value("IBM_BLEU"), "Evaluation metric (ibm_bleu, koehn_bleu, nist_bleu, ter, meteor, etc.)")
+        ("kbest_repository,R",po::value<string>(), "Accumulate k-best lists from previous iterations (parameter is path to repository)")
+        ("kbest_size,k",po::value<unsigned>()->default_value(500u), "Top k-hypotheses to extract")
+        ("help,h", "Help");
+  po::options_description all_options;
+  all_options.add(opts);
+  po::store(parse_command_line(argc, argv, all_options), *conf);
+
+  const bool have_references = conf->count("reference") > 0;
+  const bool have_weights = conf->count("weights") > 0;
+  if (!have_references)
+    cerr << "Please specify one or more references using -r <REF.TXT>\n";
+  if (!have_weights)
+    cerr << "Please specify weights using -w <WEIGHTS.TXT>\n";
+  if (!have_references || !have_weights || conf->count("help")) {
+    cerr << all_options << endl;
+    exit(1);
+  }
+}
+
+EvaluationMetric* metric = NULL;
+
+// Objective functor passed to LBFGS: sums the risk of every candidate set
+// and accumulates the matching gradient.
+struct RiskObjective {
+  explicit RiskObjective(const vector<training::CandidateSet>& tr) : training(tr) {}
+
+  // x: current parameter vector. g: dense gradient buffer with x.size()
+  // entries, overwritten here. Returns the summed risk and prints
+  // 1 - (mean risk) to stderr on every call.
+  double operator()(const vector<double>& x, double* g) const {
+    fill(g, g + x.size(), 0.0);
+    double total_risk = 0;
+    for (unsigned k = 0; k < training.size(); ++k) {
+      SparseVector<double> set_gradient;
+      training::CandidateSetRisk set_risk(training[k], *metric);
+      total_risk += set_risk(x, &set_gradient);
+      for (SparseVector<double>::iterator it = set_gradient.begin(); it != set_gradient.end(); ++it)
+        g[it->first] += it->second;
+    }
+    cerr << (1-(total_risk / training.size())) << endl;
+    return total_risk;
+  }
+
+  const vector<training::CandidateSet>& training;
+};
+
+// Runs L-BFGS on the risk objective built from `training`. C1 and
+// memory_buffers are forwarded unchanged to the LBFGS constructor, and px
+// is the parameter vector handed to the optimizer (presumably updated in
+// place -- confirm against lbfgs++.h). The return value is always 0.
+double LearnParameters(const vector<training::CandidateSet>& training,
+                       const double C1,
+                       const unsigned memory_buffers,
+                       vector<weight_t>* px) {
+  RiskObjective objective(training);
+  LBFGS<RiskObjective> optimizer(px, objective, memory_buffers, C1);
+  optimizer.MinimizeFunction();
+  return 0;
+}
+
+// Reads "<hypergraph file> <sentence id>" lines from the input, builds one
+// k-best candidate set per line (merged with the k-best repository file of
+// that sentence when a repository is configured), minimizes the risk
+// objective and writes the resulting weights via Weights::WriteToFile("-").
+int main(int argc, char** argv) {
+  po::variables_map conf;
+  InitCommandLine(argc, argv, &conf);
+  const string evaluation_metric = conf["evaluation_metric"].as<string>();
+
+  metric = EvaluationMetric::Instance(evaluation_metric);
+  DocumentScorer ds(metric, conf["reference"].as<vector<string> >());
+  cerr << "Loaded " << ds.size() << " references for scoring with " << evaluation_metric << endl;
+
+  Hypergraph hg;
+  ReadFile in_read(conf["input"].as<string>());
+  string kbest_repo;
+  if (conf.count("kbest_repository")) {
+    kbest_repo = conf["kbest_repository"].as<string>();
+    MkDirP(kbest_repo);
+  }
+  istream &in=*in_read.stream();
+  const unsigned kbest_size = conf["kbest_size"].as<unsigned>();
+  vector<weight_t> weights;
+  const string weightsf = conf["weights"].as<string>();
+  Weights::InitFromFile(weightsf, &weights);
+  string line, file;
+  vector<training::CandidateSet> kis;
+  cerr << "Loading hypergraphs...\n";
+  while(getline(in, line)) {
+    istringstream is(line);
+    int sent_id;
+    // Parse the line before anything else: sent_id is needed to name the
+    // k-best repository file below and must not be read uninitialized.
+    if (!(is >> file >> sent_id)) {
+      cerr << "Malformed input line (expected \"<hypergraph file> <sentence id>\"): " << line << endl;
+      return 1;
+    }
+    kis.resize(kis.size() + 1);
+    training::CandidateSet& curkbest = kis.back();
+    string kbest_file;
+    if (kbest_repo.size()) {
+      ostringstream os;
+      os << kbest_repo << "/kbest." << sent_id << ".txt.gz";
+      kbest_file = os.str();
+      // start from the candidates accumulated in earlier iterations
+      if (FileExists(kbest_file))
+        curkbest.ReadFromFile(kbest_file);
+    }
+    ReadFile rf(file);
+    if (kis.size() % 5 == 0) { cerr << '.'; }
+    if (kis.size() % 200 == 0) { cerr << " [" << kis.size() << "]\n"; }
+    HypergraphIO::ReadFromJSON(rf.stream(), &hg);
+    hg.Reweight(weights);
+    curkbest.AddKBestCandidates(hg, kbest_size, ds[sent_id]);
+    if (kbest_file.size())
+      curkbest.WriteToFile(kbest_file);
+  }
+  cerr << "\nHypergraphs loaded.\n";
+  weights.resize(FD::NumFeats());
+
+  LearnParameters(kis, 0.0, 100, &weights);
+  Weights::WriteToFile("-", weights);
+  return 0;
+}
+
diff --git a/training/Makefile.am b/training/Makefile.am
index 19ee8f0d..68ebfab4 100644
--- a/training/Makefile.am
+++ b/training/Makefile.am
@@ -27,7 +27,8 @@ noinst_LIBRARIES = libtraining.a
libtraining_a_SOURCES = \
candidate_set.cc \
optimize.cc \
- online_optimizer.cc
+ online_optimizer.cc \
+ risk.cc
mpi_online_optimize_SOURCES = mpi_online_optimize.cc
mpi_online_optimize_LDADD = libtraining.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz
diff --git a/training/risk.cc b/training/risk.cc
new file mode 100644
index 00000000..d5a12cfd
--- /dev/null
+++ b/training/risk.cc
@@ -0,0 +1,45 @@
+#include "risk.h"
+
+#include "prob.h"
+#include "candidate_set.h"
+#include "ns.h"
+
+using namespace std;
+
+namespace training {
+
+// Returns the risk (expected cost) of the candidate set under params, with
+// p(e|f) proportional to exp(params . phi(e,f)). When g is non-NULL the
+// gradient is added to *g:
+//   g += \sum_e p(e|f) * loss(e) * (phi(e,f) - E[phi(e,f)])
+double CandidateSetRisk::operator()(const vector<double>& params,
+                                    SparseVector<double>* g) const {
+  const unsigned num_cands = cands_.size();
+
+  // model score of every candidate and the normalizer z
+  vector<double> score(num_cands);
+  prob_t z;
+  for (unsigned i = 0; i < num_cands; ++i) {
+    score[i] = cands_[i].fmap.dot(params);
+    z += prob_t(score[i], init_lnx());
+  }
+  const double log_z = log(z);
+
+  // normalized probability of every candidate
+  vector<double> posterior(num_cands);
+  for (unsigned i = 0; i < num_cands; ++i)
+    posterior[i] = exp(score[i] - log_z);
+
+  // expected feature vector, only needed for the gradient
+  SparseVector<double> exp_feats;
+  if (g) {
+    for (unsigned i = 0; i < num_cands; ++i)
+      exp_feats += cands_[i].fmap * posterior[i];
+  }
+
+  double risk = 0;
+  for (unsigned i = 0; i < num_cands; ++i) {
+    // error metrics are used as the cost directly, other metrics as 1 - score
+    double cost;
+    if (metric_.IsErrorMetric())
+      cost = metric_.ComputeScore(cands_[i].eval_feats);
+    else
+      cost = 1.0 - metric_.ComputeScore(cands_[i].eval_feats);
+    const double r = posterior[i] * cost;
+    risk += r;
+    if (g) (*g) += (cands_[i].fmap - exp_feats) * r;
+  }
+  return risk;
+}
+
+}
+
+
diff --git a/training/risk.h b/training/risk.h
new file mode 100644
index 00000000..2e8db0fb
--- /dev/null
+++ b/training/risk.h
@@ -0,0 +1,26 @@
+#ifndef _RISK_H_
+#define _RISK_H_
+
+#include <cstddef>  // NULL, used as a default argument below
+#include <vector>
+#include "sparse_vector.h"
+class EvaluationMetric;
+
+namespace training {
+  class CandidateSet;
+
+  // Function object that computes the risk of one CandidateSet under an
+  // evaluation metric. It keeps references to both constructor arguments,
+  // so they must outlive the object.
+  class CandidateSetRisk {
+   public:
+    explicit CandidateSetRisk(const CandidateSet& cs, const EvaluationMetric& metric) :
+        cands_(cs),
+        metric_(metric) {}
+    // compute the risk (expected loss) of a CandidateSet
+    // (optional) the gradient of the risk with respect to params
+    double operator()(const std::vector<double>& params,
+                      SparseVector<double>* g = NULL) const;
+   private:
+    const CandidateSet& cands_;
+    const EvaluationMetric& metric_;
+  };
+}
+
+#endif