From 47aa8d94d3ddff39295966cee67ce884c98be8da Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Fri, 27 Jan 2012 14:49:08 -0500 Subject: rename vest to dpmert (dynamic programming mert), rename variables and types to correspond to standard geometric concepts --- dpmert/Makefile.am | 35 ++ dpmert/README.shared-mem | 9 + dpmert/cat.pl | 4 + dpmert/ces.cc | 91 ++++ dpmert/ces.h | 16 + dpmert/dpmert.pl | 700 ++++++++++++++++++++++++++++++ dpmert/error_surface.cc | 42 ++ dpmert/error_surface.h | 24 + dpmert/libcall.pl | 71 +++ dpmert/line_mediator.pl | 116 +++++ dpmert/line_optimizer.cc | 111 +++++ dpmert/line_optimizer.h | 48 ++ dpmert/lo_test.cc | 236 ++++++++++ dpmert/mert_geometry.cc | 186 ++++++++ dpmert/mert_geometry.h | 81 ++++ dpmert/mr_dpmert_generate_mapper_input.cc | 78 ++++ dpmert/mr_dpmert_map.cc | 112 +++++ dpmert/mr_dpmert_reduce.cc | 77 ++++ dpmert/parallelize.pl | 423 ++++++++++++++++++ dpmert/sentclient.c | 76 ++++ dpmert/sentserver.c | 515 ++++++++++++++++++++++ dpmert/sentserver.h | 6 + dpmert/tac.pl | 8 + dpmert/test_aer/README | 8 + dpmert/test_aer/cdec.ini | 3 + dpmert/test_aer/corpus.src | 3 + dpmert/test_aer/grammar | 12 + dpmert/test_aer/ref.0 | 3 + dpmert/test_aer/weights | 13 + dpmert/test_data/0.json.gz | Bin 0 -> 13709 bytes dpmert/test_data/1.json.gz | Bin 0 -> 204803 bytes dpmert/test_data/c2e.txt.0 | 2 + dpmert/test_data/c2e.txt.1 | 2 + dpmert/test_data/c2e.txt.2 | 2 + dpmert/test_data/c2e.txt.3 | 2 + dpmert/test_data/re.txt.0 | 5 + dpmert/test_data/re.txt.1 | 5 + dpmert/test_data/re.txt.2 | 5 + dpmert/test_data/re.txt.3 | 5 + 39 files changed, 3135 insertions(+) create mode 100644 dpmert/Makefile.am create mode 100644 dpmert/README.shared-mem create mode 100755 dpmert/cat.pl create mode 100644 dpmert/ces.cc create mode 100644 dpmert/ces.h create mode 100755 dpmert/dpmert.pl create mode 100644 dpmert/error_surface.cc create mode 100644 dpmert/error_surface.h create mode 100644 dpmert/libcall.pl create mode 100755 dpmert/line_mediator.pl create mode 100644 dpmert/line_optimizer.cc create mode 100644 dpmert/line_optimizer.h create mode 100644 dpmert/lo_test.cc create mode 100644 dpmert/mert_geometry.cc create mode 100644 dpmert/mert_geometry.h create mode 100644 dpmert/mr_dpmert_generate_mapper_input.cc create mode 100644 dpmert/mr_dpmert_map.cc create mode 100644 dpmert/mr_dpmert_reduce.cc create mode 100755 dpmert/parallelize.pl create mode 100644 dpmert/sentclient.c create mode 100644 dpmert/sentserver.c create mode 100644 dpmert/sentserver.h create mode 100755 dpmert/tac.pl create mode 100644 dpmert/test_aer/README create mode 100644 dpmert/test_aer/cdec.ini create mode 100644 dpmert/test_aer/corpus.src create mode 100644 dpmert/test_aer/grammar create mode 100644 dpmert/test_aer/ref.0 create mode 100644 dpmert/test_aer/weights create mode 100644 dpmert/test_data/0.json.gz create mode 100644 dpmert/test_data/1.json.gz create mode 100644 dpmert/test_data/c2e.txt.0 create mode 100644 dpmert/test_data/c2e.txt.1 create mode 100644 dpmert/test_data/c2e.txt.2 create mode 100644 dpmert/test_data/c2e.txt.3 create mode 100644 dpmert/test_data/re.txt.0 create mode 100644 dpmert/test_data/re.txt.1 create mode 100644 dpmert/test_data/re.txt.2 create mode 100644 dpmert/test_data/re.txt.3 (limited to 'dpmert') diff --git a/dpmert/Makefile.am b/dpmert/Makefile.am new file mode 100644 index 00000000..2676fb50 --- /dev/null +++ b/dpmert/Makefile.am @@ -0,0 +1,35 @@ +bin_PROGRAMS = \ + mr_dpmert_map \ + mr_dpmert_reduce \ + mr_dpmert_generate_mapper_input \ + sentserver \ + 
sentclient + +if HAVE_GTEST +noinst_PROGRAMS = \ + lo_test +TESTS = lo_test +endif + +sentserver_SOURCES = sentserver.c +sentserver_LDFLAGS = -all-static -pthread + +sentclient_SOURCES = sentclient.c +sentclient_LDFLAGS = -all-static -pthread + +mr_dpmert_generate_mapper_input_SOURCES = mr_dpmert_generate_mapper_input.cc line_optimizer.cc +mr_dpmert_generate_mapper_input_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz + +# nbest2hg_SOURCES = nbest2hg.cc +# nbest2hg_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lfst -lz + +mr_dpmert_map_SOURCES = mert_geometry.cc ces.cc error_surface.cc mr_dpmert_map.cc line_optimizer.cc +mr_dpmert_map_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz + +mr_dpmert_reduce_SOURCES = error_surface.cc ces.cc mr_dpmert_reduce.cc line_optimizer.cc mert_geometry.cc +mr_dpmert_reduce_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz + +lo_test_SOURCES = lo_test.cc ces.cc mert_geometry.cc error_surface.cc line_optimizer.cc +lo_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz + +AM_CPPFLAGS = -W -Wall -Wno-sign-compare $(GTEST_CPPFLAGS) -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval diff --git a/dpmert/README.shared-mem b/dpmert/README.shared-mem new file mode 100644 index 00000000..7728efc0 --- /dev/null +++ b/dpmert/README.shared-mem @@ -0,0 +1,9 @@ +If you want to run dist-vest.pl on a very large shared memory machine, do the +following: + + ./dist-vest.pl --use-make I --decode-nodes J --weights weights.init --source-file=dev.src --ref-files=dev.ref.* cdec.ini + +This will use I jobs for doing the line search and J jobs to run the decoder. Typically, since the +decoder must load grammars, language models, etc., J should be smaller than I, but this will depend +on the system you are running on and the complexity of the models used for decoding. 
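Since this commit renames the driver script to dpmert.pl, a comparable shared-memory invocation would look roughly like the sketch below. The option names are taken from the GetOptions list in dpmert.pl later in this patch (make-based parallelism is its default, and it exposes a single --jobs count rather than separate line-search and decode-node counts), so treat this as an illustrative sketch rather than an exact equivalent of the command above:

  ./dpmert.pl --jobs J --weights weights.init --source-file dev.src --ref-files 'dev.ref.*' cdec.ini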
+ diff --git a/dpmert/cat.pl b/dpmert/cat.pl new file mode 100755 index 00000000..2ecba3f9 --- /dev/null +++ b/dpmert/cat.pl @@ -0,0 +1,4 @@ +#!/usr/bin/perl + +$|=1; +print while(<>); diff --git a/dpmert/ces.cc b/dpmert/ces.cc new file mode 100644 index 00000000..a85454da --- /dev/null +++ b/dpmert/ces.cc @@ -0,0 +1,91 @@ +#include "ces.h" + +#include +#include +#include + +// TODO, if AER is to be optimized again, we will need this +// #include "aligner.h" +#include "lattice.h" +#include "mert_geometry.h" +#include "error_surface.h" +#include "ns.h" + +using boost::shared_ptr; +using namespace std; + +const bool minimize_segments = true; // if adjacent segments have equal scores, merge them + +void ComputeErrorSurface(const SegmentEvaluator& ss, + const ConvexHull& ve, + ErrorSurface* env, + const EvaluationMetric* metric, + const Hypergraph& hg) { + vector prev_trans; + const vector >& ienv = ve.GetSortedSegs(); + env->resize(ienv.size()); + SufficientStats prev_score; // defaults to 0 + int j = 0; + for (int i = 0; i < ienv.size(); ++i) { + const MERTPoint& seg = *ienv[i]; + vector trans; +#if 0 + if (type == AER) { + vector edges(hg.edges_.size(), false); + seg.CollectEdgesUsed(&edges); // get the set of edges in the viterbi + // alignment + ostringstream os; + const string* psrc = ss.GetSource(); + if (psrc == NULL) { + cerr << "AER scoring in VEST requires source, but it is missing!\n"; + abort(); + } + size_t pos = psrc->rfind(" ||| "); + if (pos == string::npos) { + cerr << "Malformed source for AER: expected |||\nINPUT: " << *psrc << endl; + abort(); + } + Lattice src; + Lattice ref; + LatticeTools::ConvertTextOrPLF(psrc->substr(0, pos), &src); + LatticeTools::ConvertTextOrPLF(psrc->substr(pos + 5), &ref); + AlignerTools::WriteAlignment(src, ref, hg, &os, true, 0, &edges); + string tstr = os.str(); + TD::ConvertSentence(tstr.substr(tstr.rfind(" ||| ") + 5), &trans); + } else { +#endif + seg.ConstructTranslation(&trans); + //} + //cerr << "Scoring: " << TD::GetString(trans) << endl; + if (trans == prev_trans) { + if (!minimize_segments) { + ErrorSegment& out = (*env)[j]; + out.delta.fields.clear(); + out.x = seg.x; + ++j; + } + //cerr << "Identical translation, skipping scoring\n"; + } else { + SufficientStats score; + ss.Evaluate(trans, &score); + // cerr << "score= " << score->ComputeScore() << "\n"; + //string x1; score.Encode(&x1); cerr << "STATS: " << x1 << endl; + const SufficientStats delta = score - prev_score; + //string x2; delta.Encode(&x2); cerr << "DELTA: " << x2 << endl; + //string xx; delta.Encode(&xx); cerr << xx << endl; + prev_trans.swap(trans); + prev_score = score; + if ((!minimize_segments) || (!delta.IsAdditiveIdentity())) { + ErrorSegment& out = (*env)[j]; + out.delta = delta; + out.x = seg.x; + ++j; + } + } + } + // cerr << " In segments: " << ienv.size() << endl; + // cerr << "Out segments: " << j << endl; + assert(j > 0); + env->resize(j); +} + diff --git a/dpmert/ces.h b/dpmert/ces.h new file mode 100644 index 00000000..e4fa2080 --- /dev/null +++ b/dpmert/ces.h @@ -0,0 +1,16 @@ +#ifndef _CES_H_ +#define _CES_H_ + +class ConvexHull; +class Hypergraph; +class SegmentEvaluator; +class ErrorSurface; +class EvaluationMetric; + +void ComputeErrorSurface(const SegmentEvaluator& ss, + const ConvexHull& convex_hull, + ErrorSurface* es, + const EvaluationMetric* metric, + const Hypergraph& hg); + +#endif diff --git a/dpmert/dpmert.pl b/dpmert/dpmert.pl new file mode 100755 index 00000000..52ce0fc0 --- /dev/null +++ b/dpmert/dpmert.pl @@ -0,0 +1,700 @@ 
+#!/usr/bin/env perl +use strict; +my @ORIG_ARGV=@ARGV; +use Cwd qw(getcwd); +my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../environment"; } + +# Skip local config (used for distributing jobs) if we're running in local-only mode +use LocalConfig; +use Getopt::Long; +use IPC::Open2; +use POSIX ":sys_wait_h"; +my $QSUB_CMD = qsub_args(mert_memory()); + +require "libcall.pl"; + +# Default settings +my $srcFile; +my $refFiles; +my $default_jobs = env_default_jobs(); +my $bin_dir = $SCRIPT_DIR; +die "Bin directory $bin_dir missing/inaccessible" unless -d $bin_dir; +my $FAST_SCORE="$bin_dir/../mteval/fast_score"; +die "Can't execute $FAST_SCORE" unless -x $FAST_SCORE; +my $MAPINPUT = "$bin_dir/mr_dpmert_generate_mapper_input"; +my $MAPPER = "$bin_dir/mr_dpmert_map"; +my $REDUCER = "$bin_dir/mr_dpmert_reduce"; +my $parallelize = "$bin_dir/parallelize.pl"; +my $libcall = "$bin_dir/libcall.pl"; +my $sentserver = "$bin_dir/sentserver"; +my $sentclient = "$bin_dir/sentclient"; +my $LocalConfig = "$SCRIPT_DIR/../environment/LocalConfig.pm"; + +my $SCORER = $FAST_SCORE; +die "Can't find $MAPPER" unless -x $MAPPER; +my $cdec = "$bin_dir/../decoder/cdec"; +die "Can't find decoder in $cdec" unless -x $cdec; +die "Can't find $parallelize" unless -x $parallelize; +die "Can't find $libcall" unless -e $libcall; +my $decoder = $cdec; +my $lines_per_mapper = 400; +my $rand_directions = 15; +my $iteration = 1; +my $best_weights; +my $max_iterations = 15; +my $optimization_iters = 6; +my $jobs = $default_jobs; # number of decode nodes +my $pmem = "9g"; +my $disable_clean = 0; +my %seen_weights; +my $normalize; +my $help = 0; +my $epsilon = 0.0001; +my $interval = 5; +my $dryrun = 0; +my $last_score = -10000000; +my $metric = "ibm_bleu"; +my $dir; +my $iniFile; +my $weights; +my $initialWeights; +my $decoderOpt; +my $noprimary; +my $maxsim=0; +my $oraclen=0; +my $oracleb=20; +my $bleu_weight=1; +my $use_make = 1; # use make to parallelize line search +my $useqsub; +my $pass_suffix = ''; +my $cpbin=1; +# Process command-line options +Getopt::Long::Configure("no_auto_abbrev"); +if (GetOptions( + "decoder=s" => \$decoderOpt, + "jobs=i" => \$jobs, + "dont-clean" => \$disable_clean, + "pass-suffix=s" => \$pass_suffix, + "dry-run" => \$dryrun, + "epsilon=s" => \$epsilon, + "help" => \$help, + "interval" => \$interval, + "qsub" => \$useqsub, + "max-iterations=i" => \$max_iterations, + "normalize=s" => \$normalize, + "pmem=s" => \$pmem, + "cpbin!" => \$cpbin, + "random-directions=i" => \$rand_directions, + "ref-files=s" => \$refFiles, + "metric=s" => \$metric, + "source-file=s" => \$srcFile, + "weights=s" => \$initialWeights, + "workdir=s" => \$dir, + "opt-iterations=i" => \$optimization_iters, +) == 0 || @ARGV!=1 || $help) { + print_help(); + exit; +} + +if ($useqsub) { + $use_make = 0; + die "LocalEnvironment.pm does not have qsub configuration for this host. Cannot run with --qsub!\n" unless has_qsub(); +} + +my @missing_args = (); +if (!defined $srcFile) { push @missing_args, "--source-file"; } +if (!defined $refFiles) { push @missing_args, "--ref-files"; } +if (!defined $initialWeights) { push @missing_args, "--weights"; } +die "Please specify missing arguments: " . join (', ', @missing_args) . 
"\n" if (@missing_args); + +if ($metric =~ /^(combi|ter)$/i) { + $lines_per_mapper = 40; +} elsif ($metric =~ /^meteor$/i) { + $lines_per_mapper = 2000; # start up time is really high +} + +($iniFile) = @ARGV; + + +sub write_config; +sub enseg; +sub print_help; + +my $nodelist; +my $host =check_output("hostname"); chomp $host; +my $bleu; +my $interval_count = 0; +my $logfile; +my $projected_score; + +# used in sorting scores +my $DIR_FLAG = '-r'; +if ($metric =~ /^ter$|^aer$/i) { + $DIR_FLAG = ''; +} + +my $refs_comma_sep = get_comma_sep_refs('r',$refFiles); + +unless ($dir){ + $dir = "dpmert"; +} +unless ($dir =~ /^\//){ # convert relative path to absolute path + my $basedir = check_output("pwd"); + chomp $basedir; + $dir = "$basedir/$dir"; +} + +if ($decoderOpt){ $decoder = $decoderOpt; } + + +# Initializations and helper functions +srand; + +my @childpids = (); +my @cleanupcmds = (); + +sub cleanup { + print STDERR "Cleanup...\n"; + for my $pid (@childpids){ unchecked_call("kill $pid"); } + for my $cmd (@cleanupcmds){ unchecked_call("$cmd"); } + exit 1; +}; +# Always call cleanup, no matter how we exit +*CORE::GLOBAL::exit = + sub{ cleanup(); }; +$SIG{INT} = "cleanup"; +$SIG{TERM} = "cleanup"; +$SIG{HUP} = "cleanup"; + +my $decoderBase = check_output("basename $decoder"); chomp $decoderBase; +my $newIniFile = "$dir/$decoderBase.ini"; +my $inputFileName = "$dir/input"; +my $user = $ENV{"USER"}; + + +# process ini file +-e $iniFile || die "Error: could not open $iniFile for reading\n"; +open(INI, $iniFile); + +use File::Basename qw(basename); +#pass bindir, refs to vars holding bin +sub modbin { + local $_; + my $bindir=shift; + check_call("mkdir -p $bindir"); + -d $bindir || die "couldn't make bindir $bindir"; + for (@_) { + my $src=$$_; + $$_="$bindir/".basename($src); + check_call("cp -p $src $$_"); + } +} +sub dirsize { + opendir ISEMPTY,$_[0]; + return scalar(readdir(ISEMPTY))-1; +} +if ($dryrun){ + write_config(*STDERR); + exit 0; +} else { + if (-e $dir && dirsize($dir)>1 && -e "$dir/hgs" ){ # allow preexisting logfile, binaries, but not dist-dpmert.pl outputs + die "ERROR: working dir $dir already exists\n\n"; + } else { + -e $dir || mkdir $dir; + mkdir "$dir/hgs"; + modbin("$dir/bin",\$LocalConfig,\$cdec,\$SCORER,\$MAPINPUT,\$MAPPER,\$REDUCER,\$parallelize,\$sentserver,\$sentclient,\$libcall) if $cpbin; + mkdir "$dir/scripts"; + my $cmdfile="$dir/rerun-dpmert.sh"; + open CMD,'>',$cmdfile; + print CMD "cd ",&getcwd,"\n"; +# print CMD &escaped_cmdline,"\n"; #buggy - last arg is quoted. 
+ my $cline=&cmdline."\n"; + print CMD $cline; + close CMD; + print STDERR $cline; + chmod(0755,$cmdfile); + unless (-e $initialWeights) { + print STDERR "Please specify an initial weights file with --initial-weights\n"; + print_help(); + exit; + } + check_call("cp $initialWeights $dir/weights.0"); + die "Can't find weights.0" unless (-e "$dir/weights.0"); + } + write_config(*STDERR); +} + + +# Generate initial files and values +check_call("cp $iniFile $newIniFile"); +$iniFile = $newIniFile; + +my $newsrc = "$dir/dev.input"; +enseg($srcFile, $newsrc); +$srcFile = $newsrc; +my $devSize = 0; +open F, "<$srcFile" or die "Can't read $srcFile: $!"; +while() { $devSize++; } +close F; + +unless($best_weights){ $best_weights = $weights; } +unless($projected_score){ $projected_score = 0.0; } +$seen_weights{$weights} = 1; + +my $random_seed = int(time / 1000); +my $lastWeightsFile; +my $lastPScore = 0; +# main optimization loop +while (1){ + print STDERR "\n\nITERATION $iteration\n==========\n"; + + if ($iteration > $max_iterations){ + print STDERR "\nREACHED STOPPING CRITERION: Maximum iterations\n"; + last; + } + # iteration-specific files + my $runFile="$dir/run.raw.$iteration"; + my $onebestFile="$dir/1best.$iteration"; + my $logdir="$dir/logs.$iteration"; + my $decoderLog="$logdir/decoder.sentserver.log.$iteration"; + my $scorerLog="$logdir/scorer.log.$iteration"; + check_call("mkdir -p $logdir"); + + + #decode + print STDERR "RUNNING DECODER AT "; + print STDERR unchecked_output("date"); + my $im1 = $iteration - 1; + my $weightsFile="$dir/weights.$im1"; + my $decoder_cmd = "$decoder -c $iniFile --weights$pass_suffix $weightsFile -O $dir/hgs"; + my $pcmd; + if ($use_make) { + $pcmd = "cat $srcFile | $parallelize --use-fork -p $pmem -e $logdir -j $jobs --"; + } else { + $pcmd = "cat $srcFile | $parallelize -p $pmem -e $logdir -j $jobs --"; + } + my $cmd = "$pcmd $decoder_cmd 2> $decoderLog 1> $runFile"; + print STDERR "COMMAND:\n$cmd\n"; + check_bash_call($cmd); + my $num_hgs; + my $num_topbest; + my $retries = 0; + while($retries < 5) { + $num_hgs = check_output("ls $dir/hgs/*.gz | wc -l"); + $num_topbest = check_output("wc -l < $runFile"); + print STDERR "NUMBER OF HGs: $num_hgs\n"; + print STDERR "NUMBER OF TOP-BEST HYPs: $num_topbest\n"; + if($devSize == $num_hgs && $devSize == $num_topbest) { + last; + } else { + print STDERR "Incorrect number of hypergraphs or topbest. Waiting for distributed filesystem and retrying...\n"; + sleep(3); + } + $retries++; + } + die "Dev set contains $devSize sentences, but we don't have topbest and hypergraphs for all these! Decoder failure? 
Check $decoderLog\n" if ($devSize != $num_hgs || $devSize != $num_topbest); + my $dec_score = check_output("cat $runFile | $SCORER $refs_comma_sep -m $metric"); + chomp $dec_score; + print STDERR "DECODER SCORE: $dec_score\n"; + + # save space + check_call("gzip -f $runFile"); + check_call("gzip -f $decoderLog"); + + # run optimizer + print STDERR "RUNNING OPTIMIZER AT "; + print STDERR unchecked_output("date"); + my $mergeLog="$logdir/prune-merge.log.$iteration"; + + my $score = 0; + my $icc = 0; + my $inweights="$dir/weights.$im1"; + for (my $opt_iter=1; $opt_iter<$optimization_iters; $opt_iter++) { + print STDERR "\nGENERATE OPTIMIZATION STRATEGY (OPT-ITERATION $opt_iter/$optimization_iters)\n"; + print STDERR unchecked_output("date"); + $icc++; + $cmd="$MAPINPUT -w $inweights -r $dir/hgs -s $devSize -d $rand_directions > $dir/agenda.$im1-$opt_iter"; + print STDERR "COMMAND:\n$cmd\n"; + check_call($cmd); + check_call("mkdir -p $dir/splag.$im1"); + $cmd="split -a 3 -l $lines_per_mapper $dir/agenda.$im1-$opt_iter $dir/splag.$im1/mapinput."; + print STDERR "COMMAND:\n$cmd\n"; + check_call($cmd); + opendir(DIR, "$dir/splag.$im1") or die "Can't open directory: $!"; + my @shards = grep { /^mapinput\./ } readdir(DIR); + closedir DIR; + die "No shards!" unless scalar @shards > 0; + my $joblist = ""; + my $nmappers = 0; + my @mapoutputs = (); + @cleanupcmds = (); + my %o2i = (); + my $first_shard = 1; + my $mkfile; # only used with makefiles + my $mkfilename; + if ($use_make) { + $mkfilename = "$dir/splag.$im1/domap.mk"; + open $mkfile, ">$mkfilename" or die "Couldn't write $mkfilename: $!"; + print $mkfile "all: $dir/splag.$im1/map.done\n\n"; + } + my @mkouts = (); # only used with makefiles + for my $shard (@shards) { + my $mapoutput = $shard; + my $client_name = $shard; + $client_name =~ s/mapinput.//; + $client_name = "dpmert.$client_name"; + $mapoutput =~ s/mapinput/mapoutput/; + push @mapoutputs, "$dir/splag.$im1/$mapoutput"; + $o2i{"$dir/splag.$im1/$mapoutput"} = "$dir/splag.$im1/$shard"; + my $script = "$MAPPER -s $srcFile -m $metric $refs_comma_sep < $dir/splag.$im1/$shard | sort -t \$'\\t' -k 1 > $dir/splag.$im1/$mapoutput"; + if ($use_make) { + my $script_file = "$dir/scripts/map.$shard"; + open F, ">$script_file" or die "Can't write $script_file: $!"; + print F "#!/bin/bash\n"; + print F "$script\n"; + close F; + my $output = "$dir/splag.$im1/$mapoutput"; + push @mkouts, $output; + chmod(0755, $script_file) or die "Can't chmod $script_file: $!"; + if ($first_shard) { print STDERR "$script\n"; $first_shard=0; } + print $mkfile "$output: $dir/splag.$im1/$shard\n\t$script_file\n\n"; + } else { + my $script_file = "$dir/scripts/map.$shard"; + open F, ">$script_file" or die "Can't write $script_file: $!"; + print F "$script\n"; + close F; + if ($first_shard) { print STDERR "$script\n"; $first_shard=0; } + + $nmappers++; + my $qcmd = "$QSUB_CMD -N $client_name -o /dev/null -e $logdir/$client_name.ER $script_file"; + my $jobid = check_output("$qcmd"); + chomp $jobid; + $jobid =~ s/^(\d+)(.*?)$/\1/g; + $jobid =~ s/^Your job (\d+) .*$/\1/; + push(@cleanupcmds, "qdel $jobid 2> /dev/null"); + print STDERR " $jobid"; + if ($joblist == "") { $joblist = $jobid; } + else {$joblist = $joblist . "\|" . 
$jobid; } + } + } + if ($use_make) { + print $mkfile "$dir/splag.$im1/map.done: @mkouts\n\ttouch $dir/splag.$im1/map.done\n\n"; + close $mkfile; + my $mcmd = "make -j $jobs -f $mkfilename"; + print STDERR "\nExecuting: $mcmd\n"; + check_call($mcmd); + } else { + print STDERR "\nLaunched $nmappers mappers.\n"; + sleep 8; + print STDERR "Waiting for mappers to complete...\n"; + while ($nmappers > 0) { + sleep 5; + my @livejobs = grep(/$joblist/, split(/\n/, unchecked_output("qstat | grep -v ' C '"))); + $nmappers = scalar @livejobs; + } + print STDERR "All mappers complete.\n"; + } + my $tol = 0; + my $til = 0; + for my $mo (@mapoutputs) { + my $olines = get_lines($mo); + my $ilines = get_lines($o2i{$mo}); + $tol += $olines; + $til += $ilines; + die "$mo: output lines ($olines) doesn't match input lines ($ilines)" unless $olines==$ilines; + } + print STDERR "Results for $tol/$til lines\n"; + print STDERR "\nSORTING AND RUNNING VEST REDUCER\n"; + print STDERR unchecked_output("date"); + $cmd="sort -t \$'\\t' -k 1 @mapoutputs | $REDUCER -m $metric > $dir/redoutput.$im1"; + print STDERR "COMMAND:\n$cmd\n"; + check_bash_call($cmd); + $cmd="sort -nk3 $DIR_FLAG '-t|' $dir/redoutput.$im1 | head -1"; + # sort returns failure even when it doesn't fail for some reason + my $best=unchecked_output("$cmd"); chomp $best; + print STDERR "$best\n"; + my ($oa, $x, $xscore) = split /\|/, $best; + $score = $xscore; + print STDERR "PROJECTED SCORE: $score\n"; + if (abs($x) < $epsilon) { + print STDERR "\nOPTIMIZER: no score improvement: abs($x) < $epsilon\n"; + last; + } + my $psd = $score - $last_score; + $last_score = $score; + if (abs($psd) < $epsilon) { + print STDERR "\nOPTIMIZER: no score improvement: abs($psd) < $epsilon\n"; + last; + } + my ($origin, $axis) = split /\s+/, $oa; + + my %ori = convert($origin); + my %axi = convert($axis); + + my $finalFile="$dir/weights.$im1-$opt_iter"; + open W, ">$finalFile" or die "Can't write: $finalFile: $!"; + my $norm = 0; + for my $k (sort keys %ori) { + my $dd = $ori{$k} + $axi{$k} * $x; + $norm += $dd * $dd; + } + $norm = sqrt($norm); + $norm = 1; + for my $k (sort keys %ori) { + my $v = ($ori{$k} + $axi{$k} * $x) / $norm; + print W "$k $v\n"; + } + check_call("rm $dir/splag.$im1/*"); + $inweights = $finalFile; + } + $lastWeightsFile = "$dir/weights.$iteration"; + check_call("cp $inweights $lastWeightsFile"); + if ($icc < 2) { + print STDERR "\nREACHED STOPPING CRITERION: score change too little\n"; + last; + } + $lastPScore = $score; + $iteration++; + print STDERR "\n==========\n"; +} + +print STDERR "\nFINAL WEIGHTS: $lastWeightsFile\n(Use -w with the decoder)\n\n"; + +print STDOUT "$lastWeightsFile\n"; + +exit 0; + +sub normalize_weights { + my ($rfn, $rpts, $feat) = @_; + my @feat_names = @$rfn; + my @pts = @$rpts; + my $z = 1.0; + for (my $i=0; $i < scalar @feat_names; $i++) { + if ($feat_names[$i] eq $feat) { + $z = $pts[$i]; + last; + } + } + for (my $i=0; $i < scalar @feat_names; $i++) { + $pts[$i] /= $z; + } + print STDERR " NORM WEIGHTS: @pts\n"; + return @pts; +} + +sub get_lines { + my $fn = shift @_; + open FL, "<$fn" or die "Couldn't read $fn: $!"; + my $lc = 0; + while() { $lc++; } + return $lc; +} + +sub get_comma_sep_refs { + my ($r,$p) = @_; + my $o = check_output("echo $p"); + chomp $o; + my @files = split /\s+/, $o; + return "-$r " . 
join(" -$r ", @files); +} + +sub read_weights_file { + my ($file) = @_; + open F, "<$file" or die "Couldn't read $file: $!"; + my @r = (); + my $pm = -1; + while() { + next if /^#/; + next if /^\s*$/; + chomp; + if (/^(.+)\s+(.+)$/) { + my $m = $1; + my $w = $2; + die "Weights out of order: $m <= $pm" unless $m > $pm; + push @r, $w; + } else { + warn "Unexpected feature name in weight file: $_"; + } + } + close F; + return join ' ', @r; +} + +# subs +sub write_config { + my $fh = shift; + my $cleanup = "yes"; + if ($disable_clean) {$cleanup = "no";} + + print $fh "\n"; + print $fh "DECODER: $decoder\n"; + print $fh "INI FILE: $iniFile\n"; + print $fh "WORKING DIR: $dir\n"; + print $fh "SOURCE (DEV): $srcFile\n"; + print $fh "REFS (DEV): $refFiles\n"; + print $fh "EVAL METRIC: $metric\n"; + print $fh "START ITERATION: $iteration\n"; + print $fh "MAX ITERATIONS: $max_iterations\n"; + print $fh "PARALLEL JOBS: $jobs\n"; + print $fh "HEAD NODE: $host\n"; + print $fh "PMEM (DECODING): $pmem\n"; + print $fh "CLEANUP: $cleanup\n"; + print $fh "INITIAL WEIGHTS: $initialWeights\n"; +} + +sub update_weights_file { + my ($neww, $rfn, $rpts) = @_; + my @feats = @$rfn; + my @pts = @$rpts; + my $num_feats = scalar @feats; + my $num_pts = scalar @pts; + die "$num_feats (num_feats) != $num_pts (num_pts)" unless $num_feats == $num_pts; + open G, ">$neww" or die; + for (my $i = 0; $i < $num_feats; $i++) { + my $f = $feats[$i]; + my $lambda = $pts[$i]; + print G "$f $lambda\n"; + } + close G; +} + +sub enseg { + my $src = shift; + my $newsrc = shift; + open(SRC, $src); + open(NEWSRC, ">$newsrc"); + my $i=0; + while (my $line=){ + chomp $line; + if ($line =~ /^\s* tags, you must include a zero-based id attribute"; + } + } else { + print NEWSRC "$line\n"; + } + $i++; + } + close SRC; + close NEWSRC; +} + +sub print_help { + + my $executable = check_output("basename $0"); chomp $executable; + print << "Help"; + +Usage: $executable [options] + + $executable [options] + Runs a complete MERT optimization using the decoder configuration + in . Required options are --weights, --source-file, and + --ref-files. + +Options: + + --help + Print this message and exit. + + --max-iterations + Maximum number of iterations to run. If not specified, defaults + to 10. + + --pass-suffix + If the decoder is doing multi-pass decoding, the pass suffix "2", + "3", etc., is used to control what iteration of weights is set. + + --ref-files + Dev set ref files. This option takes only a single string argument. + To use multiple files (including file globbing), this argument should + be quoted. + + --metric + Metric to optimize. + Example values: IBM_BLEU, NIST_BLEU, Koehn_BLEU, TER, Combi + + --normalize + After each iteration, rescale all feature weights such that feature- + name has a weight of 1.0. + + --rand-directions + MERT will attempt to optimize along all of the principle directions, + set this parameter to explore other directions. Defaults to 5. + + --source-file + Dev set source file. + + --weights + A file specifying initial feature weights. The format is + FeatureName_1 value1 + FeatureName_2 value2 + **All and only the weights listed in will be optimized!** + + --workdir + Directory for intermediate and output files. If not specified, the + name is derived from the ini filename. Assuming that the ini + filename begins with the decoder name and ends with ini, the default + name of the working directory is inferred from the middle part of + the filename. E.g. 
an ini file named decoder.foo.ini would have + a default working directory name foo. + +Job control options: + + --jobs + Number of decoder processes to run in parallel. [default=$default_jobs] + + --qsub + Use qsub to run jobs in parallel (qsub must be configured in + environment/LocalEnvironment.pm) + + --pmem + Amount of physical memory requested for parallel decoding jobs + (used with qsub requests only) + +Help +} + +sub convert { + my ($str) = @_; + my @ps = split /;/, $str; + my %dict = (); + for my $p (@ps) { + my ($k, $v) = split /=/, $p; + $dict{$k} = $v; + } + return %dict; +} + + + +sub cmdline { + return join ' ',($0,@ORIG_ARGV); +} + +#buggy: last arg gets quoted sometimes? +my $is_shell_special=qr{[ \t\n\\><|&;"'`~*?{}$!()]}; +my $shell_escape_in_quote=qr{[\\"\$`!]}; + +sub escape_shell { + my ($arg)=@_; + return undef unless defined $arg; + if ($arg =~ /$is_shell_special/) { + $arg =~ s/($shell_escape_in_quote)/\\$1/g; + return "\"$arg\""; + } + return $arg; +} + +sub escaped_shell_args { + return map {local $_=$_;chomp;escape_shell($_)} @_; +} + +sub escaped_shell_args_str { + return join ' ',&escaped_shell_args(@_); +} + +sub escaped_cmdline { + return "$0 ".&escaped_shell_args_str(@ORIG_ARGV); +} diff --git a/dpmert/error_surface.cc b/dpmert/error_surface.cc new file mode 100644 index 00000000..515b67f8 --- /dev/null +++ b/dpmert/error_surface.cc @@ -0,0 +1,42 @@ +#include "error_surface.h" + +#include +#include + +using namespace std; + +ErrorSurface::~ErrorSurface() {} + +void ErrorSurface::Serialize(std::string* out) const { + const int segments = this->size(); + ostringstream os(ios::binary); + os.write((const char*)&segments,sizeof(segments)); + for (int i = 0; i < segments; ++i) { + const ErrorSegment& cur = (*this)[i]; + string senc; + cur.delta.Encode(&senc); + assert(senc.size() < 1024); + unsigned char len = senc.size(); + os.write((const char*)&cur.x, sizeof(cur.x)); + os.write((const char*)&len, sizeof(len)); + os.write((const char*)&senc[0], len); + } + *out = os.str(); +} + +void ErrorSurface::Deserialize(const std::string& in) { + istringstream is(in, ios::binary); + int segments; + is.read((char*)&segments, sizeof(segments)); + this->resize(segments); + for (int i = 0; i < segments; ++i) { + ErrorSegment& cur = (*this)[i]; + unsigned char len; + is.read((char*)&cur.x, sizeof(cur.x)); + is.read((char*)&len, sizeof(len)); + string senc(len, '\0'); assert(senc.size() == len); + is.read((char*)&senc[0], len); + cur.delta = SufficientStats(senc); + } +} + diff --git a/dpmert/error_surface.h b/dpmert/error_surface.h new file mode 100644 index 00000000..bb65847b --- /dev/null +++ b/dpmert/error_surface.h @@ -0,0 +1,24 @@ +#ifndef _ERROR_SURFACE_H_ +#define _ERROR_SURFACE_H_ + +#include +#include + +#include "ns.h" + +class Score; + +struct ErrorSegment { + double x; + SufficientStats delta; + ErrorSegment() : x(0), delta() {} +}; + +class ErrorSurface : public std::vector { + public: + ~ErrorSurface(); + void Serialize(std::string* out) const; + void Deserialize(const std::string& in); +}; + +#endif diff --git a/dpmert/libcall.pl b/dpmert/libcall.pl new file mode 100644 index 00000000..c7d0f128 --- /dev/null +++ b/dpmert/libcall.pl @@ -0,0 +1,71 @@ +use IPC::Open3; +use Symbol qw(gensym); + +$DUMMY_STDERR = gensym(); +$DUMMY_STDIN = gensym(); + +# Run the command and ignore failures +sub unchecked_call { + system("@_") +} + +# Run the command and return its output, if any ignoring failures +sub unchecked_output { + return `@_` +} + +# WARNING: Do not use this 
for commands that will return large amounts +# of stdout or stderr -- they might block indefinitely +sub check_output { + print STDERR "Executing and gathering output: @_\n"; + + my $pid = open3($DUMMY_STDIN, \*PH, $DUMMY_STDERR, @_); + my $proc_output = ""; + while( ) { + $proc_output .= $_; + } + waitpid($pid, 0); + # TODO: Grab signal that the process died from + my $child_exit_status = $? >> 8; + if($child_exit_status == 0) { + return $proc_output; + } else { + print STDERR "ERROR: Execution of @_ failed.\n"; + exit(1); + } +} + +# Based on Moses' safesystem sub +sub check_call { + print STDERR "Executing: @_\n"; + system(@_); + my $exitcode = $? >> 8; + if($exitcode == 0) { + return 0; + } elsif ($? == -1) { + print STDERR "ERROR: Failed to execute: @_\n $!\n"; + exit(1); + + } elsif ($? & 127) { + printf STDERR "ERROR: Execution of: @_\n died with signal %d, %s coredump\n", + ($? & 127), ($? & 128) ? 'with' : 'without'; + exit(1); + + } else { + print STDERR "Failed with exit code: $exitcode\n" if $exitcode; + exit($exitcode); + } +} + +sub check_bash_call { + my @args = ( "bash", "-auxeo", "pipefail", "-c", "@_"); + check_call(@args); +} + +sub check_bash_output { + my @args = ( "bash", "-auxeo", "pipefail", "-c", "@_"); + return check_output(@args); +} + +# perl module weirdness... +return 1; diff --git a/dpmert/line_mediator.pl b/dpmert/line_mediator.pl new file mode 100755 index 00000000..bc2bb24c --- /dev/null +++ b/dpmert/line_mediator.pl @@ -0,0 +1,116 @@ +#!/usr/bin/perl -w +#hooks up two processes, 2nd of which has one line of output per line of input, expected by the first, which starts off the communication + +# if you don't know how to fork/exec in a C program, this could be helpful under limited cirmustances (would be ok to liaise with sentserver) + +#WARNING: because it waits for the result from command 2 after sending every line, and especially if command 1 does the same, using sentserver as command 2 won't actually buy you any real parallelism. + +use strict; +use IPC::Open2; +use POSIX qw(pipe dup2 STDIN_FILENO STDOUT_FILENO); + +my $quiet=!$ENV{DEBUG}; +$quiet=1 if $ENV{QUIET}; +sub info { + local $,=' '; + print STDERR @_ unless $quiet; +} + +my $mode='CROSS'; +my $ser='DIRECT'; +$mode='PIPE' if $ENV{PIPE}; +$mode='SNAKE' if $ENV{SNAKE}; +$mode='CROSS' if $ENV{CROSS}; +$ser='SERIAL' if $ENV{SERIAL}; +$ser='DIRECT' if $ENV{DIRECT}; +$ser='SERIAL' if $mode eq 'SNAKE'; +info("mode: $mode\n"); +info("connection: $ser\n"); + + +my @c1; +if (scalar @ARGV) { + do { + push @c1,shift + } while scalar @ARGV && $c1[$#c1] ne '--'; +} +pop @c1; +my @c2=@ARGV; +@ARGV=(); +(scalar @c1 && scalar @c2) || die qq{ +usage: $0 cmd1 args -- cmd2 args +all options are environment variables. +DEBUG=1 env var enables debugging output. +CROSS=1 hooks up two processes, 2nd of which has one line of output per line of input, expected by the first, which starts off the communication. crosses stdin/stderr of cmd1 and cmd2 line by line (both must flush on newline and output. cmd1 initiates the conversation (sends the first line). default: attempts to cross stdin/stdout of c1 and c2 directly (via two unidirectional posix pipes created before fork). +SERIAL=1: (no parallelism possible) but lines exchanged are logged if DEBUG. +if SNAKE then stdin -> c1 -> c2 -> c1 -> stdout. +if PIPE then stdin -> c1 -> c2 -> stdout (same as shell c1|c2, but with SERIAL you can see the intermediate in real time; you could do similar with c1 | tee /dev/fd/2 |c2. +DIRECT=1 (default) will override SERIAL=1. 
+CROSS=1 (default) will override SNAKE or PIPE. +}; + +info("1 cmd:",@c1,"\n"); +info("2 cmd:",@c2,"\n"); + +sub lineto { + select $_[0]; + $|=1; + shift; + print @_; +} + +if ($ser eq 'SERIAL') { + my ($R1,$W1,$R2,$W2); + my $c1p=open2($R1,$W1,@c1); # Open2 R W backward from Open3. + my $c2p=open2($R2,$W2,@c2); + if ($mode eq 'CROSS') { + while(<$R1>) { + info("1:",$_); + lineto($W2,$_); + last unless defined ($_=<$R2>); + info("1|2:",$_); + lineto($W1,$_); + } + } else { + my $snake=$mode eq 'SNAKE'; + while() { + info("IN:",$_); + lineto($W1,$_); + last unless defined ($_=<$R1>); + info("IN|1:",$_); + lineto($W2,$_); + last unless defined ($_=<$R2>); + info("IN|1|2:",$_); + if ($snake) { + lineto($W1,$_); + last unless defined ($_=<$R1>); + info("IN|1|2|1:",$_); + } + lineto(*STDOUT,$_); + } + } +} else { + info("DIRECT mode\n"); + my @rw1=POSIX::pipe(); + my @rw2=POSIX::pipe(); + my $pid=undef; + $SIG{CHLD} = sub { wait }; + while (not defined ($pid=fork())) { + sleep 1; + } + my $pipe = $mode eq 'PIPE'; + unless ($pipe) { + POSIX::close(STDOUT_FILENO); + POSIX::close(STDIN_FILENO); + } + if ($pid) { + POSIX::dup2($rw1[1],STDOUT_FILENO); + POSIX::dup2($rw2[0],STDIN_FILENO) unless $pipe; + exec @c1; + } else { + POSIX::dup2($rw2[1],STDOUT_FILENO) unless $pipe; + POSIX::dup2($rw1[0],STDIN_FILENO); + exec @c2; + } + while (wait()!=-1) {} +} diff --git a/dpmert/line_optimizer.cc b/dpmert/line_optimizer.cc new file mode 100644 index 00000000..49443fbe --- /dev/null +++ b/dpmert/line_optimizer.cc @@ -0,0 +1,111 @@ +#include "line_optimizer.h" + +#include +#include + +#include "sparse_vector.h" +#include "ns.h" + +using namespace std; + +typedef ErrorSurface::const_iterator ErrorIter; + +// sort by increasing x-ints +struct IntervalComp { + bool operator() (const ErrorIter& a, const ErrorIter& b) const { + return a->x < b->x; + } +}; + +double LineOptimizer::LineOptimize( + const EvaluationMetric* metric, + const vector& surfaces, + const LineOptimizer::ScoreType type, + float* best_score, + const double epsilon) { + // cerr << "MIN=" << MINIMIZE_SCORE << " MAX=" << MAXIMIZE_SCORE << " MINE=" << type << endl; + vector all_ints; + for (vector::const_iterator i = surfaces.begin(); + i != surfaces.end(); ++i) { + const ErrorSurface& surface = *i; + for (ErrorIter j = surface.begin(); j != surface.end(); ++j) + all_ints.push_back(j); + } + sort(all_ints.begin(), all_ints.end(), IntervalComp()); + double last_boundary = all_ints.front()->x; + SufficientStats acc; + float& cur_best_score = *best_score; + cur_best_score = (type == MAXIMIZE_SCORE ? 
+ -numeric_limits::max() : numeric_limits::max()); + bool left_edge = true; + double pos = numeric_limits::quiet_NaN(); + for (vector::iterator i = all_ints.begin(); + i != all_ints.end(); ++i) { + const ErrorSegment& seg = **i; + if (seg.x - last_boundary > epsilon) { + float sco = metric->ComputeScore(acc); + if ((type == MAXIMIZE_SCORE && sco > cur_best_score) || + (type == MINIMIZE_SCORE && sco < cur_best_score) ) { + cur_best_score = sco; + if (left_edge) { + pos = seg.x - 0.1; + left_edge = false; + } else { + pos = last_boundary + (seg.x - last_boundary) / 2; + } + //cerr << "NEW BEST: " << pos << " (score=" << cur_best_score << ")\n"; + } + // string xx = metric->DetailedScore(acc); cerr << "---- " << xx; + // cerr << "---- s=" << sco << "\n"; + last_boundary = seg.x; + } + // cerr << "x-boundary=" << seg.x << "\n"; + //string x2; acc.Encode(&x2); cerr << " ACC: " << x2 << endl; + //string x1; seg.delta.Encode(&x1); cerr << " DELTA: " << x1 << endl; + acc += seg.delta; + } + float sco = metric->ComputeScore(acc); + if ((type == MAXIMIZE_SCORE && sco > cur_best_score) || + (type == MINIMIZE_SCORE && sco < cur_best_score) ) { + cur_best_score = sco; + if (left_edge) { + pos = 0; + } else { + pos = last_boundary + 1000.0; + } + } + return pos; +} + +void LineOptimizer::RandomUnitVector(const vector& features_to_optimize, + SparseVector* axis, + RandomNumberGenerator* rng) { + axis->clear(); + for (int i = 0; i < features_to_optimize.size(); ++i) + axis->set_value(features_to_optimize[i], rng->NextNormal(0.0,1.0)); + (*axis) /= axis->l2norm(); +} + +void LineOptimizer::CreateOptimizationDirections( + const vector& features_to_optimize, + int additional_random_directions, + RandomNumberGenerator* rng, + vector >* dirs + , bool include_orthogonal + ) { + dirs->clear(); + typedef SparseVector Dir; + vector &out=*dirs; + int i=0; + if (include_orthogonal) + for (;i + +#include "sparse_vector.h" +#include "error_surface.h" +#include "sampler.h" + +class EvaluationMetric; +class Weights; + +struct LineOptimizer { + + // use MINIMIZE_SCORE for things like TER, WER + // MAXIMIZE_SCORE for things like BLEU + enum ScoreType { MAXIMIZE_SCORE, MINIMIZE_SCORE }; + + // merge all the error surfaces together into a global + // error surface and find (the middle of) the best segment + static double LineOptimize( + const EvaluationMetric* metric, + const std::vector& envs, + const LineOptimizer::ScoreType type, + float* best_score, + const double epsilon = 1.0/65536.0); + + // return a random vector of length 1 where all dimensions + // not listed in dimensions will be 0. + static void RandomUnitVector(const std::vector& dimensions, + SparseVector* axis, + RandomNumberGenerator* rng); + + // generate a list of directions to optimize; the list will + // contain the orthogonal vectors corresponding to the dimensions in + // primary and then additional_random_directions directions in those + // dimensions as well. All vectors will be length 1. 
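// A minimal usage sketch (cf. lo_test.cc in this patch; the boost::mt19937
// template argument for RandomNumberGenerator is assumed from sampler.h usage,
// not taken verbatim from this header):
//
//   std::vector<int> prim;
//   prim.push_back(FD::Convert("LanguageModel"));
//   prim.push_back(FD::Convert("WordPenalty"));
//   RandomNumberGenerator<boost::mt19937> rng;
//   std::vector<SparseVector<double> > dirs;
//   LineOptimizer::CreateOptimizationDirections(prim, 10, &rng, &dirs);
//   // dirs: one unit coordinate axis per entry of prim, then 10 random
//   // unit-length directions over the same dimensions.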
+ static void CreateOptimizationDirections( + const std::vector& primary, + int additional_random_directions, + RandomNumberGenerator* rng, + std::vector >* dirs + , bool include_primary=true + ); + +}; + +#endif diff --git a/dpmert/lo_test.cc b/dpmert/lo_test.cc new file mode 100644 index 00000000..d9b909b8 --- /dev/null +++ b/dpmert/lo_test.cc @@ -0,0 +1,236 @@ +#include +#include +#include + +#include +#include + +#include "ns.h" +#include "ns_docscorer.h" +#include "ces.h" +#include "fdict.h" +#include "hg.h" +#include "kbest.h" +#include "hg_io.h" +#include "filelib.h" +#include "inside_outside.h" +#include "viterbi.h" +#include "mert_geometry.h" +#include "line_optimizer.h" + +using namespace std; +using boost::shared_ptr; + +class OptTest : public testing::Test { + protected: + virtual void SetUp() { } + virtual void TearDown() { } +}; + +const char* ref11 = "australia reopens embassy in manila"; +const char* ref12 = "( afp , manila , january 2 ) australia reopened its embassy in the philippines today , which was shut down about seven weeks ago due to what was described as a specific threat of a terrorist attack ."; +const char* ref21 = "australia reopened manila embassy"; +const char* ref22 = "( agence france-presse , manila , 2nd ) - australia reopened its embassy in the philippines today . the embassy was closed seven weeks ago after what was described as a specific threat of a terrorist attack ."; +const char* ref31 = "australia to reopen embassy in manila"; +const char* ref32 = "( afp report from manila , january 2 ) australia reopened its embassy in the philippines today . seven weeks ago , the embassy was shut down due to so - called confirmed terrorist attack threats ."; +const char* ref41 = "australia to re - open its embassy to manila"; +const char* ref42 = "( afp , manila , thursday ) australia reopens its embassy to manila , which was closed for the so - called \" clear \" threat of terrorist attack 7 weeks ago ."; + +TEST_F(OptTest, TestCheckNaN) { + double x = 0; + double y = 0; + double z = x / y; + EXPECT_EQ(true, isnan(z)); +} + +TEST_F(OptTest,TestConvexHull) { + shared_ptr a1(new MERTPoint(-1, 0)); + shared_ptr b1(new MERTPoint(1, 0)); + shared_ptr a2(new MERTPoint(-1, 1)); + shared_ptr b2(new MERTPoint(1, -1)); + vector > sa; sa.push_back(a1); sa.push_back(b1); + vector > sb; sb.push_back(a2); sb.push_back(b2); + ConvexHull a(sa); + cerr << a << endl; + ConvexHull b(sb); + ConvexHull c = a; + c *= b; + cerr << a << " (*) " << b << " = " << c << endl; + EXPECT_EQ(3, c.size()); +} + +TEST_F(OptTest,TestConvexHullInside) { + const string json = "{\"rules\":[1,\"[X] ||| a\",2,\"[X] ||| A [1]\",3,\"[X] ||| c\",4,\"[X] ||| C [1]\",5,\"[X] ||| [1] B [2]\",6,\"[X] ||| [1] b [2]\",7,\"[X] ||| X [1]\",8,\"[X] ||| Z 
[1]\"],\"features\":[\"f1\",\"f2\",\"Feature_1\",\"Feature_0\",\"Model_0\",\"Model_1\",\"Model_2\",\"Model_3\",\"Model_4\",\"Model_5\",\"Model_6\",\"Model_7\"],\"edges\":[{\"tail\":[],\"feats\":[],\"rule\":1}],\"node\":{\"in_edges\":[0]},\"edges\":[{\"tail\":[0],\"feats\":[0,-0.8,1,-0.1],\"rule\":2}],\"node\":{\"in_edges\":[1]},\"edges\":[{\"tail\":[],\"feats\":[1,-1],\"rule\":3}],\"node\":{\"in_edges\":[2]},\"edges\":[{\"tail\":[2],\"feats\":[0,-0.2,1,-0.1],\"rule\":4}],\"node\":{\"in_edges\":[3]},\"edges\":[{\"tail\":[1,3],\"feats\":[0,-1.2,1,-0.2],\"rule\":5},{\"tail\":[1,3],\"feats\":[0,-0.5,1,-1.3],\"rule\":6}],\"node\":{\"in_edges\":[4,5]},\"edges\":[{\"tail\":[4],\"feats\":[0,-0.5,1,-0.8],\"rule\":7},{\"tail\":[4],\"feats\":[0,-0.7,1,-0.9],\"rule\":8}],\"node\":{\"in_edges\":[6,7]}}"; + Hypergraph hg; + istringstream instr(json); + HypergraphIO::ReadFromJSON(&instr, &hg); + SparseVector wts; + wts.set_value(FD::Convert("f1"), 0.4); + wts.set_value(FD::Convert("f2"), 1.0); + hg.Reweight(wts); + vector, prob_t> > list; + std::vector > features; + KBest::KBestDerivations, ESentenceTraversal> kbest(hg, 10); + for (int i = 0; i < 10; ++i) { + const KBest::KBestDerivations, ESentenceTraversal>::Derivation* d = + kbest.LazyKthBest(hg.nodes_.size() - 1, i); + if (!d) break; + cerr << log(d->score) << " ||| " << TD::GetString(d->yield) << " ||| " << d->feature_values << endl; + } + SparseVector dir; dir.set_value(FD::Convert("f1"), 1.0); + ConvexHullWeightFunction wf(wts, dir); + ConvexHull env = Inside(hg, NULL, wf); + cerr << env << endl; + const vector >& segs = env.GetSortedSegs(); + dir *= segs[1]->x; + wts += dir; + hg.Reweight(wts); + KBest::KBestDerivations, ESentenceTraversal> kbest2(hg, 10); + for (int i = 0; i < 10; ++i) { + const KBest::KBestDerivations, ESentenceTraversal>::Derivation* d = + kbest2.LazyKthBest(hg.nodes_.size() - 1, i); + if (!d) break; + cerr << log(d->score) << " ||| " << TD::GetString(d->yield) << " ||| " << d->feature_values << endl; + } + for (int i = 0; i < segs.size(); ++i) { + cerr << "seg=" << i << endl; + vector trans; + segs[i]->ConstructTranslation(&trans); + cerr << TD::GetString(trans) << endl; + } +} + +TEST_F(OptTest, TestS1) { + int fPhraseModel_0 = FD::Convert("PhraseModel_0"); + int fPhraseModel_1 = FD::Convert("PhraseModel_1"); + int fPhraseModel_2 = FD::Convert("PhraseModel_2"); + int fLanguageModel = FD::Convert("LanguageModel"); + int fWordPenalty = FD::Convert("WordPenalty"); + int fPassThrough = FD::Convert("PassThrough"); + SparseVector wts; + wts.set_value(fWordPenalty, 4.25); + wts.set_value(fLanguageModel, -1.1165); + wts.set_value(fPhraseModel_0, -0.96); + wts.set_value(fPhraseModel_1, -0.65); + wts.set_value(fPhraseModel_2, -0.77); + wts.set_value(fPassThrough, -10.0); + + vector to_optimize; + to_optimize.push_back(fWordPenalty); + to_optimize.push_back(fLanguageModel); + to_optimize.push_back(fPhraseModel_0); + to_optimize.push_back(fPhraseModel_1); + to_optimize.push_back(fPhraseModel_2); + + Hypergraph hg; + ReadFile rf("./test_data/0.json.gz"); + HypergraphIO::ReadFromJSON(rf.stream(), &hg); + hg.Reweight(wts); + + Hypergraph hg2; + ReadFile rf2("./test_data/1.json.gz"); + HypergraphIO::ReadFromJSON(rf2.stream(), &hg2); + hg2.Reweight(wts); + + vector > refs1(4); + TD::ConvertSentence(ref11, &refs1[0]); + TD::ConvertSentence(ref21, &refs1[1]); + TD::ConvertSentence(ref31, &refs1[2]); + TD::ConvertSentence(ref41, &refs1[3]); + vector > refs2(4); + TD::ConvertSentence(ref12, &refs2[0]); + TD::ConvertSentence(ref22, &refs2[1]); + 
TD::ConvertSentence(ref32, &refs2[2]); + TD::ConvertSentence(ref42, &refs2[3]); + vector envs(2); + + RandomNumberGenerator rng; + + vector > axes; // directions to search + LineOptimizer::CreateOptimizationDirections( + to_optimize, + 10, + &rng, + &axes); + assert(axes.size() == 10 + to_optimize.size()); + for (int i = 0; i < axes.size(); ++i) + cerr << axes[i] << endl; + const SparseVector& axis = axes[0]; + + cerr << "Computing Viterbi envelope using inside algorithm...\n"; + cerr << "axis: " << axis << endl; + clock_t t_start=clock(); + ConvexHullWeightFunction wf(wts, axis); // wts = starting point, axis = search direction + envs[0] = Inside(hg, NULL, wf); + envs[1] = Inside(hg2, NULL, wf); + + vector es(2); + EvaluationMetric* metric = EvaluationMetric::Instance("IBM_BLEU"); + boost::shared_ptr scorer1 = metric->CreateSegmentEvaluator(refs1); + boost::shared_ptr scorer2 = metric->CreateSegmentEvaluator(refs2); + ComputeErrorSurface(*scorer1, envs[0], &es[0], metric, hg); + ComputeErrorSurface(*scorer2, envs[1], &es[1], metric, hg2); + cerr << envs[0].size() << " " << envs[1].size() << endl; + cerr << es[0].size() << " " << es[1].size() << endl; + envs.clear(); + clock_t t_env=clock(); + float score; + double m = LineOptimizer::LineOptimize(metric,es, LineOptimizer::MAXIMIZE_SCORE, &score); + clock_t t_opt=clock(); + cerr << "line optimizer returned: " << m << " (SCORE=" << score << ")\n"; + EXPECT_FLOAT_EQ(0.48719698, score); + SparseVector res = axis; + res *= m; + res += wts; + cerr << "res: " << res << endl; + cerr << "ENVELOPE PROCESSING=" << (static_cast(t_env - t_start) / 1000.0) << endl; + cerr << " LINE OPTIMIZATION=" << (static_cast(t_opt - t_env) / 1000.0) << endl; + hg.Reweight(res); + hg2.Reweight(res); + vector t1,t2; + ViterbiESentence(hg, &t1); + ViterbiESentence(hg2, &t2); + cerr << TD::GetString(t1) << endl; + cerr << TD::GetString(t2) << endl; +} + +TEST_F(OptTest,TestZeroOrigin) { + const string json = "{\"rules\":[1,\"[X7] ||| blA ||| without ||| LHSProb=3.92173 LexE2F=2.90799 LexF2E=1.85003 GenerativeProb=10.5381 RulePenalty=1 XFE=2.77259 XEF=0.441833 LabelledEF=2.63906 LabelledFE=4.96981 LogRuleCount=0.693147\",2,\"[X7] ||| blA ||| except ||| LHSProb=4.92173 LexE2F=3.90799 LexF2E=1.85003 GenerativeProb=11.5381 RulePenalty=1 XFE=2.77259 XEF=1.44183 LabelledEF=2.63906 LabelledFE=4.96981 LogRuleCount=1.69315\",3,\"[S] ||| [X7,1] ||| [1] ||| GlueTop=1\",4,\"[X28] ||| EnwAn ||| title ||| LHSProb=3.96802 LexE2F=2.22462 LexF2E=1.83258 GenerativeProb=10.0863 RulePenalty=1 XFE=0 XEF=1.20397 LabelledEF=1.20397 LabelledFE=-1.98341e-08 LogRuleCount=1.09861\",5,\"[X0] ||| EnwAn ||| funny ||| LHSProb=3.98479 LexE2F=1.79176 LexF2E=3.21888 GenerativeProb=11.1681 RulePenalty=1 XFE=0 XEF=2.30259 LabelledEF=2.30259 LabelledFE=0 LogRuleCount=0 SingletonRule=1\",6,\"[X8] ||| [X7,1] EnwAn ||| entitled [1] ||| LHSProb=3.82533 LexE2F=3.21888 LexF2E=2.52573 GenerativeProb=11.3276 RulePenalty=1 XFE=1.20397 XEF=1.20397 LabelledEF=2.30259 LabelledFE=2.30259 LogRuleCount=0 SingletonRule=1\",7,\"[S] ||| [S,1] [X28,2] ||| [1] [2] ||| Glue=1\",8,\"[S] ||| [S,1] [X0,2] ||| [1] [2] ||| Glue=1\",9,\"[S] ||| [X8,1] ||| [1] ||| GlueTop=1\",10,\"[Goal] ||| [S,1] ||| 
[1]\"],\"features\":[\"PassThrough\",\"Glue\",\"GlueTop\",\"LanguageModel\",\"WordPenalty\",\"LHSProb\",\"LexE2F\",\"LexF2E\",\"GenerativeProb\",\"RulePenalty\",\"XFE\",\"XEF\",\"LabelledEF\",\"LabelledFE\",\"LogRuleCount\",\"SingletonRule\"],\"edges\":[{\"tail\":[],\"spans\":[0,1,-1,-1],\"feats\":[5,3.92173,6,2.90799,7,1.85003,8,10.5381,9,1,10,2.77259,11,0.441833,12,2.63906,13,4.96981,14,0.693147],\"rule\":1},{\"tail\":[],\"spans\":[0,1,-1,-1],\"feats\":[5,4.92173,6,3.90799,7,1.85003,8,11.5381,9,1,10,2.77259,11,1.44183,12,2.63906,13,4.96981,14,1.69315],\"rule\":2}],\"node\":{\"in_edges\":[0,1],\"cat\":\"X7\"},\"edges\":[{\"tail\":[0],\"spans\":[0,1,-1,-1],\"feats\":[2,1],\"rule\":3}],\"node\":{\"in_edges\":[2],\"cat\":\"S\"},\"edges\":[{\"tail\":[],\"spans\":[1,2,-1,-1],\"feats\":[5,3.96802,6,2.22462,7,1.83258,8,10.0863,9,1,11,1.20397,12,1.20397,13,-1.98341e-08,14,1.09861],\"rule\":4}],\"node\":{\"in_edges\":[3],\"cat\":\"X28\"},\"edges\":[{\"tail\":[],\"spans\":[1,2,-1,-1],\"feats\":[5,3.98479,6,1.79176,7,3.21888,8,11.1681,9,1,11,2.30259,12,2.30259,15,1],\"rule\":5}],\"node\":{\"in_edges\":[4],\"cat\":\"X0\"},\"edges\":[{\"tail\":[0],\"spans\":[0,2,-1,-1],\"feats\":[5,3.82533,6,3.21888,7,2.52573,8,11.3276,9,1,10,1.20397,11,1.20397,12,2.30259,13,2.30259,15,1],\"rule\":6}],\"node\":{\"in_edges\":[5],\"cat\":\"X8\"},\"edges\":[{\"tail\":[1,2],\"spans\":[0,2,-1,-1],\"feats\":[1,1],\"rule\":7},{\"tail\":[1,3],\"spans\":[0,2,-1,-1],\"feats\":[1,1],\"rule\":8},{\"tail\":[4],\"spans\":[0,2,-1,-1],\"feats\":[2,1],\"rule\":9}],\"node\":{\"in_edges\":[6,7,8],\"cat\":\"S\"},\"edges\":[{\"tail\":[5],\"spans\":[0,2,-1,-1],\"feats\":[],\"rule\":10}],\"node\":{\"in_edges\":[9],\"cat\":\"Goal\"}}"; + Hypergraph hg; + istringstream instr(json); + HypergraphIO::ReadFromJSON(&instr, &hg); + SparseVector wts; + wts.set_value(FD::Convert("PassThrough"), -0.929201533002898); + hg.Reweight(wts); + + vector, prob_t> > list; + std::vector > features; + KBest::KBestDerivations, ESentenceTraversal> kbest(hg, 10); + for (int i = 0; i < 10; ++i) { + const KBest::KBestDerivations, ESentenceTraversal>::Derivation* d = + kbest.LazyKthBest(hg.nodes_.size() - 1, i); + if (!d) break; + cerr << log(d->score) << " ||| " << TD::GetString(d->yield) << " ||| " << d->feature_values << endl; + } + + SparseVector axis; axis.set_value(FD::Convert("Glue"),1.0); + ConvexHullWeightFunction wf(wts, axis); // wts = starting point, axis = search direction + vector envs(1); + envs[0] = Inside(hg, NULL, wf); + + vector > mr(4); + TD::ConvertSentence("untitled", &mr[0]); + TD::ConvertSentence("with no title", &mr[1]); + TD::ConvertSentence("without a title", &mr[2]); + TD::ConvertSentence("without title", &mr[3]); + EvaluationMetric* metric = EvaluationMetric::Instance("IBM_BLEU"); + boost::shared_ptr scorer1 = metric->CreateSegmentEvaluator(mr); + vector es(1); + ComputeErrorSurface(*scorer1, envs[0], &es[0], metric, hg); +} + +int main(int argc, char **argv) { + testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} + diff --git a/dpmert/mert_geometry.cc b/dpmert/mert_geometry.cc new file mode 100644 index 00000000..81b25af9 --- /dev/null +++ b/dpmert/mert_geometry.cc @@ -0,0 +1,186 @@ +#include "mert_geometry.h" + +#include +#include + +using namespace std; +using boost::shared_ptr; + +ConvexHull::ConvexHull(int i) { + if (i == 0) { + // do nothing - <> + } else if (i == 1) { + points.push_back(shared_ptr(new MERTPoint(0, 0, 0, shared_ptr(), shared_ptr()))); + assert(this->IsMultiplicativeIdentity()); + } else { + cerr << 
"Only can create ConvexHull semiring 0 and 1 with this constructor!\n"; + abort(); + } +} + +const ConvexHull ConvexHullWeightFunction::operator()(const Hypergraph::Edge& e) const { + const double m = direction.dot(e.feature_values_); + const double b = origin.dot(e.feature_values_); + MERTPoint* point = new MERTPoint(m, b, e); + return ConvexHull(1, point); +} + +ostream& operator<<(ostream& os, const ConvexHull& env) { + os << '<'; + const vector >& points = env.GetSortedSegs(); + for (int i = 0; i < points.size(); ++i) + os << (i==0 ? "" : "|") << "x=" << points[i]->x << ",b=" << points[i]->b << ",m=" << points[i]->m << ",p1=" << points[i]->p1 << ",p2=" << points[i]->p2; + return os << '>'; +} + +#define ORIGINAL_MERT_IMPLEMENTATION 1 +#ifdef ORIGINAL_MERT_IMPLEMENTATION + +struct SlopeCompare { + bool operator() (const shared_ptr& a, const shared_ptr& b) const { + return a->m < b->m; + } +}; + +const ConvexHull& ConvexHull::operator+=(const ConvexHull& other) { + if (!other.is_sorted) other.Sort(); + if (points.empty()) { + points = other.points; + return *this; + } + is_sorted = false; + int j = points.size(); + points.resize(points.size() + other.points.size()); + for (int i = 0; i < other.points.size(); ++i) + points[j++] = other.points[i]; + assert(j == points.size()); + return *this; +} + +void ConvexHull::Sort() const { + sort(points.begin(), points.end(), SlopeCompare()); + const int k = points.size(); + int j = 0; + for (int i = 0; i < k; ++i) { + MERTPoint l = *points[i]; + l.x = kMinusInfinity; + // cerr << "m=" << l.m << endl; + if (0 < j) { + if (points[j-1]->m == l.m) { // lines are parallel + if (l.b <= points[j-1]->b) continue; + --j; + } + while(0 < j) { + l.x = (l.b - points[j-1]->b) / (points[j-1]->m - l.m); + if (points[j-1]->x < l.x) break; + --j; + } + if (0 == j) l.x = kMinusInfinity; + } + *points[j++] = l; + } + points.resize(j); + is_sorted = true; +} + +const ConvexHull& ConvexHull::operator*=(const ConvexHull& other) { + if (other.IsMultiplicativeIdentity()) { return *this; } + if (this->IsMultiplicativeIdentity()) { (*this) = other; return *this; } + + if (!is_sorted) Sort(); + if (!other.is_sorted) other.Sort(); + + if (this->IsEdgeEnvelope()) { +// if (other.size() > 1) +// cerr << *this << " (TIMES) " << other << endl; + shared_ptr edge_parent = points[0]; + const double& edge_b = edge_parent->b; + const double& edge_m = edge_parent->m; + points.clear(); + for (int i = 0; i < other.points.size(); ++i) { + const MERTPoint& p = *other.points[i]; + const double m = p.m + edge_m; + const double b = p.b + edge_b; + const double& x = p.x; // x's don't change with * + points.push_back(shared_ptr(new MERTPoint(x, m, b, edge_parent, other.points[i]))); + assert(points.back()->p1->edge); + } +// if (other.size() > 1) +// cerr << " = " << *this << endl; + } else { + vector > new_points; + int this_i = 0; + int other_i = 0; + const int this_size = points.size(); + const int other_size = other.points.size(); + double cur_x = kMinusInfinity; // moves from left to right across the + // real numbers, stopping for all inter- + // sections + double this_next_val = (1 < this_size ? points[1]->x : kPlusInfinity); + double other_next_val = (1 < other_size ? 
other.points[1]->x : kPlusInfinity); + while (this_i < this_size && other_i < other_size) { + const MERTPoint& this_point = *points[this_i]; + const MERTPoint& other_point= *other.points[other_i]; + const double m = this_point.m + other_point.m; + const double b = this_point.b + other_point.b; + + new_points.push_back(shared_ptr(new MERTPoint(cur_x, m, b, points[this_i], other.points[other_i]))); + int comp = 0; + if (this_next_val < other_next_val) comp = -1; else + if (this_next_val > other_next_val) comp = 1; + if (0 == comp) { // the next values are equal, advance both indices + ++this_i; + ++other_i; + cur_x = this_next_val; // could be other_next_val (they're equal!) + this_next_val = (this_i+1 < this_size ? points[this_i+1]->x : kPlusInfinity); + other_next_val = (other_i+1 < other_size ? other.points[other_i+1]->x : kPlusInfinity); + } else { // advance the i with the lower x, update cur_x + if (-1 == comp) { + ++this_i; + cur_x = this_next_val; + this_next_val = (this_i+1 < this_size ? points[this_i+1]->x : kPlusInfinity); + } else { + ++other_i; + cur_x = other_next_val; + other_next_val = (other_i+1 < other_size ? other.points[other_i+1]->x : kPlusInfinity); + } + } + } + points.swap(new_points); + } + //cerr << "Multiply: result=" << (*this) << endl; + return *this; +} + +// recursively construct translation +void MERTPoint::ConstructTranslation(vector* trans) const { + const MERTPoint* cur = this; + vector > ant_trans; + while(!cur->edge) { + ant_trans.resize(ant_trans.size() + 1); + cur->p2->ConstructTranslation(&ant_trans.back()); + cur = cur->p1.get(); + } + size_t ant_size = ant_trans.size(); + vector*> pants(ant_size); + assert(ant_size == cur->edge->tail_nodes_.size()); + --ant_size; + for (int i = 0; i < pants.size(); ++i) pants[ant_size - i] = &ant_trans[i]; + cur->edge->rule_->ESubstitute(pants, trans); +} + +void MERTPoint::CollectEdgesUsed(std::vector* edges_used) const { + if (edge) { + assert(edge->id_ < edges_used->size()); + (*edges_used)[edge->id_] = true; + } + if (p1) p1->CollectEdgesUsed(edges_used); + if (p2) p2->CollectEdgesUsed(edges_used); +} + +#else + +// THIS IS THE NEW FASTER IMPLEMENTATION OF THE MERT SEMIRING OPERATIONS + +#endif + diff --git a/dpmert/mert_geometry.h b/dpmert/mert_geometry.h new file mode 100644 index 00000000..a8b6959e --- /dev/null +++ b/dpmert/mert_geometry.h @@ -0,0 +1,81 @@ +#ifndef _MERT_GEOMETRY_H_ +#define _MERT_GEOMETRY_H_ + +#include +#include +#include + +#include "hg.h" +#include "sparse_vector.h" + +static const double kMinusInfinity = -std::numeric_limits::infinity(); +static const double kPlusInfinity = std::numeric_limits::infinity(); + +struct MERTPoint { + MERTPoint() : x(), m(), b(), edge() {} + MERTPoint(double _m, double _b) : + x(kMinusInfinity), m(_m), b(_b), edge() {} + MERTPoint(double _x, double _m, double _b, const boost::shared_ptr& p1_, const boost::shared_ptr& p2_) : + x(_x), m(_m), b(_b), p1(p1_), p2(p2_), edge() {} + MERTPoint(double _m, double _b, const Hypergraph::Edge& edge) : + x(kMinusInfinity), m(_m), b(_b), edge(&edge) {} + + double x; // x intersection with previous segment in env, or -inf if none + double m; // this line's slope + double b; // intercept with y-axis + + // we keep a pointer to the "parents" of this segment so we can reconstruct + // the Viterbi translation corresponding to this segment + boost::shared_ptr p1; + boost::shared_ptr p2; + + // only MERTPoints created from an edge using the ConvexHullWeightFunction + // have rules + // TRulePtr rule; + const Hypergraph::Edge* 
edge; + + // recursively recover the Viterbi translation that will result from setting + // the weights to origin + axis * x, where x is any value from this->x up + // until the next largest x in the containing ConvexHull + void ConstructTranslation(std::vector* trans) const; + void CollectEdgesUsed(std::vector* edges_used) const; +}; + +// this is the semiring value type, +// it defines constructors for 0, 1, and the operations + and * +struct ConvexHull { + // create semiring zero + ConvexHull() : is_sorted(true) {} // zero + // for debugging: + ConvexHull(const std::vector >& s) : points(s) { Sort(); } + // create semiring 1 or 0 + explicit ConvexHull(int i); + ConvexHull(int n, MERTPoint* point) : is_sorted(true), points(n, boost::shared_ptr(point)) {} + const ConvexHull& operator+=(const ConvexHull& other); + const ConvexHull& operator*=(const ConvexHull& other); + bool IsMultiplicativeIdentity() const { + return size() == 1 && (points[0]->b == 0.0 && points[0]->m == 0.0) && (!points[0]->edge) && (!points[0]->p1) && (!points[0]->p2); } + const std::vector >& GetSortedSegs() const { + if (!is_sorted) Sort(); + return points; + } + size_t size() const { return points.size(); } + + private: + bool IsEdgeEnvelope() const { + return points.size() == 1 && points[0]->edge; } + void Sort() const; + mutable bool is_sorted; + mutable std::vector > points; +}; +std::ostream& operator<<(std::ostream& os, const ConvexHull& env); + +struct ConvexHullWeightFunction { + ConvexHullWeightFunction(const SparseVector& ori, + const SparseVector& dir) : origin(ori), direction(dir) {} + const ConvexHull operator()(const Hypergraph::Edge& e) const; + const SparseVector origin; + const SparseVector direction; +}; + +#endif diff --git a/dpmert/mr_dpmert_generate_mapper_input.cc b/dpmert/mr_dpmert_generate_mapper_input.cc new file mode 100644 index 00000000..59d4f24f --- /dev/null +++ b/dpmert/mr_dpmert_generate_mapper_input.cc @@ -0,0 +1,78 @@ +#include +#include + +#include +#include + +#include "filelib.h" +#include "weights.h" +#include "line_optimizer.h" + +using namespace std; +namespace po = boost::program_options; + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { + po::options_description opts("Configuration options"); + opts.add_options() + ("dev_set_size,s",po::value(),"[REQD] Development set size (# of parallel sentences)") + ("forest_repository,r",po::value(),"[REQD] Path to forest repository") + ("weights,w",po::value(),"[REQD] Current feature weights file") + ("optimize_feature,o",po::value >(), "Feature to optimize (if none specified, all weights listed in the weights file will be optimized)") + ("random_directions,d",po::value()->default_value(20),"Number of random directions to run the line optimizer in") + ("help,h", "Help"); + po::options_description dcmdline_options; + dcmdline_options.add(opts); + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + bool flag = false; + if (conf->count("dev_set_size") == 0) { + cerr << "Please specify the size of the development set using -d N\n"; + flag = true; + } + if (conf->count("weights") == 0) { + cerr << "Please specify the starting-point weights using -w \n"; + flag = true; + } + if (conf->count("forest_repository") == 0) { + cerr << "Please specify the forest repository location using -r \n"; + flag = true; + } + if (flag || conf->count("help")) { + cerr << dcmdline_options << endl; + exit(1); + } +} + +int main(int argc, char** argv) { + RandomNumberGenerator rng; + po::variables_map conf; + 
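  // After the setup below reads the current weights (the line-search origin)
  // and draws the requested random directions, main() writes one work item
  // per (development sentence, direction) pair, one per line:
  //   <forest_repository>/<sent_id>.json.gz <sent_id> <origin> <direction>
  // where the two sparse vectors are printed as feature=value pairs joined
  // with semicolons, which is the format the mapper's ReadSparseVectorString
  // later parses.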
InitCommandLine(argc, argv, &conf); + vector features; + SparseVector origin; + vector w; + Weights::InitFromFile(conf["weights"].as(), &w, &features); + Weights::InitSparseVector(w, &origin); + const string forest_repository = conf["forest_repository"].as(); + assert(DirectoryExists(forest_repository)); + if (conf.count("optimize_feature") > 0) + features=conf["optimize_feature"].as >(); + vector > directions; + vector fids(features.size()); + for (int i = 0; i < features.size(); ++i) + fids[i] = FD::Convert(features[i]); + LineOptimizer::CreateOptimizationDirections( + fids, + conf["random_directions"].as(), + &rng, + &directions); + unsigned dev_set_size = conf["dev_set_size"].as(); + for (unsigned i = 0; i < dev_set_size; ++i) { + for (unsigned j = 0; j < directions.size(); ++j) { + cout << forest_repository << '/' << i << ".json.gz " << i << ' '; + print(cout, origin, "=", ";"); + cout << ' '; + print(cout, directions[j], "=", ";"); + cout << endl; + } + } + return 0; +} diff --git a/dpmert/mr_dpmert_map.cc b/dpmert/mr_dpmert_map.cc new file mode 100644 index 00000000..f3304f0f --- /dev/null +++ b/dpmert/mr_dpmert_map.cc @@ -0,0 +1,112 @@ +#include +#include +#include +#include + +#include +#include + +#include "ns.h" +#include "ns_docscorer.h" +#include "ces.h" +#include "filelib.h" +#include "stringlib.h" +#include "sparse_vector.h" +#include "mert_geometry.h" +#include "inside_outside.h" +#include "error_surface.h" +#include "b64tools.h" +#include "hg_io.h" + +using namespace std; +namespace po = boost::program_options; + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { + po::options_description opts("Configuration options"); + opts.add_options() + ("reference,r",po::value >(), "[REQD] Reference translation (tokenized text)") + ("source,s",po::value(), "Source file (ignored, except for AER)") + ("evaluation_metric,m",po::value()->default_value("ibm_bleu"), "Evaluation metric being optimized") + ("input,i",po::value()->default_value("-"), "Input file to map (- is STDIN)") + ("help,h", "Help"); + po::options_description dcmdline_options; + dcmdline_options.add(opts); + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + bool flag = false; + if (!conf->count("reference")) { + cerr << "Please specify one or more references using -r \n"; + flag = true; + } + if (flag || conf->count("help")) { + cerr << dcmdline_options << endl; + exit(1); + } +} + +bool ReadSparseVectorString(const string& s, SparseVector* v) { +#if 0 + // this should work, but untested. 
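  // (either way, the expected input is a semicolon-separated list of
  //  feature=value pairs, e.g. "LanguageModel=1.0;WordPenalty=-0.5";
  //  the #else branch below is the code that actually runs)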
+ std::istringstream i(s); + i>>*v; +#else + vector fields; + Tokenize(s, ';', &fields); + if (fields.empty()) return false; + for (int i = 0; i < fields.size(); ++i) { + vector pair(2); + Tokenize(fields[i], '=', &pair); + if (pair.size() != 2) { + cerr << "Error parsing vector string: " << fields[i] << endl; + return false; + } + v->set_value(FD::Convert(pair[0]), atof(pair[1].c_str())); + } + return true; +#endif +} + +int main(int argc, char** argv) { + po::variables_map conf; + InitCommandLine(argc, argv, &conf); + const string evaluation_metric = conf["evaluation_metric"].as(); + EvaluationMetric* metric = EvaluationMetric::Instance(evaluation_metric); + DocumentScorer ds(metric, conf["reference"].as >()); + cerr << "Loaded " << ds.size() << " references for scoring with " << evaluation_metric << endl; + Hypergraph hg; + string last_file; + ReadFile in_read(conf["input"].as()); + istream &in=*in_read.stream(); + while(in) { + string line; + getline(in, line); + if (line.empty()) continue; + istringstream is(line); + int sent_id; + string file, s_origin, s_direction; + // path-to-file (JSON) sent_ed starting-point search-direction + is >> file >> sent_id >> s_origin >> s_direction; + SparseVector origin; + ReadSparseVectorString(s_origin, &origin); + SparseVector direction; + ReadSparseVectorString(s_direction, &direction); + // cerr << "File: " << file << "\nDir: " << direction << "\n X: " << origin << endl; + if (last_file != file) { + last_file = file; + ReadFile rf(file); + HypergraphIO::ReadFromJSON(rf.stream(), &hg); + } + const ConvexHullWeightFunction wf(origin, direction); + const ConvexHull hull = Inside(hg, NULL, wf); + + ErrorSurface es; + ComputeErrorSurface(*ds[sent_id], hull, &es, metric, hg); + //cerr << "Viterbi envelope has " << ve.size() << " segments\n"; + // cerr << "Error surface has " << es.size() << " segments\n"; + string val; + es.Serialize(&val); + cout << 'M' << ' ' << s_origin << ' ' << s_direction << '\t'; + B64::b64encode(val.c_str(), val.size(), &cout); + cout << endl << flush; + } + return 0; +} diff --git a/dpmert/mr_dpmert_reduce.cc b/dpmert/mr_dpmert_reduce.cc new file mode 100644 index 00000000..dda61f88 --- /dev/null +++ b/dpmert/mr_dpmert_reduce.cc @@ -0,0 +1,77 @@ +#include +#include +#include +#include + +#include +#include + +#include "sparse_vector.h" +#include "error_surface.h" +#include "line_optimizer.h" +#include "b64tools.h" +#include "stringlib.h" + +using namespace std; +namespace po = boost::program_options; + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { + po::options_description opts("Configuration options"); + opts.add_options() + ("evaluation_metric,m",po::value(), "Evaluation metric (IBM_BLEU, etc.)") + ("help,h", "Help"); + po::options_description dcmdline_options; + dcmdline_options.add(opts); + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + bool flag = conf->count("evaluation_metric") == 0; + if (flag || conf->count("help")) { + cerr << dcmdline_options << endl; + exit(1); + } +} + +int main(int argc, char** argv) { + po::variables_map conf; + InitCommandLine(argc, argv, &conf); + const string evaluation_metric = conf["evaluation_metric"].as(); + LineOptimizer::ScoreType opt_type = LineOptimizer::MAXIMIZE_SCORE; + if (UppercaseString(evaluation_metric) == "TER") + opt_type = LineOptimizer::MINIMIZE_SCORE; + EvaluationMetric* metric = EvaluationMetric::Instance(evaluation_metric); + + vector esv; + string last_key, line, key, val; + while(getline(cin, line)) { + size_t ks = 
line.find("\t"); + assert(string::npos != ks); + assert(ks > 2); + key = line.substr(2, ks - 2); + val = line.substr(ks + 1); + if (key != last_key) { + if (!last_key.empty()) { + float score; + double x = LineOptimizer::LineOptimize(metric, esv, opt_type, &score); + cout << last_key << "|" << x << "|" << score << endl; + } + last_key.swap(key); + esv.clear(); + } + if (val.size() % 4 != 0) { + cerr << "B64 encoding error 1! Skipping.\n"; + continue; + } + string encoded(val.size() / 4 * 3, '\0'); + if (!B64::b64decode(reinterpret_cast(&val[0]), val.size(), &encoded[0], encoded.size())) { + cerr << "B64 encoding error 2! Skipping.\n"; + continue; + } + esv.push_back(ErrorSurface()); + esv.back().Deserialize(encoded); + } + if (!esv.empty()) { + float score; + double x = LineOptimizer::LineOptimize(metric, esv, opt_type, &score); + cout << last_key << "|" << x << "|" << score << endl; + } + return 0; +} diff --git a/dpmert/parallelize.pl b/dpmert/parallelize.pl new file mode 100755 index 00000000..7d0365cc --- /dev/null +++ b/dpmert/parallelize.pl @@ -0,0 +1,423 @@ +#!/usr/bin/env perl + +# Author: Adam Lopez +# +# This script takes a command that processes input +# from stdin one-line-at-time, and parallelizes it +# on the cluster using David Chiang's sentserver/ +# sentclient architecture. +# +# Prerequisites: the command *must* read each line +# without waiting for subsequent lines of input +# (for instance, a command which must read all lines +# of input before processing will not work) and +# return it to the output *without* buffering +# multiple lines. + +#TODO: if -j 1, run immediately, not via sentserver? possible differences in environment might make debugging harder + +#ANNOYANCE: if input is shorter than -j n lines, or at the very last few lines, repeatedly sleeps. time cut down to 15s from 60s + +my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../environment"; } +use LocalConfig; + +use Cwd qw/ abs_path cwd getcwd /; +use File::Temp qw/ tempfile /; +use Getopt::Long; +use IPC::Open2; +use strict; +use POSIX ":sys_wait_h"; + +use File::Basename; +my $myDir = dirname(__FILE__); +print STDERR __FILE__." -> $myDir\n"; +push(@INC, $myDir); +require "libcall.pl"; + +my $tailn=5; # +0 = concatenate all the client logs. 5 = last 5 lines +my $recycle_clients; # spawn new clients when previous ones terminate +my $stay_alive; # dont let server die when having zero clients +my $joblist = ""; +my $errordir=""; +my $multiline; +my @files_to_stage; +my $numnodes = 8; +my $user = $ENV{"USER"}; +my $pmem = "9g"; +my $basep=50300; +my $randp=300; +my $tryp=50; +my $no_which; +my $no_cd; + +my $DEBUG=$ENV{DEBUG}; +print STDERR "DEBUG=$DEBUG output enabled.\n" if $DEBUG; +my $verbose = 1; +sub verbose { + if ($verbose) { + print STDERR @_,"\n"; + } +} +sub debug { + if ($DEBUG) { + my ($package, $filename, $line) = caller; + print STDERR "DEBUG: $filename($line): ",join(' ',@_),"\n"; + } +} +my $is_shell_special=qr.[ \t\n\\><|&;"'`~*?{}$!()].; +my $shell_escape_in_quote=qr.[\\"\$`!].; +sub escape_shell { + my ($arg)=@_; + return undef unless defined $arg; + return '""' unless $arg; + if ($arg =~ /$is_shell_special/) { + $arg =~ s/($shell_escape_in_quote)/\\$1/g; + return "\"$arg\""; + } + return $arg; +} +sub preview_files { + my ($l,$skipempty,$footer,$n)=@_; + $n=$tailn unless defined $n; + my @f=grep { ! 
($skipempty && -z $_) } @$l; + my $fn=join(' ',map {escape_shell($_)} @f); + my $cmd="tail -n $n $fn"; + unchecked_output("$cmd").($footer?"\nNONEMPTY FILES:\n$fn\n":""); +} +sub prefix_dirname($) { + #like `dirname but if ends in / then return the whole thing + local ($_)=@_; + if (/\/$/) { + $_; + } else { + s#/[^/]$##; + $_ ? $_ : ''; + } +} +sub ensure_final_slash($) { + local ($_)=@_; + m#/$# ? $_ : ($_."/"); +} +sub extend_path($$;$$) { + my ($base,$ext,$mkdir,$baseisdir)=@_; + if (-d $base) { + $base.="/"; + } else { + my $dir; + if ($baseisdir) { + $dir=$base; + $base.='/' unless $base =~ /\/$/; + } else { + $dir=prefix_dirname($base); + } + my @cmd=("/bin/mkdir","-p",$dir); + check_call(@cmd) if $mkdir; + } + return $base.$ext; +} + +my $abscwd=abs_path(&getcwd); +sub print_help; + +my $use_fork; +my @pids; + +# Process command-line options +unless (GetOptions( + "stay-alive" => \$stay_alive, + "recycle-clients" => \$recycle_clients, + "error-dir=s" => \$errordir, + "multi-line" => \$multiline, + "file=s" => \@files_to_stage, + "use-fork" => \$use_fork, + "verbose" => \$verbose, + "jobs=i" => \$numnodes, + "pmem=s" => \$pmem, + "baseport=i" => \$basep, +# "iport=i" => \$randp, #for short name -i + "no-which!" => \$no_which, + "no-cd!" => \$no_cd, + "tailn=s" => \$tailn, +) && scalar @ARGV){ + print_help(); + die "bad options."; +} + +my $cmd = ""; +my $prog=shift; +if ($no_which) { + $cmd=$prog; +} else { + $cmd=check_output("which $prog"); + chomp $cmd; + die "$prog not found - $cmd" unless $cmd; +} +#$cmd=abs_path($cmd); +for my $arg (@ARGV) { + $cmd .= " ".escape_shell($arg); +} +die "Please specify a command to parallelize\n" if $cmd eq ''; + +my $cdcmd=$no_cd ? '' : ("cd ".escape_shell($abscwd)."\n"); + +my $executable = $cmd; +$executable =~ s/^\s*(\S+)($|\s.*)/$1/; +$executable=check_output("basename $executable"); +chomp $executable; + + +print STDERR "Parallelizing ($numnodes ways): $cmd\n\n"; + +# create -e dir and save .sh +use File::Temp qw/tempdir/; +unless ($errordir) { + $errordir=tempdir("$executable.XXXXXX",CLEANUP=>1); +} +if ($errordir) { + my $scriptfile=extend_path("$errordir/","$executable.sh",1,1); + -d $errordir || die "should have created -e dir $errordir"; + open SF,">",$scriptfile || die; + print SF "$cdcmd$cmd\n"; + close SF; + chmod 0755,$scriptfile; + $errordir=abs_path($errordir); + &verbose("-e dir: $errordir"); +} + +# set cleanup handler +my @cleanup_cmds; +sub cleanup; +sub cleanup_and_die; +$SIG{INT} = "cleanup_and_die"; +$SIG{TERM} = "cleanup_and_die"; +$SIG{HUP} = "cleanup_and_die"; + +# other subs: +sub numof_live_jobs; +sub launch_job_on_node; + + +# vars +my $mydir = check_output("dirname $0"); chomp $mydir; +my $sentserver = "$mydir/sentserver"; +my $sentclient = "$mydir/sentclient"; +my $host = check_output("hostname"); +chomp $host; + + +# find open port +srand; +my $port = 50300+int(rand($randp)); +my $endp=$port+$tryp; +sub listening_port_lines { + my $quiet=$verbose?'':'2>/dev/null'; + return unchecked_output("netstat -a -n $quiet | grep LISTENING | grep -i tcp"); +} +my $netstat=&listening_port_lines; + +if ($verbose){ print STDERR "Testing port $port...";} + +while ($netstat=~/$port/ || &listening_port_lines=~/$port/){ + if ($verbose){ print STDERR "port is busy\n";} + $port++; + if ($port > $endp){ + die "Unable to find open port\n"; + } + if ($verbose){ print STDERR "Testing port $port... 
"; } +} +if ($verbose){ + print STDERR "port $port is available\n"; +} + +my $key = int(rand()*1000000); + +my $multiflag = ""; +if ($multiline){ $multiflag = "-m"; print STDERR "expecting multiline output.\n"; } +my $stay_alive_flag = ""; +if ($stay_alive){ $stay_alive_flag = "--stay-alive"; print STDERR "staying alive while no clients are connected.\n"; } + +my $node_count = 0; +my $script = ""; +# fork == one thread runs the sentserver, while the +# other spawns the sentclient commands. +my $pid = fork; +if ($pid == 0) { # child + sleep 8; # give other thread time to start sentserver + $script = "$cdcmd$sentclient $host:$port:$key $cmd"; + + if ($verbose){ + print STDERR "Client script:\n====\n"; + print STDERR $script; + print STDERR "====\n"; + } + for (my $jobn=0; $jobn<$numnodes; $jobn++){ + launch_job(); + } + if ($recycle_clients) { + my $ret; + my $livejobs; + while (1) { + $ret = waitpid($pid, WNOHANG); + #print STDERR "waitpid $pid ret = $ret \n"; + last if ($ret != 0); + $livejobs = numof_live_jobs(); + if ($numnodes >= $livejobs ) { # a client terminated, OR # lines of input was less than -j + print STDERR "num of requested nodes = $numnodes; num of currently live jobs = $livejobs; Client terminated - launching another.\n"; + launch_job(); + } else { + sleep 15; + } + } + } + print STDERR "CHILD PROCESSES SPAWNED ... WAITING\n"; + for my $p (@pids) { + waitpid($p, 0); + } +} else { +# my $todo = "$sentserver -k $key $multiflag $port "; + my $todo = "$sentserver -k $key $multiflag $port $stay_alive_flag "; + if ($verbose){ print STDERR "Running: $todo\n"; } + check_call($todo); + print STDERR "Call to $sentserver returned.\n"; + cleanup(); + exit(0); +} + +sub numof_live_jobs { + if ($use_fork) { + die "not implemented"; + } else { + # We can probably continue decoding if the qstat error is only temporary + my @livejobs = grep(/$joblist/, split(/\n/, unchecked_output("qstat"))); + return ($#livejobs + 1); + } +} +my (@errors,@outs,@cmds); + +sub launch_job { + if ($use_fork) { return launch_job_fork(); } + my $errorfile = "/dev/null"; + my $outfile = "/dev/null"; + $node_count++; + my $clientname = $executable; + $clientname =~ s/^(.{4}).*$/$1/; + $clientname = "$clientname.$node_count"; + if ($errordir){ + $errorfile = "$errordir/$clientname.ER"; + $outfile = "$errordir/$clientname.OU"; + push @errors,$errorfile; + push @outs,$outfile; + } + my $todo = qsub_args($pmem) . " -N $clientname -o $outfile -e $errorfile"; + push @cmds,$todo; + + print STDERR "Running: $todo\n"; + local(*QOUT, *QIN); + open2(\*QOUT, \*QIN, $todo) or die "Failed to open2: $!"; + print QIN $script; + close QIN; + while (my $jobid=){ + chomp $jobid; + if ($verbose){ print STDERR "Launched client job: $jobid"; } + $jobid =~ s/^(\d+)(.*?)$/\1/g; + $jobid =~ s/^Your job (\d+) .*$/\1/; + print STDERR " short job id $jobid\n"; + if ($verbose){ + print STDERR "cd: $abscwd\n"; + print STDERR "cmd: $cmd\n"; + } + if ($joblist == "") { $joblist = $jobid; } + else {$joblist = $joblist . "\|" . 
$jobid; } + my $cleanfn="qdel $jobid 2> /dev/null"; + push(@cleanup_cmds, $cleanfn); + } + close QOUT; +} + +sub launch_job_fork { + my $errorfile = "/dev/null"; + my $outfile = "/dev/null"; + $node_count++; + my $clientname = $executable; + $clientname =~ s/^(.{4}).*$/$1/; + $clientname = "$clientname.$node_count"; + if ($errordir){ + $errorfile = "$errordir/$clientname.ER"; + $outfile = "$errordir/$clientname.OU"; + push @errors,$errorfile; + push @outs,$outfile; + } + my $pid = fork; + if ($pid == 0) { + my ($fh, $scr_name) = get_temp_script(); + print $fh $script; + close $fh; + my $todo = "/bin/bash -xeo pipefail $scr_name 1> $outfile 2> $errorfile"; + print STDERR "EXEC: $todo\n"; + my $out = check_output("$todo"); + unlink $scr_name or warn "Failed to remove $scr_name"; + exit 0; + } else { + push @pids, $pid; + } +} + +sub get_temp_script { + my ($fh, $filename) = tempfile( "workXXXX", SUFFIX => '.sh'); + return ($fh, $filename); +} + +sub cleanup_and_die { + cleanup(); + die "\n"; +} + +sub cleanup { + print STDERR "Cleaning up...\n"; + for $cmd (@cleanup_cmds){ + print STDERR " Cleanup command: $cmd\n"; + eval $cmd; + } + print STDERR "outputs:\n",preview_files(\@outs,1),"\n"; + print STDERR "errors:\n",preview_files(\@errors,1),"\n"; + print STDERR "cmd:\n",$cmd,"\n"; + print STDERR " cat $errordir/*.ER\nfor logs.\n"; + print STDERR "Cleanup finished.\n"; +} + +sub print_help +{ + my $name = check_output("basename $0"); chomp $name; + print << "Help"; + +usage: $name [options] + + Automatic black-box parallelization of commands. + +options: + + --use-fork + Instead of using qsub, use fork. + + -e, --error-dir + Retain output files from jobs in , rather + than silently deleting them. + + -m, --multi-line + Expect that command may produce multiple output + lines for a single input line. $name makes a + reasonable attempt to obtain all output before + processing additional inputs. However, use of this + option is inherently unsafe. + + -v, --verbose + Print diagnostic informatoin on stderr. + + -j, --jobs + Number of jobs to use. + + -p, --pmem + pmem setting for each job. + +Help +} diff --git a/dpmert/sentclient.c b/dpmert/sentclient.c new file mode 100644 index 00000000..91d994ab --- /dev/null +++ b/dpmert/sentclient.c @@ -0,0 +1,76 @@ +/* Copyright (c) 2001 by David Chiang. 
All rights reserved.*/ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "sentserver.h" + +int main (int argc, char *argv[]) { + int sock, port; + char *s, *key; + struct hostent *hp; + struct sockaddr_in server; + int errors = 0; + + if (argc < 3) { + fprintf(stderr, "Usage: sentclient host[:port[:key]] command [args ...]\n"); + exit(1); + } + + s = strchr(argv[1], ':'); + key = NULL; + + if (s == NULL) { + port = DEFAULT_PORT; + } else { + *s = '\0'; + s+=1; + /* dumb hack */ + key = strchr(s, ':'); + if (key != NULL){ + *key = '\0'; + key += 1; + } + port = atoi(s); + } + + sock = socket(AF_INET, SOCK_STREAM, 0); + + hp = gethostbyname(argv[1]); + if (hp == NULL) { + fprintf(stderr, "unknown host %s\n", argv[1]); + exit(1); + } + + bzero((char *)&server, sizeof(server)); + bcopy(hp->h_addr, (char *)&server.sin_addr, hp->h_length); + server.sin_family = hp->h_addrtype; + server.sin_port = htons(port); + + while (connect(sock, (struct sockaddr *)&server, sizeof(server)) < 0) { + perror("connect()"); + sleep(1); + errors++; + if (errors > 5) + exit(1); + } + + close(0); + close(1); + dup2(sock, 0); + dup2(sock, 1); + + if (key != NULL){ + write(1, key, strlen(key)); + write(1, "\n", 1); + } + + execvp(argv[2], argv+2); + return 0; +} diff --git a/dpmert/sentserver.c b/dpmert/sentserver.c new file mode 100644 index 00000000..c20b4fa6 --- /dev/null +++ b/dpmert/sentserver.c @@ -0,0 +1,515 @@ +/* Copyright (c) 2001 by David Chiang. All rights reserved.*/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "sentserver.h" + +#define MAX_CLIENTS 64 + +struct clientinfo { + int s; + struct sockaddr_in sin; +}; + +struct line { + int id; + char *s; + int status; + struct line *next; +} *head, **ptail; + +int n_sent = 0, n_received=0, n_flushed=0; + +#define STATUS_RUNNING 0 +#define STATUS_ABORTED 1 +#define STATUS_FINISHED 2 + +pthread_mutex_t queue_mutex = PTHREAD_MUTEX_INITIALIZER; +pthread_mutex_t clients_mutex = PTHREAD_MUTEX_INITIALIZER; +pthread_mutex_t input_mutex = PTHREAD_MUTEX_INITIALIZER; + +int n_clients = 0; +int s; +int expect_multiline_output = 0; +int log_mutex = 0; +int stay_alive = 0; /* dont panic and die with zero clients */ + +void queue_finish(struct line *node, char *s, int fid); +char * read_line(int fd, int multiline); +void done (int code); + +struct line * queue_get(int fid) { + struct line *cur; + char *s, *synch; + + if (log_mutex) fprintf(stderr, "Getting for data for fid %d\n", fid); + if (log_mutex) fprintf(stderr, "Locking queue mutex (%d)\n", fid); + pthread_mutex_lock(&queue_mutex); + + /* First, check for aborted sentences. */ + + if (log_mutex) fprintf(stderr, " Checking queue for aborted jobs (fid %d)\n", fid); + for (cur = head; cur != NULL; cur = cur->next) { + if (cur->status == STATUS_ABORTED) { + cur->status = STATUS_RUNNING; + + if (log_mutex) fprintf(stderr, "Unlocking queue mutex (%d)\n", fid); + pthread_mutex_unlock(&queue_mutex); + + return cur; + } + } + if (log_mutex) fprintf(stderr, "Unlocking queue mutex (%d)\n", fid); + pthread_mutex_unlock(&queue_mutex); + + /* Otherwise, read a new one. 
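     That is: take the input mutex, read the next line from stdin, and append
     it to the shared queue under the queue mutex before handing it to the
     calling client thread; a "===SYNCH===" line is not handed out but is
     immediately marked finished via queue_finish().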
*/ + if (log_mutex) fprintf(stderr, "Locking input mutex (%d)\n", fid); + if (log_mutex) fprintf(stderr, " Reading input for new data (fid %d)\n", fid); + pthread_mutex_lock(&input_mutex); + s = read_line(0,0); + + while (s) { + if (log_mutex) fprintf(stderr, "Locking queue mutex (%d)\n", fid); + pthread_mutex_lock(&queue_mutex); + if (log_mutex) fprintf(stderr, "Unlocking input mutex (%d)\n", fid); + pthread_mutex_unlock(&input_mutex); + + cur = malloc(sizeof (struct line)); + cur->id = n_sent; + cur->s = s; + cur->next = NULL; + + *ptail = cur; + ptail = &cur->next; + + n_sent++; + + if (strcmp(s,"===SYNCH===\n")==0){ + fprintf(stderr, "Received ===SYNCH=== signal (fid %d)\n", fid); + // Note: queue_finish calls free(cur->s). + // Therefore we need to create a new string here. + synch = malloc((strlen("===SYNCH===\n")+2) * sizeof (char)); + synch = strcpy(synch, s); + + if (log_mutex) fprintf(stderr, "Unlocking queue mutex (%d)\n", fid); + pthread_mutex_unlock(&queue_mutex); + queue_finish(cur, synch, fid); /* handles its own lock */ + + if (log_mutex) fprintf(stderr, "Locking input mutex (%d)\n", fid); + if (log_mutex) fprintf(stderr, " Reading input for new data (fid %d)\n", fid); + pthread_mutex_lock(&input_mutex); + + s = read_line(0,0); + } else { + if (log_mutex) fprintf(stderr, " Received new data %d (fid %d)\n", cur->id, fid); + cur->status = STATUS_RUNNING; + if (log_mutex) fprintf(stderr, "Unlocking queue mutex (%d)\n", fid); + pthread_mutex_unlock(&queue_mutex); + return cur; + } + } + + if (log_mutex) fprintf(stderr, "Unlocking input mutex (%d)\n", fid); + pthread_mutex_unlock(&input_mutex); + /* Only way to reach this point: no more output */ + + if (log_mutex) fprintf(stderr, "Locking queue mutex (%d)\n", fid); + pthread_mutex_lock(&queue_mutex); + if (head == NULL) { + fprintf(stderr, "Reached end of file. Exiting.\n"); + done(0); + } else + ptail = NULL; /* This serves as a signal that there is no more input */ + if (log_mutex) fprintf(stderr, "Unlocking queue mutex (%d)\n", fid); + pthread_mutex_unlock(&queue_mutex); + + return NULL; +} + +void queue_panic() { + struct line *next; + while (head && head->status == STATUS_FINISHED) { + /* Write out finished sentences */ + if (head->status == STATUS_FINISHED) { + fputs(head->s, stdout); + fflush(stdout); + } + /* Write out blank line for unfinished sentences */ + if (head->status == STATUS_ABORTED) { + fputs("\n", stdout); + fflush(stdout); + } + /* By defition, there cannot be any RUNNING sentences, since + function is only called when n_clients == 0 */ + free(head->s); + next = head->next; + free(head); + head = next; + n_flushed++; + } + fclose(stdout); + fprintf(stderr, "All clients died. Panicking, flushing completed sentences and exiting.\n"); + done(1); +} + +void queue_abort(struct line *node, int fid) { + if (log_mutex) fprintf(stderr, "Locking queue mutex (%d)\n", fid); + pthread_mutex_lock(&queue_mutex); + node->status = STATUS_ABORTED; + if (n_clients == 0) { + if (stay_alive) { + fprintf(stderr, "Warning! No live clients detected! 
Staying alive, will retry soon.\n"); + } else { + queue_panic(); + } + } + if (log_mutex) fprintf(stderr, "Unlocking queue mutex (%d)\n", fid); + pthread_mutex_unlock(&queue_mutex); +} + + +void queue_print() { + struct line *cur; + + fprintf(stderr, " Queue\n"); + + for (cur = head; cur != NULL; cur = cur->next) { + switch(cur->status) { + case STATUS_RUNNING: + fprintf(stderr, " %d running ", cur->id); break; + case STATUS_ABORTED: + fprintf(stderr, " %d aborted ", cur->id); break; + case STATUS_FINISHED: + fprintf(stderr, " %d finished ", cur->id); break; + + } + fprintf(stderr, "\n"); + //fprintf(stderr, cur->s); + } +} + +void queue_finish(struct line *node, char *s, int fid) { + struct line *next; + if (log_mutex) fprintf(stderr, "Locking queue mutex (%d)\n", fid); + pthread_mutex_lock(&queue_mutex); + + free(node->s); + node->s = s; + node->status = STATUS_FINISHED; + n_received++; + + /* Flush out finished nodes */ + while (head && head->status == STATUS_FINISHED) { + + if (log_mutex) fprintf(stderr, " Flushing finished node %d\n", head->id); + + fputs(head->s, stdout); + fflush(stdout); + if (log_mutex) fprintf(stderr, " Flushed node %d\n", head->id); + free(head->s); + + next = head->next; + free(head); + + head = next; + + n_flushed++; + + if (head == NULL) { /* empty queue */ + if (ptail == NULL) { /* This can only happen if set in queue_get as signal that there is no more input. */ + fprintf(stderr, "All sentences finished. Exiting.\n"); + done(0); + } else /* ptail pointed at something which was just popped off the stack -- reset to head*/ + ptail = &head; + } + } + + if (log_mutex) fprintf(stderr, " Flushing output %d\n", head->id); + fflush(stdout); + fprintf(stderr, "%d sentences sent, %d sentences finished, %d sentences flushed\n", n_sent, n_received, n_flushed); + + if (log_mutex) fprintf(stderr, "Unlocking queue mutex (%d)\n", fid); + pthread_mutex_unlock(&queue_mutex); + +} + +char * read_line(int fd, int multiline) { + int size = 80; + char errorbuf[100]; + char *s = malloc(size+2); + int result, errors=0; + int i = 0; + + result = read(fd, s+i, 1); + + while (1) { + if (result < 0) { + perror("read()"); + sprintf(errorbuf, "Error code: %d\n", errno); + fprintf(stderr, errorbuf); + errors++; + if (errors > 5) { + free(s); + return NULL; + } else { + sleep(1); /* retry after delay */ + } + } else if (result == 0) { + break; + } else if (multiline==0 && s[i] == '\n') { + break; + } else { + if (s[i] == '\n'){ + /* if we've reached this point, + then multiline must be 1, and we're + going to poll the fd for an additional + line of data. The basic design is to + run a select on the filedescriptor fd. + Select will return under two conditions: + if there is data on the fd, or if a + timeout is reached. We'll select on this + fd. If select returns because there's data + ready, keep going; else assume there's no + more and return the data we already have. 
+ */ + + fd_set set; + FD_ZERO(&set); + FD_SET(fd, &set); + + struct timeval timeout; + timeout.tv_sec = 3; // number of seconds for timeout + timeout.tv_usec = 0; + + int ready = select(FD_SETSIZE, &set, NULL, NULL, &timeout); + if (ready<1){ + break; // no more data, stop looping + } + } + i++; + + if (i == size) { + size = size*2; + s = realloc(s, size+2); + } + } + + result = read(fd, s+i, 1); + } + + if (result == 0 && i == 0) { /* end of file */ + free(s); + return NULL; + } + + s[i] = '\n'; + s[i+1] = '\0'; + + return s; +} + +void * new_client(void *arg) { + struct clientinfo *client = (struct clientinfo *)arg; + struct line *cur; + int result; + char *s; + char errorbuf[100]; + + pthread_mutex_lock(&clients_mutex); + n_clients++; + pthread_mutex_unlock(&clients_mutex); + + fprintf(stderr, "Client connected (%d connected)\n", n_clients); + + for (;;) { + + cur = queue_get(client->s); + + if (cur) { + /* fprintf(stderr, "Sending to client: %s", cur->s); */ + fprintf(stderr, "Sending data %d to client (fid %d)\n", cur->id, client->s); + result = write(client->s, cur->s, strlen(cur->s)); + if (result < strlen(cur->s)){ + perror("write()"); + sprintf(errorbuf, "Error code: %d\n", errno); + fprintf(stderr, errorbuf); + + pthread_mutex_lock(&clients_mutex); + n_clients--; + pthread_mutex_unlock(&clients_mutex); + + fprintf(stderr, "Client died (%d connected)\n", n_clients); + queue_abort(cur, client->s); + + close(client->s); + free(client); + + pthread_exit(NULL); + } + } else { + close(client->s); + pthread_mutex_lock(&clients_mutex); + n_clients--; + pthread_mutex_unlock(&clients_mutex); + fprintf(stderr, "Client dismissed (%d connected)\n", n_clients); + pthread_exit(NULL); + } + + s = read_line(client->s,expect_multiline_output); + if (s) { + /* fprintf(stderr, "Client (fid %d) returned: %s", client->s, s); */ + fprintf(stderr, "Client (fid %d) returned data %d\n", client->s, cur->id); +// queue_print(); + queue_finish(cur, s, client->s); + } else { + pthread_mutex_lock(&clients_mutex); + n_clients--; + pthread_mutex_unlock(&clients_mutex); + + fprintf(stderr, "Client died (%d connected)\n", n_clients); + queue_abort(cur, client->s); + + close(client->s); + free(client); + + pthread_exit(NULL); + } + + } + return 0; +} + +void done (int code) { + close(s); + exit(code); +} + + + +int main (int argc, char *argv[]) { + struct sockaddr_in sin, from; + int g; + socklen_t len; + struct clientinfo *client; + int port; + int opt; + int errors = 0; + int argi; + char *key = NULL, *client_key; + int use_key = 0; + /* the key stuff here doesn't provide any + real measure of security, it's mainly to keep + jobs from bumping into each other. 
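     A client started with host:port:key sends that key as its first line;
     the use_key branch in the accept loop below reads one line from the new
     connection (with a short select() timeout), strips the newline, and only
     spawns a handler thread if the key strcmp-matches.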
*/ + + pthread_t tid; + port = DEFAULT_PORT; + + for (argi=1; argi < argc; argi++){ + if (strcmp(argv[argi], "-m")==0){ + expect_multiline_output = 1; + } else if (strcmp(argv[argi], "-k")==0){ + argi++; + if (argi == argc){ + fprintf(stderr, "Key must be specified after -k\n"); + exit(1); + } + key = argv[argi]; + use_key = 1; + } else if (strcmp(argv[argi], "--stay-alive")==0){ + stay_alive = 1; /* dont panic and die with zero clients */ + } else { + port = atoi(argv[argi]); + } + } + + /* Initialize data structures */ + head = NULL; + ptail = &head; + + /* Set up listener */ + s = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); + opt = 1; + setsockopt(s, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt)); + + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = htonl(INADDR_ANY); + sin.sin_port = htons(port); + while (bind(s, (struct sockaddr *) &sin, sizeof(sin)) < 0) { + perror("bind()"); + sleep(1); + errors++; + if (errors > 100) + exit(1); + } + + len = sizeof(sin); + getsockname(s, (struct sockaddr *) &sin, &len); + + fprintf(stderr, "Listening on port %hu\n", ntohs(sin.sin_port)); + + while (listen(s, MAX_CLIENTS) < 0) { + perror("listen()"); + sleep(1); + errors++; + if (errors > 100) + exit(1); + } + + for (;;) { + len = sizeof(from); + g = accept(s, (struct sockaddr *)&from, &len); + if (g < 0) { + perror("accept()"); + sleep(1); + continue; + } + client = malloc(sizeof(struct clientinfo)); + client->s = g; + bcopy(&from, &client->sin, len); + + if (use_key){ + fd_set set; + FD_ZERO(&set); + FD_SET(client->s, &set); + + struct timeval timeout; + timeout.tv_sec = 3; // number of seconds for timeout + timeout.tv_usec = 0; + + int ready = select(FD_SETSIZE, &set, NULL, NULL, &timeout); + if (ready<1){ + fprintf(stderr, "Prospective client failed to respond with correct key.\n"); + close(client->s); + free(client); + } else { + client_key = read_line(client->s,0); + client_key[strlen(client_key)-1]='\0'; /* chop trailing newline */ + if (strcmp(key, client_key)==0){ + pthread_create(&tid, NULL, new_client, client); + } else { + fprintf(stderr, "Prospective client failed to respond with correct key.\n"); + close(client->s); + free(client); + } + free(client_key); + } + } else { + pthread_create(&tid, NULL, new_client, client); + } + } + +} + + + diff --git a/dpmert/sentserver.h b/dpmert/sentserver.h new file mode 100644 index 00000000..cd17a546 --- /dev/null +++ b/dpmert/sentserver.h @@ -0,0 +1,6 @@ +#ifndef SENTSERVER_H +#define SENTSERVER_H + +#define DEFAULT_PORT 50000 + +#endif diff --git a/dpmert/tac.pl b/dpmert/tac.pl new file mode 100755 index 00000000..9fb525c1 --- /dev/null +++ b/dpmert/tac.pl @@ -0,0 +1,8 @@ +#!/usr/bin/perl + +while(<>) { + chomp; + $|=1; + print (scalar reverse($_)); + print "\n"; +} diff --git a/dpmert/test_aer/README b/dpmert/test_aer/README new file mode 100644 index 00000000..819b2e32 --- /dev/null +++ b/dpmert/test_aer/README @@ -0,0 +1,8 @@ +To run the test: + +../dist-vest.pl --local --metric aer cdec.ini --source-file corpus.src --ref-files=ref.0 --weights weights + +This will optimize the parameters of the tiny lexical translation model +so as to minimize the AER of the Viterbi alignment on the development +set in corpus.src according to the reference alignments in ref.0. 
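The search that this test exercises is, at its core, an Och-style sweep along a
single direction in weight space: the corpus-level error can only change at
finitely many points along that direction, so it suffices to sort those points,
accumulate the change in error while sweeping, and return a value from the best
interval. A minimal self-contained sketch of that idea in C++ (the names and toy
numbers are illustrative only, and a single integer error count stands in for
the per-segment sufficient statistics that LineOptimizer::LineOptimize actually
accumulates):

    #include <algorithm>
    #include <cstddef>
    #include <iostream>
    #include <limits>
    #include <vector>

    // One event on the error surface: at position x along the search
    // direction, the error count changes by delta.
    struct ErrorEvent { double x; int delta; };

    // Sweep the sorted events and return an x inside the interval with the
    // lowest accumulated error (the midpoint of that interval, or a step
    // past the last boundary if the best interval is the rightmost one).
    double MinimizeAlongLine(std::vector<ErrorEvent> events) {
      std::sort(events.begin(), events.end(),
                [](const ErrorEvent& a, const ErrorEvent& b) { return a.x < b.x; });
      int cur = 0, best = std::numeric_limits<int>::max();
      double best_x = 0.0;
      for (std::size_t i = 0; i < events.size(); ++i) {
        cur += events[i].delta;
        if (cur < best) {
          best = cur;
          best_x = (i + 1 < events.size()) ? 0.5 * (events[i].x + events[i + 1].x)
                                           : events[i].x + 1.0;
        }
      }
      return best_x;
    }

    int main() {
      // toy surface: error rises to 3 at x=-0.5, drops to 1 at x=0.2,
      // rises to 2 at x=0.9
      std::vector<ErrorEvent> ev = {{-0.5, 3}, {0.2, -2}, {0.9, 1}};
      std::cout << "best x = " << MinimizeAlongLine(ev) << "\n";  // prints 0.55
      return 0;
    }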
+ diff --git a/dpmert/test_aer/cdec.ini b/dpmert/test_aer/cdec.ini new file mode 100644 index 00000000..08187848 --- /dev/null +++ b/dpmert/test_aer/cdec.ini @@ -0,0 +1,3 @@ +formalism=lextrans +grammar=grammar +aligner=true diff --git a/dpmert/test_aer/corpus.src b/dpmert/test_aer/corpus.src new file mode 100644 index 00000000..31b23971 --- /dev/null +++ b/dpmert/test_aer/corpus.src @@ -0,0 +1,3 @@ +el gato negro ||| the black cat +el gato ||| the cat +el libro ||| the book diff --git a/dpmert/test_aer/grammar b/dpmert/test_aer/grammar new file mode 100644 index 00000000..9d857824 --- /dev/null +++ b/dpmert/test_aer/grammar @@ -0,0 +1,12 @@ +el ||| cat ||| F1=1 +el ||| the ||| F2=1 +el ||| black ||| F3=1 +el ||| book ||| F11=1 +gato ||| cat ||| F4=1 NN=1 +gato ||| black ||| F5=1 +gato ||| the ||| F6=1 +negro ||| the ||| F7=1 +negro ||| cat ||| F8=1 +negro ||| black ||| F9=1 +libro ||| the ||| F10=1 +libro ||| book ||| F12=1 NN=1 diff --git a/dpmert/test_aer/ref.0 b/dpmert/test_aer/ref.0 new file mode 100644 index 00000000..734a9c5b --- /dev/null +++ b/dpmert/test_aer/ref.0 @@ -0,0 +1,3 @@ +0-0 1-2 2-1 +0-0 1-1 +0-0 1-1 diff --git a/dpmert/test_aer/weights b/dpmert/test_aer/weights new file mode 100644 index 00000000..afc9282e --- /dev/null +++ b/dpmert/test_aer/weights @@ -0,0 +1,13 @@ +F1 0.1 +F2 -.5980815 +F3 0.24235 +F4 0.625 +F5 0.4514 +F6 0.112316 +F7 -0.123415 +F8 -0.25390285 +F9 -0.23852 +F10 0.646 +F11 0.413141 +F12 0.343216 +NN -0.1215 diff --git a/dpmert/test_data/0.json.gz b/dpmert/test_data/0.json.gz new file mode 100644 index 00000000..30f8dd77 Binary files /dev/null and b/dpmert/test_data/0.json.gz differ diff --git a/dpmert/test_data/1.json.gz b/dpmert/test_data/1.json.gz new file mode 100644 index 00000000..c82cc179 Binary files /dev/null and b/dpmert/test_data/1.json.gz differ diff --git a/dpmert/test_data/c2e.txt.0 b/dpmert/test_data/c2e.txt.0 new file mode 100644 index 00000000..12c4abe9 --- /dev/null +++ b/dpmert/test_data/c2e.txt.0 @@ -0,0 +1,2 @@ +australia reopens embassy in manila +( afp , manila , january 2 ) australia reopened its embassy in the philippines today , which was shut down about seven weeks ago due to what was described as a specific threat of a terrorist attack . diff --git a/dpmert/test_data/c2e.txt.1 b/dpmert/test_data/c2e.txt.1 new file mode 100644 index 00000000..4ac12df1 --- /dev/null +++ b/dpmert/test_data/c2e.txt.1 @@ -0,0 +1,2 @@ +australia reopened manila embassy +( agence france-presse , manila , 2nd ) - australia reopened its embassy in the philippines today . the embassy was closed seven weeks ago after what was described as a specific threat of a terrorist attack . diff --git a/dpmert/test_data/c2e.txt.2 b/dpmert/test_data/c2e.txt.2 new file mode 100644 index 00000000..2f67b72f --- /dev/null +++ b/dpmert/test_data/c2e.txt.2 @@ -0,0 +1,2 @@ +australia to reopen embassy in manila +( afp report from manila , january 2 ) australia reopened its embassy in the philippines today . seven weeks ago , the embassy was shut down due to so-called confirmed terrorist attack threats . diff --git a/dpmert/test_data/c2e.txt.3 b/dpmert/test_data/c2e.txt.3 new file mode 100644 index 00000000..5483cef6 --- /dev/null +++ b/dpmert/test_data/c2e.txt.3 @@ -0,0 +1,2 @@ +australia to re - open its embassy to manila +( afp , manila , thursday ) australia reopens its embassy to manila , which was closed for the so-called " clear " threat of terrorist attack 7 weeks ago . 
diff --git a/dpmert/test_data/re.txt.0 b/dpmert/test_data/re.txt.0 new file mode 100644 index 00000000..86eff087 --- /dev/null +++ b/dpmert/test_data/re.txt.0 @@ -0,0 +1,5 @@ +erdogan states turkey to reject any pressures to urge it to recognize cyprus +ankara 12 - 1 ( afp ) - turkish prime minister recep tayyip erdogan announced today , wednesday , that ankara will reject any pressure by the european union to urge it to recognize cyprus . this comes two weeks before the summit of european union state and government heads who will decide whether or nor membership negotiations with ankara should be opened . +erdogan told " ntv " television station that " the european union cannot address us by imposing new conditions on us with regard to cyprus . +we will discuss this dossier in the course of membership negotiations . " +he added " let me be clear , i cannot sidestep turkey , this is something we cannot accept . " diff --git a/dpmert/test_data/re.txt.1 b/dpmert/test_data/re.txt.1 new file mode 100644 index 00000000..2140f198 --- /dev/null +++ b/dpmert/test_data/re.txt.1 @@ -0,0 +1,5 @@ +erdogan confirms turkey will resist any pressure to recognize cyprus +ankara 12 - 1 ( afp ) - the turkish head of government , recep tayyip erdogan , announced today ( wednesday ) that ankara would resist any pressure the european union might exercise in order to force it into recognizing cyprus . this comes two weeks before a summit of european union heads of state and government , who will decide whether or not to open membership negotiations with ankara . +erdogan said to the ntv television channel : " the european union cannot engage with us through imposing new conditions on us with regard to cyprus . +we shall discuss this issue in the course of the membership negotiations . " +he added : " let me be clear - i cannot confine turkey . this is something we do not accept . " diff --git a/dpmert/test_data/re.txt.2 b/dpmert/test_data/re.txt.2 new file mode 100644 index 00000000..94e46286 --- /dev/null +++ b/dpmert/test_data/re.txt.2 @@ -0,0 +1,5 @@ +erdogan confirms that turkey will reject any pressures to encourage it to recognize cyprus +ankara , 12 / 1 ( afp ) - the turkish prime minister recep tayyip erdogan declared today , wednesday , that ankara will reject any pressures that the european union may apply on it to encourage to recognize cyprus . this comes two weeks before a summit of the heads of countries and governments of the european union , who will decide on whether or not to start negotiations on joining with ankara . +erdogan told the ntv television station that " it is not possible for the european union to talk to us by imposing new conditions on us regarding cyprus . +we shall discuss this dossier during the negotiations on joining . " +and he added , " let me be clear . turkey's arm should not be twisted ; this is something we cannot accept . " diff --git a/dpmert/test_data/re.txt.3 b/dpmert/test_data/re.txt.3 new file mode 100644 index 00000000..f87c3308 --- /dev/null +++ b/dpmert/test_data/re.txt.3 @@ -0,0 +1,5 @@ +erdogan stresses that turkey will reject all pressures to force it to recognize cyprus +ankara 12 - 1 ( afp ) - turkish prime minister recep tayyip erdogan announced today , wednesday , that ankara would refuse all pressures applied on it by the european union to force it to recognize cyprus . that came two weeks before the summit of the presidents and prime ministers of the european union , who would decide on whether to open negotiations on joining with ankara or not . 
+erdogan said to " ntv " tv station that the " european union can not communicate with us by imposing on us new conditions related to cyprus . +we will discuss this file during the negotiations on joining . " +he added , " let me be clear . turkey's arm should not be twisted . this is unacceptable to us . " -- cgit v1.2.3 From e1eae0ac941aa76528d4673dbd35f214cdac23fb Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Thu, 2 Feb 2012 17:19:40 -0500 Subject: remove some dead code to clean things up --- decoder/1dev.ur | 1 - decoder/apply_fsa_models.README | 21 --- decoder/cdec-gz.ini | 7 - decoder/cdec-nolm-tuned.ini | 7 - decoder/decode.sh | 10 -- decoder/do.tests.sh | 1 - decoder/fsa-decode.sh | 3 - decoder/fsa-hiero.ini | 5 - decoder/fsa.ini | 2 - decoder/glue-lda.scfg | 8 -- decoder/grammar.hiero | 151 --------------------- decoder/perro.sh | 1 - decoder/perro.ur | 1 - decoder/short.ur | 1 - decoder/weights-fsa | 14 -- decoder/weights.hiero | 10 -- dpmert/tac.pl | 8 -- expLog | 60 --------- graehl/NOTES | 18 --- graehl/NOTES.beam | 29 ----- graehl/NOTES.earley | 111 ---------------- graehl/NOTES.lm.phrase | 180 -------------------------- graehl/NOTES.partial.binarize | 21 --- graehl/NOTES.wfsa | 16 --- rescore/cdec_kbest_to_zmert.pl | 64 --------- rescore/example/README | 4 - rescore/example/cdec.ini | 2 - rescore/example/hyp.txt | 5 - rescore/example/small.scfg | 9 -- rescore/example/source.txt | 2 - rescore/example/weights | 1 - rescore/generate_zmert_params_from_weights.pl | 26 ---- rescore/rerank.pl | 86 ------------ rescore/rescore_inv_model1.pl | 126 ------------------ rescore/rescore_with_cdec_model.pl | 121 ----------------- 35 files changed, 1132 deletions(-) delete mode 100755 decoder/1dev.ur delete mode 100755 decoder/apply_fsa_models.README delete mode 100755 decoder/cdec-gz.ini delete mode 100755 decoder/cdec-nolm-tuned.ini delete mode 100755 decoder/decode.sh delete mode 100755 decoder/do.tests.sh delete mode 100755 decoder/fsa-decode.sh delete mode 100755 decoder/fsa-hiero.ini delete mode 100755 decoder/fsa.ini delete mode 100755 decoder/glue-lda.scfg delete mode 100755 decoder/grammar.hiero delete mode 100755 decoder/perro.sh delete mode 100755 decoder/perro.ur delete mode 100755 decoder/short.ur delete mode 100644 decoder/weights-fsa delete mode 100755 decoder/weights.hiero delete mode 100755 dpmert/tac.pl delete mode 100644 expLog delete mode 100755 graehl/NOTES delete mode 100755 graehl/NOTES.beam delete mode 100755 graehl/NOTES.earley delete mode 100755 graehl/NOTES.lm.phrase delete mode 100755 graehl/NOTES.partial.binarize delete mode 100755 graehl/NOTES.wfsa delete mode 100755 rescore/cdec_kbest_to_zmert.pl delete mode 100644 rescore/example/README delete mode 100644 rescore/example/cdec.ini delete mode 100644 rescore/example/hyp.txt delete mode 100644 rescore/example/small.scfg delete mode 100644 rescore/example/source.txt delete mode 100644 rescore/example/weights delete mode 100755 rescore/generate_zmert_params_from_weights.pl delete mode 100755 rescore/rerank.pl delete mode 100755 rescore/rescore_inv_model1.pl delete mode 100755 rescore/rescore_with_cdec_model.pl (limited to 'dpmert') diff --git a/decoder/1dev.ur b/decoder/1dev.ur deleted file mode 100755 index adeaa101..00000000 --- a/decoder/1dev.ur +++ /dev/null @@ -1 +0,0 @@ -krAcy ( AstRAf rpwrtRr ) krAcy myN pyr kw mxtlf HAdvAt myN xAtwn smyt 4 AfrAd hlAk hw gyY jbkh smndr sY Ayk $xS ky lA$ mly . 
diff --git a/decoder/apply_fsa_models.README b/decoder/apply_fsa_models.README deleted file mode 100755 index 7e116a62..00000000 --- a/decoder/apply_fsa_models.README +++ /dev/null @@ -1,21 +0,0 @@ -trie root and trie lhs2[lhs-nodeid] -> trie node - -trie node edges (adj) - list of w,dest,p. dest==0 means it's a completed rule (note: p is redundant with node e.dest->p-p, except in case of dest=0). we will also use null_wordid (max_int) for dest=0 edges, but that doesn't matter - -we intersect by iterating over adj and scoring w/ fsa. TODO: index for sparse fsa; for now we assume smoothed ngram fsa where all items are scorable. - -predicted items: we don't make copies of the pending predictions as we scan toward completion; instead, item backpointers are followed until the prediction (where backpointer=0). such backpointer=0 items have a queue of prediction-originating items. - -reusing completed items using a lookup on pair [NT,a] -> all [NT,a,b] lazy best-first. b-next (right state) index in lazy index. - -perhaps predictors need to register the # of items it has already mated with. (b-next index) - -comb-like (cube) t-next (position in trie node edge list), b-next? or just check chart and don't redup. depends on whether we want just 1best or kbest deriv - diff. ways of reaching same result are good in kbest. - -types of chart items: - -A->t.*,a,b (trie node t) with mutable state t-next for generating successor lazily (vs. all at once) - -A->t.B,a,b (t-next of A->t.* points to (B,t')): mutable state b-next for choosing which B->b,? to use. note: such an item can't be queued immediately on its own, but can be added to the pending list of B->b,? ; once any B->b,? is completed then we see if any more b-next are already known; if they're exhausted then we add back to pending list? - -A->a,? - list of all known (b,inside prob) such that A[a,b]. we may also choose to represent this as A->.*,a,a. diff --git a/decoder/cdec-gz.ini b/decoder/cdec-gz.ini deleted file mode 100755 index f9b15420..00000000 --- a/decoder/cdec-gz.ini +++ /dev/null @@ -1,7 +0,0 @@ -cubepruning_pop_limit=200 -feature_function=WordPenalty -feature_function=ArityPenalty -add_pass_through_rules=true -formalism=scfg -grammar=mt09.grammar.gz -weights=weights.tune.nolm diff --git a/decoder/cdec-nolm-tuned.ini b/decoder/cdec-nolm-tuned.ini deleted file mode 100755 index 5ebab747..00000000 --- a/decoder/cdec-nolm-tuned.ini +++ /dev/null @@ -1,7 +0,0 @@ -cubepruning_pop_limit=200 -feature_function=WordPenalty -feature_function=ArityPenalty -add_pass_through_rules=true -formalism=scfg -grammar=mt09.grammar -weights=weights.tune.nolm diff --git a/decoder/decode.sh b/decoder/decode.sh deleted file mode 100755 index 677e64ad..00000000 --- a/decoder/decode.sh +++ /dev/null @@ -1,10 +0,0 @@ -d=$(dirname `readlink -f $0`)/ -decode() { -if [ "$lm" ] ; then - lmargs0=-F - lmargs1="LanguageModel lm.gz -n LM" -fi -set -x -$gdb ${cdec:=$d/cdec} -c $d/${cfg:=cdec-fsa}.ini -i $d/${in:=1dev.ur} $lmargs0 "$lmargs1" --show_features --show_config --show_weights "$@" -set +x -} diff --git a/decoder/do.tests.sh b/decoder/do.tests.sh deleted file mode 100755 index b3ddeb18..00000000 --- a/decoder/do.tests.sh +++ /dev/null @@ -1 +0,0 @@ -for f in *_test; do ./$f; done diff --git a/decoder/fsa-decode.sh b/decoder/fsa-decode.sh deleted file mode 100755 index 66879523..00000000 --- a/decoder/fsa-decode.sh +++ /dev/null @@ -1,3 +0,0 @@ -d=$(dirname `readlink -f $0`)/ -. 
$d/decode.sh -in=1dev.ur cfg=cdec-fsa decode diff --git a/decoder/fsa-hiero.ini b/decoder/fsa-hiero.ini deleted file mode 100755 index 7c7d0347..00000000 --- a/decoder/fsa-hiero.ini +++ /dev/null @@ -1,5 +0,0 @@ -formalism=scfg -scfg_extra_glue_grammar=glue-lda.scfg -grammar=grammar.hiero -show_tree_structure=true -weights=weights.hiero diff --git a/decoder/fsa.ini b/decoder/fsa.ini deleted file mode 100755 index 571a2e34..00000000 --- a/decoder/fsa.ini +++ /dev/null @@ -1,2 +0,0 @@ -feature_function=ShorterThanPrev -feature_function=LongerThanPrev diff --git a/decoder/glue-lda.scfg b/decoder/glue-lda.scfg deleted file mode 100755 index 27489817..00000000 --- a/decoder/glue-lda.scfg +++ /dev/null @@ -1,8 +0,0 @@ -[S] ||| [S,1] [X0,2] ||| [1] [2] ||| Glue=1 -[S] ||| [X0,1] ||| [1] ||| GlueTop=1 -[S] ||| [S,1] [X1,2] ||| [1] [2] ||| Glue=1 -[S] ||| [X1,1] ||| [1] ||| GlueTop=1 -[S] ||| [S,1] [X2,2] ||| [1] [2] ||| Glue=1 -[S] ||| [X2,1] ||| [1] ||| GlueTop=1 -[S] ||| [S,1] [X3,2] ||| [1] [2] ||| Glue=1 -[S] ||| [X3,1] ||| [1] ||| GlueTop=1 diff --git a/decoder/grammar.hiero b/decoder/grammar.hiero deleted file mode 100755 index 79adf33a..00000000 --- a/decoder/grammar.hiero +++ /dev/null @@ -1,151 +0,0 @@ -[X] ||| . ||| . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| [X,1] . ||| [1] . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| [X,1] anciano ||| [1] old man ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=1.38629 -[X] ||| [X,1] anciano . ||| [1] old man . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=1.38629 -[X] ||| [X,1] anciano [X,2] ||| [1] old man [2] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=1.38629 -[X] ||| [X,1] feo ||| ugly [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| [X,1] feo . ||| ugly [1] . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| [X,1] feo [X,2] ||| ugly [1] [2] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| [X,1] gato ||| [1] cat ||| EgivenF=0.405465 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| [X,1] gato . ||| [1] cat . ||| EgivenF=0.405465 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| [X,1] gato [X,2] ||| [1] [2] cat ||| EgivenF=0 FgivenE=1.09861 LexEgivenF=0 LexFgivenE=0 -[X] ||| [X,1] gato [X,2] ||| [1] cat [2] ||| EgivenF=0 FgivenE=0.405465 LexEgivenF=0 LexFgivenE=0 -[X] ||| [X,1] gato [X,2] . ||| [1] [2] cat . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| [X,1] gato negro ||| [1] black cat ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| [X,1] gato negro . ||| [1] black cat . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| [X,1] gato negro [X,2] ||| [1] black cat [2] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| [X,1] grande ||| big [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| [X,1] grande . ||| big [1] . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| [X,1] grande [X,2] ||| big [1] [2] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| [X,1] negro ||| black [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| [X,1] negro . ||| black [1] . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| [X,1] negro [X,2] ||| black [1] [2] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| [X,1] oruga ||| [1] caterpiller ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| [X,1] oruga . ||| [1] caterpiller . 
||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| [X,1] oruga [X,2] ||| [1] caterpiller [2] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| [X,1] patito [X,2] ||| [1] [2] duckling ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| [X,1] patito [X,2] . ||| [1] [2] duckling . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| [X,1] patito feo ||| [1] ugly duckling ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| [X,1] patito feo . ||| [1] ugly duckling . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| [X,1] patito feo [X,2] ||| [1] ugly duckling [2] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| [X,1] peces ||| [1] fish ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| [X,1] peces . ||| [1] fish . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| [X,1] peces [X,2] ||| [1] fish [2] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| [X,1] perro ||| [1] dog ||| EgivenF=0.405465 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| [X,1] perro . ||| [1] dog . ||| EgivenF=0.405465 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| [X,1] perro [X,2] ||| [1] dog [2] ||| EgivenF=0 FgivenE=0.405465 LexEgivenF=0 LexFgivenE=0 -[X] ||| [X,1] perro [X,2] ||| [1] [2] dog ||| EgivenF=0 FgivenE=1.09861 LexEgivenF=0 LexFgivenE=0 -[X] ||| [X,1] perro [X,2] . ||| [1] [2] dog . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| [X,1] perro grande ||| [1] big dog ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| [X,1] perro grande . ||| [1] big dog . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| [X,1] perro grande [X,2] ||| [1] big dog [2] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| [X,1] pájaro [X,2] ||| [1] [2] bird ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| [X,1] pájaro [X,2] . ||| [1] [2] bird . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| [X,1] pájaro negro ||| [1] black bird ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| [X,1] pájaro negro . ||| [1] black bird . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| [X,1] pájaro negro [X,2] ||| [1] black bird [2] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| anciano ||| old man ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=1.38629 -[X] ||| anciano . ||| old man . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=1.38629 -[X] ||| anciano [X,1] ||| old man [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=1.38629 -[X] ||| el ||| the ||| EgivenF=0.287682 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0 -[X] ||| el [X,1] ||| the [1] ||| EgivenF=0.287682 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0 -[X] ||| el [X,1] . ||| the [1] . ||| EgivenF=0.287682 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0 -[X] ||| el [X,1] feo ||| the ugly [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0 -[X] ||| el [X,1] feo . ||| the ugly [1] . ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0 -[X] ||| el [X,1] feo [X,2] ||| the ugly [1] [2] ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0 -[X] ||| el [X,1] grande ||| the big [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0 -[X] ||| el [X,1] grande . ||| the big [1] . ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0 -[X] ||| el [X,1] grande [X,2] ||| the big [1] [2] ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0 -[X] ||| el [X,1] negro ||| the black [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0 -[X] ||| el [X,1] negro . ||| the black [1] . 
||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0 -[X] ||| el [X,1] negro [X,2] ||| the black [1] [2] ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0 -[X] ||| el gato ||| the cat ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0 -[X] ||| el gato . ||| the cat . ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0 -[X] ||| el gato [X,1] ||| the [1] cat ||| EgivenF=0 FgivenE=0.693147 LexEgivenF=0.287682 LexFgivenE=0 -[X] ||| el gato [X,1] ||| the cat [1] ||| EgivenF=0 FgivenE=0.693147 LexEgivenF=0.287682 LexFgivenE=0 -[X] ||| el gato [X,1] . ||| the [1] cat . ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0 -[X] ||| el gato negro ||| the black cat ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0 -[X] ||| el gato negro . ||| the black cat . ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0 -[X] ||| el gato negro [X,1] ||| the black cat [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0 -[X] ||| el patito [X,1] ||| the [1] duckling ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0 -[X] ||| el patito [X,1] . ||| the [1] duckling . ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0 -[X] ||| el patito feo ||| the ugly duckling ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0 -[X] ||| el patito feo . ||| the ugly duckling . ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0 -[X] ||| el patito feo [X,1] ||| the ugly duckling [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0 -[X] ||| el perro ||| the dog ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0 -[X] ||| el perro . ||| the dog . ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0 -[X] ||| el perro [X,1] ||| the [1] dog ||| EgivenF=0 FgivenE=0.693147 LexEgivenF=0.287682 LexFgivenE=0 -[X] ||| el perro [X,1] ||| the dog [1] ||| EgivenF=0 FgivenE=0.693147 LexEgivenF=0.287682 LexFgivenE=0 -[X] ||| el perro [X,1] . ||| the [1] dog . ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0 -[X] ||| el perro grande ||| the big dog ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0 -[X] ||| el perro grande . ||| the big dog . ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0 -[X] ||| el perro grande [X,1] ||| the big dog [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0 -[X] ||| el pájaro [X,1] ||| the [1] bird ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0 -[X] ||| el pájaro [X,1] . ||| the [1] bird . ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0 -[X] ||| el pájaro negro ||| the black bird ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0 -[X] ||| el pájaro negro . ||| the black bird . ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0 -[X] ||| el pájaro negro [X,1] ||| the black bird [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0 -[X] ||| eso ||| that ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| eso [X,1] ||| that [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| eso [X,1] . ||| that [1] . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| eso perro ||| that dog ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| eso perro . ||| that dog . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| eso perro [X,1] ||| that dog [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| este ||| this ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| este [X,1] ||| this [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| este [X,1] . ||| this [1] . 
||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| este anciano ||| this old man ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=1.38629 -[X] ||| este anciano . ||| this old man . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=1.38629 -[X] ||| este anciano [X,1] ||| this old man [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=1.38629 -[X] ||| este gato ||| this cat ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| este gato . ||| this cat . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| este gato [X,1] ||| this cat [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| feo ||| ugly ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| gato ||| cat ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| gato . ||| cat . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| gato [X,1] ||| [1] cat ||| EgivenF=1.09861 FgivenE=1.09861 LexEgivenF=0 LexFgivenE=0 -[X] ||| gato [X,1] ||| cat [1] ||| EgivenF=0 FgivenE=0.405465 LexEgivenF=0 LexFgivenE=0 -[X] ||| gato [X,1] . ||| [1] cat . ||| EgivenF=1.09861 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| gato negro ||| black cat ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| gato negro . ||| black cat . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| gato negro [X,1] ||| black cat [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| grande ||| big ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| la ||| the ||| EgivenF=2.07944 FgivenE=0 LexEgivenF=2.07944 LexFgivenE=0 -[X] ||| la [X,1] ||| the [1] ||| EgivenF=2.07944 FgivenE=0 LexEgivenF=2.07944 LexFgivenE=0 -[X] ||| la [X,1] . ||| the [1] . ||| EgivenF=2.07944 FgivenE=0 LexEgivenF=2.07944 LexFgivenE=0 -[X] ||| la oruga ||| the caterpiller ||| EgivenF=0 FgivenE=0 LexEgivenF=2.07944 LexFgivenE=0 -[X] ||| la oruga . ||| the caterpiller . ||| EgivenF=0 FgivenE=0 LexEgivenF=2.07944 LexFgivenE=0 -[X] ||| la oruga [X,1] ||| the caterpiller [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=2.07944 LexFgivenE=0 -[X] ||| los ||| the ||| EgivenF=2.07944 FgivenE=0 LexEgivenF=2.07944 LexFgivenE=0 -[X] ||| los [X,1] ||| the [1] ||| EgivenF=2.07944 FgivenE=0 LexEgivenF=2.07944 LexFgivenE=0 -[X] ||| los [X,1] . ||| the [1] . ||| EgivenF=2.07944 FgivenE=0 LexEgivenF=2.07944 LexFgivenE=0 -[X] ||| los peces ||| the fish ||| EgivenF=0 FgivenE=0 LexEgivenF=2.07944 LexFgivenE=0 -[X] ||| los peces . ||| the fish . ||| EgivenF=0 FgivenE=0 LexEgivenF=2.07944 LexFgivenE=0 -[X] ||| los peces [X,1] ||| the fish [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=2.07944 LexFgivenE=0 -[X] ||| negro ||| black ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| oruga ||| caterpiller ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| oruga . ||| caterpiller . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| oruga [X,1] ||| caterpiller [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| patito ||| duckling ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| patito [X,1] ||| [1] duckling ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| patito [X,1] . ||| [1] duckling . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| patito feo ||| ugly duckling ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| patito feo . ||| ugly duckling . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| patito feo [X,1] ||| ugly duckling [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| peces ||| fish ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| peces . ||| fish . 
||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| peces [X,1] ||| fish [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| perro ||| dog ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| perro . ||| dog . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| perro [X,1] ||| [1] dog ||| EgivenF=1.09861 FgivenE=1.09861 LexEgivenF=0 LexFgivenE=0 -[X] ||| perro [X,1] ||| dog [1] ||| EgivenF=0 FgivenE=0.405465 LexEgivenF=0 LexFgivenE=0 -[X] ||| perro [X,1] . ||| [1] dog . ||| EgivenF=1.09861 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| perro grande ||| big dog ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| perro grande . ||| big dog . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| perro grande [X,1] ||| big dog [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| pájaro ||| bird ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| pájaro [X,1] ||| [1] bird ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| pájaro [X,1] . ||| [1] bird . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| pájaro negro ||| black bird ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| pájaro negro . ||| black bird . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| pájaro negro [X,1] ||| black bird [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 diff --git a/decoder/perro.sh b/decoder/perro.sh deleted file mode 100755 index 3e54ac71..00000000 --- a/decoder/perro.sh +++ /dev/null @@ -1 +0,0 @@ -$gdb $cdec "$@" -k 30 --show_features -c fsa-hiero.ini -i perro.ur diff --git a/decoder/perro.ur b/decoder/perro.ur deleted file mode 100755 index 6c5da6d7..00000000 --- a/decoder/perro.ur +++ /dev/null @@ -1 +0,0 @@ -eso perro feo diff --git a/decoder/short.ur b/decoder/short.ur deleted file mode 100755 index 48612801..00000000 --- a/decoder/short.ur +++ /dev/null @@ -1 +0,0 @@ -krAcy myN pyr kw mxtlf HAdvAt diff --git a/decoder/weights-fsa b/decoder/weights-fsa deleted file mode 100644 index 3cc96c2f..00000000 --- a/decoder/weights-fsa +++ /dev/null @@ -1,14 +0,0 @@ -Arity_0 1.70741473606976 -Arity_1 1.12426238048012 -Arity_2 1.14986187839554 -Glue -0.04589037041388 -LanguageModel 1.09051 -LM 1.09051 -PassThrough -3.66226367902928 -PhraseModel_0 -1.94633451863252 -PhraseModel_1 -0.1475347695476 -PhraseModel_2 -1.614818994946 -WordPenalty -3.0 -WordPenaltyFsa -0.56028442964748 -ShorterThanPrev -10 -LongerThanPrev -10 diff --git a/decoder/weights.hiero b/decoder/weights.hiero deleted file mode 100755 index 6747f059..00000000 --- a/decoder/weights.hiero +++ /dev/null @@ -1,10 +0,0 @@ -SameFirstLetter 1 -LongerThanPrev 1 -ShorterThanPrev 1 -GlueTop 0.0 -Glue -1.0 -EgivenF -0.5 -FgivenE -0.5 -LexEgivenF -0.5 -LexFgivenE -0.5 -LM 1 diff --git a/dpmert/tac.pl b/dpmert/tac.pl deleted file mode 100755 index 9fb525c1..00000000 --- a/dpmert/tac.pl +++ /dev/null @@ -1,8 +0,0 @@ -#!/usr/bin/perl - -while(<>) { - chomp; - $|=1; - print (scalar reverse($_)); - print "\n"; -} diff --git a/expLog b/expLog deleted file mode 100644 index 2070ac98..00000000 --- a/expLog +++ /dev/null @@ -1,60 +0,0 @@ -TIME MEASURES AFTER MERGE WITH cdec: -8/July/2011 -commit ed8a6e81d87f6e917ecf - -./runEval -Fri Jul 8 13:28:23 CEST 2011 -Fri Jul 8 13:30:24 CEST 2011 -Loading references (4 files) -Loaded reference translations for 919 sentences. 
-Loaded 919 references for scoring with ibm_bleu -BLEU = 32.25, 76.5|43.1|24.3|13.9 (brev=0.993) -0.322487 -Fri Jul 8 13:30:24 CEST 2011 ------------- -Fri Jul 8 15:04:00 CEST 2011 -Fri Jul 8 15:05:58 CEST 2011 -Time required for Cube Pruning execution: 77.61 seconds. ------------- -Fri Jul 8 15:24:39 CEST 2011 -Fri Jul 8 15:26:36 CEST 2011 -Time required for Cube Pruning execution: 79.01 seconds. ------------- - -./runEvalFCP -Fri Jul 8 13:33:17 CEST 2011 -Fri Jul 8 13:35:06 CEST 2011 -Loading references (4 files) -Loaded reference translations for 919 sentences. -Loaded 919 references for scoring with ibm_bleu -BLEU = 32.39, 76.5|43.1|24.5|14.0 (brev=0.994) -0.323857 -Fri Jul 8 13:35:07 CEST 2011 ------------- -Fri Jul 8 15:08:17 CEST 2011 -Fri Jul 8 15:10:05 CEST 2011 -Time required for Cube Pruning execution: 69.36 seconds. ------------- -Fri Jul 8 15:21:48 CEST 2011 -Fri Jul 8 15:23:35 CEST 2011 -Time required for Cube Pruning execution: 69.71 seconds. ------------- - -./runEvalFCP2 -Fri Jul 8 13:53:38 CEST 2011 -Fri Jul 8 13:55:29 CEST 2011 -Loading references (4 files) -Loaded reference translations for 919 sentences. -Loaded 919 references for scoring with ibm_bleu -BLEU = 32.49, 76.6|43.2|24.5|14.1 (brev=0.994) -0.324901 -Fri Jul 8 13:55:29 CEST 2011 ------------- -Fri Jul 8 15:12:52 CEST 2011 -Fri Jul 8 15:14:42 CEST 2011 -Time required for Cube Pruning execution: 72.66 seconds. ------------- -Fri Jul 8 15:19:13 CEST 2011 -Fri Jul 8 15:21:03 CEST 2011 -Time required for Cube Pruning execution: 72.06 seconds. ------------- diff --git a/graehl/NOTES b/graehl/NOTES deleted file mode 100755 index 77e99fee..00000000 --- a/graehl/NOTES +++ /dev/null @@ -1,18 +0,0 @@ -BUG: tune is bad - urdu conf=baseline tuning (16 dev bleu score???) - -conf=baseline force=1 ./tune.sh - - decode is good. - - UPDATE: maybe tuning is fine; chris never gave me a dev-corpus-filtered grammar and so a bleu of 16 may be what we always got; i just never checked. this means i need to redo tuned-first-pass experiments - -valgrind is ok - - dist-vest? - - (changes made to scoring? plusequals? shared_ptr? small_vector?) - - scorer_test is good - - - line_optimer fast_score scorer diff --git a/graehl/NOTES.beam b/graehl/NOTES.beam deleted file mode 100755 index a48d1ab7..00000000 --- a/graehl/NOTES.beam +++ /dev/null @@ -1,29 +0,0 @@ -(graehl, comments on code) - -passive chart: completion of actual translation rules (X or S NT in Hiero), have -rule features. Hyperedge inserted with copy of rule feature vector -(non-sparse). Inefficient; should be postponed on intermediate parses with -global pruning; just keep pointer to rules and models must provide an interface -to build a (sparse) feat. vector on demand later for the stuff we keep. - -multithreading: none. list of hyperarcs for refinement would need to be -segregated into subforest blocks and have own output lists for later merging. -e.g. bottom up count number of tail-reachable nodes under each hypernode, then -assign to workers. - -ngram caching: trie, no locks, for example. for threading, LRU hashing w/ locks per bucket is probably better, or per-thread caches. probably cache is reset per sentence? - -randlm worth using? guess not. - -actually get all 0-state models in 1st pass parse and prune passive edges per span. - -allocate cube pruning budget per prev pass - -(has been tried in ISI decoder) models with nonstandard state comparison, -typically (partially) greedy forest scoring, some part of the state is excluded -from equality/hashing. 
Within virtual ff interface, would just add equals, hash -to vtable (rather than the faster raw state compare). If this is done often, -then add a nonvirtual flag to interface instead, saying whether to use the -virt. methods or not. or: simple flag by user of ApplyModels (per feature?) -saying whether to be 100% greedy or 0% - no halfway e.g. item name uses bigram -context, but score using 5gram state. diff --git a/graehl/NOTES.earley b/graehl/NOTES.earley deleted file mode 100755 index 0953708c..00000000 --- a/graehl/NOTES.earley +++ /dev/null @@ -1,111 +0,0 @@ -1. fsts (modify target string produced) are quite feasible. augment fsa ff to not just emit features, but also new target words. composition or intersection is no problem (can always bunch into a single FSA/FST lazily by wrapping) - -2. sparse fsas (many transitions have -inf score) aren't efficiently supported presently (they are in early_composer where the fsa is a phrase table); the fsa ff interface doesn't even provide a way to query the non-0 transitions (you have to emit a -inf feature). if sparse fsas were expected often and we wanted exact search, then a similar index of the tcfg as in earley_composer would make sense. however, with prob. beam search, we prune out bad-scoring stuff anyway - -3. binarization of rhs isn't usually considered necessary in earley, but i liked the idea of optimal binarization making the most sharing possible. however, this means what would have just been a scan is now a scan+complete. - -4. prefix (forward) + inside cost. this is phrased in a way so that prefix includes inside. but there's no reason not to think of it in exclusive terms (outside,inside) where prefix=outside*inside when using viterbi. on the other hand, we usually use the outside*inside as the beam score. and furthermore, it looks like when summing over all derivations, there would be some difficulty calculating, as the total inside wouldn't be known at first. - -(a,i) r => (+=a*r,r) would become (o,i) r => (+=[(o*i*r)/r],r) = (+=o*i,r) -(_,b'') (a,b) => (+=a*b'',+=b*b'') would become (_,b'') (o,b) => (?????) - -==== - - -the target CFG (tcfg) is finite - absolutely no cycles. conceptually we're intersecting it with a wfsa (weights are feature vectors), which is a lot like parsing a lattice, in that states are (source state, dest state) pairs and you're covering some string(s) that go from source->dest in the wfsa. - -Chris' paper http://www.ling.umd.edu/~redpony/forest-reordering.pdf - apparently (figure 5) already contains the exact concept we're going for, albeit with only inside scores. http://www.speech.sri.com/cgi-bin/run-distill?ftp:papers/stolcke-cl95.ps.gz describes a nice way of computing sums over derivations given a string by keeping a tuple of ("forward","inner") scores while Earley parsing. I'm not sure yet if this is applicable (because we'll already have the exact outside scores from the -LM forest already, and plan on using cost pushing toward the top so we don't have to explicitly consider them). - -normally in earley, one word is consumed at a time, left to right. completions happen from shortest to longest, then (repeated) predictions, and finally scans. i'm sure this has the usual obvious extension to parsing lattices (proceed in some topological order). 
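The (forward, inner) bookkeeping these notes refer to is easier to see in code. Below is a minimal illustrative C++ sketch of the Stolcke-style score pair carried on each dotted item, using the prediction and completion updates quoted further down in these notes; the item and function names are invented for this example and are not cdec types, and the scores are sums over derivations rather than Viterbi maxima.

    // Illustrative sketch only: the (forward, inner) score pair attached to
    // each Earley item, following the Stolcke-style updates described in
    // these notes. Names and types are invented; they are not cdec classes.
    #include <cstdio>

    struct Item {
      double forward;  // "alpha": sum of probabilities of paths reaching the item
      double inner;    // "beta": sum of derivation probabilities for the covered span
    };

    // Prediction: from X[k,i] -> x.Yz with scores (a, b) and a rule Y -> r
    // with probability p, the predicted item Y[i,i] -> .r gets
    // forward += a*p and inner initialized to p. Scanning changes neither.
    void Predict(const Item& parent, double rule_prob, Item* predicted) {
      predicted->forward += parent.forward * rule_prob;
      predicted->inner = rule_prob;
    }

    // Completion: Y[j,i] -> y. with scores (a'', b'') completes
    // X[k,j] -> r.Ys with scores (a, b), giving X[k,i] -> rY.s with
    // forward += a*b'' and inner += b*b''.
    void Complete(const Item& active, const Item& completed, Item* result) {
      result->forward += active.forward * completed.inner;
      result->inner += active.inner * completed.inner;
    }

    int main() {
      Item start{1.0, 1.0};
      Item predicted{0.0, 0.0};
      Predict(start, 0.5, &predicted);        // predict Y -> r with p = 0.5
      Item completed{0.0, 0.5};               // Y fully recognized over its span
      Item advanced{0.0, 0.0};
      Complete(start, completed, &advanced);  // move the dot past Y
      std::printf("forward=%g inner=%g\n", advanced.forward, advanced.inner);
      return 0;
    }

Replacing the += accumulations with a max would give the Viterbi variant of the same bookkeeping that the notes also consider.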
- -but because the wfsa (ngram lm) has cycles and forgets the length of the string (at some point), it's slightly more complicated than lattice parsing the tcfg - there's no topological order over the wfsa states and so you can't finish all the items [x,j] for j from left->right. best first with monotonic total scores (admissable heuristics) is an easy way to avoid generating the whole space; otherwise I can't think of a fixed order that would allow for alternative beaming. as we discussed, arbitrary predicates filtering candidate items can be added if exact best-first is too slow - -if the wfsa were just a single string t[0...n-1], then any time you have an item [i,j]X->a.b that means that there's some derivation in the tCFG of S =>* t[0...i-1]Xc => t[0....i-1]abc =>* t[0...j-1]bc , for a FSA, the analog is S =>* W(0,i)Xc => W(0,i)abc =>* W(0,i)W(i,j)bc where W(a,b) means any string in the wfsa language with a as the start state and b as the final state. - - -http://www.isi.edu/natural-language/teaching/cs544/cs544-huang-3-Earley.pdf - -http://www.isi.edu/~lhuang/dp-final.pdf (variation on stolcke 1995 prefix cost) - -http://acl.ldc.upenn.edu/P/P07/P07-1019.pdf - phrase based lazy priority queue "cube growing" descendants (p149) - - - - - -http://www.speech.sri.com/cgi-bin/run-distill?ftp:papers/stolcke-cl95.ps.gz - -http://www.icsi.berkeley.edu/~stolcke/papers/cl95/node10.html#SECTION00042000000000000000 - -a) An (unconstrained) Earley path, or simply path, is a sequence of Earley -states linked by prediction, scanning, or completion. For the purpose of -this definition, we allow scanning to operate in “generation mode,” i.e., all -states with terminals to the right of the dot can be scanned, not just those -matching the input. (For completed states, the predecessor state is defined -to be the complete state from the same state set contributing to the -completion.) -b) A path is said to be constrained by, or generate a string x if the terminals -immediately to the left of the dot in all scanned states, in sequence, form -the string x. -c) A path is complete if the last state on it matches the first, except that the -dot has moved to the end of the RHS. -d) We say that a path starts with nonterminal X if the first state on it is a -predicted statewith X on the LHS. -e) The length of a path is defined as the number of scanned states on it. - -Note that the definition of path length is somewhat counter-intuitive, but is motivated -by the fact that only scanned states correspond directly to input symbols. Thus, -the length of a path is always the same as the length of the input string it generates. - -A constrained path starting with the initial state contains a sequence of states from -state set 0 derived by repeated prediction, followed by a single state from set 1 produced -by scanning the first symbol, followed by a sequence of states produced by completion, -followed by a sequence of predicted states, followed by a state scanning the second -symbol, and so on. The significance of Earley paths is that they are in a one-to-one -correspondence with left-most derivations. - - -============= - -The forward probability alpha_i(X[k]->x.y) is the sum of the probabilities of all -constrained paths of length that end in state X[k]->x.y - -b) The inner probability beta_i(X[k]->x.y) is the sum of the probabilities of all -paths of length i-k that start in state X[k,k]->.xy and end in X[k,i]->x.y, and generate the input symbols x[k,...,i-1] - -(forward,inner) [i.e. (outside,inside)?] 
- unchanged by scan (rule cost is paid up front when predicting) - -if X[k,i] -> x.Yz (a,b) and rule Y -> r (p) -then Y[i,i] -> .r (a',p) with a' += a*p - -if Y[j,i]->y. (a'',b'') and X[k,j]->r.Ys (a,b) -then X[k,i]->rY.s (a',b') with a' += a*b'', b' += b*b'' - -(this is summing over all derivations) - - -========== - -is forward cost viterbi fine? i.e. can i have items whose names ignore the lhs NT (look up predictions that i finish lazily / graph structured?) -====== - -1) A -> x . * (trie) - -this is somewhat nice. cost pushed for best first, of course. similar benefit as left-branching binarization without the explicit predict/complete steps? - -vs. just - -2) * -> x . y - -here you have to potentially list out all A -> . x y as items * -> . x y immediately, and shared rhs seqs won't be shared except at the usual single-NT predict/complete. of course, the prediction of items -> . x y can occur lazy best-first. - -vs. - -3) * -> x . * - -with 3, we predict all sorts of useless items - that won't give us our goal A and may not partcipate in any parse. this is not a good option at all. - -====== - --LM forest may have many in-edges per V. (many rules per NT lhs). so instead of generating all successors for scan/predict, i wanted to have them in sorted (admissable) -LM cost order and postpone once the prefix+rule part is more expensive than something else in the agenda. question: how many such postponed successor things diff --git a/graehl/NOTES.lm.phrase b/graehl/NOTES.lm.phrase deleted file mode 100755 index e87cc6fb..00000000 --- a/graehl/NOTES.lm.phrase +++ /dev/null @@ -1,180 +0,0 @@ -possibly the most direct solution is to print every individual probability from LM (to global fstream?). since the difference happens even w/o shortening, disable shortening to remove the possible effect of floor(p+bo) vs floor(p)+bo disagreeing - -+LM forest (nodes/edges): 2163/11293 - +LM forest (paths): 7.14685e+14 - +LM forest Viterbi logp: -490.21 - +LM forest Viterbi: karachi ( AstRAf rpwrtRr ) in karachi on monday in different HAdvAt gyY and killed 4 people including a woman found dead body of a person from the sea . - +LM forest derivation: ({<0,28>[1] ||| (final r->l(( karachi| ) start=[ ]->{karachi (} r->l(|. 
sea) end=[sea .]->{} LanguageModelFsa=-5.74766; }({<0,28>[1] [2] ||| [karachi ( : [a woman]] r->l(( karachi|) [found dead : [sea .]] r->l(dead found|woman a) = [karachi ( : [sea .]] LanguageModelFsa=-5.93027 h=-5.83552); }({<0,20>[1] [2] ||| [karachi ( : []] r->l(( karachi|) [gyY and : [a woman]] r->l(and gyY|) = [karachi ( : [a woman]] LanguageModelFsa=-101.72 h=-5.83552); }({<0,12>[1] [2] ||| [karachi ( : [in karachi]] r->l(( karachi|) [on monday : []] r->l(monday on|karachi in) = [karachi ( : []] LanguageModelFsa=-1.99946 h=-5.83552); }({<0,7>[1] [2] ||| [karachi ( : [rpwrtRr )]] r->l(( karachi|) [in karachi : [in karachi]] r->l(karachi in|) rpwrtRr) = [karachi ( : [in karachi]] LanguageModelFsa=-3.40247 h=-5.83552); }({<0,5>[1] [2] ||| [karachi ( : []] r->l(( karachi|) [rpwrtRr ) : [rpwrtRr )]] r->l() rpwrtRr|) = [karachi ( : [rpwrtRr )]] LanguageModelFsa=-102.623 h=-5.83552); }({<0,3>[1] ||| [karachi ( : []] r->l(( karachi|) = [karachi ( : []] LanguageModelFsa=0 h=-5.83552); }({<0,3>karachi [1] ||| [( AstRAf : []] r->l(( karachi|) r->l(AstRAf|( karachi) = [karachi ( : []] LanguageModelFsa=-100 h=-5.83552); r->l(karachi|)}({<1,3>( [1] ||| [AstRAf] r->l(AstRAf (|) = [( AstRAf : []] LanguageModelFsa=0 h=-102.641); r->l((|)}({<2,3>AstRAf ||| r->l(AstRAf|) = [AstRAf] LanguageModelFsa=0 h=-100); r->l(AstRAf|)}) ) ) ) ({<3,5>[1] ) ||| [rpwrtRr] r->l() rpwrtRr|) = [rpwrtRr ) : [rpwrtRr )]] LanguageModelFsa=0 h=-102.623); r->l()|)}({<3,4>rpwrtRr ||| r->l(rpwrtRr|) = [rpwrtRr] LanguageModelFsa=0 h=-100); r->l(rpwrtRr|)}) ) ) ({<5,7>in karachi ||| r->l(karachi in|) = [in karachi : [in karachi]] LanguageModelFsa=0 h=-3.80404); r->l(karachi in|)}) ) ({<7,12>on monday in [1] ||| r->l(monday on|) rule-phrase[in] r->l(in|monday on) [different HAdvAt : []] r->l(HAdvAt different|in monday) = [on monday : []] LanguageModelFsa=-103.918 h=-3.91305); r->l(in monday on|)}({<9,11>different [1] ||| [HAdvAt] r->l(HAdvAt different|) = [different HAdvAt : []] LanguageModelFsa=0 h=-103.573); r->l(different|)}({<10,11>HAdvAt ||| r->l(HAdvAt|) = [HAdvAt] LanguageModelFsa=0 h=-100); r->l(HAdvAt|)}) ) ) ) ({<12,20>[2] killed [1] ||| [gyY and : [gyY and]] r->l(and gyY|) rule-phrase[killed] r->l(killed|and gyY) [4 people : [a woman]] r->l(people 4|killed and) = [gyY and : [a woman]] LanguageModelFsa=-5.57026 h=-101.72); r->l(killed|)}({<12,16>[2] people including a [1] ||| [4] r->l(people 4|) rule-phrase[including a] r->l(a including|people 4) [woman] r->l(woman|a including) = [4 people : [a woman]] LanguageModelFsa=-3.99305 h=-6.22734); r->l(a including people|)}({<12,13>woman ||| r->l(woman|) = [woman] LanguageModelFsa=0 h=-3.82934); r->l(woman|)}) ({<14,15>4 ||| r->l(4|) = [4] LanguageModelFsa=0 h=-3.62974); r->l(4|)}) ) ({<18,20>[1] and ||| [gyY] r->l(and gyY|) = [gyY and : [gyY and]] LanguageModelFsa=0 h=-101.72); r->l(and|)}({<18,19>gyY ||| r->l(gyY|) = [gyY] LanguageModelFsa=0 h=-100); r->l(gyY|)}) ) ) ) ({<20,28>[1] the sea . ||| [found dead : [ from]] r->l(dead found|) rule-phrase[the sea .] r->l(. sea the|from ) = [found dead : [sea .]] LanguageModelFsa=-4.84745 h=-7.62839); r->l(. 
sea the|)}({<21,27>found [1] from ||| [dead body : []] r->l(dead found|) r->l(body|dead found) rule-phrase[from] r->l(from|) = [found dead : [ from]] LanguageModelFsa=-3.42491 h=-7.62839); r->l(found|) r->l(from|)}({<22,26>dead body of [1] ||| r->l(body dead|) rule-phrase[of] r->l(of|body dead) [a person : []] r->l(person a|of body) = [dead body : []] LanguageModelFsa=-2.9934 h=-4.63222); r->l(of body dead|)}({<22,24>a [1] ||| [person] r->l(person a|) = [a person : []] LanguageModelFsa=0 h=-4.90016); r->l(a|)}({<23,24>person ||| r->l(person|) = [person] LanguageModelFsa=0 h=-3.50165); r->l(person|)}) ) ) ) ) ) ) - +LM forest features: Arity_0=-3.47436;Arity_1=-4.77724;Arity_2=-3.04006;Glue=5;LanguageModel=-446.49;LmFsa=-446.17;PassThrough=5;PhraseModel_0=12.2199;PhraseModel_1=11.6391;PhraseModel_2=10.9878;WordPenalty=-13.0288;Unigram=-462.696;UnigramFsa=-462.696 -Output kbest to - -0 ||| karachi ( AstRAf rpwrtRr ) in karachi on monday in different HAdvAt gyY and killed 4 people including a woman found dead body of a person from the sea . ||| Arity_0=-3.47436;Arity_1=-4.77724;Arity_2=-3.04006;Glue=5;LanguageModel=-446.49;LmFsa=-446.17;PassThrough=5;PhraseModel_0=12.2199;PhraseModel_1=11.6391;PhraseModel_2=10.9878;WordPenalty=-13.0288;Unigram=-462.696;UnigramFsa=-462.696 ||| -490.21 - -sent_id=0 -({<0,28>[1] ||| (final r->l(( karachi| ) start=[ ]->{karachi (} r->l(|. sea) end=[sea .]->{} LanguageModelFsa=-5.74766; } - ({<0,28>[1] [2] ||| [karachi ( : [a woman]] r->l(( karachi|) [found dead : [sea .]] r->l(dead found|woman a) = [karachi ( : [sea .]] LanguageModelFsa=-5.93027 h=-5.83552); } - ({<0,20>[1] [2] ||| [karachi ( : []] r->l(( karachi|) [gyY and : [a woman]] r->l(and gyY|) = [karachi ( : [a woman]] LanguageModelFsa=-101.72 h=-5.83552); } - ({<0,12>[1] [2] ||| [karachi ( : [in karachi]] r->l(( karachi|) [on monday : []] r->l(monday on|karachi in) = [karachi ( : []] LanguageModelFsa=-1.99946 h=-5.83552); } - ({<0,7>[1] [2] ||| [karachi ( : [rpwrtRr )]] r->l(( karachi|) [in karachi : [in karachi]] r->l(karachi in|) rpwrtRr) = [karachi ( : [in karachi]] LanguageModelFsa=-3.40247 h=-5.83552); } - ({<0,5>[1] [2] ||| [karachi ( : []] r->l(( karachi|) [rpwrtRr ) : [rpwrtRr )]] r->l() rpwrtRr|) = [karachi ( : [rpwrtRr )]] LanguageModelFsa=-102.623 h=-5.83552); } - ({<0,3>[1] ||| [karachi ( : []] r->l(( karachi|) = [karachi ( : []] LanguageModelFsa=0 h=-5.83552); } - ({<0,3>karachi [1] ||| [( AstRAf : []] r->l(( karachi|) r->l(AstRAf|( karachi) = [karachi ( : []] LanguageModelFsa=-100 h=-5.83552); r->l(karachi|)} - ({<1,3>( [1] ||| [AstRAf] r->l(AstRAf (|) = [( AstRAf : []] LanguageModelFsa=0 h=-102.641); r->l((|)} - ({<2,3>AstRAf ||| r->l(AstRAf|) = [AstRAf] LanguageModelFsa=0 h=-100); r->l(AstRAf|)} - ) - ) - ) - ) - ({<3,5>[1] ) ||| [rpwrtRr] r->l() rpwrtRr|) = [rpwrtRr ) : [rpwrtRr )]] LanguageModelFsa=0 h=-102.623); r->l()|)} - ({<3,4>rpwrtRr ||| r->l(rpwrtRr|) = [rpwrtRr] LanguageModelFsa=0 h=-100); r->l(rpwrtRr|)} - ) - ) - ) - ({<5,7>in karachi ||| r->l(karachi in|) = [in karachi : [in karachi]] LanguageModelFsa=0 h=-3.80404); r->l(karachi in|)} - ) - ) - ({<7,12>on monday in [1] ||| r->l(monday on|) rule-phrase[in] r->l(in|monday on) [different HAdvAt : []] r->l(HAdvAt different|in monday) = [on monday : []] LanguageModelFsa=-103.918 h=-3.91305); r->l(in monday on|)} - ({<9,11>different [1] ||| [HAdvAt] r->l(HAdvAt different|) = [different HAdvAt : []] LanguageModelFsa=0 h=-103.573); r->l(different|)} - ({<10,11>HAdvAt ||| r->l(HAdvAt|) = [HAdvAt] LanguageModelFsa=0 h=-100); 
r->l(HAdvAt|)} - ) - ) - ) - ) - ({<12,20>[2] killed [1] ||| [gyY and : [gyY and]] r->l(and gyY|) rule-phrase[killed] r->l(killed|and gyY) [4 people : [a woman]] r->l(people 4|killed and) = [gyY and : [a woman]] LanguageModelFsa=-5.57026 h=-101.72); r->l(killed|)} - ({<12,16>[2] people including a [1] ||| [4] r->l(people 4|) rule-phrase[including a] r->l(a including|people 4) [woman] r->l(woman|a including) = [4 people : [a woman]] LanguageModelFsa=-3.99305 h=-6.22734); r->l(a including people|)} - ({<12,13>woman ||| r->l(woman|) = [woman] LanguageModelFsa=0 h=-3.82934); r->l(woman|)} - ) - ({<14,15>4 ||| r->l(4|) = [4] LanguageModelFsa=0 h=-3.62974); r->l(4|)} - ) - ) - ({<18,20>[1] and ||| [gyY] r->l(and gyY|) = [gyY and : [gyY and]] LanguageModelFsa=0 h=-101.72); r->l(and|)} - ({<18,19>gyY ||| r->l(gyY|) = [gyY] LanguageModelFsa=0 h=-100); r->l(gyY|)} - ) - ) - ) - ) - ({<20,28>[1] the sea . ||| [found dead : [ from]] r->l(dead found|) rule-phrase[the sea .] r->l(. sea the|from ) = [found dead : [sea .]] LanguageModelFsa=-4.84745 h=-7.62839); r->l(. sea the|)} - ({<21,27>found [1] from ||| [dead body : []] r->l(dead found|) r->l(body|dead found) rule-phrase[from] r->l(from|) = [found dead : [ from]] LanguageModelFsa=-3.42491 h=-7.62839); r->l(found|) r->l(from|)} - ({<22,26>dead body of [1] ||| r->l(body dead|) rule-phrase[of] r->l(of|body dead) [a person : []] r->l(person a|of body) = [dead body : []] LanguageModelFsa=-2.9934 h=-4.63222); r->l(of body dead|)} - ({<22,24>a [1] ||| [person] r->l(person a|) = [a person : []] LanguageModelFsa=0 h=-4.90016); r->l(a|)} - ({<23,24>person ||| r->l(person|) = [person] LanguageModelFsa=0 h=-3.50165); r->l(person|)} - ) - ) - ) - ) - ) - ) -) -0 ||| karachi ( AstRAf rpwrtRr ) in karachi on monday in different HAdvAt gyY and killed 4 people including a woman found the dead body of a person from the sea . ||| Arity_0=-3.47436;Arity_1=-4.77724;Arity_2=-3.04006;Glue=5;LanguageModel=-446.828;LmFsa=-446.508;PassThrough=5;PhraseModel_0=12.697;PhraseModel_1=11.6391;PhraseModel_2=11.5728;WordPenalty=-13.4631;Unigram=-463.765;UnigramFsa=-463.765 ||| -490.295 - -sent_id=0 -({<0,28>[1] ||| (final r->l(( karachi| ) start=[ ]->{karachi (} r->l(|. 
sea) end=[sea .]->{} LanguageModelFsa=-5.74766; } - ({<0,28>[1] [2] ||| [karachi ( : [a woman]] r->l(( karachi|) [found the : [sea .]] r->l(the found|woman a) = [karachi ( : [sea .]] LanguageModelFsa=-3.6217 h=-5.83552); } - ({<0,20>[1] [2] ||| [karachi ( : []] r->l(( karachi|) [gyY and : [a woman]] r->l(and gyY|) = [karachi ( : [a woman]] LanguageModelFsa=-101.72 h=-5.83552); } - ({<0,12>[1] [2] ||| [karachi ( : [in karachi]] r->l(( karachi|) [on monday : []] r->l(monday on|karachi in) = [karachi ( : []] LanguageModelFsa=-1.99946 h=-5.83552); } - ({<0,7>[1] [2] ||| [karachi ( : [rpwrtRr )]] r->l(( karachi|) [in karachi : [in karachi]] r->l(karachi in|) rpwrtRr) = [karachi ( : [in karachi]] LanguageModelFsa=-3.40247 h=-5.83552); } - ({<0,5>[1] [2] ||| [karachi ( : []] r->l(( karachi|) [rpwrtRr ) : [rpwrtRr )]] r->l() rpwrtRr|) = [karachi ( : [rpwrtRr )]] LanguageModelFsa=-102.623 h=-5.83552); } - ({<0,3>[1] ||| [karachi ( : []] r->l(( karachi|) = [karachi ( : []] LanguageModelFsa=0 h=-5.83552); } - ({<0,3>karachi [1] ||| [( AstRAf : []] r->l(( karachi|) r->l(AstRAf|( karachi) = [karachi ( : []] LanguageModelFsa=-100 h=-5.83552); r->l(karachi|)} - ({<1,3>( [1] ||| [AstRAf] r->l(AstRAf (|) = [( AstRAf : []] LanguageModelFsa=0 h=-102.641); r->l((|)} - ({<2,3>AstRAf ||| r->l(AstRAf|) = [AstRAf] LanguageModelFsa=0 h=-100); r->l(AstRAf|)} - ) - ) - ) - ) - ({<3,5>[1] ) ||| [rpwrtRr] r->l() rpwrtRr|) = [rpwrtRr ) : [rpwrtRr )]] LanguageModelFsa=0 h=-102.623); r->l()|)} - ({<3,4>rpwrtRr ||| r->l(rpwrtRr|) = [rpwrtRr] LanguageModelFsa=0 h=-100); r->l(rpwrtRr|)} - ) - ) - ) - ({<5,7>in karachi ||| r->l(karachi in|) = [in karachi : [in karachi]] LanguageModelFsa=0 h=-3.80404); r->l(karachi in|)} - ) - ) - ({<7,12>on monday in [1] ||| r->l(monday on|) rule-phrase[in] r->l(in|monday on) [different HAdvAt : []] r->l(HAdvAt different|in monday) = [on monday : []] LanguageModelFsa=-103.918 h=-3.91305); r->l(in monday on|)} - ({<9,11>different [1] ||| [HAdvAt] r->l(HAdvAt different|) = [different HAdvAt : []] LanguageModelFsa=0 h=-103.573); r->l(different|)} - ({<10,11>HAdvAt ||| r->l(HAdvAt|) = [HAdvAt] LanguageModelFsa=0 h=-100); r->l(HAdvAt|)} - ) - ) - ) - ) - ({<12,20>[2] killed [1] ||| [gyY and : [gyY and]] r->l(and gyY|) rule-phrase[killed] r->l(killed|and gyY) [4 people : [a woman]] r->l(people 4|killed and) = [gyY and : [a woman]] LanguageModelFsa=-5.57026 h=-101.72); r->l(killed|)} - ({<12,16>[2] people including a [1] ||| [4] r->l(people 4|) rule-phrase[including a] r->l(a including|people 4) [woman] r->l(woman|a including) = [4 people : [a woman]] LanguageModelFsa=-3.99305 h=-6.22734); r->l(a including people|)} - ({<12,13>woman ||| r->l(woman|) = [woman] LanguageModelFsa=0 h=-3.82934); r->l(woman|)} - ) - ({<14,15>4 ||| r->l(4|) = [4] LanguageModelFsa=0 h=-3.62974); r->l(4|)} - ) - ) - ({<18,20>[1] and ||| [gyY] r->l(and gyY|) = [gyY and : [gyY and]] LanguageModelFsa=0 h=-101.72); r->l(and|)} - ({<18,19>gyY ||| r->l(gyY|) = [gyY] LanguageModelFsa=0 h=-100); r->l(gyY|)} - ) - ) - ) - ) - ({<20,28>[1] the sea . ||| [found the : [ from]] r->l(the found|) rule-phrase[the sea .] r->l(. sea the|from ) = [found the : [sea .]] LanguageModelFsa=-4.84745 h=-5.31983); r->l(. 
sea the|)} - ({<21,27>found [1] from ||| [the dead : []] r->l(the found|) r->l(dead|the found) rule-phrase[from] r->l(from|) = [found the : [ from]] LanguageModelFsa=-5.34421 h=-5.31983); r->l(found|) r->l(from|)} - ({<22,26>the dead body of [1] ||| r->l(dead the|) rule-phrase[body of] r->l(of body|dead the) [a person : []] r->l(person a|of body) = [the dead : []] LanguageModelFsa=-3.7205 h=-4.97373); r->l(of body dead the|)} - ({<22,24>a [1] ||| [person] r->l(person a|) = [a person : []] LanguageModelFsa=0 h=-4.90016); r->l(a|)} - ({<23,24>person ||| r->l(person|) = [person] LanguageModelFsa=0 h=-3.50165); r->l(person|)} - ) - ) - ) - ) - ) - ) -) -0 ||| karachi ( AstRAf rpwrtRr ) in karachi on monday in different HAdvAt gyY killed 4 people including a woman while dead body of a person from the sea . ||| Arity_0=-3.47436;Arity_1=-4.77724;Arity_2=-3.04006;Glue=5;LanguageModel=-445.419;LmFsa=-445.099;PassThrough=5;PhraseModel_0=12.5687;PhraseModel_1=12.5781;PhraseModel_2=9.61571;WordPenalty=-12.5945;Unigram=-461.303;UnigramFsa=-461.303 ||| -490.646 - -sent_id=0 -({<0,28>[1] ||| (final r->l(( karachi| ) start=[ ]->{karachi (} r->l(|. sea) end=[sea .]->{} LanguageModelFsa=-5.74766; } - ({<0,28>[1] [2] ||| [karachi ( : [a woman]] r->l(( karachi|) [while dead : [sea .]] r->l(dead while|woman a) = [karachi ( : [sea .]] LanguageModelFsa=-5.71074 h=-5.83552); } - ({<0,19>[1] [2] ||| [karachi ( : []] r->l(( karachi|) [gyY killed : [a woman]] r->l(killed gyY|) = [karachi ( : [a woman]] LanguageModelFsa=-103.345 h=-5.83552); } - ({<0,12>[1] [2] ||| [karachi ( : [in karachi]] r->l(( karachi|) [on monday : []] r->l(monday on|karachi in) = [karachi ( : []] LanguageModelFsa=-1.99946 h=-5.83552); } - ({<0,7>[1] [2] ||| [karachi ( : [rpwrtRr )]] r->l(( karachi|) [in karachi : [in karachi]] r->l(karachi in|) rpwrtRr) = [karachi ( : [in karachi]] LanguageModelFsa=-3.40247 h=-5.83552); } - ({<0,5>[1] [2] ||| [karachi ( : []] r->l(( karachi|) [rpwrtRr ) : [rpwrtRr )]] r->l() rpwrtRr|) = [karachi ( : [rpwrtRr )]] LanguageModelFsa=-102.623 h=-5.83552); } - ({<0,3>[1] ||| [karachi ( : []] r->l(( karachi|) = [karachi ( : []] LanguageModelFsa=0 h=-5.83552); } - ({<0,3>karachi [1] ||| [( AstRAf : []] r->l(( karachi|) r->l(AstRAf|( karachi) = [karachi ( : []] LanguageModelFsa=-100 h=-5.83552); r->l(karachi|)} - ({<1,3>( [1] ||| [AstRAf] r->l(AstRAf (|) = [( AstRAf : []] LanguageModelFsa=0 h=-102.641); r->l((|)} - ({<2,3>AstRAf ||| r->l(AstRAf|) = [AstRAf] LanguageModelFsa=0 h=-100); r->l(AstRAf|)} - ) - ) - ) - ) - ({<3,5>[1] ) ||| [rpwrtRr] r->l() rpwrtRr|) = [rpwrtRr ) : [rpwrtRr )]] LanguageModelFsa=0 h=-102.623); r->l()|)} - ({<3,4>rpwrtRr ||| r->l(rpwrtRr|) = [rpwrtRr] LanguageModelFsa=0 h=-100); r->l(rpwrtRr|)} - ) - ) - ) - ({<5,7>in karachi ||| r->l(karachi in|) = [in karachi : [in karachi]] LanguageModelFsa=0 h=-3.80404); r->l(karachi in|)} - ) - ) - ({<7,12>on monday in [1] ||| r->l(monday on|) rule-phrase[in] r->l(in|monday on) [different HAdvAt : []] r->l(HAdvAt different|in monday) = [on monday : []] LanguageModelFsa=-103.918 h=-3.91305); r->l(in monday on|)} - ({<9,11>different [1] ||| [HAdvAt] r->l(HAdvAt different|) = [different HAdvAt : []] LanguageModelFsa=0 h=-103.573); r->l(different|)} - ({<10,11>HAdvAt ||| r->l(HAdvAt|) = [HAdvAt] LanguageModelFsa=0 h=-100); r->l(HAdvAt|)} - ) - ) - ) - ) - ({<12,19>[2] killed [1] ||| [gyY] r->l(killed gyY|) [4 people : [a woman]] r->l(people 4|killed gyY) = [gyY killed : [a woman]] LanguageModelFsa=-2.98475 h=-103.345); r->l(killed|)} - ({<12,16>[2] people 
including a [1] ||| [4] r->l(people 4|) rule-phrase[including a] r->l(a including|people 4) [woman] r->l(woman|a including) = [4 people : [a woman]] LanguageModelFsa=-3.99305 h=-6.22734); r->l(a including people|)} - ({<12,13>woman ||| r->l(woman|) = [woman] LanguageModelFsa=0 h=-3.82934); r->l(woman|)} - ) - ({<14,15>4 ||| r->l(4|) = [4] LanguageModelFsa=0 h=-3.62974); r->l(4|)} - ) - ) - ({<18,19>gyY ||| r->l(gyY|) = [gyY] LanguageModelFsa=0 h=-100); r->l(gyY|)} - ) - ) - ) - ({<19,28>while [1] ||| [dead body : [sea .]] r->l(dead while|) r->l(body|dead while) = [while dead : [sea .]] LanguageModelFsa=-1.20144 h=-6.25144); r->l(while|)} - ({<20,28>[1] . ||| [dead body : [the sea]] r->l(body dead|) rule-phrase[.] r->l(.|sea the) = [dead body : [sea .]] LanguageModelFsa=-0.45297 h=-4.63222); r->l(.|)} - ({<20,26>[1] the sea ||| [dead body : [ from]] r->l(body dead|) rule-phrase[the sea] r->l(sea the|from ) = [dead body : [the sea]] LanguageModelFsa=-4.39448 h=-4.63222); r->l(sea the|)} - ({<21,26>dead body of [1] ||| r->l(body dead|) rule-phrase[of] r->l(of|body dead) [a person : [ from]] r->l(person a|of body) = [dead body : [ from]] LanguageModelFsa=-2.9934 h=-4.63222); r->l(of body dead|)} - ({<21,24>a [1] from ||| [person] r->l(person a|) rule-phrase[from] r->l(from|) = [a person : [ from]] LanguageModelFsa=-2.33299 h=-4.90016); r->l(a|) r->l(from|)} - ({<23,24>person ||| r->l(person|) = [person] LanguageModelFsa=0 h=-3.50165); r->l(person|)} - ) - ) - ) - ) - ) - ) - ) -) diff --git a/graehl/NOTES.partial.binarize b/graehl/NOTES.partial.binarize deleted file mode 100755 index a9985891..00000000 --- a/graehl/NOTES.partial.binarize +++ /dev/null @@ -1,21 +0,0 @@ -Earley doesn't require binarized rules. - -But a (partially) binarized grammar may lead to smaller (exhaustive or heuristic) charts. The tradeoff is mostly more reduce steps (the # of NTs should be similar or less than the usual dotted-item binarization0. - -Optionally collapse a rule rhs to unary as well (normal binarization would stop when an rhs is binary), if the rule to collapse it exists or is frequent enough. - -Greedy binarization schemes: - -1) (repeatedly) for the most frequent rhs bigram "X a" create a binary rule "V -> X a" and replace "X a" in all rules' rhs with V. stop if the most frequent bigram has count lower than some threshold (e.g. 3), because each instance of it saves one symbol, but the new rule has 3 symbols. - -2) (repeatedly) for each rule, pick the most frequent bigram in its rhs and binarize it (2a for that rule only, 2b everywhere that bigram occurs). again, some frequency threshold. optionally allow collapsing an rhs to unary. this fails to use some substitutions that are available "for free" based on actions taken at earlier rules w/ no frequent bigrams in common with this one. - -3) (DeNero) (for complete binarization only?) for each rule until binarized, pick a split point k of L->r[0..n) to make rules L->V1 V2, V1->r[0..k) V2->r[k..n), to minimize the number of new rules created. If no prefix or suffix of r already exists as a virtual rule, then choose k=floor(n/2). To amend this to consider frequency of rhs, use the frequency of rhs-prefix/suffixes to decide where to split? - -4?) Song, Chin-Yew Lin - seems to require collecting stats from a larged parsed corpus - interesting idea: make rules that don't match fail early (that's 1 way you get a speedup), and pick V1 -> ... based on some kind of expected utility. - -5) l2r, r2l. yawn. - -1) seems the most sensible. 
don't just keep a count for each bigram, keep a set of left and right adjacent partially overlapping bigrams (i.e. the words left and right). for "a b" if "c" and "d" occur to the right, then "b c" and "b d" would be the right adjacent bigrams. when replacing a bigram, follow the left and right adjacencies to decrement the count of those bigrams, and add a (bidirectional) link to the new bigram. - -Further, partial-1) can be followed by complete-3) or 5) - although i see no reason not to just continue 1) until the grammar is binary if you want a full binarization. diff --git a/graehl/NOTES.wfsa b/graehl/NOTES.wfsa deleted file mode 100755 index b74dc810..00000000 --- a/graehl/NOTES.wfsa +++ /dev/null @@ -1,16 +0,0 @@ -left-to-right finite-state models (with heuristic) that depend only on the target string. - -http://github.com/jganitkevitch/cdec.git has some progress toward this: - -earley_generator.*: make a trie of earley dotted items (from first pass finite parse projected to target side?) and rules for each earley deduction step (is the predict step actually making a hyperedge? or is it marked "active" and so doesn't appear in the result?) - -ff_ltor.*: interface for l2r models; needless scoring of "complete" action (only heuristic changes there and heuristics can just be precomputed for all dot-items -ff_lm.*: ugly clone of regular LM model with l2r interface - -apply_models.*: ApplyLeftToRightModelSet - -l2r features: - -multiple feature ids from single model? - -declare markov bound for bottom-up scoring (inside items) wrapper, and "backoff start" state (i.e. empty context, not context) diff --git a/rescore/cdec_kbest_to_zmert.pl b/rescore/cdec_kbest_to_zmert.pl deleted file mode 100755 index 88bc9682..00000000 --- a/rescore/cdec_kbest_to_zmert.pl +++ /dev/null @@ -1,64 +0,0 @@ -#!/usr/bin/perl -w - -use strict; -use utf8; -use Getopt::Long; - -my $feature_file; -my $hyp_file; -my $help; - -Getopt::Long::Configure("no_auto_abbrev"); -if (GetOptions( - "feature_file|f=s" => \$feature_file, - "hypothesis_file|h=s" => \$hyp_file, - "help" => \$help, -) == 0 || @ARGV!=0 || $help || !$feature_file || !$hyp_file) { - usage(); - exit(1); -} - -open W, "<$feature_file" or die "Can't read $feature_file: $!"; -my %weights; -my @all_feats; -while() { - chomp; - next if /^#/; - next if /^\s*$/; - my ($fname, $w) = split /\s+/; - push @all_feats, $fname; - $weights{$fname} = 1; -} -close W; - -open HYP, "<$hyp_file" or die "Can't read $hyp_file: $!"; -while() { - chomp; - my ($id, $hyp, $feats) = split / \|\|\| /; - my @afeats = split /\s+/, $feats; - my $tot = 0; - my %fvaldict; - for my $featpair (@afeats) { - my ($fname,$fval) = split /=/, $featpair; - $fvaldict{$fname} = $fval; - my $weight = $weights{$fname}; - warn "Feature '$fname' not mentioned in feature file $feature_file" unless defined $weight; - $weights{$fname} = 1; - } - my @trans; - for my $feat (@all_feats) { - my $v = $fvaldict{$feat}; - if (!defined $v) { $v = '0.0'; } - push @trans, $v; - } - print "$id ||| $hyp ||| @trans\n"; -} -close HYP; - -sub usage { - print <) { - next if /^#/; - chomp; - next if /^\s*$/; - s/^\s+//; - s/\s+$//; - my ($a,$b) = split /\s+/; - next unless ($a && $b); - my $line = $DEFAULT; - if ($defaults{$a}) { $line = $defaults{$a}; } - print "$a\t|||\t$b\t$line\n"; -} - -print "normalization = none\n"; - diff --git a/rescore/rerank.pl b/rescore/rerank.pl deleted file mode 100755 index 4a0c5750..00000000 --- a/rescore/rerank.pl +++ /dev/null @@ -1,86 +0,0 @@ -#!/usr/bin/perl -w - -use strict; -use 
utf8; -use Getopt::Long; - -my $weights_file; -my $hyp_file; -my $help; -my $kbest; # flag to extract reranked list - -Getopt::Long::Configure("no_auto_abbrev"); -if (GetOptions( - "weights_file|w=s" => \$weights_file, - "hypothesis_file|h=s" => \$hyp_file, - "kbest" => \$kbest, - "help" => \$help, -) == 0 || @ARGV!=0 || $help || !$weights_file || !$hyp_file) { - usage(); - exit(1); -} - -open W, "<$weights_file" or die "Can't read $weights_file: $!"; -my %weights; -while() { - chomp; - next if /^#/; - next if /^\s*$/; - my ($fname, $w) = split /\s+/; - $weights{$fname} = $w; -} -close W; - -my $cur = undef; -my %hyps = (); -open HYP, "<$hyp_file" or die "Can't read $hyp_file: $!"; -while() { - chomp; - my ($id, $hyp, $feats) = split / \|\|\| /; - unless (defined $cur) { $cur = $id; } - if ($cur ne $id) { - extract_1best($cur, \%hyps); - $cur = $id; - %hyps = (); - } - my @afeats = split /\s+/, $feats; - my $tot = 0; - for my $featpair (@afeats) { - my ($fname,$fval) = split /=/, $featpair; - my $weight = $weights{$fname}; - die "Unweighted feature '$fname'" unless defined $weight; - $tot += ($weight * $fval); - } - $hyps{"$hyp ||| $feats"} = $tot; -} -extract_1best($cur, \%hyps) if defined $cur; -close HYP; - -sub extract_1best { - my ($id, $rh) = @_; - my %hyps = %$rh; - if ($kbest) { - for my $hyp (sort { $hyps{$b} <=> $hyps{$a} } keys %hyps) { - print "$id ||| $hyp\n"; - } - } else { - my $best_score = undef; - my $best_hyp = undef; - for my $hyp (keys %hyps) { - if (!defined $best_score || $hyps{$hyp} > $best_score) { - $best_score = $hyps{$hyp}; - $best_hyp = $hyp; - } - } - $best_hyp =~ s/ \|\|\|.*$//; - print "$best_hyp\n"; - } -} - -sub usage { - print < \$model_file, - "source_file|s=s" => \$src_file, - "feature_name|f=s" => \$feature_name, - "hypothesis_file|h=s" => \$hyp_file, - "help" => \$help, -) == 0 || @ARGV!=0 || $help || !$model_file || !$src_file || !$hyp_file) { - usage(); - exit; -} - -binmode STDIN, ":utf8"; -binmode STDOUT, ":utf8"; -binmode STDERR, ":utf8"; - -print STDERR "Reading Model 1 probabilities from $model_file...\n"; -open M, "<$model_file" or die "Couldn't read $model_file: $!"; -binmode M, ":utf8"; -my %m1; -while(){ - chomp; - my ($e,$f,$lp) = split /\s+/; - die unless defined $e; - die unless defined $f; - die unless defined $lp; - $m1{$f}->{$e} = $lp; -} -close M; - -open SRC, "<$src_file" or die "Can't read $src_file: $!"; -open HYP, "<$hyp_file" or die "Can't read $hyp_file: $!"; -binmode(SRC,":utf8"); -binmode(HYP,":utf8"); -binmode(STDOUT,":utf8"); -my @source; while(){chomp; push @source, $_; } -close SRC; -my $src_len = scalar @source; -print STDERR "Read $src_len sentences...\n"; -print STDERR "Rescoring...\n"; - -my $cur = undef; -my @hyps = (); -my @feats = (); -while() { - chomp; - my ($id, $hyp, $feats) = split / \|\|\| /; - unless (defined $cur) { $cur = $id; } - die "sentence ids in k-best list file must be between 0 and $src_len" if $id < 0 || $id > $src_len; - if ($cur ne $id) { - rescore($cur, $source[$cur], \@hyps, \@feats); - $cur = $id; - @hyps = (); - @feats = (); - } - push @hyps, $hyp; - push @feats, $feats; -} -rescore($cur, $source[$cur], \@hyps, \@feats) if defined $cur; - -sub rescore { - my ($id, $src, $rh, $rf) = @_; - my @hyps = @$rh; - my @feats = @$rf; - my $nhyps = scalar @hyps; - my %cache = (); - print STDERR "RESCORING SENTENCE id=$id (# hypotheses=$nhyps)...\n"; - for (my $i=0; $i < $nhyps; $i++) { - my $score = $cache{$hyps[$i]}; - if (!defined $score) { - if ($reverse_model) { - die "not implemented"; - } else { 
- $score = m1_prob($src, $hyps[$i]); - } - $cache{$hyps[$i]} = $score; - } - print "$id ||| $hyps[$i] ||| $feats[$i] $feature_name=$score\n"; - } - -} - -sub m1_prob { - my ($fsent, $esent) = @_; - die unless defined $fsent; - die unless defined $esent; - my @fwords = split /\s+/, $fsent; - my @ewords = split /\s+/, $esent; - push @ewords, ""; - my $tp = 0; - for my $f (@fwords) { - my $m1f = $m1{$f}; - if (!defined $m1f) { $m1f = {}; } - my $tfp = 0; - for my $e (@ewords) { - my $lp = $m1f->{$e}; - if (!defined $lp) { $lp = -100; } - #print "P($f|$e) = $lp\n"; - my $prob = exp($lp); - #if ($prob > $tfp) { $tfp = $prob; } - $tfp += $prob; - } - $tp += log($tfp); - $tp -= log(scalar @ewords); # uniform probability of each generating word - } - return $tp; -} - -sub usage { - print STDERR "Usage: $0 -m model_file.txt -h hypothesis.nbest -s source.txt\n Adds the back-translation probability under Model 1\n Use training/model1 to generate the required parameter file\n"; -} - - diff --git a/rescore/rescore_with_cdec_model.pl b/rescore/rescore_with_cdec_model.pl deleted file mode 100755 index cdd8c217..00000000 --- a/rescore/rescore_with_cdec_model.pl +++ /dev/null @@ -1,121 +0,0 @@ -#!/usr/bin/perl -w - -use strict; -use utf8; -my @ORIG_ARGV=@ARGV; -use Cwd qw(getcwd); -my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../environment"; } -use LocalConfig; -use Getopt::Long; -use IPC::Open2; -use POSIX ":sys_wait_h"; - -my $decoder = "$SCRIPT_DIR/../decoder/cdec"; -my $help; -my $cdec_ini; -my $src_file; -my $hyp_file; -my $reverse_model; -my $weights_file; -my $feature_name='NewModel'; - -sub catch_pipe { - my $signame = shift; - die "$0 received SIGPIPE: did the decoder die?\n"; -} -$SIG{PIPE} = \&catch_pipe; - -Getopt::Long::Configure("no_auto_abbrev"); -if (GetOptions( - "config|c=s" => \$cdec_ini, - "weights|w=s" => \$weights_file, - "source_file|s=s" => \$src_file, - "feature_name|f=s" => \$feature_name, - "hypothesis_file|h=s" => \$hyp_file, - "reverse" => \$reverse_model, # if true translate hyp -> src - "decoder=s" => \$decoder, - "help" => \$help, -) == 0 || @ARGV!=0 || $help || !$cdec_ini || !$src_file || !$hyp_file) { - usage(); - exit; -} -die "Can't find $decoder" unless -f $decoder; -die "Can't run $decoder" unless -x $decoder; -my $weights = ''; -if (defined $weights_file) { - die "Can't read $weights_file" unless -f $weights_file; - $weights = "-w $weights_file"; -} -my $decoder_command = "$decoder -c $cdec_ini --quiet $weights --show_conditional_prob"; -print STDERR "DECODER COMMAND: $decoder_command\n"; -my $cdec_pid = open2(\*CDEC_IN, \*CDEC_OUT, $decoder_command) - or die "Couldn't run $decoder: $!"; -sleep 1; - -die "Can't find $cdec_ini" unless -f $cdec_ini; -open SRC, "<$src_file" or die "Can't read $src_file: $!"; -open HYP, "<$hyp_file" or die "Can't read $hyp_file: $!"; -binmode(SRC,":utf8"); -binmode(HYP,":utf8"); -binmode(STDOUT,":utf8"); -my @source; while(){chomp; push @source, $_; } -close SRC; -my $src_len = scalar @source; -print STDERR "Read $src_len sentences...\n"; -binmode(CDEC_IN, ":utf8"); -binmode(CDEC_OUT, ":utf8"); - -my $cur = undef; -my @hyps = (); -my @feats = (); -while() { - chomp; - my ($id, $hyp, $feats) = split / \|\|\| /; - unless (defined $cur) { $cur = $id; } - die "sentence ids in k-best list file must be between 0 and $src_len" if $id < 0 || $id > $src_len; - if ($cur ne $id) { - rescore($cur, $source[$cur], \@hyps, \@feats); - $cur = $id; - @hyps = 
(); - @feats = (); - } - push @hyps, $hyp; - push @feats, $feats; -} -rescore($cur, $source[$cur], \@hyps, \@feats) if defined $cur; - -close CDEC_IN; -close CDEC_OUT; -close HYP; -waitpid($cdec_pid, 0); -my $status = $? >> 8; -if ($status != 0) { - print STDERR "Decoder returned bad status!\n"; -} - -sub rescore { - my ($id, $src, $rh, $rf) = @_; - my @hyps = @$rh; - my @feats = @$rf; - my $nhyps = scalar @hyps; - print STDERR "RESCORING SENTENCE id=$id (# hypotheses=$nhyps)...\n"; - for (my $i=0; $i < $nhyps; $i++) { - if ($reverse_model) { - print CDEC_OUT "$hyps[$i] ||| $src\n"; - } else { - print CDEC_OUT "$src ||| $hyps[$i]\n"; - } - my $score = ; - chomp $score; - my @words = split /\s+/, $hyps[$i]; - print "$id ||| $hyps[$i] ||| $feats[$i] $feature_name=$score\n"; - } -} - -sub usage { - print < Date: Fri, 3 Feb 2012 20:55:10 -0500 Subject: use interface properly --- dpmert/mr_dpmert_reduce.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'dpmert') diff --git a/dpmert/mr_dpmert_reduce.cc b/dpmert/mr_dpmert_reduce.cc index dda61f88..31512a03 100644 --- a/dpmert/mr_dpmert_reduce.cc +++ b/dpmert/mr_dpmert_reduce.cc @@ -34,10 +34,10 @@ int main(int argc, char** argv) { po::variables_map conf; InitCommandLine(argc, argv, &conf); const string evaluation_metric = conf["evaluation_metric"].as(); + EvaluationMetric* metric = EvaluationMetric::Instance(evaluation_metric); LineOptimizer::ScoreType opt_type = LineOptimizer::MAXIMIZE_SCORE; - if (UppercaseString(evaluation_metric) == "TER") + if (metric->IsErrorMetric()) opt_type = LineOptimizer::MINIMIZE_SCORE; - EvaluationMetric* metric = EvaluationMetric::Instance(evaluation_metric); vector esv; string last_key, line, key, val; -- cgit v1.2.3 From 7da167c93ca2c10649cb0292fa2818ca8d5c76ff Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Fri, 3 Feb 2012 20:59:00 -0500 Subject: remove dead code --- dpmert/cat.pl | 4 ---- 1 file changed, 4 deletions(-) delete mode 100755 dpmert/cat.pl (limited to 'dpmert') diff --git a/dpmert/cat.pl b/dpmert/cat.pl deleted file mode 100755 index 2ecba3f9..00000000 --- a/dpmert/cat.pl +++ /dev/null @@ -1,4 +0,0 @@ -#!/usr/bin/perl - -$|=1; -print while(<>); -- cgit v1.2.3
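The "use interface properly" commit above replaces a hard-coded name check (UppercaseString(evaluation_metric) == "TER") with a query on the metric object obtained from EvaluationMetric::Instance(). The following is a simplified standalone C++ sketch of that pattern; the classes here are invented stand-ins for illustration, not the actual cdec EvaluationMetric hierarchy.

    // Simplified stand-in for the pattern in the "use interface properly"
    // commit: let the metric object report whether lower is better, instead
    // of string-matching metric names. These classes are illustrative only.
    #include <iostream>
    #include <memory>
    #include <string>

    class Metric {
     public:
      virtual ~Metric() = default;
      // True if smaller scores are better (e.g., an error rate such as TER).
      virtual bool IsErrorMetric() const = 0;
      virtual std::string Name() const = 0;
    };

    class BleuMetric : public Metric {
     public:
      bool IsErrorMetric() const override { return false; }
      std::string Name() const override { return "IBM_BLEU"; }
    };

    class TerMetric : public Metric {
     public:
      bool IsErrorMetric() const override { return true; }
      std::string Name() const override { return "TER"; }
    };

    enum class ScoreType { kMaximize, kMinimize };

    int main() {
      std::unique_ptr<Metric> metric = std::make_unique<TerMetric>();
      // The optimizer direction follows from the metric itself; adding a new
      // error metric no longer requires touching this call site.
      ScoreType opt = metric->IsErrorMetric() ? ScoreType::kMinimize
                                              : ScoreType::kMaximize;
      std::cout << metric->Name() << " -> "
                << (opt == ScoreType::kMinimize ? "minimize" : "maximize")
                << "\n";
      return 0;
    }

The design point is that the decision "maximize or minimize" lives with the metric implementation rather than being duplicated as string comparisons at each caller.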