diff options
Diffstat (limited to 'rampion')
-rw-r--r--   rampion/Makefile.am                  6
-rwxr-xr-x   rampion/rampion.pl                 526
-rw-r--r--   rampion/rampion_cccp.cc            157
-rwxr-xr-x   rampion/rampion_generate_input.pl   18
4 files changed, 707 insertions, 0 deletions
# ===== rampion/Makefile.am (new file, mode 100644) — kept here as a commented
# ===== header because the scrape fused it onto the same physical line:
#   bin_PROGRAMS = rampion_cccp
#
#   rampion_cccp_SOURCES = rampion_cccp.cc
#   rampion_cccp_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz
#
#   AM_CPPFLAGS = -W -Wall -Wno-sign-compare $(GTEST_CPPFLAGS) -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval -I$(top_srcdir)/training

# ===== rampion/rampion.pl (new file, mode 100755) =====
#!/usr/bin/env perl
# Driver script for RAMPION (structured ramp-loss) tuning of cdec weights.
use strict;
my @ORIG_ARGV=@ARGV;
use Cwd qw(getcwd);
# Locate ourselves and make sibling modules (../environment) loadable.
my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../environment"; }

# Skip local config (used for distributing jobs) if we're running in local-only mode
use LocalConfig;
use Getopt::Long;
use IPC::Open2;
use POSIX ":sys_wait_h";
my $QSUB_CMD = qsub_args(mert_memory());
my $default_jobs = env_default_jobs();

# Helper scripts are shared with the dpmert tuning pipeline.
my $VEST_DIR="$SCRIPT_DIR/../dpmert";
require "$VEST_DIR/libcall.pl";

# Default settings
my $srcFile;
my $refFiles;
my $bin_dir = $SCRIPT_DIR;
die "Bin directory $bin_dir missing/inaccessible" unless -d $bin_dir;
my $FAST_SCORE="$bin_dir/../mteval/fast_score";
die "Can't execute $FAST_SCORE" unless -x $FAST_SCORE;
my $MAPINPUT = "$bin_dir/rampion_generate_input.pl";
my $MAPPER = "$bin_dir/rampion_cccp";
my $parallelize = "$VEST_DIR/parallelize.pl";
my $libcall = "$VEST_DIR/libcall.pl";
my $sentserver = "$VEST_DIR/sentserver";
my $sentclient = "$VEST_DIR/sentclient";
my $LocalConfig = "$SCRIPT_DIR/../environment/LocalConfig.pm";

my $SCORER = $FAST_SCORE;
die "Can't find $MAPPER" unless -x $MAPPER;
my $cdec = "$bin_dir/../decoder/cdec";
die "Can't find decoder in $cdec" unless -x $cdec;
# NOTE(review): this string was split across scrape lines; original has no
# embedded newline in the message.
die "Can't find $parallelize" unless -x $parallelize;
die "Can't find $libcall" unless -e $libcall;
my $decoder = $cdec;
my $lines_per_mapper = 30;
my $iteration = 1;
my $best_weights;
my $psi = 1;
my $default_max_iter = 30;
my $max_iterations = $default_max_iter;
my $jobs = $default_jobs;   # number of decode nodes
my $pmem = "4g";
my $disable_clean = 0;
my %seen_weights;
my $help = 0;
my $epsilon = 0.0001;
my $dryrun = 0;
my $last_score = -10000000;
my $metric = "ibm_bleu";
my $dir;
my $iniFile;
my $weights;                # NOTE(review): never assigned below — used undef; verify against upstream pro.pl
my $use_make = 1;           # use make to parallelize
my $useqsub = 0;
my $initial_weights;
my $pass_suffix = '';
my $cpbin = 1;

# regularization strength
my $tune_regularizer = 0;
my $reg = 500;
my $reg_previous = 5000;

# Process command-line options
Getopt::Long::Configure("no_auto_abbrev");
if (GetOptions(
	"jobs=i" => \$jobs,
	"dont-clean" => \$disable_clean,
	"pass-suffix=s" => \$pass_suffix,
	"qsub" => \$useqsub,
	"dry-run" => \$dryrun,
	"epsilon=s" => \$epsilon,
	"help" => \$help,
	"weights=s" => \$initial_weights,
	"reg=f" => \$reg,
	"use-make=i" => \$use_make,
	"max-iterations=i" => \$max_iterations,
	"pmem=s" => \$pmem,
	"cpbin!" => \$cpbin,
	"ref-files=s" => \$refFiles,
	"metric=s" => \$metric,
	"source-file=s" => \$srcFile,
	"workdir=s" => \$dir,
) == 0 || @ARGV != 1 || $help) {
	print_help();
	exit;
}

die "--tune-regularizer is no longer supported with --reg-previous and --reg. Please tune manually.\n" if $tune_regularizer;

if ($useqsub) {
	$use_make = 0;
	die "LocalEnvironment.pm does not have qsub configuration for this host. Cannot run with --qsub!\n" unless has_qsub();
}

# All three inputs are mandatory; collect the missing ones for one message.
my @missing_args = ();
if (!defined $srcFile)         { push @missing_args, "--source-file"; }
if (!defined $refFiles)        { push @missing_args, "--ref-files"; }
if (!defined $initial_weights) { push @missing_args, "--weights"; }
die "Please specify missing arguments: " . join (', ', @missing_args) . "\n" if (@missing_args);

# Slow metrics get smaller shards per mapper.
if ($metric =~ /^(combi|ter)$/i) {
	$lines_per_mapper = 5;
}

($iniFile) = @ARGV;

sub write_config;
sub enseg;
sub print_help;

my $nodelist;
my $host = check_output("hostname"); chomp $host;
my $bleu;
my $interval_count = 0;
my $logfile;
my $projected_score;

# used in sorting scores: error metrics (TER/AER) sort ascending
my $DIR_FLAG = '-r';
if ($metric =~ /^ter$|^aer$/i) {
	$DIR_FLAG = '';
}

my $refs_comma_sep = get_comma_sep_refs('r', $refFiles);

unless ($dir) {
	$dir = "rampion";
}
unless ($dir =~ /^\//) {  # convert relative path to absolute path
	my $basedir = check_output("pwd");
	chomp $basedir;
	$dir = "$basedir/$dir";
}

# Initializations and helper functions
srand;

my @childpids = ();
my @cleanupcmds = ();

# Kill child decoder jobs and run registered cleanup commands on any exit.
sub cleanup {
	print STDERR "Cleanup...\n";
	for my $pid (@childpids)  { unchecked_call("kill $pid"); }
	for my $cmd (@cleanupcmds){ unchecked_call("$cmd"); }
	exit 1;
};
# Always call cleanup, no matter how we exit
*CORE::GLOBAL::exit =
    sub{ cleanup(); };
$SIG{INT} = "cleanup";
$SIG{TERM} = "cleanup";
$SIG{HUP} = "cleanup";

my $decoderBase = check_output("basename $decoder"); chomp $decoderBase;
my $newIniFile = "$dir/$decoderBase.ini";
my $inputFileName = "$dir/input";
my $user = $ENV{"USER"};

# process ini file
-e $iniFile || die "Error: could not open $iniFile for reading\n";
open(INI, $iniFile);

use File::Basename qw(basename);
# Copy each referenced binary into $bindir and retarget the scalar refs
# (pass bindir, then refs to vars holding the binary paths).
sub modbin {
    local $_;
    my $bindir = shift;
    check_call("mkdir -p $bindir");
    -d $bindir || die "couldn't make bindir $bindir";
    for (@_) {
        my $src = $$_;
        $$_ = "$bindir/" . basename($src);
        check_call("cp -p $src $$_");
    }
}
# Entry count of a directory, not counting "." (so an empty dir returns 1).
sub dirsize {
    opendir ISEMPTY, $_[0];
    return scalar(readdir(ISEMPTY)) - 1;
}
my @allweights;
if ($dryrun) {
	write_config(*STDERR);
	exit 0;
} else {
	if (-e $dir && dirsize($dir) > 1 && -e "$dir/hgs") { # allow preexisting logfile, binaries, but not dist-pro.pl outputs
	  die "ERROR: working dir $dir already exists\n\n";
	} else {
		-e $dir || mkdir $dir;
		mkdir "$dir/hgs";
		modbin("$dir/bin", \$LocalConfig, \$cdec, \$SCORER, \$MAPINPUT, \$MAPPER, \$parallelize, \$sentserver, \$sentclient, \$libcall) if $cpbin;
		mkdir "$dir/scripts";
		# Record an exact re-run command for reproducibility.
		my $cmdfile = "$dir/rerun-pro.sh";
		open CMD, '>', $cmdfile;
		print CMD "cd ", &getcwd, "\n";
#		print CMD &escaped_cmdline,"\n"; #buggy - last arg is quoted.
		my $cline = &cmdline . "\n";
		print CMD $cline;
		close CMD;
		print STDERR $cline;
		chmod(0755, $cmdfile);
		check_call("cp $initial_weights $dir/weights.0");
		die "Can't find weights.0" unless (-e "$dir/weights.0");
	}
	write_config(*STDERR);
}
# Generate initial files and values
check_call("cp $iniFile $newIniFile");
$iniFile = $newIniFile;

# Wrap the dev source in <seg id="..."> tags and count its sentences.
my $newsrc = "$dir/dev.input";
enseg($srcFile, $newsrc);
$srcFile = $newsrc;
my $devSize = 0;
open F, "<$srcFile" or die "Can't read $srcFile: $!";
while (<F>) { $devSize++; }
close F;

# NOTE(review): $weights is never assigned anywhere in this script, so both
# lines below operate on undef — looks like an incomplete port; confirm upstream.
unless ($best_weights)    { $best_weights = $weights; }
unless ($projected_score) { $projected_score = 0.0; }
$seen_weights{$weights} = 1;

my $random_seed = int(time / 1000);
my $lastWeightsFile;
my $lastPScore = 0;
# main optimization loop
while (1) {
	print STDERR "\n\nITERATION $iteration\n==========\n";

	if ($iteration > $max_iterations) {
		print STDERR "\nREACHED STOPPING CRITERION: Maximum iterations\n";
		last;
	}
	# iteration-specific files
	my $runFile    = "$dir/run.raw.$iteration";
	my $onebestFile= "$dir/1best.$iteration";
	my $logdir     = "$dir/logs.$iteration";
	my $decoderLog = "$logdir/decoder.sentserver.log.$iteration";
	my $scorerLog  = "$logdir/scorer.log.$iteration";
	check_call("mkdir -p $logdir");

	#decode
	print STDERR "RUNNING DECODER AT ";
	print STDERR unchecked_output("date");
	my $im1 = $iteration - 1;
	my $weightsFile = "$dir/weights.$im1";
	push @allweights, "-w $dir/weights.$im1";
	`rm -f $dir/hgs/*.gz`;
	my $decoder_cmd = "$decoder -c $iniFile --weights$pass_suffix $weightsFile -O $dir/hgs";
	my $pcmd;
	if ($use_make) {
		$pcmd = "cat $srcFile | $parallelize --use-fork -p $pmem -e $logdir -j $jobs --";
	} else {
		$pcmd = "cat $srcFile | $parallelize -p $pmem -e $logdir -j $jobs --";
	}
	my $cmd = "$pcmd $decoder_cmd 2> $decoderLog 1> $runFile";
	print STDERR "COMMAND:\n$cmd\n";
	check_bash_call($cmd);
	# Distributed filesystems may lag: re-check output counts a few times.
	my $num_hgs;
	my $num_topbest;
	my $retries = 0;
	while ($retries < 5) {
	    $num_hgs = check_output("ls $dir/hgs/*.gz | wc -l");
	    $num_topbest = check_output("wc -l < $runFile");
	    print STDERR "NUMBER OF HGs: $num_hgs\n";
	    print STDERR "NUMBER OF TOP-BEST HYPs: $num_topbest\n";
	    if ($devSize == $num_hgs && $devSize == $num_topbest) {
		last;
	    } else {
		print STDERR "Incorrect number of hypergraphs or topbest. Waiting for distributed filesystem and retrying...\n";
		sleep(3);
	    }
	    $retries++;
	}
	die "Dev set contains $devSize sentences, but we don't have topbest and hypergraphs for all these! Decoder failure? Check $decoderLog\n" if ($devSize != $num_hgs || $devSize != $num_topbest);
	my $dec_score = check_output("cat $runFile | $SCORER $refs_comma_sep -m $metric");
	chomp $dec_score;
	print STDERR "DECODER SCORE: $dec_score\n";

	# save space
	check_call("gzip -f $runFile");
	check_call("gzip -f $decoderLog");

	# run optimizer
	print STDERR "RUNNING OPTIMIZER AT ";
	print STDERR unchecked_output("date");
	print STDERR " - GENERATE TRAINING EXEMPLARS\n";
	my $mergeLog = "$logdir/prune-merge.log.$iteration";

	my $score = 0;
	my $icc = 0;
	my $inweights = "$dir/weights.$im1";
	$cmd = "$MAPINPUT $dir/hgs > $dir/agenda.$im1";
	print STDERR "COMMAND:\n$cmd\n";
	check_call($cmd);
	# NOTE(review): the optimizer invocation ($MAPPER / rampion_cccp) was never
	# implemented here — every run dies at this stub.
	die "PLEASE IMPL";
	$iteration++;
	print STDERR "\n==========\n";
}

print STDERR "\nFINAL WEIGHTS: $lastWeightsFile\n(Use -w <this file> with the decoder)\n\n";

print STDOUT "$lastWeightsFile\n";

exit 0;

# Count the lines of a file.
sub get_lines {
  my $fn = shift @_;
  open FL, "<$fn" or die "Couldn't read $fn: $!";
  my $lc = 0;
  while (<FL>) { $lc++; }
  return $lc;
}

# Expand a (possibly glob-containing) ref-file spec into "-r f1 -r f2 ...".
sub get_comma_sep_refs {
  my ($r, $p) = @_;
  my $o = check_output("echo $p");
  chomp $o;
  my @files = split /\s+/, $o;
  return "-$r " . join(" -$r ", @files);
}
# Read a "feature weight" per-line file; returns the weights joined by spaces.
# Dies if feature names are not strictly increasing (file must be sorted).
sub read_weights_file {
  my ($file) = @_;
  open F, "<$file" or die "Couldn't read $file: $!";
  my @r = ();
  my $pm = -1;
  while (<F>) {
    next if /^#/;
    next if /^\s*$/;
    chomp;
    if (/^(.+)\s+(.+)$/) {
      my $m = $1;
      my $w = $2;
      die "Weights out of order: $m <= $pm" unless $m > $pm;
      push @r, $w;
    } else {
      warn "Unexpected feature name in weight file: $_";
    }
  }
  close F;
  return join ' ', @r;
}

# subs
# Dump the run configuration to the given filehandle (used for logging).
sub write_config {
	my $fh = shift;
	my $cleanup = "yes";
	if ($disable_clean) { $cleanup = "no"; }

	print $fh "\n";
	print $fh "DECODER:          $decoder\n";
	print $fh "INI FILE:         $iniFile\n";
	print $fh "WORKING DIR:      $dir\n";
	print $fh "SOURCE (DEV):     $srcFile\n";
	print $fh "REFS (DEV):       $refFiles\n";
	print $fh "EVAL METRIC:      $metric\n";
	print $fh "MAX ITERATIONS:   $max_iterations\n";
	print $fh "JOBS:             $jobs\n";
	print $fh "HEAD NODE:        $host\n";
	print $fh "PMEM (DECODING):  $pmem\n";
	print $fh "CLEANUP:          $cleanup\n";
}

# Write parallel feature-name / value arrays out as a new weights file.
sub update_weights_file {
  my ($neww, $rfn, $rpts) = @_;
  my @feats = @$rfn;
  my @pts = @$rpts;
  my $num_feats = scalar @feats;
  my $num_pts = scalar @pts;
  die "$num_feats (num_feats) != $num_pts (num_pts)" unless $num_feats == $num_pts;
  open G, ">$neww" or die;
  for (my $i = 0; $i < $num_feats; $i++) {
    my $f = $feats[$i];
    my $lambda = $pts[$i];
    print G "$f $lambda\n";
  }
  close G;
}

# Wrap each dev sentence in <seg id="N"> tags (or validate pre-existing tags).
sub enseg {
	my $src = shift;
	my $newsrc = shift;
	open(SRC, $src);
	open(NEWSRC, ">$newsrc");
	my $i = 0;
	while (my $line = <SRC>) {
		chomp $line;
		if ($line =~ /^\s*<seg/i) {
		    if ($line =~ /id="[0-9]+"/) {
			print NEWSRC "$line\n";
		    } else {
			die "When using segments with pre-generated <seg> tags, you must include a zero-based id attribute";
		    }
		} else {
			print NEWSRC "<seg id=\"$i\">$line</seg>\n";
		}
		$i++;
	}
	close SRC;
	close NEWSRC;
	die "Empty dev set!" if ($i == 0);
}

sub print_help {

	my $executable = check_output("basename $0"); chomp $executable;
	print << "Help";

Usage: $executable [options] <ini file>

	$executable [options] <ini file>
		Runs a complete PRO optimization using the ini file specified.

Required:

	--ref-files <files>
		Dev set ref files.  This option takes only a single string argument.
		To use multiple files (including file globbing), this argument should
		be quoted.

	--source-file <file>
		Dev set source file.

	--weights <file>
		Initial weights file (use empty file to start from 0)

General options:

	--help
		Print this message and exit.

	--max-iterations <M>
		Maximum number of iterations to run.  If not specified, defaults
		to $default_max_iter.

	--metric <method>
		Metric to optimize.
		Example values: IBM_BLEU, NIST_BLEU, Koehn_BLEU, TER, Combi

	--pass-suffix <S>
		If the decoder is doing multi-pass decoding, the pass suffix "2",
		"3", etc., is used to control what iteration of weights is set.

	--workdir <dir>
		Directory for intermediate and output files.  If not specified, the
		name is derived from the ini filename.  Assuming that the ini
		filename begins with the decoder name and ends with ini, the default
		name of the working directory is inferred from the middle part of
		the filename.  E.g. an ini file named decoder.foo.ini would have
		a default working directory name foo.

Regularization options:

	--reg <F>
		l2 regularization strength [default=500]. The greater this value,
		the closer to zero the weights will be.

Job control options:

	--jobs <I>
		Number of decoder processes to run in parallel. [default=$default_jobs]

	--qsub
		Use qsub to run jobs in parallel (qsub must be configured in
		environment/LocalEnvironment.pm)

	--pmem <N>
		Amount of physical memory requested for parallel decoding jobs
		(used with qsub requests only)

Help
}
# Parse a "k1=v1;k2=v2;..." string into a hash.
sub convert {
  my ($str) = @_;
  my @ps = split /;/, $str;
  my %dict = ();
  for my $p (@ps) {
    my ($k, $v) = split /=/, $p;
    $dict{$k} = $v;
  }
  return %dict;
}

# The exact command line this script was invoked with (unescaped).
sub cmdline {
    return join ' ', ($0, @ORIG_ARGV);
}

#buggy: last arg gets quoted sometimes?
my $is_shell_special = qr{[ \t\n\\><|&;"'`~*?{}$!()]};
my $shell_escape_in_quote = qr{[\\"\$`!]};

# Quote a single argument for safe interpolation into a shell command.
sub escape_shell {
    my ($arg) = @_;
    return undef unless defined $arg;
    if ($arg =~ /$is_shell_special/) {
        $arg =~ s/($shell_escape_in_quote)/\\$1/g;
        return "\"$arg\"";
    }
    return $arg;
}

sub escaped_shell_args {
    return map { local $_ = $_; chomp; escape_shell($_) } @_;
}

sub escaped_shell_args_str {
    return join ' ', &escaped_shell_args(@_);
}

sub escaped_cmdline {
    return "$0 " . &escaped_shell_args_str(@ORIG_ARGV);
}

# ===== rampion/rampion_cccp.cc (new file, mode 100644) =====
# NOTE(review): the scrape fuses the start of this C++ translation unit onto the
# same physical line as the Perl above; recovered verbatim below.
#
# #include <sstream>
# #include <iostream>
# #include <vector>
# #include <limits>
#
# #include <boost/program_options.hpp>
# #include <boost/program_options/variables_map.hpp>
#
# #include "filelib.h"
# #include "stringlib.h"
# #include "weights.h"
# #include "hg_io.h"
# #include "kbest.h"
# #include "viterbi.h"
# #include "ns.h"
# #include "ns_docscorer.h"
#
# using namespace std;
# namespace po = boost::program_options;
#
# // Parse and validate command-line options; exits with usage on error.
# void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
#   po::options_description opts("Configuration options");
#   opts.add_options()
#         ("reference,r",po::value<vector<string> >(), "[REQD] Reference translation (tokenized text)")
#         ("weights,w",po::value<string>(), "[REQD] Weights files from current iterations")
#         ("input,i",po::value<string>()->default_value("-"), "Input file to map (- is STDIN)")
#         ("evaluation_metric,m",po::value<string>()->default_value("IBM_BLEU"), "Evaluation metric (ibm_bleu, koehn_bleu, nist_bleu, ter, meteor, etc.)")
#         ("kbest_size,k",po::value<unsigned>()->default_value(500u), "Top k-hypotheses to extract")
#         ("cccp_iterations,I", po::value<unsigned>()->default_value(10u), "CCCP iterations (T')")
#         ("ssd_iterations,J", po::value<unsigned>()->default_value(5u), "Stochastic subgradient iterations (T'')")
#         ("eta", po::value<double>()->default_value(1e-4), "Step size")
#         ("regularization_strength,C", po::value<double>()->default_value(1.0), "L2 regularization strength")
#         ("alpha,a", po::value<double>()->default_value(10.0), "Cost scale (alpha); alpha * [1-metric(y,y')]")
#         ("help,h", "Help");
#   po::options_description dcmdline_options;
#   dcmdline_options.add(opts);
#   po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
#   bool flag = false;
#   if (!conf->count("reference")) {
#     cerr << "Please specify one or more references using -r <REF.TXT>\n";
#     flag = true;
#   }
#   if (!conf->count("weights")) {
#     cerr << "Please specify weights using -w <WEIGHTS.TXT>\n";
#     flag = true;
#   }
#   if (flag || conf->count("help")) {
#     cerr << dcmdline_options << endl;
#     exit(1);
#   }
# }
("weights,w",po::value<string>(), "[REQD] Weights files from current iterations") +        ("input,i",po::value<string>()->default_value("-"), "Input file to map (- is STDIN)") +        ("evaluation_metric,m",po::value<string>()->default_value("IBM_BLEU"), "Evaluation metric (ibm_bleu, koehn_bleu, nist_bleu, ter, meteor, etc.)") +        ("kbest_size,k",po::value<unsigned>()->default_value(500u), "Top k-hypotheses to extract") +        ("cccp_iterations,I", po::value<unsigned>()->default_value(10u), "CCCP iterations (T')") +        ("ssd_iterations,J", po::value<unsigned>()->default_value(5u), "Stochastic subgradient iterations (T'')") +        ("eta", po::value<double>()->default_value(1e-4), "Step size") +        ("regularization_strength,C", po::value<double>()->default_value(1.0), "L2 regularization strength") +        ("alpha,a", po::value<double>()->default_value(10.0), "Cost scale (alpha); alpha * [1-metric(y,y')]") +        ("help,h", "Help"); +  po::options_description dcmdline_options; +  dcmdline_options.add(opts); +  po::store(parse_command_line(argc, argv, dcmdline_options), *conf); +  bool flag = false; +  if (!conf->count("reference")) { +    cerr << "Please specify one or more references using -r <REF.TXT>\n"; +    flag = true; +  } +  if (!conf->count("weights")) { +    cerr << "Please specify weights using -w <WEIGHTS.TXT>\n"; +    flag = true; +  } +  if (flag || conf->count("help")) { +    cerr << dcmdline_options << endl; +    exit(1); +  } +} + +struct HypInfo { +  HypInfo() : g(-100.0f) {} +  HypInfo(const vector<WordID>& h, +          const SparseVector<weight_t>& feats, +          const SegmentEvaluator& scorer, const EvaluationMetric* metric) : hyp(h), x(feats) { +    SufficientStats ss; +    scorer.Evaluate(hyp, &ss); +    g = metric->ComputeScore(ss); +  } + +  vector<WordID> hyp; +  float g; +  SparseVector<weight_t> x; +}; + +void CostAugmentedSearch(const vector<HypInfo>& kbest, +                         const SparseVector<double>& w, 
+                         double alpha, +                         SparseVector<double>* fmap) { +  unsigned best_i = 0; +  double best = -numeric_limits<double>::infinity(); +  for (unsigned i = 0; i < kbest.size(); ++i) { +    double s = kbest[i].x.dot(w) + alpha * kbest[i].g; +    if (s > best) { +      best = s; +      best_i = i; +    } +  } +  *fmap = kbest[best_i].x; +} + +// runs lines 4--15 of rampion algorithm +int main(int argc, char** argv) { +  po::variables_map conf; +  InitCommandLine(argc, argv, &conf); +  const string evaluation_metric = conf["evaluation_metric"].as<string>(); + +  EvaluationMetric* metric = EvaluationMetric::Instance(evaluation_metric); +  DocumentScorer ds(metric, conf["reference"].as<vector<string> >()); +  cerr << "Loaded " << ds.size() << " references for scoring with " << evaluation_metric << endl; +  double goodsign = 1; +  if (metric->IsErrorMetric()) goodsign = -goodsign; +  double badsign = -goodsign; + +  Hypergraph hg; +  string last_file; +  ReadFile in_read(conf["input"].as<string>()); +  istream &in=*in_read.stream(); +  const unsigned kbest_size = conf["kbest_size"].as<unsigned>(); +  const unsigned tp = conf["cccp_iterations"].as<unsigned>(); +  const unsigned tpp = conf["ssd_iterations"].as<unsigned>(); +  const double eta = conf["eta"].as<double>(); +  const double reg = conf["regularization_strength"].as<double>(); +  const double alpha = conf["alpha"].as<double>(); +  SparseVector<weight_t> weights; +  { +    vector<weight_t> vweights; +    const string weightsf = conf["weights"].as<string>(); +    Weights::InitFromFile(weightsf, &vweights); +    Weights::InitSparseVector(vweights, &weights); +  } +  string line, file; +  vector<vector<HypInfo> > kis; +  cerr << "Loading hypergraphs...\n"; +  while(getline(in, line)) { +    istringstream is(line); +    int sent_id; +    kis.resize(kis.size() + 1); +    vector<HypInfo>& curkbest = kis.back(); +    is >> file >> sent_id; +    ReadFile rf(file); +    
# ===== rampion/rampion_generate_input.pl (new file, mode 100755) =====
#!/usr/bin/perl -w
# Emit one "<path> <sent_id>" line per gzipped hypergraph in HG_DIR; the id is
# the filename with its (optional) .json and .gz suffixes stripped.
use strict;

die "Usage: $0 HG_DIR\n" unless scalar @ARGV == 1;
my $d = shift @ARGV;
die "Can't find directory $d" unless -d $d;

opendir(DIR, $d) or die "Can't read $d: $!";
my @hgs = grep { /\.gz$/ } readdir(DIR);
closedir DIR;

for my $hg (@hgs) {
  my $file = $hg;
  my $id = $hg;
  $id =~ s/(\.json)?\.gz//;   # "12.json.gz" / "12.gz" -> "12"
  print "$d/$file $id\n";
}
