mostly implemented rampion optimizer

author: Chris Dyer <cdyer@cs.cmu.edu> 2012-04-26 00:06:09 -0400
committer: Chris Dyer <cdyer@cs.cmu.edu> 2012-04-26 00:06:09 -0400
commit: 81578ddd4a32ee06d964bd7b5740ca61f76d5bc1 (patch)
tree: aaded6c267d81370e98e46363a2b76b70d4f975a
parent: 63945135627f41ed0c81e647db79bfe2eba4bf5c (diff)
9 files changed, 1309 insertions, 2 deletions
diff --git a/Makefile.am b/Makefile.am
index 2ecb60df..b5cba524 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -1,7 +1,7 @@
 # warning - the subdirectories in the following list should
 # be kept in topologically sorted order. Also, DO NOT introduce
 # cyclic dependencies between these directories!
-SUBDIRS = utils mteval klm/util klm/lm decoder phrasinator training mira dpmert pro-train extools gi/pf gi/markov_al rst_parser
+SUBDIRS = utils mteval klm/util klm/lm decoder phrasinator training mira dpmert pro-train rampion extools gi/pf gi/markov_al rst_parser
 
 #gi/pyp-topics/src gi/clda/src gi/posterior-regularisation/prjava
 
diff --git a/configure.ac b/configure.ac
index f03f97f0..81773e08 100644
--- a/configure.ac
+++ b/configure.ac
@@ -130,4 +130,4 @@ then
   AM_CONDITIONAL([GLC], true)
 fi
 
-AC_OUTPUT(Makefile rst_parser/Makefile utils/Makefile mteval/Makefile extools/Makefile decoder/Makefile phrasinator/Makefile training/Makefile dpmert/Makefile pro-train/Makefile klm/util/Makefile klm/lm/Makefile mira/Makefile gi/pyp-topics/src/Makefile gi/clda/src/Makefile gi/pf/Makefile gi/markov_al/Makefile)
+AC_OUTPUT(Makefile rst_parser/Makefile utils/Makefile mteval/Makefile extools/Makefile decoder/Makefile phrasinator/Makefile training/Makefile dpmert/Makefile pro-train/Makefile rampion/Makefile klm/util/Makefile klm/lm/Makefile mira/Makefile gi/pyp-topics/src/Makefile gi/clda/src/Makefile gi/pf/Makefile gi/markov_al/Makefile)
diff --git a/rampion/Makefile.am b/rampion/Makefile.am
new file mode 100644
index 00000000..12df39c2
--- /dev/null
+++ b/rampion/Makefile.am
@@ -0,0 +1,6 @@
+bin_PROGRAMS = rampion_cccp
+
+rampion_cccp_SOURCES = rampion_cccp.cc
+rampion_cccp_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz
+
+AM_CPPFLAGS = -W -Wall -Wno-sign-compare $(GTEST_CPPFLAGS) -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval -I$(top_srcdir)/training
diff --git a/rampion/rampion.pl b/rampion/rampion.pl
new file mode 100755
index 00000000..fda2bac2
--- /dev/null
+++ b/rampion/rampion.pl
@@ -0,0 +1,526 @@
+#!/usr/bin/env perl
+use strict;
+my @ORIG_ARGV=@ARGV;
+use Cwd qw(getcwd);
+my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../environment"; }
+
+# Skip local config (used for distributing jobs) if we're running in local-only mode
+use LocalConfig;
+use Getopt::Long;
+use IPC::Open2;
+use POSIX ":sys_wait_h";
+my $QSUB_CMD = qsub_args(mert_memory());
+my $default_jobs = env_default_jobs();
+
+my $VEST_DIR="$SCRIPT_DIR/../dpmert";
+require "$VEST_DIR/libcall.pl";
+
+# Default settings
+my $srcFile;
+my $refFiles;
+my $bin_dir = $SCRIPT_DIR;
+die "Bin directory $bin_dir missing/inaccessible" unless -d $bin_dir;
+my $FAST_SCORE="$bin_dir/../mteval/fast_score";
+die "Can't execute $FAST_SCORE" unless -x $FAST_SCORE;
+my $MAPINPUT = "$bin_dir/rampion_generate_input.pl";
+my $MAPPER = "$bin_dir/rampion_cccp";
+my $parallelize = "$VEST_DIR/parallelize.pl";
+my $libcall = "$VEST_DIR/libcall.pl";
+my $sentserver = "$VEST_DIR/sentserver";
+my $sentclient = "$VEST_DIR/sentclient";
+my $LocalConfig = "$SCRIPT_DIR/../environment/LocalConfig.pm";
+
+my $SCORER = $FAST_SCORE;
+die "Can't find $MAPPER" unless -x $MAPPER;
+my $cdec = "$bin_dir/../decoder/cdec";
+die "Can't find decoder in $cdec" unless -x $cdec;
+die "Can't find $parallelize" unless -x $parallelize;
+die "Can't find $libcall" unless -e $libcall;
+my $decoder = $cdec;
+my $lines_per_mapper = 30;
+my $iteration = 1;
+my $best_weights;
+my $psi = 1;
+my $default_max_iter = 30;
+my $max_iterations = $default_max_iter;
+my $jobs = $default_jobs;   # number of decode nodes
+my $pmem = "4g";
+my $disable_clean = 0;
+my %seen_weights;
+my $help = 0;
+my $epsilon = 0.0001;
+my $dryrun = 0;
+my $last_score = -10000000;
+my $metric = "ibm_bleu";
+my $dir;
+my $iniFile;
+my $weights;
+my $use_make = 1;  # use make to parallelize
+my $useqsub = 0;
+my $initial_weights;
+my $pass_suffix = '';
+my $cpbin=1;
+
+# regularization strength
+my $tune_regularizer = 0;
+my $reg = 500;
+my $reg_previous = 5000;
+
+# Process command-line options
+Getopt::Long::Configure("no_auto_abbrev");
+if (GetOptions(
+	"jobs=i" => \$jobs,
+	"dont-clean" => \$disable_clean,
+	"pass-suffix=s" => \$pass_suffix,
+        "qsub" => \$useqsub,
+	"dry-run" => \$dryrun,
+	"epsilon=s" => \$epsilon,
+	"help" => \$help,
+        "weights=s" => \$initial_weights,
+	"reg=f" => \$reg,
+	"use-make=i" => \$use_make,
+	"max-iterations=i" => \$max_iterations,
+	"pmem=s" => \$pmem,
+        "cpbin!" => \$cpbin,
+	"ref-files=s" => \$refFiles,
+	"metric=s" => \$metric,
+	"source-file=s" => \$srcFile,
+	"workdir=s" => \$dir,
+) == 0 || @ARGV!=1 || $help) {
+	print_help();
+	exit;
+}
+
+die "--tune-regularizer is no longer supported with --reg-previous and --reg. Please tune manually.\n" if $tune_regularizer;
+
+if ($useqsub) {
+  $use_make = 0;
+  die "LocalEnvironment.pm does not have qsub configuration for this host. Cannot run with --qsub!\n" unless has_qsub();
+}
+
+my @missing_args = ();
+if (!defined $srcFile) { push @missing_args, "--source-file"; }
+if (!defined $refFiles) { push @missing_args, "--ref-files"; }
+if (!defined $initial_weights) { push @missing_args, "--weights"; }
+die "Please specify missing arguments: " . join (', ', @missing_args) . "\n" if (@missing_args);
+
+if ($metric =~ /^(combi|ter)$/i) {
+  $lines_per_mapper = 5;
+}
+
+($iniFile) = @ARGV;
+
+
+sub write_config;
+sub enseg;
+sub print_help;
+
+my $nodelist;
+my $host =check_output("hostname"); chomp $host;
+my $bleu;
+my $interval_count = 0;
+my $logfile;
+my $projected_score;
+
+# used in sorting scores
+my $DIR_FLAG = '-r';
+if ($metric =~ /^ter$|^aer$/i) {
+  $DIR_FLAG = '';
+}
+
+my $refs_comma_sep = get_comma_sep_refs('r',$refFiles);
+
+unless ($dir){
+	$dir = "rampion";
+}
+unless ($dir =~ /^\//){  # convert relative path to absolute path
+	my $basedir = check_output("pwd");
+	chomp $basedir;
+	$dir = "$basedir/$dir";
+}
+
+
+# Initializations and helper functions
+srand;
+
+my @childpids = ();
+my @cleanupcmds = ();
+
+# Shutdown handler (installed for INT/TERM/HUP below): kills every recorded
+# child pid, runs every queued cleanup command, then exits with status 1.
+sub cleanup {
+	print STDERR "Cleanup...\n";
+	foreach my $child (@childpids) {
+		unchecked_call("kill $child");
+	}
+	foreach my $command (@cleanupcmds) {
+		unchecked_call("$command");
+	}
+	exit 1;
+};
+# Always call cleanup, no matter how we exit
+# NOTE(review): this glob assignment happens at run time. Overrides of
+# built-ins through CORE::GLOBAL normally have to be installed at compile time
+# (in a BEGIN block) to affect calls compiled in this file, so this line
+# presumably has no effect on the "exit" calls below -- confirm before relying
+# on it. (If it did take effect, cleanup() itself ends in "exit 1".)
+*CORE::GLOBAL::exit = 
+    sub{ cleanup(); }; 
+# Route the usual termination signals to cleanup() (handler given by name).
+$SIG{INT} = "cleanup";
+$SIG{TERM} = "cleanup";
+$SIG{HUP} = "cleanup";
+
+my $decoderBase = check_output("basename $decoder"); chomp $decoderBase;
+my $newIniFile = "$dir/$decoderBase.ini";
+my $inputFileName = "$dir/input";
+my $user = $ENV{"USER"};
+
+
+# process ini file
+-e $iniFile || die "Error: could not open $iniFile for reading\n";
+open(INI, $iniFile);
+
+use File::Basename qw(basename);
+# modbin($bindir, \$path1, \$path2, ...)
+# Creates $bindir, copies each referenced file into it (cp -p), and rewrites
+# every referenced variable so it points at the copy inside $bindir.
+sub modbin {
+    local $_;
+    my ($bindir, @path_refs) = @_;
+    check_call("mkdir -p $bindir");
+    die "couldn't make bindir $bindir" unless -d $bindir;
+    foreach my $path_ref (@path_refs) {
+        my $src = ${$path_ref};
+        my $dest = "$bindir/" . basename($src);
+        ${$path_ref} = $dest;
+        check_call("cp -p $src $dest");
+    }
+}
+# dirsize($path)
+# Returns the number of entries in directory $path minus one (the count
+# includes "." and ".."), or -1 when the directory cannot be opened.
+# readdir must be called in list context to obtain every entry: in scalar
+# context it yields a single entry *name*, which made the old arithmetic
+# evaluate to -1 for almost every directory.
+sub dirsize {
+    opendir(my $dh, $_[0]) or return -1;
+    my @entries = readdir($dh);
+    closedir($dh);
+    return scalar(@entries) - 1;
+}
+my @allweights;
+if ($dryrun){
+	write_config(*STDERR);
+	exit 0;
+} else {
+	if (-e $dir && dirsize($dir)>1 && -e "$dir/hgs" ){ # allow preexisting logfile, binaries, but not dist-pro.pl outputs
+	  die "ERROR: working dir $dir already exists\n\n";
+	} else {
+		-e $dir || mkdir $dir;
+		mkdir "$dir/hgs";
+        modbin("$dir/bin",\$LocalConfig,\$cdec,\$SCORER,\$MAPINPUT,\$MAPPER,\$parallelize,\$sentserver,\$sentclient,\$libcall) if $cpbin;
+    mkdir "$dir/scripts";
+        my $cmdfile="$dir/rerun-pro.sh";
+        open CMD,'>',$cmdfile;
+        print CMD "cd ",&getcwd,"\n";
+#        print CMD &escaped_cmdline,"\n"; #buggy - last arg is quoted.
+        my $cline=&cmdline."\n";
+        print CMD $cline;
+        close CMD;
+        print STDERR $cline;
+        chmod(0755,$cmdfile);
+	check_call("cp $initial_weights $dir/weights.0");
+	die "Can't find weights.0" unless (-e "$dir/weights.0");
+	}
+	write_config(*STDERR);
+}
+
+
+# Generate initial files and values
+check_call("cp $iniFile $newIniFile");
+$iniFile = $newIniFile;
+
+my $newsrc = "$dir/dev.input";
+enseg($srcFile, $newsrc);
+$srcFile = $newsrc;
+my $devSize = 0;
+open F, "<$srcFile" or die "Can't read $srcFile: $!";
+while(<F>) { $devSize++; }
+close F;
+
+unless($best_weights){ $best_weights = $weights; }
+unless($projected_score){ $projected_score = 0.0; }
+$seen_weights{$weights} = 1;
+
+my $random_seed = int(time / 1000);
+my $lastWeightsFile;
+my $lastPScore = 0;
+# main optimization loop
+while (1){
+	print STDERR "\n\nITERATION $iteration\n==========\n";
+
+	if ($iteration > $max_iterations){
+		print STDERR "\nREACHED STOPPING CRITERION: Maximum iterations\n";
+		last;
+	}
+	# iteration-specific files
+	my $runFile="$dir/run.raw.$iteration";
+	my $onebestFile="$dir/1best.$iteration";
+	my $logdir="$dir/logs.$iteration";
+	my $decoderLog="$logdir/decoder.sentserver.log.$iteration";
+	my $scorerLog="$logdir/scorer.log.$iteration";
+	check_call("mkdir -p $logdir");
+
+
+	#decode
+	print STDERR "RUNNING DECODER AT ";
+	print STDERR unchecked_output("date");
+	my $im1 = $iteration - 1;
+	my $weightsFile="$dir/weights.$im1";
+        push @allweights, "-w $dir/weights.$im1";
+        `rm -f $dir/hgs/*.gz`;
+	my $decoder_cmd = "$decoder -c $iniFile --weights$pass_suffix $weightsFile -O $dir/hgs";
+	my $pcmd;
+	if ($use_make) {
+		$pcmd = "cat $srcFile | $parallelize --use-fork -p $pmem -e $logdir -j $jobs --";
+	} else {
+		$pcmd = "cat $srcFile | $parallelize -p $pmem -e $logdir -j $jobs --";
+	}
+	my $cmd = "$pcmd $decoder_cmd 2> $decoderLog 1> $runFile";
+	print STDERR "COMMAND:\n$cmd\n";
+	check_bash_call($cmd);
+        my $num_hgs;
+        my $num_topbest;
+        my $retries = 0;
+	while($retries < 5) {
+	    $num_hgs = check_output("ls $dir/hgs/*.gz | wc -l");
+	    $num_topbest = check_output("wc -l < $runFile");
+	    print STDERR "NUMBER OF HGs: $num_hgs\n";
+	    print STDERR "NUMBER OF TOP-BEST HYPs: $num_topbest\n";
+	    if($devSize == $num_hgs && $devSize == $num_topbest) {
+		last;
+	    } else {
+		print STDERR "Incorrect number of hypergraphs or topbest. Waiting for distributed filesystem and retrying...\n";
+		sleep(3);
+	    }
+	    $retries++;
+	}
+	die "Dev set contains $devSize sentences, but we don't have topbest and hypergraphs for all these! Decoder failure? Check $decoderLog\n" if ($devSize != $num_hgs || $devSize != $num_topbest);
+	my $dec_score = check_output("cat $runFile | $SCORER $refs_comma_sep -m $metric");
+	chomp $dec_score;
+	print STDERR "DECODER SCORE: $dec_score\n";
+
+	# save space
+	check_call("gzip -f $runFile");
+	check_call("gzip -f $decoderLog");
+
+	# run optimizer
+	print STDERR "RUNNING OPTIMIZER AT ";
+	print STDERR unchecked_output("date");
+	print STDERR " - GENERATE TRAINING EXEMPLARS\n";
+	my $mergeLog="$logdir/prune-merge.log.$iteration";
+
+	my $score = 0;
+	my $icc = 0;
+	my $inweights="$dir/weights.$im1";
+	$cmd="$MAPINPUT $dir/hgs > $dir/agenda.$im1";
+	print STDERR "COMMAND:\n$cmd\n";
+	check_call($cmd);
+	die "PLEASE IMPL";
+	$iteration++;
+	print STDERR "\n==========\n";
+}
+
+print STDERR "\nFINAL WEIGHTS: $lastWeightsFile\n(Use -w <this file> with the decoder)\n\n";
+
+print STDOUT "$lastWeightsFile\n";
+
+exit 0;
+
+# get_lines($filename)
+# Returns the number of lines in $filename; dies if the file cannot be read.
+# Uses a lexical handle that is closed before returning (the previous bareword
+# handle was left open) and a 3-argument open so the name is never parsed for
+# a mode.
+sub get_lines {
+  my ($fn) = @_;
+  open(my $fh, '<', $fn) or die "Couldn't read $fn: $!";
+  my $lc = 0;
+  while (my $line = <$fh>) { $lc++; }
+  close $fh;
+  return $lc;
+}
+
+# get_comma_sep_refs($flag, $pattern)
+# Expands $pattern through the shell ("echo"), splits the result on whitespace
+# and returns a string of the form "-FLAG file1 -FLAG file2 ...".
+# (Despite the name, the result is space separated.)
+sub get_comma_sep_refs {
+  my ($flag, $pattern) = @_;
+  my $expanded = check_output("echo $pattern");
+  chomp $expanded;
+  my @ref_files = split(/\s+/, $expanded);
+  my $joined = join(" -$flag ", @ref_files);
+  return "-$flag " . $joined;
+}
+
+# read_weights_file($file)
+# Reads a weights file of "<name> <value>" lines and returns the values joined
+# by single spaces, in file order. Lines starting with '#' and blank lines are
+# skipped; lines that do not split into two fields only produce a warning.
+# Dies if the file cannot be opened.
+sub read_weights_file {
+  my ($file) = @_;
+  open F, "<$file" or die "Couldn't read $file: $!";
+  my @r = ();
+  # NOTE(review): $pm is never updated inside the loop, so the ordering check
+  # below always compares against -1 -- confirm whether it is still wanted.
+  my $pm = -1;
+  while(<F>) {
+    next if /^#/;
+    next if /^\s*$/;
+    chomp;
+    # Greedy first group: everything up to the last whitespace run is the name.
+    if (/^(.+)\s+(.+)$/) {
+      my $m = $1;
+      my $w = $2;
+      # Numeric comparison of the name field against $pm.
+      die "Weights out of order: $m <= $pm" unless $m > $pm;
+      push @r, $w;
+    } else {
+      warn "Unexpected feature name in weight file: $_";
+    }
+  }
+  close F;
+  return join ' ', @r;
+}
+
+# subs
+# write_config($filehandle)
+# Prints a summary of the current run settings (decoder, ini file, working
+# directory, dev data, metric, iteration/job limits, host, memory, cleanup
+# flag) to the given filehandle.
+sub write_config {
+	my ($out) = @_;
+	my $cleanup = $disable_clean ? "no" : "yes";
+
+	print {$out} (
+		"\n",
+		"DECODER:          $decoder\n",
+		"INI FILE:         $iniFile\n",
+		"WORKING DIR:      $dir\n",
+		"SOURCE (DEV):     $srcFile\n",
+		"REFS (DEV):       $refFiles\n",
+		"EVAL METRIC:      $metric\n",
+		"MAX ITERATIONS:   $max_iterations\n",
+		"JOBS:             $jobs\n",
+		"HEAD NODE:        $host\n",
+		"PMEM (DECODING):  $pmem\n",
+		"CLEANUP:          $cleanup\n",
+	);
+}
+
+# update_weights_file($path, \@feature_names, \@values)
+# Writes one "<name> <value>" line per feature to $path.
+# Dies when the two arrays differ in length, or when the file cannot be
+# opened or flushed (write errors on a buffered handle surface at close).
+sub update_weights_file {
+  my ($neww, $rfn, $rpts) = @_;
+  my @feats = @$rfn;
+  my @pts = @$rpts;
+  my $num_feats = scalar @feats;
+  my $num_pts = scalar @pts;
+  die "$num_feats (num_feats) != $num_pts (num_pts)" unless $num_feats == $num_pts;
+  open(my $out, '>', $neww) or die "Couldn't write $neww: $!";
+  for my $i (0 .. $#feats) {
+    print $out "$feats[$i] $pts[$i]\n";
+  }
+  close($out) or die "Couldn't close $neww: $!";
+}
+
+# enseg($src, $newsrc)
+# Copies the dev source file $src to $newsrc, wrapping every line that is not
+# already a <seg ...> element in <seg id="N">...</seg>, where N is the
+# zero-based line number. Lines that already start with <seg must carry a
+# numeric id attribute.
+# Dies if either file cannot be opened, if the output cannot be flushed, if a
+# pre-tagged line lacks an id, or if the source is empty. (Previously the
+# opens were unchecked, so an unreadable source was reported as an empty dev
+# set.)
+sub enseg {
+	my $src = shift;
+	my $newsrc = shift;
+	open(my $in, '<', $src) or die "Can't read $src: $!";
+	open(my $out, '>', $newsrc) or die "Can't write $newsrc: $!";
+	my $i=0;
+	while (my $line=<$in>){
+		chomp $line;
+		if ($line =~ /^\s*<seg/i) {
+			die "When using segments with pre-generated <seg> tags, you must include a zero-based id attribute"
+				unless $line =~ /id="[0-9]+"/;
+			print $out "$line\n";
+		} else {
+			print $out "<seg id=\"$i\">$line</seg>\n";
+		}
+		$i++;
+	}
+	close $in;
+	close($out) or die "Can't close $newsrc: $!";
+	die "Empty dev set!" if ($i == 0);
+}
+
+# Prints the usage message to STDOUT.
+# The text is kept in sync with the option table and defaults at the top of
+# this script: the default working directory is "rampion", and the qsub
+# settings come from environment/LocalConfig.pm.
+sub print_help {
+
+	# File::Basename is already loaded by this script; no need to spawn a shell.
+	my $executable = basename($0);
+	print << "Help";
+
+Usage: $executable [options] <ini file>
+
+	$executable [options] <ini file>
+		Runs a complete Rampion optimization using the ini file specified.
+
+Required:
+
+	--ref-files <files>
+		Dev set ref files.  This option takes only a single string argument.
+		To use multiple files (including file globbing), this argument should
+		be quoted.
+
+	--source-file <file>
+		Dev set source file.
+
+	--weights <file>
+		Initial weights file (use empty file to start from 0)
+
+General options:
+
+	--help
+		Print this message and exit.
+
+	--dry-run
+		Print the configuration and exit without running anything.
+
+	--max-iterations <M>
+		Maximum number of iterations to run.  If not specified, defaults
+		to $default_max_iter.
+
+	--metric <method>
+		Metric to optimize.
+		Example values: IBM_BLEU, NIST_BLEU, Koehn_BLEU, TER, Combi
+
+	--pass-suffix <S>
+		If the decoder is doing multi-pass decoding, the pass suffix "2",
+		"3", etc., is used to control what iteration of weights is set.
+
+	--workdir <dir>
+		Directory for intermediate and output files.  If not specified, the
+		directory "rampion" under the current working directory is used.
+
+	--[no]cpbin
+		Copy the binaries and scripts used into <workdir>/bin before
+		running (enabled by default).
+
+Regularization options:
+
+	--reg <F>
+		l2 regularization strength [default=500]. The greater this value,
+		the closer to zero the weights will be.
+
+Job control options:
+
+	--jobs <I>
+		Number of decoder processes to run in parallel. [default=$default_jobs]
+
+	--qsub
+		Use qsub to run jobs in parallel (qsub must be configured in
+		environment/LocalConfig.pm)
+
+	--use-make <I>
+		If non-zero (the default), decoding jobs are run through
+		parallelize.pl with --use-fork.  --qsub turns this off.
+
+	--pmem <N>
+		Amount of physical memory requested for parallel decoding jobs
+		(used with qsub requests only)
+
+Help
+}
+
+# convert($str)
+# Parses a "key=value;key=value" string and returns the pairs as a hash
+# (flattened list). Later duplicates of a key overwrite earlier ones.
+sub convert {
+  my ($str) = @_;
+  my %dict = map {
+    my ($key, $value) = split /=/, $_;
+    ($key, $value);
+  } split(/;/, $str);
+  return %dict;
+}
+
+
+# Returns the original invocation (script name plus the untouched argument
+# list saved at start-up) as a single space-joined string, without quoting.
+sub cmdline {
+    my @words = ($0, @ORIG_ARGV);
+    return join(' ', @words);
+}
+
+# escape_shell($arg)
+# Returns $arg wrapped in double quotes (with \ " $ ` ! backslash-escaped) when
+# it contains a shell-special character, $arg unchanged otherwise, and undef
+# for an undefined argument.
+#
+# The two patterns are defined inside the sub on purpose: this part of the
+# file is only reached after the script's top-level "exit 0", so file-scope
+# initialisers placed here were never executed and both patterns were still
+# undefined whenever the sub ran. An empty pattern re-uses the last successful
+# match, which is why arguments used to get quoted "sometimes".
+sub escape_shell {
+    my ($arg)=@_;
+    return undef unless defined $arg;
+    my $is_shell_special=qr{[ \t\n\\><|&;"'`~*?{}$!()]};
+    my $shell_escape_in_quote=qr{[\\"\$`!]};
+    if ($arg =~ /$is_shell_special/) {
+        $arg =~ s/($shell_escape_in_quote)/\\$1/g;
+        return "\"$arg\"";
+    }
+    return $arg;
+}
+
+# Returns a shell-escaped copy of every argument (trailing newline removed
+# first); the caller's values are left untouched.
+sub escaped_shell_args {
+    my @escaped;
+    foreach my $original (@_) {
+        my $copy = $original;
+        chomp $copy;
+        push @escaped, escape_shell($copy);
+    }
+    return @escaped;
+}
+
+# Returns the shell-escaped arguments joined by single spaces.
+sub escaped_shell_args_str {
+    my @quoted = escaped_shell_args(@_);
+    return join(' ', @quoted);
+}
+
+# Returns the original invocation with every saved argument shell-escaped.
+sub escaped_cmdline {
+    my $quoted_args = escaped_shell_args_str(@ORIG_ARGV);
+    return "$0 " . $quoted_args;
+}
diff --git a/rampion/rampion_cccp.cc b/rampion/rampion_cccp.cc
new file mode 100644
index 00000000..6eb3ccf3
--- /dev/null
+++ b/rampion/rampion_cccp.cc
@@ -0,0 +1,157 @@
+#include <sstream>
+#include <iostream>
+#include <vector>
+#include <limits>
+
+#include <boost/program_options.hpp>
+#include <boost/program_options/variables_map.hpp>
+
+#include "filelib.h"
+#include "stringlib.h"
+#include "weights.h"
+#include "hg_io.h"
+#include "kbest.h"
+#include "viterbi.h"
+#include "ns.h"
+#include "ns_docscorer.h"
+
+using namespace std;
+namespace po = boost::program_options;
+
+// Parses the command line into *conf.
+// Prints the option summary to stderr and exits with status 1 when --help is
+// given or when either of the required options (--reference, --weights) is
+// missing.
+void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
+  po::options_description opts("Configuration options");
+  opts.add_options()
+        ("reference,r",po::value<vector<string> >(), "[REQD] Reference translation (tokenized text)")
+        ("weights,w",po::value<string>(), "[REQD] Weights files from current iterations")
+        ("input,i",po::value<string>()->default_value("-"), "Input file to map (- is STDIN)")
+        ("evaluation_metric,m",po::value<string>()->default_value("IBM_BLEU"), "Evaluation metric (ibm_bleu, koehn_bleu, nist_bleu, ter, meteor, etc.)")
+        ("kbest_size,k",po::value<unsigned>()->default_value(500u), "Top k-hypotheses to extract")
+        ("cccp_iterations,I", po::value<unsigned>()->default_value(10u), "CCCP iterations (T')")
+        ("ssd_iterations,J", po::value<unsigned>()->default_value(5u), "Stochastic subgradient iterations (T'')")
+        ("eta", po::value<double>()->default_value(1e-4), "Step size")
+        ("regularization_strength,C", po::value<double>()->default_value(1.0), "L2 regularization strength")
+        ("alpha,a", po::value<double>()->default_value(10.0), "Cost scale (alpha); alpha * [1-metric(y,y')]")
+        ("help,h", "Help");
+  po::options_description dcmdline_options;
+  dcmdline_options.add(opts);
+  po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
+
+  const bool missing_reference = conf->count("reference") == 0;
+  const bool missing_weights = conf->count("weights") == 0;
+  if (missing_reference)
+    cerr << "Please specify one or more references using -r <REF.TXT>\n";
+  if (missing_weights)
+    cerr << "Please specify weights using -w <WEIGHTS.TXT>\n";
+
+  if (missing_reference || missing_weights || conf->count("help")) {
+    cerr << dcmdline_options << endl;
+    exit(1);
+  }
+}
+
+// One entry of a k-best list: the hypothesis words, its feature vector and
+// its metric score against the references of its sentence.
+struct HypInfo {
+  // Default-constructed entries carry a placeholder score of -100.
+  HypInfo() : g(-100.0f) {}
+  // Copies the hypothesis and features, then scores the hypothesis with the
+  // given segment scorer and converts the statistics to a score via metric.
+  HypInfo(const vector<WordID>& h,
+          const SparseVector<weight_t>& feats,
+          const SegmentEvaluator& scorer, const EvaluationMetric* metric) : hyp(h), x(feats) {
+    SufficientStats ss;
+    scorer.Evaluate(hyp, &ss);
+    g = metric->ComputeScore(ss);
+  }
+
+  vector<WordID> hyp;        // hypothesis word ids
+  float g;                   // metric score returned by ComputeScore
+  SparseVector<weight_t> x;  // feature values of the derivation
+};
+
+// Picks the k-best entry maximising  w . x + alpha * g  and stores its feature
+// vector in *fmap. Ties keep the earliest entry.
+//   kbest - candidate hypotheses for one sentence
+//   w     - current weight vector
+//   alpha - signed cost scale (positive rewards high metric score, negative
+//           rewards low metric score)
+//   fmap  - output: features of the selected hypothesis
+// An empty k-best list (no derivation could be extracted) yields an empty
+// feature vector instead of reading kbest[0] out of bounds.
+void CostAugmentedSearch(const vector<HypInfo>& kbest,
+                         const SparseVector<double>& w,
+                         double alpha,
+                         SparseVector<double>* fmap) {
+  if (kbest.empty()) {
+    *fmap = SparseVector<double>();
+    return;
+  }
+  unsigned best_i = 0;
+  double best = -numeric_limits<double>::infinity();
+  for (unsigned i = 0; i < kbest.size(); ++i) {
+    double s = kbest[i].x.dot(w) + alpha * kbest[i].g;
+    if (s > best) {
+      best = s;
+      best_i = i;
+    }
+  }
+  *fmap = kbest[best_i].x;
+}
+
+// runs lines 4--15 of rampion algorithm
+//
+// Input (--input): one "<hypergraph file> <sentence id>" pair per line.
+// For every line the hypergraph is loaded, reweighted with the current
+// weights and its k-best derivations are scored against the references.
+// The weights are then updated by nested CCCP / stochastic subgradient loops
+// and written to stdout.
+// Returns 1 if an input line is malformed or names a sentence id outside the
+// loaded reference set.
+int main(int argc, char** argv) {
+  po::variables_map conf;
+  InitCommandLine(argc, argv, &conf);
+  const string evaluation_metric = conf["evaluation_metric"].as<string>();
+
+  EvaluationMetric* metric = EvaluationMetric::Instance(evaluation_metric);
+  DocumentScorer ds(metric, conf["reference"].as<vector<string> >());
+  cerr << "Loaded " << ds.size() << " references for scoring with " << evaluation_metric << endl;
+  // For error metrics (lower is better) the sign of the cost term is flipped.
+  double goodsign = 1;
+  if (metric->IsErrorMetric()) goodsign = -goodsign;
+  double badsign = -goodsign;
+
+  Hypergraph hg;
+  ReadFile in_read(conf["input"].as<string>());
+  istream &in=*in_read.stream();
+  const unsigned kbest_size = conf["kbest_size"].as<unsigned>();
+  const unsigned tp = conf["cccp_iterations"].as<unsigned>();
+  const unsigned tpp = conf["ssd_iterations"].as<unsigned>();
+  const double eta = conf["eta"].as<double>();
+  const double reg = conf["regularization_strength"].as<double>();
+  const double alpha = conf["alpha"].as<double>();
+  SparseVector<weight_t> weights;
+  {
+    vector<weight_t> vweights;
+    const string weightsf = conf["weights"].as<string>();
+    Weights::InitFromFile(weightsf, &vweights);
+    Weights::InitSparseVector(vweights, &weights);
+  }
+  string line, file;
+  vector<vector<HypInfo> > kis;
+  cerr << "Loading hypergraphs...\n";
+  while(getline(in, line)) {
+    if (line.empty()) continue;  // tolerate blank lines
+    istringstream is(line);
+    int sent_id = -1;
+    if (!(is >> file >> sent_id)) {
+      cerr << "Malformed input line (expected: <hypergraph file> <sentence id>): " << line << endl;
+      return 1;
+    }
+    // Assumes ds.size() is the number of segments ds[] can be indexed with.
+    if (sent_id < 0 || static_cast<size_t>(sent_id) >= static_cast<size_t>(ds.size())) {
+      cerr << "Sentence id " << sent_id << " is outside the loaded references: " << line << endl;
+      return 1;
+    }
+    kis.resize(kis.size() + 1);
+    vector<HypInfo>& curkbest = kis.back();
+    ReadFile rf(file);
+    HypergraphIO::ReadFromJSON(rf.stream(), &hg);
+    hg.Reweight(weights);
+    KBest::KBestDerivations<vector<WordID>, ESentenceTraversal> kbest(hg, kbest_size);
+
+    for (int i = 0; i < kbest_size; ++i) {
+      const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal>::Derivation* d =
+        kbest.LazyKthBest(hg.nodes_.size() - 1, i);
+      if (!d) break;  // fewer than kbest_size derivations available
+      curkbest.push_back(HypInfo(d->yield, d->feature_values, *ds[sent_id], metric));
+    }
+  }
+
+  cerr << "Hypergraphs loaded.\n";
+  vector<SparseVector<weight_t> > goals(kis.size());  // f(x_i,y+,h+)
+  SparseVector<weight_t> fear;  // f(x,y-,h-)
+  for (unsigned iterp = 1; iterp <= tp; ++iterp) {
+    cerr << "CCCP Iteration " << iterp << endl;
+    // Fix the "hope" derivation of every sentence for this outer iteration.
+    for (size_t i = 0; i < goals.size(); ++i)
+      CostAugmentedSearch(kis[i], weights, goodsign * alpha, &goals[i]);
+    for (unsigned iterpp = 1; iterpp <= tpp; ++iterpp) {
+      cerr << "  SSD Iteration " << iterpp << endl;
+      for (size_t i = 0; i < goals.size(); ++i) {
+        CostAugmentedSearch(kis[i], weights, badsign * alpha, &fear);
+        // L2 shrinkage spread over the sentences, then the hope-minus-fear step.
+        weights -= weights * (eta * reg / goals.size());
+        weights += (goals[i] - fear) * eta;
+      }
+    }
+  }
+  vector<weight_t> w;
+  weights.init_vector(&w);
+  Weights::WriteToFile("-", w);
+  return 0;
+}
+
diff --git a/rampion/rampion_generate_input.pl b/rampion/rampion_generate_input.pl
new file mode 100755
index 00000000..b30fc4fd
--- /dev/null
+++ b/rampion/rampion_generate_input.pl
@@ -0,0 +1,18 @@
+#!/usr/bin/perl -w
+use strict;
+
+# Lists the gzipped hypergraphs found in HG_DIR, one per line, as
+#   <HG_DIR>/<file name> <id>
+# where <id> is the file name without its trailing ".gz" / ".json.gz".
+# Lines are emitted in a fixed order (numeric ids ascending, then any other
+# names alphabetically) so that repeated runs over the same directory produce
+# the same training order; readdir alone returns entries in arbitrary order.
+
+die "Usage: $0 HG_DIR\n" unless scalar @ARGV == 1;
+my $d = shift @ARGV;
+die "Can't find directory $d" unless -d $d;
+
+opendir(DIR, $d) or die "Can't read $d: $!";
+my @hgs = grep { /\.gz$/ } readdir(DIR);
+closedir DIR;
+
+# Strip the suffix only at the end of the name.
+my %id_of;
+for my $hg (@hgs) {
+  (my $id = $hg) =~ s/(\.json)?\.gz\z//;
+  $id_of{$hg} = $id;
+}
+
+# Numeric ids first (in numeric order), everything else by string comparison.
+sub by_id {
+  my ($x, $y) = ($id_of{$a}, $id_of{$b});
+  my $x_rank = ($x =~ /^\d+\z/) ? 0 : 1;
+  my $y_rank = ($y =~ /^\d+\z/) ? 0 : 1;
+  return $x_rank <=> $y_rank if $x_rank != $y_rank;
+  return (($x <=> $y) || ($x cmp $y)) if $x_rank == 0;
+  return $x cmp $y;
+}
+
+for my $hg (sort by_id @hgs) {
+  print "$d/$hg $id_of{$hg}\n";
+}
+
diff --git a/utils/city.cc b/utils/city.cc
new file mode 100644
index 00000000..f1301ce4
--- /dev/null
+++ b/utils/city.cc
@@ -0,0 +1,467 @@
+// Copyright (c) 2011 Google, Inc.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+//
+// CityHash, by Geoff Pike and Jyrki Alakuijala
+//
+// This file provides CityHash64() and related functions.
+//
+// It's probably possible to create even faster hash functions by
+// writing a program that systematically explores some of the space of
+// possible hash functions, by using SIMD instructions, or by
+// compromising on hash quality.
+
+#include "config.h"
+#include <city.h>
+
+#include <algorithm>
+#include <string.h>  // for memcpy and memset
+
+using namespace std;
+
+// Reads 8 bytes starting at p into a uint64 via memcpy, so p does not need to
+// be aligned. Byte order is whatever the host uses.
+static uint64 UNALIGNED_LOAD64(const char *p) {
+  uint64 loaded;
+  memcpy(&loaded, p, sizeof(loaded));
+  return loaded;
+}
+
+// Reads 4 bytes starting at p into a uint32 via memcpy, so p does not need to
+// be aligned. Byte order is whatever the host uses.
+static uint32 UNALIGNED_LOAD32(const char *p) {
+  uint32 loaded;
+  memcpy(&loaded, p, sizeof(loaded));
+  return loaded;
+}
+
+#if !defined(WORDS_BIGENDIAN)
+
+#define uint32_in_expected_order(x) (x)
+#define uint64_in_expected_order(x) (x)
+
+#else
+
+#ifdef _MSC_VER
+#include <stdlib.h>
+#define bswap_32(x) _byteswap_ulong(x)
+#define bswap_64(x) _byteswap_uint64(x)
+
+#elif defined(__APPLE__)
+// Mac OS X / Darwin features
+#include <libkern/OSByteOrder.h>
+#define bswap_32(x) OSSwapInt32(x)
+#define bswap_64(x) OSSwapInt64(x)
+
+#else
+#include <byteswap.h>
+#endif
+
+#define uint32_in_expected_order(x) (bswap_32(x))
+#define uint64_in_expected_order(x) (bswap_64(x))
+
+#endif  // WORDS_BIGENDIAN
+
+#if !defined(LIKELY)
+#if HAVE_BUILTIN_EXPECT
+#define LIKELY(x) (__builtin_expect(!!(x), 1))
+#else
+#define LIKELY(x) (x)
+#endif
+#endif
+
+// Loads 8 bytes from p and byte-swaps them on big-endian builds.
+static uint64 Fetch64(const char *p) {
+  const uint64 raw = UNALIGNED_LOAD64(p);
+  return uint64_in_expected_order(raw);
+}
+
+// Loads 4 bytes from p and byte-swaps them on big-endian builds.
+static uint32 Fetch32(const char *p) {
+  const uint32 raw = UNALIGNED_LOAD32(p);
+  return uint32_in_expected_order(raw);
+}
+
+// Some primes between 2^63 and 2^64 for various uses.
+static const uint64 k0 = 0xc3a5c85c97cb3127ULL;
+static const uint64 k1 = 0xb492b66fbe98f273ULL;
+static const uint64 k2 = 0x9ae16a3b2f90404fULL;
+static const uint64 k3 = 0xc949d7c7509e6557ULL;
+
+// Bitwise right rotate of a 64-bit value. A shift of 0 is returned unchanged,
+// because the general formula would shift left by 64, which is undefined.
+static uint64 Rotate(uint64 val, int shift) {
+  if (shift == 0) {
+    return val;
+  }
+  return (val >> shift) | (val << (64 - shift));
+}
+
+// Equivalent to Rotate(), but requires the second arg to be non-zero.
+// On x86-64, and probably others, it's possible for this to compile
+// to a single instruction if both args are already in registers.
+// There is no zero check here: with shift == 0 the left shift would be by 64.
+static uint64 RotateByAtLeast1(uint64 val, int shift) {
+  return (val >> shift) | (val << (64 - shift));
+}
+
+// XORs the value with itself shifted right by 47 bits.
+static uint64 ShiftMix(uint64 val) {
+  const uint64 shifted = val >> 47;
+  return val ^ shifted;
+}
+
+// Hashes two 64-bit values down to one by packing them into a uint128 and
+// passing it to Hash128to64().
+static uint64 HashLen16(uint64 u, uint64 v) {
+  const uint128 packed(u, v);
+  return Hash128to64(packed);
+}
+
+// Hashes a short string; the name indicates it is meant for 0 to 16 bytes.
+// Four cases, chosen by length:
+//   len > 8      : two (possibly overlapping) 8-byte reads, first and last
+//   4 <= len <= 8: two (possibly overlapping) 4-byte reads, first and last
+//   1 <= len <= 3: first, middle and last byte
+//   len == 0     : the constant k2
+static uint64 HashLen0to16(const char *s, size_t len) {
+  if (len > 8) {
+    uint64 a = Fetch64(s);
+    uint64 b = Fetch64(s + len - 8);
+    // len is non-zero in this branch, as RotateByAtLeast1 requires.
+    return HashLen16(a, RotateByAtLeast1(b + len, len)) ^ b;
+  }
+  if (len >= 4) {
+    uint64 a = Fetch32(s);
+    return HashLen16(len + (a << 3), Fetch32(s + len - 4));
+  }
+  if (len > 0) {
+    uint8 a = s[0];
+    uint8 b = s[len >> 1];
+    uint8 c = s[len - 1];
+    uint32 y = static_cast<uint32>(a) + (static_cast<uint32>(b) << 8);
+    uint32 z = len + (static_cast<uint32>(c) << 2);
+    return ShiftMix(y * k2 ^ z * k3) * k2;
+  }
+  return k2;
+}
+
+// This probably works well for 16-byte strings as well, but it may be overkill
+// in that case.
+// Reads the first 16 and the last 16 bytes of s (the two ranges overlap when
+// len < 32) and mixes the four 64-bit words together with len.
+static uint64 HashLen17to32(const char *s, size_t len) {
+  uint64 a = Fetch64(s) * k1;
+  uint64 b = Fetch64(s + 8);
+  uint64 c = Fetch64(s + len - 8) * k2;
+  uint64 d = Fetch64(s + len - 16) * k0;
+  return HashLen16(Rotate(a - b, 43) + Rotate(c, 30) + d,
+                   a + Rotate(b ^ k3, 20) - c + len);
+}
+
+// Return a 16-byte hash for 48 bytes.  Quick and dirty.
+// Callers do best to use "random-looking" values for a and b.
+// The 48 bytes are the four data words w, x, y, z plus the two seeds a and b;
+// the result is returned as a pair of 64-bit halves.
+static pair<uint64, uint64> WeakHashLen32WithSeeds(
+    uint64 w, uint64 x, uint64 y, uint64 z, uint64 a, uint64 b) {
+  a += w;
+  b = Rotate(b + a + z, 21);
+  uint64 c = a;  // remember seed + first word before the rest is added
+  a += x;
+  a += y;
+  b += Rotate(a, 44);
+  return make_pair(a + z, b + c);
+}
+
+// Return a 16-byte hash for s[0] ... s[31], a, and b.  Quick and dirty.
+// Loads the 32 bytes as four 64-bit words and defers to the word-based
+// overload.
+static pair<uint64, uint64> WeakHashLen32WithSeeds(
+    const char* s, uint64 a, uint64 b) {
+  const uint64 word0 = Fetch64(s);
+  const uint64 word1 = Fetch64(s + 8);
+  const uint64 word2 = Fetch64(s + 16);
+  const uint64 word3 = Fetch64(s + 24);
+  return WeakHashLen32WithSeeds(word0, word1, word2, word3, a, b);
+}
+
+// Return an 8-byte hash for 33 to 64 bytes.
+// Mixes the first 32 bytes into (vf, vs) and the last 32 bytes into (wf, ws),
+// then combines the four values. The statement order matters: a, b, c and z
+// are reused as running state.
+static uint64 HashLen33to64(const char *s, size_t len) {
+  // First half: bytes 0..31 (plus one word from near the end).
+  uint64 z = Fetch64(s + 24);
+  uint64 a = Fetch64(s) + (len + Fetch64(s + len - 16)) * k0;
+  uint64 b = Rotate(a + z, 52);
+  uint64 c = Rotate(a, 37);
+  a += Fetch64(s + 8);
+  c += Rotate(a, 7);
+  a += Fetch64(s + 16);
+  uint64 vf = a + z;
+  uint64 vs = b + Rotate(a, 31) + c;
+  // Second half: the last 32 bytes, addressed relative to the end.
+  a = Fetch64(s + 16) + Fetch64(s + len - 32);
+  z = Fetch64(s + len - 8);
+  b = Rotate(a + z, 52);
+  c = Rotate(a, 37);
+  a += Fetch64(s + len - 24);
+  c += Rotate(a, 7);
+  a += Fetch64(s + len - 16);
+  uint64 wf = a + z;
+  uint64 ws = b + Rotate(a, 31) + c;
+  // Final combination of both halves.
+  uint64 r = ShiftMix((vf + ws) * k2 + (wf + vs) * k0);
+  return ShiftMix(r * k0 + vs) * k2;
+}
+
+// Returns a 64-bit hash of the len bytes starting at s.
+// Inputs of up to 64 bytes are dispatched to the length-specific helpers
+// (0-16, 17-32, 33-64); longer inputs are processed in 64-byte chunks.
+uint64 CityHash64(const char *s, size_t len) {
+  if (len <= 32) {
+    if (len <= 16) {
+      return HashLen0to16(s, len);
+    } else {
+      return HashLen17to32(s, len);
+    }
+  } else if (len <= 64) {
+    return HashLen33to64(s, len);
+  }
+
+  // For strings over 64 bytes we hash the end first, and then as we
+  // loop we keep 56 bytes of state: v, w, x, y, and z.
+  uint64 x = Fetch64(s + len - 40);
+  uint64 y = Fetch64(s + len - 16) + Fetch64(s + len - 56);
+  uint64 z = HashLen16(Fetch64(s + len - 48) + len, Fetch64(s + len - 24));
+  pair<uint64, uint64> v = WeakHashLen32WithSeeds(s + len - 64, len, z);
+  pair<uint64, uint64> w = WeakHashLen32WithSeeds(s + len - 32, y + k1, x);
+  x = x * k1 + Fetch64(s);
+
+  // Decrease len to the nearest multiple of 64, and operate on 64-byte chunks.
+  // (len > 64 here, so the rounded value is at least 64 and the do/while
+  // below terminates when it reaches 0.)
+  len = (len - 1) & ~static_cast<size_t>(63);
+  do {
+    x = Rotate(x + y + v.first + Fetch64(s + 8), 37) * k1;
+    y = Rotate(y + v.second + Fetch64(s + 48), 42) * k1;
+    x ^= w.second;
+    y += v.first + Fetch64(s + 40);
+    z = Rotate(z + w.first, 33) * k1;
+    v = WeakHashLen32WithSeeds(s, v.second * k1, x + w.first);
+    w = WeakHashLen32WithSeeds(s + 32, z + w.second, y + Fetch64(s + 16));
+    std::swap(z, x);
+    s += 64;
+    len -= 64;
+  } while (len != 0);
+  return HashLen16(HashLen16(v.first, w.first) + ShiftMix(y) * k1 + z,
+                   HashLen16(v.second, w.second) + x);
+}
+
+// Single-seed variant: uses k2 as the first seed and the caller's seed as the
+// second.
+uint64 CityHash64WithSeed(const char *s, size_t len, uint64 seed) {
+  const uint64 fixed_first_seed = k2;
+  return CityHash64WithSeeds(s, len, fixed_first_seed, seed);
+}
+
+// Two-seed variant: subtracts seed0 from the unseeded hash and mixes the
+// result with seed1.
+uint64 CityHash64WithSeeds(const char *s, size_t len,
+                           uint64 seed0, uint64 seed1) {
+  const uint64 unseeded = CityHash64(s, len);
+  return HashLen16(unseeded - seed0, seed1);
+}
+
+// A subroutine for CityHash128().  Returns a decent 128-bit hash for strings
+// of any length representable in signed long.  Based on City and Murmur.
+static uint128 CityMurmur(const char *s, size_t len, uint128 seed) {
+  uint64 a = Uint128Low64(seed);
+  uint64 b = Uint128High64(seed);
+  uint64 c = 0;
+  uint64 d = 0;
+  signed long l = len - 16;  // Signed so that len <= 16 gives l <= 0.
+  if (l <= 0) {  // len <= 16
+    a = ShiftMix(a * k1) * k1;
+    c = b * k1 + HashLen0to16(s, len);
+    d = ShiftMix(a + (len >= 8 ? Fetch64(s) : c));  // Fetch only if 8 bytes.
+  } else {  // len > 16
+    c = HashLen16(Fetch64(s + len - 8) + k1, a);  // Uses the last 8 bytes.
+    d = HashLen16(b + len, c + Fetch64(s + len - 16));
+    a += d;
+    do {  // Mix 16 bytes per pass, starting from the front of s.
+      a ^= ShiftMix(Fetch64(s) * k1) * k1;
+      a *= k1;
+      b ^= a;
+      c ^= ShiftMix(Fetch64(s + 8) * k1) * k1;
+      c *= k1;
+      d ^= c;
+      s += 16;
+      l -= 16;
+    } while (l > 0);
+  }
+  a = HashLen16(a, c);  // Final cross-mix of the four state words.
+  b = HashLen16(d, b);
+  return uint128(a ^ b, HashLen16(b, a));
+}
+
+uint128 CityHash128WithSeed(const char *s, size_t len, uint128 seed) {
+  if (len < 128) {  // Short inputs take the CityMurmur path instead.
+    return CityMurmur(s, len, seed);
+  }
+
+  // We expect len >= 128 to be the common case.  Keep 56 bytes of state:
+  // v, w, x, y, and z.
+  pair<uint64, uint64> v, w;
+  uint64 x = Uint128Low64(seed);
+  uint64 y = Uint128High64(seed);
+  uint64 z = len * k1;
+  v.first = Rotate(y ^ k1, 49) * k1 + Fetch64(s);
+  v.second = Rotate(v.first, 42) * k1 + Fetch64(s + 8);
+  w.first = Rotate(y + z, 35) * k1 + x;
+  w.second = Rotate(x + Fetch64(s + 88), 53) * k1;
+
+  // This is the same inner loop as CityHash64(), manually unrolled.
+  do {  // Each pass consumes 128 bytes as two 64-byte halves.
+    x = Rotate(x + y + v.first + Fetch64(s + 8), 37) * k1;
+    y = Rotate(y + v.second + Fetch64(s + 48), 42) * k1;
+    x ^= w.second;
+    y += v.first + Fetch64(s + 40);
+    z = Rotate(z + w.first, 33) * k1;
+    v = WeakHashLen32WithSeeds(s, v.second * k1, x + w.first);
+    w = WeakHashLen32WithSeeds(s + 32, z + w.second, y + Fetch64(s + 16));
+    std::swap(z, x);
+    s += 64;  // End of the first 64-byte half.
+    x = Rotate(x + y + v.first + Fetch64(s + 8), 37) * k1;
+    y = Rotate(y + v.second + Fetch64(s + 48), 42) * k1;
+    x ^= w.second;
+    y += v.first + Fetch64(s + 40);
+    z = Rotate(z + w.first, 33) * k1;
+    v = WeakHashLen32WithSeeds(s, v.second * k1, x + w.first);
+    w = WeakHashLen32WithSeeds(s + 32, z + w.second, y + Fetch64(s + 16));
+    std::swap(z, x);
+    s += 64;  // End of the second 64-byte half.
+    len -= 128;
+  } while (LIKELY(len >= 128));
+  x += Rotate(v.first + z, 49) * k0;
+  z += Rotate(w.first, 37) * k0;
+  // If 0 < len < 128, hash up to 4 chunks of 32 bytes each from the end of s.
+  for (size_t tail_done = 0; tail_done < len; ) {
+    tail_done += 32;  // Chunks are taken back to front, 32 bytes at a time.
+    y = Rotate(x + y, 42) * k0 + v.second;
+    w.first += Fetch64(s + len - tail_done + 16);
+    x = x * k0 + w.first;
+    z += w.second + Fetch64(s + len - tail_done);  // May start before s.
+    w.second += v.first;
+    v = WeakHashLen32WithSeeds(s + len - tail_done, v.first + z, v.second);
+  }
+  // At this point our 56 bytes of state should contain more than
+  // enough information for a strong 128-bit hash.  We use two
+  // different 56-byte-to-8-byte hashes to get a 16-byte final result.
+  x = HashLen16(x, v.first);
+  y = HashLen16(y + z, w.first);
+  return uint128(HashLen16(x + v.second, w.second) + y,
+                 HashLen16(x + w.second, y + v.second));
+}
+
+// Unseeded 128-bit hash.  Builds a seed from the input's own bytes where
+// enough are available, then defers to CityHash128WithSeed().
+uint128 CityHash128(const char *s, size_t len) {
+  // Fewer than 8 bytes: nothing to borrow, so use a constant seed.
+  if (len < 8) {
+    return CityHash128WithSeed(s, len, uint128(k0, k1));
+  }
+  // 8..15 bytes: the whole input goes into the seed (the first and last
+  // 8 bytes, which may overlap) and an empty string is hashed.
+  if (len < 16) {
+    const uint64 seed_first = Fetch64(s) ^ (len * k0);
+    const uint64 seed_second = Fetch64(s + len - 8) ^ k1;
+    return CityHash128WithSeed(NULL, 0, uint128(seed_first, seed_second));
+  }
+  // 16 bytes or more: the first 16 bytes become the seed and the remainder
+  // is hashed.
+  const uint64 seed_first = Fetch64(s) ^ k3;
+  const uint64 seed_second = Fetch64(s + 8);
+  return CityHash128WithSeed(s + 16, len - 16,
+                             uint128(seed_first, seed_second));
+}
+
+#ifdef __SSE4_2__
+#include <citycrc.h>
+#include <nmmintrin.h>
+
+// Requires len >= 240.  Writes the hash to result[0] ... result[3].
+static void CityHashCrc256Long(const char *s, size_t len,
+                               uint32 seed, uint64 *result) {
+  uint64 a = Fetch64(s + 56) + k0;
+  uint64 b = Fetch64(s + 96) + k0;
+  uint64 c = result[0] = HashLen16(b, len);
+  uint64 d = result[1] = Fetch64(s + 120) * k0 + len;
+  uint64 e = Fetch64(s + 184) + seed;
+  uint64 f = seed;
+  uint64 g = 0;
+  uint64 h = 0;
+  uint64 i = 0;
+  uint64 j = 0;
+  uint64 t = c + d;
+
+  // 240 bytes of input per iter: six CHUNKs, each consuming 40 bytes.
+  size_t iters = len / 240;
+  len -= iters * 240;  // len is now the leftover tail, < 240.
+  do {  // Each CHUNK updates a..e and t, then folds a..e into CRCs f..j.
+#define CHUNK(multiplier, z)                                    \
+    {                                                           \
+      uint64 old_a = a;                                         \
+      a = Rotate(b, 41 ^ z) * multiplier + Fetch64(s);          \
+      b = Rotate(c, 27 ^ z) * multiplier + Fetch64(s + 8);      \
+      c = Rotate(d, 41 ^ z) * multiplier + Fetch64(s + 16);     \
+      d = Rotate(e, 33 ^ z) * multiplier + Fetch64(s + 24);     \
+      e = Rotate(t, 25 ^ z) * multiplier + Fetch64(s + 32);     \
+      t = old_a;                                                \
+    }                                                           \
+    f = _mm_crc32_u64(f, a);                                    \
+    g = _mm_crc32_u64(g, b);                                    \
+    h = _mm_crc32_u64(h, c);                                    \
+    i = _mm_crc32_u64(i, d);                                    \
+    j = _mm_crc32_u64(j, e);                                    \
+    s += 40
+
+    CHUNK(1, 1); CHUNK(k0, 0);
+    CHUNK(1, 1); CHUNK(k0, 0);
+    CHUNK(1, 1); CHUNK(k0, 0);
+  } while (--iters > 0);  // CHUNK stays defined; the tail code reuses it.
+
+  while (len >= 40) {  // Whole 40-byte chunks of the tail.
+    CHUNK(k0, 0);
+    len -= 40;
+  }
+  if (len > 0) {  // Partial chunk: re-read the final 40 bytes of the input.
+    s = s + len - 40;
+    CHUNK(k0, 0);
+  }
+  j += i << 32;  // Finalization: combine CRCs f..j with a..e into result[].
+  a = HashLen16(a, j);
+  h += g << 32;
+  b += h;
+  c = HashLen16(c, f) + i;
+  d = HashLen16(d, e + result[0]);
+  j += e;
+  i += HashLen16(h, t);
+  e = HashLen16(a, d) + j;
+  f = HashLen16(b, c) + a;
+  g = HashLen16(j, i) + c;
+  result[0] = e + f + g + h;
+  a = ShiftMix((a + g) * k0) * k0 + b;
+  result[1] += a + result[0];  // result[1] still holds its initial value.
+  a = ShiftMix(a * k0) * k0 + c;
+  result[2] = a + result[1];
+  a = ShiftMix((a + e) * k0) * k0;
+  result[3] = a + result[2];
+}
+
+// Requires len < 240.  Copies the input into a zero-padded 240-byte scratch
+// buffer and hashes that buffer with CityHashCrc256Long(), passing the
+// bitwise complement of the original length as the seed.
+static void CityHashCrc256Short(const char *s, size_t len, uint64 *result) {
+  char padded[240];
+  const size_t padded_size = sizeof(padded);
+  memcpy(padded, s, len);
+  memset(padded + len, 0, padded_size - len);
+  const uint32 length_seed = ~static_cast<uint32>(len);
+  CityHashCrc256Long(padded, padded_size, length_seed, result);
+}
+
+// Entry point for the CRC-based hash: picks the long or short implementation
+// depending on whether the input reaches 240 bytes.  The long path is seeded
+// with 0.
+void CityHashCrc256(const char *s, size_t len, uint64 *result) {
+  if (LIKELY(len >= 240)) {
+    CityHashCrc256Long(s, len, 0, result);
+    return;
+  }
+  CityHashCrc256Short(s, len, result);
+}
+
+// Seeded 128-bit hash.  Inputs of at most 900 bytes are delegated to
+// CityHash128WithSeed(); longer inputs are hashed with CityHashCrc256() and
+// the four resulting words are combined with the two seed halves.
+uint128 CityHashCrc128WithSeed(const char *s, size_t len, uint128 seed) {
+  if (len <= 900) {
+    return CityHash128WithSeed(s, len, seed);
+  }
+  uint64 crc256[4];
+  CityHashCrc256(s, len, crc256);
+  const uint64 high_mix = Uint128High64(seed) + crc256[0];
+  const uint64 low_mix = Uint128Low64(seed) + crc256[1];
+  const uint64 first = HashLen16(high_mix, low_mix + crc256[2]);
+  const uint64 second =
+      HashLen16(Rotate(low_mix, 32), high_mix * k0 + crc256[3]);
+  return uint128(first, second);
+}
+
+// Unseeded 128-bit hash.  Inputs of at most 900 bytes are delegated to
+// CityHash128(); longer inputs return words 2 and 3 of the CityHashCrc256()
+// result.
+uint128 CityHashCrc128(const char *s, size_t len) {
+  if (len <= 900) {
+    return CityHash128(s, len);
+  }
+  uint64 crc256[4];
+  CityHashCrc256(s, len, crc256);
+  return uint128(crc256[2], crc256[3]);
+}
+
+#endif
diff --git a/utils/city.h b/utils/city.h
new file mode 100644
index 00000000..c2ab352c
--- /dev/null
+++ b/utils/city.h
@@ -0,0 +1,90 @@
+// Copyright (c) 2011 Google, Inc.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+//
+// CityHash, by Geoff Pike and Jyrki Alakuijala
+//
+// This file provides a few functions for hashing strings. On x86-64
+// hardware in 2011, CityHash64() is faster than other high-quality
+// hash functions, such as Murmur.  This is largely due to higher
+// instruction-level parallelism.  CityHash64() and CityHash128() also perform
+// well on hash-quality tests.
+//
+// CityHash128() is optimized for relatively long strings and returns
+// a 128-bit hash.  For strings more than about 2000 bytes it can be
+// faster than CityHash64().
+//
+// Functions in the CityHash family are not suitable for cryptography.
+//
+// WARNING: This code has not been tested on big-endian platforms!
+// It is known to work well on little-endian platforms that have a small penalty
+// for unaligned reads, such as current Intel and AMD moderate-to-high-end CPUs.
+//
+// By the way, for some hash functions, given strings a and b, the hash
+// of a+b is easily derived from the hashes of a and b.  This property
+// doesn't hold for any hash functions in this file.
+
+#ifndef CITY_HASH_H_
+#define CITY_HASH_H_
+
+#include <stdlib.h>  // for size_t.
+#include <stdint.h>
+#include <utility>
+
+typedef uint8_t uint8;  // Short aliases for the <stdint.h> fixed-width types.
+typedef uint32_t uint32;
+typedef uint64_t uint64;
+typedef std::pair<uint64, uint64> uint128;  // first = low half, second = high.
+
+inline uint64 Uint128Low64(const uint128& x) { return x.first; }
+inline uint64 Uint128High64(const uint128& x) { return x.second; }
+
+// Hash function for a byte array: 64-bit hash of the len bytes at buf.
+uint64 CityHash64(const char *buf, size_t len);
+
+// Hash function for a byte array.  For convenience, a 64-bit seed is also
+// hashed into the result; forwards to CityHash64WithSeeds().
+uint64 CityHash64WithSeed(const char *buf, size_t len, uint64 seed);
+
+// Hash function for a byte array.  For convenience, two seeds are also
+// hashed into the result; they are mixed with CityHash64(buf, len).
+uint64 CityHash64WithSeeds(const char *buf, size_t len,
+                           uint64 seed0, uint64 seed1);
+
+// Hash function for a byte array: 128-bit hash of the len bytes at s.
+uint128 CityHash128(const char *s, size_t len);
+
+// Hash function for a byte array.  For convenience, a 128-bit seed is also
+// hashed into the result.
+uint128 CityHash128WithSeed(const char *s, size_t len, uint128 seed);
+
+// Reduces a 128-bit value to 64 bits using two Murmur-inspired rounds of
+// multiply and xor-shift.  Intended to be a reasonably good hash function.
+inline uint64 Hash128to64(const uint128& x) {
+  const uint64 kMul = 0x9ddfea08eb382d69ULL;
+  const uint64 low = Uint128Low64(x);
+  const uint64 high = Uint128High64(x);
+  // First round: combine both halves.
+  uint64 mixed = (low ^ high) * kMul;
+  mixed ^= mixed >> 47;
+  // Second round: fold the high half back in, then scale once more.
+  uint64 folded = (high ^ mixed) * kMul;
+  folded ^= folded >> 47;
+  return folded * kMul;
+}
+
+#endif  // CITY_HASH_H_
diff --git a/utils/citycrc.h b/utils/citycrc.h
new file mode 100644
index 00000000..318e3917
--- /dev/null
+++ b/utils/citycrc.h
@@ -0,0 +1,43 @@
+// Copyright (c) 2011 Google, Inc.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+//
+// CityHash, by Geoff Pike and Jyrki Alakuijala
+//
+// This file declares the subset of the CityHash functions that require
+// _mm_crc32_u64().  See the CityHash README for details.
+//
+// Functions in the CityHash family are not suitable for cryptography.
+
+#ifndef CITY_HASH_CRC_H_
+#define CITY_HASH_CRC_H_
+
+#include <city.h>
+
+// Hash function for a byte array.  Inputs of <= 900 bytes use CityHash128().
+uint128 CityHashCrc128(const char *s, size_t len);
+
+// Hash function for a byte array.  For convenience, a 128-bit seed is also
+// hashed in.  Inputs of <= 900 bytes use CityHash128WithSeed().
+uint128 CityHashCrc128WithSeed(const char *s, size_t len, uint128 seed);
+
+// Hash function for a byte array.  Sets result[0] ... result[3].
+void CityHashCrc256(const char *s, size_t len, uint64 *result);
+
+#endif  // CITY_HASH_CRC_H_
author	Chris Dyer <cdyer@cs.cmu.edu>	2012-04-26 00:06:09 -0400
committer	Chris Dyer <cdyer@cs.cmu.edu>	2012-04-26 00:06:09 -0400
commit	81578ddd4a32ee06d964bd7b5740ca61f76d5bc1 (patch)
tree	aaded6c267d81370e98e46363a2b76b70d4f975a
parent	63945135627f41ed0c81e647db79bfe2eba4bf5c (diff)