diff options
author | Chris Dyer <cdyer@cs.cmu.edu> | 2012-04-26 00:06:09 -0400 |
---|---|---|
committer | Chris Dyer <cdyer@cs.cmu.edu> | 2012-04-26 00:06:09 -0400 |
commit | 81578ddd4a32ee06d964bd7b5740ca61f76d5bc1 (patch) | |
tree | aaded6c267d81370e98e46363a2b76b70d4f975a | |
parent | 63945135627f41ed0c81e647db79bfe2eba4bf5c (diff) |
mostly implemented rampion optimizer
-rw-r--r-- | Makefile.am | 2 | ||||
-rw-r--r-- | configure.ac | 2 | ||||
-rw-r--r-- | rampion/Makefile.am | 6 | ||||
-rwxr-xr-x | rampion/rampion.pl | 526 | ||||
-rw-r--r-- | rampion/rampion_cccp.cc | 157 | ||||
-rwxr-xr-x | rampion/rampion_generate_input.pl | 18 | ||||
-rw-r--r-- | utils/city.cc | 467 | ||||
-rw-r--r-- | utils/city.h | 90 | ||||
-rw-r--r-- | utils/citycrc.h | 43 |
9 files changed, 1309 insertions, 2 deletions
diff --git a/Makefile.am b/Makefile.am index 2ecb60df..b5cba524 100644 --- a/Makefile.am +++ b/Makefile.am @@ -1,7 +1,7 @@ # warning - the subdirectories in the following list should # be kept in topologically sorted order. Also, DO NOT introduce # cyclic dependencies between these directories! -SUBDIRS = utils mteval klm/util klm/lm decoder phrasinator training mira dpmert pro-train extools gi/pf gi/markov_al rst_parser +SUBDIRS = utils mteval klm/util klm/lm decoder phrasinator training mira dpmert pro-train rampion extools gi/pf gi/markov_al rst_parser #gi/pyp-topics/src gi/clda/src gi/posterior-regularisation/prjava diff --git a/configure.ac b/configure.ac index f03f97f0..81773e08 100644 --- a/configure.ac +++ b/configure.ac @@ -130,4 +130,4 @@ then AM_CONDITIONAL([GLC], true) fi -AC_OUTPUT(Makefile rst_parser/Makefile utils/Makefile mteval/Makefile extools/Makefile decoder/Makefile phrasinator/Makefile training/Makefile dpmert/Makefile pro-train/Makefile klm/util/Makefile klm/lm/Makefile mira/Makefile gi/pyp-topics/src/Makefile gi/clda/src/Makefile gi/pf/Makefile gi/markov_al/Makefile) +AC_OUTPUT(Makefile rst_parser/Makefile utils/Makefile mteval/Makefile extools/Makefile decoder/Makefile phrasinator/Makefile training/Makefile dpmert/Makefile pro-train/Makefile rampion/Makefile klm/util/Makefile klm/lm/Makefile mira/Makefile gi/pyp-topics/src/Makefile gi/clda/src/Makefile gi/pf/Makefile gi/markov_al/Makefile) diff --git a/rampion/Makefile.am b/rampion/Makefile.am new file mode 100644 index 00000000..12df39c2 --- /dev/null +++ b/rampion/Makefile.am @@ -0,0 +1,6 @@ +bin_PROGRAMS = rampion_cccp + +rampion_cccp_SOURCES = rampion_cccp.cc +rampion_cccp_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz + +AM_CPPFLAGS = -W -Wall -Wno-sign-compare $(GTEST_CPPFLAGS) -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval -I$(top_srcdir)/training diff --git a/rampion/rampion.pl b/rampion/rampion.pl new file mode 100755 index 00000000..fda2bac2 --- /dev/null +++ b/rampion/rampion.pl @@ -0,0 +1,526 @@ +#!/usr/bin/env perl +use strict; +my @ORIG_ARGV=@ARGV; +use Cwd qw(getcwd); +my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../environment"; } + +# Skip local config (used for distributing jobs) if we're running in local-only mode +use LocalConfig; +use Getopt::Long; +use IPC::Open2; +use POSIX ":sys_wait_h"; +my $QSUB_CMD = qsub_args(mert_memory()); +my $default_jobs = env_default_jobs(); + +my $VEST_DIR="$SCRIPT_DIR/../dpmert"; +require "$VEST_DIR/libcall.pl"; + +# Default settings +my $srcFile; +my $refFiles; +my $bin_dir = $SCRIPT_DIR; +die "Bin directory $bin_dir missing/inaccessible" unless -d $bin_dir; +my $FAST_SCORE="$bin_dir/../mteval/fast_score"; +die "Can't execute $FAST_SCORE" unless -x $FAST_SCORE; +my $MAPINPUT = "$bin_dir/rampion_generate_input.pl"; +my $MAPPER = "$bin_dir/rampion_cccp"; +my $parallelize = "$VEST_DIR/parallelize.pl"; +my $libcall = "$VEST_DIR/libcall.pl"; +my $sentserver = "$VEST_DIR/sentserver"; +my $sentclient = "$VEST_DIR/sentclient"; +my $LocalConfig = "$SCRIPT_DIR/../environment/LocalConfig.pm"; + +my $SCORER = $FAST_SCORE; +die "Can't find $MAPPER" unless -x $MAPPER; +my $cdec = "$bin_dir/../decoder/cdec"; +die "Can't find decoder in $cdec" unless -x $cdec; +die "Can't find $parallelize" unless -x $parallelize; +die "Can't find $libcall" unless -e $libcall; +my $decoder = $cdec; +my $lines_per_mapper = 30; +my $iteration = 1; +my $best_weights; +my $psi = 1; +my $default_max_iter = 30; +my $max_iterations = $default_max_iter; +my $jobs = $default_jobs; # number of decode nodes +my $pmem = "4g"; +my $disable_clean = 0; +my %seen_weights; +my $help = 0; +my $epsilon = 0.0001; +my $dryrun = 0; +my $last_score = -10000000; +my $metric = "ibm_bleu"; +my $dir; +my $iniFile; +my $weights; +my $use_make = 1; # use make to parallelize +my $useqsub = 0; +my $initial_weights; +my $pass_suffix = ''; +my $cpbin=1; + +# regularization strength +my $tune_regularizer = 0; +my $reg = 500; +my $reg_previous = 5000; + +# Process command-line options +Getopt::Long::Configure("no_auto_abbrev"); +if (GetOptions( + "jobs=i" => \$jobs, + "dont-clean" => \$disable_clean, + "pass-suffix=s" => \$pass_suffix, + "qsub" => \$useqsub, + "dry-run" => \$dryrun, + "epsilon=s" => \$epsilon, + "help" => \$help, + "weights=s" => \$initial_weights, + "reg=f" => \$reg, + "use-make=i" => \$use_make, + "max-iterations=i" => \$max_iterations, + "pmem=s" => \$pmem, + "cpbin!" => \$cpbin, + "ref-files=s" => \$refFiles, + "metric=s" => \$metric, + "source-file=s" => \$srcFile, + "workdir=s" => \$dir, +) == 0 || @ARGV!=1 || $help) { + print_help(); + exit; +} + +die "--tune-regularizer is no longer supported with --reg-previous and --reg. Please tune manually.\n" if $tune_regularizer; + +if ($useqsub) { + $use_make = 0; + die "LocalEnvironment.pm does not have qsub configuration for this host. Cannot run with --qsub!\n" unless has_qsub(); +} + +my @missing_args = (); +if (!defined $srcFile) { push @missing_args, "--source-file"; } +if (!defined $refFiles) { push @missing_args, "--ref-files"; } +if (!defined $initial_weights) { push @missing_args, "--weights"; } +die "Please specify missing arguments: " . join (', ', @missing_args) . "\n" if (@missing_args); + +if ($metric =~ /^(combi|ter)$/i) { + $lines_per_mapper = 5; +} + +($iniFile) = @ARGV; + + +sub write_config; +sub enseg; +sub print_help; + +my $nodelist; +my $host =check_output("hostname"); chomp $host; +my $bleu; +my $interval_count = 0; +my $logfile; +my $projected_score; + +# used in sorting scores +my $DIR_FLAG = '-r'; +if ($metric =~ /^ter$|^aer$/i) { + $DIR_FLAG = ''; +} + +my $refs_comma_sep = get_comma_sep_refs('r',$refFiles); + +unless ($dir){ + $dir = "rampion"; +} +unless ($dir =~ /^\//){ # convert relative path to absolute path + my $basedir = check_output("pwd"); + chomp $basedir; + $dir = "$basedir/$dir"; +} + + +# Initializations and helper functions +srand; + +my @childpids = (); +my @cleanupcmds = (); + +sub cleanup { + print STDERR "Cleanup...\n"; + for my $pid (@childpids){ unchecked_call("kill $pid"); } + for my $cmd (@cleanupcmds){ unchecked_call("$cmd"); } + exit 1; +}; +# Always call cleanup, no matter how we exit +*CORE::GLOBAL::exit = + sub{ cleanup(); }; +$SIG{INT} = "cleanup"; +$SIG{TERM} = "cleanup"; +$SIG{HUP} = "cleanup"; + +my $decoderBase = check_output("basename $decoder"); chomp $decoderBase; +my $newIniFile = "$dir/$decoderBase.ini"; +my $inputFileName = "$dir/input"; +my $user = $ENV{"USER"}; + + +# process ini file +-e $iniFile || die "Error: could not open $iniFile for reading\n"; +open(INI, $iniFile); + +use File::Basename qw(basename); +#pass bindir, refs to vars holding bin +sub modbin { + local $_; + my $bindir=shift; + check_call("mkdir -p $bindir"); + -d $bindir || die "couldn't make bindir $bindir"; + for (@_) { + my $src=$$_; + $$_="$bindir/".basename($src); + check_call("cp -p $src $$_"); + } +} +sub dirsize { + opendir ISEMPTY,$_[0]; + return scalar(readdir(ISEMPTY))-1; +} +my @allweights; +if ($dryrun){ + write_config(*STDERR); + exit 0; +} else { + if (-e $dir && dirsize($dir)>1 && -e "$dir/hgs" ){ # allow preexisting logfile, binaries, but not dist-pro.pl outputs + die "ERROR: working dir $dir already exists\n\n"; + } else { + -e $dir || mkdir $dir; + mkdir "$dir/hgs"; + modbin("$dir/bin",\$LocalConfig,\$cdec,\$SCORER,\$MAPINPUT,\$MAPPER,\$parallelize,\$sentserver,\$sentclient,\$libcall) if $cpbin; + mkdir "$dir/scripts"; + my $cmdfile="$dir/rerun-pro.sh"; + open CMD,'>',$cmdfile; + print CMD "cd ",&getcwd,"\n"; +# print CMD &escaped_cmdline,"\n"; #buggy - last arg is quoted. + my $cline=&cmdline."\n"; + print CMD $cline; + close CMD; + print STDERR $cline; + chmod(0755,$cmdfile); + check_call("cp $initial_weights $dir/weights.0"); + die "Can't find weights.0" unless (-e "$dir/weights.0"); + } + write_config(*STDERR); +} + + +# Generate initial files and values +check_call("cp $iniFile $newIniFile"); +$iniFile = $newIniFile; + +my $newsrc = "$dir/dev.input"; +enseg($srcFile, $newsrc); +$srcFile = $newsrc; +my $devSize = 0; +open F, "<$srcFile" or die "Can't read $srcFile: $!"; +while(<F>) { $devSize++; } +close F; + +unless($best_weights){ $best_weights = $weights; } +unless($projected_score){ $projected_score = 0.0; } +$seen_weights{$weights} = 1; + +my $random_seed = int(time / 1000); +my $lastWeightsFile; +my $lastPScore = 0; +# main optimization loop +while (1){ + print STDERR "\n\nITERATION $iteration\n==========\n"; + + if ($iteration > $max_iterations){ + print STDERR "\nREACHED STOPPING CRITERION: Maximum iterations\n"; + last; + } + # iteration-specific files + my $runFile="$dir/run.raw.$iteration"; + my $onebestFile="$dir/1best.$iteration"; + my $logdir="$dir/logs.$iteration"; + my $decoderLog="$logdir/decoder.sentserver.log.$iteration"; + my $scorerLog="$logdir/scorer.log.$iteration"; + check_call("mkdir -p $logdir"); + + + #decode + print STDERR "RUNNING DECODER AT "; + print STDERR unchecked_output("date"); + my $im1 = $iteration - 1; + my $weightsFile="$dir/weights.$im1"; + push @allweights, "-w $dir/weights.$im1"; + `rm -f $dir/hgs/*.gz`; + my $decoder_cmd = "$decoder -c $iniFile --weights$pass_suffix $weightsFile -O $dir/hgs"; + my $pcmd; + if ($use_make) { + $pcmd = "cat $srcFile | $parallelize --use-fork -p $pmem -e $logdir -j $jobs --"; + } else { + $pcmd = "cat $srcFile | $parallelize -p $pmem -e $logdir -j $jobs --"; + } + my $cmd = "$pcmd $decoder_cmd 2> $decoderLog 1> $runFile"; + print STDERR "COMMAND:\n$cmd\n"; + check_bash_call($cmd); + my $num_hgs; + my $num_topbest; + my $retries = 0; + while($retries < 5) { + $num_hgs = check_output("ls $dir/hgs/*.gz | wc -l"); + $num_topbest = check_output("wc -l < $runFile"); + print STDERR "NUMBER OF HGs: $num_hgs\n"; + print STDERR "NUMBER OF TOP-BEST HYPs: $num_topbest\n"; + if($devSize == $num_hgs && $devSize == $num_topbest) { + last; + } else { + print STDERR "Incorrect number of hypergraphs or topbest. Waiting for distributed filesystem and retrying...\n"; + sleep(3); + } + $retries++; + } + die "Dev set contains $devSize sentences, but we don't have topbest and hypergraphs for all these! Decoder failure? Check $decoderLog\n" if ($devSize != $num_hgs || $devSize != $num_topbest); + my $dec_score = check_output("cat $runFile | $SCORER $refs_comma_sep -m $metric"); + chomp $dec_score; + print STDERR "DECODER SCORE: $dec_score\n"; + + # save space + check_call("gzip -f $runFile"); + check_call("gzip -f $decoderLog"); + + # run optimizer + print STDERR "RUNNING OPTIMIZER AT "; + print STDERR unchecked_output("date"); + print STDERR " - GENERATE TRAINING EXEMPLARS\n"; + my $mergeLog="$logdir/prune-merge.log.$iteration"; + + my $score = 0; + my $icc = 0; + my $inweights="$dir/weights.$im1"; + $cmd="$MAPINPUT $dir/hgs > $dir/agenda.$im1"; + print STDERR "COMMAND:\n$cmd\n"; + check_call($cmd); + die "PLEASE IMPL"; + $iteration++; + print STDERR "\n==========\n"; +} + +print STDERR "\nFINAL WEIGHTS: $lastWeightsFile\n(Use -w <this file> with the decoder)\n\n"; + +print STDOUT "$lastWeightsFile\n"; + +exit 0; + +sub get_lines { + my $fn = shift @_; + open FL, "<$fn" or die "Couldn't read $fn: $!"; + my $lc = 0; + while(<FL>) { $lc++; } + return $lc; +} + +sub get_comma_sep_refs { + my ($r,$p) = @_; + my $o = check_output("echo $p"); + chomp $o; + my @files = split /\s+/, $o; + return "-$r " . join(" -$r ", @files); +} + +sub read_weights_file { + my ($file) = @_; + open F, "<$file" or die "Couldn't read $file: $!"; + my @r = (); + my $pm = -1; + while(<F>) { + next if /^#/; + next if /^\s*$/; + chomp; + if (/^(.+)\s+(.+)$/) { + my $m = $1; + my $w = $2; + die "Weights out of order: $m <= $pm" unless $m > $pm; + push @r, $w; + } else { + warn "Unexpected feature name in weight file: $_"; + } + } + close F; + return join ' ', @r; +} + +# subs +sub write_config { + my $fh = shift; + my $cleanup = "yes"; + if ($disable_clean) {$cleanup = "no";} + + print $fh "\n"; + print $fh "DECODER: $decoder\n"; + print $fh "INI FILE: $iniFile\n"; + print $fh "WORKING DIR: $dir\n"; + print $fh "SOURCE (DEV): $srcFile\n"; + print $fh "REFS (DEV): $refFiles\n"; + print $fh "EVAL METRIC: $metric\n"; + print $fh "MAX ITERATIONS: $max_iterations\n"; + print $fh "JOBS: $jobs\n"; + print $fh "HEAD NODE: $host\n"; + print $fh "PMEM (DECODING): $pmem\n"; + print $fh "CLEANUP: $cleanup\n"; +} + +sub update_weights_file { + my ($neww, $rfn, $rpts) = @_; + my @feats = @$rfn; + my @pts = @$rpts; + my $num_feats = scalar @feats; + my $num_pts = scalar @pts; + die "$num_feats (num_feats) != $num_pts (num_pts)" unless $num_feats == $num_pts; + open G, ">$neww" or die; + for (my $i = 0; $i < $num_feats; $i++) { + my $f = $feats[$i]; + my $lambda = $pts[$i]; + print G "$f $lambda\n"; + } + close G; +} + +sub enseg { + my $src = shift; + my $newsrc = shift; + open(SRC, $src); + open(NEWSRC, ">$newsrc"); + my $i=0; + while (my $line=<SRC>){ + chomp $line; + if ($line =~ /^\s*<seg/i) { + if($line =~ /id="[0-9]+"/) { + print NEWSRC "$line\n"; + } else { + die "When using segments with pre-generated <seg> tags, you must include a zero-based id attribute"; + } + } else { + print NEWSRC "<seg id=\"$i\">$line</seg>\n"; + } + $i++; + } + close SRC; + close NEWSRC; + die "Empty dev set!" if ($i == 0); +} + +sub print_help { + + my $executable = check_output("basename $0"); chomp $executable; + print << "Help"; + +Usage: $executable [options] <ini file> + + $executable [options] <ini file> + Runs a complete PRO optimization using the ini file specified. + +Required: + + --ref-files <files> + Dev set ref files. This option takes only a single string argument. + To use multiple files (including file globbing), this argument should + be quoted. + + --source-file <file> + Dev set source file. + + --weights <file> + Initial weights file (use empty file to start from 0) + +General options: + + --help + Print this message and exit. + + --max-iterations <M> + Maximum number of iterations to run. If not specified, defaults + to $default_max_iter. + + --metric <method> + Metric to optimize. + Example values: IBM_BLEU, NIST_BLEU, Koehn_BLEU, TER, Combi + + --pass-suffix <S> + If the decoder is doing multi-pass decoding, the pass suffix "2", + "3", etc., is used to control what iteration of weights is set. + + --workdir <dir> + Directory for intermediate and output files. If not specified, the + name is derived from the ini filename. Assuming that the ini + filename begins with the decoder name and ends with ini, the default + name of the working directory is inferred from the middle part of + the filename. E.g. an ini file named decoder.foo.ini would have + a default working directory name foo. + +Regularization options: + + --reg <F> + l2 regularization strength [default=500]. The greater this value, + the closer to zero the weights will be. + +Job control options: + + --jobs <I> + Number of decoder processes to run in parallel. [default=$default_jobs] + + --qsub + Use qsub to run jobs in parallel (qsub must be configured in + environment/LocalEnvironment.pm) + + --pmem <N> + Amount of physical memory requested for parallel decoding jobs + (used with qsub requests only) + +Help +} + +sub convert { + my ($str) = @_; + my @ps = split /;/, $str; + my %dict = (); + for my $p (@ps) { + my ($k, $v) = split /=/, $p; + $dict{$k} = $v; + } + return %dict; +} + + +sub cmdline { + return join ' ',($0,@ORIG_ARGV); +} + +#buggy: last arg gets quoted sometimes? +my $is_shell_special=qr{[ \t\n\\><|&;"'`~*?{}$!()]}; +my $shell_escape_in_quote=qr{[\\"\$`!]}; + +sub escape_shell { + my ($arg)=@_; + return undef unless defined $arg; + if ($arg =~ /$is_shell_special/) { + $arg =~ s/($shell_escape_in_quote)/\\$1/g; + return "\"$arg\""; + } + return $arg; +} + +sub escaped_shell_args { + return map {local $_=$_;chomp;escape_shell($_)} @_; +} + +sub escaped_shell_args_str { + return join ' ',&escaped_shell_args(@_); +} + +sub escaped_cmdline { + return "$0 ".&escaped_shell_args_str(@ORIG_ARGV); +} diff --git a/rampion/rampion_cccp.cc b/rampion/rampion_cccp.cc new file mode 100644 index 00000000..6eb3ccf3 --- /dev/null +++ b/rampion/rampion_cccp.cc @@ -0,0 +1,157 @@ +#include <sstream> +#include <iostream> +#include <vector> +#include <limits> + +#include <boost/program_options.hpp> +#include <boost/program_options/variables_map.hpp> + +#include "filelib.h" +#include "stringlib.h" +#include "weights.h" +#include "hg_io.h" +#include "kbest.h" +#include "viterbi.h" +#include "ns.h" +#include "ns_docscorer.h" + +using namespace std; +namespace po = boost::program_options; + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { + po::options_description opts("Configuration options"); + opts.add_options() + ("reference,r",po::value<vector<string> >(), "[REQD] Reference translation (tokenized text)") + ("weights,w",po::value<string>(), "[REQD] Weights files from current iterations") + ("input,i",po::value<string>()->default_value("-"), "Input file to map (- is STDIN)") + ("evaluation_metric,m",po::value<string>()->default_value("IBM_BLEU"), "Evaluation metric (ibm_bleu, koehn_bleu, nist_bleu, ter, meteor, etc.)") + ("kbest_size,k",po::value<unsigned>()->default_value(500u), "Top k-hypotheses to extract") + ("cccp_iterations,I", po::value<unsigned>()->default_value(10u), "CCCP iterations (T')") + ("ssd_iterations,J", po::value<unsigned>()->default_value(5u), "Stochastic subgradient iterations (T'')") + ("eta", po::value<double>()->default_value(1e-4), "Step size") + ("regularization_strength,C", po::value<double>()->default_value(1.0), "L2 regularization strength") + ("alpha,a", po::value<double>()->default_value(10.0), "Cost scale (alpha); alpha * [1-metric(y,y')]") + ("help,h", "Help"); + po::options_description dcmdline_options; + dcmdline_options.add(opts); + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + bool flag = false; + if (!conf->count("reference")) { + cerr << "Please specify one or more references using -r <REF.TXT>\n"; + flag = true; + } + if (!conf->count("weights")) { + cerr << "Please specify weights using -w <WEIGHTS.TXT>\n"; + flag = true; + } + if (flag || conf->count("help")) { + cerr << dcmdline_options << endl; + exit(1); + } +} + +struct HypInfo { + HypInfo() : g(-100.0f) {} + HypInfo(const vector<WordID>& h, + const SparseVector<weight_t>& feats, + const SegmentEvaluator& scorer, const EvaluationMetric* metric) : hyp(h), x(feats) { + SufficientStats ss; + scorer.Evaluate(hyp, &ss); + g = metric->ComputeScore(ss); + } + + vector<WordID> hyp; + float g; + SparseVector<weight_t> x; +}; + +void CostAugmentedSearch(const vector<HypInfo>& kbest, + const SparseVector<double>& w, + double alpha, + SparseVector<double>* fmap) { + unsigned best_i = 0; + double best = -numeric_limits<double>::infinity(); + for (unsigned i = 0; i < kbest.size(); ++i) { + double s = kbest[i].x.dot(w) + alpha * kbest[i].g; + if (s > best) { + best = s; + best_i = i; + } + } + *fmap = kbest[best_i].x; +} + +// runs lines 4--15 of rampion algorithm +int main(int argc, char** argv) { + po::variables_map conf; + InitCommandLine(argc, argv, &conf); + const string evaluation_metric = conf["evaluation_metric"].as<string>(); + + EvaluationMetric* metric = EvaluationMetric::Instance(evaluation_metric); + DocumentScorer ds(metric, conf["reference"].as<vector<string> >()); + cerr << "Loaded " << ds.size() << " references for scoring with " << evaluation_metric << endl; + double goodsign = 1; + if (metric->IsErrorMetric()) goodsign = -goodsign; + double badsign = -goodsign; + + Hypergraph hg; + string last_file; + ReadFile in_read(conf["input"].as<string>()); + istream &in=*in_read.stream(); + const unsigned kbest_size = conf["kbest_size"].as<unsigned>(); + const unsigned tp = conf["cccp_iterations"].as<unsigned>(); + const unsigned tpp = conf["ssd_iterations"].as<unsigned>(); + const double eta = conf["eta"].as<double>(); + const double reg = conf["regularization_strength"].as<double>(); + const double alpha = conf["alpha"].as<double>(); + SparseVector<weight_t> weights; + { + vector<weight_t> vweights; + const string weightsf = conf["weights"].as<string>(); + Weights::InitFromFile(weightsf, &vweights); + Weights::InitSparseVector(vweights, &weights); + } + string line, file; + vector<vector<HypInfo> > kis; + cerr << "Loading hypergraphs...\n"; + while(getline(in, line)) { + istringstream is(line); + int sent_id; + kis.resize(kis.size() + 1); + vector<HypInfo>& curkbest = kis.back(); + is >> file >> sent_id; + ReadFile rf(file); + HypergraphIO::ReadFromJSON(rf.stream(), &hg); + hg.Reweight(weights); + KBest::KBestDerivations<vector<WordID>, ESentenceTraversal> kbest(hg, kbest_size); + + for (int i = 0; i < kbest_size; ++i) { + const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal>::Derivation* d = + kbest.LazyKthBest(hg.nodes_.size() - 1, i); + if (!d) break; + curkbest.push_back(HypInfo(d->yield, d->feature_values, *ds[sent_id], metric)); + } + } + + cerr << "Hypergraphs loaded.\n"; + vector<SparseVector<weight_t> > goals(kis.size()); // f(x_i,y+,h+) + SparseVector<weight_t> fear; // f(x,y-,h-) + for (unsigned iterp = 1; iterp <= tp; ++iterp) { + cerr << "CCCP Iteration " << iterp << endl; + for (int i = 0; i < goals.size(); ++i) + CostAugmentedSearch(kis[i], weights, goodsign * alpha, &goals[i]); + for (unsigned iterpp = 1; iterpp <= tpp; ++iterpp) { + cerr << " SSD Iteration " << iterpp << endl; + for (int i = 0; i < goals.size(); ++i) { + CostAugmentedSearch(kis[i], weights, badsign * alpha, &fear); + weights -= weights * (eta * reg / goals.size()); + weights += (goals[i] - fear) * eta; + } + } + } + vector<weight_t> w; + weights.init_vector(&w); + Weights::WriteToFile("-", w); + return 0; +} + diff --git a/rampion/rampion_generate_input.pl b/rampion/rampion_generate_input.pl new file mode 100755 index 00000000..b30fc4fd --- /dev/null +++ b/rampion/rampion_generate_input.pl @@ -0,0 +1,18 @@ +#!/usr/bin/perl -w +use strict; + +die "Usage: $0 HG_DIR\n" unless scalar @ARGV == 1; +my $d = shift @ARGV; +die "Can't find directory $d" unless -d $d; + +opendir(DIR, $d) or die "Can't read $d: $!"; +my @hgs = grep { /\.gz$/ } readdir(DIR); +closedir DIR; + +for my $hg (@hgs) { + my $file = $hg; + my $id = $hg; + $id =~ s/(\.json)?\.gz//; + print "$d/$file $id\n"; +} + diff --git a/utils/city.cc b/utils/city.cc new file mode 100644 index 00000000..f1301ce4 --- /dev/null +++ b/utils/city.cc @@ -0,0 +1,467 @@ +// Copyright (c) 2011 Google, Inc. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. +// +// CityHash, by Geoff Pike and Jyrki Alakuijala +// +// This file provides CityHash64() and related functions. +// +// It's probably possible to create even faster hash functions by +// writing a program that systematically explores some of the space of +// possible hash functions, by using SIMD instructions, or by +// compromising on hash quality. + +#include "config.h" +#include <city.h> + +#include <algorithm> +#include <string.h> // for memcpy and memset + +using namespace std; + +static uint64 UNALIGNED_LOAD64(const char *p) { + uint64 result; + memcpy(&result, p, sizeof(result)); + return result; +} + +static uint32 UNALIGNED_LOAD32(const char *p) { + uint32 result; + memcpy(&result, p, sizeof(result)); + return result; +} + +#if !defined(WORDS_BIGENDIAN) + +#define uint32_in_expected_order(x) (x) +#define uint64_in_expected_order(x) (x) + +#else + +#ifdef _MSC_VER +#include <stdlib.h> +#define bswap_32(x) _byteswap_ulong(x) +#define bswap_64(x) _byteswap_uint64(x) + +#elif defined(__APPLE__) +// Mac OS X / Darwin features +#include <libkern/OSByteOrder.h> +#define bswap_32(x) OSSwapInt32(x) +#define bswap_64(x) OSSwapInt64(x) + +#else +#include <byteswap.h> +#endif + +#define uint32_in_expected_order(x) (bswap_32(x)) +#define uint64_in_expected_order(x) (bswap_64(x)) + +#endif // WORDS_BIGENDIAN + +#if !defined(LIKELY) +#if HAVE_BUILTIN_EXPECT +#define LIKELY(x) (__builtin_expect(!!(x), 1)) +#else +#define LIKELY(x) (x) +#endif +#endif + +static uint64 Fetch64(const char *p) { + return uint64_in_expected_order(UNALIGNED_LOAD64(p)); +} + +static uint32 Fetch32(const char *p) { + return uint32_in_expected_order(UNALIGNED_LOAD32(p)); +} + +// Some primes between 2^63 and 2^64 for various uses. +static const uint64 k0 = 0xc3a5c85c97cb3127ULL; +static const uint64 k1 = 0xb492b66fbe98f273ULL; +static const uint64 k2 = 0x9ae16a3b2f90404fULL; +static const uint64 k3 = 0xc949d7c7509e6557ULL; + +// Bitwise right rotate. Normally this will compile to a single +// instruction, especially if the shift is a manifest constant. +static uint64 Rotate(uint64 val, int shift) { + // Avoid shifting by 64: doing so yields an undefined result. + return shift == 0 ? val : ((val >> shift) | (val << (64 - shift))); +} + +// Equivalent to Rotate(), but requires the second arg to be non-zero. +// On x86-64, and probably others, it's possible for this to compile +// to a single instruction if both args are already in registers. +static uint64 RotateByAtLeast1(uint64 val, int shift) { + return (val >> shift) | (val << (64 - shift)); +} + +static uint64 ShiftMix(uint64 val) { + return val ^ (val >> 47); +} + +static uint64 HashLen16(uint64 u, uint64 v) { + return Hash128to64(uint128(u, v)); +} + +static uint64 HashLen0to16(const char *s, size_t len) { + if (len > 8) { + uint64 a = Fetch64(s); + uint64 b = Fetch64(s + len - 8); + return HashLen16(a, RotateByAtLeast1(b + len, len)) ^ b; + } + if (len >= 4) { + uint64 a = Fetch32(s); + return HashLen16(len + (a << 3), Fetch32(s + len - 4)); + } + if (len > 0) { + uint8 a = s[0]; + uint8 b = s[len >> 1]; + uint8 c = s[len - 1]; + uint32 y = static_cast<uint32>(a) + (static_cast<uint32>(b) << 8); + uint32 z = len + (static_cast<uint32>(c) << 2); + return ShiftMix(y * k2 ^ z * k3) * k2; + } + return k2; +} + +// This probably works well for 16-byte strings as well, but it may be overkill +// in that case. +static uint64 HashLen17to32(const char *s, size_t len) { + uint64 a = Fetch64(s) * k1; + uint64 b = Fetch64(s + 8); + uint64 c = Fetch64(s + len - 8) * k2; + uint64 d = Fetch64(s + len - 16) * k0; + return HashLen16(Rotate(a - b, 43) + Rotate(c, 30) + d, + a + Rotate(b ^ k3, 20) - c + len); +} + +// Return a 16-byte hash for 48 bytes. Quick and dirty. +// Callers do best to use "random-looking" values for a and b. +static pair<uint64, uint64> WeakHashLen32WithSeeds( + uint64 w, uint64 x, uint64 y, uint64 z, uint64 a, uint64 b) { + a += w; + b = Rotate(b + a + z, 21); + uint64 c = a; + a += x; + a += y; + b += Rotate(a, 44); + return make_pair(a + z, b + c); +} + +// Return a 16-byte hash for s[0] ... s[31], a, and b. Quick and dirty. +static pair<uint64, uint64> WeakHashLen32WithSeeds( + const char* s, uint64 a, uint64 b) { + return WeakHashLen32WithSeeds(Fetch64(s), + Fetch64(s + 8), + Fetch64(s + 16), + Fetch64(s + 24), + a, + b); +} + +// Return an 8-byte hash for 33 to 64 bytes. +static uint64 HashLen33to64(const char *s, size_t len) { + uint64 z = Fetch64(s + 24); + uint64 a = Fetch64(s) + (len + Fetch64(s + len - 16)) * k0; + uint64 b = Rotate(a + z, 52); + uint64 c = Rotate(a, 37); + a += Fetch64(s + 8); + c += Rotate(a, 7); + a += Fetch64(s + 16); + uint64 vf = a + z; + uint64 vs = b + Rotate(a, 31) + c; + a = Fetch64(s + 16) + Fetch64(s + len - 32); + z = Fetch64(s + len - 8); + b = Rotate(a + z, 52); + c = Rotate(a, 37); + a += Fetch64(s + len - 24); + c += Rotate(a, 7); + a += Fetch64(s + len - 16); + uint64 wf = a + z; + uint64 ws = b + Rotate(a, 31) + c; + uint64 r = ShiftMix((vf + ws) * k2 + (wf + vs) * k0); + return ShiftMix(r * k0 + vs) * k2; +} + +uint64 CityHash64(const char *s, size_t len) { + if (len <= 32) { + if (len <= 16) { + return HashLen0to16(s, len); + } else { + return HashLen17to32(s, len); + } + } else if (len <= 64) { + return HashLen33to64(s, len); + } + + // For strings over 64 bytes we hash the end first, and then as we + // loop we keep 56 bytes of state: v, w, x, y, and z. + uint64 x = Fetch64(s + len - 40); + uint64 y = Fetch64(s + len - 16) + Fetch64(s + len - 56); + uint64 z = HashLen16(Fetch64(s + len - 48) + len, Fetch64(s + len - 24)); + pair<uint64, uint64> v = WeakHashLen32WithSeeds(s + len - 64, len, z); + pair<uint64, uint64> w = WeakHashLen32WithSeeds(s + len - 32, y + k1, x); + x = x * k1 + Fetch64(s); + + // Decrease len to the nearest multiple of 64, and operate on 64-byte chunks. + len = (len - 1) & ~static_cast<size_t>(63); + do { + x = Rotate(x + y + v.first + Fetch64(s + 8), 37) * k1; + y = Rotate(y + v.second + Fetch64(s + 48), 42) * k1; + x ^= w.second; + y += v.first + Fetch64(s + 40); + z = Rotate(z + w.first, 33) * k1; + v = WeakHashLen32WithSeeds(s, v.second * k1, x + w.first); + w = WeakHashLen32WithSeeds(s + 32, z + w.second, y + Fetch64(s + 16)); + std::swap(z, x); + s += 64; + len -= 64; + } while (len != 0); + return HashLen16(HashLen16(v.first, w.first) + ShiftMix(y) * k1 + z, + HashLen16(v.second, w.second) + x); +} + +uint64 CityHash64WithSeed(const char *s, size_t len, uint64 seed) { + return CityHash64WithSeeds(s, len, k2, seed); +} + +uint64 CityHash64WithSeeds(const char *s, size_t len, + uint64 seed0, uint64 seed1) { + return HashLen16(CityHash64(s, len) - seed0, seed1); +} + +// A subroutine for CityHash128(). Returns a decent 128-bit hash for strings +// of any length representable in signed long. Based on City and Murmur. +static uint128 CityMurmur(const char *s, size_t len, uint128 seed) { + uint64 a = Uint128Low64(seed); + uint64 b = Uint128High64(seed); + uint64 c = 0; + uint64 d = 0; + signed long l = len - 16; + if (l <= 0) { // len <= 16 + a = ShiftMix(a * k1) * k1; + c = b * k1 + HashLen0to16(s, len); + d = ShiftMix(a + (len >= 8 ? Fetch64(s) : c)); + } else { // len > 16 + c = HashLen16(Fetch64(s + len - 8) + k1, a); + d = HashLen16(b + len, c + Fetch64(s + len - 16)); + a += d; + do { + a ^= ShiftMix(Fetch64(s) * k1) * k1; + a *= k1; + b ^= a; + c ^= ShiftMix(Fetch64(s + 8) * k1) * k1; + c *= k1; + d ^= c; + s += 16; + l -= 16; + } while (l > 0); + } + a = HashLen16(a, c); + b = HashLen16(d, b); + return uint128(a ^ b, HashLen16(b, a)); +} + +uint128 CityHash128WithSeed(const char *s, size_t len, uint128 seed) { + if (len < 128) { + return CityMurmur(s, len, seed); + } + + // We expect len >= 128 to be the common case. Keep 56 bytes of state: + // v, w, x, y, and z. + pair<uint64, uint64> v, w; + uint64 x = Uint128Low64(seed); + uint64 y = Uint128High64(seed); + uint64 z = len * k1; + v.first = Rotate(y ^ k1, 49) * k1 + Fetch64(s); + v.second = Rotate(v.first, 42) * k1 + Fetch64(s + 8); + w.first = Rotate(y + z, 35) * k1 + x; + w.second = Rotate(x + Fetch64(s + 88), 53) * k1; + + // This is the same inner loop as CityHash64(), manually unrolled. + do { + x = Rotate(x + y + v.first + Fetch64(s + 8), 37) * k1; + y = Rotate(y + v.second + Fetch64(s + 48), 42) * k1; + x ^= w.second; + y += v.first + Fetch64(s + 40); + z = Rotate(z + w.first, 33) * k1; + v = WeakHashLen32WithSeeds(s, v.second * k1, x + w.first); + w = WeakHashLen32WithSeeds(s + 32, z + w.second, y + Fetch64(s + 16)); + std::swap(z, x); + s += 64; + x = Rotate(x + y + v.first + Fetch64(s + 8), 37) * k1; + y = Rotate(y + v.second + Fetch64(s + 48), 42) * k1; + x ^= w.second; + y += v.first + Fetch64(s + 40); + z = Rotate(z + w.first, 33) * k1; + v = WeakHashLen32WithSeeds(s, v.second * k1, x + w.first); + w = WeakHashLen32WithSeeds(s + 32, z + w.second, y + Fetch64(s + 16)); + std::swap(z, x); + s += 64; + len -= 128; + } while (LIKELY(len >= 128)); + x += Rotate(v.first + z, 49) * k0; + z += Rotate(w.first, 37) * k0; + // If 0 < len < 128, hash up to 4 chunks of 32 bytes each from the end of s. + for (size_t tail_done = 0; tail_done < len; ) { + tail_done += 32; + y = Rotate(x + y, 42) * k0 + v.second; + w.first += Fetch64(s + len - tail_done + 16); + x = x * k0 + w.first; + z += w.second + Fetch64(s + len - tail_done); + w.second += v.first; + v = WeakHashLen32WithSeeds(s + len - tail_done, v.first + z, v.second); + } + // At this point our 56 bytes of state should contain more than + // enough information for a strong 128-bit hash. We use two + // different 56-byte-to-8-byte hashes to get a 16-byte final result. + x = HashLen16(x, v.first); + y = HashLen16(y + z, w.first); + return uint128(HashLen16(x + v.second, w.second) + y, + HashLen16(x + w.second, y + v.second)); +} + +uint128 CityHash128(const char *s, size_t len) { + if (len >= 16) { + return CityHash128WithSeed(s + 16, + len - 16, + uint128(Fetch64(s) ^ k3, + Fetch64(s + 8))); + } else if (len >= 8) { + return CityHash128WithSeed(NULL, + 0, + uint128(Fetch64(s) ^ (len * k0), + Fetch64(s + len - 8) ^ k1)); + } else { + return CityHash128WithSeed(s, len, uint128(k0, k1)); + } +} + +#ifdef __SSE4_2__ +#include <citycrc.h> +#include <nmmintrin.h> + +// Requires len >= 240. +static void CityHashCrc256Long(const char *s, size_t len, + uint32 seed, uint64 *result) { + uint64 a = Fetch64(s + 56) + k0; + uint64 b = Fetch64(s + 96) + k0; + uint64 c = result[0] = HashLen16(b, len); + uint64 d = result[1] = Fetch64(s + 120) * k0 + len; + uint64 e = Fetch64(s + 184) + seed; + uint64 f = seed; + uint64 g = 0; + uint64 h = 0; + uint64 i = 0; + uint64 j = 0; + uint64 t = c + d; + + // 240 bytes of input per iter. + size_t iters = len / 240; + len -= iters * 240; + do { +#define CHUNK(multiplier, z) \ + { \ + uint64 old_a = a; \ + a = Rotate(b, 41 ^ z) * multiplier + Fetch64(s); \ + b = Rotate(c, 27 ^ z) * multiplier + Fetch64(s + 8); \ + c = Rotate(d, 41 ^ z) * multiplier + Fetch64(s + 16); \ + d = Rotate(e, 33 ^ z) * multiplier + Fetch64(s + 24); \ + e = Rotate(t, 25 ^ z) * multiplier + Fetch64(s + 32); \ + t = old_a; \ + } \ + f = _mm_crc32_u64(f, a); \ + g = _mm_crc32_u64(g, b); \ + h = _mm_crc32_u64(h, c); \ + i = _mm_crc32_u64(i, d); \ + j = _mm_crc32_u64(j, e); \ + s += 40 + + CHUNK(1, 1); CHUNK(k0, 0); + CHUNK(1, 1); CHUNK(k0, 0); + CHUNK(1, 1); CHUNK(k0, 0); + } while (--iters > 0); + + while (len >= 40) { + CHUNK(k0, 0); + len -= 40; + } + if (len > 0) { + s = s + len - 40; + CHUNK(k0, 0); + } + j += i << 32; + a = HashLen16(a, j); + h += g << 32; + b += h; + c = HashLen16(c, f) + i; + d = HashLen16(d, e + result[0]); + j += e; + i += HashLen16(h, t); + e = HashLen16(a, d) + j; + f = HashLen16(b, c) + a; + g = HashLen16(j, i) + c; + result[0] = e + f + g + h; + a = ShiftMix((a + g) * k0) * k0 + b; + result[1] += a + result[0]; + a = ShiftMix(a * k0) * k0 + c; + result[2] = a + result[1]; + a = ShiftMix((a + e) * k0) * k0; + result[3] = a + result[2]; +} + +// Requires len < 240. +static void CityHashCrc256Short(const char *s, size_t len, uint64 *result) { + char buf[240]; + memcpy(buf, s, len); + memset(buf + len, 0, 240 - len); + CityHashCrc256Long(buf, 240, ~static_cast<uint32>(len), result); +} + +void CityHashCrc256(const char *s, size_t len, uint64 *result) { + if (LIKELY(len >= 240)) { + CityHashCrc256Long(s, len, 0, result); + } else { + CityHashCrc256Short(s, len, result); + } +} + +uint128 CityHashCrc128WithSeed(const char *s, size_t len, uint128 seed) { + if (len <= 900) { + return CityHash128WithSeed(s, len, seed); + } else { + uint64 result[4]; + CityHashCrc256(s, len, result); + uint64 u = Uint128High64(seed) + result[0]; + uint64 v = Uint128Low64(seed) + result[1]; + return uint128(HashLen16(u, v + result[2]), + HashLen16(Rotate(v, 32), u * k0 + result[3])); + } +} + +uint128 CityHashCrc128(const char *s, size_t len) { + if (len <= 900) { + return CityHash128(s, len); + } else { + uint64 result[4]; + CityHashCrc256(s, len, result); + return uint128(result[2], result[3]); + } +} + +#endif diff --git a/utils/city.h b/utils/city.h new file mode 100644 index 00000000..c2ab352c --- /dev/null +++ b/utils/city.h @@ -0,0 +1,90 @@ +// Copyright (c) 2011 Google, Inc. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. +// +// CityHash, by Geoff Pike and Jyrki Alakuijala +// +// This file provides a few functions for hashing strings. On x86-64 +// hardware in 2011, CityHash64() is faster than other high-quality +// hash functions, such as Murmur. This is largely due to higher +// instruction-level parallelism. CityHash64() and CityHash128() also perform +// well on hash-quality tests. +// +// CityHash128() is optimized for relatively long strings and returns +// a 128-bit hash. For strings more than about 2000 bytes it can be +// faster than CityHash64(). +// +// Functions in the CityHash family are not suitable for cryptography. +// +// WARNING: This code has not been tested on big-endian platforms! +// It is known to work well on little-endian platforms that have a small penalty +// for unaligned reads, such as current Intel and AMD moderate-to-high-end CPUs. +// +// By the way, for some hash functions, given strings a and b, the hash +// of a+b is easily derived from the hashes of a and b. This property +// doesn't hold for any hash functions in this file. + +#ifndef CITY_HASH_H_ +#define CITY_HASH_H_ + +#include <stdlib.h> // for size_t. +#include <stdint.h> +#include <utility> + +typedef uint8_t uint8; +typedef uint32_t uint32; +typedef uint64_t uint64; +typedef std::pair<uint64, uint64> uint128; + +inline uint64 Uint128Low64(const uint128& x) { return x.first; } +inline uint64 Uint128High64(const uint128& x) { return x.second; } + +// Hash function for a byte array. +uint64 CityHash64(const char *buf, size_t len); + +// Hash function for a byte array. For convenience, a 64-bit seed is also +// hashed into the result. +uint64 CityHash64WithSeed(const char *buf, size_t len, uint64 seed); + +// Hash function for a byte array. For convenience, two seeds are also +// hashed into the result. +uint64 CityHash64WithSeeds(const char *buf, size_t len, + uint64 seed0, uint64 seed1); + +// Hash function for a byte array. +uint128 CityHash128(const char *s, size_t len); + +// Hash function for a byte array. For convenience, a 128-bit seed is also +// hashed into the result. +uint128 CityHash128WithSeed(const char *s, size_t len, uint128 seed); + +// Hash 128 input bits down to 64 bits of output. +// This is intended to be a reasonably good hash function. +inline uint64 Hash128to64(const uint128& x) { + // Murmur-inspired hashing. + const uint64 kMul = 0x9ddfea08eb382d69ULL; + uint64 a = (Uint128Low64(x) ^ Uint128High64(x)) * kMul; + a ^= (a >> 47); + uint64 b = (Uint128High64(x) ^ a) * kMul; + b ^= (b >> 47); + b *= kMul; + return b; +} + +#endif // CITY_HASH_H_ diff --git a/utils/citycrc.h b/utils/citycrc.h new file mode 100644 index 00000000..318e3917 --- /dev/null +++ b/utils/citycrc.h @@ -0,0 +1,43 @@ +// Copyright (c) 2011 Google, Inc. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. +// +// CityHash, by Geoff Pike and Jyrki Alakuijala +// +// This file declares the subset of the CityHash functions that require +// _mm_crc32_u64(). See the CityHash README for details. +// +// Functions in the CityHash family are not suitable for cryptography. + +#ifndef CITY_HASH_CRC_H_ +#define CITY_HASH_CRC_H_ + +#include <city.h> + +// Hash function for a byte array. +uint128 CityHashCrc128(const char *s, size_t len); + +// Hash function for a byte array. For convenience, a 128-bit seed is also +// hashed into the result. +uint128 CityHashCrc128WithSeed(const char *s, size_t len, uint128 seed); + +// Hash function for a byte array. Sets result[0] ... result[3]. +void CityHashCrc256(const char *s, size_t len, uint64 *result); + +#endif // CITY_HASH_CRC_H_ |