From 3bca2ef755eb45f7f6e39e92a9adc02a7b4c84f0 Mon Sep 17 00:00:00 2001
From: Chris Dyer <cdyer@Chriss-MacBook-Air.local>
Date: Mon, 24 Jun 2013 23:10:12 -0400
Subject: remove old mira script, fix default number of iterations in new one

---
 training/mira/mira.py     |   2 +-
 training/mira/run_mira.pl | 630 ----------------------------------------------
 2 files changed, 1 insertion(+), 631 deletions(-)
 delete mode 100755 training/mira/run_mira.pl

diff --git a/training/mira/mira.py b/training/mira/mira.py
index f031c313..29c51e1d 100755
--- a/training/mira/mira.py
+++ b/training/mira/mira.py
@@ -96,7 +96,7 @@ def main():
   parser.add_argument('-m', '--metric', default='ibm_bleu',
                       help='metric to optimize. Example values: '
                            'ibm_bleu, nist_bleu, Koehn_bleu, TER, Combi')
-  parser.add_argument('--max-iterations', type=int, default=10, metavar='N',
+  parser.add_argument('--max-iterations', type=int, default=20, metavar='N',
                       help='maximum number of iterations to run')
   parser.add_argument('--optimizer', type=int, default=2, choices=range(1,6),
                       help='learning method to use for weight update.'
diff --git a/training/mira/run_mira.pl b/training/mira/run_mira.pl
deleted file mode 100755
index d71590ba..00000000
--- a/training/mira/run_mira.pl
+++ /dev/null
@@ -1,630 +0,0 @@
-#!/usr/bin/env perl
-use strict;
-my @ORIG_ARGV=@ARGV;
-use Cwd qw(getcwd);
-my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0));
-push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../../environment"; }
-
-# Skip local config (used for distributing jobs) if we're running in local-only mode
-use LocalConfig;
-use Getopt::Long;
-use IPC::Open2;
-use POSIX ":sys_wait_h";
-my $QSUB_CMD = qsub_args(mert_memory());
-my $default_jobs = env_default_jobs();
-
-my $srcFile;
-my $refFiles;
-my $bin_dir = $SCRIPT_DIR;
-die "Bin directory $bin_dir missing/inaccessible" unless -d $bin_dir;
-my $FAST_SCORE="$bin_dir/../../mteval/fast_score";
-die "Can't execute $FAST_SCORE" unless -x $FAST_SCORE;
-
-my $iteration = 0.0;
-my $max_iterations = 10;
-my $metric = "ibm_bleu";
-my $iniFile;
-my $weights;
-my $initialWeights;
-my $jobs = $default_jobs;   # number of decode nodes
-my $pmem = "1g";
-my $dir;
-
-my $SCORER = $FAST_SCORE;
-
-my $UTILS_DIR="$SCRIPT_DIR/../utils";
-require "$UTILS_DIR/libcall.pl";
-
-my $parallelize = "$UTILS_DIR/parallelize.pl";
-my $libcall = "$UTILS_DIR/libcall.pl";
-my $sentserver = "$UTILS_DIR/sentserver";
-my $sentclient = "$UTILS_DIR/sentclient";
-
-my $run_local = 0;
-my $pass_suffix = '';
-
-my $cdec ="$bin_dir/kbest_cut_mira"; 
-
-die "Can't find decoder in $cdec" unless -x $cdec;
-my $decoder = $cdec;
-my $decoderOpt;
-my $update_size;
-my $approx_score;
-my $kbest_size=250;
-my $metric_scale=1;
-my $optimizer=2;
-my $disable_clean = 0;
-my $use_make=0;  
-my $density_prune;
-my $cpbin=1;
-my $help = 0;
-my $epsilon = 0.0001;
-my $step_size = 0.01;
-my $gpref;
-my $unique_kbest;
-my $freeze;
-my $hopes=1;
-my $fears=1;
-my $sent_approx=0;
-my $pseudo_doc=0;
-
-my $range = 35000;
-my $minimum = 15000;
-my $portn = int(rand($range)) + $minimum;
-
-
-# Process command-line options
-Getopt::Long::Configure("no_auto_abbrev");
-if (GetOptions(
-        "decoder=s" => \$decoderOpt,
-        "jobs=i" => \$jobs,
-        "density-prune=f" => \$density_prune,
-        "dont-clean" => \$disable_clean,
-        "pass-suffix=s" => \$pass_suffix,
-        "epsilon=s" => \$epsilon,
-        "help" => \$help,
-        "local" => \$run_local,
-        "use-make=i" => \$use_make,
-        "max-iterations=i" => \$max_iterations,
-        "pmem=s" => \$pmem,
-        "cpbin!" => \$cpbin,
-        "ref-files=s" => \$refFiles,
-        "metric=s" => \$metric,
-        "source-file=s" => \$srcFile,
-        "weights=s" => \$initialWeights,
-	"optimizer=i" => \$optimizer,
-	"metric-scale=i" => \$metric_scale,
-	"kbest-size=i" => \$kbest_size,
-	"update-size=i" => \$update_size,
-	"step-size=f" => \$step_size,
-	"hope-select=i" => \$hopes,
-	"fear-select=i" => \$fears,
-	"sent-approx" => \$sent_approx,
-        "pseudo-doc" => \$pseudo_doc,
-	"unique-kbest" => \$unique_kbest,
-        "grammar-prefix=s" => \$gpref,
-	"freeze" => \$freeze,
-        "workdir=s" => \$dir,
-	) == 0 || @ARGV!=1 || $help) {
-        print_help();
-        exit;
-}
-
-($iniFile) = @ARGV;
-
-
-sub write_config;
-sub enseg;
-sub print_help;
-
-my $nodelist;
-my $host =check_output("hostname"); chomp $host;
-my $bleu;
-my $interval_count = 0;
-my $logfile;
-my $projected_score;
-
-
-#my $refs_comma_sep = get_comma_sep_refs($refFiles);
-my $refs_comma_sep = get_comma_sep_refs('r',$refFiles);
-
-#my $refs_comma_sep_4cdec = get_comma_sep_refs_4cdec($refFiles);
-
-unless ($dir){
-        $dir = "mira";
-}
-unless ($dir =~ /^\//){  # convert relative path to absolute path
-        my $basedir = check_output("pwd");
-        chomp $basedir;
-        $dir = "$basedir/$dir";
-}
-
-if ($decoderOpt){ $decoder = $decoderOpt; }
-
-# Initializations and helper functions
-srand;
-
-my @childpids = ();
-my @cleanupcmds = ();
-
-sub cleanup {
-        print STDERR "Cleanup...\n";
-        for my $pid (@childpids){ unchecked_call("kill $pid"); }
-        for my $cmd (@cleanupcmds){ unchecked_call("$cmd"); }
-        exit 1;
-};
-
-# Always call cleanup, no matter how we exit
-*CORE::GLOBAL::exit =
-    sub{ cleanup(); };
-$SIG{INT} = "cleanup";
-$SIG{TERM} = "cleanup";
-$SIG{HUP} = "cleanup";
-
-
-my $decoderBase = check_output("basename $decoder"); chomp $decoderBase;
-my $newIniFile = "$dir/$decoderBase.ini";
-my $inputFileName = "$dir/input";
-my $user = $ENV{"USER"};
-
-
-# process ini file
--e $iniFile || die "Error: could not open $iniFile for reading\n";
-open(INI, $iniFile);
-
-use File::Basename qw(basename);
-#pass bindir, refs to vars holding bin
-sub modbin {
-    local $_;
-    my $bindir=shift;
-    check_call("mkdir -p $bindir");
-    -d $bindir || die "couldn't make bindir $bindir";
-    for (@_) {
-        my $src=$$_;
-        $$_="$bindir/".basename($src);
-        check_call("cp -p $src $$_");
-    }
-}
-sub dirsize {
-    opendir ISEMPTY,$_[0];
-    return scalar(readdir(ISEMPTY))-1;
-}
-
-
-
-
-if (-e $dir && dirsize($dir)>1 && -e "$dir/weights" ){ # allow preexisting logfile, binaries, but not dist-vest.pl outputs
-    die "ERROR: working dir $dir already exists\n\n";
-} else {
-    -e $dir || mkdir $dir;
-    mkdir "$dir/scripts";
-    my $cmdfile="$dir/rerun-mira.sh";
-    open CMD,'>',$cmdfile;
-    print CMD "cd ",&getcwd,"\n";
-    my $cline=&cmdline."\n";
-    print CMD $cline;
-    close CMD;
-    print STDERR $cline;
-    chmod(0755,$cmdfile);
-    unless (-e $initialWeights) {
-        print STDERR "Please specify an initial weights file with --initial-weights\n";
-        print_help();
-        exit;
-    }
-    check_call("cp $initialWeights $dir/weights.0");
-    die "Can't find weights.0" unless (-e "$dir/weights.0");
-}
-write_config(*STDERR);
-
-# Generate initial files and values
-check_call("cp $iniFile $newIniFile");
-$iniFile = $newIniFile;
-
-my $newsrc = "$dir/dev.input";
-enseg($srcFile, $newsrc, $gpref);
-
-$srcFile = $newsrc;
-my $devSize = 0;
-open F, "<$srcFile" or die "Can't read $srcFile: $!";
-while(<F>) { $devSize++; }
-close F;
-
-my $lastPScore = 0;
-my $lastWeightsFile;
-my $bestScoreIter=-1;
-my $bestScore=-1;
-unless ($update_size){$update_size = $kbest_size;}
-# main optimization loop
-#while (1){
-for (my $opt_iter=0; $opt_iter<$max_iterations; $opt_iter++) {
-
-	print STDERR "\n\nITERATION $opt_iter\n==========\n";
-	print STDERR "Using port $portn\n";
-
-	# iteration-specific files
-	my $runFile="$dir/run.raw.$opt_iter";
-	my $onebestFile="$dir/1best.$opt_iter";
-	my $logdir="$dir/logs.$opt_iter";
-	my $decoderLog="$logdir/decoder.sentserver.log.$opt_iter";
-	my $scorerLog="$logdir/scorer.log.$opt_iter";
-	my $weightdir="$dir/weights.pass$opt_iter/";
-	check_call("mkdir -p $logdir");
-	check_call("mkdir -p $weightdir");
-
-	#decode
-	print STDERR "RUNNING DECODER AT ";
-	print STDERR unchecked_output("date");
-#	my $im1 = $opt_iter - 1;
-	my $weightsFile="$dir/weights.$opt_iter";
-	print "ITER $iteration " ;
-	my $cur_pass = "-p 0$opt_iter";
-	my $decoder_cmd = "$decoder -c $iniFile -w $weightsFile $refs_comma_sep -m $metric -s $metric_scale -b $update_size -k $kbest_size -o $optimizer $cur_pass -O $weightdir -D $dir  -h $hopes -f $fears -C $step_size";
-	if($unique_kbest){
-		$decoder_cmd .= " -u";
-	}
-	if($sent_approx){
-		$decoder_cmd .= " -a";
-	}
-	if($pseudo_doc){
-                $decoder_cmd .= " -e";
-        }
-	if ($density_prune) {
-		$decoder_cmd .= " --density_prune $density_prune";
-	}
-	my $pcmd;
-	if ($run_local) {
-		$pcmd = "cat $srcFile |";
-	} elsif ($use_make) {
-	    # TODO: Throw error when jobs is speong with use_make
-		$pcmd = "cat $srcFile | $parallelize --use-fork -p $pmem -e $logdir -j $use_make --";
-	} 
-	else {
-	    $pcmd = "cat $srcFile | $parallelize -p $pmem -e $logdir -j $jobs --baseport $portn --";
-	}
-	my $cmd = "$pcmd $decoder_cmd 2> $decoderLog 1> $runFile";
-	print STDERR "COMMAND:\n$cmd\n";
-	check_bash_call($cmd);
-
-	my $retries = 0;
-        my $num_topbest;
-        while($retries < 6) {
-            $num_topbest = check_output("wc -l < $runFile");
-            print STDERR "NUMBER OF TOP-BEST HYPs: $num_topbest\n";
-            if($devSize == $num_topbest) {
-                last;
-            } else {
-                print STDERR "Incorrect number of topbest. Waiting for distributed filesystem and retrying...\n";
-                sleep(10);
-            }
-            $retries++;
-        }
-	 die "Dev set contains $devSize sentences, but we don't have topbest for all these! Decoder failure? Check $decoderLog\n" if ($devSize != $num_topbest);
-
-
-	#score the output from this iteration
-	open RUN, "<$runFile" or die "Can't read $runFile: $!";
-	open H, ">$runFile.H" or die;
-	open F, ">$runFile.F" or die;
-	open B, ">$runFile.B" or die;
-	while(<RUN>) {
-	    chomp();
-	    (my $hope,my $best,my $fear) = split(/ \|\|\| /);
-	    print H "$hope \n"; 	    
-	    print B "$best \n";
- 	    print F "$fear \n";
-	}
-	close RUN;
-	close F; close B; close H;
-	
-	my $dec_score = check_output("cat $runFile.B | $SCORER $refs_comma_sep -m $metric");
-	my $dec_score_h = check_output("cat $runFile.H | $SCORER $refs_comma_sep -m $metric");
-	my $dec_score_f = check_output("cat $runFile.F | $SCORER $refs_comma_sep -m $metric");
-	chomp $dec_score; chomp $dec_score_h; chomp $dec_score_f;
-	print STDERR "DECODER SCORE: $dec_score HOPE: $dec_score_h FEAR: $dec_score_f\n";
-	if ($dec_score> $bestScore){
-		$bestScoreIter=$opt_iter; 
-		$bestScore=$dec_score;
-	}
-	# save space
-	check_call("gzip -f $runFile");
-	check_call("gzip -f $decoderLog");
-		my $iter_filler="";
-	if($opt_iter < 10)
-	{$iter_filler="0";}
-
-	my $nextIter = $opt_iter + 1;
-	my $newWeightsFile = "$dir/weights.$nextIter";
-	$lastWeightsFile = "$dir/weights.$opt_iter";
-
-	average_weights("$weightdir/weights.mira-pass*.*[0-9].gz", $newWeightsFile, $logdir);
-	system("gzip -f $logdir/kbes*");
-	print STDERR "\n==========\n";
-	$iteration++;
-}
-print STDERR "\nBEST ITER: $bestScoreIter :: $bestScore\n\n\n";
-
-print STDOUT "$lastWeightsFile\n";
-
-sub get_lines {
-  my $fn = shift @_;
-  open FL, "<$fn" or die "Couldn't read $fn: $!";
-  my $lc = 0;
-  while(<FL>) { $lc++; }
-  return $lc;
-}
-
-sub get_comma_sep_refs {
-  my ($r,$p) = @_;
-  my $o = check_output("echo $p");
-  chomp $o;
-  my @files = split /\s+/, $o;
-  return "-$r " . join(" -$r ", @files);
-}
-
-
-sub read_weights_file {
-  my ($file) = @_;
-  open F, "<$file" or die "Couldn't read $file: $!";
-  my @r = ();
-  my $pm = -1;
-  while(<F>) {
-    next if /^#/;
-    next if /^\s*$/;
-    chomp;
-    if (/^(.+)\s+(.+)$/) {
-      my $m = $1;
-      my $w = $2;
-      die "Weights out of order: $m <= $pm" unless $m > $pm;
-      push @r, $w;
-    } else {
-      warn "Unexpected feature name in weight file: $_";
-    }
-  }
-  close F;
-  return join ' ', @r;
-}
-
-sub write_config {
-	my $fh = shift;
-	my $cleanup = "yes";
-	if ($disable_clean) {$cleanup = "no";}
-
-	print $fh "\n";
-	print $fh "DECODER:          $decoder\n";
-	print $fh "INI FILE:         $iniFile\n";
-	print $fh "WORKING DIR:      $dir\n";
-	print $fh "SOURCE (DEV):     $srcFile\n";
-	print $fh "REFS (DEV):       $refFiles\n";
-	print $fh "EVAL METRIC:      $metric\n";
-	print $fh "START ITERATION:  $iteration\n";
-	print $fh "MAX ITERATIONS:   $max_iterations\n";
-	print $fh "DECODE NODES:     $jobs\n";
-	print $fh "HEAD NODE:        $host\n";
-	print $fh "PMEM (DECODING):  $pmem\n";
-	print $fh "CLEANUP:          $cleanup\n";
-	print $fh "INITIAL WEIGHTS:  $initialWeights\n";
-        print $fh "GRAMMAR PREFIX:   $gpref\n";
-}
-
-sub update_weights_file {
-  my ($neww, $rfn, $rpts) = @_;
-  my @feats = @$rfn;
-  my @pts = @$rpts;
-  my $num_feats = scalar @feats;
-  my $num_pts = scalar @pts;
-  die "$num_feats (num_feats) != $num_pts (num_pts)" unless $num_feats == $num_pts;
-  open G, ">$neww" or die;
-  for (my $i = 0; $i < $num_feats; $i++) {
-    my $f = $feats[$i];
-    my $lambda = $pts[$i];
-    print G "$f $lambda\n";
-  }
-  close G;
-}
-
-sub enseg {
-    my $src = shift;
-    my $newsrc = shift;
-    my $grammarpref = shift;
-
-    open(SRC, $src);
-    open(NEWSRC, ">$newsrc");
-    my $i=0;
-    while (my $line=<SRC>){
-	chomp $line;
-	if ($line =~ /^\s*<seg/i) {
-	    if($line =~ /id="[0-9]+"/) {
-		print NEWSRC "$line\n";
-	    } else {
-		die "When using segments with pre-generated <seg> tags, you must include a zero-based id attribute";
-	    }
-	}
-	elsif (defined $grammarpref) {
-	    print NEWSRC "<seg id=\"$i\" grammar=\"$grammarpref.$i.gz\">$line</seg>\n";}
-	else {
-	    print NEWSRC "<seg id=\"$i\">$line</seg>\n";
-	}
-	$i++;
-    }
-    close SRC;
-    close NEWSRC;
-}
-
-sub print_help {
- my $executable = check_output("basename $0"); chomp $executable;
-        print << "Help";
-
-Usage: $executable [options] <ini file>
-        Runs a complete MIRA optimization using the ini file specified.
-	Example invocation:
-	run_mira.pl \
-        --pmem 3g \
-        --max-iterations 20 \
-        --optimizer 2 \
-        --unique-kbest \
-        --jobs 15 \
-        --kbest-size 500 \
-        --hope-select 1 \
-        --fear-select 1  \
-        --ref-files "ref.0.soseos ref.1.soseos" \
-        --source-file src.soseos \
-        --weights weights.init \
-        --workdir workdir \
-        --grammar-prefix grammars/grammar \
-        --step-size 0.01 \
-        --metric-scale 10000 \
-
-Required:
-
-        --ref-files <files>
-                Dev set ref files.  This option takes only a single string argument.
-                To use multiple files (including file globbing), this argument should
-                be quoted.
-        --source-file <file>
-                Dev set source file.
-        --weights <file>
-                Initial weights file
-
-General options:
-
-        --help
-                Print this message and exit.
-
-       --max-iterations <M>
-                Maximum number of iterations to run.  If not specified, defaults
-                to $max_iterations.
-
-        --metric <method>
-                Metric to optimize.
-                Example values: IBM_BLEU, NIST_BLEU, Koehn_BLEU, TER, Combi
-
-        --workdir <dir>
-                Directory for intermediate and output files.  If not specified, the
-                name is derived from the ini filename.  Assuming that the ini
-                filename begins with the decoder name and ends with ini, the default
-                name of the working directory is inferred from the middle part of
-                the filename.  E.g. an ini file named decoder.foo.ini would have
-                a default working directory name foo.
-	--optimizer <I>
-		Learning method to use for weight update. Choice are 1) SGD, 2) PA MIRA with Selection from Cutting Plane, 3) Cutting Plane MIRA, 4) PA MIRA,5) nbest MIRA with hope, fear, and model constraints
-	--metric-scale <I>
-		Scale MT loss by this amount when computing hope/fear candidates
-	--kbest-size <I>
-		Size of k-best list to extract from forest
-	--update-size <I>
-		Size of k-best list to use for update (applies to optimizer 5)
-	--step-size <F>
-		Controls aggresiveness of update (C) 
-	--hope-select<I>
-		How to select hope candidate. Choices are 1) model score - cost, 2) min cost
-	--fear-select <I>
-		How to select fear candodate. Choices are 1) model score + cost, 2) max cost, 3) max score
-	--sent-approx
-		Use smoothed sentence-level MT metric
-	--pseudo-doc
-		Use pseudo document to approximate MT metric
-	--unique-kbest
-		Extract unique k-best from forest
-	--grammar-prefix <path>
-		Path to sentence-specific grammar files
-
-Job control options:
-
-        --jobs <I>
-                Number of decoder processes to run in parallel. [default=$default_jobs]
-
-        --pmem <N>
-                Amount of physical memory requested for parallel decoding jobs
-                (used with qsub requests only)
-
-	--local 
-		Run single learner
-	--use-make <I>
-		Run parallel learners on a single machine through fork.
-
-
-Help
-}
-
-
-sub cmdline {
-    return join ' ',($0,@ORIG_ARGV);
-}
-
-#buggy: last arg gets quoted sometimes?
-my $is_shell_special=qr{[ \t\n\\><|&;"'`~*?{}$!()]};
-my $shell_escape_in_quote=qr{[\\"\$`!]};
-
-sub escape_shell {
-    my ($arg)=@_;
-    return undef unless defined $arg;
-    if ($arg =~ /$is_shell_special/) {
-        $arg =~ s/($shell_escape_in_quote)/\\$1/g;
-        return "\"$arg\"";
-    }
-    return $arg;
-}
-
-sub escaped_shell_args {
-    return map {local $_=$_;chomp;escape_shell($_)} @_;
-}
-
-sub escaped_shell_args_str {
-    return join ' ',&escaped_shell_args(@_);
-}
-
-sub escaped_cmdline {
-    return "$0 ".&escaped_shell_args_str(@ORIG_ARGV);
-}
-
-sub average_weights {
-
-    my $path = shift;
-    my $out = shift;
-    my $logpath = shift;
-    print "AVERAGE $path $out\n";
-    my %feature_weights= ();
-    my $total =0;
-    my $total_mult =0;
-    sleep(10);
-    foreach my $file (glob "$path")
-    {
-	$file =~ /\/([^\/]+).gz$/;
-	my $fname = $1;
-	my $cmd = "gzip -d $file";
-	$file =~ s/\.gz//;
-	check_bash_call($cmd);
-	my $mult = 0;
-	print "FILE $file \n";
-	open SCORE, "< $file" or next;
-	$total++;
-	while( <SCORE> ) {
-	    my $line = $_;
-	    if ($line !~ m/^\#/)
-	    {
-		my @s = split(" ",$line);
-		$feature_weights{$s[0]}+= $mult * $s[1];
-	    }
-	    else
-	    {
-		(my $msg,my $ran,$mult) = split(/ \|\|\| /);
-		print "Processing $ran $mult\n";
-	    }
-	}
-	$total_mult += $mult;
-	
-	close SCORE;
-	$cmd = "gzip $file"; check_bash_call($cmd);
-    }
-    
-#print out new averaged weights
-    open OUT, "> $out" or next;
-    for my $f ( keys %feature_weights ) {
-	print "$f $feature_weights{$f} $total_mult\n";
-	my $ave = $feature_weights{$f} / $total_mult;
-	
-	print "Printing $f $ave ||| ";
-	print OUT "$f $ave\n";
-    }
-    
-}
-- 
cgit v1.2.3


From 15cbad322cffd1aaa50a380b5fcee04e159423cd Mon Sep 17 00:00:00 2001
From: Chris Dyer <cdyer@Chriss-MacBook-Air.local>
Date: Mon, 24 Jun 2013 23:26:53 -0400
Subject: fix makefile

---
 klm/util/Makefile.am | 28 +++++++++++++++-------------
 1 file changed, 15 insertions(+), 13 deletions(-)

diff --git a/klm/util/Makefile.am b/klm/util/Makefile.am
index 7f873e96..5e650af7 100644
--- a/klm/util/Makefile.am
+++ b/klm/util/Makefile.am
@@ -19,40 +19,42 @@
 noinst_LIBRARIES = libklm_util.a
 
 libklm_util_a_SOURCES = \
+  bit_packing.cc \
   bit_packing.hh \
+  ersatz_progress.cc \
   ersatz_progress.hh \
+  exception.cc \
   exception.hh \
+  fake_ofstream.hh \
+  file.cc \
   file.hh \
+  file_piece.cc \
   file_piece.hh \
   getopt.hh \
   have.hh \
   joint_sort.hh \
+  mmap.cc \
   mmap.hh \
+  multi_intersection.hh \
+  murmur_hash.cc \
   murmur_hash.hh \
   pcqueue.hh \
+  pool.cc \
   pool.hh \
   probing_hash_table.hh \
   proxy_iterator.hh \
+  read_compressed.cc \
   read_compressed.hh \
+  scoped.cc \
   scoped.hh \
   sized_iterator.hh \
   sorted_uniform.hh \
+  string_piece.cc \
   string_piece.hh \
   string_piece_hash.hh \
   thread_pool.hh \
   tokenize_piece.hh \
-  usage.hh \
-  ersatz_progress.cc \
-  bit_packing.cc \
-  exception.cc \
-	file.cc \
-  file_piece.cc \
-  mmap.cc \
-  murmur_hash.cc \
-  pool.cc \
-	read_compressed.cc \
-  scoped.cc \
-  string_piece.cc \
-	usage.cc
+  usage.cc \
+  usage.hh
 
 AM_CPPFLAGS = -W -Wall -I$(top_srcdir)/klm -I$(top_srcdir)/klm/util/double-conversion
-- 
cgit v1.2.3


From eed7024bfa6d92e669bee0ec5c5e38a7b238a2f6 Mon Sep 17 00:00:00 2001
From: Chris Dyer <cdyer@Chriss-MacBook-Air.local>
Date: Mon, 24 Jun 2013 23:35:18 -0400
Subject: fix deps

---
 klm/search/Makefile.am | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/klm/search/Makefile.am b/klm/search/Makefile.am
index b8c8a050..b7f01c24 100644
--- a/klm/search/Makefile.am
+++ b/klm/search/Makefile.am
@@ -1,10 +1,22 @@
 noinst_LIBRARIES = libksearch.a
 
 libksearch_a_SOURCES = \
+  applied.hh \
+  config.hh \
+  context.hh \
+  dedupe.hh \
+  edge.hh \
   edge_generator.cc \
-	nbest.cc \
+  edge_generator.hh \
+  header.hh \
+  nbest.cc \
+  nbest.hh \
   rule.cc \
-  vertex.cc
+  rule.hh \
+  types.hh \
+  vertex.cc \
+  vertex.hh \
+  vertex_generator.hh
 
-AM_CPPFLAGS = -W -Wall -Wno-sign-compare $(GTEST_CPPFLAGS) -I..
+AM_CPPFLAGS = -W -Wall -Wno-sign-compare -I..
 
-- 
cgit v1.2.3


From cd4dbb5c0b581efb0369330ea330f2b473628a96 Mon Sep 17 00:00:00 2001
From: Paul Baltescu <pauldb89@gmail.com>
Date: Tue, 25 Jun 2013 15:13:30 +0100
Subject: Reduce memory used by precomputation.

---
 extractor/fast_intersector.cc         |  11 ++-
 extractor/fast_intersector_test.cc    |   8 +-
 extractor/mocks/mock_precomputation.h |   2 +-
 extractor/precomputation.cc           | 138 +++++++++++++++++---------------
 extractor/precomputation.h            |  49 +++++++-----
 extractor/precomputation_test.cc      | 143 +++++++++++++++++++---------------
 6 files changed, 199 insertions(+), 152 deletions(-)

diff --git a/extractor/fast_intersector.cc b/extractor/fast_intersector.cc
index a8591a72..5360c1da 100644
--- a/extractor/fast_intersector.cc
+++ b/extractor/fast_intersector.cc
@@ -20,10 +20,13 @@ FastIntersector::FastIntersector(shared_ptr<SuffixArray> suffix_array,
     vocabulary(vocabulary),
     max_rule_span(max_rule_span),
     min_gap_size(min_gap_size) {
-  Index precomputed_collocations = precomputation->GetCollocations();
-  for (pair<vector<int>, vector<int>> entry: precomputed_collocations) {
-    vector<int> phrase = ConvertPhrase(entry.first);
-    collocations[phrase] = entry.second;
+  auto precomputed_collocations = precomputation->GetCollocations();
+  for (auto item: precomputed_collocations) {
+    vector<int> phrase = ConvertPhrase(item.first);
+    vector<int> location = item.second;
+    vector<int>& phrase_collocations = collocations[phrase];
+    phrase_collocations.insert(phrase_collocations.end(), location.begin(),
+                               location.end());
   }
 }
 
diff --git a/extractor/fast_intersector_test.cc b/extractor/fast_intersector_test.cc
index 76c3aaea..2e618b63 100644
--- a/extractor/fast_intersector_test.cc
+++ b/extractor/fast_intersector_test.cc
@@ -60,14 +60,14 @@ class FastIntersectorTest : public Test {
 
     precomputation = make_shared<MockPrecomputation>();
     EXPECT_CALL(*precomputation, GetCollocations())
-        .WillRepeatedly(ReturnRef(collocations));
+        .WillRepeatedly(Return(collocations));
 
     phrase_builder = make_shared<PhraseBuilder>(vocabulary);
     intersector = make_shared<FastIntersector>(suffix_array, precomputation,
                                                vocabulary, 15, 1);
   }
 
-  Index collocations;
+  Collocations collocations;
   shared_ptr<MockDataArray> data_array;
   shared_ptr<MockSuffixArray> suffix_array;
   shared_ptr<MockPrecomputation> precomputation;
@@ -82,9 +82,9 @@ TEST_F(FastIntersectorTest, TestCachedCollocation) {
   Phrase phrase = phrase_builder->Build(symbols);
   PhraseLocation prefix_location(15, 16), suffix_location(16, 17);
 
-  collocations[symbols] = expected_location;
+  collocations.push_back(make_pair(symbols, expected_location));
   EXPECT_CALL(*precomputation, GetCollocations())
-      .WillRepeatedly(ReturnRef(collocations));
+      .WillRepeatedly(Return(collocations));
   intersector = make_shared<FastIntersector>(suffix_array, precomputation,
                                              vocabulary, 15, 1);
 
diff --git a/extractor/mocks/mock_precomputation.h b/extractor/mocks/mock_precomputation.h
index 8753343e..86f4ce27 100644
--- a/extractor/mocks/mock_precomputation.h
+++ b/extractor/mocks/mock_precomputation.h
@@ -6,7 +6,7 @@ namespace extractor {
 
 class MockPrecomputation : public Precomputation {
  public:
-  MOCK_CONST_METHOD0(GetCollocations, const Index&());
+  MOCK_CONST_METHOD0(GetCollocations, Collocations());
 };
 
 } // namespace extractor
diff --git a/extractor/precomputation.cc b/extractor/precomputation.cc
index 3b8aed69..37dbf7b7 100644
--- a/extractor/precomputation.cc
+++ b/extractor/precomputation.cc
@@ -14,63 +14,65 @@ int Precomputation::FIRST_NONTERMINAL = -1;
 int Precomputation::SECOND_NONTERMINAL = -2;
 
 Precomputation::Precomputation(
-    shared_ptr<SuffixArray> suffix_array, int num_frequent_patterns,
-    int num_super_frequent_patterns, int max_rule_span,
+    shared_ptr<SuffixArray> suffix_array, int num_frequent_phrases,
+    int num_super_frequent_phrases, int max_rule_span,
     int max_rule_symbols, int min_gap_size,
     int max_frequent_phrase_len, int min_frequency) {
   vector<int> data = suffix_array->GetData()->GetData();
-  vector<vector<int>> frequent_patterns = FindMostFrequentPatterns(
-      suffix_array, data, num_frequent_patterns, max_frequent_phrase_len,
+  vector<vector<int>> frequent_phrases = FindMostFrequentPhrases(
+      suffix_array, data, num_frequent_phrases, max_frequent_phrase_len,
       min_frequency);
 
   // Construct sets containing the frequent and superfrequent contiguous
   // collocations.
-  unordered_set<vector<int>, VectorHash> frequent_patterns_set;
-  unordered_set<vector<int>, VectorHash> super_frequent_patterns_set;
-  for (size_t i = 0; i < frequent_patterns.size(); ++i) {
-    frequent_patterns_set.insert(frequent_patterns[i]);
-    if (i < num_super_frequent_patterns) {
-      super_frequent_patterns_set.insert(frequent_patterns[i]);
+  unordered_set<vector<int>, VectorHash> frequent_phrases_set;
+  unordered_set<vector<int>, VectorHash> super_frequent_phrases_set;
+  for (size_t i = 0; i < frequent_phrases.size(); ++i) {
+    frequent_phrases_set.insert(frequent_phrases[i]);
+    if (i < num_super_frequent_phrases) {
+      super_frequent_phrases_set.insert(frequent_phrases[i]);
     }
   }
 
-  vector<tuple<int, int, int>> matchings;
+  vector<tuple<int, int, int>> locations;
   for (size_t i = 0; i < data.size(); ++i) {
-    // If the sentence is over, add all the discontiguous frequent patterns to
-    // the index.
+    // If the sentence is over, add all the discontiguous frequent phrases to
+    // the list.
     if (data[i] == DataArray::END_OF_LINE) {
-      AddCollocations(matchings, data, max_rule_span, min_gap_size,
+      AddCollocations(locations, data, max_rule_span, min_gap_size,
           max_rule_symbols);
-      matchings.clear();
+      locations.clear();
       continue;
     }
-    vector<int> pattern;
-    // Find all the contiguous frequent patterns starting at position i.
+    vector<int> phrase;
+    // Find all the contiguous frequent phrases starting at position i.
     for (int j = 1; j <= max_frequent_phrase_len && i + j <= data.size(); ++j) {
-      pattern.push_back(data[i + j - 1]);
-      if (frequent_patterns_set.count(pattern)) {
-        int is_super_frequent = super_frequent_patterns_set.count(pattern);
-        matchings.push_back(make_tuple(i, j, is_super_frequent));
+      phrase.push_back(data[i + j - 1]);
+      if (frequent_phrases_set.count(phrase)) {
+        int is_super_frequent = super_frequent_phrases_set.count(phrase);
+        locations.push_back(make_tuple(i, j, is_super_frequent));
       } else {
-        // If the current pattern is not frequent, any longer pattern having the
-        // current pattern as prefix will not be frequent.
+        // If the current phrase is not frequent, any longer phrase having the
+        // current phrase as prefix will not be frequent.
         break;
       }
     }
   }
+
+  collocations.shrink_to_fit();
 }
 
 Precomputation::Precomputation() {}
 
 Precomputation::~Precomputation() {}
 
-vector<vector<int>> Precomputation::FindMostFrequentPatterns(
+vector<vector<int>> Precomputation::FindMostFrequentPhrases(
     shared_ptr<SuffixArray> suffix_array, const vector<int>& data,
-    int num_frequent_patterns, int max_frequent_phrase_len, int min_frequency) {
+    int num_frequent_phrases, int max_frequent_phrase_len, int min_frequency) {
   vector<int> lcp = suffix_array->BuildLCPArray();
   vector<int> run_start(max_frequent_phrase_len);
 
-  // Find all the patterns occurring at least min_frequency times.
+  // Find all the phrases occurring at least min_frequency times.
   priority_queue<pair<int, pair<int, int>>> heap;
   for (size_t i = 1; i < lcp.size(); ++i) {
     for (int len = lcp[i]; len < max_frequent_phrase_len; ++len) {
@@ -83,34 +85,34 @@ vector<vector<int>> Precomputation::FindMostFrequentPatterns(
     }
   }
 
-  // Extract the most frequent patterns.
-  vector<vector<int>> frequent_patterns;
-  while (frequent_patterns.size() < num_frequent_patterns && !heap.empty()) {
+  // Extract the most frequent phrases.
+  vector<vector<int>> frequent_phrases;
+  while (frequent_phrases.size() < num_frequent_phrases && !heap.empty()) {
     int start = heap.top().second.first;
     int len = heap.top().second.second;
     heap.pop();
 
-    vector<int> pattern(data.begin() + start, data.begin() + start + len);
-    if (find(pattern.begin(), pattern.end(), DataArray::END_OF_LINE) ==
-        pattern.end()) {
-      frequent_patterns.push_back(pattern);
+    vector<int> phrase(data.begin() + start, data.begin() + start + len);
+    if (find(phrase.begin(), phrase.end(), DataArray::END_OF_LINE) ==
+        phrase.end()) {
+      frequent_phrases.push_back(phrase);
     }
   }
-  return frequent_patterns;
+  return frequent_phrases;
 }
 
 void Precomputation::AddCollocations(
-    const vector<tuple<int, int, int>>& matchings, const vector<int>& data,
+    const vector<tuple<int, int, int>>& locations, const vector<int>& data,
     int max_rule_span, int min_gap_size, int max_rule_symbols) {
-  // Select the leftmost subpattern.
-  for (size_t i = 0; i < matchings.size(); ++i) {
+  // Select the leftmost subphrase.
+  for (size_t i = 0; i < locations.size(); ++i) {
     int start1, size1, is_super1;
-    tie(start1, size1, is_super1) = matchings[i];
+    tie(start1, size1, is_super1) = locations[i];
 
-    // Select the second (middle) subpattern
-    for (size_t j = i + 1; j < matchings.size(); ++j) {
+    // Select the second (middle) subphrase
+    for (size_t j = i + 1; j < locations.size(); ++j) {
       int start2, size2, is_super2;
-      tie(start2, size2, is_super2) = matchings[j];
+      tie(start2, size2, is_super2) = locations[j];
       if (start2 - start1 >= max_rule_span) {
         break;
       }
@@ -118,20 +120,21 @@ void Precomputation::AddCollocations(
       if (start2 - start1 - size1 >= min_gap_size
           && start2 + size2 - start1 <= max_rule_span
           && size1 + size2 + 1 <= max_rule_symbols) {
-        vector<int> pattern(data.begin() + start1,
+        vector<int> collocation(data.begin() + start1,
             data.begin() + start1 + size1);
-        pattern.push_back(Precomputation::FIRST_NONTERMINAL);
-        pattern.insert(pattern.end(), data.begin() + start2,
+        collocation.push_back(Precomputation::FIRST_NONTERMINAL);
+        collocation.insert(collocation.end(), data.begin() + start2,
             data.begin() + start2 + size2);
-        AddStartPositions(collocations[pattern], start1, start2);
+
+        AddCollocation(collocation, GetLocation(start1, start2));
 
         // Try extending the binary collocation to a ternary collocation.
         if (is_super2) {
-          pattern.push_back(Precomputation::SECOND_NONTERMINAL);
-          // Select the rightmost subpattern.
-          for (size_t k = j + 1; k < matchings.size(); ++k) {
+          collocation.push_back(Precomputation::SECOND_NONTERMINAL);
+          // Select the rightmost subphrase.
+          for (size_t k = j + 1; k < locations.size(); ++k) {
             int start3, size3, is_super3;
-            tie(start3, size3, is_super3) = matchings[k];
+            tie(start3, size3, is_super3) = locations[k];
             if (start3 - start1 >= max_rule_span) {
               break;
             }
@@ -140,10 +143,12 @@ void Precomputation::AddCollocations(
                 && start3 + size3 - start1 <= max_rule_span
                 && size1 + size2 + size3 + 2 <= max_rule_symbols
                 && (is_super1 || is_super3)) {
-              pattern.insert(pattern.end(), data.begin() + start3,
+              collocation.insert(collocation.end(), data.begin() + start3,
                   data.begin() + start3 + size3);
-              AddStartPositions(collocations[pattern], start1, start2, start3);
-              pattern.erase(pattern.end() - size3);
+
+              AddCollocation(collocation, GetLocation(start1, start2, start3));
+
+              collocation.erase(collocation.end() - size3);
             }
           }
         }
@@ -152,20 +157,29 @@ void Precomputation::AddCollocations(
   }
 }
 
-void Precomputation::AddStartPositions(
-    vector<int>& positions, int pos1, int pos2) {
-  positions.push_back(pos1);
-  positions.push_back(pos2);
+vector<int> Precomputation::GetLocation(int pos1, int pos2) {
+  vector<int> location;
+  location.push_back(pos1);
+  location.push_back(pos2);
+  return location;
+}
+
+vector<int> Precomputation::GetLocation(int pos1, int pos2, int pos3) {
+  vector<int> location;
+  location.push_back(pos1);
+  location.push_back(pos2);
+  location.push_back(pos3);
+  return location;
 }
 
-void Precomputation::AddStartPositions(
-    vector<int>& positions, int pos1, int pos2, int pos3) {
-  positions.push_back(pos1);
-  positions.push_back(pos2);
-  positions.push_back(pos3);
+void Precomputation::AddCollocation(vector<int> collocation,
+                                    vector<int> location) {
+  collocation.shrink_to_fit();
+  location.shrink_to_fit();
+  collocations.push_back(make_pair(collocation, location));
 }
 
-const Index& Precomputation::GetCollocations() const {
+Collocations Precomputation::GetCollocations() const {
   return collocations;
 }
 
diff --git a/extractor/precomputation.h b/extractor/precomputation.h
index 9f0c9424..0a06349b 100644
--- a/extractor/precomputation.h
+++ b/extractor/precomputation.h
@@ -19,16 +19,18 @@ using namespace std;
 namespace extractor {
 
 typedef boost::hash<vector<int>> VectorHash;
-typedef unordered_map<vector<int>, vector<int>, VectorHash> Index;
+typedef vector<pair<vector<int>, vector<int>>> Collocations;
 
 class SuffixArray;
 
 /**
- * Data structure wrapping an index with all the occurrences of the most
- * frequent discontiguous collocations in the source data.
+ * Data structure containing all the data needed for constructing an index with
+ * all the occurrences of the most frequent discontiguous collocations in the
+ * source data.
  *
- * Let a, b, c be contiguous collocations. The index will contain an entry for
- * every collocation of the form:
+ * Let a, b, c be contiguous phrases. The data structure will contain the
+ * locations in the source data where every collocation of the following forms
+ * occurs:
  * - aXb, where a and b are frequent
  * - aXbXc, where a and b are super-frequent and c is frequent or
  *                b and c are super-frequent and a is frequent.
@@ -37,8 +39,8 @@ class Precomputation {
  public:
   // Constructs the index using the suffix array.
   Precomputation(
-      shared_ptr<SuffixArray> suffix_array, int num_frequent_patterns,
-      int num_super_frequent_patterns, int max_rule_span,
+      shared_ptr<SuffixArray> suffix_array, int num_frequent_phrases,
+      int num_super_frequent_phrases, int max_rule_span,
       int max_rule_symbols, int min_gap_size,
       int max_frequent_phrase_len, int min_frequency);
 
@@ -47,8 +49,9 @@ class Precomputation {
 
   virtual ~Precomputation();
 
-  // Returns a reference to the index.
-  virtual const Index& GetCollocations() const;
+  // Returns the list of the locations of the most frequent collocations in the
+  // source data.
+  virtual Collocations GetCollocations() const;
 
   bool operator==(const Precomputation& other) const;
 
@@ -57,23 +60,29 @@ class Precomputation {
 
  private:
   // Finds the most frequent contiguous collocations.
-  vector<vector<int>> FindMostFrequentPatterns(
+  vector<vector<int>> FindMostFrequentPhrases(
       shared_ptr<SuffixArray> suffix_array, const vector<int>& data,
-      int num_frequent_patterns, int max_frequent_phrase_len,
+      int num_frequent_phrases, int max_frequent_phrase_len,
       int min_frequency);
 
   // Given the locations of the frequent contiguous collocations in a sentence,
   // it adds new entries to the index for each discontiguous collocation
   // matching the criteria specified in the class description.
-  void AddCollocations(
-      const vector<std::tuple<int, int, int>>& matchings, const vector<int>& data,
-      int max_rule_span, int min_gap_size, int max_rule_symbols);
+  void AddCollocations(const vector<std::tuple<int, int, int>>& locations,
+                       const vector<int>& data, int max_rule_span,
+                       int min_gap_size, int max_rule_symbols);
 
-  // Adds an occurrence of a binary collocation.
-  void AddStartPositions(vector<int>& positions, int pos1, int pos2);
+  // Creates a vector representation for the location of a binary collocation
+  // containing the starting points of each subpattern.
+  vector<int> GetLocation(int pos1, int pos2);
 
-  // Adds an occurrence of a ternary collocation.
-  void AddStartPositions(vector<int>& positions, int pos1, int pos2, int pos3);
+  // Creates a vector representation for the location of a ternary collocation
+  // containing the starting points of each subpattern.
+  vector<int> GetLocation(int pos1, int pos2, int pos3);
+
+  // Appends a collocation to the list of collocations after shrinking the
+  // vectors to avoid unnecessary memory usage.
+  void AddCollocation(vector<int> collocation, vector<int> location);
 
   friend class boost::serialization::access;
 
@@ -91,13 +100,13 @@ class Precomputation {
     for (size_t i = 0; i < num_entries; ++i) {
       pair<vector<int>, vector<int>> entry;
       ar >> entry;
-      collocations.insert(entry);
+      collocations.push_back(entry);
     }
   }
 
   BOOST_SERIALIZATION_SPLIT_MEMBER();
 
-  Index collocations;
+  Collocations collocations;
 };
 
 } // namespace extractor
diff --git a/extractor/precomputation_test.cc b/extractor/precomputation_test.cc
index e81ece5d..c6e457fd 100644
--- a/extractor/precomputation_test.cc
+++ b/extractor/precomputation_test.cc
@@ -38,6 +38,23 @@ class PrecomputationTest : public Test {
     precomputation = Precomputation(suffix_array, 3, 3, 10, 5, 1, 4, 2);
   }
 
+  void CheckCollocation(const Collocations& collocations,
+                        const vector<int>& collocation,
+                        const vector<vector<int>>& locations) {
+    for (auto location: locations) {
+      auto item = make_pair(collocation, location);
+      EXPECT_FALSE(find(collocations.begin(), collocations.end(), item) ==
+                   collocations.end());
+    }
+  }
+
+  void CheckIllegalCollocation(const Collocations& collocations,
+                               const vector<int>& collocation) {
+    for (auto item: collocations) {
+      EXPECT_FALSE(collocation == item.first);
+    }
+  }
+
   vector<int> data;
   shared_ptr<MockDataArray> data_array;
   shared_ptr<MockSuffixArray> suffix_array;
@@ -45,67 +62,71 @@ class PrecomputationTest : public Test {
 };
 
 TEST_F(PrecomputationTest, TestCollocations) {
-  Index collocations = precomputation.GetCollocations();
-
-  vector<int> key = {2, 3, -1, 2};
-  vector<int> expected_value = {1, 5, 1, 8, 5, 8, 5, 11, 8, 11};
-  EXPECT_EQ(expected_value, collocations[key]);
-  key = {2, 3, -1, 2, 3};
-  expected_value = {1, 5, 1, 8, 5, 8};
-  EXPECT_EQ(expected_value, collocations[key]);
-  key = {2, 3, -1, 3};
-  expected_value = {1, 6, 1, 9, 5, 9};
-  EXPECT_EQ(expected_value, collocations[key]);
-  key = {3, -1, 2};
-  expected_value = {2, 5, 2, 8, 2, 11, 6, 8, 6, 11, 9, 11};
-  EXPECT_EQ(expected_value, collocations[key]);
-  key = {3, -1, 3};
-  expected_value = {2, 6, 2, 9, 6, 9};
-  EXPECT_EQ(expected_value, collocations[key]);
-  key = {3, -1, 2, 3};
-  expected_value = {2, 5, 2, 8, 6, 8};
-  EXPECT_EQ(expected_value, collocations[key]);
-  key = {2, -1, 2};
-  expected_value = {1, 5, 1, 8, 5, 8, 5, 11, 8, 11};
-  EXPECT_EQ(expected_value, collocations[key]);
-  key = {2, -1, 2, 3};
-  expected_value = {1, 5, 1, 8, 5, 8};
-  EXPECT_EQ(expected_value, collocations[key]);
-  key = {2, -1, 3};
-  expected_value = {1, 6, 1, 9, 5, 9};
-  EXPECT_EQ(expected_value, collocations[key]);
-
-  key = {2, -1, 2, -2, 2};
-  expected_value = {1, 5, 8, 5, 8, 11};
-  EXPECT_EQ(expected_value, collocations[key]);
-  key = {2, -1, 2, -2, 3};
-  expected_value = {1, 5, 9};
-  EXPECT_EQ(expected_value, collocations[key]);
-  key = {2, -1, 3, -2, 2};
-  expected_value = {1, 6, 8, 5, 9, 11};
-  EXPECT_EQ(expected_value, collocations[key]);
-  key = {2, -1, 3, -2, 3};
-  expected_value = {1, 6, 9};
-  EXPECT_EQ(expected_value, collocations[key]);
-  key = {3, -1, 2, -2, 2};
-  expected_value = {2, 5, 8, 2, 5, 11, 2, 8, 11, 6, 8, 11};
-  EXPECT_EQ(expected_value, collocations[key]);
-  key = {3, -1, 2, -2, 3};
-  expected_value = {2, 5, 9};
-  EXPECT_EQ(expected_value, collocations[key]);
-  key = {3, -1, 3, -2, 2};
-  expected_value = {2, 6, 8, 2, 6, 11, 2, 9, 11, 6, 9, 11};
-  EXPECT_EQ(expected_value, collocations[key]);
-  key = {3, -1, 3, -2, 3};
-  expected_value = {2, 6, 9};
-  EXPECT_EQ(expected_value, collocations[key]);
-
-  // Exceeds max_rule_symbols.
-  key = {2, -1, 2, -2, 2, 3};
-  EXPECT_EQ(0, collocations.count(key));
-  // Contains non frequent pattern.
-  key = {2, -1, 5};
-  EXPECT_EQ(0, collocations.count(key));
+  Collocations collocations = precomputation.GetCollocations();
+
+  EXPECT_EQ(50, collocations.size());
+
+  vector<int> collocation = {2, 3, -1, 2};
+  vector<vector<int>> locations = {{1, 5}, {1, 8}, {5, 8}, {5, 11}, {8, 11}};
+  CheckCollocation(collocations, collocation, locations);
+
+  collocation = {2, 3, -1, 2, 3};
+  locations = {{1, 5}, {1, 8}, {5, 8}};
+  CheckCollocation(collocations, collocation, locations);
+
+  collocation = {2, 3, -1, 3};
+  locations = {{1, 6}, {1, 9}, {5, 9}};
+  CheckCollocation(collocations, collocation, locations);
+  collocation = {3, -1, 2};
+  locations = {{2, 5}, {2, 8}, {2, 11}, {6, 8}, {6, 11}, {9, 11}};
+  CheckCollocation(collocations, collocation, locations);
+  collocation = {3, -1, 3};
+  locations = {{2, 6}, {2, 9}, {6, 9}};
+  CheckCollocation(collocations, collocation, locations);
+  collocation = {3, -1, 2, 3};
+  locations = {{2, 5}, {2, 8}, {6, 8}};
+  CheckCollocation(collocations, collocation, locations);
+  collocation = {2, -1, 2};
+  locations = {{1, 5}, {1, 8}, {5, 8}, {5, 11}, {8, 11}};
+  CheckCollocation(collocations, collocation, locations);
+  collocation = {2, -1, 2, 3};
+  locations = {{1, 5}, {1, 8}, {5, 8}};
+  CheckCollocation(collocations, collocation, locations);
+  collocation = {2, -1, 3};
+  locations = {{1, 6}, {1, 9}, {5, 9}};
+  CheckCollocation(collocations, collocation, locations);
+
+  collocation = {2, -1, 2, -2, 2};
+  locations = {{1, 5, 8}, {5, 8, 11}};
+  CheckCollocation(collocations, collocation, locations);
+  collocation = {2, -1, 2, -2, 3};
+  locations = {{1, 5, 9}};
+  CheckCollocation(collocations, collocation, locations);
+  collocation = {2, -1, 3, -2, 2};
+  locations = {{1, 6, 8}, {5, 9, 11}};
+  CheckCollocation(collocations, collocation, locations);
+  collocation = {2, -1, 3, -2, 3};
+  locations = {{1, 6, 9}};
+  CheckCollocation(collocations, collocation, locations);
+  collocation = {3, -1, 2, -2, 2};
+  locations = {{2, 5, 8}, {2, 5, 11}, {2, 8, 11}, {6, 8, 11}};
+  CheckCollocation(collocations, collocation, locations);
+  collocation = {3, -1, 2, -2, 3};
+  locations = {{2, 5, 9}};
+  CheckCollocation(collocations, collocation, locations);
+  collocation = {3, -1, 3, -2, 2};
+  locations = {{2, 6, 8}, {2, 6, 11}, {2, 9, 11}, {6, 9, 11}};
+  CheckCollocation(collocations, collocation, locations);
+  collocation = {3, -1, 3, -2, 3};
+  locations = {{2, 6, 9}};
+  CheckCollocation(collocations, collocation, locations);
+
+  // Collocation exceeds max_rule_symbols.
+  collocation = {2, -1, 2, -2, 2, 3};
+  CheckIllegalCollocation(collocations, collocation);
+  // Collocation contains non frequent pattern.
+  collocation = {2, -1, 5};
+  CheckIllegalCollocation(collocations, collocation);
 }
 
 TEST_F(PrecomputationTest, TestSerialization) {
-- 
cgit v1.2.3


From 36bc55a33b5f075f6479c64f94d60ecbbd1f3270 Mon Sep 17 00:00:00 2001
From: Paul Baltescu <pauldb89@gmail.com>
Date: Tue, 25 Jun 2013 17:28:54 +0100
Subject: Undo last commit.

---
 extractor/fast_intersector.cc         |  11 +--
 extractor/fast_intersector_test.cc    |   8 +-
 extractor/mocks/mock_precomputation.h |   2 +-
 extractor/precomputation.cc           | 138 +++++++++++++++-----------------
 extractor/precomputation.h            |  49 +++++-------
 extractor/precomputation_test.cc      | 143 +++++++++++++++-------------------
 6 files changed, 152 insertions(+), 199 deletions(-)

diff --git a/extractor/fast_intersector.cc b/extractor/fast_intersector.cc
index 5360c1da..a8591a72 100644
--- a/extractor/fast_intersector.cc
+++ b/extractor/fast_intersector.cc
@@ -20,13 +20,10 @@ FastIntersector::FastIntersector(shared_ptr<SuffixArray> suffix_array,
     vocabulary(vocabulary),
     max_rule_span(max_rule_span),
     min_gap_size(min_gap_size) {
-  auto precomputed_collocations = precomputation->GetCollocations();
-  for (auto item: precomputed_collocations) {
-    vector<int> phrase = ConvertPhrase(item.first);
-    vector<int> location = item.second;
-    vector<int>& phrase_collocations = collocations[phrase];
-    phrase_collocations.insert(phrase_collocations.end(), location.begin(),
-                               location.end());
+  Index precomputed_collocations = precomputation->GetCollocations();
+  for (pair<vector<int>, vector<int>> entry: precomputed_collocations) {
+    vector<int> phrase = ConvertPhrase(entry.first);
+    collocations[phrase] = entry.second;
   }
 }
 
diff --git a/extractor/fast_intersector_test.cc b/extractor/fast_intersector_test.cc
index 2e618b63..76c3aaea 100644
--- a/extractor/fast_intersector_test.cc
+++ b/extractor/fast_intersector_test.cc
@@ -60,14 +60,14 @@ class FastIntersectorTest : public Test {
 
     precomputation = make_shared<MockPrecomputation>();
     EXPECT_CALL(*precomputation, GetCollocations())
-        .WillRepeatedly(Return(collocations));
+        .WillRepeatedly(ReturnRef(collocations));
 
     phrase_builder = make_shared<PhraseBuilder>(vocabulary);
     intersector = make_shared<FastIntersector>(suffix_array, precomputation,
                                                vocabulary, 15, 1);
   }
 
-  Collocations collocations;
+  Index collocations;
   shared_ptr<MockDataArray> data_array;
   shared_ptr<MockSuffixArray> suffix_array;
   shared_ptr<MockPrecomputation> precomputation;
@@ -82,9 +82,9 @@ TEST_F(FastIntersectorTest, TestCachedCollocation) {
   Phrase phrase = phrase_builder->Build(symbols);
   PhraseLocation prefix_location(15, 16), suffix_location(16, 17);
 
-  collocations.push_back(make_pair(symbols, expected_location));
+  collocations[symbols] = expected_location;
   EXPECT_CALL(*precomputation, GetCollocations())
-      .WillRepeatedly(Return(collocations));
+      .WillRepeatedly(ReturnRef(collocations));
   intersector = make_shared<FastIntersector>(suffix_array, precomputation,
                                              vocabulary, 15, 1);
 
diff --git a/extractor/mocks/mock_precomputation.h b/extractor/mocks/mock_precomputation.h
index 86f4ce27..8753343e 100644
--- a/extractor/mocks/mock_precomputation.h
+++ b/extractor/mocks/mock_precomputation.h
@@ -6,7 +6,7 @@ namespace extractor {
 
 class MockPrecomputation : public Precomputation {
  public:
-  MOCK_CONST_METHOD0(GetCollocations, Collocations());
+  MOCK_CONST_METHOD0(GetCollocations, const Index&());
 };
 
 } // namespace extractor
diff --git a/extractor/precomputation.cc b/extractor/precomputation.cc
index 37dbf7b7..3b8aed69 100644
--- a/extractor/precomputation.cc
+++ b/extractor/precomputation.cc
@@ -14,65 +14,63 @@ int Precomputation::FIRST_NONTERMINAL = -1;
 int Precomputation::SECOND_NONTERMINAL = -2;
 
 Precomputation::Precomputation(
-    shared_ptr<SuffixArray> suffix_array, int num_frequent_phrases,
-    int num_super_frequent_phrases, int max_rule_span,
+    shared_ptr<SuffixArray> suffix_array, int num_frequent_patterns,
+    int num_super_frequent_patterns, int max_rule_span,
     int max_rule_symbols, int min_gap_size,
     int max_frequent_phrase_len, int min_frequency) {
   vector<int> data = suffix_array->GetData()->GetData();
-  vector<vector<int>> frequent_phrases = FindMostFrequentPhrases(
-      suffix_array, data, num_frequent_phrases, max_frequent_phrase_len,
+  vector<vector<int>> frequent_patterns = FindMostFrequentPatterns(
+      suffix_array, data, num_frequent_patterns, max_frequent_phrase_len,
       min_frequency);
 
   // Construct sets containing the frequent and superfrequent contiguous
   // collocations.
-  unordered_set<vector<int>, VectorHash> frequent_phrases_set;
-  unordered_set<vector<int>, VectorHash> super_frequent_phrases_set;
-  for (size_t i = 0; i < frequent_phrases.size(); ++i) {
-    frequent_phrases_set.insert(frequent_phrases[i]);
-    if (i < num_super_frequent_phrases) {
-      super_frequent_phrases_set.insert(frequent_phrases[i]);
+  unordered_set<vector<int>, VectorHash> frequent_patterns_set;
+  unordered_set<vector<int>, VectorHash> super_frequent_patterns_set;
+  for (size_t i = 0; i < frequent_patterns.size(); ++i) {
+    frequent_patterns_set.insert(frequent_patterns[i]);
+    if (i < num_super_frequent_patterns) {
+      super_frequent_patterns_set.insert(frequent_patterns[i]);
     }
   }
 
-  vector<tuple<int, int, int>> locations;
+  vector<tuple<int, int, int>> matchings;
   for (size_t i = 0; i < data.size(); ++i) {
-    // If the sentence is over, add all the discontiguous frequent phrases to
-    // the list.
+    // If the sentence is over, add all the discontiguous frequent patterns to
+    // the index.
     if (data[i] == DataArray::END_OF_LINE) {
-      AddCollocations(locations, data, max_rule_span, min_gap_size,
+      AddCollocations(matchings, data, max_rule_span, min_gap_size,
           max_rule_symbols);
-      locations.clear();
+      matchings.clear();
       continue;
     }
-    vector<int> phrase;
-    // Find all the contiguous frequent phrases starting at position i.
+    vector<int> pattern;
+    // Find all the contiguous frequent patterns starting at position i.
     for (int j = 1; j <= max_frequent_phrase_len && i + j <= data.size(); ++j) {
-      phrase.push_back(data[i + j - 1]);
-      if (frequent_phrases_set.count(phrase)) {
-        int is_super_frequent = super_frequent_phrases_set.count(phrase);
-        locations.push_back(make_tuple(i, j, is_super_frequent));
+      pattern.push_back(data[i + j - 1]);
+      if (frequent_patterns_set.count(pattern)) {
+        int is_super_frequent = super_frequent_patterns_set.count(pattern);
+        matchings.push_back(make_tuple(i, j, is_super_frequent));
       } else {
-        // If the current phrase is not frequent, any longer phrase having the
-        // current phrase as prefix will not be frequent.
+        // If the current pattern is not frequent, any longer pattern having the
+        // current pattern as prefix will not be frequent.
         break;
       }
     }
   }
-
-  collocations.shrink_to_fit();
 }
 
 Precomputation::Precomputation() {}
 
 Precomputation::~Precomputation() {}
 
-vector<vector<int>> Precomputation::FindMostFrequentPhrases(
+vector<vector<int>> Precomputation::FindMostFrequentPatterns(
     shared_ptr<SuffixArray> suffix_array, const vector<int>& data,
-    int num_frequent_phrases, int max_frequent_phrase_len, int min_frequency) {
+    int num_frequent_patterns, int max_frequent_phrase_len, int min_frequency) {
   vector<int> lcp = suffix_array->BuildLCPArray();
   vector<int> run_start(max_frequent_phrase_len);
 
-  // Find all the phrases occurring at least min_frequency times.
+  // Find all the patterns occurring at least min_frequency times.
   priority_queue<pair<int, pair<int, int>>> heap;
   for (size_t i = 1; i < lcp.size(); ++i) {
     for (int len = lcp[i]; len < max_frequent_phrase_len; ++len) {
@@ -85,34 +83,34 @@ vector<vector<int>> Precomputation::FindMostFrequentPhrases(
     }
   }
 
-  // Extract the most frequent phrases.
-  vector<vector<int>> frequent_phrases;
-  while (frequent_phrases.size() < num_frequent_phrases && !heap.empty()) {
+  // Extract the most frequent patterns.
+  vector<vector<int>> frequent_patterns;
+  while (frequent_patterns.size() < num_frequent_patterns && !heap.empty()) {
     int start = heap.top().second.first;
     int len = heap.top().second.second;
     heap.pop();
 
-    vector<int> phrase(data.begin() + start, data.begin() + start + len);
-    if (find(phrase.begin(), phrase.end(), DataArray::END_OF_LINE) ==
-        phrase.end()) {
-      frequent_phrases.push_back(phrase);
+    vector<int> pattern(data.begin() + start, data.begin() + start + len);
+    if (find(pattern.begin(), pattern.end(), DataArray::END_OF_LINE) ==
+        pattern.end()) {
+      frequent_patterns.push_back(pattern);
     }
   }
-  return frequent_phrases;
+  return frequent_patterns;
 }
 
 void Precomputation::AddCollocations(
-    const vector<tuple<int, int, int>>& locations, const vector<int>& data,
+    const vector<tuple<int, int, int>>& matchings, const vector<int>& data,
     int max_rule_span, int min_gap_size, int max_rule_symbols) {
-  // Select the leftmost subphrase.
-  for (size_t i = 0; i < locations.size(); ++i) {
+  // Select the leftmost subpattern.
+  for (size_t i = 0; i < matchings.size(); ++i) {
     int start1, size1, is_super1;
-    tie(start1, size1, is_super1) = locations[i];
+    tie(start1, size1, is_super1) = matchings[i];
 
-    // Select the second (middle) subphrase
-    for (size_t j = i + 1; j < locations.size(); ++j) {
+    // Select the second (middle) subpattern
+    for (size_t j = i + 1; j < matchings.size(); ++j) {
       int start2, size2, is_super2;
-      tie(start2, size2, is_super2) = locations[j];
+      tie(start2, size2, is_super2) = matchings[j];
       if (start2 - start1 >= max_rule_span) {
         break;
       }
@@ -120,21 +118,20 @@ void Precomputation::AddCollocations(
       if (start2 - start1 - size1 >= min_gap_size
           && start2 + size2 - start1 <= max_rule_span
           && size1 + size2 + 1 <= max_rule_symbols) {
-        vector<int> collocation(data.begin() + start1,
+        vector<int> pattern(data.begin() + start1,
             data.begin() + start1 + size1);
-        collocation.push_back(Precomputation::FIRST_NONTERMINAL);
-        collocation.insert(collocation.end(), data.begin() + start2,
+        pattern.push_back(Precomputation::FIRST_NONTERMINAL);
+        pattern.insert(pattern.end(), data.begin() + start2,
             data.begin() + start2 + size2);
-
-        AddCollocation(collocation, GetLocation(start1, start2));
+        AddStartPositions(collocations[pattern], start1, start2);
 
         // Try extending the binary collocation to a ternary collocation.
         if (is_super2) {
-          collocation.push_back(Precomputation::SECOND_NONTERMINAL);
-          // Select the rightmost subphrase.
-          for (size_t k = j + 1; k < locations.size(); ++k) {
+          pattern.push_back(Precomputation::SECOND_NONTERMINAL);
+          // Select the rightmost subpattern.
+          for (size_t k = j + 1; k < matchings.size(); ++k) {
             int start3, size3, is_super3;
-            tie(start3, size3, is_super3) = locations[k];
+            tie(start3, size3, is_super3) = matchings[k];
             if (start3 - start1 >= max_rule_span) {
               break;
             }
@@ -143,12 +140,10 @@ void Precomputation::AddCollocations(
                 && start3 + size3 - start1 <= max_rule_span
                 && size1 + size2 + size3 + 2 <= max_rule_symbols
                 && (is_super1 || is_super3)) {
-              collocation.insert(collocation.end(), data.begin() + start3,
+              pattern.insert(pattern.end(), data.begin() + start3,
                   data.begin() + start3 + size3);
-
-              AddCollocation(collocation, GetLocation(start1, start2, start3));
-
-              collocation.erase(collocation.end() - size3);
+              AddStartPositions(collocations[pattern], start1, start2, start3);
+              pattern.erase(pattern.end() - size3);
             }
           }
         }
@@ -157,29 +152,20 @@ void Precomputation::AddCollocations(
   }
 }
 
-vector<int> Precomputation::GetLocation(int pos1, int pos2) {
-  vector<int> location;
-  location.push_back(pos1);
-  location.push_back(pos2);
-  return location;
-}
-
-vector<int> Precomputation::GetLocation(int pos1, int pos2, int pos3) {
-  vector<int> location;
-  location.push_back(pos1);
-  location.push_back(pos2);
-  location.push_back(pos3);
-  return location;
+void Precomputation::AddStartPositions(
+    vector<int>& positions, int pos1, int pos2) {
+  positions.push_back(pos1);
+  positions.push_back(pos2);
 }
 
-void Precomputation::AddCollocation(vector<int> collocation,
-                                    vector<int> location) {
-  collocation.shrink_to_fit();
-  location.shrink_to_fit();
-  collocations.push_back(make_pair(collocation, location));
+void Precomputation::AddStartPositions(
+    vector<int>& positions, int pos1, int pos2, int pos3) {
+  positions.push_back(pos1);
+  positions.push_back(pos2);
+  positions.push_back(pos3);
 }
 
-Collocations Precomputation::GetCollocations() const {
+const Index& Precomputation::GetCollocations() const {
   return collocations;
 }
 
diff --git a/extractor/precomputation.h b/extractor/precomputation.h
index 0a06349b..9f0c9424 100644
--- a/extractor/precomputation.h
+++ b/extractor/precomputation.h
@@ -19,18 +19,16 @@ using namespace std;
 namespace extractor {
 
 typedef boost::hash<vector<int>> VectorHash;
-typedef vector<pair<vector<int>, vector<int>>> Collocations;
+typedef unordered_map<vector<int>, vector<int>, VectorHash> Index;
 
 class SuffixArray;
 
 /**
- * Data structure containing all the data needed for constructing an index with
- * all the occurrences of the most frequent discontiguous collocations in the
- * source data.
+ * Data structure wrapping an index with all the occurrences of the most
+ * frequent discontiguous collocations in the source data.
  *
- * Let a, b, c be contiguous phrases. The data structure will contain the
- * locations in the source data where every collocation of the following forms
- * occurs:
+ * Let a, b, c be contiguous collocations. The index will contain an entry for
+ * every collocation of the form:
  * - aXb, where a and b are frequent
  * - aXbXc, where a and b are super-frequent and c is frequent or
  *                b and c are super-frequent and a is frequent.
@@ -39,8 +37,8 @@ class Precomputation {
  public:
   // Constructs the index using the suffix array.
   Precomputation(
-      shared_ptr<SuffixArray> suffix_array, int num_frequent_phrases,
-      int num_super_frequent_phrases, int max_rule_span,
+      shared_ptr<SuffixArray> suffix_array, int num_frequent_patterns,
+      int num_super_frequent_patterns, int max_rule_span,
       int max_rule_symbols, int min_gap_size,
       int max_frequent_phrase_len, int min_frequency);
 
@@ -49,9 +47,8 @@ class Precomputation {
 
   virtual ~Precomputation();
 
-  // Returns the list of the locations of the most frequent collocations in the
-  // source data.
-  virtual Collocations GetCollocations() const;
+  // Returns a reference to the index.
+  virtual const Index& GetCollocations() const;
 
   bool operator==(const Precomputation& other) const;
 
@@ -60,29 +57,23 @@ class Precomputation {
 
  private:
   // Finds the most frequent contiguous collocations.
-  vector<vector<int>> FindMostFrequentPhrases(
+  vector<vector<int>> FindMostFrequentPatterns(
       shared_ptr<SuffixArray> suffix_array, const vector<int>& data,
-      int num_frequent_phrases, int max_frequent_phrase_len,
+      int num_frequent_patterns, int max_frequent_phrase_len,
       int min_frequency);
 
   // Given the locations of the frequent contiguous collocations in a sentence,
   // it adds new entries to the index for each discontiguous collocation
   // matching the criteria specified in the class description.
-  void AddCollocations(const vector<std::tuple<int, int, int>>& locations,
-                       const vector<int>& data, int max_rule_span,
-                       int min_gap_size, int max_rule_symbols);
+  void AddCollocations(
+      const vector<std::tuple<int, int, int>>& matchings, const vector<int>& data,
+      int max_rule_span, int min_gap_size, int max_rule_symbols);
 
-  // Creates a vector representation for the location of a binary collocation
-  // containing the starting points of each subpattern.
-  vector<int> GetLocation(int pos1, int pos2);
+  // Adds an occurrence of a binary collocation.
+  void AddStartPositions(vector<int>& positions, int pos1, int pos2);
 
-  // Creates a vector representation for the location of a ternary collocation
-  // containing the starting points of each subpattern.
-  vector<int> GetLocation(int pos1, int pos2, int pos3);
-
-  // Appends a collocation to the list of collocations after shrinking the
-  // vectors to avoid unnecessary memory usage.
-  void AddCollocation(vector<int> collocation, vector<int> location);
+  // Adds an occurrence of a ternary collocation.
+  void AddStartPositions(vector<int>& positions, int pos1, int pos2, int pos3);
 
   friend class boost::serialization::access;
 
@@ -100,13 +91,13 @@ class Precomputation {
     for (size_t i = 0; i < num_entries; ++i) {
       pair<vector<int>, vector<int>> entry;
       ar >> entry;
-      collocations.push_back(entry);
+      collocations.insert(entry);
     }
   }
 
   BOOST_SERIALIZATION_SPLIT_MEMBER();
 
-  Collocations collocations;
+  Index collocations;
 };
 
 } // namespace extractor
diff --git a/extractor/precomputation_test.cc b/extractor/precomputation_test.cc
index c6e457fd..e81ece5d 100644
--- a/extractor/precomputation_test.cc
+++ b/extractor/precomputation_test.cc
@@ -38,23 +38,6 @@ class PrecomputationTest : public Test {
     precomputation = Precomputation(suffix_array, 3, 3, 10, 5, 1, 4, 2);
   }
 
-  void CheckCollocation(const Collocations& collocations,
-                        const vector<int>& collocation,
-                        const vector<vector<int>>& locations) {
-    for (auto location: locations) {
-      auto item = make_pair(collocation, location);
-      EXPECT_FALSE(find(collocations.begin(), collocations.end(), item) ==
-                   collocations.end());
-    }
-  }
-
-  void CheckIllegalCollocation(const Collocations& collocations,
-                               const vector<int>& collocation) {
-    for (auto item: collocations) {
-      EXPECT_FALSE(collocation == item.first);
-    }
-  }
-
   vector<int> data;
   shared_ptr<MockDataArray> data_array;
   shared_ptr<MockSuffixArray> suffix_array;
@@ -62,71 +45,67 @@ class PrecomputationTest : public Test {
 };
 
 TEST_F(PrecomputationTest, TestCollocations) {
-  Collocations collocations = precomputation.GetCollocations();
-
-  EXPECT_EQ(50, collocations.size());
-
-  vector<int> collocation = {2, 3, -1, 2};
-  vector<vector<int>> locations = {{1, 5}, {1, 8}, {5, 8}, {5, 11}, {8, 11}};
-  CheckCollocation(collocations, collocation, locations);
-
-  collocation = {2, 3, -1, 2, 3};
-  locations = {{1, 5}, {1, 8}, {5, 8}};
-  CheckCollocation(collocations, collocation, locations);
-
-  collocation = {2, 3, -1, 3};
-  locations = {{1, 6}, {1, 9}, {5, 9}};
-  CheckCollocation(collocations, collocation, locations);
-  collocation = {3, -1, 2};
-  locations = {{2, 5}, {2, 8}, {2, 11}, {6, 8}, {6, 11}, {9, 11}};
-  CheckCollocation(collocations, collocation, locations);
-  collocation = {3, -1, 3};
-  locations = {{2, 6}, {2, 9}, {6, 9}};
-  CheckCollocation(collocations, collocation, locations);
-  collocation = {3, -1, 2, 3};
-  locations = {{2, 5}, {2, 8}, {6, 8}};
-  CheckCollocation(collocations, collocation, locations);
-  collocation = {2, -1, 2};
-  locations = {{1, 5}, {1, 8}, {5, 8}, {5, 11}, {8, 11}};
-  CheckCollocation(collocations, collocation, locations);
-  collocation = {2, -1, 2, 3};
-  locations = {{1, 5}, {1, 8}, {5, 8}};
-  CheckCollocation(collocations, collocation, locations);
-  collocation = {2, -1, 3};
-  locations = {{1, 6}, {1, 9}, {5, 9}};
-  CheckCollocation(collocations, collocation, locations);
-
-  collocation = {2, -1, 2, -2, 2};
-  locations = {{1, 5, 8}, {5, 8, 11}};
-  CheckCollocation(collocations, collocation, locations);
-  collocation = {2, -1, 2, -2, 3};
-  locations = {{1, 5, 9}};
-  CheckCollocation(collocations, collocation, locations);
-  collocation = {2, -1, 3, -2, 2};
-  locations = {{1, 6, 8}, {5, 9, 11}};
-  CheckCollocation(collocations, collocation, locations);
-  collocation = {2, -1, 3, -2, 3};
-  locations = {{1, 6, 9}};
-  CheckCollocation(collocations, collocation, locations);
-  collocation = {3, -1, 2, -2, 2};
-  locations = {{2, 5, 8}, {2, 5, 11}, {2, 8, 11}, {6, 8, 11}};
-  CheckCollocation(collocations, collocation, locations);
-  collocation = {3, -1, 2, -2, 3};
-  locations = {{2, 5, 9}};
-  CheckCollocation(collocations, collocation, locations);
-  collocation = {3, -1, 3, -2, 2};
-  locations = {{2, 6, 8}, {2, 6, 11}, {2, 9, 11}, {6, 9, 11}};
-  CheckCollocation(collocations, collocation, locations);
-  collocation = {3, -1, 3, -2, 3};
-  locations = {{2, 6, 9}};
-  CheckCollocation(collocations, collocation, locations);
-
-  // Collocation exceeds max_rule_symbols.
-  collocation = {2, -1, 2, -2, 2, 3};
-  CheckIllegalCollocation(collocations, collocation);
-  // Collocation contains non frequent pattern.
-  collocation = {2, -1, 5};
-  CheckIllegalCollocation(collocations, collocation);
+  Index collocations = precomputation.GetCollocations();
+
+  vector<int> key = {2, 3, -1, 2};
+  vector<int> expected_value = {1, 5, 1, 8, 5, 8, 5, 11, 8, 11};
+  EXPECT_EQ(expected_value, collocations[key]);
+  key = {2, 3, -1, 2, 3};
+  expected_value = {1, 5, 1, 8, 5, 8};
+  EXPECT_EQ(expected_value, collocations[key]);
+  key = {2, 3, -1, 3};
+  expected_value = {1, 6, 1, 9, 5, 9};
+  EXPECT_EQ(expected_value, collocations[key]);
+  key = {3, -1, 2};
+  expected_value = {2, 5, 2, 8, 2, 11, 6, 8, 6, 11, 9, 11};
+  EXPECT_EQ(expected_value, collocations[key]);
+  key = {3, -1, 3};
+  expected_value = {2, 6, 2, 9, 6, 9};
+  EXPECT_EQ(expected_value, collocations[key]);
+  key = {3, -1, 2, 3};
+  expected_value = {2, 5, 2, 8, 6, 8};
+  EXPECT_EQ(expected_value, collocations[key]);
+  key = {2, -1, 2};
+  expected_value = {1, 5, 1, 8, 5, 8, 5, 11, 8, 11};
+  EXPECT_EQ(expected_value, collocations[key]);
+  key = {2, -1, 2, 3};
+  expected_value = {1, 5, 1, 8, 5, 8};
+  EXPECT_EQ(expected_value, collocations[key]);
+  key = {2, -1, 3};
+  expected_value = {1, 6, 1, 9, 5, 9};
+  EXPECT_EQ(expected_value, collocations[key]);
+
+  key = {2, -1, 2, -2, 2};
+  expected_value = {1, 5, 8, 5, 8, 11};
+  EXPECT_EQ(expected_value, collocations[key]);
+  key = {2, -1, 2, -2, 3};
+  expected_value = {1, 5, 9};
+  EXPECT_EQ(expected_value, collocations[key]);
+  key = {2, -1, 3, -2, 2};
+  expected_value = {1, 6, 8, 5, 9, 11};
+  EXPECT_EQ(expected_value, collocations[key]);
+  key = {2, -1, 3, -2, 3};
+  expected_value = {1, 6, 9};
+  EXPECT_EQ(expected_value, collocations[key]);
+  key = {3, -1, 2, -2, 2};
+  expected_value = {2, 5, 8, 2, 5, 11, 2, 8, 11, 6, 8, 11};
+  EXPECT_EQ(expected_value, collocations[key]);
+  key = {3, -1, 2, -2, 3};
+  expected_value = {2, 5, 9};
+  EXPECT_EQ(expected_value, collocations[key]);
+  key = {3, -1, 3, -2, 2};
+  expected_value = {2, 6, 8, 2, 6, 11, 2, 9, 11, 6, 9, 11};
+  EXPECT_EQ(expected_value, collocations[key]);
+  key = {3, -1, 3, -2, 3};
+  expected_value = {2, 6, 9};
+  EXPECT_EQ(expected_value, collocations[key]);
+
+  // Exceeds max_rule_symbols.
+  key = {2, -1, 2, -2, 2, 3};
+  EXPECT_EQ(0, collocations.count(key));
+  // Contains non frequent pattern.
+  key = {2, -1, 5};
+  EXPECT_EQ(0, collocations.count(key));
 }
 
 TEST_F(PrecomputationTest, TestSerialization) {
-- 
cgit v1.2.3


From eb4259454f963e4189dab65a12302f467e598621 Mon Sep 17 00:00:00 2001
From: Paul Baltescu <pauldb89@gmail.com>
Date: Tue, 25 Jun 2013 22:11:14 +0100
Subject: Updated README.

---
 extractor/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/extractor/README.md b/extractor/README.md
index 575f5ca5..c9db8de8 100644
--- a/extractor/README.md
+++ b/extractor/README.md
@@ -2,7 +2,7 @@ C++ implementation of the online grammar extractor originally developed by [Adam
 
 To run the extractor you need to:
 
-    cdec/extractor/run_extractor -a <alignment> -b <parallel_corpus> -g <grammar_output_path> < <input_sentences> > <sgm_file>
+    cdec/extractor/run_extractor -t <num_threads> -a <alignment> -b <parallel_corpus> -g <grammar_output_path> < <input_sentences> > <sgm_file>
 
 To run unit tests you need first to configure `cdec` with the [Google Test](https://code.google.com/p/googletest/) and [Google Mock](https://code.google.com/p/googlemock/) libraries:
 
-- 
cgit v1.2.3