From 3bca2ef755eb45f7f6e39e92a9adc02a7b4c84f0 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Mon, 24 Jun 2013 23:10:12 -0400 Subject: remove old mira script, fix default number of iterations in new one --- training/mira/mira.py | 2 +- training/mira/run_mira.pl | 630 ---------------------------------------------- 2 files changed, 1 insertion(+), 631 deletions(-) delete mode 100755 training/mira/run_mira.pl diff --git a/training/mira/mira.py b/training/mira/mira.py index f031c313..29c51e1d 100755 --- a/training/mira/mira.py +++ b/training/mira/mira.py @@ -96,7 +96,7 @@ def main(): parser.add_argument('-m', '--metric', default='ibm_bleu', help='metric to optimize. Example values: ' 'ibm_bleu, nist_bleu, Koehn_bleu, TER, Combi') - parser.add_argument('--max-iterations', type=int, default=10, metavar='N', + parser.add_argument('--max-iterations', type=int, default=20, metavar='N', help='maximum number of iterations to run') parser.add_argument('--optimizer', type=int, default=2, choices=range(1,6), help='learning method to use for weight update.' diff --git a/training/mira/run_mira.pl b/training/mira/run_mira.pl deleted file mode 100755 index d71590ba..00000000 --- a/training/mira/run_mira.pl +++ /dev/null @@ -1,630 +0,0 @@ -#!/usr/bin/env perl -use strict; -my @ORIG_ARGV=@ARGV; -use Cwd qw(getcwd); -my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); -push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../../environment"; } - -# Skip local config (used for distributing jobs) if we're running in local-only mode -use LocalConfig; -use Getopt::Long; -use IPC::Open2; -use POSIX ":sys_wait_h"; -my $QSUB_CMD = qsub_args(mert_memory()); -my $default_jobs = env_default_jobs(); - -my $srcFile; -my $refFiles; -my $bin_dir = $SCRIPT_DIR; -die "Bin directory $bin_dir missing/inaccessible" unless -d $bin_dir; -my $FAST_SCORE="$bin_dir/../../mteval/fast_score"; -die "Can't execute $FAST_SCORE" unless -x $FAST_SCORE; - -my $iteration = 0.0; -my $max_iterations = 10; -my $metric = "ibm_bleu"; -my $iniFile; -my $weights; -my $initialWeights; -my $jobs = $default_jobs; # number of decode nodes -my $pmem = "1g"; -my $dir; - -my $SCORER = $FAST_SCORE; - -my $UTILS_DIR="$SCRIPT_DIR/../utils"; -require "$UTILS_DIR/libcall.pl"; - -my $parallelize = "$UTILS_DIR/parallelize.pl"; -my $libcall = "$UTILS_DIR/libcall.pl"; -my $sentserver = "$UTILS_DIR/sentserver"; -my $sentclient = "$UTILS_DIR/sentclient"; - -my $run_local = 0; -my $pass_suffix = ''; - -my $cdec ="$bin_dir/kbest_cut_mira"; - -die "Can't find decoder in $cdec" unless -x $cdec; -my $decoder = $cdec; -my $decoderOpt; -my $update_size; -my $approx_score; -my $kbest_size=250; -my $metric_scale=1; -my $optimizer=2; -my $disable_clean = 0; -my $use_make=0; -my $density_prune; -my $cpbin=1; -my $help = 0; -my $epsilon = 0.0001; -my $step_size = 0.01; -my $gpref; -my $unique_kbest; -my $freeze; -my $hopes=1; -my $fears=1; -my $sent_approx=0; -my $pseudo_doc=0; - -my $range = 35000; -my $minimum = 15000; -my $portn = int(rand($range)) + $minimum; - - -# Process command-line options -Getopt::Long::Configure("no_auto_abbrev"); -if (GetOptions( - "decoder=s" => \$decoderOpt, - "jobs=i" => \$jobs, - "density-prune=f" => \$density_prune, - "dont-clean" => \$disable_clean, - "pass-suffix=s" => \$pass_suffix, - "epsilon=s" => \$epsilon, - "help" => \$help, - "local" => \$run_local, - "use-make=i" => \$use_make, - "max-iterations=i" => \$max_iterations, - "pmem=s" => \$pmem, - "cpbin!" => \$cpbin, - "ref-files=s" => \$refFiles, - "metric=s" => \$metric, - "source-file=s" => \$srcFile, - "weights=s" => \$initialWeights, - "optimizer=i" => \$optimizer, - "metric-scale=i" => \$metric_scale, - "kbest-size=i" => \$kbest_size, - "update-size=i" => \$update_size, - "step-size=f" => \$step_size, - "hope-select=i" => \$hopes, - "fear-select=i" => \$fears, - "sent-approx" => \$sent_approx, - "pseudo-doc" => \$pseudo_doc, - "unique-kbest" => \$unique_kbest, - "grammar-prefix=s" => \$gpref, - "freeze" => \$freeze, - "workdir=s" => \$dir, - ) == 0 || @ARGV!=1 || $help) { - print_help(); - exit; -} - -($iniFile) = @ARGV; - - -sub write_config; -sub enseg; -sub print_help; - -my $nodelist; -my $host =check_output("hostname"); chomp $host; -my $bleu; -my $interval_count = 0; -my $logfile; -my $projected_score; - - -#my $refs_comma_sep = get_comma_sep_refs($refFiles); -my $refs_comma_sep = get_comma_sep_refs('r',$refFiles); - -#my $refs_comma_sep_4cdec = get_comma_sep_refs_4cdec($refFiles); - -unless ($dir){ - $dir = "mira"; -} -unless ($dir =~ /^\//){ # convert relative path to absolute path - my $basedir = check_output("pwd"); - chomp $basedir; - $dir = "$basedir/$dir"; -} - -if ($decoderOpt){ $decoder = $decoderOpt; } - -# Initializations and helper functions -srand; - -my @childpids = (); -my @cleanupcmds = (); - -sub cleanup { - print STDERR "Cleanup...\n"; - for my $pid (@childpids){ unchecked_call("kill $pid"); } - for my $cmd (@cleanupcmds){ unchecked_call("$cmd"); } - exit 1; -}; - -# Always call cleanup, no matter how we exit -*CORE::GLOBAL::exit = - sub{ cleanup(); }; -$SIG{INT} = "cleanup"; -$SIG{TERM} = "cleanup"; -$SIG{HUP} = "cleanup"; - - -my $decoderBase = check_output("basename $decoder"); chomp $decoderBase; -my $newIniFile = "$dir/$decoderBase.ini"; -my $inputFileName = "$dir/input"; -my $user = $ENV{"USER"}; - - -# process ini file --e $iniFile || die "Error: could not open $iniFile for reading\n"; -open(INI, $iniFile); - -use File::Basename qw(basename); -#pass bindir, refs to vars holding bin -sub modbin { - local $_; - my $bindir=shift; - check_call("mkdir -p $bindir"); - -d $bindir || die "couldn't make bindir $bindir"; - for (@_) { - my $src=$$_; - $$_="$bindir/".basename($src); - check_call("cp -p $src $$_"); - } -} -sub dirsize { - opendir ISEMPTY,$_[0]; - return scalar(readdir(ISEMPTY))-1; -} - - - - -if (-e $dir && dirsize($dir)>1 && -e "$dir/weights" ){ # allow preexisting logfile, binaries, but not dist-vest.pl outputs - die "ERROR: working dir $dir already exists\n\n"; -} else { - -e $dir || mkdir $dir; - mkdir "$dir/scripts"; - my $cmdfile="$dir/rerun-mira.sh"; - open CMD,'>',$cmdfile; - print CMD "cd ",&getcwd,"\n"; - my $cline=&cmdline."\n"; - print CMD $cline; - close CMD; - print STDERR $cline; - chmod(0755,$cmdfile); - unless (-e $initialWeights) { - print STDERR "Please specify an initial weights file with --initial-weights\n"; - print_help(); - exit; - } - check_call("cp $initialWeights $dir/weights.0"); - die "Can't find weights.0" unless (-e "$dir/weights.0"); -} -write_config(*STDERR); - -# Generate initial files and values -check_call("cp $iniFile $newIniFile"); -$iniFile = $newIniFile; - -my $newsrc = "$dir/dev.input"; -enseg($srcFile, $newsrc, $gpref); - -$srcFile = $newsrc; -my $devSize = 0; -open F, "<$srcFile" or die "Can't read $srcFile: $!"; -while() { $devSize++; } -close F; - -my $lastPScore = 0; -my $lastWeightsFile; -my $bestScoreIter=-1; -my $bestScore=-1; -unless ($update_size){$update_size = $kbest_size;} -# main optimization loop -#while (1){ -for (my $opt_iter=0; $opt_iter<$max_iterations; $opt_iter++) { - - print STDERR "\n\nITERATION $opt_iter\n==========\n"; - print STDERR "Using port $portn\n"; - - # iteration-specific files - my $runFile="$dir/run.raw.$opt_iter"; - my $onebestFile="$dir/1best.$opt_iter"; - my $logdir="$dir/logs.$opt_iter"; - my $decoderLog="$logdir/decoder.sentserver.log.$opt_iter"; - my $scorerLog="$logdir/scorer.log.$opt_iter"; - my $weightdir="$dir/weights.pass$opt_iter/"; - check_call("mkdir -p $logdir"); - check_call("mkdir -p $weightdir"); - - #decode - print STDERR "RUNNING DECODER AT "; - print STDERR unchecked_output("date"); -# my $im1 = $opt_iter - 1; - my $weightsFile="$dir/weights.$opt_iter"; - print "ITER $iteration " ; - my $cur_pass = "-p 0$opt_iter"; - my $decoder_cmd = "$decoder -c $iniFile -w $weightsFile $refs_comma_sep -m $metric -s $metric_scale -b $update_size -k $kbest_size -o $optimizer $cur_pass -O $weightdir -D $dir -h $hopes -f $fears -C $step_size"; - if($unique_kbest){ - $decoder_cmd .= " -u"; - } - if($sent_approx){ - $decoder_cmd .= " -a"; - } - if($pseudo_doc){ - $decoder_cmd .= " -e"; - } - if ($density_prune) { - $decoder_cmd .= " --density_prune $density_prune"; - } - my $pcmd; - if ($run_local) { - $pcmd = "cat $srcFile |"; - } elsif ($use_make) { - # TODO: Throw error when jobs is speong with use_make - $pcmd = "cat $srcFile | $parallelize --use-fork -p $pmem -e $logdir -j $use_make --"; - } - else { - $pcmd = "cat $srcFile | $parallelize -p $pmem -e $logdir -j $jobs --baseport $portn --"; - } - my $cmd = "$pcmd $decoder_cmd 2> $decoderLog 1> $runFile"; - print STDERR "COMMAND:\n$cmd\n"; - check_bash_call($cmd); - - my $retries = 0; - my $num_topbest; - while($retries < 6) { - $num_topbest = check_output("wc -l < $runFile"); - print STDERR "NUMBER OF TOP-BEST HYPs: $num_topbest\n"; - if($devSize == $num_topbest) { - last; - } else { - print STDERR "Incorrect number of topbest. Waiting for distributed filesystem and retrying...\n"; - sleep(10); - } - $retries++; - } - die "Dev set contains $devSize sentences, but we don't have topbest for all these! Decoder failure? Check $decoderLog\n" if ($devSize != $num_topbest); - - - #score the output from this iteration - open RUN, "<$runFile" or die "Can't read $runFile: $!"; - open H, ">$runFile.H" or die; - open F, ">$runFile.F" or die; - open B, ">$runFile.B" or die; - while() { - chomp(); - (my $hope,my $best,my $fear) = split(/ \|\|\| /); - print H "$hope \n"; - print B "$best \n"; - print F "$fear \n"; - } - close RUN; - close F; close B; close H; - - my $dec_score = check_output("cat $runFile.B | $SCORER $refs_comma_sep -m $metric"); - my $dec_score_h = check_output("cat $runFile.H | $SCORER $refs_comma_sep -m $metric"); - my $dec_score_f = check_output("cat $runFile.F | $SCORER $refs_comma_sep -m $metric"); - chomp $dec_score; chomp $dec_score_h; chomp $dec_score_f; - print STDERR "DECODER SCORE: $dec_score HOPE: $dec_score_h FEAR: $dec_score_f\n"; - if ($dec_score> $bestScore){ - $bestScoreIter=$opt_iter; - $bestScore=$dec_score; - } - # save space - check_call("gzip -f $runFile"); - check_call("gzip -f $decoderLog"); - my $iter_filler=""; - if($opt_iter < 10) - {$iter_filler="0";} - - my $nextIter = $opt_iter + 1; - my $newWeightsFile = "$dir/weights.$nextIter"; - $lastWeightsFile = "$dir/weights.$opt_iter"; - - average_weights("$weightdir/weights.mira-pass*.*[0-9].gz", $newWeightsFile, $logdir); - system("gzip -f $logdir/kbes*"); - print STDERR "\n==========\n"; - $iteration++; -} -print STDERR "\nBEST ITER: $bestScoreIter :: $bestScore\n\n\n"; - -print STDOUT "$lastWeightsFile\n"; - -sub get_lines { - my $fn = shift @_; - open FL, "<$fn" or die "Couldn't read $fn: $!"; - my $lc = 0; - while() { $lc++; } - return $lc; -} - -sub get_comma_sep_refs { - my ($r,$p) = @_; - my $o = check_output("echo $p"); - chomp $o; - my @files = split /\s+/, $o; - return "-$r " . join(" -$r ", @files); -} - - -sub read_weights_file { - my ($file) = @_; - open F, "<$file" or die "Couldn't read $file: $!"; - my @r = (); - my $pm = -1; - while() { - next if /^#/; - next if /^\s*$/; - chomp; - if (/^(.+)\s+(.+)$/) { - my $m = $1; - my $w = $2; - die "Weights out of order: $m <= $pm" unless $m > $pm; - push @r, $w; - } else { - warn "Unexpected feature name in weight file: $_"; - } - } - close F; - return join ' ', @r; -} - -sub write_config { - my $fh = shift; - my $cleanup = "yes"; - if ($disable_clean) {$cleanup = "no";} - - print $fh "\n"; - print $fh "DECODER: $decoder\n"; - print $fh "INI FILE: $iniFile\n"; - print $fh "WORKING DIR: $dir\n"; - print $fh "SOURCE (DEV): $srcFile\n"; - print $fh "REFS (DEV): $refFiles\n"; - print $fh "EVAL METRIC: $metric\n"; - print $fh "START ITERATION: $iteration\n"; - print $fh "MAX ITERATIONS: $max_iterations\n"; - print $fh "DECODE NODES: $jobs\n"; - print $fh "HEAD NODE: $host\n"; - print $fh "PMEM (DECODING): $pmem\n"; - print $fh "CLEANUP: $cleanup\n"; - print $fh "INITIAL WEIGHTS: $initialWeights\n"; - print $fh "GRAMMAR PREFIX: $gpref\n"; -} - -sub update_weights_file { - my ($neww, $rfn, $rpts) = @_; - my @feats = @$rfn; - my @pts = @$rpts; - my $num_feats = scalar @feats; - my $num_pts = scalar @pts; - die "$num_feats (num_feats) != $num_pts (num_pts)" unless $num_feats == $num_pts; - open G, ">$neww" or die; - for (my $i = 0; $i < $num_feats; $i++) { - my $f = $feats[$i]; - my $lambda = $pts[$i]; - print G "$f $lambda\n"; - } - close G; -} - -sub enseg { - my $src = shift; - my $newsrc = shift; - my $grammarpref = shift; - - open(SRC, $src); - open(NEWSRC, ">$newsrc"); - my $i=0; - while (my $line=){ - chomp $line; - if ($line =~ /^\s* tags, you must include a zero-based id attribute"; - } - } - elsif (defined $grammarpref) { - print NEWSRC "$line\n";} - else { - print NEWSRC "$line\n"; - } - $i++; - } - close SRC; - close NEWSRC; -} - -sub print_help { - my $executable = check_output("basename $0"); chomp $executable; - print << "Help"; - -Usage: $executable [options] - Runs a complete MIRA optimization using the ini file specified. - Example invocation: - run_mira.pl \ - --pmem 3g \ - --max-iterations 20 \ - --optimizer 2 \ - --unique-kbest \ - --jobs 15 \ - --kbest-size 500 \ - --hope-select 1 \ - --fear-select 1 \ - --ref-files "ref.0.soseos ref.1.soseos" \ - --source-file src.soseos \ - --weights weights.init \ - --workdir workdir \ - --grammar-prefix grammars/grammar \ - --step-size 0.01 \ - --metric-scale 10000 \ - -Required: - - --ref-files - Dev set ref files. This option takes only a single string argument. - To use multiple files (including file globbing), this argument should - be quoted. - --source-file - Dev set source file. - --weights - Initial weights file - -General options: - - --help - Print this message and exit. - - --max-iterations - Maximum number of iterations to run. If not specified, defaults - to $max_iterations. - - --metric - Metric to optimize. - Example values: IBM_BLEU, NIST_BLEU, Koehn_BLEU, TER, Combi - - --workdir - Directory for intermediate and output files. If not specified, the - name is derived from the ini filename. Assuming that the ini - filename begins with the decoder name and ends with ini, the default - name of the working directory is inferred from the middle part of - the filename. E.g. an ini file named decoder.foo.ini would have - a default working directory name foo. - --optimizer - Learning method to use for weight update. Choice are 1) SGD, 2) PA MIRA with Selection from Cutting Plane, 3) Cutting Plane MIRA, 4) PA MIRA,5) nbest MIRA with hope, fear, and model constraints - --metric-scale - Scale MT loss by this amount when computing hope/fear candidates - --kbest-size - Size of k-best list to extract from forest - --update-size - Size of k-best list to use for update (applies to optimizer 5) - --step-size - Controls aggresiveness of update (C) - --hope-select - How to select hope candidate. Choices are 1) model score - cost, 2) min cost - --fear-select - How to select fear candodate. Choices are 1) model score + cost, 2) max cost, 3) max score - --sent-approx - Use smoothed sentence-level MT metric - --pseudo-doc - Use pseudo document to approximate MT metric - --unique-kbest - Extract unique k-best from forest - --grammar-prefix - Path to sentence-specific grammar files - -Job control options: - - --jobs - Number of decoder processes to run in parallel. [default=$default_jobs] - - --pmem - Amount of physical memory requested for parallel decoding jobs - (used with qsub requests only) - - --local - Run single learner - --use-make - Run parallel learners on a single machine through fork. - - -Help -} - - -sub cmdline { - return join ' ',($0,@ORIG_ARGV); -} - -#buggy: last arg gets quoted sometimes? -my $is_shell_special=qr{[ \t\n\\><|&;"'`~*?{}$!()]}; -my $shell_escape_in_quote=qr{[\\"\$`!]}; - -sub escape_shell { - my ($arg)=@_; - return undef unless defined $arg; - if ($arg =~ /$is_shell_special/) { - $arg =~ s/($shell_escape_in_quote)/\\$1/g; - return "\"$arg\""; - } - return $arg; -} - -sub escaped_shell_args { - return map {local $_=$_;chomp;escape_shell($_)} @_; -} - -sub escaped_shell_args_str { - return join ' ',&escaped_shell_args(@_); -} - -sub escaped_cmdline { - return "$0 ".&escaped_shell_args_str(@ORIG_ARGV); -} - -sub average_weights { - - my $path = shift; - my $out = shift; - my $logpath = shift; - print "AVERAGE $path $out\n"; - my %feature_weights= (); - my $total =0; - my $total_mult =0; - sleep(10); - foreach my $file (glob "$path") - { - $file =~ /\/([^\/]+).gz$/; - my $fname = $1; - my $cmd = "gzip -d $file"; - $file =~ s/\.gz//; - check_bash_call($cmd); - my $mult = 0; - print "FILE $file \n"; - open SCORE, "< $file" or next; - $total++; - while( ) { - my $line = $_; - if ($line !~ m/^\#/) - { - my @s = split(" ",$line); - $feature_weights{$s[0]}+= $mult * $s[1]; - } - else - { - (my $msg,my $ran,$mult) = split(/ \|\|\| /); - print "Processing $ran $mult\n"; - } - } - $total_mult += $mult; - - close SCORE; - $cmd = "gzip $file"; check_bash_call($cmd); - } - -#print out new averaged weights - open OUT, "> $out" or next; - for my $f ( keys %feature_weights ) { - print "$f $feature_weights{$f} $total_mult\n"; - my $ave = $feature_weights{$f} / $total_mult; - - print "Printing $f $ave ||| "; - print OUT "$f $ave\n"; - } - -} -- cgit v1.2.3 From 15cbad322cffd1aaa50a380b5fcee04e159423cd Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Mon, 24 Jun 2013 23:26:53 -0400 Subject: fix makefile --- klm/util/Makefile.am | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/klm/util/Makefile.am b/klm/util/Makefile.am index 7f873e96..5e650af7 100644 --- a/klm/util/Makefile.am +++ b/klm/util/Makefile.am @@ -19,40 +19,42 @@ noinst_LIBRARIES = libklm_util.a libklm_util_a_SOURCES = \ + bit_packing.cc \ bit_packing.hh \ + ersatz_progress.cc \ ersatz_progress.hh \ + exception.cc \ exception.hh \ + fake_ofstream.hh \ + file.cc \ file.hh \ + file_piece.cc \ file_piece.hh \ getopt.hh \ have.hh \ joint_sort.hh \ + mmap.cc \ mmap.hh \ + multi_intersection.hh \ + murmur_hash.cc \ murmur_hash.hh \ pcqueue.hh \ + pool.cc \ pool.hh \ probing_hash_table.hh \ proxy_iterator.hh \ + read_compressed.cc \ read_compressed.hh \ + scoped.cc \ scoped.hh \ sized_iterator.hh \ sorted_uniform.hh \ + string_piece.cc \ string_piece.hh \ string_piece_hash.hh \ thread_pool.hh \ tokenize_piece.hh \ - usage.hh \ - ersatz_progress.cc \ - bit_packing.cc \ - exception.cc \ - file.cc \ - file_piece.cc \ - mmap.cc \ - murmur_hash.cc \ - pool.cc \ - read_compressed.cc \ - scoped.cc \ - string_piece.cc \ - usage.cc + usage.cc \ + usage.hh AM_CPPFLAGS = -W -Wall -I$(top_srcdir)/klm -I$(top_srcdir)/klm/util/double-conversion -- cgit v1.2.3 From eed7024bfa6d92e669bee0ec5c5e38a7b238a2f6 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Mon, 24 Jun 2013 23:35:18 -0400 Subject: fix deps --- klm/search/Makefile.am | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/klm/search/Makefile.am b/klm/search/Makefile.am index b8c8a050..b7f01c24 100644 --- a/klm/search/Makefile.am +++ b/klm/search/Makefile.am @@ -1,10 +1,22 @@ noinst_LIBRARIES = libksearch.a libksearch_a_SOURCES = \ + applied.hh \ + config.hh \ + context.hh \ + dedupe.hh \ + edge.hh \ edge_generator.cc \ - nbest.cc \ + edge_generator.hh \ + header.hh \ + nbest.cc \ + nbest.hh \ rule.cc \ - vertex.cc + rule.hh \ + types.hh \ + vertex.cc \ + vertex.hh \ + vertex_generator.hh -AM_CPPFLAGS = -W -Wall -Wno-sign-compare $(GTEST_CPPFLAGS) -I.. +AM_CPPFLAGS = -W -Wall -Wno-sign-compare -I.. -- cgit v1.2.3 From cd4dbb5c0b581efb0369330ea330f2b473628a96 Mon Sep 17 00:00:00 2001 From: Paul Baltescu Date: Tue, 25 Jun 2013 15:13:30 +0100 Subject: Reduce memory used by precomputation. --- extractor/fast_intersector.cc | 11 ++- extractor/fast_intersector_test.cc | 8 +- extractor/mocks/mock_precomputation.h | 2 +- extractor/precomputation.cc | 138 +++++++++++++++++--------------- extractor/precomputation.h | 49 +++++++----- extractor/precomputation_test.cc | 143 +++++++++++++++++++--------------- 6 files changed, 199 insertions(+), 152 deletions(-) diff --git a/extractor/fast_intersector.cc b/extractor/fast_intersector.cc index a8591a72..5360c1da 100644 --- a/extractor/fast_intersector.cc +++ b/extractor/fast_intersector.cc @@ -20,10 +20,13 @@ FastIntersector::FastIntersector(shared_ptr suffix_array, vocabulary(vocabulary), max_rule_span(max_rule_span), min_gap_size(min_gap_size) { - Index precomputed_collocations = precomputation->GetCollocations(); - for (pair, vector> entry: precomputed_collocations) { - vector phrase = ConvertPhrase(entry.first); - collocations[phrase] = entry.second; + auto precomputed_collocations = precomputation->GetCollocations(); + for (auto item: precomputed_collocations) { + vector phrase = ConvertPhrase(item.first); + vector location = item.second; + vector& phrase_collocations = collocations[phrase]; + phrase_collocations.insert(phrase_collocations.end(), location.begin(), + location.end()); } } diff --git a/extractor/fast_intersector_test.cc b/extractor/fast_intersector_test.cc index 76c3aaea..2e618b63 100644 --- a/extractor/fast_intersector_test.cc +++ b/extractor/fast_intersector_test.cc @@ -60,14 +60,14 @@ class FastIntersectorTest : public Test { precomputation = make_shared(); EXPECT_CALL(*precomputation, GetCollocations()) - .WillRepeatedly(ReturnRef(collocations)); + .WillRepeatedly(Return(collocations)); phrase_builder = make_shared(vocabulary); intersector = make_shared(suffix_array, precomputation, vocabulary, 15, 1); } - Index collocations; + Collocations collocations; shared_ptr data_array; shared_ptr suffix_array; shared_ptr precomputation; @@ -82,9 +82,9 @@ TEST_F(FastIntersectorTest, TestCachedCollocation) { Phrase phrase = phrase_builder->Build(symbols); PhraseLocation prefix_location(15, 16), suffix_location(16, 17); - collocations[symbols] = expected_location; + collocations.push_back(make_pair(symbols, expected_location)); EXPECT_CALL(*precomputation, GetCollocations()) - .WillRepeatedly(ReturnRef(collocations)); + .WillRepeatedly(Return(collocations)); intersector = make_shared(suffix_array, precomputation, vocabulary, 15, 1); diff --git a/extractor/mocks/mock_precomputation.h b/extractor/mocks/mock_precomputation.h index 8753343e..86f4ce27 100644 --- a/extractor/mocks/mock_precomputation.h +++ b/extractor/mocks/mock_precomputation.h @@ -6,7 +6,7 @@ namespace extractor { class MockPrecomputation : public Precomputation { public: - MOCK_CONST_METHOD0(GetCollocations, const Index&()); + MOCK_CONST_METHOD0(GetCollocations, Collocations()); }; } // namespace extractor diff --git a/extractor/precomputation.cc b/extractor/precomputation.cc index 3b8aed69..37dbf7b7 100644 --- a/extractor/precomputation.cc +++ b/extractor/precomputation.cc @@ -14,63 +14,65 @@ int Precomputation::FIRST_NONTERMINAL = -1; int Precomputation::SECOND_NONTERMINAL = -2; Precomputation::Precomputation( - shared_ptr suffix_array, int num_frequent_patterns, - int num_super_frequent_patterns, int max_rule_span, + shared_ptr suffix_array, int num_frequent_phrases, + int num_super_frequent_phrases, int max_rule_span, int max_rule_symbols, int min_gap_size, int max_frequent_phrase_len, int min_frequency) { vector data = suffix_array->GetData()->GetData(); - vector> frequent_patterns = FindMostFrequentPatterns( - suffix_array, data, num_frequent_patterns, max_frequent_phrase_len, + vector> frequent_phrases = FindMostFrequentPhrases( + suffix_array, data, num_frequent_phrases, max_frequent_phrase_len, min_frequency); // Construct sets containing the frequent and superfrequent contiguous // collocations. - unordered_set, VectorHash> frequent_patterns_set; - unordered_set, VectorHash> super_frequent_patterns_set; - for (size_t i = 0; i < frequent_patterns.size(); ++i) { - frequent_patterns_set.insert(frequent_patterns[i]); - if (i < num_super_frequent_patterns) { - super_frequent_patterns_set.insert(frequent_patterns[i]); + unordered_set, VectorHash> frequent_phrases_set; + unordered_set, VectorHash> super_frequent_phrases_set; + for (size_t i = 0; i < frequent_phrases.size(); ++i) { + frequent_phrases_set.insert(frequent_phrases[i]); + if (i < num_super_frequent_phrases) { + super_frequent_phrases_set.insert(frequent_phrases[i]); } } - vector> matchings; + vector> locations; for (size_t i = 0; i < data.size(); ++i) { - // If the sentence is over, add all the discontiguous frequent patterns to - // the index. + // If the sentence is over, add all the discontiguous frequent phrases to + // the list. if (data[i] == DataArray::END_OF_LINE) { - AddCollocations(matchings, data, max_rule_span, min_gap_size, + AddCollocations(locations, data, max_rule_span, min_gap_size, max_rule_symbols); - matchings.clear(); + locations.clear(); continue; } - vector pattern; - // Find all the contiguous frequent patterns starting at position i. + vector phrase; + // Find all the contiguous frequent phrases starting at position i. for (int j = 1; j <= max_frequent_phrase_len && i + j <= data.size(); ++j) { - pattern.push_back(data[i + j - 1]); - if (frequent_patterns_set.count(pattern)) { - int is_super_frequent = super_frequent_patterns_set.count(pattern); - matchings.push_back(make_tuple(i, j, is_super_frequent)); + phrase.push_back(data[i + j - 1]); + if (frequent_phrases_set.count(phrase)) { + int is_super_frequent = super_frequent_phrases_set.count(phrase); + locations.push_back(make_tuple(i, j, is_super_frequent)); } else { - // If the current pattern is not frequent, any longer pattern having the - // current pattern as prefix will not be frequent. + // If the current phrase is not frequent, any longer phrase having the + // current phrase as prefix will not be frequent. break; } } } + + collocations.shrink_to_fit(); } Precomputation::Precomputation() {} Precomputation::~Precomputation() {} -vector> Precomputation::FindMostFrequentPatterns( +vector> Precomputation::FindMostFrequentPhrases( shared_ptr suffix_array, const vector& data, - int num_frequent_patterns, int max_frequent_phrase_len, int min_frequency) { + int num_frequent_phrases, int max_frequent_phrase_len, int min_frequency) { vector lcp = suffix_array->BuildLCPArray(); vector run_start(max_frequent_phrase_len); - // Find all the patterns occurring at least min_frequency times. + // Find all the phrases occurring at least min_frequency times. priority_queue>> heap; for (size_t i = 1; i < lcp.size(); ++i) { for (int len = lcp[i]; len < max_frequent_phrase_len; ++len) { @@ -83,34 +85,34 @@ vector> Precomputation::FindMostFrequentPatterns( } } - // Extract the most frequent patterns. - vector> frequent_patterns; - while (frequent_patterns.size() < num_frequent_patterns && !heap.empty()) { + // Extract the most frequent phrases. + vector> frequent_phrases; + while (frequent_phrases.size() < num_frequent_phrases && !heap.empty()) { int start = heap.top().second.first; int len = heap.top().second.second; heap.pop(); - vector pattern(data.begin() + start, data.begin() + start + len); - if (find(pattern.begin(), pattern.end(), DataArray::END_OF_LINE) == - pattern.end()) { - frequent_patterns.push_back(pattern); + vector phrase(data.begin() + start, data.begin() + start + len); + if (find(phrase.begin(), phrase.end(), DataArray::END_OF_LINE) == + phrase.end()) { + frequent_phrases.push_back(phrase); } } - return frequent_patterns; + return frequent_phrases; } void Precomputation::AddCollocations( - const vector>& matchings, const vector& data, + const vector>& locations, const vector& data, int max_rule_span, int min_gap_size, int max_rule_symbols) { - // Select the leftmost subpattern. - for (size_t i = 0; i < matchings.size(); ++i) { + // Select the leftmost subphrase. + for (size_t i = 0; i < locations.size(); ++i) { int start1, size1, is_super1; - tie(start1, size1, is_super1) = matchings[i]; + tie(start1, size1, is_super1) = locations[i]; - // Select the second (middle) subpattern - for (size_t j = i + 1; j < matchings.size(); ++j) { + // Select the second (middle) subphrase + for (size_t j = i + 1; j < locations.size(); ++j) { int start2, size2, is_super2; - tie(start2, size2, is_super2) = matchings[j]; + tie(start2, size2, is_super2) = locations[j]; if (start2 - start1 >= max_rule_span) { break; } @@ -118,20 +120,21 @@ void Precomputation::AddCollocations( if (start2 - start1 - size1 >= min_gap_size && start2 + size2 - start1 <= max_rule_span && size1 + size2 + 1 <= max_rule_symbols) { - vector pattern(data.begin() + start1, + vector collocation(data.begin() + start1, data.begin() + start1 + size1); - pattern.push_back(Precomputation::FIRST_NONTERMINAL); - pattern.insert(pattern.end(), data.begin() + start2, + collocation.push_back(Precomputation::FIRST_NONTERMINAL); + collocation.insert(collocation.end(), data.begin() + start2, data.begin() + start2 + size2); - AddStartPositions(collocations[pattern], start1, start2); + + AddCollocation(collocation, GetLocation(start1, start2)); // Try extending the binary collocation to a ternary collocation. if (is_super2) { - pattern.push_back(Precomputation::SECOND_NONTERMINAL); - // Select the rightmost subpattern. - for (size_t k = j + 1; k < matchings.size(); ++k) { + collocation.push_back(Precomputation::SECOND_NONTERMINAL); + // Select the rightmost subphrase. + for (size_t k = j + 1; k < locations.size(); ++k) { int start3, size3, is_super3; - tie(start3, size3, is_super3) = matchings[k]; + tie(start3, size3, is_super3) = locations[k]; if (start3 - start1 >= max_rule_span) { break; } @@ -140,10 +143,12 @@ void Precomputation::AddCollocations( && start3 + size3 - start1 <= max_rule_span && size1 + size2 + size3 + 2 <= max_rule_symbols && (is_super1 || is_super3)) { - pattern.insert(pattern.end(), data.begin() + start3, + collocation.insert(collocation.end(), data.begin() + start3, data.begin() + start3 + size3); - AddStartPositions(collocations[pattern], start1, start2, start3); - pattern.erase(pattern.end() - size3); + + AddCollocation(collocation, GetLocation(start1, start2, start3)); + + collocation.erase(collocation.end() - size3); } } } @@ -152,20 +157,29 @@ void Precomputation::AddCollocations( } } -void Precomputation::AddStartPositions( - vector& positions, int pos1, int pos2) { - positions.push_back(pos1); - positions.push_back(pos2); +vector Precomputation::GetLocation(int pos1, int pos2) { + vector location; + location.push_back(pos1); + location.push_back(pos2); + return location; +} + +vector Precomputation::GetLocation(int pos1, int pos2, int pos3) { + vector location; + location.push_back(pos1); + location.push_back(pos2); + location.push_back(pos3); + return location; } -void Precomputation::AddStartPositions( - vector& positions, int pos1, int pos2, int pos3) { - positions.push_back(pos1); - positions.push_back(pos2); - positions.push_back(pos3); +void Precomputation::AddCollocation(vector collocation, + vector location) { + collocation.shrink_to_fit(); + location.shrink_to_fit(); + collocations.push_back(make_pair(collocation, location)); } -const Index& Precomputation::GetCollocations() const { +Collocations Precomputation::GetCollocations() const { return collocations; } diff --git a/extractor/precomputation.h b/extractor/precomputation.h index 9f0c9424..0a06349b 100644 --- a/extractor/precomputation.h +++ b/extractor/precomputation.h @@ -19,16 +19,18 @@ using namespace std; namespace extractor { typedef boost::hash> VectorHash; -typedef unordered_map, vector, VectorHash> Index; +typedef vector, vector>> Collocations; class SuffixArray; /** - * Data structure wrapping an index with all the occurrences of the most - * frequent discontiguous collocations in the source data. + * Data structure containing all the data needed for constructing an index with + * all the occurrences of the most frequent discontiguous collocations in the + * source data. * - * Let a, b, c be contiguous collocations. The index will contain an entry for - * every collocation of the form: + * Let a, b, c be contiguous phrases. The data structure will contain the + * locations in the source data where every collocation of the following forms + * occurs: * - aXb, where a and b are frequent * - aXbXc, where a and b are super-frequent and c is frequent or * b and c are super-frequent and a is frequent. @@ -37,8 +39,8 @@ class Precomputation { public: // Constructs the index using the suffix array. Precomputation( - shared_ptr suffix_array, int num_frequent_patterns, - int num_super_frequent_patterns, int max_rule_span, + shared_ptr suffix_array, int num_frequent_phrases, + int num_super_frequent_phrases, int max_rule_span, int max_rule_symbols, int min_gap_size, int max_frequent_phrase_len, int min_frequency); @@ -47,8 +49,9 @@ class Precomputation { virtual ~Precomputation(); - // Returns a reference to the index. - virtual const Index& GetCollocations() const; + // Returns the list of the locations of the most frequent collocations in the + // source data. + virtual Collocations GetCollocations() const; bool operator==(const Precomputation& other) const; @@ -57,23 +60,29 @@ class Precomputation { private: // Finds the most frequent contiguous collocations. - vector> FindMostFrequentPatterns( + vector> FindMostFrequentPhrases( shared_ptr suffix_array, const vector& data, - int num_frequent_patterns, int max_frequent_phrase_len, + int num_frequent_phrases, int max_frequent_phrase_len, int min_frequency); // Given the locations of the frequent contiguous collocations in a sentence, // it adds new entries to the index for each discontiguous collocation // matching the criteria specified in the class description. - void AddCollocations( - const vector>& matchings, const vector& data, - int max_rule_span, int min_gap_size, int max_rule_symbols); + void AddCollocations(const vector>& locations, + const vector& data, int max_rule_span, + int min_gap_size, int max_rule_symbols); - // Adds an occurrence of a binary collocation. - void AddStartPositions(vector& positions, int pos1, int pos2); + // Creates a vector representation for the location of a binary collocation + // containing the starting points of each subpattern. + vector GetLocation(int pos1, int pos2); - // Adds an occurrence of a ternary collocation. - void AddStartPositions(vector& positions, int pos1, int pos2, int pos3); + // Creates a vector representation for the location of a ternary collocation + // containing the starting points of each subpattern. + vector GetLocation(int pos1, int pos2, int pos3); + + // Appends a collocation to the list of collocations after shrinking the + // vectors to avoid unnecessary memory usage. + void AddCollocation(vector collocation, vector location); friend class boost::serialization::access; @@ -91,13 +100,13 @@ class Precomputation { for (size_t i = 0; i < num_entries; ++i) { pair, vector> entry; ar >> entry; - collocations.insert(entry); + collocations.push_back(entry); } } BOOST_SERIALIZATION_SPLIT_MEMBER(); - Index collocations; + Collocations collocations; }; } // namespace extractor diff --git a/extractor/precomputation_test.cc b/extractor/precomputation_test.cc index e81ece5d..c6e457fd 100644 --- a/extractor/precomputation_test.cc +++ b/extractor/precomputation_test.cc @@ -38,6 +38,23 @@ class PrecomputationTest : public Test { precomputation = Precomputation(suffix_array, 3, 3, 10, 5, 1, 4, 2); } + void CheckCollocation(const Collocations& collocations, + const vector& collocation, + const vector>& locations) { + for (auto location: locations) { + auto item = make_pair(collocation, location); + EXPECT_FALSE(find(collocations.begin(), collocations.end(), item) == + collocations.end()); + } + } + + void CheckIllegalCollocation(const Collocations& collocations, + const vector& collocation) { + for (auto item: collocations) { + EXPECT_FALSE(collocation == item.first); + } + } + vector data; shared_ptr data_array; shared_ptr suffix_array; @@ -45,67 +62,71 @@ class PrecomputationTest : public Test { }; TEST_F(PrecomputationTest, TestCollocations) { - Index collocations = precomputation.GetCollocations(); - - vector key = {2, 3, -1, 2}; - vector expected_value = {1, 5, 1, 8, 5, 8, 5, 11, 8, 11}; - EXPECT_EQ(expected_value, collocations[key]); - key = {2, 3, -1, 2, 3}; - expected_value = {1, 5, 1, 8, 5, 8}; - EXPECT_EQ(expected_value, collocations[key]); - key = {2, 3, -1, 3}; - expected_value = {1, 6, 1, 9, 5, 9}; - EXPECT_EQ(expected_value, collocations[key]); - key = {3, -1, 2}; - expected_value = {2, 5, 2, 8, 2, 11, 6, 8, 6, 11, 9, 11}; - EXPECT_EQ(expected_value, collocations[key]); - key = {3, -1, 3}; - expected_value = {2, 6, 2, 9, 6, 9}; - EXPECT_EQ(expected_value, collocations[key]); - key = {3, -1, 2, 3}; - expected_value = {2, 5, 2, 8, 6, 8}; - EXPECT_EQ(expected_value, collocations[key]); - key = {2, -1, 2}; - expected_value = {1, 5, 1, 8, 5, 8, 5, 11, 8, 11}; - EXPECT_EQ(expected_value, collocations[key]); - key = {2, -1, 2, 3}; - expected_value = {1, 5, 1, 8, 5, 8}; - EXPECT_EQ(expected_value, collocations[key]); - key = {2, -1, 3}; - expected_value = {1, 6, 1, 9, 5, 9}; - EXPECT_EQ(expected_value, collocations[key]); - - key = {2, -1, 2, -2, 2}; - expected_value = {1, 5, 8, 5, 8, 11}; - EXPECT_EQ(expected_value, collocations[key]); - key = {2, -1, 2, -2, 3}; - expected_value = {1, 5, 9}; - EXPECT_EQ(expected_value, collocations[key]); - key = {2, -1, 3, -2, 2}; - expected_value = {1, 6, 8, 5, 9, 11}; - EXPECT_EQ(expected_value, collocations[key]); - key = {2, -1, 3, -2, 3}; - expected_value = {1, 6, 9}; - EXPECT_EQ(expected_value, collocations[key]); - key = {3, -1, 2, -2, 2}; - expected_value = {2, 5, 8, 2, 5, 11, 2, 8, 11, 6, 8, 11}; - EXPECT_EQ(expected_value, collocations[key]); - key = {3, -1, 2, -2, 3}; - expected_value = {2, 5, 9}; - EXPECT_EQ(expected_value, collocations[key]); - key = {3, -1, 3, -2, 2}; - expected_value = {2, 6, 8, 2, 6, 11, 2, 9, 11, 6, 9, 11}; - EXPECT_EQ(expected_value, collocations[key]); - key = {3, -1, 3, -2, 3}; - expected_value = {2, 6, 9}; - EXPECT_EQ(expected_value, collocations[key]); - - // Exceeds max_rule_symbols. - key = {2, -1, 2, -2, 2, 3}; - EXPECT_EQ(0, collocations.count(key)); - // Contains non frequent pattern. - key = {2, -1, 5}; - EXPECT_EQ(0, collocations.count(key)); + Collocations collocations = precomputation.GetCollocations(); + + EXPECT_EQ(50, collocations.size()); + + vector collocation = {2, 3, -1, 2}; + vector> locations = {{1, 5}, {1, 8}, {5, 8}, {5, 11}, {8, 11}}; + CheckCollocation(collocations, collocation, locations); + + collocation = {2, 3, -1, 2, 3}; + locations = {{1, 5}, {1, 8}, {5, 8}}; + CheckCollocation(collocations, collocation, locations); + + collocation = {2, 3, -1, 3}; + locations = {{1, 6}, {1, 9}, {5, 9}}; + CheckCollocation(collocations, collocation, locations); + collocation = {3, -1, 2}; + locations = {{2, 5}, {2, 8}, {2, 11}, {6, 8}, {6, 11}, {9, 11}}; + CheckCollocation(collocations, collocation, locations); + collocation = {3, -1, 3}; + locations = {{2, 6}, {2, 9}, {6, 9}}; + CheckCollocation(collocations, collocation, locations); + collocation = {3, -1, 2, 3}; + locations = {{2, 5}, {2, 8}, {6, 8}}; + CheckCollocation(collocations, collocation, locations); + collocation = {2, -1, 2}; + locations = {{1, 5}, {1, 8}, {5, 8}, {5, 11}, {8, 11}}; + CheckCollocation(collocations, collocation, locations); + collocation = {2, -1, 2, 3}; + locations = {{1, 5}, {1, 8}, {5, 8}}; + CheckCollocation(collocations, collocation, locations); + collocation = {2, -1, 3}; + locations = {{1, 6}, {1, 9}, {5, 9}}; + CheckCollocation(collocations, collocation, locations); + + collocation = {2, -1, 2, -2, 2}; + locations = {{1, 5, 8}, {5, 8, 11}}; + CheckCollocation(collocations, collocation, locations); + collocation = {2, -1, 2, -2, 3}; + locations = {{1, 5, 9}}; + CheckCollocation(collocations, collocation, locations); + collocation = {2, -1, 3, -2, 2}; + locations = {{1, 6, 8}, {5, 9, 11}}; + CheckCollocation(collocations, collocation, locations); + collocation = {2, -1, 3, -2, 3}; + locations = {{1, 6, 9}}; + CheckCollocation(collocations, collocation, locations); + collocation = {3, -1, 2, -2, 2}; + locations = {{2, 5, 8}, {2, 5, 11}, {2, 8, 11}, {6, 8, 11}}; + CheckCollocation(collocations, collocation, locations); + collocation = {3, -1, 2, -2, 3}; + locations = {{2, 5, 9}}; + CheckCollocation(collocations, collocation, locations); + collocation = {3, -1, 3, -2, 2}; + locations = {{2, 6, 8}, {2, 6, 11}, {2, 9, 11}, {6, 9, 11}}; + CheckCollocation(collocations, collocation, locations); + collocation = {3, -1, 3, -2, 3}; + locations = {{2, 6, 9}}; + CheckCollocation(collocations, collocation, locations); + + // Collocation exceeds max_rule_symbols. + collocation = {2, -1, 2, -2, 2, 3}; + CheckIllegalCollocation(collocations, collocation); + // Collocation contains non frequent pattern. + collocation = {2, -1, 5}; + CheckIllegalCollocation(collocations, collocation); } TEST_F(PrecomputationTest, TestSerialization) { -- cgit v1.2.3 From 36bc55a33b5f075f6479c64f94d60ecbbd1f3270 Mon Sep 17 00:00:00 2001 From: Paul Baltescu Date: Tue, 25 Jun 2013 17:28:54 +0100 Subject: Undo last commit. --- extractor/fast_intersector.cc | 11 +-- extractor/fast_intersector_test.cc | 8 +- extractor/mocks/mock_precomputation.h | 2 +- extractor/precomputation.cc | 138 +++++++++++++++----------------- extractor/precomputation.h | 49 +++++------- extractor/precomputation_test.cc | 143 +++++++++++++++------------------- 6 files changed, 152 insertions(+), 199 deletions(-) diff --git a/extractor/fast_intersector.cc b/extractor/fast_intersector.cc index 5360c1da..a8591a72 100644 --- a/extractor/fast_intersector.cc +++ b/extractor/fast_intersector.cc @@ -20,13 +20,10 @@ FastIntersector::FastIntersector(shared_ptr suffix_array, vocabulary(vocabulary), max_rule_span(max_rule_span), min_gap_size(min_gap_size) { - auto precomputed_collocations = precomputation->GetCollocations(); - for (auto item: precomputed_collocations) { - vector phrase = ConvertPhrase(item.first); - vector location = item.second; - vector& phrase_collocations = collocations[phrase]; - phrase_collocations.insert(phrase_collocations.end(), location.begin(), - location.end()); + Index precomputed_collocations = precomputation->GetCollocations(); + for (pair, vector> entry: precomputed_collocations) { + vector phrase = ConvertPhrase(entry.first); + collocations[phrase] = entry.second; } } diff --git a/extractor/fast_intersector_test.cc b/extractor/fast_intersector_test.cc index 2e618b63..76c3aaea 100644 --- a/extractor/fast_intersector_test.cc +++ b/extractor/fast_intersector_test.cc @@ -60,14 +60,14 @@ class FastIntersectorTest : public Test { precomputation = make_shared(); EXPECT_CALL(*precomputation, GetCollocations()) - .WillRepeatedly(Return(collocations)); + .WillRepeatedly(ReturnRef(collocations)); phrase_builder = make_shared(vocabulary); intersector = make_shared(suffix_array, precomputation, vocabulary, 15, 1); } - Collocations collocations; + Index collocations; shared_ptr data_array; shared_ptr suffix_array; shared_ptr precomputation; @@ -82,9 +82,9 @@ TEST_F(FastIntersectorTest, TestCachedCollocation) { Phrase phrase = phrase_builder->Build(symbols); PhraseLocation prefix_location(15, 16), suffix_location(16, 17); - collocations.push_back(make_pair(symbols, expected_location)); + collocations[symbols] = expected_location; EXPECT_CALL(*precomputation, GetCollocations()) - .WillRepeatedly(Return(collocations)); + .WillRepeatedly(ReturnRef(collocations)); intersector = make_shared(suffix_array, precomputation, vocabulary, 15, 1); diff --git a/extractor/mocks/mock_precomputation.h b/extractor/mocks/mock_precomputation.h index 86f4ce27..8753343e 100644 --- a/extractor/mocks/mock_precomputation.h +++ b/extractor/mocks/mock_precomputation.h @@ -6,7 +6,7 @@ namespace extractor { class MockPrecomputation : public Precomputation { public: - MOCK_CONST_METHOD0(GetCollocations, Collocations()); + MOCK_CONST_METHOD0(GetCollocations, const Index&()); }; } // namespace extractor diff --git a/extractor/precomputation.cc b/extractor/precomputation.cc index 37dbf7b7..3b8aed69 100644 --- a/extractor/precomputation.cc +++ b/extractor/precomputation.cc @@ -14,65 +14,63 @@ int Precomputation::FIRST_NONTERMINAL = -1; int Precomputation::SECOND_NONTERMINAL = -2; Precomputation::Precomputation( - shared_ptr suffix_array, int num_frequent_phrases, - int num_super_frequent_phrases, int max_rule_span, + shared_ptr suffix_array, int num_frequent_patterns, + int num_super_frequent_patterns, int max_rule_span, int max_rule_symbols, int min_gap_size, int max_frequent_phrase_len, int min_frequency) { vector data = suffix_array->GetData()->GetData(); - vector> frequent_phrases = FindMostFrequentPhrases( - suffix_array, data, num_frequent_phrases, max_frequent_phrase_len, + vector> frequent_patterns = FindMostFrequentPatterns( + suffix_array, data, num_frequent_patterns, max_frequent_phrase_len, min_frequency); // Construct sets containing the frequent and superfrequent contiguous // collocations. - unordered_set, VectorHash> frequent_phrases_set; - unordered_set, VectorHash> super_frequent_phrases_set; - for (size_t i = 0; i < frequent_phrases.size(); ++i) { - frequent_phrases_set.insert(frequent_phrases[i]); - if (i < num_super_frequent_phrases) { - super_frequent_phrases_set.insert(frequent_phrases[i]); + unordered_set, VectorHash> frequent_patterns_set; + unordered_set, VectorHash> super_frequent_patterns_set; + for (size_t i = 0; i < frequent_patterns.size(); ++i) { + frequent_patterns_set.insert(frequent_patterns[i]); + if (i < num_super_frequent_patterns) { + super_frequent_patterns_set.insert(frequent_patterns[i]); } } - vector> locations; + vector> matchings; for (size_t i = 0; i < data.size(); ++i) { - // If the sentence is over, add all the discontiguous frequent phrases to - // the list. + // If the sentence is over, add all the discontiguous frequent patterns to + // the index. if (data[i] == DataArray::END_OF_LINE) { - AddCollocations(locations, data, max_rule_span, min_gap_size, + AddCollocations(matchings, data, max_rule_span, min_gap_size, max_rule_symbols); - locations.clear(); + matchings.clear(); continue; } - vector phrase; - // Find all the contiguous frequent phrases starting at position i. + vector pattern; + // Find all the contiguous frequent patterns starting at position i. for (int j = 1; j <= max_frequent_phrase_len && i + j <= data.size(); ++j) { - phrase.push_back(data[i + j - 1]); - if (frequent_phrases_set.count(phrase)) { - int is_super_frequent = super_frequent_phrases_set.count(phrase); - locations.push_back(make_tuple(i, j, is_super_frequent)); + pattern.push_back(data[i + j - 1]); + if (frequent_patterns_set.count(pattern)) { + int is_super_frequent = super_frequent_patterns_set.count(pattern); + matchings.push_back(make_tuple(i, j, is_super_frequent)); } else { - // If the current phrase is not frequent, any longer phrase having the - // current phrase as prefix will not be frequent. + // If the current pattern is not frequent, any longer pattern having the + // current pattern as prefix will not be frequent. break; } } } - - collocations.shrink_to_fit(); } Precomputation::Precomputation() {} Precomputation::~Precomputation() {} -vector> Precomputation::FindMostFrequentPhrases( +vector> Precomputation::FindMostFrequentPatterns( shared_ptr suffix_array, const vector& data, - int num_frequent_phrases, int max_frequent_phrase_len, int min_frequency) { + int num_frequent_patterns, int max_frequent_phrase_len, int min_frequency) { vector lcp = suffix_array->BuildLCPArray(); vector run_start(max_frequent_phrase_len); - // Find all the phrases occurring at least min_frequency times. + // Find all the patterns occurring at least min_frequency times. priority_queue>> heap; for (size_t i = 1; i < lcp.size(); ++i) { for (int len = lcp[i]; len < max_frequent_phrase_len; ++len) { @@ -85,34 +83,34 @@ vector> Precomputation::FindMostFrequentPhrases( } } - // Extract the most frequent phrases. - vector> frequent_phrases; - while (frequent_phrases.size() < num_frequent_phrases && !heap.empty()) { + // Extract the most frequent patterns. + vector> frequent_patterns; + while (frequent_patterns.size() < num_frequent_patterns && !heap.empty()) { int start = heap.top().second.first; int len = heap.top().second.second; heap.pop(); - vector phrase(data.begin() + start, data.begin() + start + len); - if (find(phrase.begin(), phrase.end(), DataArray::END_OF_LINE) == - phrase.end()) { - frequent_phrases.push_back(phrase); + vector pattern(data.begin() + start, data.begin() + start + len); + if (find(pattern.begin(), pattern.end(), DataArray::END_OF_LINE) == + pattern.end()) { + frequent_patterns.push_back(pattern); } } - return frequent_phrases; + return frequent_patterns; } void Precomputation::AddCollocations( - const vector>& locations, const vector& data, + const vector>& matchings, const vector& data, int max_rule_span, int min_gap_size, int max_rule_symbols) { - // Select the leftmost subphrase. - for (size_t i = 0; i < locations.size(); ++i) { + // Select the leftmost subpattern. + for (size_t i = 0; i < matchings.size(); ++i) { int start1, size1, is_super1; - tie(start1, size1, is_super1) = locations[i]; + tie(start1, size1, is_super1) = matchings[i]; - // Select the second (middle) subphrase - for (size_t j = i + 1; j < locations.size(); ++j) { + // Select the second (middle) subpattern + for (size_t j = i + 1; j < matchings.size(); ++j) { int start2, size2, is_super2; - tie(start2, size2, is_super2) = locations[j]; + tie(start2, size2, is_super2) = matchings[j]; if (start2 - start1 >= max_rule_span) { break; } @@ -120,21 +118,20 @@ void Precomputation::AddCollocations( if (start2 - start1 - size1 >= min_gap_size && start2 + size2 - start1 <= max_rule_span && size1 + size2 + 1 <= max_rule_symbols) { - vector collocation(data.begin() + start1, + vector pattern(data.begin() + start1, data.begin() + start1 + size1); - collocation.push_back(Precomputation::FIRST_NONTERMINAL); - collocation.insert(collocation.end(), data.begin() + start2, + pattern.push_back(Precomputation::FIRST_NONTERMINAL); + pattern.insert(pattern.end(), data.begin() + start2, data.begin() + start2 + size2); - - AddCollocation(collocation, GetLocation(start1, start2)); + AddStartPositions(collocations[pattern], start1, start2); // Try extending the binary collocation to a ternary collocation. if (is_super2) { - collocation.push_back(Precomputation::SECOND_NONTERMINAL); - // Select the rightmost subphrase. - for (size_t k = j + 1; k < locations.size(); ++k) { + pattern.push_back(Precomputation::SECOND_NONTERMINAL); + // Select the rightmost subpattern. + for (size_t k = j + 1; k < matchings.size(); ++k) { int start3, size3, is_super3; - tie(start3, size3, is_super3) = locations[k]; + tie(start3, size3, is_super3) = matchings[k]; if (start3 - start1 >= max_rule_span) { break; } @@ -143,12 +140,10 @@ void Precomputation::AddCollocations( && start3 + size3 - start1 <= max_rule_span && size1 + size2 + size3 + 2 <= max_rule_symbols && (is_super1 || is_super3)) { - collocation.insert(collocation.end(), data.begin() + start3, + pattern.insert(pattern.end(), data.begin() + start3, data.begin() + start3 + size3); - - AddCollocation(collocation, GetLocation(start1, start2, start3)); - - collocation.erase(collocation.end() - size3); + AddStartPositions(collocations[pattern], start1, start2, start3); + pattern.erase(pattern.end() - size3); } } } @@ -157,29 +152,20 @@ void Precomputation::AddCollocations( } } -vector Precomputation::GetLocation(int pos1, int pos2) { - vector location; - location.push_back(pos1); - location.push_back(pos2); - return location; -} - -vector Precomputation::GetLocation(int pos1, int pos2, int pos3) { - vector location; - location.push_back(pos1); - location.push_back(pos2); - location.push_back(pos3); - return location; +void Precomputation::AddStartPositions( + vector& positions, int pos1, int pos2) { + positions.push_back(pos1); + positions.push_back(pos2); } -void Precomputation::AddCollocation(vector collocation, - vector location) { - collocation.shrink_to_fit(); - location.shrink_to_fit(); - collocations.push_back(make_pair(collocation, location)); +void Precomputation::AddStartPositions( + vector& positions, int pos1, int pos2, int pos3) { + positions.push_back(pos1); + positions.push_back(pos2); + positions.push_back(pos3); } -Collocations Precomputation::GetCollocations() const { +const Index& Precomputation::GetCollocations() const { return collocations; } diff --git a/extractor/precomputation.h b/extractor/precomputation.h index 0a06349b..9f0c9424 100644 --- a/extractor/precomputation.h +++ b/extractor/precomputation.h @@ -19,18 +19,16 @@ using namespace std; namespace extractor { typedef boost::hash> VectorHash; -typedef vector, vector>> Collocations; +typedef unordered_map, vector, VectorHash> Index; class SuffixArray; /** - * Data structure containing all the data needed for constructing an index with - * all the occurrences of the most frequent discontiguous collocations in the - * source data. + * Data structure wrapping an index with all the occurrences of the most + * frequent discontiguous collocations in the source data. * - * Let a, b, c be contiguous phrases. The data structure will contain the - * locations in the source data where every collocation of the following forms - * occurs: + * Let a, b, c be contiguous collocations. The index will contain an entry for + * every collocation of the form: * - aXb, where a and b are frequent * - aXbXc, where a and b are super-frequent and c is frequent or * b and c are super-frequent and a is frequent. @@ -39,8 +37,8 @@ class Precomputation { public: // Constructs the index using the suffix array. Precomputation( - shared_ptr suffix_array, int num_frequent_phrases, - int num_super_frequent_phrases, int max_rule_span, + shared_ptr suffix_array, int num_frequent_patterns, + int num_super_frequent_patterns, int max_rule_span, int max_rule_symbols, int min_gap_size, int max_frequent_phrase_len, int min_frequency); @@ -49,9 +47,8 @@ class Precomputation { virtual ~Precomputation(); - // Returns the list of the locations of the most frequent collocations in the - // source data. - virtual Collocations GetCollocations() const; + // Returns a reference to the index. + virtual const Index& GetCollocations() const; bool operator==(const Precomputation& other) const; @@ -60,29 +57,23 @@ class Precomputation { private: // Finds the most frequent contiguous collocations. - vector> FindMostFrequentPhrases( + vector> FindMostFrequentPatterns( shared_ptr suffix_array, const vector& data, - int num_frequent_phrases, int max_frequent_phrase_len, + int num_frequent_patterns, int max_frequent_phrase_len, int min_frequency); // Given the locations of the frequent contiguous collocations in a sentence, // it adds new entries to the index for each discontiguous collocation // matching the criteria specified in the class description. - void AddCollocations(const vector>& locations, - const vector& data, int max_rule_span, - int min_gap_size, int max_rule_symbols); + void AddCollocations( + const vector>& matchings, const vector& data, + int max_rule_span, int min_gap_size, int max_rule_symbols); - // Creates a vector representation for the location of a binary collocation - // containing the starting points of each subpattern. - vector GetLocation(int pos1, int pos2); + // Adds an occurrence of a binary collocation. + void AddStartPositions(vector& positions, int pos1, int pos2); - // Creates a vector representation for the location of a ternary collocation - // containing the starting points of each subpattern. - vector GetLocation(int pos1, int pos2, int pos3); - - // Appends a collocation to the list of collocations after shrinking the - // vectors to avoid unnecessary memory usage. - void AddCollocation(vector collocation, vector location); + // Adds an occurrence of a ternary collocation. + void AddStartPositions(vector& positions, int pos1, int pos2, int pos3); friend class boost::serialization::access; @@ -100,13 +91,13 @@ class Precomputation { for (size_t i = 0; i < num_entries; ++i) { pair, vector> entry; ar >> entry; - collocations.push_back(entry); + collocations.insert(entry); } } BOOST_SERIALIZATION_SPLIT_MEMBER(); - Collocations collocations; + Index collocations; }; } // namespace extractor diff --git a/extractor/precomputation_test.cc b/extractor/precomputation_test.cc index c6e457fd..e81ece5d 100644 --- a/extractor/precomputation_test.cc +++ b/extractor/precomputation_test.cc @@ -38,23 +38,6 @@ class PrecomputationTest : public Test { precomputation = Precomputation(suffix_array, 3, 3, 10, 5, 1, 4, 2); } - void CheckCollocation(const Collocations& collocations, - const vector& collocation, - const vector>& locations) { - for (auto location: locations) { - auto item = make_pair(collocation, location); - EXPECT_FALSE(find(collocations.begin(), collocations.end(), item) == - collocations.end()); - } - } - - void CheckIllegalCollocation(const Collocations& collocations, - const vector& collocation) { - for (auto item: collocations) { - EXPECT_FALSE(collocation == item.first); - } - } - vector data; shared_ptr data_array; shared_ptr suffix_array; @@ -62,71 +45,67 @@ class PrecomputationTest : public Test { }; TEST_F(PrecomputationTest, TestCollocations) { - Collocations collocations = precomputation.GetCollocations(); - - EXPECT_EQ(50, collocations.size()); - - vector collocation = {2, 3, -1, 2}; - vector> locations = {{1, 5}, {1, 8}, {5, 8}, {5, 11}, {8, 11}}; - CheckCollocation(collocations, collocation, locations); - - collocation = {2, 3, -1, 2, 3}; - locations = {{1, 5}, {1, 8}, {5, 8}}; - CheckCollocation(collocations, collocation, locations); - - collocation = {2, 3, -1, 3}; - locations = {{1, 6}, {1, 9}, {5, 9}}; - CheckCollocation(collocations, collocation, locations); - collocation = {3, -1, 2}; - locations = {{2, 5}, {2, 8}, {2, 11}, {6, 8}, {6, 11}, {9, 11}}; - CheckCollocation(collocations, collocation, locations); - collocation = {3, -1, 3}; - locations = {{2, 6}, {2, 9}, {6, 9}}; - CheckCollocation(collocations, collocation, locations); - collocation = {3, -1, 2, 3}; - locations = {{2, 5}, {2, 8}, {6, 8}}; - CheckCollocation(collocations, collocation, locations); - collocation = {2, -1, 2}; - locations = {{1, 5}, {1, 8}, {5, 8}, {5, 11}, {8, 11}}; - CheckCollocation(collocations, collocation, locations); - collocation = {2, -1, 2, 3}; - locations = {{1, 5}, {1, 8}, {5, 8}}; - CheckCollocation(collocations, collocation, locations); - collocation = {2, -1, 3}; - locations = {{1, 6}, {1, 9}, {5, 9}}; - CheckCollocation(collocations, collocation, locations); - - collocation = {2, -1, 2, -2, 2}; - locations = {{1, 5, 8}, {5, 8, 11}}; - CheckCollocation(collocations, collocation, locations); - collocation = {2, -1, 2, -2, 3}; - locations = {{1, 5, 9}}; - CheckCollocation(collocations, collocation, locations); - collocation = {2, -1, 3, -2, 2}; - locations = {{1, 6, 8}, {5, 9, 11}}; - CheckCollocation(collocations, collocation, locations); - collocation = {2, -1, 3, -2, 3}; - locations = {{1, 6, 9}}; - CheckCollocation(collocations, collocation, locations); - collocation = {3, -1, 2, -2, 2}; - locations = {{2, 5, 8}, {2, 5, 11}, {2, 8, 11}, {6, 8, 11}}; - CheckCollocation(collocations, collocation, locations); - collocation = {3, -1, 2, -2, 3}; - locations = {{2, 5, 9}}; - CheckCollocation(collocations, collocation, locations); - collocation = {3, -1, 3, -2, 2}; - locations = {{2, 6, 8}, {2, 6, 11}, {2, 9, 11}, {6, 9, 11}}; - CheckCollocation(collocations, collocation, locations); - collocation = {3, -1, 3, -2, 3}; - locations = {{2, 6, 9}}; - CheckCollocation(collocations, collocation, locations); - - // Collocation exceeds max_rule_symbols. - collocation = {2, -1, 2, -2, 2, 3}; - CheckIllegalCollocation(collocations, collocation); - // Collocation contains non frequent pattern. - collocation = {2, -1, 5}; - CheckIllegalCollocation(collocations, collocation); + Index collocations = precomputation.GetCollocations(); + + vector key = {2, 3, -1, 2}; + vector expected_value = {1, 5, 1, 8, 5, 8, 5, 11, 8, 11}; + EXPECT_EQ(expected_value, collocations[key]); + key = {2, 3, -1, 2, 3}; + expected_value = {1, 5, 1, 8, 5, 8}; + EXPECT_EQ(expected_value, collocations[key]); + key = {2, 3, -1, 3}; + expected_value = {1, 6, 1, 9, 5, 9}; + EXPECT_EQ(expected_value, collocations[key]); + key = {3, -1, 2}; + expected_value = {2, 5, 2, 8, 2, 11, 6, 8, 6, 11, 9, 11}; + EXPECT_EQ(expected_value, collocations[key]); + key = {3, -1, 3}; + expected_value = {2, 6, 2, 9, 6, 9}; + EXPECT_EQ(expected_value, collocations[key]); + key = {3, -1, 2, 3}; + expected_value = {2, 5, 2, 8, 6, 8}; + EXPECT_EQ(expected_value, collocations[key]); + key = {2, -1, 2}; + expected_value = {1, 5, 1, 8, 5, 8, 5, 11, 8, 11}; + EXPECT_EQ(expected_value, collocations[key]); + key = {2, -1, 2, 3}; + expected_value = {1, 5, 1, 8, 5, 8}; + EXPECT_EQ(expected_value, collocations[key]); + key = {2, -1, 3}; + expected_value = {1, 6, 1, 9, 5, 9}; + EXPECT_EQ(expected_value, collocations[key]); + + key = {2, -1, 2, -2, 2}; + expected_value = {1, 5, 8, 5, 8, 11}; + EXPECT_EQ(expected_value, collocations[key]); + key = {2, -1, 2, -2, 3}; + expected_value = {1, 5, 9}; + EXPECT_EQ(expected_value, collocations[key]); + key = {2, -1, 3, -2, 2}; + expected_value = {1, 6, 8, 5, 9, 11}; + EXPECT_EQ(expected_value, collocations[key]); + key = {2, -1, 3, -2, 3}; + expected_value = {1, 6, 9}; + EXPECT_EQ(expected_value, collocations[key]); + key = {3, -1, 2, -2, 2}; + expected_value = {2, 5, 8, 2, 5, 11, 2, 8, 11, 6, 8, 11}; + EXPECT_EQ(expected_value, collocations[key]); + key = {3, -1, 2, -2, 3}; + expected_value = {2, 5, 9}; + EXPECT_EQ(expected_value, collocations[key]); + key = {3, -1, 3, -2, 2}; + expected_value = {2, 6, 8, 2, 6, 11, 2, 9, 11, 6, 9, 11}; + EXPECT_EQ(expected_value, collocations[key]); + key = {3, -1, 3, -2, 3}; + expected_value = {2, 6, 9}; + EXPECT_EQ(expected_value, collocations[key]); + + // Exceeds max_rule_symbols. + key = {2, -1, 2, -2, 2, 3}; + EXPECT_EQ(0, collocations.count(key)); + // Contains non frequent pattern. + key = {2, -1, 5}; + EXPECT_EQ(0, collocations.count(key)); } TEST_F(PrecomputationTest, TestSerialization) { -- cgit v1.2.3 From eb4259454f963e4189dab65a12302f467e598621 Mon Sep 17 00:00:00 2001 From: Paul Baltescu Date: Tue, 25 Jun 2013 22:11:14 +0100 Subject: Updated README. --- extractor/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/extractor/README.md b/extractor/README.md index 575f5ca5..c9db8de8 100644 --- a/extractor/README.md +++ b/extractor/README.md @@ -2,7 +2,7 @@ C++ implementation of the online grammar extractor originally developed by [Adam To run the extractor you need to: - cdec/extractor/run_extractor -a -b -g < > + cdec/extractor/run_extractor -t -a -b -g < > To run unit tests you need first to configure `cdec` with the [Google Test](https://code.google.com/p/googletest/) and [Google Mock](https://code.google.com/p/googlemock/) libraries: -- cgit v1.2.3