summaryrefslogtreecommitdiff
path: root/gi/pipeline/backoff-pipe.pl
diff options
context:
space:
mode:
authorKenneth Heafield <github@kheafield.com>2012-10-22 12:07:20 +0100
committerKenneth Heafield <github@kheafield.com>2012-10-22 12:07:20 +0100
commit5f98fe5c4f2a2090eeb9d30c030305a70a8347d1 (patch)
tree9b6002f850e6dea1e3400c6b19bb31a9cdf3067f /gi/pipeline/backoff-pipe.pl
parentcf9994131993b40be62e90e213b1e11e6b550143 (diff)
parent21825a09d97c2e0afd20512f306fb25fed55e529 (diff)
Merge remote branch 'upstream/master'
Conflicts: Jamroot bjam decoder/Jamfile decoder/cdec.cc dpmert/Jamfile jam-files/sanity.jam klm/lm/Jamfile klm/util/Jamfile mira/Jamfile
Diffstat (limited to 'gi/pipeline/backoff-pipe.pl')
-rw-r--r--gi/pipeline/backoff-pipe.pl215
1 files changed, 0 insertions, 215 deletions
diff --git a/gi/pipeline/backoff-pipe.pl b/gi/pipeline/backoff-pipe.pl
deleted file mode 100644
index ac103c8b..00000000
--- a/gi/pipeline/backoff-pipe.pl
+++ /dev/null
@@ -1,215 +0,0 @@
-#!/usr/bin/perl -w
-use strict;
-
-use Getopt::Long "GetOptions";
-
-my @grammars;
-my $OUTPUTPREFIX = './giwork/bo.hier.grammar';
-safemkdir($OUTPUTPREFIX);
-my $backoff_levels = 1;
-my $glue_levels = 1;
-
-usage() unless &GetOptions('grmr=s@' => \ @grammars,
- 'outprefix=s' => \ $OUTPUTPREFIX,
- 'bo-lvls=i' => \ $backoff_levels,
- 'glue-lvls=i' => \ $glue_levels,
-);
-
-my $OUTDIR = $OUTPUTPREFIX . '/hier';
-print STDERR "@grammars\n";
-
-
-my %grmr = ();
-foreach my $grammar (@grammars) {
- $grammar =~ m/\/[^\/]*\.t(\d+)\.[^\/]*/;
- $grmr{$1} = $grammar;
-}
-
-my @index = sort keys %grmr;
-$OUTDIR = $OUTDIR . join('-',@index);
-safemkdir($OUTDIR);
-my $BACKOFF_GRMR = $OUTDIR . '/backoff.hier.gz';
-safesystem("echo \"\" | gzip > $BACKOFF_GRMR");
-my $GLUE_GRMR = $OUTDIR . '/glue.hier.gz';
-safesystem("echo \"\" | gzip > $GLUE_GRMR");
-my $joinedgrammars = $OUTDIR . '/grammar.hier.gz';
-
-join_grammars();
-
-for my $i (0..(scalar @index)-2) {
- my $freqs = extract_freqs($index[$i], $index[$i+1]);
- if ($i < $backoff_levels) {
- create_backoff_rules($index[$i],$index[$i+1],$freqs);
- }
- if ($i < $glue_levels) {
- add_glue_rules($index[$i]);
- }
-}
-
-output_grammar_info();
-
-
-sub usage {
- print <<EOT;
-
-Usage: $0 [OPTIONS] corpus.fr-en-al
-
-Induces a grammar using Pitman-Yor topic modeling or Posterior Regularisation.
-
-EOT
- exit 1;
-};
-
-sub safemkdir {
- my $dir = shift;
- if (-d $dir) { return 1; }
- return mkdir($dir);
-}
-
-
-sub safesystem {
- print STDERR "Executing: @_\n";
- system(@_);
- if ($? == -1) {
- print STDERR "ERROR: Failed to execute: @_\n $!\n";
- exit(1);
- }
- elsif ($? & 127) {
- printf STDERR "ERROR: Execution of: @_\n died with signal %d, %s coredump\n",
- ($? & 127), ($? & 128) ? 'with' : 'without';
- exit(1);
- }
- else {
- my $exitcode = $? >> 8;
- print STDERR "Exit code: $exitcode\n" if $exitcode;
- return ! $exitcode;
- }
-}
-
-
-sub join_grammars {
- print STDERR "\n!!! JOINING GRAMMARS\n";
- if(-e $joinedgrammars) {
- print STDERR "$joinedgrammars exists, reusing...\n";
- return;
- }
- safesystem("echo \"\" | gzip > $joinedgrammars");
- foreach my $i (@index) {
- my $g = $grmr{$i};
- safesystem("zcat $g | sed -r -e 's/X([0-9]+)/X$i\\1/g' - | gzip > $g.2.gz");
- safesystem("zcat $joinedgrammars $g.2.gz | gzip > $joinedgrammars.2.gz");
- safesystem("mv $joinedgrammars.2.gz $joinedgrammars");
- }
-}
-
-
-sub extract_freqs {
- my($grmr1,$grmr2) = @_;
- print STDERR "\n!!!EXTRACTING FREQUENCIES: $grmr1->$grmr2\n";
- my $IN_COARSE = substr($grmr{$grmr1},0,index($grmr{$grmr1},".grammar/")) . "/labeled_spans.txt";
- my $IN_FINE = substr($grmr{$grmr2},0,index($grmr{$grmr2},".grammar/")) . "/labeled_spans.txt";
- my $OUT_SPANS = "$OUTDIR/labeled_spans.hier$grmr1-$grmr2.txt";
- my $FREQS = "$OUTDIR/label_freq.hier$grmr1-$grmr2.txt";
- if(-e $OUT_SPANS && -e $FREQS) {
- print STDERR "$OUT_SPANS exists, reusing...\n";
- print STDERR "$FREQS exists, reusing...\n";
- return $FREQS;
- }
-
- safesystem("paste -d ' ' $IN_COARSE $IN_FINE > $OUT_SPANS");
-
- my %FREQ_HIER = ();
- my %finehier = ();
-
- open SPANS, $OUT_SPANS or die $!;
- while (<SPANS>) {
- my ($tmp, $coarse, $fine) = split /\|\|\|/;
- my @coarse_spans = $coarse =~ /\d+-\d+:X(\d+)/g;
- my @fine_spans = $fine =~ /\d+-\d+:X(\d+)/g;
-
- foreach my $i (0..(scalar @coarse_spans)-1) {
- my $coarse_cat = $coarse_spans[$i];
- my $fine_cat = $fine_spans[$i];
-
- $FREQ_HIER{$coarse_cat}{$fine_cat}++;
- }
- }
- close SPANS;
- foreach (values %FREQ_HIER) {
- my $coarse_freq = $_;
- my $total = 0;
- $total+=$_ for (values %{ $coarse_freq });
- $coarse_freq->{$_}=log($coarse_freq->{$_}/$total) for (keys %{ $coarse_freq });
- }
- open FREQS, ">", $FREQS or die $!;
- foreach my $coarse_cat (keys %FREQ_HIER) {
- print FREQS "$coarse_cat |||";
- foreach my $fine_cat (keys %{$FREQ_HIER{$coarse_cat}}) {
- my $freq = $FREQ_HIER{$coarse_cat}{$fine_cat};
- print FREQS " $fine_cat:$freq";
- if(! exists $finehier{$fine_cat} || $finehier{$fine_cat} < $freq) {
- $finehier{$fine_cat} = $coarse_cat;
- }
- }
- print FREQS "\n";
- }
-# foreach my $fine_cat (keys %finehier) {
-# print FREQS "$fine_cat -> $finehier{$fine_cat}\n";
-# }
- close FREQS;
- return $FREQS;
-}
-
-
-sub create_backoff_rules {
- print STDERR "\n!!! CREATING BACKOFF RULES\n";
- my ($grmr1, $grmr2, $freq) = @_;
- my $OUTFILE = "$OUTDIR/backoff.hier$grmr1-$grmr2.txt";
- if(-e $OUTFILE) {
- print STDERR "$OUTFILE exists, reusing...\n";
- return;
- }
- open FREQS, $freq or die $!;
- open TMP, ">", $OUTFILE or die $!;
- while (<FREQS>) {
- my $line = $_;
- $line = m/^(\d+) \|\|\| (.+)$/;
- my $coarse = $1;
- $line = $2;
- my @finefreq = $line =~ m/(\d+):(\S+)/g;
- for(my $i = 0; $i < scalar @finefreq; $i+=2) {
- my $finecat = $finefreq[$i];
- my $finefreq = $finefreq[$i+1];
- print TMP "[X$grmr1$coarse] ||| [X$grmr2$finecat,1]\t[1] ||| BackoffRule=$finefreq A=0-0\n";
- }
- }
- close TMP;
- close FREQS;
- safesystem("zcat $BACKOFF_GRMR | cat - $OUTFILE | gzip > $BACKOFF_GRMR.2.gz");
- safesystem("mv $BACKOFF_GRMR.2.gz $BACKOFF_GRMR");
-}
-
-sub add_glue_rules {
- print STDERR "\n!!! CREATING GLUE RULES\n";
- my ($grmr) = @_;
- my $OUTFILE = "$OUTDIR/glue.$grmr.gz";
- if (-e $OUTFILE) {
- print STDERR "$OUTFILE exists, reusing...\n";
- return;
- }
- open TMP, ">", $OUTFILE or die $!;
- for my $i (0..($grmr-1)) {
- print TMP "[S] ||| [S,1] [X$grmr$i,2] ||| [1] [2] ||| Glue=1\n";
- print TMP "[S] ||| [X$grmr$i,1] ||| [1] ||| GlueTop=1\n";
- }
- close TMP;
- safesystem("zcat $GLUE_GRMR | cat - $OUTFILE | gzip > $GLUE_GRMR.2.gz");
- safesystem("mv $GLUE_GRMR.2.gz $GLUE_GRMR");
-}
-
-sub output_grammar_info {
- print STDERR "\n!!! GRAMMAR INFORMATION\n";
- print STDOUT "GRAMMAR: \t$joinedgrammars\n";
- print STDOUT "GLUE: \t$GLUE_GRMR\n";
- print STDOUT "BACKOFF: \t$BACKOFF_GRMR\n";
-}