summaryrefslogtreecommitdiff
path: root/gi/pipeline
diff options
context:
space:
mode:
Diffstat (limited to 'gi/pipeline')
-rw-r--r--gi/pipeline/backoff-pipe.pl101
-rwxr-xr-xgi/pipeline/evaluation-pipeline.pl40
2 files changed, 79 insertions, 62 deletions
diff --git a/gi/pipeline/backoff-pipe.pl b/gi/pipeline/backoff-pipe.pl
index d03b43be..5e36b3d7 100644
--- a/gi/pipeline/backoff-pipe.pl
+++ b/gi/pipeline/backoff-pipe.pl
@@ -5,9 +5,9 @@ use Getopt::Long "GetOptions";
my @grammars;
my $OUTPUTPREFIX = './giwork/bo.hier.grammar';
+safemkdir($OUTPUTPREFIX);
my $backoff_levels = 1;
my $glue_levels = 1;
-my %FREQ_HIER = ();
usage() unless &GetOptions('grmr=s@' => \ @grammars,
'outprefix=s' => \ $OUTPUTPREFIX,
@@ -16,18 +16,22 @@ usage() unless &GetOptions('grmr=s@' => \ @grammars,
);
my $OUTDIR = $OUTPUTPREFIX . '/hier';
+print STDERR "@grammars\n";
+
my %grmr = ();
foreach my $grammar (@grammars) {
- $grammar =~ m/\/[^\/]*\.t(\d+)\.[^\/]*$/;
- my $grains = $1;
- $grmr{$grains} = $grammar;
+ $grammar =~ m/\/[^\/]*\.t(\d+)\.[^\/]*/;
+ $grmr{$1} = $grammar;
}
my @index = sort keys %grmr;
$OUTDIR = $OUTDIR . join('-',@index);
-my $BACKOFF_GRMR = $OUTDIR . '/backoff.gz';
-my $GLUE_GRMR = $OUTDIR . '/glue.gz';
+safemkdir($OUTDIR);
+my $BACKOFF_GRMR = $OUTDIR . '/backoff.hier.gz';
+safesystem("echo \"\" | gzip > $BACKOFF_GRMR");
+my $GLUE_GRMR = $OUTDIR . '/glue.hier.gz';
+safesystem("echo \"\" | gzip > $GLUE_GRMR");
my $joinedgrammars = $OUTDIR . '/grammar.hier.gz';
join_grammars();
@@ -56,6 +60,12 @@ EOT
exit 1;
};
+sub safemkdir {
+ my $dir = shift;
+ if (-d $dir) { return 1; }
+ return mkdir($dir);
+}
+
sub safesystem {
print STDERR "Executing: @_\n";
@@ -78,11 +88,17 @@ sub safesystem {
sub join_grammars {
+ print STDERR "\n!!! JOINING GRAMMARS\n";
+ if(-e $joinedgrammars) {
+ print STDERR "$joinedgrammars exists, reusing...\n";
+ return;
+ }
safesystem("echo \"\" | gzip > $joinedgrammars");
foreach my $i (@index) {
my $g = $grmr{$i};
- safesystem("zcat $g | sed -r -e 's/(X[0-9]+)/\\1-$i/g' - | gzip > $g");
- safesystem("zcat $joinedgrammars $g | gzip > $joinedgrammars");
+ safesystem("zcat $g | sed -r -e 's/X([0-9]+)/X$i\\1/g' - | gzip > $g.2.gz");
+ safesystem("zcat $joinedgrammars $g.2.gz | gzip > $joinedgrammars.2.gz");
+ safesystem("mv $joinedgrammars.2.gz $joinedgrammars");
}
}
@@ -90,16 +106,21 @@ sub join_grammars {
sub extract_freqs {
my($grmr1,$grmr2) = @_;
print STDERR "\n!!!EXTRACTING FREQUENCIES: $grmr1->$grmr2\n";
- my $IN_COARSE = substr($grammars{$grmr1},0,index($grammars{$grmr1},".grammar/")) . "/labeled_spans.txt";
- my $IN_FINE = substr($grammars{$grmr2},0,index($grammars{$grmr2},".grammar/")) . "/labeled_spans.txt";
- my $OUT_SPANS = "$OUTDIR/labeled_spans.hier$NUM_TOPICS_COARSE-$NUM_TOPICS_FINE.txt";
- my $FREQS = "$OUTDIR/label_freq.hier$NUM_TOPICS_COARSE-$NUM_TOPICS_FINE.txt";
- my %finehier = ();
- if (-e $OUT_SPANS) {
+ my $IN_COARSE = substr($grmr{$grmr1},0,index($grmr{$grmr1},".grammar/")) . "/labeled_spans.txt";
+ my $IN_FINE = substr($grmr{$grmr2},0,index($grmr{$grmr2},".grammar/")) . "/labeled_spans.txt";
+ my $OUT_SPANS = "$OUTDIR/labeled_spans.hier$grmr1-$grmr2.txt";
+ my $FREQS = "$OUTDIR/label_freq.hier$grmr1-$grmr2.txt";
+ if(-e $OUT_SPANS && -e $FREQS) {
print STDERR "$OUT_SPANS exists, reusing...\n";
- } else {
- safesystem("paste -d ' ' $IN_COARSE $IN_FINE > $OUT_SPANS");
+ print STDERR "$FREQS exists, reusing...\n";
+ return $FREQS;
}
+
+ safesystem("paste -d ' ' $IN_COARSE $IN_FINE > $OUT_SPANS");
+
+ my %FREQ_HIER = ();
+ my %finehier = ();
+
open SPANS, $OUT_SPANS or die $!;
while (<SPANS>) {
my ($tmp, $coarse, $fine) = split /\|\|\|/;
@@ -141,38 +162,54 @@ sub extract_freqs {
sub create_backoff_rules {
+ print STDERR "\n!!! CREATING BACKOFF RULES\n";
my ($grmr1, $grmr2, $freq) = @_;
- open FREQS, $freqs or die $!;
- open TMP, ">", "tmp" or die $!;
+ my $OUTFILE = "$OUTDIR/backoff.hier$grmr1-$grmr2.txt";
+ if(-e $OUTFILE) {
+ print STDERR "$OUTFILE exists, reusing...\n";
+ return;
+ }
+ open FREQS, $freq or die $!;
+ open TMP, ">", $OUTFILE or die $!;
while (<FREQS>) {
- my $coarse = m/^(\d+) \|\|\|/;
- if ($coarse == $grmr1) {
- my @finefreq = m/(\d+):(-?\d+\.?\d*)/g;
- for(my $i = 0; $i < scalar @finefreq; $i+=2) {
- my $finecat = @finefreq[$i];
- my $finefreq = @finefreq[$i+1];
- print TMP "[X$coarse-$grmr1] ||| [X$finecat-$grmr2,1]\t[1] ||| BackoffRule=$finefreq\n";
- }
+ my $line = $_;
+ $line = m/^(\d+) \|\|\| (.+)$/;
+ my $coarse = $1;
+ $line = $2;
+ my @finefreq = $line =~ m/(\d+):(\S+)/g;
+ for(my $i = 0; $i < scalar @finefreq; $i+=2) {
+ my $finecat = $finefreq[$i];
+ my $finefreq = $finefreq[$i+1];
+ print TMP "[X$grmr1$coarse] ||| [X$grmr2$finecat,1]\t[1] ||| BackoffRule=$finefreq\n";
}
}
close TMP;
close FREQS;
- safesystem('zcat $BACKOFF_GRMR | cat - tmp | gzip > $BACKOFF_GRMR');
+ safesystem("zcat $BACKOFF_GRMR | cat - $OUTFILE | gzip > $BACKOFF_GRMR.2.gz");
+ safesystem("mv $BACKOFF_GRMR.2.gz $BACKOFF_GRMR");
}
sub add_glue_rules {
+ print STDERR "\n!!! CREATING GLUE RULES\n";
my ($grmr) = @_;
- open TMP, ">", "tmp" or die $!;
+ my $OUTFILE = "$OUTDIR/glue.$grmr.gz";
+ if (-e $OUTFILE) {
+ print STDERR "$OUTFILE exists, reusing...\n";
+ return;
+ }
+ open TMP, ">", $OUTFILE or die $!;
for my $i (0..($grmr-1)) {
- print TMP "[S] ||| [S,1] [X$i-$grmr,2] ||| [1] [2] ||| Glue=1\n";
- print TMP "[S] ||| [X$i-$grmr,1] ||| [1] ||| GlueTop=1\n";
+ print TMP "[S] ||| [S,1] [X$grmr$i,2] ||| [1] [2] ||| Glue=1\n";
+ print TMP "[S] ||| [X$grmr$i,1] ||| [1] ||| GlueTop=1\n";
}
close TMP;
- safesystem('zcat $GLUE_GRMR | cat - tmp | gzip > $GLUE_GRMR');
+ safesystem("zcat $GLUE_GRMR | cat - $OUTFILE | gzip > $GLUE_GRMR.2.gz");
+ safesystem("mv $GLUE_GRMR.2.gz $GLUE_GRMR");
}
sub output_grammar_info {
+ print STDERR "\n!!! GRAMMAR INFORMATION\n";
print STDOUT "GRAMMAR: \t$joinedgrammars\n";
print STDOUT "GLUE: \t$GLUE_GRMR\n";
- print STDOUT "BACKOFF: \t$BACKOFF_GRAMMAR\n";
+ print STDOUT "BACKOFF: \t$BACKOFF_GRMR\n";
}
diff --git a/gi/pipeline/evaluation-pipeline.pl b/gi/pipeline/evaluation-pipeline.pl
index 64803fd0..fe0fb1a5 100755
--- a/gi/pipeline/evaluation-pipeline.pl
+++ b/gi/pipeline/evaluation-pipeline.pl
@@ -1,4 +1,4 @@
-sub m#!/usr/bin/perl -w
+#!/usr/bin/perl -w
use strict;
use Getopt::Long;
use Cwd;
@@ -123,8 +123,8 @@ my $bkoffgram;
my $gluegram;
my $usefork;
if (GetOptions(
- "backoff_grammar=s" => \$bkoffgram,
- "glue_grammar=s" => \$gluegram,
+ "backoff-grammar=s" => \$bkoffgram,
+ "glue-grammar=s" => \$gluegram,
"data=s" => \$dataDir,
"features=s@" => \@features,
"use-fork" => \$usefork,
@@ -136,7 +136,6 @@ if (GetOptions(
}
if ($usefork) { $usefork="--use-fork"; } else { $usefork = ''; }
my @fkeys = keys %$feat_map;
-push(@features, "BackoffRule") if $bkoffgram;
die "You must specify one or more features with -f. Known features: @fkeys\n" unless scalar @features > 0;
my @xfeats;
for my $feat (@features) {
@@ -183,12 +182,16 @@ write_random_weights_file($weights, @xfeats);
my $bkoff_grmr;
my $glue_grmr;
if($bkoffgram) {
+ print STDERR "Placing backoff grammar…\n";
$bkoff_grmr = mydircat($outdir, "backoff.scfg.gz");
- safesystem("cp $bkoffgram $bkoff_grmr");
+ print STDERR "cp $bkoffgram $bkoff_grmr\n";
+ safesystem(undef,"cp $bkoffgram $bkoff_grmr");
}
if($gluegram) {
+ print STDERR "Placing glue grammar…\n";
$glue_grmr = mydircat($outdir, "glue.bo.scfg.gz");
- safesystem("cp $gluegram $glue_grmr");
+ print STDERR "cp $gluegram $glue_grmr\n";
+ safesystem(undef,"cp $gluegram $glue_grmr");
}
# MAKE DEV
@@ -267,30 +270,7 @@ sub filter {
safesystem($outgrammar, $cmd) or die "Featurizing failed";
}
return $outgrammar;
-}
-
-sub add_backoff {
- my ($grammar, $topics, $name, $outdir) = @_;
- my $out = mydircat($outdir, "backoff.$name.scfg");
- my $outgrammar = mydircat($outdir, "$name.scfg.gz");
- my $cmd = "zcat $grammar > $out";
- safesystem($out,$cmd) or die "Adding backoff rules failed.";
- for(my $tpcnum=0;$tpcnum<$topics;$tpcnum++) {
- for(my $tpc2=0;$tpc2<$topics;$tpc2++) {
- my $bkoff = "1";
- if($tpc2 == $tpcnum) {
- $bkoff = "0";
- }
- my $rule = "[X$tpcnum\_] ||| [X$tpc2,1] ||| [1] ||| BackoffRule=$bkoff";
- $cmd = "echo '$rule' >> $out";
- safesystem($out,$cmd) or die "Adding backoff rules failed.";
- }
- }
- $cmd = "cat $out | gzip > $outgrammar";
- safesystem($outgrammar, $cmd) or die "Adding backoff rules failed.";
- return $outgrammar;
-}
-
+}
sub mydircat {
my ($base, $suffix) = @_;