summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r-- gi/pipeline/backoff-pipe.pl 178
-rwxr-xr-x gi/pipeline/evaluation-pipeline.pl 27
-rwxr-xr-x gi/pipeline/local-gi-pipeline.pl 25
3 files changed, 212 insertions, 18 deletions
diff --git a/gi/pipeline/backoff-pipe.pl b/gi/pipeline/backoff-pipe.pl
new file mode 100644
index 00000000..d03b43be
--- /dev/null
+++ b/gi/pipeline/backoff-pipe.pl
@@ -0,0 +1,178 @@
+#!/usr/bin/perl -w
+use strict;
+
+use Getopt::Long "GetOptions";
+
+# Input grammars (one --grmr option per granularity level) and output prefix.
+my @grammars;
+my $OUTPUTPREFIX = './giwork/bo.hier.grammar';
+# Number of adjacent level pairs, counted from the coarsest, that receive
+# backoff rules and glue rules respectively.
+my $backoff_levels = 1;
+my $glue_levels = 1;
+# Filled by extract_freqs(): coarse category -> fine category -> log of the
+# relative frequency.
+my %FREQ_HIER = ();
+
+usage() unless GetOptions('grmr=s@' => \@grammars,
+    'outprefix=s' => \$OUTPUTPREFIX,
+    'bo-lvls=i' => \$backoff_levels,
+    'glue-lvls=i' => \$glue_levels,
+);
+# Without at least one grammar there is nothing to join.
+usage() unless @grammars;
+
+my $OUTDIR = $OUTPUTPREFIX . '/hier';
+
+# Map the number embedded in each grammar file name (".t<N>.") to its path.
+my %grmr = ();
+foreach my $grammar (@grammars) {
+    # The match has to be checked: after a failed match $1 is undef or still
+    # holds the capture of an earlier successful match.
+    $grammar =~ m/\/[^\/]*\.t(\d+)\.[^\/]*$/
+        or die "Cannot find '.t<N>.' in the file name of grammar $grammar\n";
+    my $grains = $1;
+    $grmr{$grains} = $grammar;
+}
+
+# Sort numerically. The default string sort would put 10 before 5 and so
+# pair up the wrong levels in the loop below.
+my @index = sort { $a <=> $b } keys %grmr;
+$OUTDIR = $OUTDIR . join('-',@index);
+my $BACKOFF_GRMR = $OUTDIR . '/backoff.gz';
+my $GLUE_GRMR = $OUTDIR . '/glue.gz';
+my $joinedgrammars = $OUTDIR . '/grammar.hier.gz';
+
+# Every following step writes into $OUTDIR, and nothing else creates it.
+safesystem("mkdir -p $OUTDIR") or die "Couldn't create output directory $OUTDIR\n";
+
+join_grammars();
+
+# Walk over adjacent (coarser, finer) pairs of levels.
+for my $i (0..(scalar @index)-2) {
+    my $freqs = extract_freqs($index[$i], $index[$i+1]);
+    if ($i < $backoff_levels) {
+        create_backoff_rules($index[$i],$index[$i+1],$freqs);
+    }
+    if ($i < $glue_levels) {
+        add_glue_rules($index[$i]);
+    }
+}
+
+output_grammar_info();
+
+
+# Print the command-line help to STDOUT and exit with status 1.
+# Called when the command line cannot be parsed.
+sub usage {
+    print <<EOT;
+
+Usage: $0 --grmr GRAMMAR [--grmr GRAMMAR ...] [OPTIONS]
+
+Joins grammars of different granularity into one hierarchical grammar and
+creates backoff and glue rules between adjacent levels.
+
+Options:
+  --grmr FILE       grammar to include, once per level; the file name must
+                    contain ".t<N>." where <N> identifies the level
+  --outprefix DIR   output prefix (default: ./giwork/bo.hier.grammar)
+  --bo-lvls N       number of levels that get backoff rules (default: 1)
+  --glue-lvls N     number of levels that get glue rules (default: 1)
+
+EOT
+    exit 1;
+}
+
+
+# Run a command with system() and report how it ended.
+#
+# Arguments: the command, passed unchanged to system().
+# Returns:   true if the command exited with status 0, false otherwise
+#            (a non-zero exit code is also reported on STDERR).
+# Exits the whole script with status 1 if the command could not be started
+# or was terminated by a signal.
+sub safesystem {
+    print STDERR "Executing: @_\n";
+    system(@_);
+    if ($? == -1) {
+        print STDERR "ERROR: Failed to execute: @_\n $!\n";
+        exit(1);
+    }
+    elsif ($? & 127) {
+        # The command text is passed as a %s argument instead of being part
+        # of the format string, so a '%' inside the command cannot be
+        # mistaken for a format directive.
+        printf STDERR "ERROR: Execution of: %s\n died with signal %d, %s coredump\n",
+            "@_", ($? & 127), ($? & 128) ? 'with' : 'without';
+        exit(1);
+    }
+    else {
+        my $exitcode = $? >> 8;
+        print STDERR "Exit code: $exitcode\n" if $exitcode;
+        return ! $exitcode;
+    }
+}
+
+
+# Build the joined grammar ($joinedgrammars) from all grammars in %grmr.
+#
+# Each grammar is first relabelled in place: every category "X<n>" gets the
+# grammar's level appended ("X<n>-<level>"). The relabelled grammar is then
+# added to the joined grammar. Dies if one of the shell steps fails.
+#
+# NOTE(review): the relabelling rewrites the input grammar files themselves,
+# so running the script a second time on the same files appends the suffix
+# a second time.
+sub join_grammars {
+    safesystem("echo \"\" | gzip > $joinedgrammars")
+        or die "Couldn't create $joinedgrammars\n";
+    foreach my $i (@index) {
+        my $g = $grmr{$i};
+        # A pipeline must not read and write the same file: the shell
+        # truncates the output file before zcat gets to read it. Each step
+        # therefore writes a scratch file that then replaces the target.
+        my $relabelled = "$g.tmp.$$";
+        safesystem("zcat $g | sed -r -e 's/(X[0-9]+)/\\1-$i/g' - | gzip > $relabelled")
+            or die "Couldn't relabel grammar $g\n";
+        rename $relabelled, $g or die "Couldn't replace $g: $!\n";
+
+        my $joined = "$joinedgrammars.tmp.$$";
+        safesystem("zcat $joinedgrammars $g | gzip > $joined")
+            or die "Couldn't add $g to $joinedgrammars\n";
+        rename $joined, $joinedgrammars or die "Couldn't replace $joinedgrammars: $!\n";
+    }
+}
+
+
+# Count how often each coarse category is paired with each fine category
+# and write the logs of the relative frequencies to a file.
+#
+# Arguments: $grmr1, $grmr2 - keys of %grmr naming the coarse and the fine
+#            grammar. The labelled spans of each are read from
+#            "<path up to '.grammar/'>/labeled_spans.txt".
+# Returns:   the path of the frequency file. Each line reads
+#            "<coarse> ||| <fine>:<log freq> <fine>:<log freq> ...".
+# Side effects: creates the pasted span file in $OUTDIR unless it already
+#            exists, and replaces the contents of %FREQ_HIER.
+sub extract_freqs {
+    my($grmr1,$grmr2) = @_;
+    print STDERR "\n!!!EXTRACTING FREQUENCIES: $grmr1->$grmr2\n";
+
+    # Derive the labelled-span file of each grammar from the grammar's path.
+    my @labeled;
+    foreach my $level ($grmr1, $grmr2) {
+        my $path = $grmr{$level};
+        my $cut = index($path, ".grammar/");
+        # index() returns -1 when the marker is missing; substr() would then
+        # silently chop the last character off the path.
+        die "Grammar path $path does not contain '.grammar/'\n" if $cut < 0;
+        push @labeled, substr($path, 0, $cut) . "/labeled_spans.txt";
+    }
+    my ($IN_COARSE, $IN_FINE) = @labeled;
+    my $OUT_SPANS = "$OUTDIR/labeled_spans.hier$grmr1-$grmr2.txt";
+    my $FREQS = "$OUTDIR/label_freq.hier$grmr1-$grmr2.txt";
+
+    if (-e $OUT_SPANS) {
+        print STDERR "$OUT_SPANS exists, reusing...\n";
+    } else {
+        safesystem("paste -d ' ' $IN_COARSE $IN_FINE > $OUT_SPANS")
+            or die "Couldn't create $OUT_SPANS\n";
+    }
+
+    # Start from an empty table: counts left over from an earlier pair of
+    # levels would otherwise be mixed in and log-transformed a second time.
+    %FREQ_HIER = ();
+    open my $spans, '<', $OUT_SPANS or die "Can't read $OUT_SPANS: $!";
+    while (my $line = <$spans>) {
+        # The first field is not needed.
+        my (undef, $coarse, $fine) = split /\|\|\|/, $line;
+        next unless defined $coarse and defined $fine;
+        my @coarse_spans = $coarse =~ /\d+-\d+:X(\d+)/g;
+        my @fine_spans = $fine =~ /\d+-\d+:X(\d+)/g;
+
+        # Spans are paired by position; stop when the fine side runs out.
+        foreach my $i (0..$#coarse_spans) {
+            last unless defined $fine_spans[$i];
+            $FREQ_HIER{$coarse_spans[$i]}{$fine_spans[$i]}++;
+        }
+    }
+    close $spans;
+
+    # Turn the counts of each coarse category into log relative frequencies.
+    foreach my $fine_freq (values %FREQ_HIER) {
+        my $total = 0;
+        $total += $_ for values %{ $fine_freq };
+        $fine_freq->{$_} = log($fine_freq->{$_}/$total) for keys %{ $fine_freq };
+    }
+
+    # Categories are written in numeric order so the file is reproducible.
+    open my $out, '>', $FREQS or die "Can't write $FREQS: $!";
+    foreach my $coarse_cat (sort { $a <=> $b } keys %FREQ_HIER) {
+        print $out "$coarse_cat |||";
+        foreach my $fine_cat (sort { $a <=> $b } keys %{ $FREQ_HIER{$coarse_cat} }) {
+            my $freq = $FREQ_HIER{$coarse_cat}{$fine_cat};
+            print $out " $fine_cat:$freq";
+        }
+        print $out "\n";
+    }
+    close $out or die "Can't write $FREQS: $!";
+    return $FREQS;
+}
+
+
+# Add backoff rules from the categories of grammar $grmr1 to the categories
+# of grammar $grmr2 to the backoff grammar ($BACKOFF_GRMR).
+#
+# Arguments: $grmr1, $grmr2 - levels of the coarse and the fine grammar;
+#            $freq - path of a frequency file with lines of the form
+#            "<coarse> ||| <fine>:<weight> <fine>:<weight> ...".
+# For every <fine>:<weight> pair one rule from the coarse to the fine
+# category is written, with the weight as its BackoffRule feature.
+sub create_backoff_rules {
+    my ($grmr1, $grmr2, $freq) = @_;
+    my $rules = "$BACKOFF_GRMR.rules.$$";
+    open my $in, '<', $freq or die "Can't read $freq: $!";
+    open my $out, '>', $rules or die "Can't write $rules: $!";
+    while (my $line = <$in>) {
+        # List context is required to get the captured category; in scalar
+        # context the match only yields true or false.
+        my ($coarse) = $line =~ m/^(\d+) \|\|\|/;
+        next unless defined $coarse;
+        # Every line yields rules. Category ids are unrelated to the level
+        # number $grmr1, so the two must not be compared.
+        # Weights may carry a sign, a fraction and an exponent.
+        my @finefreq = $line =~ m/(\d+):(-?\d+\.?\d*(?:[eE][-+]?\d+)?)/g;
+        for (my $i = 0; $i < scalar @finefreq; $i += 2) {
+            my $finecat = $finefreq[$i];
+            my $weight = $finefreq[$i+1];
+            print $out "[X$coarse-$grmr1] ||| [X$finecat-$grmr2,1]\t[1] ||| BackoffRule=$weight\n";
+        }
+    }
+    close $out or die "Can't write $rules: $!";
+    close $in;
+
+    # The command is double-quoted so that Perl fills in the paths; inside
+    # single quotes the shell would be left to expand a variable it does not
+    # have. The result goes to a separate file first, because reading and
+    # writing the same file in one pipeline truncates it before it is read.
+    my $merged = "$BACKOFF_GRMR.new.$$";
+    my $source = (-e $BACKOFF_GRMR) ? "zcat $BACKOFF_GRMR | cat - $rules" : "cat $rules";
+    safesystem("$source | gzip > $merged") or die "Couldn't extend $BACKOFF_GRMR\n";
+    rename $merged, $BACKOFF_GRMR or die "Couldn't replace $BACKOFF_GRMR: $!\n";
+    unlink $rules;
+}
+
+# Add glue rules for the categories of one grammar level to the glue
+# grammar ($GLUE_GRMR).
+#
+# Argument: $grmr - the level; rules are written for the categories
+#           X0-<level> .. X<level-1>-<level>, two per category (Glue and
+#           GlueTop).
+sub add_glue_rules {
+    my ($grmr) = @_;
+    my $rules = "$GLUE_GRMR.rules.$$";
+    open my $out, '>', $rules or die "Can't write $rules: $!";
+    for my $i (0..($grmr-1)) {
+        print $out "[S] ||| [S,1] [X$i-$grmr,2] ||| [1] [2] ||| Glue=1\n";
+        print $out "[S] ||| [X$i-$grmr,1] ||| [1] ||| GlueTop=1\n";
+    }
+    close $out or die "Can't write $rules: $!";
+
+    # The command is double-quoted so that Perl fills in the paths; inside
+    # single quotes the shell would be left to expand a variable it does not
+    # have. The result goes to a separate file first, because reading and
+    # writing the same file in one pipeline truncates it before it is read.
+    # On the first call there is no glue grammar to read yet.
+    my $merged = "$GLUE_GRMR.new.$$";
+    my $source = (-e $GLUE_GRMR) ? "zcat $GLUE_GRMR | cat - $rules" : "cat $rules";
+    safesystem("$source | gzip > $merged") or die "Couldn't extend $GLUE_GRMR\n";
+    rename $merged, $GLUE_GRMR or die "Couldn't replace $GLUE_GRMR: $!\n";
+    unlink $rules;
+}
+
+# Print the paths of the joined, the glue and the backoff grammar to STDOUT,
+# one "NAME: <tab>PATH" line each.
+sub output_grammar_info {
+    print STDOUT "GRAMMAR: \t$joinedgrammars\n";
+    print STDOUT "GLUE: \t$GLUE_GRMR\n";
+    # The backoff grammar path is held in $BACKOFF_GRMR; no variable named
+    # $BACKOFF_GRAMMAR is declared, so using it fails under "use strict".
+    print STDOUT "BACKOFF: \t$BACKOFF_GRMR\n";
+}
diff --git a/gi/pipeline/evaluation-pipeline.pl b/gi/pipeline/evaluation-pipeline.pl
index 178159b9..64803fd0 100755
--- a/gi/pipeline/evaluation-pipeline.pl
+++ b/gi/pipeline/evaluation-pipeline.pl
@@ -1,4 +1,4 @@
-#!/usr/bin/perl -w
+#!/usr/bin/perl -w
use strict;
use Getopt::Long;
use Cwd;
@@ -120,9 +120,11 @@ my $FEATURIZER_OPTS = '';
my $dataDir = '/export/ws10smt/data';
my @features;
my $bkoffgram;
+my $gluegram;
my $usefork;
if (GetOptions(
- "backoff_grammar" => \$bkoffgram,
+ "backoff_grammar=s" => \$bkoffgram,
+ "glue_grammar=s" => \$gluegram,
"data=s" => \$dataDir,
"features=s@" => \@features,
"use-fork" => \$usefork,
@@ -178,13 +180,21 @@ print STDERR "\nCREATING INITIAL WEIGHTS FILE: weights.init\n";
my $weights = mydircat($outdir, "weights.init");
write_random_weights_file($weights, @xfeats);
+my $bkoff_grmr;
+my $glue_grmr;
+if($bkoffgram) {
+ $bkoff_grmr = mydircat($outdir, "backoff.scfg.gz");
+ safesystem("cp $bkoffgram $bkoff_grmr");
+}
+if($gluegram) {
+ $glue_grmr = mydircat($outdir, "glue.bo.scfg.gz");
+ safesystem("cp $gluegram $glue_grmr");
+}
+
# MAKE DEV
print STDERR "\nFILTERING FOR dev...\n";
print STDERR "DEV: $dev (REFS=$drefs)\n";
my $devgrammar = filter($grammar, $dev, 'dev', $outdir);
-if($bkoffgram) {
- $devgrammar = add_backoff($devgrammar, $numtopics, 'dev', $outdir);
-}
my $devini = mydircat($outdir, "cdec-dev.ini");
write_cdec_ini($devini, $devgrammar);
@@ -194,9 +204,6 @@ print STDERR "\nFILTERING FOR test...\n";
print STDERR "TEST: $test (EVAL=$teval)\n";
`mkdir -p $outdir`;
my $testgrammar = filter($grammar, $test, 'test', $outdir);
-if($bkoffgram) {
- $testgrammar = add_backoff($testgrammar, $numtopics, 'test', $outdir);
-}
my $testini = mydircat($outdir, "cdec-test.ini");
write_cdec_ini($testini, $testgrammar);
@@ -296,11 +303,12 @@ sub mydircat {
sub write_cdec_ini {
my ($filename, $grammar_path) = (@_);
open CDECINI, ">$filename" or die "Can't write $filename: $!";
+ my $glue = ($gluegram ? "$glue_grmr" : "$datadir/glue/glue.scfg.gz");
print CDECINI <<EOT;
formalism=scfg
cubepruning_pop_limit=100
add_pass_through_rules=true
-scfg_extra_glue_grammar=$datadir/glue/glue.scfg.gz
+scfg_extra_glue_grammar=$glue
grammar=$datadir/oov.scfg.gz
grammar=$grammar_path
scfg_default_nt=OOV
@@ -308,6 +316,7 @@ scfg_no_hiero_glue_grammar=true
feature_function=WordPenalty
feature_function=LanguageModel -o 3 $LANG_MODEL
EOT
+ print CDECINI "grammar=$bkoff_grmr\n" if $bkoffgram;
close CDECINI;
};
diff --git a/gi/pipeline/local-gi-pipeline.pl b/gi/pipeline/local-gi-pipeline.pl
index 34948880..6b58ab7c 100755
--- a/gi/pipeline/local-gi-pipeline.pl
+++ b/gi/pipeline/local-gi-pipeline.pl
@@ -102,10 +102,14 @@ my $CLUSTER_DIR = $OUTPUT . '/' . cluster_dir();
my $LABELED_DIR = $OUTPUT . '/' . labeled_dir();
my $CLUSTER_DIR_C;
my $CLUSTER_DIR_F;
+my $LABELED_DIR_C;
+my $LABELED_DIR_F;
if($HIER_CAT) {
$CLUSTER_DIR_F = $CLUSTER_DIR;
+ $LABELED_DIR_F = $LABELED_DIR;
$NUM_TOPICS = $NUM_TOPICS_COARSE;
$CLUSTER_DIR_C = $OUTPUT . '/' . cluster_dir();
+ $LABELED_DIR_C = $OUTPUT . '/' . labeled_dir();
$NUM_TOPICS = $NUM_TOPICS_FINE;
}
my $GRAMMAR_DIR = $OUTPUT . '/' . grammar_dir();
@@ -115,7 +119,8 @@ safemkdir($DATA_DIR) or die "Couldn't create output directory $DATA_DIR: $!";
safemkdir($CONTEXT_DIR) or die "Couldn't create output directory $CONTEXT_DIR: $!";
safemkdir($CLUSTER_DIR) or die "Couldn't create output directory $CLUSTER_DIR: $!";
if($HIER_CAT) {
- safemkdir($CLUSTER_DIR_C) or die "Couldn't create output directory $CLUSTER_DIR: $!";
+ safemkdir($CLUSTER_DIR_C) or die "Couldn't create output directory $CLUSTER_DIR_C: $!";
+ safemkdir($LABELED_DIR_C) or die "Couldn't create output directory $LABELED_DIR_C: $!";
}
safemkdir($LABELED_DIR) or die "Couldn't create output directory $LABELED_DIR: $!";
safemkdir($GRAMMAR_DIR) or die "Couldn't create output directory $GRAMMAR_DIR: $!";
@@ -143,9 +148,11 @@ if (lc($MODEL) eq "pyp") {
if($HIER_CAT) {
$NUM_TOPICS = $NUM_TOPICS_COARSE;
$CLUSTER_DIR = $CLUSTER_DIR_C;
+ $LABELED_DIR = $LABELED_DIR_C;
label_spans_with_topics();
$NUM_TOPICS = $NUM_TOPICS_FINE;
$CLUSTER_DIR = $CLUSTER_DIR_F;
+ $LABELED_DIR = $LABELED_DIR_F;
label_spans_with_topics();
extract_freqs();
} else {
@@ -297,10 +304,10 @@ sub label_spans_with_topics {
sub extract_freqs {
print STDERR "\n!!!EXTRACTING FREQUENCIES\n";
- my $IN_COARSE = "$CLUSTER_DIR_C/labeled_spans.txt";
- my $IN_FINE = "$CLUSTER_DIR_F/labeled_spans.txt";
- my $OUT_SPANS = "$CLUSTER_DIR_F/labeled_spans.hier$NUM_TOPICS_COARSE-$NUM_TOPICS_FINE.txt";
- my $FREQS = "$CLUSTER_DIR_F/label_freq.hier$NUM_TOPICS_COARSE-$NUM_TOPICS_FINE.txt";
+ my $IN_COARSE = "$LABELED_DIR_C/labeled_spans.txt";
+ my $IN_FINE = "$LABELED_DIR_F/labeled_spans.txt";
+ my $OUT_SPANS = "$LABELED_DIR_F/labeled_spans.hier$NUM_TOPICS_COARSE-$NUM_TOPICS_FINE.txt";
+ my $FREQS = "$LABELED_DIR_F/label_freq.hier$NUM_TOPICS_COARSE-$NUM_TOPICS_FINE.txt";
my $COARSE_EXPR = "\'s/\\(X[0-9][0-9]*\\)/\\1c/g\'"; #'
my $FINE_EXPR = "\'s/\\(X[0-9][0-9]*\\)/\\1f/g\'"; #'
my %finehier = ();
@@ -341,9 +348,9 @@ sub extract_freqs {
}
print FREQS "\n";
}
- foreach my $fine_cat (keys %finehier) {
- print FREQS "$fine_cat -> $finehier{$fine_cat}\n";
- }
+# foreach my $fine_cat (keys %finehier) {
+# print FREQS "$fine_cat -> $finehier{$fine_cat}\n";
+# }
close FREQS;
$CLUSTER_DIR = $CLUSTER_DIR_F;
}
@@ -364,7 +371,7 @@ sub grammar_extract {
sub grammar_extract_bidir {
#gzcat ex.output.gz | ./mr_stripe_rule_reduce -p -b | sort -t $'\t' -k 1 | ./mr_stripe_rule_reduce | gzip > phrase-table.gz
- my $LABELED = ($HIER_CAT ? "$CLUSTER_DIR_F/corpus.src_trg_al_label.hier" : "$LABELED_DIR/corpus.src_trg_al_label");
+ my $LABELED = "$LABELED_DIR/corpus.src_trg_al_label";
print STDERR "\n!!!EXTRACTING GRAMMAR\n";
my $OUTGRAMMAR = "$GRAMMAR_DIR/grammar.bidir.gz";
if (-e $OUTGRAMMAR) {