diff options
-rw-r--r-- | gi/pipeline/backoff-pipe.pl | 178 | ||||
-rwxr-xr-x | gi/pipeline/evaluation-pipeline.pl | 27 | ||||
-rwxr-xr-x | gi/pipeline/local-gi-pipeline.pl | 25 |
3 files changed, 212 insertions, 18 deletions
diff --git a/gi/pipeline/backoff-pipe.pl b/gi/pipeline/backoff-pipe.pl
new file mode 100644
index 00000000..d03b43be
--- /dev/null
+++ b/gi/pipeline/backoff-pipe.pl
@@ -0,0 +1,248 @@
+#!/usr/bin/perl -w
+use strict;
+
+# Builds a hierarchical backoff grammar from grammars induced with different
+# numbers of categories. Under <outprefix>/hier<N1>-<N2>-... it writes the
+# joined grammar, the backoff rules and the glue rules (all gzipped), then
+# prints the three paths on STDOUT.
+
+use File::Path "mkpath";
+use Getopt::Long "GetOptions";
+
+my @grammars;
+my $OUTPUTPREFIX = './giwork/bo.hier.grammar';
+my $backoff_levels = 1;
+my $glue_levels = 1;
+
+usage() unless GetOptions('grmr=s@' => \ @grammars,
+                          'outprefix=s' => \ $OUTPUTPREFIX,
+                          'bo-lvls=i' => \ $backoff_levels,
+                          'glue-lvls=i' => \ $glue_levels,
+);
+# Without at least one grammar there is nothing to join.
+usage() unless @grammars;
+
+my $OUTDIR = $OUTPUTPREFIX . '/hier';
+
+# Category count => grammar path. The count is the <N> of the ".t<N>."
+# component in the grammar's file name.
+my %grmr = ();
+foreach my $grammar (@grammars) {
+  $grammar =~ m/\/[^\/]*\.t(\d+)\.[^\/]*$/
+    or die "Cannot find a '.t<N>.' component in the file name of $grammar\n";
+  $grmr{$1} = $grammar;
+}
+
+# Sort numerically: a string sort puts 100 before 25, which would break the
+# coarse-to-fine order that the loop below hands to extract_freqs.
+my @index = sort { $a <=> $b } keys %grmr;
+$OUTDIR = $OUTDIR . join('-',@index);
+my $BACKOFF_GRMR = $OUTDIR . '/backoff.gz';
+my $GLUE_GRMR = $OUTDIR . '/glue.gz';
+my $joinedgrammars = $OUTDIR . '/grammar.hier.gz';
+
+mkpath($OUTDIR) unless -d $OUTDIR;
+# Start both rule files as empty gzip streams: a rerun then does not
+# duplicate rules, and both paths exist even if no rule gets written.
+reset_gz($BACKOFF_GRMR);
+reset_gz($GLUE_GRMR);
+
+join_grammars();
+
+for my $i (0..(scalar @index)-2) {
+  my $freqs = extract_freqs($index[$i], $index[$i+1]);
+  if ($i < $backoff_levels) {
+    create_backoff_rules($index[$i],$index[$i+1],$freqs);
+  }
+  if ($i < $glue_levels) {
+    add_glue_rules($index[$i]);
+  }
+}
+
+output_grammar_info();
+
+
+# Prints the command-line help and exits with status 1.
+sub usage {
+  print <<EOT;
+
+Usage: $0 --grmr GRAMMAR [--grmr GRAMMAR ...] [OPTIONS]
+
+Joins grammars induced with different numbers of categories into one
+hierarchical grammar and writes backoff and glue rules for it.
+
+Options:
+  --grmr FILE       gzipped grammar; repeat once per grammar. The file name
+                    must contain .t<N>. where N is the number of categories
+  --outprefix DIR   output prefix (default: ./giwork/bo.hier.grammar)
+  --bo-lvls N       number of levels that get backoff rules (default: 1)
+  --glue-lvls N     number of levels that get glue rules (default: 1)
+
+EOT
+  exit 1;
+};
+
+
+# Runs a command after echoing it on STDERR. Exits the script if the command
+# cannot be started or dies from a signal; otherwise returns true for exit
+# code 0 and false for any other exit code.
+sub safesystem {
+  print STDERR "Executing: @_\n";
+  system(@_);
+  if ($? == -1) {
+    print STDERR "ERROR: Failed to execute: @_\n $!\n";
+    exit(1);
+  }
+  elsif ($? & 127) {
+    printf STDERR "ERROR: Execution of: @_\n died with signal %d, %s coredump\n",
+      ($? & 127), ($? & 128) ? 'with' : 'without';
+    exit(1);
+  }
+  else {
+    my $exitcode = $? >> 8;
+    print STDERR "Exit code: $exitcode\n" if $exitcode;
+    return ! $exitcode;
+  }
+}
+
+
+# Truncates $file to an empty gzip stream.
+sub reset_gz {
+  my ($file) = @_;
+  safesystem("gzip -c < /dev/null > $file") or die "Cannot create $file\n";
+}
+
+
+# Appends @lines to $file as a further gzip member. Nothing is run when
+# @lines is empty.
+sub append_gz {
+  my ($file, @lines) = @_;
+  return unless @lines;
+  open my $gz, '|-', "gzip -c >> $file" or die "Cannot run gzip for $file: $!";
+  print $gz @lines;
+  close $gz or die "gzip failed while appending to $file (status $?)\n";
+}
+
+
+# Writes $joinedgrammars: a blank first line, then every input grammar with
+# each X<k> renamed to X<k>-<N> (N = that grammar's category count).
+sub join_grammars {
+  safesystem("echo \"\" | gzip > $joinedgrammars")
+    or die "Cannot create $joinedgrammars\n";
+  foreach my $i (@index) {
+    my $g = $grmr{$i};
+    # Append the renamed rules as a further gzip member and leave the input
+    # alone: a pipeline that redirects onto the file it is reading has that
+    # file truncated by the shell before it is read.
+    safesystem("zcat $g | sed -r -e 's/(X[0-9]+)/\\1-$i/g' - | gzip >> $joinedgrammars")
+      or die "Cannot add $g to $joinedgrammars\n";
+  }
+}
+
+
+# Returns the labeled_spans.txt path for a grammar: the grammar path up to
+# ".grammar/" followed by "/labeled_spans.txt".
+sub labeled_spans_for {
+  my ($grammar) = @_;
+  my $cut = index($grammar, ".grammar/");
+  die "Expected '.grammar/' in grammar path $grammar\n" if $cut < 0;
+  return substr($grammar, 0, $cut) . "/labeled_spans.txt";
+}
+
+
+# Counts, over the pasted span files of grammars $grmr1 (coarse) and $grmr2
+# (fine), how often each coarse category is paired with each fine category,
+# and writes one line per coarse category:
+#   <coarse> ||| <fine>:<log relative frequency> ...
+# Returns the path of the file written.
+sub extract_freqs {
+  my($grmr1,$grmr2) = @_;
+  print STDERR "\n!!!EXTRACTING FREQUENCIES: $grmr1->$grmr2\n";
+  my $IN_COARSE = labeled_spans_for($grmr{$grmr1});
+  my $IN_FINE = labeled_spans_for($grmr{$grmr2});
+  my $OUT_SPANS = "$OUTDIR/labeled_spans.hier$grmr1-$grmr2.txt";
+  my $FREQS = "$OUTDIR/label_freq.hier$grmr1-$grmr2.txt";
+  # Per-call counts, so different level pairs never share a table.
+  my %freq_hier = ();
+  if (-e $OUT_SPANS) {
+    print STDERR "$OUT_SPANS exists, reusing...\n";
+  } else {
+    safesystem("paste -d ' ' $IN_COARSE $IN_FINE > $OUT_SPANS")
+      or die "Cannot create $OUT_SPANS\n";
+  }
+  open my $spans, '<', $OUT_SPANS or die "Cannot read $OUT_SPANS: $!";
+  while (my $line = <$spans>) {
+    my ($tmp, $coarse, $fine) = split /\|\|\|/, $line;
+    next unless defined $fine;
+    my @coarse_spans = $coarse =~ /\d+-\d+:X(\d+)/g;
+    my @fine_spans = $fine =~ /\d+-\d+:X(\d+)/g;
+
+    foreach my $i (0..$#coarse_spans) {
+      # Skip positions without a fine counterpart.
+      next unless defined $fine_spans[$i];
+      $freq_hier{$coarse_spans[$i]}{$fine_spans[$i]}++;
+    }
+  }
+  close $spans;
+  open my $out, '>', $FREQS or die "Cannot write $FREQS: $!";
+  foreach my $coarse_cat (sort { $a <=> $b } keys %freq_hier) {
+    my $fine_count = $freq_hier{$coarse_cat};
+    my $total = 0;
+    $total += $_ for (values %$fine_count);
+    print $out "$coarse_cat |||";
+    foreach my $fine_cat (sort { $a <=> $b } keys %$fine_count) {
+      my $freq = log($fine_count->{$fine_cat}/$total);
+      print $out " $fine_cat:$freq";
+    }
+    print $out "\n";
+  }
+  close $out or die "Cannot finish writing $FREQS: $!";
+  return $FREQS;
+}
+
+
+# Appends to $BACKOFF_GRMR one rule for every <fine>:<value> entry of the
+# frequency file $freq, from coarse category X<c>-$grmr1 to fine category
+# X<f>-$grmr2, with the entry's value as the BackoffRule feature.
+sub create_backoff_rules {
+  my ($grmr1, $grmr2, $freq) = @_;
+  my @rules;
+  open my $in, '<', $freq or die "Cannot read $freq: $!";
+  while (my $line = <$in>) {
+    # List assignment, so $coarse receives the captured category and not
+    # the match's truth value.
+    my ($coarse) = $line =~ m/^(\d+) \|\|\|/;
+    next unless defined $coarse;
+    # (category, value) pairs. The optional exponent covers values that
+    # Perl prints in scientific notation, e.g. -1.5e-05.
+    my @finefreq = $line =~ m/(\d+):(-?\d+\.?\d*(?:[eE][-+]?\d+)?)/g;
+    for(my $i = 0; $i < scalar @finefreq; $i+=2) {
+      my $finecat = $finefreq[$i];
+      my $value = $finefreq[$i+1];
+      # NOTE(review): the 2nd and 3rd field are separated by a tab here, not
+      # by " ||| " as in add_glue_rules - kept as is, confirm the format.
+      push @rules, "[X$coarse-$grmr1] ||| [X$finecat-$grmr2,1]\t[1] ||| BackoffRule=$value\n";
+    }
+  }
+  close $in;
+  append_gz($BACKOFF_GRMR, @rules);
+}
+
+# Appends to $GLUE_GRMR a Glue and a GlueTop rule for each of the categories
+# X0-$grmr .. X<$grmr-1>-$grmr.
+sub add_glue_rules {
+  my ($grmr) = @_;
+  my @rules;
+  for my $i (0..($grmr-1)) {
+    push @rules, "[S] ||| [S,1] [X$i-$grmr,2] ||| [1] [2] ||| Glue=1\n";
+    push @rules, "[S] ||| [X$i-$grmr,1] ||| [1] ||| GlueTop=1\n";
+  }
+  append_gz($GLUE_GRMR, @rules);
+}
+
+# Prints the paths of the three files this script produces on STDOUT.
+sub output_grammar_info {
+  print STDOUT "GRAMMAR: \t$joinedgrammars\n";
+  print STDOUT "GLUE: \t$GLUE_GRMR\n";
+  print STDOUT "BACKOFF: \t$BACKOFF_GRMR\n";
+}
diff --git a/gi/pipeline/evaluation-pipeline.pl b/gi/pipeline/evaluation-pipeline.pl
index 178159b9..64803fd0 100755
--- a/gi/pipeline/evaluation-pipeline.pl
+++ b/gi/pipeline/evaluation-pipeline.pl
@@ -120,9 +120,11 @@ my $FEATURIZER_OPTS = '';
 my $dataDir = '/export/ws10smt/data';
 my @features;
 my $bkoffgram;
+my $gluegram;
 my $usefork;
 if (GetOptions(
- "backoff_grammar" => \$bkoffgram,
+ "backoff_grammar=s" => \$bkoffgram,
+ "glue_grammar=s" => \$gluegram,
  "data=s" => \$dataDir,
  "features=s@" => \@features,
  "use-fork" => \$usefork,
@@ -178,13 +180,21 @@ print 
STDERR "\nCREATING INITIAL WEIGHTS FILE: weights.init\n"; my $weights = mydircat($outdir, "weights.init"); write_random_weights_file($weights, @xfeats); +my $bkoff_grmr; +my $glue_grmr; +if($bkoffgram) { + $bkoff_grmr = mydircat($outdir, "backoff.scfg.gz"); + safesystem("cp $bkoffgram $bkoff_grmr"); +} +if($gluegram) { + $glue_grmr = mydircat($outdir, "glue.bo.scfg.gz"); + safesystem("cp $gluegram $glue_grmr"); +} + # MAKE DEV print STDERR "\nFILTERING FOR dev...\n"; print STDERR "DEV: $dev (REFS=$drefs)\n"; my $devgrammar = filter($grammar, $dev, 'dev', $outdir); -if($bkoffgram) { - $devgrammar = add_backoff($devgrammar, $numtopics, 'dev', $outdir); -} my $devini = mydircat($outdir, "cdec-dev.ini"); write_cdec_ini($devini, $devgrammar); @@ -194,9 +204,6 @@ print STDERR "\nFILTERING FOR test...\n"; print STDERR "TEST: $test (EVAL=$teval)\n"; `mkdir -p $outdir`; my $testgrammar = filter($grammar, $test, 'test', $outdir); -if($bkoffgram) { - $testgrammar = add_backoff($testgrammar, $numtopics, 'test', $outdir); -} my $testini = mydircat($outdir, "cdec-test.ini"); write_cdec_ini($testini, $testgrammar); @@ -296,11 +303,12 @@ sub mydircat { sub write_cdec_ini { my ($filename, $grammar_path) = (@_); open CDECINI, ">$filename" or die "Can't write $filename: $!"; + my $glue = ($gluegram ? 
"$glue_grmr" : "$datadir/glue/glue.scfg.gz"); print CDECINI <<EOT; formalism=scfg cubepruning_pop_limit=100 add_pass_through_rules=true -scfg_extra_glue_grammar=$datadir/glue/glue.scfg.gz +scfg_extra_glue_grammar=$glue grammar=$datadir/oov.scfg.gz grammar=$grammar_path scfg_default_nt=OOV @@ -308,6 +316,7 @@ scfg_no_hiero_glue_grammar=true feature_function=WordPenalty feature_function=LanguageModel -o 3 $LANG_MODEL EOT + print CDECINI "grammar=$bkoff_grmr\n" if $bkoffgram; close CDECINI; }; diff --git a/gi/pipeline/local-gi-pipeline.pl b/gi/pipeline/local-gi-pipeline.pl index 34948880..6b58ab7c 100755 --- a/gi/pipeline/local-gi-pipeline.pl +++ b/gi/pipeline/local-gi-pipeline.pl @@ -102,10 +102,14 @@ my $CLUSTER_DIR = $OUTPUT . '/' . cluster_dir(); my $LABELED_DIR = $OUTPUT . '/' . labeled_dir(); my $CLUSTER_DIR_C; my $CLUSTER_DIR_F; +my $LABELED_DIR_C; +my $LABELED_DIR_F; if($HIER_CAT) { $CLUSTER_DIR_F = $CLUSTER_DIR; + $LABELED_DIR_F = $LABELED_DIR; $NUM_TOPICS = $NUM_TOPICS_COARSE; $CLUSTER_DIR_C = $OUTPUT . '/' . cluster_dir(); + $LABELED_DIR_C = $OUTPUT . '/' . labeled_dir(); $NUM_TOPICS = $NUM_TOPICS_FINE; } my $GRAMMAR_DIR = $OUTPUT . '/' . 
grammar_dir(); @@ -115,7 +119,8 @@ safemkdir($DATA_DIR) or die "Couldn't create output directory $DATA_DIR: $!"; safemkdir($CONTEXT_DIR) or die "Couldn't create output directory $CONTEXT_DIR: $!"; safemkdir($CLUSTER_DIR) or die "Couldn't create output directory $CLUSTER_DIR: $!"; if($HIER_CAT) { - safemkdir($CLUSTER_DIR_C) or die "Couldn't create output directory $CLUSTER_DIR: $!"; + safemkdir($CLUSTER_DIR_C) or die "Couldn't create output directory $CLUSTER_DIR_C: $!"; + safemkdir($LABELED_DIR_C) or die "Couldn't create output directory $LABELED_DIR_C: $!"; } safemkdir($LABELED_DIR) or die "Couldn't create output directory $LABELED_DIR: $!"; safemkdir($GRAMMAR_DIR) or die "Couldn't create output directory $GRAMMAR_DIR: $!"; @@ -143,9 +148,11 @@ if (lc($MODEL) eq "pyp") { if($HIER_CAT) { $NUM_TOPICS = $NUM_TOPICS_COARSE; $CLUSTER_DIR = $CLUSTER_DIR_C; + $LABELED_DIR = $LABELED_DIR_C; label_spans_with_topics(); $NUM_TOPICS = $NUM_TOPICS_FINE; $CLUSTER_DIR = $CLUSTER_DIR_F; + $LABELED_DIR = $LABELED_DIR_F; label_spans_with_topics(); extract_freqs(); } else { @@ -297,10 +304,10 @@ sub label_spans_with_topics { sub extract_freqs { print STDERR "\n!!!EXTRACTING FREQUENCIES\n"; - my $IN_COARSE = "$CLUSTER_DIR_C/labeled_spans.txt"; - my $IN_FINE = "$CLUSTER_DIR_F/labeled_spans.txt"; - my $OUT_SPANS = "$CLUSTER_DIR_F/labeled_spans.hier$NUM_TOPICS_COARSE-$NUM_TOPICS_FINE.txt"; - my $FREQS = "$CLUSTER_DIR_F/label_freq.hier$NUM_TOPICS_COARSE-$NUM_TOPICS_FINE.txt"; + my $IN_COARSE = "$LABELED_DIR_C/labeled_spans.txt"; + my $IN_FINE = "$LABELED_DIR_F/labeled_spans.txt"; + my $OUT_SPANS = "$LABELED_DIR_F/labeled_spans.hier$NUM_TOPICS_COARSE-$NUM_TOPICS_FINE.txt"; + my $FREQS = "$LABELED_DIR_F/label_freq.hier$NUM_TOPICS_COARSE-$NUM_TOPICS_FINE.txt"; my $COARSE_EXPR = "\'s/\\(X[0-9][0-9]*\\)/\\1c/g\'"; #' my $FINE_EXPR = "\'s/\\(X[0-9][0-9]*\\)/\\1f/g\'"; #' my %finehier = (); @@ -341,9 +348,9 @@ sub extract_freqs { } print FREQS "\n"; } - foreach my $fine_cat (keys %finehier) { - 
print FREQS "$fine_cat -> $finehier{$fine_cat}\n"; - } +# foreach my $fine_cat (keys %finehier) { +# print FREQS "$fine_cat -> $finehier{$fine_cat}\n"; +# } close FREQS; $CLUSTER_DIR = $CLUSTER_DIR_F; } @@ -364,7 +371,7 @@ sub grammar_extract { sub grammar_extract_bidir { #gzcat ex.output.gz | ./mr_stripe_rule_reduce -p -b | sort -t $'\t' -k 1 | ./mr_stripe_rule_reduce | gzip > phrase-table.gz - my $LABELED = ($HIER_CAT ? "$CLUSTER_DIR_F/corpus.src_trg_al_label.hier" : "$LABELED_DIR/corpus.src_trg_al_label"); + my $LABELED = "$LABELED_DIR/corpus.src_trg_al_label"; print STDERR "\n!!!EXTRACTING GRAMMAR\n"; my $OUTGRAMMAR = "$GRAMMAR_DIR/grammar.bidir.gz"; if (-e $OUTGRAMMAR) { |