#!/usr/bin/perl -w use strict; use Getopt::Long "GetOptions"; my @grammars; my $OUTPUTPREFIX = './giwork/bo.hier.grammar'; safemkdir($OUTPUTPREFIX); my $backoff_levels = 1; my $glue_levels = 1; usage() unless &GetOptions('grmr=s@' => \ @grammars, 'outprefix=s' => \ $OUTPUTPREFIX, 'bo-lvls=i' => \ $backoff_levels, 'glue-lvls=i' => \ $glue_levels, ); my $OUTDIR = $OUTPUTPREFIX . '/hier'; print STDERR "@grammars\n"; my %grmr = (); foreach my $grammar (@grammars) { $grammar =~ m/\/[^\/]*\.t(\d+)\.[^\/]*/; $grmr{$1} = $grammar; } my @index = sort keys %grmr; $OUTDIR = $OUTDIR . join('-',@index); safemkdir($OUTDIR); my $BACKOFF_GRMR = $OUTDIR . '/backoff.hier.gz'; safesystem("echo \"\" | gzip > $BACKOFF_GRMR"); my $GLUE_GRMR = $OUTDIR . '/glue.hier.gz'; safesystem("echo \"\" | gzip > $GLUE_GRMR"); my $joinedgrammars = $OUTDIR . '/grammar.hier.gz'; join_grammars(); for my $i (0..(scalar @index)-2) { my $freqs = extract_freqs($index[$i], $index[$i+1]); if ($i < $backoff_levels) { create_backoff_rules($index[$i],$index[$i+1],$freqs); } if ($i < $glue_levels) { add_glue_rules($index[$i]); } } output_grammar_info(); sub usage { print <<EOT; Usage: $0 [OPTIONS] corpus.fr-en-al Induces a grammar using Pitman-Yor topic modeling or Posterior Regularisation. EOT exit 1; }; sub safemkdir { my $dir = shift; if (-d $dir) { return 1; } return mkdir($dir); } sub safesystem { print STDERR "Executing: @_\n"; system(@_); if ($? == -1) { print STDERR "ERROR: Failed to execute: @_\n $!\n"; exit(1); } elsif ($? & 127) { printf STDERR "ERROR: Execution of: @_\n died with signal %d, %s coredump\n", ($? & 127), ($? & 128) ? 'with' : 'without'; exit(1); } else { my $exitcode = $? >> 8; print STDERR "Exit code: $exitcode\n" if $exitcode; return ! $exitcode; } } sub join_grammars { print STDERR "\n!!! JOINING GRAMMARS\n"; if(-e $joinedgrammars) { print STDERR "$joinedgrammars exists, reusing...\n"; return; } safesystem("echo \"\" | gzip > $joinedgrammars"); foreach my $i (@index) { my $g = $grmr{$i}; safesystem("zcat $g | sed -r -e 's/X([0-9]+)/X$i\\1/g' - | gzip > $g.2.gz"); safesystem("zcat $joinedgrammars $g.2.gz | gzip > $joinedgrammars.2.gz"); safesystem("mv $joinedgrammars.2.gz $joinedgrammars"); } } sub extract_freqs { my($grmr1,$grmr2) = @_; print STDERR "\n!!!EXTRACTING FREQUENCIES: $grmr1->$grmr2\n"; my $IN_COARSE = substr($grmr{$grmr1},0,index($grmr{$grmr1},".grammar/")) . "/labeled_spans.txt"; my $IN_FINE = substr($grmr{$grmr2},0,index($grmr{$grmr2},".grammar/")) . "/labeled_spans.txt"; my $OUT_SPANS = "$OUTDIR/labeled_spans.hier$grmr1-$grmr2.txt"; my $FREQS = "$OUTDIR/label_freq.hier$grmr1-$grmr2.txt"; if(-e $OUT_SPANS && -e $FREQS) { print STDERR "$OUT_SPANS exists, reusing...\n"; print STDERR "$FREQS exists, reusing...\n"; return $FREQS; } safesystem("paste -d ' ' $IN_COARSE $IN_FINE > $OUT_SPANS"); my %FREQ_HIER = (); my %finehier = (); open SPANS, $OUT_SPANS or die $!; while (<SPANS>) { my ($tmp, $coarse, $fine) = split /\|\|\|/; my @coarse_spans = $coarse =~ /\d+-\d+:X(\d+)/g; my @fine_spans = $fine =~ /\d+-\d+:X(\d+)/g; foreach my $i (0..(scalar @coarse_spans)-1) { my $coarse_cat = $coarse_spans[$i]; my $fine_cat = $fine_spans[$i]; $FREQ_HIER{$coarse_cat}{$fine_cat}++; } } close SPANS; foreach (values %FREQ_HIER) { my $coarse_freq = $_; my $total = 0; $total+=$_ for (values %{ $coarse_freq }); $coarse_freq->{$_}=log($coarse_freq->{$_}/$total) for (keys %{ $coarse_freq }); } open FREQS, ">", $FREQS or die $!; foreach my $coarse_cat (keys %FREQ_HIER) { print FREQS "$coarse_cat |||"; foreach my $fine_cat (keys %{$FREQ_HIER{$coarse_cat}}) { my $freq = $FREQ_HIER{$coarse_cat}{$fine_cat}; print FREQS " $fine_cat:$freq"; if(! exists $finehier{$fine_cat} || $finehier{$fine_cat} < $freq) { $finehier{$fine_cat} = $coarse_cat; } } print FREQS "\n"; } # foreach my $fine_cat (keys %finehier) { # print FREQS "$fine_cat -> $finehier{$fine_cat}\n"; # } close FREQS; return $FREQS; } sub create_backoff_rules { print STDERR "\n!!! CREATING BACKOFF RULES\n"; my ($grmr1, $grmr2, $freq) = @_; my $OUTFILE = "$OUTDIR/backoff.hier$grmr1-$grmr2.txt"; if(-e $OUTFILE) { print STDERR "$OUTFILE exists, reusing...\n"; return; } open FREQS, $freq or die $!; open TMP, ">", $OUTFILE or die $!; while (<FREQS>) { my $line = $_; $line = m/^(\d+) \|\|\| (.+)$/; my $coarse = $1; $line = $2; my @finefreq = $line =~ m/(\d+):(\S+)/g; for(my $i = 0; $i < scalar @finefreq; $i+=2) { my $finecat = $finefreq[$i]; my $finefreq = $finefreq[$i+1]; print TMP "[X$grmr1$coarse] ||| [X$grmr2$finecat,1]\t[1] ||| BackoffRule=$finefreq A=0-0\n"; } } close TMP; close FREQS; safesystem("zcat $BACKOFF_GRMR | cat - $OUTFILE | gzip > $BACKOFF_GRMR.2.gz"); safesystem("mv $BACKOFF_GRMR.2.gz $BACKOFF_GRMR"); } sub add_glue_rules { print STDERR "\n!!! CREATING GLUE RULES\n"; my ($grmr) = @_; my $OUTFILE = "$OUTDIR/glue.$grmr.gz"; if (-e $OUTFILE) { print STDERR "$OUTFILE exists, reusing...\n"; return; } open TMP, ">", $OUTFILE or die $!; for my $i (0..($grmr-1)) { print TMP "[S] ||| [S,1] [X$grmr$i,2] ||| [1] [2] ||| Glue=1\n"; print TMP "[S] ||| [X$grmr$i,1] ||| [1] ||| GlueTop=1\n"; } close TMP; safesystem("zcat $GLUE_GRMR | cat - $OUTFILE | gzip > $GLUE_GRMR.2.gz"); safesystem("mv $GLUE_GRMR.2.gz $GLUE_GRMR"); } sub output_grammar_info { print STDERR "\n!!! GRAMMAR INFORMATION\n"; print STDOUT "GRAMMAR: \t$joinedgrammars\n"; print STDOUT "GLUE: \t$GLUE_GRMR\n"; print STDOUT "BACKOFF: \t$BACKOFF_GRMR\n"; }