From e26434979adc33bd949566ba7bf02dff64e80a3e Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Tue, 2 Oct 2012 00:19:43 -0400 Subject: cdec cleanup, remove bayesian stuff, parsing stuff --- gi/pipeline/scripts/filter-by-f.pl | 56 -------------------- gi/pipeline/scripts/patch-corpus.pl | 65 ------------------------ gi/pipeline/scripts/refilter.pl | 40 --------------- gi/pipeline/scripts/rekey.pl | 8 --- gi/pipeline/scripts/remove-tags-from-contexts.pl | 53 ------------------- gi/pipeline/scripts/remove-tags-from-corpus.pl | 44 ---------------- gi/pipeline/scripts/sort-by-key.sh | 5 -- gi/pipeline/scripts/xfeats.pl | 39 -------------- 8 files changed, 310 deletions(-) delete mode 100755 gi/pipeline/scripts/filter-by-f.pl delete mode 100755 gi/pipeline/scripts/patch-corpus.pl delete mode 100755 gi/pipeline/scripts/refilter.pl delete mode 100755 gi/pipeline/scripts/rekey.pl delete mode 100755 gi/pipeline/scripts/remove-tags-from-contexts.pl delete mode 100755 gi/pipeline/scripts/remove-tags-from-corpus.pl delete mode 100755 gi/pipeline/scripts/sort-by-key.sh delete mode 100755 gi/pipeline/scripts/xfeats.pl (limited to 'gi/pipeline/scripts') diff --git a/gi/pipeline/scripts/filter-by-f.pl b/gi/pipeline/scripts/filter-by-f.pl deleted file mode 100755 index 0cef0606..00000000 --- a/gi/pipeline/scripts/filter-by-f.pl +++ /dev/null @@ -1,56 +0,0 @@ -#!/usr/bin/perl -w -use strict; - -my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR; } - -my $REKEY="$SCRIPT_DIR/rekey.pl"; -my $REFILTER="$SCRIPT_DIR/refilter.pl"; -my $SORT="$SCRIPT_DIR/sort-by-key.sh"; -assert_exec($REKEY, $REFILTER, $SORT); - - -die "Usage: $0 NUM-TRANSLATIONS ingrammar.gz outgrammar.gz\n" unless scalar @ARGV == 3; -my $translations = shift @ARGV; -die "Need number: $translations" unless $translations > 0; -die unless $ARGV[0] =~ /\.gz$/; -die unless $ARGV[1] =~ /\.gz$/; -die if $ARGV[0] eq $ARGV[1]; -die "Can't find $ARGV[0]" unless -f $ARGV[0]; - -my $cmd = "gunzip -c $ARGV[0] | $REKEY | $SORT | $REFILTER $translations | gzip > $ARGV[1]"; -safesystem($ARGV[1], $cmd) or die "Filtering failed"; -exit 0; - -sub assert_exec { - my @files = @_; - for my $file (@files) { - die "Can't find $file - did you run make?\n" unless -e $file; - die "Can't execute $file" unless -e $file; - } -}; - -sub safesystem { - my $output = shift @_; - print STDERR "Executing: @_\n"; - system(@_); - if ($? == -1) { - print STDERR "ERROR: Failed to execute: @_\n $!\n"; - if (defined $output && -e $output) { printf STDERR "Removing $output\n"; `rm -rf $output`; } - exit(1); - } - elsif ($? & 127) { - printf STDERR "ERROR: Execution of: @_\n died with signal %d, %s coredump\n", - ($? & 127), ($? & 128) ? 'with' : 'without'; - if (defined $output && -e $output) { printf STDERR "Removing $output\n"; `rm -rf $output`; } - exit(1); - } - else { - my $exitcode = $? >> 8; - if ($exitcode) { - print STDERR "Exit code: $exitcode\n"; - if (defined $output && -e $output) { printf STDERR "Removing $output\n"; `rm -rf $output`; } - } - return ! $exitcode; - } -} - diff --git a/gi/pipeline/scripts/patch-corpus.pl b/gi/pipeline/scripts/patch-corpus.pl deleted file mode 100755 index c0eec43e..00000000 --- a/gi/pipeline/scripts/patch-corpus.pl +++ /dev/null @@ -1,65 +0,0 @@ -#!/usr/bin/perl -w -use strict; - -my $PATCH = shift @ARGV; -my $TGT = 1; -my $APPEND; -while ($PATCH eq "-s" || $PATCH eq "-a") { - if ($PATCH eq "-s") { - undef $TGT; - } else { - $APPEND = 1; - } - $PATCH = shift @ARGV; -} - -die "Usage: $0 [-s] [-a] tagged.en[_fr] < lexical.en_fr_al[_...]\n" unless $PATCH; - -open P, "<$PATCH" or die "Can't read tagged corpus $PATCH: $!"; -my $first=

; close P; -my @fields = split / \|\|\| /, $first; -die "Bad format!" if (scalar @fields > 2); - -if (scalar @fields != 1) { - # TODO support this - die "Patching source and target not supported yet!"; -} - -my $line = 0; -open P, "<$PATCH" or die "Can't read tagged corpus $PATCH: $!"; -while(my $pline =

) { - chomp $pline; - $line++; - my $line = <>; - die "Too few lines in lexical corpus!" unless $line; - chomp $line; - @fields = split / \|\|\| /, $line; - my @pwords = split /\s+/, $pline; - if ($TGT) { - my @lwords = split /\s+/, $fields[1]; - die "Length mismatch in line $line!\n" unless (scalar @pwords == scalar @lwords); - if ($APPEND) { - foreach my $i (0..(scalar @pwords-1)) { - $lwords[$i] = $lwords[$i] . '_' . $pwords[$i]; - } - $fields[1] = join ' ', @lwords; - } else { - $fields[1] = $pline; - } - } else { # source side - my @lwords = split /\s+/, $fields[0]; - die "Length mismatch in line $line!\n" unless (scalar @pwords == scalar @lwords); - if ($APPEND) { - foreach my $i (0..(scalar @pwords-1)) { - $lwords[$i] = $lwords[$i] . '_' . $pwords[$i]; - } - $fields[0] = join ' ', @lwords; - } else { - $fields[0] = $pline; - } - } - print join ' ||| ', @fields; - print "\n"; -} - - diff --git a/gi/pipeline/scripts/refilter.pl b/gi/pipeline/scripts/refilter.pl deleted file mode 100755 index a783eb4e..00000000 --- a/gi/pipeline/scripts/refilter.pl +++ /dev/null @@ -1,40 +0,0 @@ -#!/usr/bin/perl -w -use strict; - -my $NUM_TRANSLATIONS = shift @ARGV; -unless ($NUM_TRANSLATIONS) { $NUM_TRANSLATIONS=30; } -print STDERR "KEEPING $NUM_TRANSLATIONS TRANSLATIONS FOR SOURCE\n"; - -my $pk = ''; -my %dict; -while(<>) { - s/^(.+)\t//; - my $key = $1; - if ($key ne $pk) { - if ($pk) { - emit_dict(); - } - %dict = (); - $pk = $key; - } - my ($lhs, $f, $e, $s) = split / \|\|\| /; - my $score = 0; - if ($s =~ /XEF=([^ ]+)/) { - $score += $1; - } else { die; } - if ($s =~ /GenerativeProb=([^ ]+)/) { - $score += ($1 / 10); - } else { die; } - $dict{"$lhs ||| $f ||| $e ||| $s"} = $score; -} -emit_dict(); - -sub emit_dict { - my $cc = 0; - for my $k (sort { $dict{$a} <=> $dict{$b} } keys %dict) { - print "$k"; - $cc++; - if ($cc >= $NUM_TRANSLATIONS) { last; } - } -} - diff --git a/gi/pipeline/scripts/rekey.pl b/gi/pipeline/scripts/rekey.pl deleted file mode 100755 index 31eb86b8..00000000 --- a/gi/pipeline/scripts/rekey.pl +++ /dev/null @@ -1,8 +0,0 @@ -#!/usr/bin/perl - -while(<>) { - my ($lhs, $f, $e, $s) = split / \|\|\| /; - $f =~ s/\[X[0-9]+\]/\[X\]/g; - print "$f\t$_"; -} - diff --git a/gi/pipeline/scripts/remove-tags-from-contexts.pl b/gi/pipeline/scripts/remove-tags-from-contexts.pl deleted file mode 100755 index 20698816..00000000 --- a/gi/pipeline/scripts/remove-tags-from-contexts.pl +++ /dev/null @@ -1,53 +0,0 @@ -#!/usr/bin/perl -w -use strict; - -use Getopt::Long "GetOptions"; - -my $PHRASE = 'tok'; -my $CONTEXT = 'tag'; - -die "Usage: $0 [--phrase=tok|tag] [--context=tok|tag] < corpus" - unless &GetOptions('phrase=s' => \$PHRASE, 'context=s' => \$CONTEXT); - -my $lno = 0; -while(my $line = <>) { - $lno++; - chomp $line; - my @top = split /\t/, $line; - die unless (scalar @top == 2); - - my @pwords = split /\s+/, $top[0]; - foreach my $token (@pwords) { - #print $token . "\n"; - my @parts = split /_(?!.*_)/, $token; - die unless (scalar @parts == 2); - if ($PHRASE eq "tok") { - $token = $parts[0] - } elsif ($PHRASE eq "tag") { - $token = $parts[1] - } - } - - my @fields = split / \|\|\| /, $top[1]; - foreach my $i (0..((scalar @fields) / 2 - 1)) { - #print $i . ": " . $fields[2*$i] . " of " . (scalar @fields) . "\n"; - my @cwords = split /\s+/, $fields[2*$i]; - foreach my $token (@cwords) { - #print $i . ": " . $token . "\n"; - my @parts = split /_(?!.*_)/, $token; - if (scalar @parts == 2) { - if ($CONTEXT eq "tok") { - $token = $parts[0] - } elsif ($CONTEXT eq "tag") { - $token = $parts[1] - } - } - } - $fields[2*$i] = join ' ', @cwords; - } - - print join ' ', @pwords; - print "\t"; - print join ' ||| ', @fields; - print "\n"; -} diff --git a/gi/pipeline/scripts/remove-tags-from-corpus.pl b/gi/pipeline/scripts/remove-tags-from-corpus.pl deleted file mode 100755 index be3e97c0..00000000 --- a/gi/pipeline/scripts/remove-tags-from-corpus.pl +++ /dev/null @@ -1,44 +0,0 @@ -#!/usr/bin/perl -w -use strict; - -use Getopt::Long "GetOptions"; - -my $LANGUAGE = shift @ARGV; -$LANGUAGE = 'target' unless ($LANGUAGE); - -my $lno = 0; -while(my $line = <>) { - $lno++; - chomp $line; - - my @fields = split / \|\|\| /, $line; - - if ($LANGUAGE eq "source" or $LANGUAGE eq "both") { - my @cwords = split /\s+/, $fields[0]; - foreach my $token (@cwords) { - my @parts = split /_(?!.*_)/, $token; - if (scalar @parts == 2) { - $token = $parts[0] - } else { - print STDERR "WARNING: invalid tagged token $token\n"; - } - } - $fields[0] = join ' ', @cwords; - } - - if ($LANGUAGE eq "target" or $LANGUAGE eq "both") { - my @cwords = split /\s+/, $fields[1]; - foreach my $token (@cwords) { - my @parts = split /_(?!.*_)/, $token; - if (scalar @parts == 2) { - $token = $parts[1] - } else { - print STDERR "WARNING: invalid tagged token $token\n"; - } - } - $fields[0] = join ' ', @cwords; - } - - print join ' ||| ', @fields; - print "\n"; -} diff --git a/gi/pipeline/scripts/sort-by-key.sh b/gi/pipeline/scripts/sort-by-key.sh deleted file mode 100755 index 7ae33e03..00000000 --- a/gi/pipeline/scripts/sort-by-key.sh +++ /dev/null @@ -1,5 +0,0 @@ -#!/bin/bash - -export LANG=C -sort -t $'\t' -k 1 -T /tmp -S 6000000000 - diff --git a/gi/pipeline/scripts/xfeats.pl b/gi/pipeline/scripts/xfeats.pl deleted file mode 100755 index dc578513..00000000 --- a/gi/pipeline/scripts/xfeats.pl +++ /dev/null @@ -1,39 +0,0 @@ -#!/usr/bin/perl -w -use strict; - -die "Usage: $0 x-grammar.scfg[.gz] < cat-grammar.scfg\n" unless scalar @ARGV > 0; - -my $xgrammar = shift @ARGV; -die "Can't find $xgrammar" unless -f $xgrammar; -my $fh; -if ($xgrammar =~ /\.gz$/) { - open $fh, "gunzip -c $xgrammar|" or die "Can't fork: $!"; -} else { - open $fh, "<$xgrammar" or die "Can't read $xgrammar: $!"; -} -print STDERR "Reading X-feats from $xgrammar...\n"; -my %dict; -while(<$fh>) { - chomp; - my ($lhs, $f, $e, $feats) = split / \|\|\| /; - my $xfeats; - my $cc = 0; - my @xfeats = (); - while ($feats =~ /(EGivenF|FGivenE|LogRuleCount|LogECount|LogFCount|SingletonRule|SingletonE|SingletonF)=([^ ]+)( |$)/og) { - push @xfeats, "X_$1=$2"; - } - #print "$lhs ||| $f ||| $e ||| @xfeats\n"; - $dict{"$lhs ||| $f ||| $e"} = "@xfeats"; -} -close $fh; - -print STDERR "Add features...\n"; -while(<>) { - chomp; - my ($lhs, $f, $e) = split / \|\|\| /; - $f=~ s/\[[^]]+,([12])\]/\[X,$1\]/g; - my $xfeats = $dict{"[X] ||| $f ||| $e"}; - die "Can't find x features for: $_\n" unless $xfeats; - print "$_ $xfeats\n"; -} - -- cgit v1.2.3