diff options
| author | Chris Dyer <cdyer@cab.ark.cs.cmu.edu> | 2012-10-02 00:19:43 -0400 | 
|---|---|---|
| committer | Chris Dyer <cdyer@cab.ark.cs.cmu.edu> | 2012-10-02 00:19:43 -0400 | 
| commit | e26434979adc33bd949566ba7bf02dff64e80a3e (patch) | |
| tree | d1c72495e3af6301bd28e7e66c42de0c7a944d1f /gi/pipeline/scripts | |
| parent | 0870d4a1f5e14cc7daf553b180d599f09f6614a2 (diff) | |
cdec cleanup, remove bayesian stuff, parsing stuff
Diffstat (limited to 'gi/pipeline/scripts')
| -rwxr-xr-x | gi/pipeline/scripts/filter-by-f.pl | 56 | ||||
| -rwxr-xr-x | gi/pipeline/scripts/patch-corpus.pl | 65 | ||||
| -rwxr-xr-x | gi/pipeline/scripts/refilter.pl | 40 | ||||
| -rwxr-xr-x | gi/pipeline/scripts/rekey.pl | 8 | ||||
| -rwxr-xr-x | gi/pipeline/scripts/remove-tags-from-contexts.pl | 53 | ||||
| -rwxr-xr-x | gi/pipeline/scripts/remove-tags-from-corpus.pl | 44 | ||||
| -rwxr-xr-x | gi/pipeline/scripts/sort-by-key.sh | 5 | ||||
| -rwxr-xr-x | gi/pipeline/scripts/xfeats.pl | 39 | 
8 files changed, 0 insertions, 310 deletions
| diff --git a/gi/pipeline/scripts/filter-by-f.pl b/gi/pipeline/scripts/filter-by-f.pl deleted file mode 100755 index 0cef0606..00000000 --- a/gi/pipeline/scripts/filter-by-f.pl +++ /dev/null @@ -1,56 +0,0 @@ -#!/usr/bin/perl -w -use strict; - -my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR; } - -my $REKEY="$SCRIPT_DIR/rekey.pl"; -my $REFILTER="$SCRIPT_DIR/refilter.pl"; -my $SORT="$SCRIPT_DIR/sort-by-key.sh"; -assert_exec($REKEY, $REFILTER, $SORT); - - -die "Usage: $0 NUM-TRANSLATIONS ingrammar.gz outgrammar.gz\n" unless scalar @ARGV == 3; -my $translations = shift @ARGV; -die "Need number: $translations" unless $translations > 0; -die unless $ARGV[0] =~ /\.gz$/; -die unless $ARGV[1] =~ /\.gz$/; -die if $ARGV[0] eq $ARGV[1]; -die "Can't find $ARGV[0]" unless -f $ARGV[0]; - -my $cmd = "gunzip -c $ARGV[0] | $REKEY | $SORT | $REFILTER $translations | gzip > $ARGV[1]"; -safesystem($ARGV[1], $cmd) or die "Filtering failed"; -exit 0; - -sub assert_exec { -  my @files = @_; -  for my $file (@files) { -    die "Can't find $file - did you run make?\n" unless -e $file; -    die "Can't execute $file" unless -e $file; -  } -}; - -sub safesystem { -  my $output = shift @_; -  print STDERR "Executing: @_\n"; -  system(@_); -  if ($? == -1) { -      print STDERR "ERROR: Failed to execute: @_\n  $!\n"; -      if (defined $output && -e $output) { printf STDERR "Removing $output\n"; `rm -rf $output`; } -      exit(1); -  } -  elsif ($? & 127) { -      printf STDERR "ERROR: Execution of: @_\n  died with signal %d, %s coredump\n", -          ($? & 127),  ($? & 128) ? 'with' : 'without'; -      if (defined $output && -e $output) { printf STDERR "Removing $output\n"; `rm -rf $output`; } -      exit(1); -  } -  else { -    my $exitcode = $? >> 8; -    if ($exitcode) { -      print STDERR "Exit code: $exitcode\n"; -      if (defined $output && -e $output) { printf STDERR "Removing $output\n"; `rm -rf $output`; } -    } -    return ! $exitcode; -  } -} - diff --git a/gi/pipeline/scripts/patch-corpus.pl b/gi/pipeline/scripts/patch-corpus.pl deleted file mode 100755 index c0eec43e..00000000 --- a/gi/pipeline/scripts/patch-corpus.pl +++ /dev/null @@ -1,65 +0,0 @@ -#!/usr/bin/perl -w -use strict; - -my $PATCH = shift @ARGV; -my $TGT = 1; -my $APPEND; -while ($PATCH eq "-s" || $PATCH eq "-a") { -    if ($PATCH eq "-s") { -        undef $TGT; -    } else { -        $APPEND = 1; -    } -    $PATCH = shift @ARGV; -} - -die "Usage: $0 [-s] [-a] tagged.en[_fr] < lexical.en_fr_al[_...]\n" unless $PATCH; - -open P, "<$PATCH" or die "Can't read tagged corpus $PATCH: $!"; -my $first=<P>; close P; -my @fields = split / \|\|\| /, $first; -die "Bad format!" if (scalar @fields > 2); - -if (scalar @fields != 1) { -  # TODO support this -  die "Patching source and target not supported yet!"; -} - -my $line = 0; -open P, "<$PATCH" or die "Can't read tagged corpus $PATCH: $!"; -while(my $pline = <P>) { -  chomp $pline; -  $line++; -  my $line = <>; -  die "Too few lines in lexical corpus!" unless $line; -  chomp $line; -  @fields = split / \|\|\| /, $line; -  my @pwords = split /\s+/, $pline; -  if ($TGT) { -      my @lwords = split /\s+/, $fields[1]; -      die "Length mismatch in line $line!\n" unless (scalar @pwords == scalar @lwords); -      if ($APPEND) { -          foreach my $i (0..(scalar @pwords-1)) { -              $lwords[$i] = $lwords[$i] . '_' . $pwords[$i]; -          } -          $fields[1] = join ' ', @lwords; -      } else { -          $fields[1] = $pline; -      } -  } else { # source side -      my @lwords = split /\s+/, $fields[0]; -      die "Length mismatch in line $line!\n" unless (scalar @pwords == scalar @lwords); -      if ($APPEND) { -          foreach my $i (0..(scalar @pwords-1)) { -              $lwords[$i] = $lwords[$i] . '_' . $pwords[$i]; -          } -          $fields[0] = join ' ', @lwords; -      } else { -          $fields[0] = $pline; -      } -  } -  print join ' ||| ', @fields; -  print "\n"; -} - - diff --git a/gi/pipeline/scripts/refilter.pl b/gi/pipeline/scripts/refilter.pl deleted file mode 100755 index a783eb4e..00000000 --- a/gi/pipeline/scripts/refilter.pl +++ /dev/null @@ -1,40 +0,0 @@ -#!/usr/bin/perl -w -use strict; - -my $NUM_TRANSLATIONS = shift @ARGV; -unless ($NUM_TRANSLATIONS) { $NUM_TRANSLATIONS=30; } -print STDERR "KEEPING $NUM_TRANSLATIONS TRANSLATIONS FOR SOURCE\n"; - -my $pk = ''; -my %dict; -while(<>) { -  s/^(.+)\t//; -  my $key = $1; -  if ($key ne $pk) { -    if ($pk) { -      emit_dict(); -    } -    %dict = (); -    $pk = $key; -  } -  my ($lhs, $f, $e, $s) = split / \|\|\| /; -  my $score = 0; -  if ($s =~ /XEF=([^ ]+)/) { -    $score += $1; -  } else { die; } -  if ($s =~ /GenerativeProb=([^ ]+)/) { -    $score += ($1 / 10); -  } else { die; } -  $dict{"$lhs ||| $f ||| $e ||| $s"} = $score; -} -emit_dict(); - -sub emit_dict { -  my $cc = 0; -  for my $k (sort { $dict{$a} <=> $dict{$b} } keys %dict) { -    print "$k"; -    $cc++; -    if ($cc >= $NUM_TRANSLATIONS) { last; } -  } -} - diff --git a/gi/pipeline/scripts/rekey.pl b/gi/pipeline/scripts/rekey.pl deleted file mode 100755 index 31eb86b8..00000000 --- a/gi/pipeline/scripts/rekey.pl +++ /dev/null @@ -1,8 +0,0 @@ -#!/usr/bin/perl - -while(<>) { -  my ($lhs, $f, $e, $s) = split / \|\|\| /; -  $f =~ s/\[X[0-9]+\]/\[X\]/g; -  print "$f\t$_"; -} - diff --git a/gi/pipeline/scripts/remove-tags-from-contexts.pl b/gi/pipeline/scripts/remove-tags-from-contexts.pl deleted file mode 100755 index 20698816..00000000 --- a/gi/pipeline/scripts/remove-tags-from-contexts.pl +++ /dev/null @@ -1,53 +0,0 @@ -#!/usr/bin/perl -w -use strict; - -use Getopt::Long "GetOptions"; - -my $PHRASE = 'tok'; -my $CONTEXT = 'tag'; - -die "Usage: $0 [--phrase=tok|tag] [--context=tok|tag] < corpus"  -    unless &GetOptions('phrase=s' => \$PHRASE, 'context=s' => \$CONTEXT); - -my $lno = 0; -while(my $line = <>) { -    $lno++; -    chomp $line; -    my @top = split /\t/, $line; -    die unless (scalar @top == 2);  - -    my @pwords = split /\s+/, $top[0]; -    foreach my $token (@pwords) { -        #print $token . "\n"; -        my @parts = split /_(?!.*_)/, $token; -        die unless (scalar @parts == 2);  -        if ($PHRASE eq "tok") { -            $token = $parts[0] -        } elsif ($PHRASE eq "tag") { -            $token = $parts[1] -        } -    } - -    my @fields = split / \|\|\| /, $top[1]; -    foreach my $i (0..((scalar @fields) / 2 - 1)) { -        #print $i . ": " . $fields[2*$i] . " of " . (scalar @fields) . "\n"; -        my @cwords = split /\s+/, $fields[2*$i]; -        foreach my $token (@cwords) { -            #print $i . ": " . $token . "\n"; -            my @parts = split /_(?!.*_)/, $token; -            if (scalar @parts == 2) { -                if ($CONTEXT eq "tok") { -                    $token = $parts[0] -                } elsif ($CONTEXT eq "tag") { -                    $token = $parts[1] -                } -            } -        } -        $fields[2*$i] = join ' ', @cwords; -    } - -    print join ' ', @pwords; -    print "\t"; -    print join ' ||| ', @fields; -    print "\n"; -} diff --git a/gi/pipeline/scripts/remove-tags-from-corpus.pl b/gi/pipeline/scripts/remove-tags-from-corpus.pl deleted file mode 100755 index be3e97c0..00000000 --- a/gi/pipeline/scripts/remove-tags-from-corpus.pl +++ /dev/null @@ -1,44 +0,0 @@ -#!/usr/bin/perl -w -use strict; - -use Getopt::Long "GetOptions"; - -my $LANGUAGE = shift @ARGV; -$LANGUAGE = 'target' unless ($LANGUAGE); - -my $lno = 0; -while(my $line = <>) { -    $lno++; -    chomp $line; - -    my @fields = split / \|\|\| /, $line; - -    if ($LANGUAGE eq "source" or $LANGUAGE eq "both") { -        my @cwords = split /\s+/, $fields[0]; -        foreach my $token (@cwords) { -            my @parts = split /_(?!.*_)/, $token; -            if (scalar @parts == 2) { -                $token = $parts[0] -            } else { -                print STDERR "WARNING: invalid tagged token $token\n"; -            } -        } -        $fields[0] = join ' ', @cwords; -    } - -    if ($LANGUAGE eq "target" or $LANGUAGE eq "both") { -        my @cwords = split /\s+/, $fields[1]; -        foreach my $token (@cwords) { -            my @parts = split /_(?!.*_)/, $token; -            if (scalar @parts == 2) { -                $token = $parts[1] -            } else { -                print STDERR "WARNING: invalid tagged token $token\n"; -            } -        } -        $fields[0] = join ' ', @cwords; -    } - -    print join ' ||| ', @fields; -    print "\n"; -} diff --git a/gi/pipeline/scripts/sort-by-key.sh b/gi/pipeline/scripts/sort-by-key.sh deleted file mode 100755 index 7ae33e03..00000000 --- a/gi/pipeline/scripts/sort-by-key.sh +++ /dev/null @@ -1,5 +0,0 @@ -#!/bin/bash - -export LANG=C -sort -t $'\t' -k 1 -T /tmp -S 6000000000 - diff --git a/gi/pipeline/scripts/xfeats.pl b/gi/pipeline/scripts/xfeats.pl deleted file mode 100755 index dc578513..00000000 --- a/gi/pipeline/scripts/xfeats.pl +++ /dev/null @@ -1,39 +0,0 @@ -#!/usr/bin/perl -w -use strict; - -die "Usage: $0 x-grammar.scfg[.gz] < cat-grammar.scfg\n" unless scalar @ARGV > 0; - -my $xgrammar = shift @ARGV; -die "Can't find $xgrammar" unless -f $xgrammar; -my $fh; -if ($xgrammar =~ /\.gz$/) { -  open $fh, "gunzip -c $xgrammar|" or die "Can't fork: $!"; -} else { -  open $fh, "<$xgrammar" or die "Can't read $xgrammar: $!"; -} -print STDERR "Reading X-feats from $xgrammar...\n"; -my %dict; -while(<$fh>) { -  chomp; -  my ($lhs, $f, $e, $feats) = split / \|\|\| /; -  my $xfeats; -  my $cc = 0; -  my @xfeats = (); -  while ($feats =~ /(EGivenF|FGivenE|LogRuleCount|LogECount|LogFCount|SingletonRule|SingletonE|SingletonF)=([^ ]+)( |$)/og) { -    push @xfeats, "X_$1=$2"; -  } -  #print "$lhs ||| $f ||| $e ||| @xfeats\n"; -  $dict{"$lhs ||| $f ||| $e"} = "@xfeats"; -} -close $fh; - -print STDERR "Add features...\n"; -while(<>) { -  chomp; -  my ($lhs, $f, $e) = split / \|\|\| /; -  $f=~ s/\[[^]]+,([12])\]/\[X,$1\]/g; -  my $xfeats = $dict{"[X] ||| $f ||| $e"}; -  die "Can't find x features for: $_\n" unless $xfeats; -  print "$_ $xfeats\n"; -} - | 
