cdec cleanup, remove bayesian stuff, parsing stuff

author: Chris Dyer <cdyer@cab.ark.cs.cmu.edu> 2012-10-02 00:19:43 -0400
committer: Chris Dyer <cdyer@cab.ark.cs.cmu.edu> 2012-10-02 00:19:43 -0400
commit: e26434979adc33bd949566ba7bf02dff64e80a3e (patch)
tree: d1c72495e3af6301bd28e7e66c42de0c7a944d1f /gi/pipeline/scripts
parent: 0870d4a1f5e14cc7daf553b180d599f09f6614a2 (diff)
8 files changed, 0 insertions, 310 deletions
diff --git a/gi/pipeline/scripts/filter-by-f.pl b/gi/pipeline/scripts/filter-by-f.pl
deleted file mode 100755
index 0cef0606..00000000
--- a/gi/pipeline/scripts/filter-by-f.pl
+++ /dev/null
@@ -1,56 +0,0 @@
-#!/usr/bin/perl -w
-use strict;
-
-my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR; }
-
-my $REKEY="$SCRIPT_DIR/rekey.pl";
-my $REFILTER="$SCRIPT_DIR/refilter.pl";
-my $SORT="$SCRIPT_DIR/sort-by-key.sh";
-assert_exec($REKEY, $REFILTER, $SORT);
-
-
-die "Usage: $0 NUM-TRANSLATIONS ingrammar.gz outgrammar.gz\n" unless scalar @ARGV == 3;
-my $translations = shift @ARGV;
-die "Need number: $translations" unless $translations > 0;
-die unless $ARGV[0] =~ /\.gz$/;
-die unless $ARGV[1] =~ /\.gz$/;
-die if $ARGV[0] eq $ARGV[1];
-die "Can't find $ARGV[0]" unless -f $ARGV[0];
-
-my $cmd = "gunzip -c $ARGV[0] | $REKEY | $SORT | $REFILTER $translations | gzip > $ARGV[1]";
-safesystem($ARGV[1], $cmd) or die "Filtering failed";
-exit 0;
-
-sub assert_exec {
-  my @files = @_;
-  for my $file (@files) {
-    die "Can't find $file - did you run make?\n" unless -e $file;
-    die "Can't execute $file" unless -e $file;
-  }
-};
-
-sub safesystem {
-  my $output = shift @_;
-  print STDERR "Executing: @_\n";
-  system(@_);
-  if ($? == -1) {
-      print STDERR "ERROR: Failed to execute: @_\n  $!\n";
-      if (defined $output && -e $output) { printf STDERR "Removing $output\n"; `rm -rf $output`; }
-      exit(1);
-  }
-  elsif ($? & 127) {
-      printf STDERR "ERROR: Execution of: @_\n  died with signal %d, %s coredump\n",
-          ($? & 127),  ($? & 128) ? 'with' : 'without';
-      if (defined $output && -e $output) { printf STDERR "Removing $output\n"; `rm -rf $output`; }
-      exit(1);
-  }
-  else {
-    my $exitcode = $? >> 8;
-    if ($exitcode) {
-      print STDERR "Exit code: $exitcode\n";
-      if (defined $output && -e $output) { printf STDERR "Removing $output\n"; `rm -rf $output`; }
-    }
-    return ! $exitcode;
-  }
-}
-
diff --git a/gi/pipeline/scripts/patch-corpus.pl b/gi/pipeline/scripts/patch-corpus.pl
deleted file mode 100755
index c0eec43e..00000000
--- a/gi/pipeline/scripts/patch-corpus.pl
+++ /dev/null
@@ -1,65 +0,0 @@
-#!/usr/bin/perl -w
-use strict;
-
-my $PATCH = shift @ARGV;
-my $TGT = 1;
-my $APPEND;
-while ($PATCH eq "-s" || $PATCH eq "-a") {
-    if ($PATCH eq "-s") {
-        undef $TGT;
-    } else {
-        $APPEND = 1;
-    }
-    $PATCH = shift @ARGV;
-}
-
-die "Usage: $0 [-s] [-a] tagged.en[_fr] < lexical.en_fr_al[_...]\n" unless $PATCH;
-
-open P, "<$PATCH" or die "Can't read tagged corpus $PATCH: $!";
-my $first=<P>; close P;
-my @fields = split / \|\|\| /, $first;
-die "Bad format!" if (scalar @fields > 2);
-
-if (scalar @fields != 1) {
-  # TODO support this
-  die "Patching source and target not supported yet!";
-}
-
-my $line = 0;
-open P, "<$PATCH" or die "Can't read tagged corpus $PATCH: $!";
-while(my $pline = <P>) {
-  chomp $pline;
-  $line++;
-  my $line = <>;
-  die "Too few lines in lexical corpus!" unless $line;
-  chomp $line;
-  @fields = split / \|\|\| /, $line;
-  my @pwords = split /\s+/, $pline;
-  if ($TGT) {
-      my @lwords = split /\s+/, $fields[1];
-      die "Length mismatch in line $line!\n" unless (scalar @pwords == scalar @lwords);
-      if ($APPEND) {
-          foreach my $i (0..(scalar @pwords-1)) {
-              $lwords[$i] = $lwords[$i] . '_' . $pwords[$i];
-          }
-          $fields[1] = join ' ', @lwords;
-      } else {
-          $fields[1] = $pline;
-      }
-  } else { # source side
-      my @lwords = split /\s+/, $fields[0];
-      die "Length mismatch in line $line!\n" unless (scalar @pwords == scalar @lwords);
-      if ($APPEND) {
-          foreach my $i (0..(scalar @pwords-1)) {
-              $lwords[$i] = $lwords[$i] . '_' . $pwords[$i];
-          }
-          $fields[0] = join ' ', @lwords;
-      } else {
-          $fields[0] = $pline;
-      }
-  }
-  print join ' ||| ', @fields;
-  print "\n";
-}
-
-
diff --git a/gi/pipeline/scripts/refilter.pl b/gi/pipeline/scripts/refilter.pl
deleted file mode 100755
index a783eb4e..00000000
--- a/gi/pipeline/scripts/refilter.pl
+++ /dev/null
@@ -1,40 +0,0 @@
-#!/usr/bin/perl -w
-use strict;
-
-my $NUM_TRANSLATIONS = shift @ARGV;
-unless ($NUM_TRANSLATIONS) { $NUM_TRANSLATIONS=30; }
-print STDERR "KEEPING $NUM_TRANSLATIONS TRANSLATIONS FOR SOURCE\n";
-
-my $pk = '';
-my %dict;
-while(<>) {
-  s/^(.+)\t//;
-  my $key = $1;
-  if ($key ne $pk) {
-    if ($pk) {
-      emit_dict();
-    }
-    %dict = ();
-    $pk = $key;
-  }
-  my ($lhs, $f, $e, $s) = split / \|\|\| /;
-  my $score = 0;
-  if ($s =~ /XEF=([^ ]+)/) {
-    $score += $1;
-  } else { die; }
-  if ($s =~ /GenerativeProb=([^ ]+)/) {
-    $score += ($1 / 10);
-  } else { die; }
-  $dict{"$lhs ||| $f ||| $e ||| $s"} = $score;
-}
-emit_dict();
-
-sub emit_dict {
-  my $cc = 0;
-  for my $k (sort { $dict{$a} <=> $dict{$b} } keys %dict) {
-    print "$k";
-    $cc++;
-    if ($cc >= $NUM_TRANSLATIONS) { last; }
-  }
-}
-
diff --git a/gi/pipeline/scripts/rekey.pl b/gi/pipeline/scripts/rekey.pl
deleted file mode 100755
index 31eb86b8..00000000
--- a/gi/pipeline/scripts/rekey.pl
+++ /dev/null
@@ -1,8 +0,0 @@
-#!/usr/bin/perl
-
-while(<>) {
-  my ($lhs, $f, $e, $s) = split / \|\|\| /;
-  $f =~ s/\[X[0-9]+\]/\[X\]/g;
-  print "$f\t$_";
-}
-
diff --git a/gi/pipeline/scripts/remove-tags-from-contexts.pl b/gi/pipeline/scripts/remove-tags-from-contexts.pl
deleted file mode 100755
index 20698816..00000000
--- a/gi/pipeline/scripts/remove-tags-from-contexts.pl
+++ /dev/null
@@ -1,53 +0,0 @@
-#!/usr/bin/perl -w
-use strict;
-
-use Getopt::Long "GetOptions";
-
-my $PHRASE = 'tok';
-my $CONTEXT = 'tag';
-
-die "Usage: $0 [--phrase=tok|tag] [--context=tok|tag] < corpus" 
-    unless &GetOptions('phrase=s' => \$PHRASE, 'context=s' => \$CONTEXT);
-
-my $lno = 0;
-while(my $line = <>) {
-    $lno++;
-    chomp $line;
-    my @top = split /\t/, $line;
-    die unless (scalar @top == 2); 
-
-    my @pwords = split /\s+/, $top[0];
-    foreach my $token (@pwords) {
-        #print $token . "\n";
-        my @parts = split /_(?!.*_)/, $token;
-        die unless (scalar @parts == 2); 
-        if ($PHRASE eq "tok") {
-            $token = $parts[0]
-        } elsif ($PHRASE eq "tag") {
-            $token = $parts[1]
-        }
-    }
-
-    my @fields = split / \|\|\| /, $top[1];
-    foreach my $i (0..((scalar @fields) / 2 - 1)) {
-        #print $i . ": " . $fields[2*$i] . " of " . (scalar @fields) . "\n";
-        my @cwords = split /\s+/, $fields[2*$i];
-        foreach my $token (@cwords) {
-            #print $i . ": " . $token . "\n";
-            my @parts = split /_(?!.*_)/, $token;
-            if (scalar @parts == 2) {
-                if ($CONTEXT eq "tok") {
-                    $token = $parts[0]
-                } elsif ($CONTEXT eq "tag") {
-                    $token = $parts[1]
-                }
-            }
-        }
-        $fields[2*$i] = join ' ', @cwords;
-    }
-
-    print join ' ', @pwords;
-    print "\t";
-    print join ' ||| ', @fields;
-    print "\n";
-}
diff --git a/gi/pipeline/scripts/remove-tags-from-corpus.pl b/gi/pipeline/scripts/remove-tags-from-corpus.pl
deleted file mode 100755
index be3e97c0..00000000
--- a/gi/pipeline/scripts/remove-tags-from-corpus.pl
+++ /dev/null
@@ -1,44 +0,0 @@
-#!/usr/bin/perl -w
-use strict;
-
-use Getopt::Long "GetOptions";
-
-my $LANGUAGE = shift @ARGV;
-$LANGUAGE = 'target' unless ($LANGUAGE);
-
-my $lno = 0;
-while(my $line = <>) {
-    $lno++;
-    chomp $line;
-
-    my @fields = split / \|\|\| /, $line;
-
-    if ($LANGUAGE eq "source" or $LANGUAGE eq "both") {
-        my @cwords = split /\s+/, $fields[0];
-        foreach my $token (@cwords) {
-            my @parts = split /_(?!.*_)/, $token;
-            if (scalar @parts == 2) {
-                $token = $parts[0]
-            } else {
-                print STDERR "WARNING: invalid tagged token $token\n";
-            }
-        }
-        $fields[0] = join ' ', @cwords;
-    }
-
-    if ($LANGUAGE eq "target" or $LANGUAGE eq "both") {
-        my @cwords = split /\s+/, $fields[1];
-        foreach my $token (@cwords) {
-            my @parts = split /_(?!.*_)/, $token;
-            if (scalar @parts == 2) {
-                $token = $parts[1]
-            } else {
-                print STDERR "WARNING: invalid tagged token $token\n";
-            }
-        }
-        $fields[0] = join ' ', @cwords;
-    }
-
-    print join ' ||| ', @fields;
-    print "\n";
-}
diff --git a/gi/pipeline/scripts/sort-by-key.sh b/gi/pipeline/scripts/sort-by-key.sh
deleted file mode 100755
index 7ae33e03..00000000
--- a/gi/pipeline/scripts/sort-by-key.sh
+++ /dev/null
@@ -1,5 +0,0 @@
-#!/bin/bash
-
-export LANG=C
-sort -t $'\t' -k 1 -T /tmp -S 6000000000
-
diff --git a/gi/pipeline/scripts/xfeats.pl b/gi/pipeline/scripts/xfeats.pl
deleted file mode 100755
index dc578513..00000000
--- a/gi/pipeline/scripts/xfeats.pl
+++ /dev/null
@@ -1,39 +0,0 @@
-#!/usr/bin/perl -w
-use strict;
-
-die "Usage: $0 x-grammar.scfg[.gz] < cat-grammar.scfg\n" unless scalar @ARGV > 0;
-
-my $xgrammar = shift @ARGV;
-die "Can't find $xgrammar" unless -f $xgrammar;
-my $fh;
-if ($xgrammar =~ /\.gz$/) {
-  open $fh, "gunzip -c $xgrammar|" or die "Can't fork: $!";
-} else {
-  open $fh, "<$xgrammar" or die "Can't read $xgrammar: $!";
-}
-print STDERR "Reading X-feats from $xgrammar...\n";
-my %dict;
-while(<$fh>) {
-  chomp;
-  my ($lhs, $f, $e, $feats) = split / \|\|\| /;
-  my $xfeats;
-  my $cc = 0;
-  my @xfeats = ();
-  while ($feats =~ /(EGivenF|FGivenE|LogRuleCount|LogECount|LogFCount|SingletonRule|SingletonE|SingletonF)=([^ ]+)( |$)/og) {
-    push @xfeats, "X_$1=$2";
-  }
-  #print "$lhs ||| $f ||| $e ||| @xfeats\n";
-  $dict{"$lhs ||| $f ||| $e"} = "@xfeats";
-}
-close $fh;
-
-print STDERR "Add features...\n";
-while(<>) {
-  chomp;
-  my ($lhs, $f, $e) = split / \|\|\| /;
-  $f=~ s/\[[^]]+,([12])\]/\[X,$1\]/g;
-  my $xfeats = $dict{"[X] ||| $f ||| $e"};
-  die "Can't find x features for: $_\n" unless $xfeats;
-  print "$_ $xfeats\n";
-}
-
author	Chris Dyer <cdyer@cab.ark.cs.cmu.edu>	2012-10-02 00:19:43 -0400
committer	Chris Dyer <cdyer@cab.ark.cs.cmu.edu>	2012-10-02 00:19:43 -0400
commit	e26434979adc33bd949566ba7bf02dff64e80a3e (patch)
tree	d1c72495e3af6301bd28e7e66c42de0c7a944d1f /gi/pipeline/scripts
parent	0870d4a1f5e14cc7daf553b180d599f09f6614a2 (diff)