summaryrefslogtreecommitdiff
path: root/gi/pipeline/scripts
diff options
context:
space:
mode:
Diffstat (limited to 'gi/pipeline/scripts')
-rwxr-xr-xgi/pipeline/scripts/filter-by-f.pl56
-rwxr-xr-xgi/pipeline/scripts/patch-corpus.pl65
-rwxr-xr-xgi/pipeline/scripts/refilter.pl40
-rwxr-xr-xgi/pipeline/scripts/rekey.pl8
-rwxr-xr-xgi/pipeline/scripts/remove-tags-from-contexts.pl53
-rwxr-xr-xgi/pipeline/scripts/remove-tags-from-corpus.pl44
-rwxr-xr-xgi/pipeline/scripts/sort-by-key.sh5
-rwxr-xr-xgi/pipeline/scripts/xfeats.pl39
8 files changed, 0 insertions, 310 deletions
diff --git a/gi/pipeline/scripts/filter-by-f.pl b/gi/pipeline/scripts/filter-by-f.pl
deleted file mode 100755
index 0cef0606..00000000
--- a/gi/pipeline/scripts/filter-by-f.pl
+++ /dev/null
@@ -1,56 +0,0 @@
-#!/usr/bin/perl -w
-use strict;
-
-my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR; }
-
-my $REKEY="$SCRIPT_DIR/rekey.pl";
-my $REFILTER="$SCRIPT_DIR/refilter.pl";
-my $SORT="$SCRIPT_DIR/sort-by-key.sh";
-assert_exec($REKEY, $REFILTER, $SORT);
-
-
-die "Usage: $0 NUM-TRANSLATIONS ingrammar.gz outgrammar.gz\n" unless scalar @ARGV == 3;
-my $translations = shift @ARGV;
-die "Need number: $translations" unless $translations > 0;
-die unless $ARGV[0] =~ /\.gz$/;
-die unless $ARGV[1] =~ /\.gz$/;
-die if $ARGV[0] eq $ARGV[1];
-die "Can't find $ARGV[0]" unless -f $ARGV[0];
-
-my $cmd = "gunzip -c $ARGV[0] | $REKEY | $SORT | $REFILTER $translations | gzip > $ARGV[1]";
-safesystem($ARGV[1], $cmd) or die "Filtering failed";
-exit 0;
-
-sub assert_exec {
- my @files = @_;
- for my $file (@files) {
- die "Can't find $file - did you run make?\n" unless -e $file;
- die "Can't execute $file" unless -e $file;
- }
-};
-
-sub safesystem {
- my $output = shift @_;
- print STDERR "Executing: @_\n";
- system(@_);
- if ($? == -1) {
- print STDERR "ERROR: Failed to execute: @_\n $!\n";
- if (defined $output && -e $output) { printf STDERR "Removing $output\n"; `rm -rf $output`; }
- exit(1);
- }
- elsif ($? & 127) {
- printf STDERR "ERROR: Execution of: @_\n died with signal %d, %s coredump\n",
- ($? & 127), ($? & 128) ? 'with' : 'without';
- if (defined $output && -e $output) { printf STDERR "Removing $output\n"; `rm -rf $output`; }
- exit(1);
- }
- else {
- my $exitcode = $? >> 8;
- if ($exitcode) {
- print STDERR "Exit code: $exitcode\n";
- if (defined $output && -e $output) { printf STDERR "Removing $output\n"; `rm -rf $output`; }
- }
- return ! $exitcode;
- }
-}
-
diff --git a/gi/pipeline/scripts/patch-corpus.pl b/gi/pipeline/scripts/patch-corpus.pl
deleted file mode 100755
index c0eec43e..00000000
--- a/gi/pipeline/scripts/patch-corpus.pl
+++ /dev/null
@@ -1,65 +0,0 @@
-#!/usr/bin/perl -w
-use strict;
-
-my $PATCH = shift @ARGV;
-my $TGT = 1;
-my $APPEND;
-while ($PATCH eq "-s" || $PATCH eq "-a") {
- if ($PATCH eq "-s") {
- undef $TGT;
- } else {
- $APPEND = 1;
- }
- $PATCH = shift @ARGV;
-}
-
-die "Usage: $0 [-s] [-a] tagged.en[_fr] < lexical.en_fr_al[_...]\n" unless $PATCH;
-
-open P, "<$PATCH" or die "Can't read tagged corpus $PATCH: $!";
-my $first=<P>; close P;
-my @fields = split / \|\|\| /, $first;
-die "Bad format!" if (scalar @fields > 2);
-
-if (scalar @fields != 1) {
- # TODO support this
- die "Patching source and target not supported yet!";
-}
-
-my $line = 0;
-open P, "<$PATCH" or die "Can't read tagged corpus $PATCH: $!";
-while(my $pline = <P>) {
- chomp $pline;
- $line++;
- my $line = <>;
- die "Too few lines in lexical corpus!" unless $line;
- chomp $line;
- @fields = split / \|\|\| /, $line;
- my @pwords = split /\s+/, $pline;
- if ($TGT) {
- my @lwords = split /\s+/, $fields[1];
- die "Length mismatch in line $line!\n" unless (scalar @pwords == scalar @lwords);
- if ($APPEND) {
- foreach my $i (0..(scalar @pwords-1)) {
- $lwords[$i] = $lwords[$i] . '_' . $pwords[$i];
- }
- $fields[1] = join ' ', @lwords;
- } else {
- $fields[1] = $pline;
- }
- } else { # source side
- my @lwords = split /\s+/, $fields[0];
- die "Length mismatch in line $line!\n" unless (scalar @pwords == scalar @lwords);
- if ($APPEND) {
- foreach my $i (0..(scalar @pwords-1)) {
- $lwords[$i] = $lwords[$i] . '_' . $pwords[$i];
- }
- $fields[0] = join ' ', @lwords;
- } else {
- $fields[0] = $pline;
- }
- }
- print join ' ||| ', @fields;
- print "\n";
-}
-
-
diff --git a/gi/pipeline/scripts/refilter.pl b/gi/pipeline/scripts/refilter.pl
deleted file mode 100755
index a783eb4e..00000000
--- a/gi/pipeline/scripts/refilter.pl
+++ /dev/null
@@ -1,40 +0,0 @@
-#!/usr/bin/perl -w
-use strict;
-
-my $NUM_TRANSLATIONS = shift @ARGV;
-unless ($NUM_TRANSLATIONS) { $NUM_TRANSLATIONS=30; }
-print STDERR "KEEPING $NUM_TRANSLATIONS TRANSLATIONS FOR SOURCE\n";
-
-my $pk = '';
-my %dict;
-while(<>) {
- s/^(.+)\t//;
- my $key = $1;
- if ($key ne $pk) {
- if ($pk) {
- emit_dict();
- }
- %dict = ();
- $pk = $key;
- }
- my ($lhs, $f, $e, $s) = split / \|\|\| /;
- my $score = 0;
- if ($s =~ /XEF=([^ ]+)/) {
- $score += $1;
- } else { die; }
- if ($s =~ /GenerativeProb=([^ ]+)/) {
- $score += ($1 / 10);
- } else { die; }
- $dict{"$lhs ||| $f ||| $e ||| $s"} = $score;
-}
-emit_dict();
-
-sub emit_dict {
- my $cc = 0;
- for my $k (sort { $dict{$a} <=> $dict{$b} } keys %dict) {
- print "$k";
- $cc++;
- if ($cc >= $NUM_TRANSLATIONS) { last; }
- }
-}
-
diff --git a/gi/pipeline/scripts/rekey.pl b/gi/pipeline/scripts/rekey.pl
deleted file mode 100755
index 31eb86b8..00000000
--- a/gi/pipeline/scripts/rekey.pl
+++ /dev/null
@@ -1,8 +0,0 @@
-#!/usr/bin/perl
-
-while(<>) {
- my ($lhs, $f, $e, $s) = split / \|\|\| /;
- $f =~ s/\[X[0-9]+\]/\[X\]/g;
- print "$f\t$_";
-}
-
diff --git a/gi/pipeline/scripts/remove-tags-from-contexts.pl b/gi/pipeline/scripts/remove-tags-from-contexts.pl
deleted file mode 100755
index 20698816..00000000
--- a/gi/pipeline/scripts/remove-tags-from-contexts.pl
+++ /dev/null
@@ -1,53 +0,0 @@
-#!/usr/bin/perl -w
-use strict;
-
-use Getopt::Long "GetOptions";
-
-my $PHRASE = 'tok';
-my $CONTEXT = 'tag';
-
-die "Usage: $0 [--phrase=tok|tag] [--context=tok|tag] < corpus"
- unless &GetOptions('phrase=s' => \$PHRASE, 'context=s' => \$CONTEXT);
-
-my $lno = 0;
-while(my $line = <>) {
- $lno++;
- chomp $line;
- my @top = split /\t/, $line;
- die unless (scalar @top == 2);
-
- my @pwords = split /\s+/, $top[0];
- foreach my $token (@pwords) {
- #print $token . "\n";
- my @parts = split /_(?!.*_)/, $token;
- die unless (scalar @parts == 2);
- if ($PHRASE eq "tok") {
- $token = $parts[0]
- } elsif ($PHRASE eq "tag") {
- $token = $parts[1]
- }
- }
-
- my @fields = split / \|\|\| /, $top[1];
- foreach my $i (0..((scalar @fields) / 2 - 1)) {
- #print $i . ": " . $fields[2*$i] . " of " . (scalar @fields) . "\n";
- my @cwords = split /\s+/, $fields[2*$i];
- foreach my $token (@cwords) {
- #print $i . ": " . $token . "\n";
- my @parts = split /_(?!.*_)/, $token;
- if (scalar @parts == 2) {
- if ($CONTEXT eq "tok") {
- $token = $parts[0]
- } elsif ($CONTEXT eq "tag") {
- $token = $parts[1]
- }
- }
- }
- $fields[2*$i] = join ' ', @cwords;
- }
-
- print join ' ', @pwords;
- print "\t";
- print join ' ||| ', @fields;
- print "\n";
-}
diff --git a/gi/pipeline/scripts/remove-tags-from-corpus.pl b/gi/pipeline/scripts/remove-tags-from-corpus.pl
deleted file mode 100755
index be3e97c0..00000000
--- a/gi/pipeline/scripts/remove-tags-from-corpus.pl
+++ /dev/null
@@ -1,44 +0,0 @@
-#!/usr/bin/perl -w
-use strict;
-
-use Getopt::Long "GetOptions";
-
-my $LANGUAGE = shift @ARGV;
-$LANGUAGE = 'target' unless ($LANGUAGE);
-
-my $lno = 0;
-while(my $line = <>) {
- $lno++;
- chomp $line;
-
- my @fields = split / \|\|\| /, $line;
-
- if ($LANGUAGE eq "source" or $LANGUAGE eq "both") {
- my @cwords = split /\s+/, $fields[0];
- foreach my $token (@cwords) {
- my @parts = split /_(?!.*_)/, $token;
- if (scalar @parts == 2) {
- $token = $parts[0]
- } else {
- print STDERR "WARNING: invalid tagged token $token\n";
- }
- }
- $fields[0] = join ' ', @cwords;
- }
-
- if ($LANGUAGE eq "target" or $LANGUAGE eq "both") {
- my @cwords = split /\s+/, $fields[1];
- foreach my $token (@cwords) {
- my @parts = split /_(?!.*_)/, $token;
- if (scalar @parts == 2) {
- $token = $parts[1]
- } else {
- print STDERR "WARNING: invalid tagged token $token\n";
- }
- }
- $fields[0] = join ' ', @cwords;
- }
-
- print join ' ||| ', @fields;
- print "\n";
-}
diff --git a/gi/pipeline/scripts/sort-by-key.sh b/gi/pipeline/scripts/sort-by-key.sh
deleted file mode 100755
index 7ae33e03..00000000
--- a/gi/pipeline/scripts/sort-by-key.sh
+++ /dev/null
@@ -1,5 +0,0 @@
-#!/bin/bash
-
-export LANG=C
-sort -t $'\t' -k 1 -T /tmp -S 6000000000
-
diff --git a/gi/pipeline/scripts/xfeats.pl b/gi/pipeline/scripts/xfeats.pl
deleted file mode 100755
index dc578513..00000000
--- a/gi/pipeline/scripts/xfeats.pl
+++ /dev/null
@@ -1,39 +0,0 @@
-#!/usr/bin/perl -w
-use strict;
-
-die "Usage: $0 x-grammar.scfg[.gz] < cat-grammar.scfg\n" unless scalar @ARGV > 0;
-
-my $xgrammar = shift @ARGV;
-die "Can't find $xgrammar" unless -f $xgrammar;
-my $fh;
-if ($xgrammar =~ /\.gz$/) {
- open $fh, "gunzip -c $xgrammar|" or die "Can't fork: $!";
-} else {
- open $fh, "<$xgrammar" or die "Can't read $xgrammar: $!";
-}
-print STDERR "Reading X-feats from $xgrammar...\n";
-my %dict;
-while(<$fh>) {
- chomp;
- my ($lhs, $f, $e, $feats) = split / \|\|\| /;
- my $xfeats;
- my $cc = 0;
- my @xfeats = ();
- while ($feats =~ /(EGivenF|FGivenE|LogRuleCount|LogECount|LogFCount|SingletonRule|SingletonE|SingletonF)=([^ ]+)( |$)/og) {
- push @xfeats, "X_$1=$2";
- }
- #print "$lhs ||| $f ||| $e ||| @xfeats\n";
- $dict{"$lhs ||| $f ||| $e"} = "@xfeats";
-}
-close $fh;
-
-print STDERR "Add features...\n";
-while(<>) {
- chomp;
- my ($lhs, $f, $e) = split / \|\|\| /;
- $f=~ s/\[[^]]+,([12])\]/\[X,$1\]/g;
- my $xfeats = $dict{"[X] ||| $f ||| $e"};
- die "Can't find x features for: $_\n" unless $xfeats;
- print "$_ $xfeats\n";
-}
-