summary refs log tree commit diff
path: root/gi/pipeline/scripts/patch-corpus.pl
diff options
context:
space:
mode:
author Chris Dyer <cdyer@cab.ark.cs.cmu.edu> 2012-10-02 00:19:43 -0400
committer Chris Dyer <cdyer@cab.ark.cs.cmu.edu> 2012-10-02 00:19:43 -0400
commit e26434979adc33bd949566ba7bf02dff64e80a3e (patch)
tree d1c72495e3af6301bd28e7e66c42de0c7a944d1f /gi/pipeline/scripts/patch-corpus.pl
parent 0870d4a1f5e14cc7daf553b180d599f09f6614a2 (diff)
cdec cleanup, remove bayesian stuff, parsing stuff
Diffstat (limited to 'gi/pipeline/scripts/patch-corpus.pl')
-rwxr-xr-x gi/pipeline/scripts/patch-corpus.pl 65
1 files changed, 0 insertions, 65 deletions
diff --git a/gi/pipeline/scripts/patch-corpus.pl b/gi/pipeline/scripts/patch-corpus.pl
deleted file mode 100755
index c0eec43e..00000000
--- a/gi/pipeline/scripts/patch-corpus.pl
+++ /dev/null
@@ -1,65 +0,0 @@
-#!/usr/bin/perl -w
-# Patch one side of a lexical corpus with the lines of a tagged corpus.
-#   -s  patch the source side (field 0) instead of the target side (field 1)
-#   -a  append each tagged token to its word as word_token instead of replacing
-# The lexical corpus is read with <> (STDIN or remaining arguments), one
-# ' ||| '-separated line per tagged line; patched lines go to STDOUT.
-use strict;
-
-my $PATCH = shift @ARGV;
-my $TGT = 1;
-my $APPEND;
-# The defined() guard keeps -w quiet when no arguments were given at all.
-while (defined $PATCH && ($PATCH eq "-s" || $PATCH eq "-a")) {
-  if ($PATCH eq "-s") {
-    undef $TGT;
-  } else {
-    $APPEND = 1;
-  }
-  $PATCH = shift @ARGV;
-}
-
-die "Usage: $0 [-s] [-a] tagged.en[_fr] < lexical.en_fr_al[_...]\n" unless $PATCH;
-
-# Three-argument open with a lexical handle: a file name such as ">x" or
-# "cmd|" is opened literally instead of being parsed as an open mode.
-open my $patch_fh, '<', $PATCH or die "Can't read tagged corpus $PATCH: $!";
-
-# The first tagged line decides the layout; it is kept and processed below,
-# so the file is read once and never has to be reopened.
-my $first = <$patch_fh>;
-die "Tagged corpus $PATCH is empty!\n" unless defined $first;
-my @layout = split / \|\|\| /, $first;
-die "Bad format!" if (scalar @layout > 2);
-
-if (scalar @layout != 1) {
-  # TODO support this
-  die "Patching source and target not supported yet!";
-}
-
-# Index of the ' ||| '-separated field being patched: 1 = target, 0 = source.
-my $side = $TGT ? 1 : 0;
-# 1-based number of the current line, used in error messages.
-my $lineno = 0;
-for (my $pline = $first; defined $pline; $pline = <$patch_fh>) {
-  chomp $pline;
-  $lineno++;
-  # Kept in its own variable so it cannot shadow the line counter.
-  my $lex = <>;
-  die "Too few lines in lexical corpus!" unless defined $lex;
-  chomp $lex;
-  my @fields = split / \|\|\| /, $lex;
-  my @pwords = split /\s+/, $pline;
-  my @lwords = split /\s+/, $fields[$side];
-  die "Length mismatch in line $lineno!\n" unless (scalar @pwords == scalar @lwords);
-  if ($APPEND) {
-    # Pair every lexical word with the tagged token at the same position.
-    $fields[$side] = join ' ', map { $lwords[$_] . '_' . $pwords[$_] } 0..$#pwords;
-  } else {
-    $fields[$side] = $pline;
-  }
-  print join(' ||| ', @fields), "\n";
-}
-close $patch_fh;
-
-