summaryrefslogtreecommitdiff
path: root/gi/pipeline/scripts
diff options
context:
space:
mode:
author trevor.cohn <trevor.cohn@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-07-23 16:39:41 +0000
committer trevor.cohn <trevor.cohn@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-07-23 16:39:41 +0000
commit 7d0cad292c444baddd70c3b76540304364d454d9 (patch)
tree b93b34d81dc3681a401ff811be61cca218d9a8eb /gi/pipeline/scripts
parent e0bca5fea3b0267819186d0fc34c036e6b77679c (diff)
Pipeline code for running with mixing tokens and tags in the clustering.
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@377 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'gi/pipeline/scripts')
-rwxr-xr-x gi/pipeline/scripts/patch-corpus.pl 31
-rwxr-xr-x gi/pipeline/scripts/remove-tags-from-contexts.pl 53
-rwxr-xr-x gi/pipeline/scripts/remove-tags-from-corpus.pl 51
3 files changed, 99 insertions, 36 deletions
diff --git a/gi/pipeline/scripts/patch-corpus.pl b/gi/pipeline/scripts/patch-corpus.pl
index 200022bc..c0eec43e 100755
--- a/gi/pipeline/scripts/patch-corpus.pl
+++ b/gi/pipeline/scripts/patch-corpus.pl
@@ -3,12 +3,17 @@ use strict;
my $PATCH = shift @ARGV;
my $TGT = 1;
-if ($PATCH eq "-s") {
- undef $TGT;
+my $APPEND; # stays undefined unless -a is given
+while ($PATCH eq "-s" || $PATCH eq "-a") { # consume leading -s / -a flags, in any order
+ if ($PATCH eq "-s") {
+ undef $TGT; # -s: clear the target-side flag
+ } else {
+ $APPEND = 1; # -a: turn on append mode
+ }
$PATCH = shift @ARGV;
}
-die "Usage: $0 [-s] tagged.en[_fr] < lexical.en_fr_al[_...]\n" unless $PATCH;
+die "Usage: $0 [-s] [-a] tagged.en[_fr] < lexical.en_fr_al[_...]\n" unless $PATCH; # the first non-flag argument is required
open P, "<$PATCH" or die "Can't read tagged corpus $PATCH: $!";
my $first=<P>; close P;
@@ -33,11 +38,25 @@ while(my $pline = <P>) {
if ($TGT) {
my @lwords = split /\s+/, $fields[1];
die "Length mismatch in line $line!\n" unless (scalar @pwords == scalar @lwords);
- $fields[1] = $pline;
- } else {
+ if ($APPEND) { # append mode: keep each existing word and attach the patch word to it
+ foreach my $i (0..(scalar @pwords-1)) {
+ $lwords[$i] = $lwords[$i] . '_' . $pwords[$i]; # existing word, '_', then the patch word at the same position
+ }
+ $fields[1] = join ' ', @lwords;
+ } else {
+ $fields[1] = $pline; # replace mode: field 1 becomes the whole patch line
+ }
+ } else { # source side
my @lwords = split /\s+/, $fields[0];
die "Length mismatch in line $line!\n" unless (scalar @pwords == scalar @lwords);
- $fields[0] = $pline;
+ if ($APPEND) { # same append logic as above, applied to field 0
+ foreach my $i (0..(scalar @pwords-1)) {
+ $lwords[$i] = $lwords[$i] . '_' . $pwords[$i]; # existing word, '_', then the patch word at the same position
+ }
+ $fields[0] = join ' ', @lwords;
+ } else {
+ $fields[0] = $pline; # replace mode: field 0 becomes the whole patch line
+ }
}
print join ' ||| ', @fields;
print "\n";
diff --git a/gi/pipeline/scripts/remove-tags-from-contexts.pl b/gi/pipeline/scripts/remove-tags-from-contexts.pl
new file mode 100755
index 00000000..20698816
--- /dev/null
+++ b/gi/pipeline/scripts/remove-tags-from-contexts.pl
@@ -0,0 +1,53 @@
+#!/usr/bin/perl -w
+# Reads lines of the form  ITEMS <TAB> FIELD ||| FIELD ||| ...  and reduces
+# each "left_right" item to one half, split at the item's last underscore.
+use strict;
+use Getopt::Long "GetOptions";
+
+my $PHRASE = 'tok';     # half kept for items before the tab: tok = left, tag = right
+my $CONTEXT = 'tag';    # half kept for items in the even-numbered fields after the tab
+my $USAGE = "Usage: $0 [--phrase=tok|tag] [--context=tok|tag] < corpus\n";
+
+GetOptions('phrase=s' => \$PHRASE, 'context=s' => \$CONTEXT) or die $USAGE;
+# Reject unknown selectors up front; they used to leave every item untouched.
+foreach my $choice ($PHRASE, $CONTEXT) {
+    die $USAGE unless ($choice eq 'tok' or $choice eq 'tag');
+}
+
+# Split an item at its LAST underscore, so the left half may itself contain '_'.
+sub split_tagged {
+    my ($item) = @_;
+    return split /_(?!.*_)/, $item;
+}
+
+my $lno = 0;
+while (my $line = <>) {
+    $lno++;
+    chomp $line;
+    my @top = split /\t/, $line;
+    die "Line $lno: expected two tab-separated parts\n" unless (scalar @top == 2);
+
+    # Every item before the tab must split into two halves; otherwise abort.
+    my @pwords = split /\s+/, $top[0];
+    foreach my $token (@pwords) {
+        my @parts = split_tagged($token);
+        die "Line $lno: invalid tagged token $token\n" unless (scalar @parts == 2);
+        $token = $parts[$PHRASE eq 'tok' ? 0 : 1];
+    }
+
+    # Fields 0, 2, 4, ... are rewritten; the fields between them pass through.
+    my @fields = split / \|\|\| /, $top[1];
+    foreach my $i (0..((scalar @fields) / 2 - 1)) {
+        my @cwords = split /\s+/, $fields[2*$i];
+        foreach my $token (@cwords) {
+            # Items that do not split into two halves are kept unchanged.
+            my @parts = split_tagged($token);
+            $token = $parts[$CONTEXT eq 'tok' ? 0 : 1] if (scalar @parts == 2);
+        }
+        $fields[2*$i] = join ' ', @cwords;
+    }
+
+    # Emit the line in its original layout.
+    print join(' ', @pwords), "\t";
+    print join(' ||| ', @fields), "\n";
+}
diff --git a/gi/pipeline/scripts/remove-tags-from-corpus.pl b/gi/pipeline/scripts/remove-tags-from-corpus.pl
index 5460db95..be3e97c0 100755
--- a/gi/pipeline/scripts/remove-tags-from-corpus.pl
+++ b/gi/pipeline/scripts/remove-tags-from-corpus.pl
@@ -3,51 +3,42 @@ use strict;
use Getopt::Long "GetOptions";
-my $PHRASE = 'tok';
-my $CONTEXT = 'tag';
-
-die "Usage: $0 [--phrase=tok|tag] [--context=tok|tag] < corpus"
- unless &GetOptions('phrase=s' => \$PHRASE, 'context=s' => \$CONTEXT);
+my $LANGUAGE = shift @ARGV; # side(s) to process: "source", "target" or "both"
+$LANGUAGE = 'target' unless ($LANGUAGE); # default when no argument is given
my $lno = 0;
while(my $line = <>) {
$lno++;
chomp $line;
- my @top = split /\t/, $line;
- die unless (scalar @top == 2);
- my @pwords = split /\s+/, $top[0];
- foreach my $token (@pwords) {
- #print $token . "\n";
- my @parts = split /_(?!_)/, $token;
- die unless (scalar @parts == 2);
- if ($PHRASE eq "tok") {
- $token = $parts[0]
- } elsif ($PHRASE eq "tag") {
- $token = $parts[1]
+ my @fields = split / \|\|\| /, $line; # field 0 is handled as source, field 1 as target
+
+ if ($LANGUAGE eq "source" or $LANGUAGE eq "both") {
+ my @cwords = split /\s+/, $fields[0];
+ foreach my $token (@cwords) {
+ my @parts = split /_(?!.*_)/, $token; # split at the last underscore only
+ if (scalar @parts == 2) {
+ $token = $parts[0] # keep the part before the last underscore
+ } else {
+ print STDERR "WARNING: invalid tagged token $token\n";
+ }
}
+ $fields[0] = join ' ', @cwords;
}
- my @fields = split / \|\|\| /, $top[1];
- foreach my $i (0..((scalar @fields) / 2 - 1)) {
- #print $i . ": " . $fields[2*$i] . " of " . (scalar @fields) . "\n";
- my @cwords = split /\s+/, $fields[2*$i];
+ if ($LANGUAGE eq "target" or $LANGUAGE eq "both") {
+ my @cwords = split /\s+/, $fields[1];
foreach my $token (@cwords) {
- #print $i . ": " . $token . "\n";
- my @parts = split /_/, $token;
+ my @parts = split /_(?!.*_)/, $token; # split at the last underscore only
if (scalar @parts == 2) {
- if ($CONTEXT eq "tok") {
- $token = $parts[0]
- } elsif ($CONTEXT eq "tag") {
- $token = $parts[1]
- }
+ $token = $parts[1] # NOTE(review): keeps the part AFTER the underscore, unlike the source branch - confirm intended
+ } else {
+ print STDERR "WARNING: invalid tagged token $token\n";
}
}
- $fields[2*$i] = join ' ', @cwords;
+ $fields[1] = join ' ', @cwords; # write back to the field that was read (was $fields[0], clobbering the source side)
}
- print join ' ', @pwords;
- print "\t";
print join ' ||| ', @fields;
print "\n";
}