From 7d0cad292c444baddd70c3b76540304364d454d9 Mon Sep 17 00:00:00 2001 From: "trevor.cohn" Date: Fri, 23 Jul 2010 16:39:41 +0000 Subject: Pipeline code for running with mixing tokens and tags in the clustering. git-svn-id: https://ws10smt.googlecode.com/svn/trunk@377 ec762483-ff6d-05da-a07a-a48fb63a330f --- gi/pipeline/scripts/patch-corpus.pl | 31 +++++++++++--- gi/pipeline/scripts/remove-tags-from-contexts.pl | 53 ++++++++++++++++++++++++ gi/pipeline/scripts/remove-tags-from-corpus.pl | 51 ++++++++++------------- 3 files changed, 99 insertions(+), 36 deletions(-) create mode 100755 gi/pipeline/scripts/remove-tags-from-contexts.pl (limited to 'gi/pipeline/scripts') diff --git a/gi/pipeline/scripts/patch-corpus.pl b/gi/pipeline/scripts/patch-corpus.pl index 200022bc..c0eec43e 100755 --- a/gi/pipeline/scripts/patch-corpus.pl +++ b/gi/pipeline/scripts/patch-corpus.pl @@ -3,12 +3,17 @@ use strict; my $PATCH = shift @ARGV; my $TGT = 1; -if ($PATCH eq "-s") { - undef $TGT; +my $APPEND; +while ($PATCH eq "-s" || $PATCH eq "-a") { + if ($PATCH eq "-s") { + undef $TGT; + } else { + $APPEND = 1; + } $PATCH = shift @ARGV; } -die "Usage: $0 [-s] tagged.en[_fr] < lexical.en_fr_al[_...]\n" unless $PATCH; +die "Usage: $0 [-s] [-a] tagged.en[_fr] < lexical.en_fr_al[_...]\n" unless $PATCH; open P, "<$PATCH" or die "Can't read tagged corpus $PATCH: $!"; my $first=

<P>; close P; @@ -33,11 +38,25 @@ while(my $pline = <P>

) { if ($TGT) { my @lwords = split /\s+/, $fields[1]; die "Length mismatch in line $line!\n" unless (scalar @pwords == scalar @lwords); - $fields[1] = $pline; - } else { + if ($APPEND) { + foreach my $i (0..(scalar @pwords-1)) { + $lwords[$i] = $lwords[$i] . '_' . $pwords[$i]; + } + $fields[1] = join ' ', @lwords; + } else { + $fields[1] = $pline; + } + } else { # source side my @lwords = split /\s+/, $fields[0]; die "Length mismatch in line $line!\n" unless (scalar @pwords == scalar @lwords); - $fields[0] = $pline; + if ($APPEND) { + foreach my $i (0..(scalar @pwords-1)) { + $lwords[$i] = $lwords[$i] . '_' . $pwords[$i]; + } + $fields[0] = join ' ', @lwords; + } else { + $fields[0] = $pline; + } } print join ' ||| ', @fields; print "\n"; diff --git a/gi/pipeline/scripts/remove-tags-from-contexts.pl b/gi/pipeline/scripts/remove-tags-from-contexts.pl new file mode 100755 index 00000000..20698816 --- /dev/null +++ b/gi/pipeline/scripts/remove-tags-from-contexts.pl @@ -0,0 +1,53 @@ +#!/usr/bin/perl -w +use strict; + +use Getopt::Long "GetOptions"; + +my $PHRASE = 'tok'; +my $CONTEXT = 'tag'; + +die "Usage: $0 [--phrase=tok|tag] [--context=tok|tag] < corpus" + unless &GetOptions('phrase=s' => \$PHRASE, 'context=s' => \$CONTEXT); + +my $lno = 0; +while(my $line = <>) { + $lno++; + chomp $line; + my @top = split /\t/, $line; + die unless (scalar @top == 2); + + my @pwords = split /\s+/, $top[0]; + foreach my $token (@pwords) { + #print $token . "\n"; + my @parts = split /_(?!.*_)/, $token; + die unless (scalar @parts == 2); + if ($PHRASE eq "tok") { + $token = $parts[0] + } elsif ($PHRASE eq "tag") { + $token = $parts[1] + } + } + + my @fields = split / \|\|\| /, $top[1]; + foreach my $i (0..((scalar @fields) / 2 - 1)) { + #print $i . ": " . $fields[2*$i] . " of " . (scalar @fields) . "\n"; + my @cwords = split /\s+/, $fields[2*$i]; + foreach my $token (@cwords) { + #print $i . ": " . $token . 
"\n"; + my @parts = split /_(?!.*_)/, $token; + if (scalar @parts == 2) { + if ($CONTEXT eq "tok") { + $token = $parts[0] + } elsif ($CONTEXT eq "tag") { + $token = $parts[1] + } + } + } + $fields[2*$i] = join ' ', @cwords; + } + + print join ' ', @pwords; + print "\t"; + print join ' ||| ', @fields; + print "\n"; +} diff --git a/gi/pipeline/scripts/remove-tags-from-corpus.pl b/gi/pipeline/scripts/remove-tags-from-corpus.pl index 5460db95..be3e97c0 100755 --- a/gi/pipeline/scripts/remove-tags-from-corpus.pl +++ b/gi/pipeline/scripts/remove-tags-from-corpus.pl @@ -3,51 +3,42 @@ use strict; use Getopt::Long "GetOptions"; -my $PHRASE = 'tok'; -my $CONTEXT = 'tag'; - -die "Usage: $0 [--phrase=tok|tag] [--context=tok|tag] < corpus" - unless &GetOptions('phrase=s' => \$PHRASE, 'context=s' => \$CONTEXT); +my $LANGUAGE = shift @ARGV; +$LANGUAGE = 'target' unless ($LANGUAGE); my $lno = 0; while(my $line = <>) { $lno++; chomp $line; - my @top = split /\t/, $line; - die unless (scalar @top == 2); - my @pwords = split /\s+/, $top[0]; - foreach my $token (@pwords) { - #print $token . "\n"; - my @parts = split /_(?!_)/, $token; - die unless (scalar @parts == 2); - if ($PHRASE eq "tok") { - $token = $parts[0] - } elsif ($PHRASE eq "tag") { - $token = $parts[1] + my @fields = split / \|\|\| /, $line; + + if ($LANGUAGE eq "source" or $LANGUAGE eq "both") { + my @cwords = split /\s+/, $fields[0]; + foreach my $token (@cwords) { + my @parts = split /_(?!.*_)/, $token; + if (scalar @parts == 2) { + $token = $parts[0] + } else { + print STDERR "WARNING: invalid tagged token $token\n"; + } } + $fields[0] = join ' ', @cwords; } - my @fields = split / \|\|\| /, $top[1]; - foreach my $i (0..((scalar @fields) / 2 - 1)) { - #print $i . ": " . $fields[2*$i] . " of " . (scalar @fields) . "\n"; - my @cwords = split /\s+/, $fields[2*$i]; + if ($LANGUAGE eq "target" or $LANGUAGE eq "both") { + my @cwords = split /\s+/, $fields[1]; foreach my $token (@cwords) { - #print $i . ": " . $token . 
"\n"; - my @parts = split /_/, $token; + my @parts = split /_(?!.*_)/, $token; if (scalar @parts == 2) { - if ($CONTEXT eq "tok") { - $token = $parts[0] - } elsif ($CONTEXT eq "tag") { - $token = $parts[1] - } + $token = $parts[1] + } else { + print STDERR "WARNING: invalid tagged token $token\n"; } } - $fields[2*$i] = join ' ', @cwords; + $fields[0] = join ' ', @cwords; } - print join ' ', @pwords; - print "\t"; print join ' ||| ', @fields; print "\n"; } -- cgit v1.2.3