#!/usr/bin/perl
#
# remove-tags-from-corpus.pl
#
# Reads a tagged phrase/context corpus (STDIN or files named on the command
# line) and, for every "word_TAG" token, keeps either the word or the tag.
# The result is written to STDOUT.
#
# Each input line must consist of exactly two tab-separated columns:
#
#     PHRASE <TAB> CONTEXT ||| FIELD ||| CONTEXT ||| FIELD ...
#
# The second column is split on " ||| ".  Even-indexed fields (0, 2, 4, ...)
# are treated as contexts and rewritten; odd-indexed fields are copied
# through untouched (presumably counts -- confirm against the producer).
#
# Options:
#   --phrase=tok|tag    part kept for tokens of the phrase column (default tok)
#   --context=tok|tag   part kept for tokens of the contexts      (default tag)
#
# Provenance: ws10smt svn trunk r369, trevor.cohn, 2010-07-22.  Commit note:
# "Fixed filename clash error when running with tagged corpus and source and
# target languages".

use strict;
use warnings;

use Getopt::Long "GetOptions";

my $PHRASE  = 'tok';
my $CONTEXT = 'tag';

# Which element of a split "word_TAG" pair each mode keeps.
my %PART_INDEX = (tok => 0, tag => 1);

# Returns the usage text (no trailing newline, so die appends the location
# exactly as the original one-line die did).
sub usage {
    return "Usage: $0 [--phrase=tok|tag] [--context=tok|tag] < corpus";
}

# Maps a mode name ('tok' or 'tag') to the index of the part to keep.
# Dies on any other value instead of silently leaving tokens tagged.
sub _part_index {
    my ($mode) = @_;
    my $shown = defined $mode ? $mode : 'undef';
    die "unknown mode '$shown' (expected tok or tag)\n"
        unless defined $mode && exists $PART_INDEX{$mode};
    return $PART_INDEX{$mode};
}

# Rewrites the phrase column.
#   $text - whitespace-separated tokens, each required to be word_TAG
#   $mode - 'tok' or 'tag'
#   $lno  - input line number, used only in error messages
# Returns the kept parts joined by single spaces.
# Dies if any token does not split into exactly two parts.
sub strip_phrase {
    my ($text, $mode, $lno) = @_;
    $lno = '?' unless defined $lno;
    my $keep = _part_index($mode);

    my @words = split /\s+/, $text;
    for my $token (@words) {
        # Split on an underscore that is not followed by another underscore,
        # so a literal "_" word tagged as "__TAG" yields ("_", "TAG").
        my @parts = split /_(?!_)/, $token;
        die "line $lno: phrase token '$token' is not of the form word_TAG\n"
            unless @parts == 2;
        $token = $parts[$keep];
    }
    return join ' ', @words;
}

# Rewrites one context field.
#   $text - whitespace-separated tokens
#   $mode - 'tok' or 'tag'
# Tokens that do not split into exactly two parts on "_" (e.g. untagged
# markers) are passed through unchanged.  Returns the tokens joined by
# single spaces.
sub strip_context {
    my ($text, $mode) = @_;
    my $keep = _part_index($mode);

    my @words = split /\s+/, $text;
    for my $token (@words) {
        my @parts = split /_/, $token;
        $token = $parts[$keep] if @parts == 2;
    }
    return join ' ', @words;
}

# Transforms one chomped corpus line.
#   $line         - the input record (no trailing newline)
#   $phrase_mode  - 'tok' or 'tag' for the phrase column
#   $context_mode - 'tok' or 'tag' for the context fields
#   $lno          - input line number, used only in error messages
# Returns the rewritten record without a trailing newline.
# Dies if the line does not have exactly two tab-separated columns or if a
# phrase token is malformed.
sub process_line {
    my ($line, $phrase_mode, $context_mode, $lno) = @_;
    $lno = '?' unless defined $lno;

    my @top = split /\t/, $line;
    die "line $lno: expected 2 tab-separated columns, found "
        . scalar(@top) . "\n"
        unless @top == 2;

    my $phrase = strip_phrase($top[0], $phrase_mode, $lno);

    my @fields = split / \|\|\| /, $top[1];
    # Step over the even-indexed (context) fields explicitly rather than
    # relying on the range operator truncating a fractional upper bound.
    for (my $i = 0; $i < @fields; $i += 2) {
        $fields[$i] = strip_context($fields[$i], $context_mode);
    }

    return $phrase . "\t" . join(' ||| ', @fields);
}

# Entry point: parse options, then stream the corpus line by line.
sub main {
    die usage()
        unless GetOptions('phrase=s' => \$PHRASE, 'context=s' => \$CONTEXT);

    # Reject option values the rewriting code would otherwise ignore.
    for my $mode ($PHRASE, $CONTEXT) {
        die usage() unless exists $PART_INDEX{$mode};
    }

    my $lno = 0;
    while (my $line = <>) {
        $lno++;
        chomp $line;
        print process_line($line, $PHRASE, $CONTEXT, $lno), "\n";
    }
    return;
}

# Run only when executed as a script, so the helpers can be loaded and
# tested without consuming STDIN.
main() unless caller;