diff options
author | trevor.cohn <trevor.cohn@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-07-23 16:39:41 +0000 |
---|---|---|
committer | trevor.cohn <trevor.cohn@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-07-23 16:39:41 +0000 |
commit | 7d0cad292c444baddd70c3b76540304364d454d9 (patch) | |
tree | b93b34d81dc3681a401ff811be61cca218d9a8eb /gi/pipeline/scripts/remove-tags-from-corpus.pl | |
parent | e0bca5fea3b0267819186d0fc34c036e6b77679c (diff) |
Pipeline code for running with mixing tokens and tags in the clustering.
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@377 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'gi/pipeline/scripts/remove-tags-from-corpus.pl')
-rwxr-xr-x | gi/pipeline/scripts/remove-tags-from-corpus.pl | 51 |
1 files changed, 21 insertions, 30 deletions
diff --git a/gi/pipeline/scripts/remove-tags-from-corpus.pl b/gi/pipeline/scripts/remove-tags-from-corpus.pl
index 5460db95..be3e97c0 100755
--- a/gi/pipeline/scripts/remove-tags-from-corpus.pl
+++ b/gi/pipeline/scripts/remove-tags-from-corpus.pl
@@ -3,51 +3,42 @@
 use strict;
 use Getopt::Long "GetOptions";
 
-my $PHRASE = 'tok';
-my $CONTEXT = 'tag';
-
-die "Usage: $0 [--phrase=tok|tag] [--context=tok|tag] < corpus"
-  unless &GetOptions('phrase=s' => \$PHRASE, 'context=s' => \$CONTEXT);
+my $LANGUAGE = shift @ARGV;
+$LANGUAGE = 'target' unless ($LANGUAGE);
 
 my $lno = 0;
 while(my $line = <>) {
   $lno++;
   chomp $line;
 
-  my @top = split /\t/, $line;
-  die unless (scalar @top == 2);
-  my @pwords = split /\s+/, $top[0];
-  foreach my $token (@pwords) {
-    #print $token . "\n";
-    my @parts = split /_(?!_)/, $token;
-    die unless (scalar @parts == 2);
-    if ($PHRASE eq "tok") {
-      $token = $parts[0]
-    } elsif ($PHRASE eq "tag") {
-      $token = $parts[1]
+  my @fields = split / \|\|\| /, $line;
+
+  if ($LANGUAGE eq "source" or $LANGUAGE eq "both") {
+    my @cwords = split /\s+/, $fields[0];
+    foreach my $token (@cwords) {
+      my @parts = split /_(?!.*_)/, $token;
+      if (scalar @parts == 2) {
+        $token = $parts[0]
+      } else {
+        print STDERR "WARNING: invalid tagged token $token\n";
+      }
     }
+    $fields[0] = join ' ', @cwords;
   }
 
-  my @fields = split / \|\|\| /, $top[1];
-  foreach my $i (0..((scalar @fields) / 2 - 1)) {
-    #print $i . ": " . $fields[2*$i] . " of " . (scalar @fields) . "\n";
-    my @cwords = split /\s+/, $fields[2*$i];
+  if ($LANGUAGE eq "target" or $LANGUAGE eq "both") {
+    my @cwords = split /\s+/, $fields[1];
     foreach my $token (@cwords) {
-      #print $i . ": " . $token . "\n";
-      my @parts = split /_/, $token;
+      my @parts = split /_(?!.*_)/, $token;
       if (scalar @parts == 2) {
-        if ($CONTEXT eq "tok") {
-          $token = $parts[0]
-        } elsif ($CONTEXT eq "tag") {
-          $token = $parts[1]
-        }
+        $token = $parts[1]
+      } else {
+        print STDERR "WARNING: invalid tagged token $token\n";
       }
     }
-    $fields[2*$i] = join ' ', @cwords;
+    $fields[0] = join ' ', @cwords;
   }
 
-  print join ' ', @pwords;
-  print "\t";
   print join ' ||| ', @fields;
   print "\n";
 }