Pipeline code for running the clustering with a mix of tokens and tags.

git-svn-id: https://ws10smt.googlecode.com/svn/trunk@377 ec762483-ff6d-05da-a07a-a48fb63a330f
author: trevor.cohn <trevor.cohn@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-07-23 16:39:41 +0000
committer: trevor.cohn <trevor.cohn@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-07-23 16:39:41 +0000
commit: 7d0cad292c444baddd70c3b76540304364d454d9 (patch)
tree: b93b34d81dc3681a401ff811be61cca218d9a8eb /gi/pipeline/scripts/remove-tags-from-corpus.pl
parent: e0bca5fea3b0267819186d0fc34c036e6b77679c (diff)
1 files changed, 21 insertions, 30 deletions
diff --git a/gi/pipeline/scripts/remove-tags-from-corpus.pl b/gi/pipeline/scripts/remove-tags-from-corpus.pl
index 5460db95..be3e97c0 100755
--- a/gi/pipeline/scripts/remove-tags-from-corpus.pl
+++ b/gi/pipeline/scripts/remove-tags-from-corpus.pl
@@ -3,51 +3,42 @@ use strict;
 
 use Getopt::Long "GetOptions";
 
-my $PHRASE = 'tok';
-my $CONTEXT = 'tag';
-
-die "Usage: $0 [--phrase=tok|tag] [--context=tok|tag] < corpus" 
-    unless &GetOptions('phrase=s' => \$PHRASE, 'context=s' => \$CONTEXT);
+my $LANGUAGE = shift @ARGV;
+$LANGUAGE = 'target' unless ($LANGUAGE);
 
 my $lno = 0;
 while(my $line = <>) {
     $lno++;
     chomp $line;
-    my @top = split /\t/, $line;
-    die unless (scalar @top == 2); 
 
-    my @pwords = split /\s+/, $top[0];
-    foreach my $token (@pwords) {
-        #print $token . "\n";
-        my @parts = split /_(?!_)/, $token;
-        die unless (scalar @parts == 2); 
-        if ($PHRASE eq "tok") {
-            $token = $parts[0]
-        } elsif ($PHRASE eq "tag") {
-            $token = $parts[1]
+    my @fields = split / \|\|\| /, $line;
+
+    if ($LANGUAGE eq "source" or $LANGUAGE eq "both") {
+        my @cwords = split /\s+/, $fields[0];
+        foreach my $token (@cwords) {
+            my @parts = split /_(?!.*_)/, $token;
+            if (scalar @parts == 2) {
+                $token = $parts[0]
+            } else {
+                print STDERR "WARNING: invalid tagged token $token\n";
+            }
         }
+        $fields[0] = join ' ', @cwords;
     }
 
-    my @fields = split / \|\|\| /, $top[1];
-    foreach my $i (0..((scalar @fields) / 2 - 1)) {
-        #print $i . ": " . $fields[2*$i] . " of " . (scalar @fields) . "\n";
-        my @cwords = split /\s+/, $fields[2*$i];
+    if ($LANGUAGE eq "target" or $LANGUAGE eq "both") {
+        my @cwords = split /\s+/, $fields[1];
         foreach my $token (@cwords) {
-            #print $i . ": " . $token . "\n";
-            my @parts = split /_/, $token;
+            my @parts = split /_(?!.*_)/, $token;  # split on the last underscore only
             if (scalar @parts == 2) {
-                if ($CONTEXT eq "tok") {
-                    $token = $parts[0]
-                } elsif ($CONTEXT eq "tag") {
-                    $token = $parts[1]
-                }
+                $token = $parts[1]  # NOTE(review): keeps the part after the underscore; the source branch keeps the part before it -- confirm intent
+            } else {
+                print STDERR "WARNING: invalid tagged token $token\n";
             }
         }
-        $fields[2*$i] = join ' ', @cwords;
+        $fields[1] = join ' ', @cwords;  # write back to the field that was read, not the source field
     }
 
-    print join ' ', @pwords;
-    print "\t";
     print join ' ||| ', @fields;
     print "\n";
 }
author	trevor.cohn <trevor.cohn@ec762483-ff6d-05da-a07a-a48fb63a330f>	2010-07-23 16:39:41 +0000
committer	trevor.cohn <trevor.cohn@ec762483-ff6d-05da-a07a-a48fb63a330f>	2010-07-23 16:39:41 +0000
commit	7d0cad292c444baddd70c3b76540304364d454d9 (patch)
tree	b93b34d81dc3681a401ff811be61cca218d9a8eb /gi/pipeline/scripts/remove-tags-from-corpus.pl
parent	e0bca5fea3b0267819186d0fc34c036e6b77679c (diff)