From 4e0d38e98a7ba9e0c746ce7dbb22135c89a999f7 Mon Sep 17 00:00:00 2001
From: "trevor.cohn" <trevor.cohn@ec762483-ff6d-05da-a07a-a48fb63a330f>
Date: Thu, 22 Jul 2010 20:55:29 +0000
Subject: Fixed filename clash error when running with tagged corpus and source
 and target languages

git-svn-id: https://ws10smt.googlecode.com/svn/trunk@369 ec762483-ff6d-05da-a07a-a48fb63a330f
---
 gi/pipeline/local-gi-pipeline.pl               |  4 +-
 gi/pipeline/scripts/remove-tags-from-corpus.pl | 53 ++++++++++++++++++++++++++
 2 files changed, 55 insertions(+), 2 deletions(-)
 create mode 100755 gi/pipeline/scripts/remove-tags-from-corpus.pl

(limited to 'gi')

diff --git a/gi/pipeline/local-gi-pipeline.pl b/gi/pipeline/local-gi-pipeline.pl
index f72637af..46062c8d 100755
--- a/gi/pipeline/local-gi-pipeline.pl
+++ b/gi/pipeline/local-gi-pipeline.pl
@@ -94,9 +94,9 @@ $HIER_CAT = ( $NUM_TOPICS_COARSE ? 1 : 0 );
 
 print STDERR "   Output: $OUTPUT\n";
 my $DATA_DIR = $OUTPUT . '/corpora';
-my $LEX_NAME = 'corpus.f_e_a.lex';
+my $LEX_NAME = "corpus.f_e_a.$LANGUAGE.lex";  # double quotes so $LANGUAGE interpolates
 my $CORPUS_LEX = $DATA_DIR . '/' . $LEX_NAME;  # corpus used to extract rules
-my $CORPUS_CLUSTER = $DATA_DIR . '/corpus.f_e_a.cluster'; # corpus used for clustering (often identical)
+my $CORPUS_CLUSTER = $DATA_DIR . "/corpus.f_e_a.$LANGUAGE.cluster"; # corpus used for clustering (often identical)
 
 my $CONTEXT_DIR = $OUTPUT . '/' . context_dir();
 my $CLUSTER_DIR = $OUTPUT . '/' . cluster_dir();
diff --git a/gi/pipeline/scripts/remove-tags-from-corpus.pl b/gi/pipeline/scripts/remove-tags-from-corpus.pl
new file mode 100755
index 00000000..5460db95
--- /dev/null
+++ b/gi/pipeline/scripts/remove-tags-from-corpus.pl
@@ -0,0 +1,53 @@
+#!/usr/bin/perl -w
+use strict;
+
+use Getopt::Long "GetOptions";
+
+# Filter for a tagged corpus read from <>: each line is "phrase<TAB>contexts".
+# Every token_tag pair is reduced to its token ('tok') or its tag ('tag').
+my $PHRASE = 'tok';     # half kept for tokens in the phrase column
+my $CONTEXT = 'tag';    # half kept for tokens in the context fields
+
+die "Usage: $0 [--phrase=tok|tag] [--context=tok|tag] < corpus"
+    unless GetOptions('phrase=s' => \$PHRASE, 'context=s' => \$CONTEXT);
+
+# An underscore followed by another underscore belongs to the token itself,
+# so "__NN" splits into "_" and "NN". Used for phrase and context alike.
+my $TAG_SEP = qr/_(?!_)/;
+
+my $lno = 0;
+while(my $line = <>) {
+    $lno++;
+    chomp $line;
+    my @top = split /\t/, $line;
+    die "line $lno: expected 2 tab-separated columns, found " . scalar(@top) . "\n"
+        unless (scalar @top == 2);
+
+    # Phrase column: every token must be a well-formed token_tag pair.
+    my @pwords = split /\s+/, $top[0];
+    foreach my $token (@pwords) {
+        my @parts = split $TAG_SEP, $token;
+        die "line $lno: phrase token '$token' is not of the form token_tag\n"
+            unless (scalar @parts == 2);
+        if ($PHRASE eq "tok") {
+            $token = $parts[0]
+        } elsif ($PHRASE eq "tag") {
+            $token = $parts[1]
+        }
+    }
+
+    # Rewrite field 2*i (zero-based) for each i in the range; others pass through.
+    my @fields = split / \|\|\| /, $top[1];
+    foreach my $i (0..((scalar @fields) / 2 - 1)) {
+        my @cwords = split /\s+/, $fields[2*$i];
+        foreach my $token (@cwords) {
+            my @parts = split $TAG_SEP, $token;
+            next unless (scalar @parts == 2);   # not a token_tag pair: keep as is
+            $token = $parts[0] if ($CONTEXT eq "tok");
+            $token = $parts[1] if ($CONTEXT eq "tag");
+        }
+        $fields[2*$i] = join ' ', @cwords;
+    }
+
+    print join("\t", join(' ', @pwords), join(' ||| ', @fields)), "\n";
+}
-- 
cgit v1.2.3