diff options
author | redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-07-09 18:54:51 +0000 |
---|---|---|
committer | redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-07-09 18:54:51 +0000 |
commit | 48ab19ad0dd6abb50b8cef7a7b91719473fa01f2 (patch) | |
tree | 975b2456530045ea2f59acc553077bc9b73c9a3c /gi/pipeline/scripts | |
parent | a71041194a4420164145c01e4c8973e30319b574 (diff) |
support doing clustering on tagged corpora
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@208 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'gi/pipeline/scripts')
-rwxr-xr-x | gi/pipeline/scripts/patch-corpus.pl | 34 |
1 files changed, 34 insertions, 0 deletions
#!/usr/bin/perl -w
#
# patch-corpus.pl (gi/pipeline/scripts)
#
# Usage: patch-corpus.pl tagged.en[_fr] < lexical.en_fr_al[_...]
#
# Reads a "tagged" corpus file (one sentence per line) and, in lock-step, a
# lexical corpus from <> (STDIN, or any remaining file arguments) whose lines
# are columns separated by ' ||| '.  For every line pair, the second column of
# the lexical line is replaced by the tagged line and the result is printed
# to STDOUT.
#
# The script dies when:
#   * no tagged corpus is given, it cannot be opened, or it is empty;
#   * the first tagged line contains ' ||| ' (multi-column tagged corpora are
#     not supported yet);
#   * the lexical corpus has fewer lines than the tagged corpus;
#   * a lexical line has no second column;
#   * the tagged line and the column it replaces differ in token count.
# Extra trailing lines in the lexical corpus only produce a warning.
#
# NOTE(review): the patched column is hard-coded to index 1.  The usage
# string suggests a tagged *en* file and an en_fr_al corpus, which would make
# index 1 the "fr" side -- confirm against the caller which column is meant.
use strict;

my $PATCH = shift @ARGV;
die "Usage: $0 tagged.en[_fr] < lexical.en_fr_al[_...]\n"
    unless (defined $PATCH && length $PATCH);

# Three-argument open with a lexical handle: the file name can never be
# misread as an open mode or a pipe command.
open my $patch_fh, '<', $PATCH
    or die "Can't read tagged corpus $PATCH: $!";

# Only the first line is inspected to decide which layout the tagged corpus
# uses; it is then reused as the first record below, so the file is read once.
my $first = <$patch_fh>;
die "Tagged corpus $PATCH is empty!\n" unless defined $first;

my @fields = split / \|\|\| /, $first;
die "Bad format!" if (scalar @fields > 2);

if (scalar @fields != 1) {
    # TODO support this
    die "Patching source and target not supported yet!";
}

my $line_no = 0;    # 1-based number of the current line pair, for diagnostics
for (my $pline = $first; defined $pline; $pline = <$patch_fh>) {
    chomp $pline;
    $line_no++;

    # Test definedness, not truth: a final line "0" without a newline is data.
    my $lexical = <>;
    die "Too few lines in lexical corpus!" unless defined $lexical;
    chomp $lexical;

    # Limit -1 keeps trailing empty columns so they survive the re-join.
    @fields = split / \|\|\| /, $lexical, -1;
    die "No second field to patch in line $line_no of lexical corpus!\n"
        unless defined $fields[1];

    # split ' ' ignores leading whitespace, so only real tokens are counted.
    my @pwords = split ' ', $pline;
    my @lwords = split ' ', $fields[1];
    die "Length mismatch in line $line_no!\n"
        unless (scalar @pwords == scalar @lwords);

    $fields[1] = $pline;
    print join(' ||| ', @fields), "\n";
}
close $patch_fh;

# Leftover lexical lines mean the two corpora are not aligned.  All output
# has already been written, so report it without changing the exit status.
if (defined(my $extra = <>)) {
    warn "Lexical corpus has more lines than tagged corpus $PATCH!\n";
}