summary refs log tree commit diff
path: root/gi/pipeline
diff options
context:
space:
mode:
author redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-07-05 16:34:30 +0000
committer redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-07-05 16:34:30 +0000
commit 72b3810e23fb6cf8edf55da868aa613b0200ecdb (patch)
tree a276ee4a4a0344bad62477e68fd95e9fe34fddba /gi/pipeline
parent d80bc8cbadd46d202bbdf0df0d29ecd0ae819698 (diff)
add xfeats featurizer
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@130 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'gi/pipeline')
-rwxr-xr-x gi/pipeline/scripts/xfeats.pl 36
1 files changed, 36 insertions, 0 deletions
diff --git a/gi/pipeline/scripts/xfeats.pl b/gi/pipeline/scripts/xfeats.pl
new file mode 100755
index 00000000..bdb9224c
--- /dev/null
+++ b/gi/pipeline/scripts/xfeats.pl
@@ -0,0 +1,36 @@
+#!/usr/bin/perl -w
+# Looks up each input rule (read via <>) in an [X] grammar and appends that
+# grammar's EGivenF/FGivenE features to it, renamed X_EGivenF/X_FGivenE.
+use strict;
+
+die "Usage: $0 x-grammar.scfg < cat-grammar.scfg\n" unless scalar @ARGV > 0;
+
+my $xgrammar = shift @ARGV;
+# Three-arg open + lexical handle: the file name is never parsed as a mode.
+open my $xfh, '<', $xgrammar or die "Can't read $xgrammar: $!";
+print STDERR "Reading X-feats from $xgrammar...\n";
+my %dict;    # "lhs ||| f ||| e" => "X_EGivenF=... X_FGivenE=..."
+while (my $rule = <$xfh>) {
+  chomp $rule;
+  my ($lhs, $f, $e, $feats) = split / \|\|\| /, $rule;
+  $feats = '' unless defined $feats;    # short line: fail below, no warnings
+  my @xfeats;
+  for my $name (qw(EGivenF FGivenE)) {
+    push @xfeats, "X_$1" if $feats =~ /($name=[^ ]+)( |$)/;
+  }
+  die "EGivenF and FGivenE features not found: $rule" unless @xfeats == 2;
+  $dict{"$lhs ||| $f ||| $e"} = "@xfeats";
+}
+close $xfh;
+
+print STDERR "Add features...\n";
+while (<>) {
+  chomp;
+  my ($lhs, $f, $e) = split / \|\|\| /;
+  # Map labelled non-terminals such as [NP,1] to [X,1] to match the X keys.
+  $f =~ s/\[[^]]+,([12])\]/\[X,$1\]/g;
+  my $xfeats = $dict{"[X] ||| $f ||| $e"};
+  die "Can't find x features for: $_\n" unless $xfeats;
+  print "$_ $xfeats\n";
+}
+