From 863c1e258f1da790456b166ddedb1ce61f614d4b Mon Sep 17 00:00:00 2001
From: redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f>
Date: Mon, 5 Jul 2010 16:34:30 +0000
Subject: add xfeats featurizer

git-svn-id: https://ws10smt.googlecode.com/svn/trunk@130 ec762483-ff6d-05da-a07a-a48fb63a330f
---
 gi/pipeline/scripts/xfeats.pl | 36 ++++++++++++++++++++++++++++++++++++
 1 file changed, 36 insertions(+)
 create mode 100755 gi/pipeline/scripts/xfeats.pl

(limited to 'gi/pipeline')

diff --git a/gi/pipeline/scripts/xfeats.pl b/gi/pipeline/scripts/xfeats.pl
new file mode 100755
index 00000000..bdb9224c
--- /dev/null
+++ b/gi/pipeline/scripts/xfeats.pl
@@ -0,0 +1,36 @@
+#!/usr/bin/perl -w
+use strict;
+# Append the X-grammar's EGivenF/FGivenE features (prefixed with X_) to every
+# rule line read via <> (STDIN, or any files still left in @ARGV).
+die "Usage: $0 x-grammar.scfg < cat-grammar.scfg\n" unless scalar @ARGV > 0;
+
+my $xgrammar = shift @ARGV;
+# Three-arg open + lexical handle: the path is never parsed as an open mode.
+open my $xg_fh, '<', $xgrammar or die "Can't read $xgrammar: $!";
+print STDERR "Reading X-feats from $xgrammar...\n";
+my %dict;    # "lhs ||| f ||| e" => "X_EGivenF=... X_FGivenE=..."
+while (<$xg_fh>) {
+  chomp;
+  my ($lhs, $f, $e, $feats) = split / \|\|\| /;
+  $feats = '' unless defined $feats;    # short line: handled by the die below
+  my ($egf) = $feats =~ /(EGivenF=[^ ]+)(?: |$)/;
+  my ($fge) = $feats =~ /(FGivenE=[^ ]+)(?: |$)/;
+  # Both features are required; a rule lacking either is a fatal input error.
+  die "EGivenF and FGivenE features not found: $_" unless $egf && $fge;
+  $dict{"$lhs ||| $f ||| $e"} = "X_$egf X_$fge";
+}
+close $xg_fh;
+
+print STDERR "Add features...\n";
+while (<>) {
+  chomp;
+  my ($lhs, $f, $e) = split / \|\|\| /;
+  die "Can't find x features for: $_\n" unless defined $f && defined $e;
+  # The lookup key uses LHS [X] and [X,1]/[X,2] on the source side, so rewrite
+  # [CAT,1]/[CAT,2] in $f only; the input line itself is printed unchanged.
+  $f =~ s/\[[^]]+,([12])\]/\[X,$1\]/g;
+  my $xfeats = $dict{"[X] ||| $f ||| $e"};
+  die "Can't find x features for: $_\n" unless defined $xfeats;
+  print "$_ $xfeats\n";
+}
+
-- 
cgit v1.2.3