diff options
author | redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-07-05 16:34:30 +0000 |
---|---|---|
committer | redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-07-05 16:34:30 +0000 |
commit | 72b3810e23fb6cf8edf55da868aa613b0200ecdb (patch) | |
tree | a276ee4a4a0344bad62477e68fd95e9fe34fddba /gi | |
parent | d80bc8cbadd46d202bbdf0df0d29ecd0ae819698 (diff) |
add xfeats featurizer
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@130 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'gi')
-rwxr-xr-x | gi/pipeline/scripts/xfeats.pl | 36 |
1 files changed, 36 insertions, 0 deletions
#!/usr/bin/perl -w
# xfeats.pl (gi/pipeline/scripts/xfeats.pl)
#
# Copies the EGivenF / FGivenE feature values found in a single-category
# ([X]) grammar onto the matching rules of a second grammar read from
# STDIN (or from any further files named on the command line), renaming
# them to X_EGivenF / X_FGivenE.
#
# Usage: xfeats.pl x-grammar.scfg < cat-grammar.scfg
#
# Both inputs hold one rule per line with fields separated by " ||| ":
#   lhs ||| source ||| target ||| features
# Each input rule is echoed to STDOUT with the X_ features appended after
# a single space.  Progress messages go to STDERR.  The script dies if an
# x-grammar rule lacks either feature, or if an input rule has no
# counterpart in the x-grammar.
use strict;
use warnings;

die "Usage: $0 x-grammar.scfg < cat-grammar.scfg\n" unless scalar @ARGV > 0;

my $xgrammar = shift @ARGV;

# Three-argument open with a lexical handle: the file name can never be
# interpreted as an open mode or a pipe.
open my $xgrammar_fh, '<', $xgrammar or die "Can't read $xgrammar: $!";
print STDERR "Reading X-feats from $xgrammar...\n";

# Maps "lhs ||| source ||| target" of every x-grammar rule to its
# "X_EGivenF=... X_FGivenE=..." feature string.
my %xfeats_for;
while (my $line = <$xgrammar_fh>) {
    chomp $line;
    my ($lhs, $f, $e, $feats) = split / \|\|\| /, $line;

    # A line with fewer than four fields has no feature column; treat it
    # as "features not found" rather than tripping over undef below.
    $feats = '' unless defined $feats;

    # Anchor on start-of-field or a preceding space so that a longer
    # feature name that merely ends in "EGivenF" / "FGivenE" cannot be
    # mistaken for the feature itself.
    my ($e_given_f) = $feats =~ /(?:^| )(EGivenF=[^ ]+)/;
    my ($f_given_e) = $feats =~ /(?:^| )(FGivenE=[^ ]+)/;
    die "EGivenF and FGivenE features not found: $line"
        unless defined $e_given_f && defined $f_given_e;

    $xfeats_for{"$lhs ||| $f ||| $e"} = "X_$e_given_f X_$f_given_e";
}
close $xgrammar_fh;

print STDERR "Add features...\n";
while (my $line = <>) {
    chomp $line;

    # Only the source and target sides are needed; the rule's own
    # left-hand side is replaced by [X] for the lookup.
    my (undef, $f, $e) = split / \|\|\| /, $line;
    die "Can't find x features for: $line\n"
        unless defined $f && defined $e;

    # Rewrite indexed non-terminals on the source side, e.g. [NP,1],
    # to [X,1] so the key matches the x-grammar.  The target side is
    # used unchanged.
    $f =~ s/\[[^\]]+,([12])\]/\[X,$1\]/g;

    my $xfeats = $xfeats_for{"[X] ||| $f ||| $e"};
    die "Can't find x features for: $line\n" unless defined $xfeats;
    print "$line $xfeats\n";
}