From 6206d7a1638bbecbb2bb22754d1ce1217819be86 Mon Sep 17 00:00:00 2001 From: redpony Date: Tue, 6 Jul 2010 13:42:42 +0000 Subject: handle more features git-svn-id: https://ws10smt.googlecode.com/svn/trunk@148 ec762483-ff6d-05da-a07a-a48fb63a330f --- gi/pipeline/scripts/xfeats.pl | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/gi/pipeline/scripts/xfeats.pl b/gi/pipeline/scripts/xfeats.pl index bdb9224c..dc578513 100755 --- a/gi/pipeline/scripts/xfeats.pl +++ b/gi/pipeline/scripts/xfeats.pl @@ -1,28 +1,31 @@ #!/usr/bin/perl -w use strict; -die "Usage: $0 x-grammar.scfg < cat-grammar.scfg\n" unless scalar @ARGV > 0; +die "Usage: $0 x-grammar.scfg[.gz] < cat-grammar.scfg\n" unless scalar @ARGV > 0; my $xgrammar = shift @ARGV; -open F, "<$xgrammar" or die "Can't read $xgrammar: $!"; +die "Can't find $xgrammar" unless -f $xgrammar; +my $fh; +if ($xgrammar =~ /\.gz$/) { + open $fh, "gunzip -c $xgrammar|" or die "Can't fork: $!"; +} else { + open $fh, "<$xgrammar" or die "Can't read $xgrammar: $!"; +} print STDERR "Reading X-feats from $xgrammar...\n"; my %dict; -while(<F>) { +while(<$fh>) { chomp; my ($lhs, $f, $e, $feats) = split / \|\|\| /; my $xfeats; my $cc = 0; - if ($feats =~ /(EGivenF=[^ ]+)( |$)/) { - $xfeats = "X_$1"; $cc++; - } - if ($feats =~ /(FGivenE=[^ ]+)( |$)/) { - $xfeats = "$xfeats X_$1"; $cc++; + my @xfeats = (); + while ($feats =~ /(EGivenF|FGivenE|LogRuleCount|LogECount|LogFCount|SingletonRule|SingletonE|SingletonF)=([^ ]+)( |$)/og) { + push @xfeats, "X_$1=$2"; } - die "EGivenF and FGivenE features not found: $_" unless $cc == 2; - #print "$lhs ||| $f ||| $e ||| $xfeats\n"; - $dict{"$lhs ||| $f ||| $e"} = $xfeats; + #print "$lhs ||| $f ||| $e ||| @xfeats\n"; + $dict{"$lhs ||| $f ||| $e"} = "@xfeats"; } -close F; +close $fh; print STDERR "Add features...\n"; while(<>) { -- cgit v1.2.3