From 9c9213239263e8e8de2f154068cc3ad44e0c2100 Mon Sep 17 00:00:00 2001
From: Victor Chahuneau
Date: Tue, 14 Aug 2012 22:50:37 -0400
Subject: [cdec.sa] Explicit feature names in grammar extractor output

+ sparse features in extractor
+ hg.intersect(string)
+ basestring = str|unicode
---
 python/pkg/cdec/sa/__init__.py  | 2 +-
 python/pkg/cdec/sa/extractor.py | 8 ++++----
 python/pkg/cdec/sa/features.py  | 8 ++++----
 3 files changed, 9 insertions(+), 9 deletions(-)

(limited to 'python/pkg')

diff --git a/python/pkg/cdec/sa/__init__.py b/python/pkg/cdec/sa/__init__.py
index fd4a4148..ab8be809 100644
--- a/python/pkg/cdec/sa/__init__.py
+++ b/python/pkg/cdec/sa/__init__.py
@@ -1,4 +1,4 @@
 from cdec.sa._sa import sym_fromstring,\
         SuffixArray, DataArray, LCP, Precomputation, Alignment, BiLex,\
-        HieroCachingRuleFactory, Sampler
+        HieroCachingRuleFactory, Sampler, Scorer
 from cdec.sa.extractor import GrammarExtractor
diff --git a/python/pkg/cdec/sa/extractor.py b/python/pkg/cdec/sa/extractor.py
index bb912e16..90cc4c51 100644
--- a/python/pkg/cdec/sa/extractor.py
+++ b/python/pkg/cdec/sa/extractor.py
@@ -57,8 +57,8 @@ class GrammarExtractor:
         # lexical weighting tables
         tt = cdec.sa.BiLex(from_binary=config['lex_file'])
 
-        self.models = (EgivenFCoherent, SampleCountF, CountEF,
-            MaxLexFgivenE(tt), MaxLexEgivenF(tt), IsSingletonF, IsSingletonFE)
+        scorer = cdec.sa.Scorer(EgivenFCoherent, SampleCountF, CountEF,
+            MaxLexFgivenE(tt), MaxLexEgivenF(tt), IsSingletonF, IsSingletonFE)
 
         fsarray = cdec.sa.SuffixArray(from_binary=config['f_sa_file'])
         edarray = cdec.sa.DataArray(from_binary=config['e_file'])
@@ -67,7 +67,7 @@ class GrammarExtractor:
         # -1 = don't sample, use all data (VERY SLOW!)
         sampler = cdec.sa.Sampler(300, fsarray)
 
-        self.factory.configure(fsarray, edarray, sampler)
+        self.factory.configure(fsarray, edarray, sampler, scorer)
 
     def grammar(self, sentence):
         if isinstance(sentence, unicode):
@@ -75,4 +75,4 @@ class GrammarExtractor:
         cnet = chain(('<s>',), sentence.split(), ('</s>',))
         cnet = (cdec.sa.sym_fromstring(word, terminal=True) for word in cnet)
         cnet = tuple(((word, None, 1), ) for word in cnet)
-        return self.factory.input(cnet, self.models)
+        return self.factory.input(cnet)
diff --git a/python/pkg/cdec/sa/features.py b/python/pkg/cdec/sa/features.py
index 325b9e13..8fd370cc 100644
--- a/python/pkg/cdec/sa/features.py
+++ b/python/pkg/cdec/sa/features.py
@@ -20,7 +20,7 @@ def CoherenceProb(fphrase, ephrase, paircount, fcount, fsample_count):
     return -math.log10(fcount/fsample_count)
 
 def MaxLexEgivenF(ttable):
-    def feature(fphrase, ephrase, paircount, fcount, fsample_count):
+    def MaxLexEgivenF(fphrase, ephrase, paircount, fcount, fsample_count):
         fwords = fphrase.words
         fwords.append('NULL')
         def score():
@@ -28,10 +28,10 @@ def MaxLexEgivenF(ttable):
                 maxScore = max(ttable.get_score(f, e, 0) for f in fwords)
                 yield -math.log10(maxScore) if maxScore > 0 else MAXSCORE
         return sum(score())
-    return feature
+    return MaxLexEgivenF
 
 def MaxLexFgivenE(ttable):
-    def feature(fphrase, ephrase, paircount, fcount, fsample_count):
+    def MaxLexFgivenE(fphrase, ephrase, paircount, fcount, fsample_count):
         ewords = ephrase.words
         ewords.append('NULL')
         def score():
@@ -39,7 +39,7 @@ def MaxLexFgivenE(ttable):
                 maxScore = max(ttable.get_score(f, e, 1) for e in ewords)
                 yield -math.log10(maxScore) if maxScore > 0 else MAXSCORE
         return sum(score())
-    return feature
+    return MaxLexFgivenE
 
 def IsSingletonF(fphrase, ephrase, paircount, fcount, fsample_count):
     return (fcount == 1)
--
cgit v1.2.3
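
Caller-side note: a minimal, hypothetical usage sketch, not part of the patch. The public entry point, cdec.sa.GrammarExtractor, is unchanged by this commit because the Scorer is built and wired to the rule factory inside __init__ (see extractor.py above). The configuration path and example sentence below are placeholders, and the exact form of configuration the constructor accepts is an assumption rather than something shown in this diff.

    import cdec.sa

    # 'extract.ini' stands in for the extractor configuration (lex_file,
    # f_sa_file, e_file and the other resources produced by the sa pipeline);
    # the precise argument type GrammarExtractor expects is assumed here.
    extractor = cdec.sa.GrammarExtractor('extract.ini')

    # With this patch, rules are scored by the cdec.sa.Scorer created in
    # __init__, and per the commit subject their feature values appear under
    # explicit names (EgivenFCoherent, CountEF, MaxLexFgivenE, ...) rather
    # than as anonymous positional scores.
    print(extractor.grammar(u'le chat mange du poisson'))  # placeholder sentence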