summary | refs | log | tree | commit | diff
path: root/python/pkg/cdec/sa
diff options
context:
space:
mode:
author: Victor Chahuneau <vchahune@cs.cmu.edu>, 2012-08-14 22:50:37 -0400
committer: Victor Chahuneau <vchahune@cs.cmu.edu>, 2012-08-14 22:50:37 -0400
commit: 9c9213239263e8e8de2f154068cc3ad44e0c2100 (patch)
tree: a9ee2f722e4dc5705ae9f90f6fb3b67a278c5fd9 /python/pkg/cdec/sa
parent: 0823824b5fa1504b6b2c48328aa8fc8468017cba (diff)
[cdec.sa] Explicit feature names in grammar extractor output
+ sparse features in extractor + hg.intersect(string) + basestring = str|unicode
Diffstat (limited to 'python/pkg/cdec/sa')
-rw-r--r--  python/pkg/cdec/sa/__init__.py   2
-rw-r--r--  python/pkg/cdec/sa/extractor.py  8
-rw-r--r--  python/pkg/cdec/sa/features.py   8
3 files changed, 9 insertions, 9 deletions
diff --git a/python/pkg/cdec/sa/__init__.py b/python/pkg/cdec/sa/__init__.py
index fd4a4148..ab8be809 100644
--- a/python/pkg/cdec/sa/__init__.py
+++ b/python/pkg/cdec/sa/__init__.py
@@ -1,4 +1,4 @@
from cdec.sa._sa import sym_fromstring,\
SuffixArray, DataArray, LCP, Precomputation, Alignment, BiLex,\
- HieroCachingRuleFactory, Sampler
+ HieroCachingRuleFactory, Sampler, Scorer
from cdec.sa.extractor import GrammarExtractor
diff --git a/python/pkg/cdec/sa/extractor.py b/python/pkg/cdec/sa/extractor.py
index bb912e16..90cc4c51 100644
--- a/python/pkg/cdec/sa/extractor.py
+++ b/python/pkg/cdec/sa/extractor.py
@@ -57,8 +57,8 @@ class GrammarExtractor:
# lexical weighting tables
tt = cdec.sa.BiLex(from_binary=config['lex_file'])
- self.models = (EgivenFCoherent, SampleCountF, CountEF,
- MaxLexFgivenE(tt), MaxLexEgivenF(tt), IsSingletonF, IsSingletonFE)
+ scorer = cdec.sa.Scorer(EgivenFCoherent, SampleCountF, CountEF,
+ MaxLexFgivenE(tt), MaxLexEgivenF(tt), IsSingletonF, IsSingletonFE)
fsarray = cdec.sa.SuffixArray(from_binary=config['f_sa_file'])
edarray = cdec.sa.DataArray(from_binary=config['e_file'])
@@ -67,7 +67,7 @@ class GrammarExtractor:
# -1 = don't sample, use all data (VERY SLOW!)
sampler = cdec.sa.Sampler(300, fsarray)
- self.factory.configure(fsarray, edarray, sampler)
+ self.factory.configure(fsarray, edarray, sampler, scorer)
def grammar(self, sentence):
if isinstance(sentence, unicode):
@@ -75,4 +75,4 @@ class GrammarExtractor:
cnet = chain(('<s>',), sentence.split(), ('</s>',))
cnet = (cdec.sa.sym_fromstring(word, terminal=True) for word in cnet)
cnet = tuple(((word, None, 1), ) for word in cnet)
- return self.factory.input(cnet, self.models)
+ return self.factory.input(cnet)
diff --git a/python/pkg/cdec/sa/features.py b/python/pkg/cdec/sa/features.py
index 325b9e13..8fd370cc 100644
--- a/python/pkg/cdec/sa/features.py
+++ b/python/pkg/cdec/sa/features.py
@@ -20,7 +20,7 @@ def CoherenceProb(fphrase, ephrase, paircount, fcount, fsample_count):
return -math.log10(fcount/fsample_count)
def MaxLexEgivenF(ttable):
- def feature(fphrase, ephrase, paircount, fcount, fsample_count):
+ def MaxLexEgivenF(fphrase, ephrase, paircount, fcount, fsample_count):
fwords = fphrase.words
fwords.append('NULL')
def score():
@@ -28,10 +28,10 @@ def MaxLexEgivenF(ttable):
maxScore = max(ttable.get_score(f, e, 0) for f in fwords)
yield -math.log10(maxScore) if maxScore > 0 else MAXSCORE
return sum(score())
- return feature
+ return MaxLexEgivenF
def MaxLexFgivenE(ttable):
- def feature(fphrase, ephrase, paircount, fcount, fsample_count):
+ def MaxLexFgivenE(fphrase, ephrase, paircount, fcount, fsample_count):
ewords = ephrase.words
ewords.append('NULL')
def score():
@@ -39,7 +39,7 @@ def MaxLexFgivenE(ttable):
maxScore = max(ttable.get_score(f, e, 1) for e in ewords)
yield -math.log10(maxScore) if maxScore > 0 else MAXSCORE
return sum(score())
- return feature
+ return MaxLexFgivenE
def IsSingletonF(fphrase, ephrase, paircount, fcount, fsample_count):
return (fcount == 1)