summaryrefslogtreecommitdiff
path: root/python/pkg/cdec/sa/extractor.py
diff options
context:
space:
mode:
authorPatrick Simianer <simianer@cl.uni-heidelberg.de>2012-11-05 15:29:46 +0100
committerPatrick Simianer <simianer@cl.uni-heidelberg.de>2012-11-05 15:29:46 +0100
commit6f29f345dc06c1a1033475eac1d1340781d1d603 (patch)
tree6fa4cdd7aefd7d54c9585c2c6274db61bb8b159a /python/pkg/cdec/sa/extractor.py
parentb510da2e562c695c90d565eb295c749569c59be8 (diff)
parentc615c37501fa8576584a510a9d2bfe2fdd5bace7 (diff)
merge upstream/master
Diffstat (limited to 'python/pkg/cdec/sa/extractor.py')
-rw-r--r--python/pkg/cdec/sa/extractor.py22
1 files changed, 14 insertions, 8 deletions
diff --git a/python/pkg/cdec/sa/extractor.py b/python/pkg/cdec/sa/extractor.py
index bb912e16..a5ce8a68 100644
--- a/python/pkg/cdec/sa/extractor.py
+++ b/python/pkg/cdec/sa/extractor.py
@@ -9,7 +9,7 @@ import cdec.sa
MAX_INITIAL_SIZE = 15
class GrammarExtractor:
- def __init__(self, config):
+ def __init__(self, config, features=None):
if isinstance(config, str) or isinstance(config, unicode):
if not os.path.exists(config):
raise IOError('cannot read configuration from {0}'.format(config))
@@ -57,8 +57,11 @@ class GrammarExtractor:
# lexical weighting tables
tt = cdec.sa.BiLex(from_binary=config['lex_file'])
- self.models = (EgivenFCoherent, SampleCountF, CountEF,
- MaxLexFgivenE(tt), MaxLexEgivenF(tt), IsSingletonF, IsSingletonFE)
+ # TODO: use @cdec.sa.features decorator for standard features too
+ # + add a mask to disable features
+ scorer = cdec.sa.Scorer(EgivenFCoherent, SampleCountF, CountEF,
+ MaxLexFgivenE(tt), MaxLexEgivenF(tt), IsSingletonF, IsSingletonFE,
+ *cdec.sa._SA_FEATURES)
fsarray = cdec.sa.SuffixArray(from_binary=config['f_sa_file'])
edarray = cdec.sa.DataArray(from_binary=config['e_file'])
@@ -67,12 +70,15 @@ class GrammarExtractor:
# -1 = don't sample, use all data (VERY SLOW!)
sampler = cdec.sa.Sampler(300, fsarray)
- self.factory.configure(fsarray, edarray, sampler)
+ self.factory.configure(fsarray, edarray, sampler, scorer)
+ # Initialize feature definitions with configuration
+ for fn in cdec.sa._SA_CONFIGURE:
+ fn(config)
def grammar(self, sentence):
if isinstance(sentence, unicode):
sentence = sentence.encode('utf8')
- cnet = chain(('<s>',), sentence.split(), ('</s>',))
- cnet = (cdec.sa.sym_fromstring(word, terminal=True) for word in cnet)
- cnet = tuple(((word, None, 1), ) for word in cnet)
- return self.factory.input(cnet, self.models)
+ words = tuple(chain(('<s>',), sentence.split(), ('</s>',)))
+ meta = cdec.sa.annotate(words)
+ cnet = cdec.sa.make_lattice(words)
+ return self.factory.input(cnet, meta)