diff options
Diffstat (limited to 'python/pkg/cdec/sa/extractor.py')
-rw-r--r-- | python/pkg/cdec/sa/extractor.py | 13 |
1 files changed, 9 insertions, 4 deletions
diff --git a/python/pkg/cdec/sa/extractor.py b/python/pkg/cdec/sa/extractor.py
index 89e35bf8..a5ce8a68 100644
--- a/python/pkg/cdec/sa/extractor.py
+++ b/python/pkg/cdec/sa/extractor.py
@@ -57,6 +57,8 @@ class GrammarExtractor:
         # lexical weighting tables
         tt = cdec.sa.BiLex(from_binary=config['lex_file'])
+        # TODO: use @cdec.sa.features decorator for standard features too
+        # + add a mask to disable features
         scorer = cdec.sa.Scorer(EgivenFCoherent, SampleCountF, CountEF, MaxLexFgivenE(tt), MaxLexEgivenF(tt), IsSingletonF, IsSingletonFE, *cdec.sa._SA_FEATURES)
@@ -69,11 +71,14 @@ class GrammarExtractor:
         sampler = cdec.sa.Sampler(300, fsarray)
         self.factory.configure(fsarray, edarray, sampler, scorer)
+        # Initialize feature definitions with configuration
+        for fn in cdec.sa._SA_CONFIGURE:
+            fn(config)
     def grammar(self, sentence):
         if isinstance(sentence, unicode):
             sentence = sentence.encode('utf8')
-        cnet = chain(('<s>',), sentence.split(), ('</s>',))
-        cnet = (cdec.sa.sym_fromstring(word, terminal=True) for word in cnet)
-        cnet = tuple(((word, None, 1), ) for word in cnet)
-        return self.factory.input(cnet)
+        words = tuple(chain(('<s>',), sentence.split(), ('</s>',)))
+        meta = cdec.sa.annotate(words)
+        cnet = cdec.sa.make_lattice(words)
+        return self.factory.input(cnet, meta)