diff options
Diffstat (limited to 'python/cdec/sa/extractor.py')
-rw-r--r-- | python/cdec/sa/extractor.py | 33 |
1 files changed, 23 insertions, 10 deletions
diff --git a/python/cdec/sa/extractor.py b/python/cdec/sa/extractor.py index 244f84be..777f5afd 100644 --- a/python/cdec/sa/extractor.py +++ b/python/cdec/sa/extractor.py @@ -1,25 +1,44 @@ from itertools import chain -import os, sys +import logging +import os +import sys import cdec.configobj from cdec.sa._sa import gzip_or_text from cdec.sa.features import EgivenFCoherent, SampleCountF, CountEF,\ MaxLexEgivenF, MaxLexFgivenE, IsSingletonF, IsSingletonFE,\ - IsSupportedOnline, CountExceptLM, CountExceptLex + IsSupportedOnline import cdec.sa # maximum span of a grammar rule in TEST DATA MAX_INITIAL_SIZE = 15 class GrammarExtractor: - def __init__(self, config, online=False, vocab=None, features=None): + def __init__(self, config, online=False, features=None): + + logging.basicConfig(level=logging.INFO) + logger = logging.getLogger('cdec.sa') + if isinstance(config, basestring): if not os.path.exists(config): raise IOError('cannot read configuration from {0}'.format(config)) config = cdec.configobj.ConfigObj(config, unrepr=True) + + logger.info('Loading alignment...') alignment = cdec.sa.Alignment(from_binary=config['a_file']) + + # lexical weighting tables + if not online: + logger.info('Loading bilexical dictionary...') + tt = cdec.sa.BiLex(from_binary=config['lex_file']) + else: + logger.info('Loading online bilexical dictionary...') + tt = cdec.sa.online.Bilex(config['bilex_file']) + self.factory = cdec.sa.HieroCachingRuleFactory( # compiled alignment object (REQUIRED) alignment, + # bilexical dictionary if online + bilex=tt if online else None, # name of generic nonterminal used by Hiero category="[X]", # maximum number of contiguous chunks of terminal symbols in RHS of a rule @@ -56,17 +75,11 @@ class GrammarExtractor: tight_phrases=config.get('tight_phrases', True), ) - # lexical weighting tables - tt = cdec.sa.BiLex(from_binary=config['lex_file']) - # TODO: clean this up + # Load data and add features for online grammar extraction extended_features = [] if online: extended_features.append(IsSupportedOnline) - if vocab: - vcb_set = set(line.strip() for line in gzip_or_text(vocab)) - extended_features.append(CountExceptLM(vcb_set)) - extended_features.append(CountExceptLex(tt)) # TODO: use @cdec.sa.features decorator for standard features too # + add a mask to disable features |