diff options
author | Chris Dyer <redpony@gmail.com> | 2014-03-12 02:30:32 -0400 |
---|---|---|
committer | Chris Dyer <redpony@gmail.com> | 2014-03-12 02:30:32 -0400 |
commit | bcff95cd2879fa20a0bfd00e64a2555f6eab1c2b (patch) | |
tree | 3f833b5e0efc819a5b923353a9045485a98c4910 /python/cdec/sa/extractor.py | |
parent | 10a668822715cee024a7e7391c62caa8e078e840 (diff) | |
parent | efbc43b40c8c3204245814b65a7be280498281bd (diff) |
Merge branch 'master' of https://github.com/redpony/cdec
Diffstat (limited to 'python/cdec/sa/extractor.py')
-rw-r--r-- | python/cdec/sa/extractor.py | 27 |
1 files changed, 20 insertions, 7 deletions
```diff
diff --git a/python/cdec/sa/extractor.py b/python/cdec/sa/extractor.py
index c2ded1d6..777f5afd 100644
--- a/python/cdec/sa/extractor.py
+++ b/python/cdec/sa/extractor.py
@@ -1,5 +1,7 @@
 from itertools import chain
-import os, sys
+import logging
+import os
+import sys
 import cdec.configobj
 from cdec.sa._sa import gzip_or_text
 from cdec.sa.features import EgivenFCoherent, SampleCountF, CountEF,\
@@ -12,14 +14,31 @@ MAX_INITIAL_SIZE = 15
 
 class GrammarExtractor:
     def __init__(self, config, online=False, features=None):
+
+        logging.basicConfig(level=logging.INFO)
+        logger = logging.getLogger('cdec.sa')
+
         if isinstance(config, basestring):
             if not os.path.exists(config):
                 raise IOError('cannot read configuration from {0}'.format(config))
             config = cdec.configobj.ConfigObj(config, unrepr=True)
+
+        logger.info('Loading alignment...')
         alignment = cdec.sa.Alignment(from_binary=config['a_file'])
+
+        # lexical weighting tables
+        if not online:
+            logger.info('Loading bilexical dictionary...')
+            tt = cdec.sa.BiLex(from_binary=config['lex_file'])
+        else:
+            logger.info('Loading online bilexical dictionary...')
+            tt = cdec.sa.online.Bilex(config['bilex_file'])
+
         self.factory = cdec.sa.HieroCachingRuleFactory(
                 # compiled alignment object (REQUIRED)
                 alignment,
+                # bilexical dictionary if online
+                bilex=tt if online else None,
                 # name of generic nonterminal used by Hiero
                 category="[X]",
                 # maximum number of contiguous chunks of terminal symbols in RHS of a rule
@@ -56,12 +75,6 @@ class GrammarExtractor:
                 tight_phrases=config.get('tight_phrases', True),
                 )
 
-        # lexical weighting tables
-        if not online:
-            tt = cdec.sa.BiLex(from_binary=config['lex_file'])
-        else:
-            tt = cdec.sa.online.Bilex(config['bilex_file'])
-
         # TODO: clean this up
         # Load data and add features for online grammar extraction
         extended_features = []
```