summary | refs | log | tree | commit | diff
path: root/python/cdec/sa/extractor.py
diff options
context:
space:
mode:
author: Chris Dyer <redpony@gmail.com> 2014-03-12 02:30:32 -0400
committer: Chris Dyer <redpony@gmail.com> 2014-03-12 02:30:32 -0400
commit: bcff95cd2879fa20a0bfd00e64a2555f6eab1c2b (patch)
tree: 3f833b5e0efc819a5b923353a9045485a98c4910 /python/cdec/sa/extractor.py
parent: 10a668822715cee024a7e7391c62caa8e078e840 (diff)
parent: efbc43b40c8c3204245814b65a7be280498281bd (diff)
Merge branch 'master' of https://github.com/redpony/cdec
Diffstat (limited to 'python/cdec/sa/extractor.py')
-rw-r--r-- python/cdec/sa/extractor.py 27
1 files changed, 20 insertions, 7 deletions
diff --git a/python/cdec/sa/extractor.py b/python/cdec/sa/extractor.py
index c2ded1d6..777f5afd 100644
--- a/python/cdec/sa/extractor.py
+++ b/python/cdec/sa/extractor.py
@@ -1,5 +1,7 @@
from itertools import chain
-import os, sys
+import logging
+import os
+import sys
import cdec.configobj
from cdec.sa._sa import gzip_or_text
from cdec.sa.features import EgivenFCoherent, SampleCountF, CountEF,\
@@ -12,14 +14,31 @@ MAX_INITIAL_SIZE = 15
class GrammarExtractor:
def __init__(self, config, online=False, features=None):
+
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger('cdec.sa')
+
if isinstance(config, basestring):
if not os.path.exists(config):
raise IOError('cannot read configuration from {0}'.format(config))
config = cdec.configobj.ConfigObj(config, unrepr=True)
+
+ logger.info('Loading alignment...')
alignment = cdec.sa.Alignment(from_binary=config['a_file'])
+
+ # lexical weighting tables
+ if not online:
+ logger.info('Loading bilexical dictionary...')
+ tt = cdec.sa.BiLex(from_binary=config['lex_file'])
+ else:
+ logger.info('Loading online bilexical dictionary...')
+ tt = cdec.sa.online.Bilex(config['bilex_file'])
+
self.factory = cdec.sa.HieroCachingRuleFactory(
# compiled alignment object (REQUIRED)
alignment,
+ # bilexical dictionary if online
+ bilex=tt if online else None,
# name of generic nonterminal used by Hiero
category="[X]",
# maximum number of contiguous chunks of terminal symbols in RHS of a rule
@@ -56,12 +75,6 @@ class GrammarExtractor:
tight_phrases=config.get('tight_phrases', True),
)
- # lexical weighting tables
- if not online:
- tt = cdec.sa.BiLex(from_binary=config['lex_file'])
- else:
- tt = cdec.sa.online.Bilex(config['bilex_file'])
-
# TODO: clean this up
# Load data and add features for online grammar extraction
extended_features = []