summary | refs | log | tree | commit | diff
path: root/python/cdec/sa/extractor.py
diff options
context:
space:
mode:
author mjdenkowski <michael.j.denkowski@gmail.com> 2014-03-11 15:46:30 -0400
committer mjdenkowski <michael.j.denkowski@gmail.com> 2014-03-11 15:46:30 -0400
commit 1197fb64e67b95ed497df4ebca5dd69e3e2db1b5 (patch)
tree dc6b6b847a3a981f50a43f23ed518143ba57cbfe /python/cdec/sa/extractor.py
parent 8c2d8217f068d8f107f95171496a25013d4e35fe (diff)
Update lexical weights in online grammar extraction
Diffstat (limited to 'python/cdec/sa/extractor.py')
-rw-r--r-- python/cdec/sa/extractor.py 27
1 files changed, 20 insertions, 7 deletions
diff --git a/python/cdec/sa/extractor.py b/python/cdec/sa/extractor.py
index c2ded1d6..777f5afd 100644
--- a/python/cdec/sa/extractor.py
+++ b/python/cdec/sa/extractor.py
@@ -1,5 +1,7 @@
from itertools import chain
-import os, sys
+import logging
+import os
+import sys
import cdec.configobj
from cdec.sa._sa import gzip_or_text
from cdec.sa.features import EgivenFCoherent, SampleCountF, CountEF,\
@@ -12,14 +14,31 @@ MAX_INITIAL_SIZE = 15
class GrammarExtractor:
def __init__(self, config, online=False, features=None):
+
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger('cdec.sa')
+
if isinstance(config, basestring):
if not os.path.exists(config):
raise IOError('cannot read configuration from {0}'.format(config))
config = cdec.configobj.ConfigObj(config, unrepr=True)
+
+ logger.info('Loading alignment...')
alignment = cdec.sa.Alignment(from_binary=config['a_file'])
+
+ # lexical weighting tables
+ if not online:
+ logger.info('Loading bilexical dictionary...')
+ tt = cdec.sa.BiLex(from_binary=config['lex_file'])
+ else:
+ logger.info('Loading online bilexical dictionary...')
+ tt = cdec.sa.online.Bilex(config['bilex_file'])
+
self.factory = cdec.sa.HieroCachingRuleFactory(
# compiled alignment object (REQUIRED)
alignment,
+ # bilexical dictionary if online
+ bilex=tt if online else None,
# name of generic nonterminal used by Hiero
category="[X]",
# maximum number of contiguous chunks of terminal symbols in RHS of a rule
@@ -56,12 +75,6 @@ class GrammarExtractor:
tight_phrases=config.get('tight_phrases', True),
)
- # lexical weighting tables
- if not online:
- tt = cdec.sa.BiLex(from_binary=config['lex_file'])
- else:
- tt = cdec.sa.online.Bilex(config['bilex_file'])
-
# TODO: clean this up
# Load data and add features for online grammar extraction
extended_features = []