summaryrefslogtreecommitdiff
path: root/python/pkg/cdec/sa/extractor.py
diff options
context:
space:
mode:
Diffstat (limited to 'python/pkg/cdec/sa/extractor.py')
-rw-r--r--python/pkg/cdec/sa/extractor.py14
1 files changed, 10 insertions, 4 deletions
diff --git a/python/pkg/cdec/sa/extractor.py b/python/pkg/cdec/sa/extractor.py
index e09f79ea..0cf5f6b3 100644
--- a/python/pkg/cdec/sa/extractor.py
+++ b/python/pkg/cdec/sa/extractor.py
@@ -1,10 +1,13 @@
from itertools import chain
import os
+import logging
import cdec.configobj
from cdec.sa.features import EgivenFCoherent, SampleCountF, CountEF,\
MaxLexEgivenF, MaxLexFgivenE, IsSingletonF, IsSingletonFE
import cdec.sa
+logger = logging.getLogger('cdec.sa')
+
# maximum span of a grammar rule in TEST DATA
MAX_INITIAL_SIZE = 15
@@ -14,7 +17,10 @@ class GrammarExtractor:
if not os.path.exists(config):
raise IOError('cannot read configuration from {0}'.format(config))
config = cdec.configobj.ConfigObj(config, unrepr=True)
- alignment = cdec.sa.Alignment(from_binary=config['a_file'])
+ mmaped = config.get('memory_map', False)
+ if mmaped:
+ logger.info('Memory mapping parallel data')
+ alignment = cdec.sa.Alignment(from_binary=config['a_file'], mmaped=mmaped)
self.factory = cdec.sa.HieroCachingRuleFactory(
# compiled alignment object (REQUIRED)
alignment,
@@ -55,7 +61,7 @@ class GrammarExtractor:
)
# lexical weighting tables
- tt = cdec.sa.BiLex(from_binary=config['lex_file'])
+ tt = cdec.sa.BiLex(from_binary=config['lex_file'], mmaped=mmaped)
# TODO: use @cdec.sa.features decorator for standard features too
# + add a mask to disable features
@@ -63,8 +69,8 @@ class GrammarExtractor:
MaxLexFgivenE(tt), MaxLexEgivenF(tt), IsSingletonF, IsSingletonFE,
*cdec.sa._SA_FEATURES)
- fsarray = cdec.sa.SuffixArray(from_binary=config['f_sa_file'])
- edarray = cdec.sa.DataArray(from_binary=config['e_file'])
+ fsarray = cdec.sa.SuffixArray(from_binary=config['f_sa_file'], mmaped=mmaped)
+ edarray = cdec.sa.DataArray(from_binary=config['e_file'], mmaped=mmaped)
# lower=faster, higher=better; improvements level off above 200-300 range,
# -1 = don't sample, use all data (VERY SLOW!)