Memory mapping for IntList/FloatList

+ vocabulary class for DataArray & BiLex
author: Victor Chahuneau <vchahune@cs.cmu.edu> 2012-12-23 23:07:31 +0100
committer: Victor Chahuneau <vchahune@cs.cmu.edu> 2012-12-23 23:07:31 +0100
commit: e66ce4f37f98af2e23c800c7dc6fd8fc83d07353 (patch)
tree: 9972e8ed1adeb56ede19b2c6020e92a5116860e4 /python/pkg/cdec/sa
parent: 597d89c11db53e91bc011eab70fd613bbe6453e8 (diff)
1 file changed, 10 insertions, 4 deletions
diff --git a/python/pkg/cdec/sa/extractor.py b/python/pkg/cdec/sa/extractor.py
index e09f79ea..0cf5f6b3 100644
--- a/python/pkg/cdec/sa/extractor.py
+++ b/python/pkg/cdec/sa/extractor.py
@@ -1,10 +1,13 @@
 from itertools import chain
 import os
+import logging
 import cdec.configobj
 from cdec.sa.features import EgivenFCoherent, SampleCountF, CountEF,\
         MaxLexEgivenF, MaxLexFgivenE, IsSingletonF, IsSingletonFE
 import cdec.sa
 
+logger = logging.getLogger('cdec.sa')
+
 # maximum span of a grammar rule in TEST DATA
 MAX_INITIAL_SIZE = 15
 
@@ -14,7 +17,10 @@ class GrammarExtractor:
             if not os.path.exists(config):
                 raise IOError('cannot read configuration from {0}'.format(config))
             config = cdec.configobj.ConfigObj(config, unrepr=True)
-        alignment = cdec.sa.Alignment(from_binary=config['a_file'])
+        mmaped = config.get('memory_map', False)
+        if mmaped:
+            logger.info('Memory mapping parallel data')
+        alignment = cdec.sa.Alignment(from_binary=config['a_file'], mmaped=mmaped)
         self.factory = cdec.sa.HieroCachingRuleFactory(
                 # compiled alignment object (REQUIRED)
                 alignment,
@@ -55,7 +61,7 @@ class GrammarExtractor:
                 )
 
         # lexical weighting tables
-        tt = cdec.sa.BiLex(from_binary=config['lex_file'])
+        tt = cdec.sa.BiLex(from_binary=config['lex_file'], mmaped=mmaped)
 
         # TODO: use @cdec.sa.features decorator for standard features too
         # + add a mask to disable features
@@ -63,8 +69,8 @@ class GrammarExtractor:
             MaxLexFgivenE(tt), MaxLexEgivenF(tt), IsSingletonF, IsSingletonFE,
             *cdec.sa._SA_FEATURES)
 
-        fsarray = cdec.sa.SuffixArray(from_binary=config['f_sa_file'])
-        edarray = cdec.sa.DataArray(from_binary=config['e_file'])
+        fsarray = cdec.sa.SuffixArray(from_binary=config['f_sa_file'], mmaped=mmaped)
+        edarray = cdec.sa.DataArray(from_binary=config['e_file'], mmaped=mmaped)
 
         # lower=faster, higher=better; improvements level off above 200-300 range,
         # -1 = don't sample, use all data (VERY SLOW!)
author	Victor Chahuneau <vchahune@cs.cmu.edu>	2012-12-23 23:07:31 +0100
committer	Victor Chahuneau <vchahune@cs.cmu.edu>	2012-12-23 23:07:31 +0100
commit	e66ce4f37f98af2e23c800c7dc6fd8fc83d07353 (patch)
tree	9972e8ed1adeb56ede19b2c6020e92a5116860e4 /python/pkg/cdec/sa
parent	597d89c11db53e91bc011eab70fd613bbe6453e8 (diff)