diff options
author | Victor Chahuneau <vchahune@cs.cmu.edu> | 2012-12-23 23:07:31 +0100 |
---|---|---|
committer | Victor Chahuneau <vchahune@cs.cmu.edu> | 2012-12-23 23:07:31 +0100 |
commit | e66ce4f37f98af2e23c800c7dc6fd8fc83d07353 (patch) | |
tree | 9972e8ed1adeb56ede19b2c6020e92a5116860e4 /python/pkg | |
parent | 597d89c11db53e91bc011eab70fd613bbe6453e8 (diff) |
Memory mapping for IntList/FloatList
+ vocabulary class for DataArray & BiLex
Diffstat (limited to 'python/pkg')
-rw-r--r-- | python/pkg/cdec/sa/extractor.py | 14 |
1 files changed, 10 insertions, 4 deletions
```diff
diff --git a/python/pkg/cdec/sa/extractor.py b/python/pkg/cdec/sa/extractor.py
index e09f79ea..0cf5f6b3 100644
--- a/python/pkg/cdec/sa/extractor.py
+++ b/python/pkg/cdec/sa/extractor.py
@@ -1,10 +1,13 @@
 from itertools import chain
 import os
+import logging
 import cdec.configobj
 from cdec.sa.features import EgivenFCoherent, SampleCountF, CountEF,\
         MaxLexEgivenF, MaxLexFgivenE, IsSingletonF, IsSingletonFE
 import cdec.sa
 
+logger = logging.getLogger('cdec.sa')
+
 # maximum span of a grammar rule in TEST DATA
 MAX_INITIAL_SIZE = 15
 
@@ -14,7 +17,10 @@ class GrammarExtractor:
         if not os.path.exists(config):
             raise IOError('cannot read configuration from {0}'.format(config))
         config = cdec.configobj.ConfigObj(config, unrepr=True)
-        alignment = cdec.sa.Alignment(from_binary=config['a_file'])
+        mmaped = config.get('memory_map', False)
+        if mmaped:
+            logger.info('Memory mapping parallel data')
+        alignment = cdec.sa.Alignment(from_binary=config['a_file'], mmaped=mmaped)
         self.factory = cdec.sa.HieroCachingRuleFactory(
                 # compiled alignment object (REQUIRED)
                 alignment,
@@ -55,7 +61,7 @@ class GrammarExtractor:
                 )
 
         # lexical weighting tables
-        tt = cdec.sa.BiLex(from_binary=config['lex_file'])
+        tt = cdec.sa.BiLex(from_binary=config['lex_file'], mmaped=mmaped)
 
         # TODO: use @cdec.sa.features decorator for standard features too
         # + add a mask to disable features
@@ -63,8 +69,8 @@ class GrammarExtractor:
                 MaxLexFgivenE(tt), MaxLexEgivenF(tt), IsSingletonF, IsSingletonFE,
                 *cdec.sa._SA_FEATURES)
 
-        fsarray = cdec.sa.SuffixArray(from_binary=config['f_sa_file'])
-        edarray = cdec.sa.DataArray(from_binary=config['e_file'])
+        fsarray = cdec.sa.SuffixArray(from_binary=config['f_sa_file'], mmaped=mmaped)
+        edarray = cdec.sa.DataArray(from_binary=config['e_file'], mmaped=mmaped)
 
         # lower=faster, higher=better; improvements level off above 200-300 range,
         # -1 = don't sample, use all data (VERY SLOW!)
```