From 17bef03f04cf4decc41d477ec015886663120477 Mon Sep 17 00:00:00 2001 From: Victor Chahuneau Date: Thu, 13 Dec 2012 00:19:45 -0500 Subject: Enable loose phrase extraction parameter (default is still tight) use --loose when compiling corpus or tight_phrases = False in config --- python/pkg/cdec/sa/compile.py | 8 ++++++-- python/pkg/cdec/sa/extractor.py | 6 +++--- 2 files changed, 9 insertions(+), 5 deletions(-) (limited to 'python/pkg/cdec/sa') diff --git a/python/pkg/cdec/sa/compile.py b/python/pkg/cdec/sa/compile.py index 393c72a4..c0402761 100644 --- a/python/pkg/cdec/sa/compile.py +++ b/python/pkg/cdec/sa/compile.py @@ -35,6 +35,8 @@ def main(): help='Number of pre-computed frequent patterns') parser.add_argument('--rank2', '-r2', type=int, default=10, help='Number of pre-computed super-frequent patterns)') + parser.add_argument('--loose', action='store_true', + help='Enable loose phrase extraction (default: tight)') parser.add_argument('-c', '--config', default='/dev/stdout', help='Output configuration') parser.add_argument('-f', '--source', @@ -53,8 +55,10 @@ def main(): parser.error('a parallel corpus is required\n' '\tuse -f (source) with -e (target) or -b (bitext)') - param_names = ("max_len", "max_nt", "max_size", "min_gap", "rank1", "rank2") - params = (args.maxlen, args.maxnt, args.maxsize, args.mingap, args.rank1, args.rank2) + param_names = ('max_len', 'max_nt', 'max_size', 'min_gap', + 'rank1', 'rank2', 'tight_phrases') + params = (args.maxlen, args.maxnt, args.maxsize, args.mingap, + args.rank1, args.rank2, not args.loose) if not os.path.exists(args.output): os.mkdir(args.output) diff --git a/python/pkg/cdec/sa/extractor.py b/python/pkg/cdec/sa/extractor.py index a5ce8a68..e09f79ea 100644 --- a/python/pkg/cdec/sa/extractor.py +++ b/python/pkg/cdec/sa/extractor.py @@ -10,7 +10,7 @@ MAX_INITIAL_SIZE = 15 class GrammarExtractor: def __init__(self, config, features=None): - if isinstance(config, str) or isinstance(config, unicode): + if isinstance(config, basestring): if not os.path.exists(config): raise IOError('cannot read configuration from {0}'.format(config)) config = cdec.configobj.ConfigObj(config, unrepr=True) @@ -50,8 +50,8 @@ class GrammarExtractor: train_max_initial_size=config['max_size'], # minimum span of an RHS nonterminal in a rule extracted from TRAINING DATA train_min_gap_size=config['min_gap'], - # True if phrases should be tight, False otherwise (better but slower) - tight_phrases=True, + # False if phrases should be loose (better but slower), True otherwise + tight_phrases=config.get('tight_phrases', True), ) # lexical weighting tables -- cgit v1.2.3