Add a parameter to enable loose phrase extraction

Tight extraction is still the default. To enable loose extraction, pass --loose when compiling the corpus, or set tight_phrases = False in the config.
author: Victor Chahuneau <vchahune@cs.cmu.edu> 2012-12-13 00:19:45 -0500
committer: Victor Chahuneau <vchahune@cs.cmu.edu> 2012-12-13 00:19:45 -0500
commit: 85fb3e7fa679101b30c6d1d5e3347b019a6c73d2 (patch)
tree: 1e536e44b1095641b8591a9f112baf42f7255029 /python/pkg/cdec/sa
parent: c24567c8d5ef343ba18dc14b100c9915a35d6e44 (diff)
2 files changed, 9 insertions, 5 deletions
diff --git a/python/pkg/cdec/sa/compile.py b/python/pkg/cdec/sa/compile.py
index 393c72a4..c0402761 100644
--- a/python/pkg/cdec/sa/compile.py
+++ b/python/pkg/cdec/sa/compile.py
@@ -35,6 +35,8 @@ def main():
                         help='Number of pre-computed frequent patterns')
     parser.add_argument('--rank2', '-r2', type=int, default=10,
                         help='Number of pre-computed super-frequent patterns)')
+    parser.add_argument('--loose', action='store_true',
+                        help='Enable loose phrase extraction (default: tight)')
     parser.add_argument('-c', '--config', default='/dev/stdout',
                         help='Output configuration')
     parser.add_argument('-f', '--source',
@@ -53,8 +55,10 @@ def main():
         parser.error('a parallel corpus is required\n'
         '\tuse -f (source) with -e (target) or -b (bitext)')
 
-    param_names = ("max_len", "max_nt", "max_size", "min_gap", "rank1", "rank2")
-    params = (args.maxlen, args.maxnt, args.maxsize, args.mingap, args.rank1, args.rank2)
+    param_names = ('max_len', 'max_nt', 'max_size', 'min_gap',
+            'rank1', 'rank2', 'tight_phrases')
+    params = (args.maxlen, args.maxnt, args.maxsize, args.mingap,
+            args.rank1, args.rank2, not args.loose)
 
     if not os.path.exists(args.output):
         os.mkdir(args.output)
diff --git a/python/pkg/cdec/sa/extractor.py b/python/pkg/cdec/sa/extractor.py
index a5ce8a68..e09f79ea 100644
--- a/python/pkg/cdec/sa/extractor.py
+++ b/python/pkg/cdec/sa/extractor.py
@@ -10,7 +10,7 @@ MAX_INITIAL_SIZE = 15
 
 class GrammarExtractor:
     def __init__(self, config, features=None):
-        if isinstance(config, str) or isinstance(config, unicode):
+        if isinstance(config, basestring):
             if not os.path.exists(config):
                 raise IOError('cannot read configuration from {0}'.format(config))
             config = cdec.configobj.ConfigObj(config, unrepr=True)
@@ -50,8 +50,8 @@ class GrammarExtractor:
                 train_max_initial_size=config['max_size'],
                 # minimum span of an RHS nonterminal in a rule extracted from TRAINING DATA
                 train_min_gap_size=config['min_gap'],
-                # True if phrases should be tight, False otherwise (better but slower)
-                tight_phrases=True,
+                # False if phrases should be loose (better but slower), True otherwise
+                tight_phrases=config.get('tight_phrases', True),
                 )
 
         # lexical weighting tables
author	Victor Chahuneau <vchahune@cs.cmu.edu>	2012-12-13 00:19:45 -0500
committer	Victor Chahuneau <vchahune@cs.cmu.edu>	2012-12-13 00:19:45 -0500
commit	85fb3e7fa679101b30c6d1d5e3347b019a6c73d2 (patch)
tree	1e536e44b1095641b8591a9f112baf42f7255029 /python/pkg/cdec/sa
parent	c24567c8d5ef343ba18dc14b100c9915a35d6e44 (diff)