summary refs log tree commit diff
path: root/python/pkg/cdec/sa/compile.py
diff options
context:
space:
mode:
author Avneesh Saluja <asaluja@gmail.com> 2013-03-28 18:28:16 -0700
committer Avneesh Saluja <asaluja@gmail.com> 2013-03-28 18:28:16 -0700
commit 3d8d656fa7911524e0e6885647173474524e0784 (patch)
tree 81b1ee2fcb67980376d03f0aa48e42e53abff222 /python/pkg/cdec/sa/compile.py
parent be7f57fdd484e063775d7abf083b9fa4c403b610 (diff)
parent 96fedabebafe7a38a6d5928be8fff767e411d705 (diff)
fixed conflicts
Diffstat (limited to 'python/pkg/cdec/sa/compile.py')
-rw-r--r-- python/pkg/cdec/sa/compile.py 13
1 file changed, 10 insertions, 3 deletions
diff --git a/python/pkg/cdec/sa/compile.py b/python/pkg/cdec/sa/compile.py
index 393c72a4..ce249c0f 100644
--- a/python/pkg/cdec/sa/compile.py
+++ b/python/pkg/cdec/sa/compile.py
@@ -4,9 +4,10 @@ import os
import logging
import cdec.configobj
import cdec.sa
+import sys
MAX_PHRASE_LENGTH = 4
-def precompute(f_sa, max_len, max_nt, max_size, min_gap, rank1, rank2):
+def precompute(f_sa, max_len, max_nt, max_size, min_gap, rank1, rank2, tight_phrases):
lcp = cdec.sa.LCP(f_sa)
stats = sorted(lcp.compute_stats(MAX_PHRASE_LENGTH), reverse=True)
precomp = cdec.sa.Precomputation(from_stats=stats,
@@ -20,6 +21,8 @@ def precompute(f_sa, max_len, max_nt, max_size, min_gap, rank1, rank2):
return precomp
def main():
+ sys.setrecursionlimit(sys.getrecursionlimit() * 100)
+
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger('cdec.sa.compile')
parser = argparse.ArgumentParser(description='Compile a corpus into a suffix array.')
@@ -35,6 +38,8 @@ def main():
help='Number of pre-computed frequent patterns')
parser.add_argument('--rank2', '-r2', type=int, default=10,
help='Number of pre-computed super-frequent patterns)')
+ parser.add_argument('--loose', action='store_true',
+ help='Enable loose phrase extraction (default: tight)')
parser.add_argument('-c', '--config', default='/dev/stdout',
help='Output configuration')
parser.add_argument('-f', '--source',
@@ -53,8 +58,10 @@ def main():
parser.error('a parallel corpus is required\n'
'\tuse -f (source) with -e (target) or -b (bitext)')
- param_names = ("max_len", "max_nt", "max_size", "min_gap", "rank1", "rank2")
- params = (args.maxlen, args.maxnt, args.maxsize, args.mingap, args.rank1, args.rank2)
+ param_names = ('max_len', 'max_nt', 'max_size', 'min_gap',
+ 'rank1', 'rank2', 'tight_phrases')
+ params = (args.maxlen, args.maxnt, args.maxsize, args.mingap,
+ args.rank1, args.rank2, not args.loose)
if not os.path.exists(args.output):
os.mkdir(args.output)