summary | refs | log | tree | commit | diff
path: root/python/pkg/cdec/sa
diff options
context:
space:
mode:
author: Chris Dyer <cdyer@allegro.clab.cs.cmu.edu> 2013-04-23 19:35:18 -0400
committer: Chris Dyer <cdyer@allegro.clab.cs.cmu.edu> 2013-04-23 19:35:18 -0400
commit: c164dc0ed8a32e4095ba1b36495e0f743b8cc1ea (patch)
tree: 78b81e4c63adfa67adb7b8f80c3e6be87b4a2b2a /python/pkg/cdec/sa
parent: 0e46089cafa4e8e2f060e370d7afaceeda6b90a9 (diff)
parent: d467e14b28085809c31431be0478eb3d9322fe96 (diff)
merge paul's extractor code
Diffstat (limited to 'python/pkg/cdec/sa')
-rw-r--r-- python/pkg/cdec/sa/compile.py 21
-rw-r--r-- python/pkg/cdec/sa/extract.py 7
-rw-r--r-- python/pkg/cdec/sa/features.py 4
3 files changed, 28 insertions, 4 deletions
diff --git a/python/pkg/cdec/sa/compile.py b/python/pkg/cdec/sa/compile.py
index ce249c0f..d4cd8387 100644
--- a/python/pkg/cdec/sa/compile.py
+++ b/python/pkg/cdec/sa/compile.py
@@ -4,6 +4,7 @@ import os
import logging
import cdec.configobj
import cdec.sa
+from cdec.sa._sa import monitor_cpu
import sys
MAX_PHRASE_LENGTH = 4
@@ -21,6 +22,7 @@ def precompute(f_sa, max_len, max_nt, max_size, min_gap, rank1, rank2, tight_phr
return precomp
def main():
+ preprocess_start_time = monitor_cpu()
sys.setrecursionlimit(sys.getrecursionlimit() * 100)
logging.basicConfig(level=logging.INFO)
@@ -73,31 +75,46 @@ def main():
a_bin = os.path.join(args.output, 'a.bin')
lex_bin = os.path.join(args.output, 'lex.bin')
+ start_time = monitor_cpu()
logger.info('Compiling source suffix array')
if args.bitext:
f_sa = cdec.sa.SuffixArray(from_text=args.bitext, side='source')
else:
f_sa = cdec.sa.SuffixArray(from_text=args.source)
f_sa.write_binary(f_sa_bin)
+ stop_time = monitor_cpu()
+ logger.info('Compiling source suffix array took %f seconds', stop_time - start_time)
+ start_time = monitor_cpu()
logger.info('Compiling target data array')
if args.bitext:
e = cdec.sa.DataArray(from_text=args.bitext, side='target')
else:
e = cdec.sa.DataArray(from_text=args.target)
e.write_binary(e_bin)
+ stop_time = monitor_cpu()
+ logger.info('Compiling target data array took %f seconds', stop_time - start_time)
+ start_time = monitor_cpu()
logger.info('Precomputing frequent phrases')
precompute(f_sa, *params).write_binary(precomp_bin)
+ stop_time = monitor_cpu()
+ logger.info('Compiling precomputations took %f seconds', stop_time - start_time)
+ start_time = monitor_cpu()
logger.info('Compiling alignment')
a = cdec.sa.Alignment(from_text=args.alignment)
a.write_binary(a_bin)
+ stop_time = monitor_cpu()
+ logger.info('Compiling alignment took %f seconds', stop_time - start_time)
+ start_time = monitor_cpu()
logger.info('Compiling bilexical dictionary')
lex = cdec.sa.BiLex(from_data=True, alignment=a, earray=e, fsarray=f_sa)
lex.write_binary(lex_bin)
-
+ stop_time = monitor_cpu()
+ logger.info('Compiling bilexical dictionary took %f seconds', stop_time - start_time)
+
# Write configuration
config = cdec.configobj.ConfigObj(args.config, unrepr=True)
config['f_sa_file'] = os.path.abspath(f_sa_bin)
@@ -108,6 +125,8 @@ def main():
for name, value in zip(param_names, params):
config[name] = value
config.write()
+ preprocess_stop_time = monitor_cpu()
+ logger.info('Overall preprocessing step took %f seconds', preprocess_stop_time - preprocess_start_time)
if __name__ == '__main__':
main()
diff --git a/python/pkg/cdec/sa/extract.py b/python/pkg/cdec/sa/extract.py
index bf39d080..b6502c52 100644
--- a/python/pkg/cdec/sa/extract.py
+++ b/python/pkg/cdec/sa/extract.py
@@ -8,6 +8,7 @@ import logging
import signal
import multiprocessing as mp
import cdec.sa
+from cdec.sa._sa import monitor_cpu
extractor, prefix = None, None
online, compress = False, False
@@ -88,9 +89,10 @@ def main():
sys.stderr.write('Error: feature definition file <{}>'
' should be a python module\n'.format(featdef))
sys.exit(1)
-
+
online = args.online
+ start_time = monitor_cpu()
if args.jobs > 1:
logging.info('Starting %d workers; chunk size: %d', args.jobs, args.chunksize)
pool = mp.Pool(args.jobs, make_extractor, (args,))
@@ -104,5 +106,8 @@ def main():
for output in map(extract, enumerate(sys.stdin)):
print(output)
+ stop_time = monitor_cpu()
+ logging.info("Overall extraction step took %f seconds", stop_time - start_time)
+
if __name__ == '__main__':
main()
diff --git a/python/pkg/cdec/sa/features.py b/python/pkg/cdec/sa/features.py
index 46412cd5..c8fc1cca 100644
--- a/python/pkg/cdec/sa/features.py
+++ b/python/pkg/cdec/sa/features.py
@@ -105,7 +105,7 @@ def IsSingletonF(ctx):
count = ctx.fcount
else:
count = ctx.fcount + ctx.online.fcount
- return (count == 1)
+ return math.fabs(count - 1) < 1e-6
def IsSingletonFE(ctx):
if not ctx.online:
@@ -139,4 +139,4 @@ def IsSupportedOnline(ctx): # Occurs in online data?
if ctx.online:
return (ctx.online.paircount > 0.01)
else:
- return False
\ No newline at end of file
+ return False