diff options
| author | Chris Dyer <cdyer@allegro.clab.cs.cmu.edu> | 2013-04-23 19:35:18 -0400 | 
|---|---|---|
| committer | Chris Dyer <cdyer@allegro.clab.cs.cmu.edu> | 2013-04-23 19:35:18 -0400 | 
| commit | 6d347f1ce078dede3da0e1498f75e357351c6543 (patch) | |
| tree | 8e872b8747c530e741e55e25e9917c1bd8b32c5b /python/pkg/cdec/sa | |
| parent | d11b76def6899790161c47a73018146311356d8b (diff) | |
| parent | 5e9605b65202f4e5fc59843b197d88c4774f0ac8 (diff) | |
merge paul's extractor code
Diffstat (limited to 'python/pkg/cdec/sa')
| -rw-r--r-- | python/pkg/cdec/sa/compile.py | 21 | ||||
| -rw-r--r-- | python/pkg/cdec/sa/extract.py | 7 | ||||
| -rw-r--r-- | python/pkg/cdec/sa/features.py | 4 | 
3 files changed, 28 insertions, 4 deletions
| diff --git a/python/pkg/cdec/sa/compile.py b/python/pkg/cdec/sa/compile.py index ce249c0f..d4cd8387 100644 --- a/python/pkg/cdec/sa/compile.py +++ b/python/pkg/cdec/sa/compile.py @@ -4,6 +4,7 @@ import os  import logging  import cdec.configobj  import cdec.sa +from cdec.sa._sa import monitor_cpu  import sys  MAX_PHRASE_LENGTH = 4 @@ -21,6 +22,7 @@ def precompute(f_sa, max_len, max_nt, max_size, min_gap, rank1, rank2, tight_phr      return precomp  def main(): +    preprocess_start_time = monitor_cpu()      sys.setrecursionlimit(sys.getrecursionlimit() * 100)      logging.basicConfig(level=logging.INFO) @@ -73,31 +75,46 @@ def main():      a_bin = os.path.join(args.output, 'a.bin')      lex_bin = os.path.join(args.output, 'lex.bin') +    start_time = monitor_cpu()      logger.info('Compiling source suffix array')      if args.bitext:          f_sa = cdec.sa.SuffixArray(from_text=args.bitext, side='source')      else:          f_sa = cdec.sa.SuffixArray(from_text=args.source)      f_sa.write_binary(f_sa_bin) +    stop_time = monitor_cpu() +    logger.info('Compiling source suffix array took %f seconds', stop_time - start_time) +    start_time = monitor_cpu()      logger.info('Compiling target data array')      if args.bitext:          e = cdec.sa.DataArray(from_text=args.bitext, side='target')      else:          e = cdec.sa.DataArray(from_text=args.target)      e.write_binary(e_bin) +    stop_time = monitor_cpu() +    logger.info('Compiling target data array took %f seconds', stop_time - start_time) +    start_time = monitor_cpu()      logger.info('Precomputing frequent phrases')      precompute(f_sa, *params).write_binary(precomp_bin) +    stop_time = monitor_cpu() +    logger.info('Compiling precomputations took %f seconds', stop_time - start_time) +    start_time = monitor_cpu()      logger.info('Compiling alignment')      a = cdec.sa.Alignment(from_text=args.alignment)      a.write_binary(a_bin) +    stop_time = monitor_cpu() +    logger.info('Compiling alignment took %f seonds', stop_time - start_time) +    start_time = monitor_cpu()      logger.info('Compiling bilexical dictionary')      lex = cdec.sa.BiLex(from_data=True, alignment=a, earray=e, fsarray=f_sa)      lex.write_binary(lex_bin) -     +    stop_time = monitor_cpu() +    logger.info('Compiling bilexical dictionary took %f seconds', stop_time - start_time) +      # Write configuration      config = cdec.configobj.ConfigObj(args.config, unrepr=True)      config['f_sa_file'] = os.path.abspath(f_sa_bin) @@ -108,6 +125,8 @@ def main():      for name, value in zip(param_names, params):          config[name] = value      config.write() +    preprocess_stop_time = monitor_cpu() +    logger.info('Overall preprocessing step took %f seconds', preprocess_stop_time - preprocess_start_time)  if __name__ == '__main__':      main() diff --git a/python/pkg/cdec/sa/extract.py b/python/pkg/cdec/sa/extract.py index bf39d080..b6502c52 100644 --- a/python/pkg/cdec/sa/extract.py +++ b/python/pkg/cdec/sa/extract.py @@ -8,6 +8,7 @@ import logging  import signal  import multiprocessing as mp  import cdec.sa +from cdec.sa._sa import monitor_cpu  extractor, prefix = None, None  online, compress = False, False @@ -88,9 +89,10 @@ def main():              sys.stderr.write('Error: feature definition file <{}>'                      ' should be a python module\n'.format(featdef))              sys.exit(1) -     +      online = args.online +    start_time = monitor_cpu()      if args.jobs > 1:          logging.info('Starting %d workers; chunk size: %d', args.jobs, args.chunksize)          pool = mp.Pool(args.jobs, make_extractor, (args,)) @@ -104,5 +106,8 @@ def main():          for output in map(extract, enumerate(sys.stdin)):              print(output) +    stop_time = monitor_cpu() +    logging.info("Overall extraction step took %f seconds", stop_time - start_time) +  if __name__ == '__main__':      main() diff --git a/python/pkg/cdec/sa/features.py b/python/pkg/cdec/sa/features.py index 46412cd5..c8fc1cca 100644 --- a/python/pkg/cdec/sa/features.py +++ b/python/pkg/cdec/sa/features.py @@ -105,7 +105,7 @@ def IsSingletonF(ctx):          count = ctx.fcount      else:          count = ctx.fcount + ctx.online.fcount   -    return (count == 1) +    return math.fabs(count - 1) < 1e-6  def IsSingletonFE(ctx):      if not ctx.online: @@ -139,4 +139,4 @@ def IsSupportedOnline(ctx): # Occurs in online data?      if ctx.online:          return (ctx.online.paircount > 0.01)      else: -        return False
\ No newline at end of file +        return False | 
