diff options
| author | Patrick Simianer <p@simianer.de> | 2013-05-02 09:09:59 +0200 | 
|---|---|---|
| committer | Patrick Simianer <p@simianer.de> | 2013-05-02 09:09:59 +0200 | 
| commit | 9e50f0237413180fba11b500c9dce5c600e3c157 (patch) | |
| tree | 556fc31d231353c853a864afffddd43dc525549a /python/pkg | |
| parent | d18024a41cbc1b54db88d499571349a6234b6db8 (diff) | |
| parent | 14ed53426726202813a8e82d706b44266f015fe1 (diff) | |
Merge remote-tracking branch 'upstream/master'
Diffstat (limited to 'python/pkg')
| -rw-r--r-- | python/pkg/cdec/sa/compile.py | 21 | ||||
| -rw-r--r-- | python/pkg/cdec/sa/extract.py | 10 | ||||
| -rw-r--r-- | python/pkg/cdec/sa/features.py | 4 | 
3 files changed, 31 insertions, 4 deletions
| diff --git a/python/pkg/cdec/sa/compile.py b/python/pkg/cdec/sa/compile.py index ce249c0f..d4cd8387 100644 --- a/python/pkg/cdec/sa/compile.py +++ b/python/pkg/cdec/sa/compile.py @@ -4,6 +4,7 @@ import os  import logging  import cdec.configobj  import cdec.sa +from cdec.sa._sa import monitor_cpu  import sys  MAX_PHRASE_LENGTH = 4 @@ -21,6 +22,7 @@ def precompute(f_sa, max_len, max_nt, max_size, min_gap, rank1, rank2, tight_phr      return precomp  def main(): +    preprocess_start_time = monitor_cpu()      sys.setrecursionlimit(sys.getrecursionlimit() * 100)      logging.basicConfig(level=logging.INFO) @@ -73,31 +75,46 @@ def main():      a_bin = os.path.join(args.output, 'a.bin')      lex_bin = os.path.join(args.output, 'lex.bin') +    start_time = monitor_cpu()      logger.info('Compiling source suffix array')      if args.bitext:          f_sa = cdec.sa.SuffixArray(from_text=args.bitext, side='source')      else:          f_sa = cdec.sa.SuffixArray(from_text=args.source)      f_sa.write_binary(f_sa_bin) +    stop_time = monitor_cpu() +    logger.info('Compiling source suffix array took %f seconds', stop_time - start_time) +    start_time = monitor_cpu()      logger.info('Compiling target data array')      if args.bitext:          e = cdec.sa.DataArray(from_text=args.bitext, side='target')      else:          e = cdec.sa.DataArray(from_text=args.target)      e.write_binary(e_bin) +    stop_time = monitor_cpu() +    logger.info('Compiling target data array took %f seconds', stop_time - start_time) +    start_time = monitor_cpu()      logger.info('Precomputing frequent phrases')      precompute(f_sa, *params).write_binary(precomp_bin) +    stop_time = monitor_cpu() +    logger.info('Compiling precomputations took %f seconds', stop_time - start_time) +    start_time = monitor_cpu()      logger.info('Compiling alignment')      a = cdec.sa.Alignment(from_text=args.alignment)      a.write_binary(a_bin) +    stop_time = monitor_cpu() +    logger.info('Compiling alignment took %f seonds', stop_time - start_time) +    start_time = monitor_cpu()      logger.info('Compiling bilexical dictionary')      lex = cdec.sa.BiLex(from_data=True, alignment=a, earray=e, fsarray=f_sa)      lex.write_binary(lex_bin) -     +    stop_time = monitor_cpu() +    logger.info('Compiling bilexical dictionary took %f seconds', stop_time - start_time) +      # Write configuration      config = cdec.configobj.ConfigObj(args.config, unrepr=True)      config['f_sa_file'] = os.path.abspath(f_sa_bin) @@ -108,6 +125,8 @@ def main():      for name, value in zip(param_names, params):          config[name] = value      config.write() +    preprocess_stop_time = monitor_cpu() +    logger.info('Overall preprocessing step took %f seconds', preprocess_stop_time - preprocess_start_time)  if __name__ == '__main__':      main() diff --git a/python/pkg/cdec/sa/extract.py b/python/pkg/cdec/sa/extract.py index 782bed8b..b6502c52 100644 --- a/python/pkg/cdec/sa/extract.py +++ b/python/pkg/cdec/sa/extract.py @@ -8,6 +8,7 @@ import logging  import signal  import multiprocessing as mp  import cdec.sa +from cdec.sa._sa import monitor_cpu  extractor, prefix = None, None  online, compress = False, False @@ -62,6 +63,7 @@ def extract(inp):      return '<seg grammar="{}" id="{}">{}</seg>{}'.format(grammar_file, i, sentence, suffix)  def main(): +    global online      logging.basicConfig(level=logging.INFO)      parser = argparse.ArgumentParser(description='Extract grammars from a compiled corpus.')      parser.add_argument('-c', '--config', required=True, @@ -87,7 +89,10 @@ def main():              sys.stderr.write('Error: feature definition file <{}>'                      ' should be a python module\n'.format(featdef))              sys.exit(1) -     + +    online = args.online + +    start_time = monitor_cpu()      if args.jobs > 1:          logging.info('Starting %d workers; chunk size: %d', args.jobs, args.chunksize)          pool = mp.Pool(args.jobs, make_extractor, (args,)) @@ -101,5 +106,8 @@ def main():          for output in map(extract, enumerate(sys.stdin)):              print(output) +    stop_time = monitor_cpu() +    logging.info("Overall extraction step took %f seconds", stop_time - start_time) +  if __name__ == '__main__':      main() diff --git a/python/pkg/cdec/sa/features.py b/python/pkg/cdec/sa/features.py index 46412cd5..c8fc1cca 100644 --- a/python/pkg/cdec/sa/features.py +++ b/python/pkg/cdec/sa/features.py @@ -105,7 +105,7 @@ def IsSingletonF(ctx):          count = ctx.fcount      else:          count = ctx.fcount + ctx.online.fcount   -    return (count == 1) +    return math.fabs(count - 1) < 1e-6  def IsSingletonFE(ctx):      if not ctx.online: @@ -139,4 +139,4 @@ def IsSupportedOnline(ctx): # Occurs in online data?      if ctx.online:          return (ctx.online.paircount > 0.01)      else: -        return False
\ No newline at end of file +        return False | 
