diff options
-rwxr-xr-x | realtime/realtime.py | 5 | ||||
-rw-r--r-- | realtime/rt/aligner.py | 8 | ||||
-rw-r--r-- | realtime/rt/decoder.py | 6 | ||||
-rw-r--r-- | realtime/rt/rt.py | 50 |
4 files changed, 40 insertions, 29 deletions
diff --git a/realtime/realtime.py b/realtime/realtime.py index be02d486..6ee785f8 100755 --- a/realtime/realtime.py +++ b/realtime/realtime.py @@ -57,6 +57,7 @@ def main(): parser = Parser(description='Real-time adaptive translation with cdec. (See README.md)') parser.add_argument('-c', '--config', required=True, help='Config directory') + parser.add_argument('-s', '--state', help='Load state file to default context (saved incremental data)') parser.add_argument('-n', '--normalize', help='Normalize text (tokenize, translate, detokenize)', action='store_true') parser.add_argument('-T', '--temp', help='Temp directory (default /tmp)', default='/tmp') parser.add_argument('-a', '--cache', help='Grammar cache size (default 5)', default='5') @@ -74,6 +75,10 @@ def main(): debug(translator, args.debug_test) return + # Load state if given + if args.state: + rtd.load_state(state) + # Read lines and commands while True: line = sys.stdin.readline() diff --git a/realtime/rt/aligner.py b/realtime/rt/aligner.py index def3fcb5..bcc1ef87 100644 --- a/realtime/rt/aligner.py +++ b/realtime/rt/aligner.py @@ -6,6 +6,8 @@ import threading import util +logger = logging.getLogger('rt.aligner') + class ForceAligner: def __init__(self, fwd_params, fwd_err, rev_params, rev_err): @@ -21,13 +23,13 @@ class ForceAligner: rev_cmd = [fast_align, '-i', '-', '-d', '-T', rev_T, '-m', rev_m, '-f', rev_params, '-r'] tools_cmd = [atools, '-i', '-', '-j', '-', '-c', 'grow-diag-final-and'] - logging.info('Executing: {}'.format(' '.join(fwd_cmd))) + logger.info('Executing: {}'.format(' '.join(fwd_cmd))) self.fwd_align = util.popen_io(fwd_cmd) - logging.info('Executing: {}'.format(' '.join(rev_cmd))) + logger.info('Executing: {}'.format(' '.join(rev_cmd))) self.rev_align = util.popen_io(rev_cmd) - logging.info('Executing: {}'.format(' '.join(tools_cmd))) + logger.info('Executing: {}'.format(' '.join(tools_cmd))) self.tools = util.popen_io(tools_cmd) # Used to guarantee thread safety diff --git a/realtime/rt/decoder.py b/realtime/rt/decoder.py index 1cee4610..e6e7489d 100644 --- a/realtime/rt/decoder.py +++ b/realtime/rt/decoder.py @@ -5,6 +5,8 @@ import threading import util +logger = logging.getLogger('rt.decoder') + class Decoder: def close(self, force=False): @@ -29,7 +31,7 @@ class CdecDecoder(Decoder): cdec_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) decoder = os.path.join(cdec_root, 'decoder', 'cdec') decoder_cmd = [decoder, '-c', config, '-w', weights] - logging.info('Executing: {}'.format(' '.join(decoder_cmd))) + logger.info('Executing: {}'.format(' '.join(decoder_cmd))) self.decoder = util.popen_io(decoder_cmd) self.lock = util.FIFOLock() @@ -40,7 +42,7 @@ class MIRADecoder(Decoder): mira = os.path.join(cdec_root, 'training', 'mira', 'kbest_cut_mira') # optimizer=2 step=0.001 best=500, k=500, uniq, stream mira_cmd = [mira, '-c', config, '-w', weights, '-o', '2', '-C', '0.001', '-b', '500', '-k', '500', '-u', '-t'] - logging.info('Executing: {}'.format(' '.join(mira_cmd))) + logger.info('Executing: {}'.format(' '.join(mira_cmd))) self.decoder = util.popen_io(mira_cmd) self.lock = util.FIFOLock() diff --git a/realtime/rt/rt.py b/realtime/rt/rt.py index 43cc43b4..db831712 100644 --- a/realtime/rt/rt.py +++ b/realtime/rt/rt.py @@ -19,6 +19,8 @@ import util # Dummy input token that is unlikely to appear in normalized data (but no fatal errors if it does) LIKELY_OOV = '(OOV)' +logger = logging.getLogger('rt') + class RealtimeDecoder: '''Do not use directly unless you know what you're doing. Use RealtimeTranslator.''' @@ -48,7 +50,7 @@ class RealtimeDecoder: self.decoder = decoder.MIRADecoder(decoder_config_file, decoder_weights) def close(self, force=False): - logging.info('Closing decoder and removing {}'.format(self.tmp)) + logger.info('Closing decoder and removing {}'.format(self.tmp)) self.decoder.close(force) self.ref_fifo.close() shutil.rmtree(self.tmp) @@ -75,7 +77,7 @@ class RealtimeTranslator: self.config = configdir # Temporary work dir self.tmp = tempfile.mkdtemp(dir=tmpdir, prefix='realtime.') - logging.info('Using temp dir {}'.format(self.tmp)) + logger.info('Using temp dir {}'.format(self.tmp)) # Normalization self.norm = norm @@ -127,10 +129,10 @@ class RealtimeTranslator: def close(self, force=False): '''Cleanup''' if force: - logging.info('Forced shutdown: stopping immediately') + logger.info('Forced shutdown: stopping immediately') for ctx_name in list(self.ctx_names): self.drop_ctx(ctx_name, force) - logging.info('Closing processes') + logger.info('Closing processes') self.aligner.close(force) if self.norm: if not force: @@ -141,7 +143,7 @@ class RealtimeTranslator: if not force: self.tokenizer_lock.release() self.detokenizer_lock.release() - logging.info('Deleting {}'.format(self.tmp)) + logger.info('Deleting {}'.format(self.tmp)) shutil.rmtree(self.tmp) def lazy_ctx(self, ctx_name): @@ -149,7 +151,7 @@ class RealtimeTranslator: NOT threadsafe, acquire ctx_name lock before calling.''' if ctx_name in self.ctx_names: return - logging.info('New context: {}'.format(ctx_name)) + logger.info('New context: {}'.format(ctx_name)) self.ctx_names.add(ctx_name) self.ctx_data[ctx_name] = [] self.grammar_files[ctx_name] = collections.deque() @@ -164,11 +166,11 @@ class RealtimeTranslator: if not force: lock.acquire() if ctx_name not in self.ctx_names: - logging.info('No context found, no action: {}'.format(ctx_name)) + logger.info('No context found, no action: {}'.format(ctx_name)) if not force: lock.release() return - logging.info('Dropping context: {}'.format(ctx_name)) + logger.info('Dropping context: {}'.format(ctx_name)) self.ctx_names.remove(ctx_name) self.ctx_data.pop(ctx_name) self.extractor.drop_ctx(ctx_name) @@ -193,7 +195,7 @@ class RealtimeTranslator: grammar_file = grammar_dict.get(sentence, None) # Cache hit if grammar_file: - logging.info('Grammar cache hit: {}'.format(grammar_file)) + logger.info('Grammar cache hit: {}'.format(grammar_file)) self.extractor_lock.release() return grammar_file # Extract and cache @@ -226,18 +228,18 @@ class RealtimeTranslator: return '' if self.norm: sentence = self.tokenize(sentence) - logging.info('Normalized input: {}'.format(sentence)) + logger.info('Normalized input: {}'.format(sentence)) grammar_file = self.grammar(sentence, ctx_name) decoder = self.decoders[ctx_name] start_time = time.time() hyp = decoder.decoder.decode(sentence, grammar_file) stop_time = time.time() - logging.info('Translation time: {} seconds'.format(stop_time - start_time)) + logger.info('Translation time: {} seconds'.format(stop_time - start_time)) # Empty reference: HPYPLM does not learn prior to next translation decoder.ref_fifo.write('\n') decoder.ref_fifo.flush() if self.norm: - logging.info('Normalized translation: {}'.format(hyp)) + logger.info('Normalized translation: {}'.format(hyp)) hyp = self.detokenize(hyp) lock.release() return hyp @@ -271,9 +273,9 @@ class RealtimeTranslator: cmd_name = cmd_name[0] (command, nargs) = self.COMMANDS.get(cmd_name, (None, None)) if command and len(args[1:]) in nargs: - logging.info('{} ({}) ||| {}'.format(cmd_name, ctx_name, ' ||| '.join(args[1:]))) + logger.info('{} ({}) ||| {}'.format(cmd_name, ctx_name, ' ||| '.join(args[1:]))) return command(*args[1:], ctx_name=ctx_name) - logging.info('ERROR: command: {}'.format(' ||| '.join(args))) + logger.info('ERROR: command: {}'.format(' ||| '.join(args))) def learn(self, source, target, ctx_name=None): '''Learn from training instance (inc extracting grammar if needed) @@ -282,7 +284,7 @@ class RealtimeTranslator: lock.acquire() self.lazy_ctx(ctx_name) if '' in (source.strip(), target.strip()): - logging.info('ERROR: empty source or target: {} ||| {}'.format(source, target)) + logger.info('ERROR: empty source or target: {} ||| {}'.format(source, target)) lock.release() return if self.norm: @@ -294,15 +296,15 @@ class RealtimeTranslator: # MIRA update before adding data to grammar extractor decoder = self.decoders[ctx_name] mira_log = decoder.decoder.update(source, grammar_file, target) - logging.info('MIRA: {}'.format(mira_log)) + logger.info('MIRA: {}'.format(mira_log)) # Add to HPYPLM by writing to fifo (read on next translation) - logging.info('Adding to HPYPLM: {}'.format(target)) + logger.info('Adding to HPYPLM: {}'.format(target)) decoder.ref_fifo.write('{}\n'.format(target)) decoder.ref_fifo.flush() # Store incremental data for save/load self.ctx_data[ctx_name].append((source, target, alignment)) # Add aligned sentence pair to grammar extractor - logging.info('Adding to bitext: {} ||| {} ||| {}'.format(source, target, alignment)) + logger.info('Adding to bitext: {} ||| {} ||| {}'.format(source, target, alignment)) self.extractor.add_instance(source, target, alignment, ctx_name) # Clear (old) cached grammar rm_grammar = self.grammar_dict[ctx_name].pop(source) @@ -315,7 +317,7 @@ class RealtimeTranslator: self.lazy_ctx(ctx_name) ctx_data = self.ctx_data[ctx_name] out = open(filename, 'w') if filename else sys.stdout - logging.info('Saving state for context ({}) with {} sentences'.format(ctx_name, len(ctx_data))) + logger.info('Saving state for context ({}) with {} sentences'.format(ctx_name, len(ctx_data))) out.write('{}\n'.format(self.decoders[ctx_name].decoder.get_weights())) for (source, target, alignment) in ctx_data: out.write('{} ||| {} ||| {}\n'.format(source, target, alignment)) @@ -333,8 +335,8 @@ class RealtimeTranslator: input = open(filename) if filename else sys.stdin # Non-initial load error if ctx_data: - logging.info('ERROR: Incremental data has already been added to context ({})'.format(ctx_name)) - logging.info(' State can only be loaded to a new context.') + logger.info('ERROR: Incremental data has already been added to context ({})'.format(ctx_name)) + logger.info(' State can only be loaded to a new context.') lock.release() return # Many things can go wrong if bad state data is given @@ -343,7 +345,7 @@ class RealtimeTranslator: line = input.readline().strip() # Throws exception if bad line decoder.decoder.set_weights(line) - logging.info('Loading state...') + logger.info('Loading state...') start_time = time.time() # Lines source ||| target ||| alignment while True: @@ -362,12 +364,12 @@ class RealtimeTranslator: decoder.ref_fifo.write('{}\n'.format(target)) decoder.ref_fifo.flush() stop_time = time.time() - logging.info('Loaded state for context ({}) with {} sentences in {} seconds'.format(ctx_name, len(ctx_data), stop_time - start_time)) + logger.info('Loaded state for context ({}) with {} sentences in {} seconds'.format(ctx_name, len(ctx_data), stop_time - start_time)) lock.release() # Recover from bad load attempt by restarting context. # Guaranteed not to cause data loss since only a new context can load state. except: - logging.info('ERROR: could not load state, restarting context ({})'.format(ctx_name)) + logger.info('ERROR: could not load state, restarting context ({})'.format(ctx_name)) # ctx_name is already owned and needs to be restarted before other blocking threads use self.drop_ctx(ctx_name, force=True) self.lazy_ctx(ctx_name) |