summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMichael Denkowski <mdenkows@cs.cmu.edu>2013-10-01 14:08:05 -0700
committerMichael Denkowski <mdenkows@cs.cmu.edu>2013-10-01 14:08:05 -0700
commit51a83643d95ab0d7add9dd66b0b38044db10a797 (patch)
tree7413fc7a105c1a1f9a3d0a2fb2f25d34640d8c8c
parent4830491825f0c8740d505c192d60b388f64acf1b (diff)
Better logging, save/load to default context
-rwxr-xr-xrealtime/realtime.py5
-rw-r--r--realtime/rt/aligner.py8
-rw-r--r--realtime/rt/decoder.py6
-rw-r--r--realtime/rt/rt.py50
4 files changed, 40 insertions, 29 deletions
diff --git a/realtime/realtime.py b/realtime/realtime.py
index be02d486..6ee785f8 100755
--- a/realtime/realtime.py
+++ b/realtime/realtime.py
@@ -57,6 +57,7 @@ def main():
parser = Parser(description='Real-time adaptive translation with cdec. (See README.md)')
parser.add_argument('-c', '--config', required=True, help='Config directory')
+ parser.add_argument('-s', '--state', help='Load state file to default context (saved incremental data)')
parser.add_argument('-n', '--normalize', help='Normalize text (tokenize, translate, detokenize)', action='store_true')
parser.add_argument('-T', '--temp', help='Temp directory (default /tmp)', default='/tmp')
parser.add_argument('-a', '--cache', help='Grammar cache size (default 5)', default='5')
@@ -74,6 +75,10 @@ def main():
debug(translator, args.debug_test)
return
+ # Load state if given
+ if args.state:
+ rtd.load_state(state)
+
# Read lines and commands
while True:
line = sys.stdin.readline()
diff --git a/realtime/rt/aligner.py b/realtime/rt/aligner.py
index def3fcb5..bcc1ef87 100644
--- a/realtime/rt/aligner.py
+++ b/realtime/rt/aligner.py
@@ -6,6 +6,8 @@ import threading
import util
+logger = logging.getLogger('rt.aligner')
+
class ForceAligner:
def __init__(self, fwd_params, fwd_err, rev_params, rev_err):
@@ -21,13 +23,13 @@ class ForceAligner:
rev_cmd = [fast_align, '-i', '-', '-d', '-T', rev_T, '-m', rev_m, '-f', rev_params, '-r']
tools_cmd = [atools, '-i', '-', '-j', '-', '-c', 'grow-diag-final-and']
- logging.info('Executing: {}'.format(' '.join(fwd_cmd)))
+ logger.info('Executing: {}'.format(' '.join(fwd_cmd)))
self.fwd_align = util.popen_io(fwd_cmd)
- logging.info('Executing: {}'.format(' '.join(rev_cmd)))
+ logger.info('Executing: {}'.format(' '.join(rev_cmd)))
self.rev_align = util.popen_io(rev_cmd)
- logging.info('Executing: {}'.format(' '.join(tools_cmd)))
+ logger.info('Executing: {}'.format(' '.join(tools_cmd)))
self.tools = util.popen_io(tools_cmd)
# Used to guarantee thread safety
diff --git a/realtime/rt/decoder.py b/realtime/rt/decoder.py
index 1cee4610..e6e7489d 100644
--- a/realtime/rt/decoder.py
+++ b/realtime/rt/decoder.py
@@ -5,6 +5,8 @@ import threading
import util
+logger = logging.getLogger('rt.decoder')
+
class Decoder:
def close(self, force=False):
@@ -29,7 +31,7 @@ class CdecDecoder(Decoder):
cdec_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
decoder = os.path.join(cdec_root, 'decoder', 'cdec')
decoder_cmd = [decoder, '-c', config, '-w', weights]
- logging.info('Executing: {}'.format(' '.join(decoder_cmd)))
+ logger.info('Executing: {}'.format(' '.join(decoder_cmd)))
self.decoder = util.popen_io(decoder_cmd)
self.lock = util.FIFOLock()
@@ -40,7 +42,7 @@ class MIRADecoder(Decoder):
mira = os.path.join(cdec_root, 'training', 'mira', 'kbest_cut_mira')
# optimizer=2 step=0.001 best=500, k=500, uniq, stream
mira_cmd = [mira, '-c', config, '-w', weights, '-o', '2', '-C', '0.001', '-b', '500', '-k', '500', '-u', '-t']
- logging.info('Executing: {}'.format(' '.join(mira_cmd)))
+ logger.info('Executing: {}'.format(' '.join(mira_cmd)))
self.decoder = util.popen_io(mira_cmd)
self.lock = util.FIFOLock()
diff --git a/realtime/rt/rt.py b/realtime/rt/rt.py
index 43cc43b4..db831712 100644
--- a/realtime/rt/rt.py
+++ b/realtime/rt/rt.py
@@ -19,6 +19,8 @@ import util
# Dummy input token that is unlikely to appear in normalized data (but no fatal errors if it does)
LIKELY_OOV = '(OOV)'
+logger = logging.getLogger('rt')
+
class RealtimeDecoder:
'''Do not use directly unless you know what you're doing. Use RealtimeTranslator.'''
@@ -48,7 +50,7 @@ class RealtimeDecoder:
self.decoder = decoder.MIRADecoder(decoder_config_file, decoder_weights)
def close(self, force=False):
- logging.info('Closing decoder and removing {}'.format(self.tmp))
+ logger.info('Closing decoder and removing {}'.format(self.tmp))
self.decoder.close(force)
self.ref_fifo.close()
shutil.rmtree(self.tmp)
@@ -75,7 +77,7 @@ class RealtimeTranslator:
self.config = configdir
# Temporary work dir
self.tmp = tempfile.mkdtemp(dir=tmpdir, prefix='realtime.')
- logging.info('Using temp dir {}'.format(self.tmp))
+ logger.info('Using temp dir {}'.format(self.tmp))
# Normalization
self.norm = norm
@@ -127,10 +129,10 @@ class RealtimeTranslator:
def close(self, force=False):
'''Cleanup'''
if force:
- logging.info('Forced shutdown: stopping immediately')
+ logger.info('Forced shutdown: stopping immediately')
for ctx_name in list(self.ctx_names):
self.drop_ctx(ctx_name, force)
- logging.info('Closing processes')
+ logger.info('Closing processes')
self.aligner.close(force)
if self.norm:
if not force:
@@ -141,7 +143,7 @@ class RealtimeTranslator:
if not force:
self.tokenizer_lock.release()
self.detokenizer_lock.release()
- logging.info('Deleting {}'.format(self.tmp))
+ logger.info('Deleting {}'.format(self.tmp))
shutil.rmtree(self.tmp)
def lazy_ctx(self, ctx_name):
@@ -149,7 +151,7 @@ class RealtimeTranslator:
NOT threadsafe, acquire ctx_name lock before calling.'''
if ctx_name in self.ctx_names:
return
- logging.info('New context: {}'.format(ctx_name))
+ logger.info('New context: {}'.format(ctx_name))
self.ctx_names.add(ctx_name)
self.ctx_data[ctx_name] = []
self.grammar_files[ctx_name] = collections.deque()
@@ -164,11 +166,11 @@ class RealtimeTranslator:
if not force:
lock.acquire()
if ctx_name not in self.ctx_names:
- logging.info('No context found, no action: {}'.format(ctx_name))
+ logger.info('No context found, no action: {}'.format(ctx_name))
if not force:
lock.release()
return
- logging.info('Dropping context: {}'.format(ctx_name))
+ logger.info('Dropping context: {}'.format(ctx_name))
self.ctx_names.remove(ctx_name)
self.ctx_data.pop(ctx_name)
self.extractor.drop_ctx(ctx_name)
@@ -193,7 +195,7 @@ class RealtimeTranslator:
grammar_file = grammar_dict.get(sentence, None)
# Cache hit
if grammar_file:
- logging.info('Grammar cache hit: {}'.format(grammar_file))
+ logger.info('Grammar cache hit: {}'.format(grammar_file))
self.extractor_lock.release()
return grammar_file
# Extract and cache
@@ -226,18 +228,18 @@ class RealtimeTranslator:
return ''
if self.norm:
sentence = self.tokenize(sentence)
- logging.info('Normalized input: {}'.format(sentence))
+ logger.info('Normalized input: {}'.format(sentence))
grammar_file = self.grammar(sentence, ctx_name)
decoder = self.decoders[ctx_name]
start_time = time.time()
hyp = decoder.decoder.decode(sentence, grammar_file)
stop_time = time.time()
- logging.info('Translation time: {} seconds'.format(stop_time - start_time))
+ logger.info('Translation time: {} seconds'.format(stop_time - start_time))
# Empty reference: HPYPLM does not learn prior to next translation
decoder.ref_fifo.write('\n')
decoder.ref_fifo.flush()
if self.norm:
- logging.info('Normalized translation: {}'.format(hyp))
+ logger.info('Normalized translation: {}'.format(hyp))
hyp = self.detokenize(hyp)
lock.release()
return hyp
@@ -271,9 +273,9 @@ class RealtimeTranslator:
cmd_name = cmd_name[0]
(command, nargs) = self.COMMANDS.get(cmd_name, (None, None))
if command and len(args[1:]) in nargs:
- logging.info('{} ({}) ||| {}'.format(cmd_name, ctx_name, ' ||| '.join(args[1:])))
+ logger.info('{} ({}) ||| {}'.format(cmd_name, ctx_name, ' ||| '.join(args[1:])))
return command(*args[1:], ctx_name=ctx_name)
- logging.info('ERROR: command: {}'.format(' ||| '.join(args)))
+ logger.info('ERROR: command: {}'.format(' ||| '.join(args)))
def learn(self, source, target, ctx_name=None):
'''Learn from training instance (inc extracting grammar if needed)
@@ -282,7 +284,7 @@ class RealtimeTranslator:
lock.acquire()
self.lazy_ctx(ctx_name)
if '' in (source.strip(), target.strip()):
- logging.info('ERROR: empty source or target: {} ||| {}'.format(source, target))
+ logger.info('ERROR: empty source or target: {} ||| {}'.format(source, target))
lock.release()
return
if self.norm:
@@ -294,15 +296,15 @@ class RealtimeTranslator:
# MIRA update before adding data to grammar extractor
decoder = self.decoders[ctx_name]
mira_log = decoder.decoder.update(source, grammar_file, target)
- logging.info('MIRA: {}'.format(mira_log))
+ logger.info('MIRA: {}'.format(mira_log))
# Add to HPYPLM by writing to fifo (read on next translation)
- logging.info('Adding to HPYPLM: {}'.format(target))
+ logger.info('Adding to HPYPLM: {}'.format(target))
decoder.ref_fifo.write('{}\n'.format(target))
decoder.ref_fifo.flush()
# Store incremental data for save/load
self.ctx_data[ctx_name].append((source, target, alignment))
# Add aligned sentence pair to grammar extractor
- logging.info('Adding to bitext: {} ||| {} ||| {}'.format(source, target, alignment))
+ logger.info('Adding to bitext: {} ||| {} ||| {}'.format(source, target, alignment))
self.extractor.add_instance(source, target, alignment, ctx_name)
# Clear (old) cached grammar
rm_grammar = self.grammar_dict[ctx_name].pop(source)
@@ -315,7 +317,7 @@ class RealtimeTranslator:
self.lazy_ctx(ctx_name)
ctx_data = self.ctx_data[ctx_name]
out = open(filename, 'w') if filename else sys.stdout
- logging.info('Saving state for context ({}) with {} sentences'.format(ctx_name, len(ctx_data)))
+ logger.info('Saving state for context ({}) with {} sentences'.format(ctx_name, len(ctx_data)))
out.write('{}\n'.format(self.decoders[ctx_name].decoder.get_weights()))
for (source, target, alignment) in ctx_data:
out.write('{} ||| {} ||| {}\n'.format(source, target, alignment))
@@ -333,8 +335,8 @@ class RealtimeTranslator:
input = open(filename) if filename else sys.stdin
# Non-initial load error
if ctx_data:
- logging.info('ERROR: Incremental data has already been added to context ({})'.format(ctx_name))
- logging.info(' State can only be loaded to a new context.')
+ logger.info('ERROR: Incremental data has already been added to context ({})'.format(ctx_name))
+ logger.info(' State can only be loaded to a new context.')
lock.release()
return
# Many things can go wrong if bad state data is given
@@ -343,7 +345,7 @@ class RealtimeTranslator:
line = input.readline().strip()
# Throws exception if bad line
decoder.decoder.set_weights(line)
- logging.info('Loading state...')
+ logger.info('Loading state...')
start_time = time.time()
# Lines source ||| target ||| alignment
while True:
@@ -362,12 +364,12 @@ class RealtimeTranslator:
decoder.ref_fifo.write('{}\n'.format(target))
decoder.ref_fifo.flush()
stop_time = time.time()
- logging.info('Loaded state for context ({}) with {} sentences in {} seconds'.format(ctx_name, len(ctx_data), stop_time - start_time))
+ logger.info('Loaded state for context ({}) with {} sentences in {} seconds'.format(ctx_name, len(ctx_data), stop_time - start_time))
lock.release()
# Recover from bad load attempt by restarting context.
# Guaranteed not to cause data loss since only a new context can load state.
except:
- logging.info('ERROR: could not load state, restarting context ({})'.format(ctx_name))
+ logger.info('ERROR: could not load state, restarting context ({})'.format(ctx_name))
# ctx_name is already owned and needs to be restarted before other blocking threads use
self.drop_ctx(ctx_name, force=True)
self.lazy_ctx(ctx_name)