From 8b429e08566ffc640c2de0f0eca66c354c8377f9 Mon Sep 17 00:00:00 2001 From: mjdenkowski Date: Wed, 11 Sep 2013 16:20:36 -0400 Subject: Find pycdec --- realtime/rt/aligner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'realtime/rt/aligner.py') diff --git a/realtime/rt/aligner.py b/realtime/rt/aligner.py index 3c6ea144..80835412 100644 --- a/realtime/rt/aligner.py +++ b/realtime/rt/aligner.py @@ -9,7 +9,7 @@ class ForceAligner: def __init__(self, fwd_params, fwd_err, rev_params, rev_err): - cdec_root = os.path.dirname(os.path.dirname(os.path.dirname(__file__))) + cdec_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) fast_align = os.path.join(cdec_root, 'word-aligner', 'fast_align') atools = os.path.join(cdec_root, 'utils', 'atools') -- cgit v1.2.3 From 5866bdb0541bf136d897cc8ecc72c5ed4b6a93ee Mon Sep 17 00:00:00 2001 From: Michael Denkowski Date: Wed, 25 Sep 2013 16:20:51 -0700 Subject: Super multi-user thread safety update --- realtime/realtime.py | 43 +++++---- realtime/rt/aligner.py | 11 ++- realtime/rt/decoder.py | 32 ++++++- realtime/rt/rt.py | 250 +++++++++++++++++++++++++++++++++++-------------- 4 files changed, 241 insertions(+), 95 deletions(-) (limited to 'realtime/rt/aligner.py') diff --git a/realtime/realtime.py b/realtime/realtime.py index 3c384fa2..282d3311 100755 --- a/realtime/realtime.py +++ b/realtime/realtime.py @@ -2,7 +2,9 @@ import argparse import logging +import signal import sys +import threading import rt @@ -22,34 +24,37 @@ def main(): parser.add_argument('-T', '--temp', help='Temp directory (default /tmp)', default='/tmp') parser.add_argument('-a', '--cache', help='Grammar cache size (default 5)', default='5') parser.add_argument('-v', '--verbose', help='Info to stderr', action='store_true') + parser.add_argument('-D', '--debug-test', help='Test thread safety (debug use only)', action='store_true') args = parser.parse_args() if args.verbose: logging.basicConfig(level=logging.INFO) - with rt.RealtimeDecoder(args.config, tmpdir=args.temp, cache_size=int(args.cache), norm=args.normalize) as rtd: + with rt.RealtimeTranslator(args.config, tmpdir=args.temp, cache_size=int(args.cache), norm=args.normalize) as translator: - try: # Load state if given if args.state: with open(args.state) as input: rtd.load_state(input) - # Read lines and commands - while True: - line = sys.stdin.readline() - if not line: - break - line = line.strip() - if '|||' in line: - rtd.command_line(line) - else: - hyp = rtd.decode(line) - sys.stdout.write('{}\n'.format(hyp)) - sys.stdout.flush() - - # Clean exit on ctrl+c - except KeyboardInterrupt: - logging.info('Caught KeyboardInterrupt, exiting') - + if not args.debug_test: + run(translator) + else: + # TODO: write test + run(translator) + +def run(translator, input=sys.stdin, output=sys.stdout, ctx_name=None): + # Read lines and commands + while True: + line = input.readline() + if not line: + break + line = line.strip() + if '|||' in line: + translator.command_line(line, ctx_name) + else: + hyp = translator.decode(line, ctx_name) + output.write('{}\n'.format(hyp)) + output.flush() + if __name__ == '__main__': main() diff --git a/realtime/rt/aligner.py b/realtime/rt/aligner.py index 80835412..a14121db 100644 --- a/realtime/rt/aligner.py +++ b/realtime/rt/aligner.py @@ -2,6 +2,7 @@ import logging import os import sys import subprocess +import threading import util @@ -29,10 +30,16 @@ class ForceAligner: logging.info('Executing: {}'.format(' '.join(tools_cmd))) self.tools = util.popen_io(tools_cmd) + # Used to guarantee thread safety + self.semaphore = threading.Semaphore() + def align(self, source, target): + '''Threadsafe''' return self.align_formatted('{} ||| {}'.format(source, target)) def align_formatted(self, line): + '''Threadsafe''' + self.semaphore.acquire() self.fwd_align.stdin.write('{}\n'.format(line)) self.rev_align.stdin.write('{}\n'.format(line)) # f words ||| e words ||| links ||| score @@ -40,7 +47,9 @@ class ForceAligner: rev_line = self.rev_align.stdout.readline().split('|||')[2].strip() self.tools.stdin.write('{}\n'.format(fwd_line)) self.tools.stdin.write('{}\n'.format(rev_line)) - return self.tools.stdout.readline().strip() + al_line = self.tools.stdout.readline().strip() + self.semaphore.release() + return al_line def close(self): self.fwd_align.stdin.close() diff --git a/realtime/rt/decoder.py b/realtime/rt/decoder.py index aa6db64d..72b5b959 100644 --- a/realtime/rt/decoder.py +++ b/realtime/rt/decoder.py @@ -1,27 +1,37 @@ import logging import os import subprocess +import threading import util class Decoder: - def close(self): + def close(self, force=False): + if not force: + self.semaphore.acquire() self.decoder.stdin.close() + if not force: + self.semaphore.release() def decode(self, sentence, grammar=None): + '''Threadsafe''' input = '{s}\n'.format(s=sentence, g=grammar) if grammar else '{}\n'.format(sentence) + self.semaphore.acquire() self.decoder.stdin.write(input) - return self.decoder.stdout.readline().strip() + hyp = self.decoder.stdout.readline().strip() + self.semaphore.release() + return hyp class CdecDecoder(Decoder): - + def __init__(self, config, weights): cdec_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) decoder = os.path.join(cdec_root, 'decoder', 'cdec') decoder_cmd = [decoder, '-c', config, '-w', weights] logging.info('Executing: {}'.format(' '.join(decoder_cmd))) self.decoder = util.popen_io(decoder_cmd) + self.semaphore = threading.Semaphore() class MIRADecoder(Decoder): @@ -32,15 +42,27 @@ class MIRADecoder(Decoder): mira_cmd = [mira, '-c', config, '-w', weights, '-o', '2', '-C', '0.001', '-b', '500', '-k', '500', '-u', '-t'] logging.info('Executing: {}'.format(' '.join(mira_cmd))) self.decoder = util.popen_io(mira_cmd) + self.semaphore = threading.Semaphore() def get_weights(self): + '''Threadsafe''' + self.semaphore.acquire() self.decoder.stdin.write('WEIGHTS ||| WRITE\n') - return self.decoder.stdout.readline().strip() + weights = self.decoder.stdout.readline().strip() + self.semaphore.release() + return weights def set_weights(self, w_line): + '''Threadsafe''' + self.semaphore.acquire() self.decoder.stdin.write('WEIGHTS ||| {}\n'.format(w_line)) + self.semaphore.release() def update(self, sentence, grammar, reference): + '''Threadsafe''' input = 'LEARN ||| {s} ||| {r}\n'.format(s=sentence, g=grammar, r=reference) + self.semaphore.acquire() self.decoder.stdin.write(input) - return self.decoder.stdout.readline().strip() + log = self.decoder.stdout.readline().strip() + self.semaphore.release() + return log diff --git a/realtime/rt/rt.py b/realtime/rt/rt.py index 033ed790..6f1fb70f 100644 --- a/realtime/rt/rt.py +++ b/realtime/rt/rt.py @@ -8,6 +8,7 @@ import shutil import sys import subprocess import tempfile +import threading import time import cdec @@ -15,18 +16,56 @@ import aligner import decoder import util -LIKELY_OOV = '("OOV")' +# Dummy input token that is unlikely to appear in normalized data (but no fatal errors if it does) +LIKELY_OOV = '(OOV)' class RealtimeDecoder: + '''Do not use directly unless you know what you're doing. Use RealtimeTranslator.''' + + def __init__(self, configdir, tmpdir): + + cdec_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + + self.tmp = tmpdir + os.mkdir(self.tmp) + + # HPYPLM reference stream + ref_fifo_file = os.path.join(self.tmp, 'ref.fifo') + os.mkfifo(ref_fifo_file) + self.ref_fifo = open(ref_fifo_file, 'w+') + # Start with empty line (do not learn prior to first input) + self.ref_fifo.write('\n') + self.ref_fifo.flush() + + # Decoder + decoder_config = [[f.strip() for f in line.split('=')] for line in open(os.path.join(configdir, 'cdec.ini'))] + util.cdec_ini_for_realtime(decoder_config, os.path.abspath(configdir), ref_fifo_file) + decoder_config_file = os.path.join(self.tmp, 'cdec.ini') + with open(decoder_config_file, 'w') as output: + for (k, v) in decoder_config: + output.write('{}={}\n'.format(k, v)) + decoder_weights = os.path.join(configdir, 'weights.final') + self.decoder = decoder.MIRADecoder(decoder_config_file, decoder_weights) + + def close(self, force=False): + logging.info('Closing decoder and removing {}'.format(self.tmp)) + self.decoder.close(force) + self.ref_fifo.close() + shutil.rmtree(self.tmp) + +class RealtimeTranslator: + '''Main entry point into API: serves translations to any number of concurrent users''' def __init__(self, configdir, tmpdir='/tmp', cache_size=5, norm=False, state=None): + # TODO: save/load self.commands = {'LEARN': self.learn, 'SAVE': self.save_state, 'LOAD': self.load_state} cdec_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) - self.inc_data = [] # instances of (source, target) + ### Single instance for all contexts + self.config = configdir # Temporary work dir self.tmp = tempfile.mkdtemp(dir=tmpdir, prefix='realtime.') logging.info('Using temp dir {}'.format(self.tmp)) @@ -35,7 +74,9 @@ class RealtimeDecoder: self.norm = norm if self.norm: self.tokenizer = util.popen_io([os.path.join(cdec_root, 'corpus', 'tokenize-anything.sh'), '-u']) + self.tokenizer_sem = threading.Semaphore() self.detokenizer = util.popen_io([os.path.join(cdec_root, 'corpus', 'untok.pl')]) + self.detokenizer_sem = threading.Semaphore() # Word aligner fwd_params = os.path.join(configdir, 'a.fwd_params') @@ -50,28 +91,24 @@ class RealtimeDecoder: util.sa_ini_for_realtime(sa_config, os.path.abspath(configdir)) sa_config.write() self.extractor = cdec.sa.GrammarExtractor(sa_config.filename, online=True) - self.grammar_files = collections.deque() - self.grammar_dict = {} self.cache_size = cache_size - # HPYPLM reference stream - ref_fifo_file = os.path.join(self.tmp, 'ref.fifo') - os.mkfifo(ref_fifo_file) - self.ref_fifo = open(ref_fifo_file, 'w+') - # Start with empty line (do not learn prior to first input) - self.ref_fifo.write('\n') - self.ref_fifo.flush() + ### One instance per context - # Decoder - decoder_config = [[f.strip() for f in line.split('=')] for line in open(os.path.join(configdir, 'cdec.ini'))] - util.cdec_ini_for_realtime(decoder_config, os.path.abspath(configdir), ref_fifo_file) - decoder_config_file = os.path.join(self.tmp, 'cdec.ini') - with open(decoder_config_file, 'w') as output: - for (k, v) in decoder_config: - output.write('{}={}\n'.format(k, v)) - decoder_weights = os.path.join(configdir, 'weights.final') - self.decoder = decoder.MIRADecoder(decoder_config_file, decoder_weights) + self.ctx_names = set() + # All context-dependent operations are atomic + self.ctx_sems = collections.defaultdict(threading.Semaphore) + # ctx -> list of (source, target, alignment) + self.ctx_data = {} + + # ctx -> deque of file + self.grammar_files = {} + # ctx -> dict of {sentence: file} + self.grammar_dict = {} + self.decoders = {} + + # TODO: state # Load state if given if state: with open(state) as input: @@ -80,125 +117,197 @@ class RealtimeDecoder: def __enter__(self): return self - def __exit__(self, type, value, traceback): - self.close() + def __exit__(self, ex_type, ex_value, ex_traceback): + self.close(ex_type is KeyboardInterrupt) - def close(self): + def close(self, force=False): + '''Cleanup''' + if force: + logging.info('Forced shutdown: stopping immediately') + for ctx_name in list(self.ctx_names): + self.drop_ctx(ctx_name, force) logging.info('Closing processes') self.aligner.close() - self.decoder.close() - self.ref_fifo.close() if self.norm: self.tokenizer.stdin.close() self.detokenizer.stdin.close() logging.info('Deleting {}'.format(self.tmp)) shutil.rmtree(self.tmp) - def grammar(self, sentence): - grammar_file = self.grammar_dict.get(sentence, None) + def lazy_ctx(self, ctx_name): + '''Initialize a context (inc starting a new decoder) if needed''' + self.ctx_sems[ctx_name].acquire() + if ctx_name in self.ctx_names: + self.ctx_sems[ctx_name].release() + return + logging.info('New context: {}'.format(ctx_name)) + self.ctx_names.add(ctx_name) + self.ctx_data[ctx_name] = [] + self.grammar_files[ctx_name] = collections.deque() + self.grammar_dict[ctx_name] = {} + tmpdir = os.path.join(self.tmp, 'decoder.{}'.format(ctx_name)) + self.decoders[ctx_name] = RealtimeDecoder(self.config, tmpdir) + self.ctx_sems[ctx_name].release() + + def drop_ctx(self, ctx_name, force=False): + '''Delete a context (inc stopping the decoder)''' + if not force: + sem = self.ctx_sems[ctx_name] + sem.acquire() + logging.info('Dropping context: {}'.format(ctx_name)) + self.ctx_names.remove(ctx_name) + self.ctx_data.pop(ctx_name) + self.extractor.drop_ctx(ctx_name) + self.grammar_files.pop(ctx_name) + self.grammar_dict.pop(ctx_name) + self.decoders.pop(ctx_name).close(force) + self.ctx_sems.pop(ctx_name) + if not force: + sem.release() + + def grammar(self, sentence, ctx_name=None): + '''Extract a sentence-level grammar on demand (or return cached)''' + self.lazy_ctx(ctx_name) + sem = self.ctx_sems[ctx_name] + sem.acquire() + grammar_dict = self.grammar_dict[ctx_name] + grammar_file = grammar_dict.get(sentence, None) # Cache hit if grammar_file: - logging.info('Grammar cache hit') + logging.info('Grammar cache hit: {}'.format(grammar_file)) + sem.release() return grammar_file # Extract and cache - (fid, grammar_file) = tempfile.mkstemp(dir=self.tmp, prefix='grammar.') + (fid, grammar_file) = tempfile.mkstemp(dir=self.decoders[ctx_name].tmp, prefix='grammar.') os.close(fid) with open(grammar_file, 'w') as output: - for rule in self.extractor.grammar(sentence): + for rule in self.extractor.grammar(sentence, ctx_name): output.write('{}\n'.format(str(rule))) - if len(self.grammar_files) == self.cache_size: - rm_sent = self.grammar_files.popleft() + grammar_files = self.grammar_files[ctx_name] + if len(grammar_files) == self.cache_size: + rm_sent = grammar_files.popleft() # If not already removed by learn method - if rm_sent in self.grammar_dict: - rm_grammar = self.grammar_dict.pop(rm_sent) + if rm_sent in grammar_dict: + rm_grammar = grammar_dict.pop(rm_sent) os.remove(rm_grammar) - self.grammar_files.append(sentence) - self.grammar_dict[sentence] = grammar_file + grammar_files.append(sentence) + grammar_dict[sentence] = grammar_file + sem.release() return grammar_file - def decode(self, sentence): + def decode(self, sentence, ctx_name=None): + '''Decode a sentence (inc extracting a grammar if needed)''' + self.lazy_ctx(ctx_name) # Empty in, empty out if sentence.strip() == '': return '' if self.norm: sentence = self.tokenize(sentence) logging.info('Normalized input: {}'.format(sentence)) - grammar_file = self.grammar(sentence) + # grammar method is threadsafe + grammar_file = self.grammar(sentence, ctx_name) + decoder = self.decoders[ctx_name] + sem = self.ctx_sems[ctx_name] + sem.acquire() start_time = time.time() - hyp = self.decoder.decode(sentence, grammar_file) + hyp = decoder.decoder.decode(sentence, grammar_file) stop_time = time.time() logging.info('Translation time: {} seconds'.format(stop_time - start_time)) # Empty reference: HPYPLM does not learn prior to next translation - self.ref_fifo.write('\n') - self.ref_fifo.flush() + decoder.ref_fifo.write('\n') + decoder.ref_fifo.flush() + sem.release() if self.norm: logging.info('Normalized translation: {}'.format(hyp)) hyp = self.detokenize(hyp) return hyp def tokenize(self, line): + self.tokenizer_sem.acquire() self.tokenizer.stdin.write('{}\n'.format(line)) - return self.tokenizer.stdout.readline().strip() + tok_line = self.tokenizer.stdout.readline().strip() + self.tokenizer_sem.release() + return tok_line def detokenize(self, line): + self.detokenizer_sem.acquire() self.detokenizer.stdin.write('{}\n'.format(line)) - return self.detokenizer.stdout.readline().strip() + detok_line = self.detokenizer.stdout.readline().strip() + self.detokenizer_sem.release() + return detok_line - def command_line(self, line): + # TODO + def command_line(self, line, ctx_name=None): args = [f.strip() for f in line.split('|||')] try: if len(args) == 2 and not args[1]: - self.commands[args[0]]() + self.commands[args[0]](ctx_name) else: - self.commands[args[0]](*args[1:]) + self.commands[args[0]](*args[1:], ctx_name=ctx_name) except: logging.info('Command error: {}'.format(' ||| '.join(args))) - def learn(self, source, target): + def learn(self, source, target, ctx_name=None): + self.lazy_ctx(ctx_name) if '' in (source.strip(), target.strip()): logging.info('Error empty source or target: {} ||| {}'.format(source, target)) return if self.norm: source = self.tokenize(source) target = self.tokenize(target) + # Align instance (threadsafe) + alignment = self.aligner.align(source, target) + # grammar method is threadsafe + grammar_file = self.grammar(source, ctx_name) + sem = self.ctx_sems[ctx_name] + sem.acquire() # MIRA update before adding data to grammar extractor - grammar_file = self.grammar(source) - mira_log = self.decoder.update(source, grammar_file, target) + decoder = self.decoders[ctx_name] + mira_log = decoder.decoder.update(source, grammar_file, target) logging.info('MIRA: {}'.format(mira_log)) - # Align instance - alignment = self.aligner.align(source, target) + # Add to HPYPLM by writing to fifo (read on next translation) + logging.info('Adding to HPYPLM: {}'.format(target)) + decoder.ref_fifo.write('{}\n'.format(target)) + decoder.ref_fifo.flush() # Store incremental data for save/load - self.inc_data.append((source, target, alignment)) + self.ctx_data[ctx_name].append((source, target, alignment)) # Add aligned sentence pair to grammar extractor logging.info('Adding to bitext: {} ||| {} ||| {}'.format(source, target, alignment)) - self.extractor.add_instance(source, target, alignment) + self.extractor.add_instance(source, target, alignment, ctx_name) # Clear (old) cached grammar - rm_grammar = self.grammar_dict.pop(source) + rm_grammar = self.grammar_dict[ctx_name].pop(source) os.remove(rm_grammar) - # Add to HPYPLM by writing to fifo (read on next translation) - logging.info('Adding to HPYPLM: {}'.format(target)) - self.ref_fifo.write('{}\n'.format(target)) - self.ref_fifo.flush() + sem.release() - def save_state(self, filename=None): + def save_state(self, filename=None, ctx_name=None): + self.lazy_ctx(ctx_name) out = open(filename, 'w') if filename else sys.stdout - logging.info('Saving state with {} sentences'.format(len(self.inc_data))) - out.write('{}\n'.format(self.decoder.get_weights())) - for (source, target, alignment) in self.inc_data: + sem = self.ctx_sems[ctx_name] + sem.acquire() + ctx_data = self.ctx_data[ctx_name] + logging.info('Saving state with {} sentences'.format(len(self.ctx_data))) + out.write('{}\n'.format(self.decoders[ctx_name].decoder.get_weights())) + for (source, target, alignment) in ctx_data: out.write('{} ||| {} ||| {}\n'.format(source, target, alignment)) + sem.release() out.write('EOF\n') if filename: out.close() - def load_state(self, input=sys.stdin): - # Non-initial load error - if self.inc_data: + def load_state(self, input=sys.stdin, ctx_name=None): + self.lazy_ctx(ctx_name) + sem = self.ctx_sems[ctx_name] + sem.acquire() + ctx_data = self.ctx_data[ctx_name] + decoder = self.decoders[ctx_name] + # Non-initial load error + if ctx_data: logging.info('Error: Incremental data has already been added to decoder.') logging.info(' State can only be loaded by a freshly started decoder.') return # MIRA weights line = input.readline().strip() - self.decoder.set_weights(line) + decoder.decoder.set_weights(line) logging.info('Loading state...') start_time = time.time() # Lines source ||| target ||| alignment @@ -207,12 +316,13 @@ class RealtimeDecoder: if line == 'EOF': break (source, target, alignment) = line.split(' ||| ') - self.inc_data.append((source, target, alignment)) + ctx_data.append((source, target, alignment)) # Extractor - self.extractor.add_instance(source, target, alignment) + self.extractor.add_instance(source, target, alignment, ctx_name) # HPYPLM - hyp = self.decoder.decode(LIKELY_OOV) + hyp = decoder.decoder.decode(LIKELY_OOV) self.ref_fifo.write('{}\n'.format(target)) self.ref_fifo.flush() stop_time = time.time() - logging.info('Loaded state with {} sentences in {} seconds'.format(len(self.inc_data), stop_time - start_time)) + logging.info('Loaded state with {} sentences in {} seconds'.format(len(ctx_data), stop_time - start_time)) + sem.release() -- cgit v1.2.3 From cb718c763e07b8e1417383ef7ae5c1aca36d2a0a Mon Sep 17 00:00:00 2001 From: Michael Denkowski Date: Thu, 26 Sep 2013 14:28:42 -0700 Subject: FIFO Locks --- realtime/rt/aligner.py | 6 ++--- realtime/rt/decoder.py | 24 +++++++++--------- realtime/rt/rt.py | 66 +++++++++++++++++++++++++------------------------- realtime/rt/util.py | 19 +++++++++++++++ 4 files changed, 67 insertions(+), 48 deletions(-) (limited to 'realtime/rt/aligner.py') diff --git a/realtime/rt/aligner.py b/realtime/rt/aligner.py index a14121db..62ce32b8 100644 --- a/realtime/rt/aligner.py +++ b/realtime/rt/aligner.py @@ -31,7 +31,7 @@ class ForceAligner: self.tools = util.popen_io(tools_cmd) # Used to guarantee thread safety - self.semaphore = threading.Semaphore() + self.lock = util.FIFOLock() def align(self, source, target): '''Threadsafe''' @@ -39,7 +39,7 @@ class ForceAligner: def align_formatted(self, line): '''Threadsafe''' - self.semaphore.acquire() + self.lock.acquire() self.fwd_align.stdin.write('{}\n'.format(line)) self.rev_align.stdin.write('{}\n'.format(line)) # f words ||| e words ||| links ||| score @@ -48,7 +48,7 @@ class ForceAligner: self.tools.stdin.write('{}\n'.format(fwd_line)) self.tools.stdin.write('{}\n'.format(rev_line)) al_line = self.tools.stdout.readline().strip() - self.semaphore.release() + self.lock.release() return al_line def close(self): diff --git a/realtime/rt/decoder.py b/realtime/rt/decoder.py index 72b5b959..7c36b441 100644 --- a/realtime/rt/decoder.py +++ b/realtime/rt/decoder.py @@ -9,18 +9,18 @@ class Decoder: def close(self, force=False): if not force: - self.semaphore.acquire() + self.lock.acquire() self.decoder.stdin.close() if not force: - self.semaphore.release() + self.lock.release() def decode(self, sentence, grammar=None): '''Threadsafe''' input = '{s}\n'.format(s=sentence, g=grammar) if grammar else '{}\n'.format(sentence) - self.semaphore.acquire() + self.lock.acquire() self.decoder.stdin.write(input) hyp = self.decoder.stdout.readline().strip() - self.semaphore.release() + self.lock.release() return hyp class CdecDecoder(Decoder): @@ -31,7 +31,7 @@ class CdecDecoder(Decoder): decoder_cmd = [decoder, '-c', config, '-w', weights] logging.info('Executing: {}'.format(' '.join(decoder_cmd))) self.decoder = util.popen_io(decoder_cmd) - self.semaphore = threading.Semaphore() + self.lock = util.FIFOLock() class MIRADecoder(Decoder): @@ -42,27 +42,27 @@ class MIRADecoder(Decoder): mira_cmd = [mira, '-c', config, '-w', weights, '-o', '2', '-C', '0.001', '-b', '500', '-k', '500', '-u', '-t'] logging.info('Executing: {}'.format(' '.join(mira_cmd))) self.decoder = util.popen_io(mira_cmd) - self.semaphore = threading.Semaphore() + self.lock = util.FIFOLock() def get_weights(self): '''Threadsafe''' - self.semaphore.acquire() + self.lock.acquire() self.decoder.stdin.write('WEIGHTS ||| WRITE\n') weights = self.decoder.stdout.readline().strip() - self.semaphore.release() + self.lock.release() return weights def set_weights(self, w_line): '''Threadsafe''' - self.semaphore.acquire() + self.lock.acquire() self.decoder.stdin.write('WEIGHTS ||| {}\n'.format(w_line)) - self.semaphore.release() + self.lock.release() def update(self, sentence, grammar, reference): '''Threadsafe''' input = 'LEARN ||| {s} ||| {r}\n'.format(s=sentence, g=grammar, r=reference) - self.semaphore.acquire() + self.lock.acquire() self.decoder.stdin.write(input) log = self.decoder.stdout.readline().strip() - self.semaphore.release() + self.lock.release() return log diff --git a/realtime/rt/rt.py b/realtime/rt/rt.py index f8126283..1e78e188 100644 --- a/realtime/rt/rt.py +++ b/realtime/rt/rt.py @@ -74,9 +74,9 @@ class RealtimeTranslator: self.norm = norm if self.norm: self.tokenizer = util.popen_io([os.path.join(cdec_root, 'corpus', 'tokenize-anything.sh'), '-u']) - self.tokenizer_sem = threading.Semaphore() + self.tokenizer_lock = util.FIFOLock() self.detokenizer = util.popen_io([os.path.join(cdec_root, 'corpus', 'untok.pl')]) - self.detokenizer_sem = threading.Semaphore() + self.detokenizer_lock = util.FIFOLock() # Word aligner fwd_params = os.path.join(configdir, 'a.fwd_params') @@ -97,12 +97,12 @@ class RealtimeTranslator: self.ctx_names = set() # All context-dependent operations are atomic - self.ctx_sems = collections.defaultdict(threading.Semaphore) + self.ctx_locks = collections.defaultdict(util.FIFOLock) # ctx -> list of (source, target, alignment) self.ctx_data = {} # Grammar extractor is not threadsafe - self.extractor_sem = threading.Semaphore() + self.extractor_lock = util.FIFOLock() # ctx -> deque of file self.grammar_files = {} # ctx -> dict of {sentence: file} @@ -138,9 +138,9 @@ class RealtimeTranslator: def lazy_ctx(self, ctx_name): '''Initialize a context (inc starting a new decoder) if needed''' - self.ctx_sems[ctx_name].acquire() + self.ctx_locks[ctx_name].acquire() if ctx_name in self.ctx_names: - self.ctx_sems[ctx_name].release() + self.ctx_locks[ctx_name].release() return logging.info('New context: {}'.format(ctx_name)) self.ctx_names.add(ctx_name) @@ -149,13 +149,13 @@ class RealtimeTranslator: self.grammar_dict[ctx_name] = {} tmpdir = os.path.join(self.tmp, 'decoder.{}'.format(ctx_name)) self.decoders[ctx_name] = RealtimeDecoder(self.config, tmpdir) - self.ctx_sems[ctx_name].release() + self.ctx_locks[ctx_name].release() def drop_ctx(self, ctx_name, force=False): '''Delete a context (inc stopping the decoder)''' if not force: - sem = self.ctx_sems[ctx_name] - sem.acquire() + lock = self.ctx_locks[ctx_name] + lock.acquire() logging.info('Dropping context: {}'.format(ctx_name)) self.ctx_names.remove(ctx_name) self.ctx_data.pop(ctx_name) @@ -163,30 +163,30 @@ class RealtimeTranslator: self.grammar_files.pop(ctx_name) self.grammar_dict.pop(ctx_name) self.decoders.pop(ctx_name).close(force) - self.ctx_sems.pop(ctx_name) + self.ctx_locks.pop(ctx_name) if not force: - sem.release() + lock.release() def grammar(self, sentence, ctx_name=None): '''Extract a sentence-level grammar on demand (or return cached)''' self.lazy_ctx(ctx_name) - sem = self.ctx_sems[ctx_name] - sem.acquire() + lock = self.ctx_locks[ctx_name] + lock.acquire() grammar_dict = self.grammar_dict[ctx_name] grammar_file = grammar_dict.get(sentence, None) # Cache hit if grammar_file: logging.info('Grammar cache hit: {}'.format(grammar_file)) - sem.release() + lock.release() return grammar_file # Extract and cache (fid, grammar_file) = tempfile.mkstemp(dir=self.decoders[ctx_name].tmp, prefix='grammar.') os.close(fid) with open(grammar_file, 'w') as output: - self.extractor_sem.acquire() + self.extractor_lock.acquire() for rule in self.extractor.grammar(sentence, ctx_name): output.write('{}\n'.format(str(rule))) - self.extractor_sem.release() + self.extractor_lock.release() grammar_files = self.grammar_files[ctx_name] if len(grammar_files) == self.cache_size: rm_sent = grammar_files.popleft() @@ -196,7 +196,7 @@ class RealtimeTranslator: os.remove(rm_grammar) grammar_files.append(sentence) grammar_dict[sentence] = grammar_file - sem.release() + lock.release() return grammar_file def decode(self, sentence, ctx_name=None): @@ -211,8 +211,8 @@ class RealtimeTranslator: # grammar method is threadsafe grammar_file = self.grammar(sentence, ctx_name) decoder = self.decoders[ctx_name] - sem = self.ctx_sems[ctx_name] - sem.acquire() + lock = self.ctx_locks[ctx_name] + lock.acquire() start_time = time.time() hyp = decoder.decoder.decode(sentence, grammar_file) stop_time = time.time() @@ -220,24 +220,24 @@ class RealtimeTranslator: # Empty reference: HPYPLM does not learn prior to next translation decoder.ref_fifo.write('\n') decoder.ref_fifo.flush() - sem.release() + lock.release() if self.norm: logging.info('Normalized translation: {}'.format(hyp)) hyp = self.detokenize(hyp) return hyp def tokenize(self, line): - self.tokenizer_sem.acquire() + self.tokenizer_lock.acquire() self.tokenizer.stdin.write('{}\n'.format(line)) tok_line = self.tokenizer.stdout.readline().strip() - self.tokenizer_sem.release() + self.tokenizer_lock.release() return tok_line def detokenize(self, line): - self.detokenizer_sem.acquire() + self.detokenizer_lock.acquire() self.detokenizer.stdin.write('{}\n'.format(line)) detok_line = self.detokenizer.stdout.readline().strip() - self.detokenizer_sem.release() + self.detokenizer_lock.release() return detok_line # TODO @@ -263,8 +263,8 @@ class RealtimeTranslator: alignment = self.aligner.align(source, target) # grammar method is threadsafe grammar_file = self.grammar(source, ctx_name) - sem = self.ctx_sems[ctx_name] - sem.acquire() + lock = self.ctx_locks[ctx_name] + lock.acquire() # MIRA update before adding data to grammar extractor decoder = self.decoders[ctx_name] mira_log = decoder.decoder.update(source, grammar_file, target) @@ -281,27 +281,27 @@ class RealtimeTranslator: # Clear (old) cached grammar rm_grammar = self.grammar_dict[ctx_name].pop(source) os.remove(rm_grammar) - sem.release() + lock.release() def save_state(self, filename=None, ctx_name=None): self.lazy_ctx(ctx_name) out = open(filename, 'w') if filename else sys.stdout - sem = self.ctx_sems[ctx_name] - sem.acquire() + lock = self.ctx_locks[ctx_name] + lock.acquire() ctx_data = self.ctx_data[ctx_name] logging.info('Saving state with {} sentences'.format(len(self.ctx_data))) out.write('{}\n'.format(self.decoders[ctx_name].decoder.get_weights())) for (source, target, alignment) in ctx_data: out.write('{} ||| {} ||| {}\n'.format(source, target, alignment)) - sem.release() + lock.release() out.write('EOF\n') if filename: out.close() def load_state(self, input=sys.stdin, ctx_name=None): self.lazy_ctx(ctx_name) - sem = self.ctx_sems[ctx_name] - sem.acquire() + lock = self.ctx_locks[ctx_name] + lock.acquire() ctx_data = self.ctx_data[ctx_name] decoder = self.decoders[ctx_name] # Non-initial load error @@ -329,4 +329,4 @@ class RealtimeTranslator: self.ref_fifo.flush() stop_time = time.time() logging.info('Loaded state with {} sentences in {} seconds'.format(len(ctx_data), stop_time - start_time)) - sem.release() + lock.release() diff --git a/realtime/rt/util.py b/realtime/rt/util.py index 6e07f116..05dcae96 100644 --- a/realtime/rt/util.py +++ b/realtime/rt/util.py @@ -1,4 +1,5 @@ import os +import Queue import subprocess import sys import threading @@ -13,6 +14,24 @@ SA_INI_FILES = set(( 'precompute_file', )) +class FIFOLock: + + def __init__(self): + self.q = Queue.Queue() + self.i = 0 + + def acquire(self): + self.i += 1 + if self.i > 1: + event = threading.Event() + self.q.put(event) + event.wait() + + def release(self): + self.i -= 1 + if self.i > 0: + self.q.get().set() + def cdec_ini_for_config(config): # This is a list of (k, v), not a ConfigObj or dict for i in range(len(config)): -- cgit v1.2.3 From b8116c5c3c7e31a276ff38fc8173eab37f292364 Mon Sep 17 00:00:00 2001 From: Michael Denkowski Date: Fri, 27 Sep 2013 13:39:24 -0700 Subject: Decoding and learning with multiple contexts is threadsafe and FIFO. --- realtime/realtime.py | 17 ++++++++----- realtime/rt/aligner.py | 10 +++++--- realtime/rt/decoder.py | 8 +++---- realtime/rt/rt.py | 65 ++++++++++++++++++++++++++++---------------------- realtime/rt/util.py | 8 +++++++ 5 files changed, 67 insertions(+), 41 deletions(-) (limited to 'realtime/rt/aligner.py') diff --git a/realtime/realtime.py b/realtime/realtime.py index bbec288b..38da4413 100755 --- a/realtime/realtime.py +++ b/realtime/realtime.py @@ -31,22 +31,27 @@ def test1(translator, input, output, ctx_name): out.close() def debug(translator, input): - # Test 1: identical output + # Test 1: multiple contexts threads = [] for i in range(4): t = threading.Thread(target=test1, args=(translator, input, '{}.out.{}'.format(input, i), str(i))) threads.append(t) t.start() time.sleep(30) - for t in threads: - t.join() - # Test 2: flood (same number of lines) - threads = [] + # Test 2: flood out = open('{}.out.flood'.format(input), 'w') - for line in open(input): + inp = open(input) + while True: + line = inp.readline() + if not line: + break + line = line.strip() t = threading.Thread(target=handle_line, args=(translator, line.strip(), out, None)) threads.append(t) t.start() + time.sleep(1) + translator.drop_ctx(None) + # Join test threads for t in threads: t.join() diff --git a/realtime/rt/aligner.py b/realtime/rt/aligner.py index 62ce32b8..def3fcb5 100644 --- a/realtime/rt/aligner.py +++ b/realtime/rt/aligner.py @@ -34,11 +34,11 @@ class ForceAligner: self.lock = util.FIFOLock() def align(self, source, target): - '''Threadsafe''' + '''Threadsafe, FIFO''' return self.align_formatted('{} ||| {}'.format(source, target)) def align_formatted(self, line): - '''Threadsafe''' + '''Threadsafe, FIFO''' self.lock.acquire() self.fwd_align.stdin.write('{}\n'.format(line)) self.rev_align.stdin.write('{}\n'.format(line)) @@ -51,10 +51,14 @@ class ForceAligner: self.lock.release() return al_line - def close(self): + def close(self, force=False): + if not force: + self.lock.acquire() self.fwd_align.stdin.close() self.rev_align.stdin.close() self.tools.stdin.close() + if not force: + self.lock.release() def read_err(self, err): (T, m) = ('', '') diff --git a/realtime/rt/decoder.py b/realtime/rt/decoder.py index 7c36b441..da646f68 100644 --- a/realtime/rt/decoder.py +++ b/realtime/rt/decoder.py @@ -15,7 +15,7 @@ class Decoder: self.lock.release() def decode(self, sentence, grammar=None): - '''Threadsafe''' + '''Threadsafe, FIFO''' input = '{s}\n'.format(s=sentence, g=grammar) if grammar else '{}\n'.format(sentence) self.lock.acquire() self.decoder.stdin.write(input) @@ -45,7 +45,7 @@ class MIRADecoder(Decoder): self.lock = util.FIFOLock() def get_weights(self): - '''Threadsafe''' + '''Threadsafe, FIFO''' self.lock.acquire() self.decoder.stdin.write('WEIGHTS ||| WRITE\n') weights = self.decoder.stdout.readline().strip() @@ -53,13 +53,13 @@ class MIRADecoder(Decoder): return weights def set_weights(self, w_line): - '''Threadsafe''' + '''Threadsafe, FIFO''' self.lock.acquire() self.decoder.stdin.write('WEIGHTS ||| {}\n'.format(w_line)) self.lock.release() def update(self, sentence, grammar, reference): - '''Threadsafe''' + '''Threadsafe, FIFO''' input = 'LEARN ||| {s} ||| {r}\n'.format(s=sentence, g=grammar, r=reference) self.lock.acquire() self.decoder.stdin.write(input) diff --git a/realtime/rt/rt.py b/realtime/rt/rt.py index 1e78e188..5ace5d59 100644 --- a/realtime/rt/rt.py +++ b/realtime/rt/rt.py @@ -129,18 +129,23 @@ class RealtimeTranslator: for ctx_name in list(self.ctx_names): self.drop_ctx(ctx_name, force) logging.info('Closing processes') - self.aligner.close() + self.aligner.close(force) if self.norm: + if not force: + self.tokenizer_lock.acquire() + self.detokenizer_lock.acquire() self.tokenizer.stdin.close() self.detokenizer.stdin.close() + if not force: + self.tokenizer_lock.release() + self.detokenizer_lock.release() logging.info('Deleting {}'.format(self.tmp)) shutil.rmtree(self.tmp) def lazy_ctx(self, ctx_name): - '''Initialize a context (inc starting a new decoder) if needed''' - self.ctx_locks[ctx_name].acquire() + '''Initialize a context (inc starting a new decoder) if needed. + NOT threadsafe, acquire ctx_name lock before calling.''' if ctx_name in self.ctx_names: - self.ctx_locks[ctx_name].release() return logging.info('New context: {}'.format(ctx_name)) self.ctx_names.add(ctx_name) @@ -149,12 +154,12 @@ class RealtimeTranslator: self.grammar_dict[ctx_name] = {} tmpdir = os.path.join(self.tmp, 'decoder.{}'.format(ctx_name)) self.decoders[ctx_name] = RealtimeDecoder(self.config, tmpdir) - self.ctx_locks[ctx_name].release() - def drop_ctx(self, ctx_name, force=False): - '''Delete a context (inc stopping the decoder)''' + def drop_ctx(self, ctx_name=None, force=False): + '''Delete a context (inc stopping the decoder) + Threadsafe and FIFO unless forced.''' + lock = self.ctx_locks[ctx_name] if not force: - lock = self.ctx_locks[ctx_name] lock.acquire() logging.info('Dropping context: {}'.format(ctx_name)) self.ctx_names.remove(ctx_name) @@ -168,25 +173,24 @@ class RealtimeTranslator: lock.release() def grammar(self, sentence, ctx_name=None): - '''Extract a sentence-level grammar on demand (or return cached)''' + '''Extract a sentence-level grammar on demand (or return cached) + Threadsafe wrt extractor but NOT decoder. Acquire ctx_name lock + before calling.''' + self.extractor_lock.acquire() self.lazy_ctx(ctx_name) - lock = self.ctx_locks[ctx_name] - lock.acquire() grammar_dict = self.grammar_dict[ctx_name] grammar_file = grammar_dict.get(sentence, None) # Cache hit if grammar_file: logging.info('Grammar cache hit: {}'.format(grammar_file)) - lock.release() + self.extractor_lock.release() return grammar_file # Extract and cache (fid, grammar_file) = tempfile.mkstemp(dir=self.decoders[ctx_name].tmp, prefix='grammar.') os.close(fid) with open(grammar_file, 'w') as output: - self.extractor_lock.acquire() for rule in self.extractor.grammar(sentence, ctx_name): output.write('{}\n'.format(str(rule))) - self.extractor_lock.release() grammar_files = self.grammar_files[ctx_name] if len(grammar_files) == self.cache_size: rm_sent = grammar_files.popleft() @@ -196,23 +200,25 @@ class RealtimeTranslator: os.remove(rm_grammar) grammar_files.append(sentence) grammar_dict[sentence] = grammar_file - lock.release() + self.extractor_lock.release() return grammar_file def decode(self, sentence, ctx_name=None): - '''Decode a sentence (inc extracting a grammar if needed)''' + '''Decode a sentence (inc extracting a grammar if needed) + Threadsafe, FIFO''' + lock = self.ctx_locks[ctx_name] + lock.acquire() self.lazy_ctx(ctx_name) + logging.info('DECODE: {}'.format(sentence)) # Empty in, empty out if sentence.strip() == '': + lock.release() return '' if self.norm: sentence = self.tokenize(sentence) logging.info('Normalized input: {}'.format(sentence)) - # grammar method is threadsafe grammar_file = self.grammar(sentence, ctx_name) decoder = self.decoders[ctx_name] - lock = self.ctx_locks[ctx_name] - lock.acquire() start_time = time.time() hyp = decoder.decoder.decode(sentence, grammar_file) stop_time = time.time() @@ -220,10 +226,10 @@ class RealtimeTranslator: # Empty reference: HPYPLM does not learn prior to next translation decoder.ref_fifo.write('\n') decoder.ref_fifo.flush() - lock.release() if self.norm: logging.info('Normalized translation: {}'.format(hyp)) hyp = self.detokenize(hyp) + lock.release() return hyp def tokenize(self, line): @@ -242,29 +248,32 @@ class RealtimeTranslator: # TODO def command_line(self, line, ctx_name=None): - args = [f.strip() for f in line.split('|||')] - try: + args = [f.strip() for f in line.split('|||')] + #try: if len(args) == 2 and not args[1]: self.commands[args[0]](ctx_name) else: self.commands[args[0]](*args[1:], ctx_name=ctx_name) - except: - logging.info('Command error: {}'.format(' ||| '.join(args))) + #except: + # logging.info('Command error: {}'.format(' ||| '.join(args))) def learn(self, source, target, ctx_name=None): + '''Learn from training instance (inc extracting grammar if needed) + Threadsafe, FIFO''' + lock = self.ctx_locks[ctx_name] + lock.acquire() self.lazy_ctx(ctx_name) + logging.info('LEARN: {}'.format(source)) if '' in (source.strip(), target.strip()): logging.info('Error empty source or target: {} ||| {}'.format(source, target)) + lock.release() return if self.norm: source = self.tokenize(source) target = self.tokenize(target) - # Align instance (threadsafe) + # Align instance alignment = self.aligner.align(source, target) - # grammar method is threadsafe grammar_file = self.grammar(source, ctx_name) - lock = self.ctx_locks[ctx_name] - lock.acquire() # MIRA update before adding data to grammar extractor decoder = self.decoders[ctx_name] mira_log = decoder.decoder.update(source, grammar_file, target) diff --git a/realtime/rt/util.py b/realtime/rt/util.py index 05dcae96..52767dac 100644 --- a/realtime/rt/util.py +++ b/realtime/rt/util.py @@ -15,22 +15,30 @@ SA_INI_FILES = set(( )) class FIFOLock: + '''Lock that preserves FIFO order of blocking threads''' def __init__(self): self.q = Queue.Queue() self.i = 0 + self.lock = threading.Lock() def acquire(self): + self.lock.acquire() self.i += 1 if self.i > 1: event = threading.Event() self.q.put(event) + self.lock.release() event.wait() + return + self.lock.release() def release(self): + self.lock.acquire() self.i -= 1 if self.i > 0: self.q.get().set() + self.lock.release() def cdec_ini_for_config(config): # This is a list of (k, v), not a ConfigObj or dict -- cgit v1.2.3 From 51a83643d95ab0d7add9dd66b0b38044db10a797 Mon Sep 17 00:00:00 2001 From: Michael Denkowski Date: Tue, 1 Oct 2013 14:08:05 -0700 Subject: Better logging, save/load to default context --- realtime/realtime.py | 5 +++++ realtime/rt/aligner.py | 8 +++++--- realtime/rt/decoder.py | 6 ++++-- realtime/rt/rt.py | 50 ++++++++++++++++++++++++++------------------------ 4 files changed, 40 insertions(+), 29 deletions(-) (limited to 'realtime/rt/aligner.py') diff --git a/realtime/realtime.py b/realtime/realtime.py index be02d486..6ee785f8 100755 --- a/realtime/realtime.py +++ b/realtime/realtime.py @@ -57,6 +57,7 @@ def main(): parser = Parser(description='Real-time adaptive translation with cdec. (See README.md)') parser.add_argument('-c', '--config', required=True, help='Config directory') + parser.add_argument('-s', '--state', help='Load state file to default context (saved incremental data)') parser.add_argument('-n', '--normalize', help='Normalize text (tokenize, translate, detokenize)', action='store_true') parser.add_argument('-T', '--temp', help='Temp directory (default /tmp)', default='/tmp') parser.add_argument('-a', '--cache', help='Grammar cache size (default 5)', default='5') @@ -74,6 +75,10 @@ def main(): debug(translator, args.debug_test) return + # Load state if given + if args.state: + rtd.load_state(state) + # Read lines and commands while True: line = sys.stdin.readline() diff --git a/realtime/rt/aligner.py b/realtime/rt/aligner.py index def3fcb5..bcc1ef87 100644 --- a/realtime/rt/aligner.py +++ b/realtime/rt/aligner.py @@ -6,6 +6,8 @@ import threading import util +logger = logging.getLogger('rt.aligner') + class ForceAligner: def __init__(self, fwd_params, fwd_err, rev_params, rev_err): @@ -21,13 +23,13 @@ class ForceAligner: rev_cmd = [fast_align, '-i', '-', '-d', '-T', rev_T, '-m', rev_m, '-f', rev_params, '-r'] tools_cmd = [atools, '-i', '-', '-j', '-', '-c', 'grow-diag-final-and'] - logging.info('Executing: {}'.format(' '.join(fwd_cmd))) + logger.info('Executing: {}'.format(' '.join(fwd_cmd))) self.fwd_align = util.popen_io(fwd_cmd) - logging.info('Executing: {}'.format(' '.join(rev_cmd))) + logger.info('Executing: {}'.format(' '.join(rev_cmd))) self.rev_align = util.popen_io(rev_cmd) - logging.info('Executing: {}'.format(' '.join(tools_cmd))) + logger.info('Executing: {}'.format(' '.join(tools_cmd))) self.tools = util.popen_io(tools_cmd) # Used to guarantee thread safety diff --git a/realtime/rt/decoder.py b/realtime/rt/decoder.py index 1cee4610..e6e7489d 100644 --- a/realtime/rt/decoder.py +++ b/realtime/rt/decoder.py @@ -5,6 +5,8 @@ import threading import util +logger = logging.getLogger('rt.decoder') + class Decoder: def close(self, force=False): @@ -29,7 +31,7 @@ class CdecDecoder(Decoder): cdec_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) decoder = os.path.join(cdec_root, 'decoder', 'cdec') decoder_cmd = [decoder, '-c', config, '-w', weights] - logging.info('Executing: {}'.format(' '.join(decoder_cmd))) + logger.info('Executing: {}'.format(' '.join(decoder_cmd))) self.decoder = util.popen_io(decoder_cmd) self.lock = util.FIFOLock() @@ -40,7 +42,7 @@ class MIRADecoder(Decoder): mira = os.path.join(cdec_root, 'training', 'mira', 'kbest_cut_mira') # optimizer=2 step=0.001 best=500, k=500, uniq, stream mira_cmd = [mira, '-c', config, '-w', weights, '-o', '2', '-C', '0.001', '-b', '500', '-k', '500', '-u', '-t'] - logging.info('Executing: {}'.format(' '.join(mira_cmd))) + logger.info('Executing: {}'.format(' '.join(mira_cmd))) self.decoder = util.popen_io(mira_cmd) self.lock = util.FIFOLock() diff --git a/realtime/rt/rt.py b/realtime/rt/rt.py index 43cc43b4..db831712 100644 --- a/realtime/rt/rt.py +++ b/realtime/rt/rt.py @@ -19,6 +19,8 @@ import util # Dummy input token that is unlikely to appear in normalized data (but no fatal errors if it does) LIKELY_OOV = '(OOV)' +logger = logging.getLogger('rt') + class RealtimeDecoder: '''Do not use directly unless you know what you're doing. Use RealtimeTranslator.''' @@ -48,7 +50,7 @@ class RealtimeDecoder: self.decoder = decoder.MIRADecoder(decoder_config_file, decoder_weights) def close(self, force=False): - logging.info('Closing decoder and removing {}'.format(self.tmp)) + logger.info('Closing decoder and removing {}'.format(self.tmp)) self.decoder.close(force) self.ref_fifo.close() shutil.rmtree(self.tmp) @@ -75,7 +77,7 @@ class RealtimeTranslator: self.config = configdir # Temporary work dir self.tmp = tempfile.mkdtemp(dir=tmpdir, prefix='realtime.') - logging.info('Using temp dir {}'.format(self.tmp)) + logger.info('Using temp dir {}'.format(self.tmp)) # Normalization self.norm = norm @@ -127,10 +129,10 @@ class RealtimeTranslator: def close(self, force=False): '''Cleanup''' if force: - logging.info('Forced shutdown: stopping immediately') + logger.info('Forced shutdown: stopping immediately') for ctx_name in list(self.ctx_names): self.drop_ctx(ctx_name, force) - logging.info('Closing processes') + logger.info('Closing processes') self.aligner.close(force) if self.norm: if not force: @@ -141,7 +143,7 @@ class RealtimeTranslator: if not force: self.tokenizer_lock.release() self.detokenizer_lock.release() - logging.info('Deleting {}'.format(self.tmp)) + logger.info('Deleting {}'.format(self.tmp)) shutil.rmtree(self.tmp) def lazy_ctx(self, ctx_name): @@ -149,7 +151,7 @@ class RealtimeTranslator: NOT threadsafe, acquire ctx_name lock before calling.''' if ctx_name in self.ctx_names: return - logging.info('New context: {}'.format(ctx_name)) + logger.info('New context: {}'.format(ctx_name)) self.ctx_names.add(ctx_name) self.ctx_data[ctx_name] = [] self.grammar_files[ctx_name] = collections.deque() @@ -164,11 +166,11 @@ class RealtimeTranslator: if not force: lock.acquire() if ctx_name not in self.ctx_names: - logging.info('No context found, no action: {}'.format(ctx_name)) + logger.info('No context found, no action: {}'.format(ctx_name)) if not force: lock.release() return - logging.info('Dropping context: {}'.format(ctx_name)) + logger.info('Dropping context: {}'.format(ctx_name)) self.ctx_names.remove(ctx_name) self.ctx_data.pop(ctx_name) self.extractor.drop_ctx(ctx_name) @@ -193,7 +195,7 @@ class RealtimeTranslator: grammar_file = grammar_dict.get(sentence, None) # Cache hit if grammar_file: - logging.info('Grammar cache hit: {}'.format(grammar_file)) + logger.info('Grammar cache hit: {}'.format(grammar_file)) self.extractor_lock.release() return grammar_file # Extract and cache @@ -226,18 +228,18 @@ class RealtimeTranslator: return '' if self.norm: sentence = self.tokenize(sentence) - logging.info('Normalized input: {}'.format(sentence)) + logger.info('Normalized input: {}'.format(sentence)) grammar_file = self.grammar(sentence, ctx_name) decoder = self.decoders[ctx_name] start_time = time.time() hyp = decoder.decoder.decode(sentence, grammar_file) stop_time = time.time() - logging.info('Translation time: {} seconds'.format(stop_time - start_time)) + logger.info('Translation time: {} seconds'.format(stop_time - start_time)) # Empty reference: HPYPLM does not learn prior to next translation decoder.ref_fifo.write('\n') decoder.ref_fifo.flush() if self.norm: - logging.info('Normalized translation: {}'.format(hyp)) + logger.info('Normalized translation: {}'.format(hyp)) hyp = self.detokenize(hyp) lock.release() return hyp @@ -271,9 +273,9 @@ class RealtimeTranslator: cmd_name = cmd_name[0] (command, nargs) = self.COMMANDS.get(cmd_name, (None, None)) if command and len(args[1:]) in nargs: - logging.info('{} ({}) ||| {}'.format(cmd_name, ctx_name, ' ||| '.join(args[1:]))) + logger.info('{} ({}) ||| {}'.format(cmd_name, ctx_name, ' ||| '.join(args[1:]))) return command(*args[1:], ctx_name=ctx_name) - logging.info('ERROR: command: {}'.format(' ||| '.join(args))) + logger.info('ERROR: command: {}'.format(' ||| '.join(args))) def learn(self, source, target, ctx_name=None): '''Learn from training instance (inc extracting grammar if needed) @@ -282,7 +284,7 @@ class RealtimeTranslator: lock.acquire() self.lazy_ctx(ctx_name) if '' in (source.strip(), target.strip()): - logging.info('ERROR: empty source or target: {} ||| {}'.format(source, target)) + logger.info('ERROR: empty source or target: {} ||| {}'.format(source, target)) lock.release() return if self.norm: @@ -294,15 +296,15 @@ class RealtimeTranslator: # MIRA update before adding data to grammar extractor decoder = self.decoders[ctx_name] mira_log = decoder.decoder.update(source, grammar_file, target) - logging.info('MIRA: {}'.format(mira_log)) + logger.info('MIRA: {}'.format(mira_log)) # Add to HPYPLM by writing to fifo (read on next translation) - logging.info('Adding to HPYPLM: {}'.format(target)) + logger.info('Adding to HPYPLM: {}'.format(target)) decoder.ref_fifo.write('{}\n'.format(target)) decoder.ref_fifo.flush() # Store incremental data for save/load self.ctx_data[ctx_name].append((source, target, alignment)) # Add aligned sentence pair to grammar extractor - logging.info('Adding to bitext: {} ||| {} ||| {}'.format(source, target, alignment)) + logger.info('Adding to bitext: {} ||| {} ||| {}'.format(source, target, alignment)) self.extractor.add_instance(source, target, alignment, ctx_name) # Clear (old) cached grammar rm_grammar = self.grammar_dict[ctx_name].pop(source) @@ -315,7 +317,7 @@ class RealtimeTranslator: self.lazy_ctx(ctx_name) ctx_data = self.ctx_data[ctx_name] out = open(filename, 'w') if filename else sys.stdout - logging.info('Saving state for context ({}) with {} sentences'.format(ctx_name, len(ctx_data))) + logger.info('Saving state for context ({}) with {} sentences'.format(ctx_name, len(ctx_data))) out.write('{}\n'.format(self.decoders[ctx_name].decoder.get_weights())) for (source, target, alignment) in ctx_data: out.write('{} ||| {} ||| {}\n'.format(source, target, alignment)) @@ -333,8 +335,8 @@ class RealtimeTranslator: input = open(filename) if filename else sys.stdin # Non-initial load error if ctx_data: - logging.info('ERROR: Incremental data has already been added to context ({})'.format(ctx_name)) - logging.info(' State can only be loaded to a new context.') + logger.info('ERROR: Incremental data has already been added to context ({})'.format(ctx_name)) + logger.info(' State can only be loaded to a new context.') lock.release() return # Many things can go wrong if bad state data is given @@ -343,7 +345,7 @@ class RealtimeTranslator: line = input.readline().strip() # Throws exception if bad line decoder.decoder.set_weights(line) - logging.info('Loading state...') + logger.info('Loading state...') start_time = time.time() # Lines source ||| target ||| alignment while True: @@ -362,12 +364,12 @@ class RealtimeTranslator: decoder.ref_fifo.write('{}\n'.format(target)) decoder.ref_fifo.flush() stop_time = time.time() - logging.info('Loaded state for context ({}) with {} sentences in {} seconds'.format(ctx_name, len(ctx_data), stop_time - start_time)) + logger.info('Loaded state for context ({}) with {} sentences in {} seconds'.format(ctx_name, len(ctx_data), stop_time - start_time)) lock.release() # Recover from bad load attempt by restarting context. # Guaranteed not to cause data loss since only a new context can load state. except: - logging.info('ERROR: could not load state, restarting context ({})'.format(ctx_name)) + logger.info('ERROR: could not load state, restarting context ({})'.format(ctx_name)) # ctx_name is already owned and needs to be restarted before other blocking threads use self.drop_ctx(ctx_name, force=True) self.lazy_ctx(ctx_name) -- cgit v1.2.3 From af80da3fecbd56554314b1135872272cc7d3793a Mon Sep 17 00:00:00 2001 From: mjdenkowski Date: Fri, 18 Oct 2013 16:18:04 -0400 Subject: wait() to avoid zombies --- realtime/rt/aligner.py | 3 +++ realtime/rt/decoder.py | 5 +++-- realtime/rt/rt.py | 2 ++ 3 files changed, 8 insertions(+), 2 deletions(-) (limited to 'realtime/rt/aligner.py') diff --git a/realtime/rt/aligner.py b/realtime/rt/aligner.py index bcc1ef87..c34805eb 100644 --- a/realtime/rt/aligner.py +++ b/realtime/rt/aligner.py @@ -57,8 +57,11 @@ class ForceAligner: if not force: self.lock.acquire() self.fwd_align.stdin.close() + self.fwd_align.wait() self.rev_align.stdin.close() + self.rev_align.wait() self.tools.stdin.close() + self.tools.wait() if not force: self.lock.release() diff --git a/realtime/rt/decoder.py b/realtime/rt/decoder.py index e6e7489d..ed45c248 100644 --- a/realtime/rt/decoder.py +++ b/realtime/rt/decoder.py @@ -13,13 +13,14 @@ class Decoder: if not force: self.lock.acquire() self.decoder.stdin.close() + self.decoder.wait() if not force: self.lock.release() def decode(self, sentence, grammar=None): '''Threadsafe, FIFO''' - input = '{s}\n'.format(s=sentence, g=grammar) if grammar else '{}\n'.format(sentence) self.lock.acquire() + input = '{s}\n'.format(s=sentence, g=grammar) if grammar else '{}\n'.format(sentence) self.decoder.stdin.write(input) hyp = self.decoder.stdout.readline().strip() self.lock.release() @@ -71,8 +72,8 @@ class MIRADecoder(Decoder): def update(self, sentence, grammar, reference): '''Threadsafe, FIFO''' - input = 'LEARN ||| {s} ||| {r}\n'.format(s=sentence, g=grammar, r=reference) self.lock.acquire() + input = 'LEARN ||| {s} ||| {r}\n'.format(s=sentence, g=grammar, r=reference) self.decoder.stdin.write(input) log = self.decoder.stdout.readline().strip() self.lock.release() diff --git a/realtime/rt/rt.py b/realtime/rt/rt.py index 7cc5bc10..d1d01ad8 100644 --- a/realtime/rt/rt.py +++ b/realtime/rt/rt.py @@ -140,7 +140,9 @@ class RealtimeTranslator: self.tokenizer_lock.acquire() self.detokenizer_lock.acquire() self.tokenizer.stdin.close() + self.tokenizer.wait() self.detokenizer.stdin.close() + self.detokenizer.wait() if not force: self.tokenizer_lock.release() self.detokenizer_lock.release() -- cgit v1.2.3 From 074fa88375967adababc632ea763e9dea389831e Mon Sep 17 00:00:00 2001 From: mjdenkowski Date: Wed, 30 Oct 2013 23:37:39 -0400 Subject: Specify heuristic for force alignment --- realtime/rt/aligner.py | 4 ++-- word-aligner/force_align.py | 6 ++++-- 2 files changed, 6 insertions(+), 4 deletions(-) (limited to 'realtime/rt/aligner.py') diff --git a/realtime/rt/aligner.py b/realtime/rt/aligner.py index c34805eb..e1782496 100644 --- a/realtime/rt/aligner.py +++ b/realtime/rt/aligner.py @@ -10,7 +10,7 @@ logger = logging.getLogger('rt.aligner') class ForceAligner: - def __init__(self, fwd_params, fwd_err, rev_params, rev_err): + def __init__(self, fwd_params, fwd_err, rev_params, rev_err, heuristic='grow-diag-final-and'): cdec_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) fast_align = os.path.join(cdec_root, 'word-aligner', 'fast_align') @@ -21,7 +21,7 @@ class ForceAligner: fwd_cmd = [fast_align, '-i', '-', '-d', '-T', fwd_T, '-m', fwd_m, '-f', fwd_params] rev_cmd = [fast_align, '-i', '-', '-d', '-T', rev_T, '-m', rev_m, '-f', rev_params, '-r'] - tools_cmd = [atools, '-i', '-', '-j', '-', '-c', 'grow-diag-final-and'] + tools_cmd = [atools, '-i', '-', '-j', '-', '-c', heuristic] logger.info('Executing: {}'.format(' '.join(fwd_cmd))) self.fwd_align = util.popen_io(fwd_cmd) diff --git a/word-aligner/force_align.py b/word-aligner/force_align.py index 8b6ca224..b03d446e 100755 --- a/word-aligner/force_align.py +++ b/word-aligner/force_align.py @@ -5,13 +5,15 @@ import sys def main(): - if len(sys.argv[1:]) != 4: + if len(sys.argv[1:]) < 4: sys.stderr.write('run:\n') sys.stderr.write(' fast_align -i corpus.f-e -d -v -o -p fwd_params >fwd_align 2>fwd_err\n') sys.stderr.write(' fast_align -i corpus.f-e -r -d -v -o -p rev_params >rev_align 2>rev_err\n') sys.stderr.write('\n') sys.stderr.write('then run:\n') - sys.stderr.write(' {} fwd_params fwd_err rev_params rev_err out.f-e.gdfa\n'.format(sys.argv[0])) + sys.stderr.write(' {} fwd_params fwd_err rev_params rev_err [heuristic] out.f-e.gdfa\n'.format(sys.argv[0])) + sys.stderr.write('\n') + sys.stderr.write('where heuristic is one of: (intersect union grow-diag grow-diag-final grow-diag-final-and) default=grow-diag-final-and\n') sys.exit(2) # Hook into realtime -- cgit v1.2.3