#!/usr/bin/env python import argparse import collections import logging import os import shutil import sys import subprocess import tempfile import threading import time import cdec import aligner import decoder import util # Dummy input token that is unlikely to appear in normalized data (but no fatal errors if it does) LIKELY_OOV = '(OOV)' class RealtimeDecoder: '''Do not use directly unless you know what you're doing. Use RealtimeTranslator.''' def __init__(self, configdir, tmpdir): cdec_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) self.tmp = tmpdir os.mkdir(self.tmp) # HPYPLM reference stream ref_fifo_file = os.path.join(self.tmp, 'ref.fifo') os.mkfifo(ref_fifo_file) self.ref_fifo = open(ref_fifo_file, 'w+') # Start with empty line (do not learn prior to first input) self.ref_fifo.write('\n') self.ref_fifo.flush() # Decoder decoder_config = [[f.strip() for f in line.split('=')] for line in open(os.path.join(configdir, 'cdec.ini'))] util.cdec_ini_for_realtime(decoder_config, os.path.abspath(configdir), ref_fifo_file) decoder_config_file = os.path.join(self.tmp, 'cdec.ini') with open(decoder_config_file, 'w') as output: for (k, v) in decoder_config: output.write('{}={}\n'.format(k, v)) decoder_weights = os.path.join(configdir, 'weights.final') self.decoder = decoder.MIRADecoder(decoder_config_file, decoder_weights) def close(self, force=False): logging.info('Closing decoder and removing {}'.format(self.tmp)) self.decoder.close(force) self.ref_fifo.close() shutil.rmtree(self.tmp) class RealtimeTranslator: '''Main entry point into API: serves translations to any number of concurrent users''' def __init__(self, configdir, tmpdir='/tmp', cache_size=5, norm=False, state=None): # name -> (method, set of possible nargs) self.COMMANDS = { 'TR': (self.translate, set((1,))), 'LEARN': (self.learn, set((2,))), 'SAVE': (self.save_state, set((0, 1))), 'LOAD': (self.load_state, set((0, 1))), } cdec_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) ### Single instance for all contexts self.config = configdir # Temporary work dir self.tmp = tempfile.mkdtemp(dir=tmpdir, prefix='realtime.') logging.info('Using temp dir {}'.format(self.tmp)) # Normalization self.norm = norm if self.norm: self.tokenizer = util.popen_io([os.path.join(cdec_root, 'corpus', 'tokenize-anything.sh'), '-u']) self.tokenizer_lock = util.FIFOLock() self.detokenizer = util.popen_io([os.path.join(cdec_root, 'corpus', 'untok.pl')]) self.detokenizer_lock = util.FIFOLock() # Word aligner fwd_params = os.path.join(configdir, 'a.fwd_params') fwd_err = os.path.join(configdir, 'a.fwd_err') rev_params = os.path.join(configdir, 'a.rev_params') rev_err = os.path.join(configdir, 'a.rev_err') self.aligner = aligner.ForceAligner(fwd_params, fwd_err, rev_params, rev_err) # Grammar extractor sa_config = cdec.configobj.ConfigObj(os.path.join(configdir, 'sa.ini'), unrepr=True) sa_config.filename = os.path.join(self.tmp, 'sa.ini') util.sa_ini_for_realtime(sa_config, os.path.abspath(configdir)) sa_config.write() self.extractor = cdec.sa.GrammarExtractor(sa_config.filename, online=True) self.cache_size = cache_size ### One instance per context self.ctx_names = set() # All context-dependent operations are atomic self.ctx_locks = collections.defaultdict(util.FIFOLock) # ctx -> list of (source, target, alignment) self.ctx_data = {} # Grammar extractor is not threadsafe self.extractor_lock = util.FIFOLock() # ctx -> deque of file self.grammar_files = {} # ctx -> dict of {sentence: file} self.grammar_dict = {} self.decoders = {} # TODO: state # Load state if given if state: with open(state) as input: self.load_state(input) def __enter__(self): return self def __exit__(self, ex_type, ex_value, ex_traceback): self.close(ex_type is KeyboardInterrupt) def close(self, force=False): '''Cleanup''' if force: logging.info('Forced shutdown: stopping immediately') for ctx_name in list(self.ctx_names): self.drop_ctx(ctx_name, force) logging.info('Closing processes') self.aligner.close(force) if self.norm: if not force: self.tokenizer_lock.acquire() self.detokenizer_lock.acquire() self.tokenizer.stdin.close() self.detokenizer.stdin.close() if not force: self.tokenizer_lock.release() self.detokenizer_lock.release() logging.info('Deleting {}'.format(self.tmp)) shutil.rmtree(self.tmp) def lazy_ctx(self, ctx_name): '''Initialize a context (inc starting a new decoder) if needed. NOT threadsafe, acquire ctx_name lock before calling.''' if ctx_name in self.ctx_names: return logging.info('New context: {}'.format(ctx_name)) self.ctx_names.add(ctx_name) self.ctx_data[ctx_name] = [] self.grammar_files[ctx_name] = collections.deque() self.grammar_dict[ctx_name] = {} tmpdir = os.path.join(self.tmp, 'decoder.{}'.format(ctx_name)) self.decoders[ctx_name] = RealtimeDecoder(self.config, tmpdir) def drop_ctx(self, ctx_name=None, force=False): '''Delete a context (inc stopping the decoder) Threadsafe and FIFO unless forced.''' lock = self.ctx_locks[ctx_name] if not force: lock.acquire() logging.info('Dropping context: {}'.format(ctx_name)) self.ctx_names.remove(ctx_name) self.ctx_data.pop(ctx_name) self.extractor.drop_ctx(ctx_name) self.grammar_files.pop(ctx_name) self.grammar_dict.pop(ctx_name) self.decoders.pop(ctx_name).close(force) self.ctx_locks.pop(ctx_name) if not force: lock.release() def grammar(self, sentence, ctx_name=None): '''Extract a sentence-level grammar on demand (or return cached) Threadsafe wrt extractor but NOT decoder. Acquire ctx_name lock before calling.''' self.extractor_lock.acquire() self.lazy_ctx(ctx_name) grammar_dict = self.grammar_dict[ctx_name] grammar_file = grammar_dict.get(sentence, None) # Cache hit if grammar_file: logging.info('Grammar cache hit: {}'.format(grammar_file)) self.extractor_lock.release() return grammar_file # Extract and cache (fid, grammar_file) = tempfile.mkstemp(dir=self.decoders[ctx_name].tmp, prefix='grammar.') os.close(fid) with open(grammar_file, 'w') as output: for rule in self.extractor.grammar(sentence, ctx_name): output.write('{}\n'.format(str(rule))) grammar_files = self.grammar_files[ctx_name] if len(grammar_files) == self.cache_size: rm_sent = grammar_files.popleft() # If not already removed by learn method if rm_sent in grammar_dict: rm_grammar = grammar_dict.pop(rm_sent) os.remove(rm_grammar) grammar_files.append(sentence) grammar_dict[sentence] = grammar_file self.extractor_lock.release() return grammar_file def translate(self, sentence, ctx_name=None): '''Decode a sentence (inc extracting a grammar if needed) Threadsafe, FIFO''' lock = self.ctx_locks[ctx_name] lock.acquire() self.lazy_ctx(ctx_name) # Empty in, empty out if sentence.strip() == '': lock.release() return '' if self.norm: sentence = self.tokenize(sentence) logging.info('Normalized input: {}'.format(sentence)) grammar_file = self.grammar(sentence, ctx_name) decoder = self.decoders[ctx_name] start_time = time.time() hyp = decoder.decoder.decode(sentence, grammar_file) stop_time = time.time() logging.info('Translation time: {} seconds'.format(stop_time - start_time)) # Empty reference: HPYPLM does not learn prior to next translation decoder.ref_fifo.write('\n') decoder.ref_fifo.flush() if self.norm: logging.info('Normalized translation: {}'.format(hyp)) hyp = self.detokenize(hyp) lock.release() return hyp def tokenize(self, line): self.tokenizer_lock.acquire() self.tokenizer.stdin.write('{}\n'.format(line)) tok_line = self.tokenizer.stdout.readline().strip() self.tokenizer_lock.release() return tok_line def detokenize(self, line): self.detokenizer_lock.acquire() self.detokenizer.stdin.write('{}\n'.format(line)) detok_line = self.detokenizer.stdout.readline().strip() self.detokenizer_lock.release() return detok_line def command_line(self, line, ctx_name=None): args = [f.strip() for f in line.split('|||')] (command, nargs) = self.COMMANDS[args[0]] # ctx_name provided if len(args[1:]) + 1 in nargs: logging.info('Context {}: {} ||| {}'.format(args[1], args[0], ' ||| '.join(args[2:]))) return command(*args[2:], ctx_name=args[1]) # No ctx_name, use default or passed elif len(args[1:]) in nargs: logging.info('Context {}: {} ||| {}'.format(ctx_name, args[0], ' ||| '.join(args[1:]))) return command(*args[1:], ctx_name=ctx_name) # nargs doesn't match else: logging.info('Command error: {}'.format(' ||| '.join(args))) def learn(self, source, target, ctx_name=None): '''Learn from training instance (inc extracting grammar if needed) Threadsafe, FIFO''' lock = self.ctx_locks[ctx_name] lock.acquire() self.lazy_ctx(ctx_name) if '' in (source.strip(), target.strip()): logging.info('Error empty source or target: {} ||| {}'.format(source, target)) lock.release() return if self.norm: source = self.tokenize(source) target = self.tokenize(target) # Align instance alignment = self.aligner.align(source, target) grammar_file = self.grammar(source, ctx_name) # MIRA update before adding data to grammar extractor decoder = self.decoders[ctx_name] mira_log = decoder.decoder.update(source, grammar_file, target) logging.info('MIRA: {}'.format(mira_log)) # Add to HPYPLM by writing to fifo (read on next translation) logging.info('Adding to HPYPLM: {}'.format(target)) decoder.ref_fifo.write('{}\n'.format(target)) decoder.ref_fifo.flush() # Store incremental data for save/load self.ctx_data[ctx_name].append((source, target, alignment)) # Add aligned sentence pair to grammar extractor logging.info('Adding to bitext: {} ||| {} ||| {}'.format(source, target, alignment)) self.extractor.add_instance(source, target, alignment, ctx_name) # Clear (old) cached grammar rm_grammar = self.grammar_dict[ctx_name].pop(source) os.remove(rm_grammar) lock.release() def save_state(self, filename=None, ctx_name=None): self.lazy_ctx(ctx_name) out = open(filename, 'w') if filename else sys.stdout lock = self.ctx_locks[ctx_name] lock.acquire() ctx_data = self.ctx_data[ctx_name] logging.info('Saving state with {} sentences'.format(len(self.ctx_data))) out.write('{}\n'.format(self.decoders[ctx_name].decoder.get_weights())) for (source, target, alignment) in ctx_data: out.write('{} ||| {} ||| {}\n'.format(source, target, alignment)) lock.release() out.write('EOF\n') if filename: out.close() def load_state(self, input=sys.stdin, ctx_name=None): self.lazy_ctx(ctx_name) lock = self.ctx_locks[ctx_name] lock.acquire() ctx_data = self.ctx_data[ctx_name] decoder = self.decoders[ctx_name] # Non-initial load error if ctx_data: logging.info('Error: Incremental data has already been added to decoder.') logging.info(' State can only be loaded by a freshly started decoder.') return # MIRA weights line = input.readline().strip() decoder.decoder.set_weights(line) logging.info('Loading state...') start_time = time.time() # Lines source ||| target ||| alignment while True: line = input.readline().strip() if line == 'EOF': break (source, target, alignment) = line.split(' ||| ') ctx_data.append((source, target, alignment)) # Extractor self.extractor.add_instance(source, target, alignment, ctx_name) # HPYPLM hyp = decoder.decoder.decode(LIKELY_OOV) self.ref_fifo.write('{}\n'.format(target)) self.ref_fifo.flush() stop_time = time.time() logging.info('Loaded state with {} sentences in {} seconds'.format(len(ctx_data), stop_time - start_time)) lock.release()