diff options
Diffstat (limited to 'realtime')
-rwxr-xr-x | realtime/mkconfig.py | 25 | ||||
-rw-r--r-- | realtime/rt/decoder.py | 6 | ||||
-rw-r--r-- | realtime/rt/rt.py | 118 | ||||
-rw-r--r-- | realtime/rt/util.py | 1 |
4 files changed, 115 insertions, 35 deletions
diff --git a/realtime/mkconfig.py b/realtime/mkconfig.py index f193c57a..f9938051 100755 --- a/realtime/mkconfig.py +++ b/realtime/mkconfig.py @@ -12,11 +12,14 @@ from cdec.configobj import ConfigObj def main(): - if len(sys.argv[1:]) != 12: - sys.stderr.write('usage: {} a.fwd_params a.fwd_err a.rev_params a.rev_err sa sa.ini mono.klm libcdec_ff_hpyplm.so corpus.hpyplm cdec.ini weights.final output.d\n'.format(sys.argv[0])) + if len(sys.argv[1:]) not in (10, 12): + sys.stderr.write('usage: {} a.fwd_params a.fwd_err a.rev_params a.rev_err sa sa.ini mono.klm cdec.ini weights.final output.d [libcdec_ff_hpyplm.so corpus.hpyplm]\n'.format(sys.argv[0])) sys.exit(2) - (a_fwd_params, a_fwd_err, a_rev_params, a_rev_err, sa, sa_ini, mono_klm, libcdec_ff_hpyplm_so, corpus_hpyplm, cdec_ini, weights_final, output_d) = sys.argv[1:] + (a_fwd_params, a_fwd_err, a_rev_params, a_rev_err, sa, sa_ini, mono_klm, cdec_ini, weights_final, output_d) = sys.argv[1:11] + + # Optional + (libcdec_ff_hpyplm_so, corpus_hpyplm) = sys.argv[11:13] if len(sys.argv[1:]) == 12 else (None, None) if os.path.exists(output_d): sys.stderr.write('Directory {} exists, exiting.\n'.format(output_d)) @@ -40,8 +43,10 @@ def main(): # language models shutil.copy(mono_klm, os.path.join(output_d, 'mono.klm')) - shutil.copy(libcdec_ff_hpyplm_so, os.path.join(output_d, 'libcdec_ff_hpyplm.so')) - shutil.copy(corpus_hpyplm, os.path.join(output_d, 'corpus.hpyplm')) + if libcdec_ff_hpyplm_so: + shutil.copy(libcdec_ff_hpyplm_so, os.path.join(output_d, 'libcdec_ff_hpyplm.so')) + if corpus_hpyplm: + shutil.copy(corpus_hpyplm, os.path.join(output_d, 'corpus.hpyplm')) # decoder config config = [[f.strip() for f in line.split('=')] for line in open(cdec_ini)] @@ -52,6 +57,16 @@ def main(): # weights shutil.copy(weights_final, os.path.join(output_d, 'weights.final')) + + # other options + rt_ini = os.path.join(output_d, 'rt.ini') + with open(rt_ini, 'w') as out: + if libcdec_ff_hpyplm_so and corpus_hpyplm: + out.write('hpyplm=true\n') + else: + out.write('hpyplm=false\n') + out.write('metric=ibm_bleu\n') + sys.stderr.write('IMPORTANT: add any additional options such as metric=meteor to {}\n'.format(rt_ini)) if __name__ == '__main__': main() diff --git a/realtime/rt/decoder.py b/realtime/rt/decoder.py index ed45c248..1bdd3f1f 100644 --- a/realtime/rt/decoder.py +++ b/realtime/rt/decoder.py @@ -38,11 +38,11 @@ class CdecDecoder(Decoder): class MIRADecoder(Decoder): - def __init__(self, config, weights): + def __init__(self, config, weights, metric='ibm_bleu'): cdec_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) mira = os.path.join(cdec_root, 'training', 'mira', 'kbest_cut_mira') - # optimizer=2 step=0.001 best=500, k=500, uniq, stream - mira_cmd = [mira, '-c', config, '-w', weights, '-o', '2', '-C', '0.001', '-b', '500', '-k', '500', '-u', '-t'] + # optimizer=2 step=0.001 best=500, k=500, uniq, stream, metric + mira_cmd = [mira, '-c', config, '-w', weights, '-o', '2', '-C', '0.001', '-b', '500', '-k', '500', '-u', '-t', '-m', metric] logger.info('Executing: {}'.format(' '.join(mira_cmd))) self.decoder = util.popen_io(mira_cmd) self.lock = util.FIFOLock() diff --git a/realtime/rt/rt.py b/realtime/rt/rt.py index c0aec410..70ed0c3c 100644 --- a/realtime/rt/rt.py +++ b/realtime/rt/rt.py @@ -5,6 +5,7 @@ import collections import logging import os import shutil +import signal import StringIO import subprocess import sys @@ -20,40 +21,94 @@ import util # Dummy input token that is unlikely to appear in normalized data (but no fatal errors if it does) LIKELY_OOV = '(OOV)' +# For parsing rt.ini +TRUE = ('true', 'True', 'TRUE') + logger = logging.getLogger('rt') +class ExtractorWrapper: + '''Wrap cdec.sa.GrammarExtractor. Used to keep multiple instances of the extractor from causing Python to segfault. + Do not use directly unless you know what you're doing.''' + + def __init__(self, config): + # Make sure pycdec is on PYTHONPATH + cdec_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + pycdec = os.path.join(cdec_root, 'python') + env = os.environ.copy() + python_path = env.get('PYTHONPATH', '') + if 'cdec/python' not in python_path: + python_path = '{}:{}'.format(python_path, pycdec) if len(python_path) > 0 else pycdec + env['PYTHONPATH'] = python_path + # Start grammar extractor as separate process using stdio + cmd = ['python', '-m', 'cdec.sa.extract', '-o', '-z', '-c', config, '-t'] + logger.info('Executing: {}'.format(' '.join(cmd))) + self.p = subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=env) + util.consume_stream(self.p.stderr) + self.lock = util.FIFOLock() + + def close(self, force=False): + if not force: + self.lock.acquire() + self.p.stdin.close() + self.p.wait() + self.lock.release() + else: + os.kill(self.p.pid, signal.SIGTERM) + + + def drop_ctx(self, ctx_name): + self.lock.acquire() + self.p.stdin.write('{} ||| drop\n'.format(ctx_name)) + self.p.stdout.readline() + self.lock.release() + + def grammar(self, sentence, grammar_file, ctx_name): + self.lock.acquire() + self.p.stdin.write('{} ||| {} ||| {}\n'.format(ctx_name, sentence, grammar_file)) + self.p.stdout.readline() + self.lock.release() + + def add_instance(self, source, target, alignment, ctx_name): + self.lock.acquire() + self.p.stdin.write('{} ||| {} ||| {} ||| {}\n'.format(ctx_name, source, target, alignment)) + self.p.stdout.readline() + self.lock.release() + class RealtimeDecoder: '''Do not use directly unless you know what you're doing. Use RealtimeTranslator.''' - def __init__(self, configdir, tmpdir): - + def __init__(self, configdir, tmpdir, hpyplm=False, metric='ibm_bleu'): + cdec_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) self.tmp = tmpdir os.mkdir(self.tmp) # HPYPLM reference stream - ref_fifo_file = os.path.join(self.tmp, 'ref.fifo') - os.mkfifo(ref_fifo_file) - self.ref_fifo = open(ref_fifo_file, 'w+') - # Start with empty line (do not learn prior to first input) - self.ref_fifo.write('\n') - self.ref_fifo.flush() + self.hpyplm = hpyplm + if self.hpyplm: + ref_fifo_file = os.path.join(self.tmp, 'ref.fifo') + os.mkfifo(ref_fifo_file) + self.ref_fifo = open(ref_fifo_file, 'w+') + # Start with empty line (do not learn prior to first input) + self.ref_fifo.write('\n') + self.ref_fifo.flush() # Decoder decoder_config = [[f.strip() for f in line.split('=')] for line in open(os.path.join(configdir, 'cdec.ini'))] - util.cdec_ini_for_realtime(decoder_config, os.path.abspath(configdir), ref_fifo_file) + util.cdec_ini_for_realtime(decoder_config, os.path.abspath(configdir), ref_fifo_file if self.hpyplm else None) decoder_config_file = os.path.join(self.tmp, 'cdec.ini') with open(decoder_config_file, 'w') as output: for (k, v) in decoder_config: output.write('{}={}\n'.format(k, v)) decoder_weights = os.path.join(configdir, 'weights.final') - self.decoder = decoder.MIRADecoder(decoder_config_file, decoder_weights) + self.decoder = decoder.MIRADecoder(decoder_config_file, decoder_weights, metric=metric) def close(self, force=False): logger.info('Closing decoder and removing {}'.format(self.tmp)) self.decoder.close(force) - self.ref_fifo.close() + if self.hpyplm: + self.ref_fifo.close() shutil.rmtree(self.tmp) class RealtimeTranslator: @@ -73,6 +128,11 @@ class RealtimeTranslator: cdec_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + # rt.ini options + ini = dict(line.strip().split('=') for line in open(os.path.join(configdir, 'rt.ini'))) + self.hpyplm = (ini.get('hpyplm', 'false') in TRUE) + self.metric = ini.get('metric', 'ibm_bleu') + ### Single instance for all contexts self.config = configdir @@ -100,7 +160,7 @@ class RealtimeTranslator: sa_config.filename = os.path.join(self.tmp, 'sa.ini') util.sa_ini_for_realtime(sa_config, os.path.abspath(configdir)) sa_config.write() - self.extractor = cdec.sa.GrammarExtractor(sa_config.filename, online=True) + self.extractor = ExtractorWrapper(sa_config.filename) self.cache_size = cache_size ### One instance per context @@ -131,10 +191,13 @@ class RealtimeTranslator: '''Cleanup''' if force: logger.info('Forced shutdown: stopping immediately') - for ctx_name in list(self.ctx_names): - self.drop_ctx(ctx_name, force) + # Drop contexts before closing processes unless forced + if not force: + for ctx_name in list(self.ctx_names): + self.drop_ctx(ctx_name, force) logger.info('Closing processes') self.aligner.close(force) + self.extractor.close(force) if self.norm: if not force: self.tokenizer_lock.acquire() @@ -160,7 +223,7 @@ class RealtimeTranslator: self.grammar_files[ctx_name] = collections.deque() self.grammar_dict[ctx_name] = {} tmpdir = os.path.join(self.tmp, 'decoder.{}'.format(ctx_name)) - self.decoders[ctx_name] = RealtimeDecoder(self.config, tmpdir) + self.decoders[ctx_name] = RealtimeDecoder(self.config, tmpdir, hpyplm=self.hpyplm, metric=self.metric) def drop_ctx(self, ctx_name=None, force=False): '''Delete a context (inc stopping the decoder) @@ -202,11 +265,9 @@ class RealtimeTranslator: self.extractor_lock.release() return grammar_file # Extract and cache - (fid, grammar_file) = tempfile.mkstemp(dir=self.decoders[ctx_name].tmp, prefix='grammar.') + (fid, grammar_file) = tempfile.mkstemp(dir=self.decoders[ctx_name].tmp, prefix='grammar.', suffix='.gz') os.close(fid) - with open(grammar_file, 'w') as output: - for rule in self.extractor.grammar(sentence, ctx_name): - output.write('{}\n'.format(str(rule))) + self.extractor.grammar(sentence, grammar_file, ctx_name) grammar_files = self.grammar_files[ctx_name] if len(grammar_files) == self.cache_size: rm_sent = grammar_files.popleft() @@ -239,8 +300,9 @@ class RealtimeTranslator: stop_time = time.time() logger.info('({}) Translation time: {} seconds'.format(ctx_name, stop_time - start_time)) # Empty reference: HPYPLM does not learn prior to next translation - decoder.ref_fifo.write('\n') - decoder.ref_fifo.flush() + if self.hpyplm: + decoder.ref_fifo.write('\n') + decoder.ref_fifo.flush() if self.norm: logger.info('({}) Normalized translation: {}'.format(ctx_name, hyp)) hyp = self.detokenize(hyp) @@ -301,9 +363,10 @@ class RealtimeTranslator: mira_log = decoder.decoder.update(source, grammar_file, target) logger.info('({}) MIRA HBF: {}'.format(ctx_name, mira_log)) # Add to HPYPLM by writing to fifo (read on next translation) - logger.info('({}) Adding to HPYPLM: {}'.format(ctx_name, target)) - decoder.ref_fifo.write('{}\n'.format(target)) - decoder.ref_fifo.flush() + if self.hpyplm: + logger.info('({}) Adding to HPYPLM: {}'.format(ctx_name, target)) + decoder.ref_fifo.write('{}\n'.format(target)) + decoder.ref_fifo.flush() # Store incremental data for save/load self.ctx_data[ctx_name].append((source, target, alignment)) # Add aligned sentence pair to grammar extractor @@ -381,9 +444,10 @@ class RealtimeTranslator: # Extractor self.extractor.add_instance(source, target, alignment, ctx_name) # HPYPLM - hyp = decoder.decoder.decode(LIKELY_OOV) - decoder.ref_fifo.write('{}\n'.format(target)) - decoder.ref_fifo.flush() + if self.hpyplm: + hyp = decoder.decoder.decode(LIKELY_OOV) + decoder.ref_fifo.write('{}\n'.format(target)) + decoder.ref_fifo.flush() stop_time = time.time() logger.info('({}) Loaded state with {} sentences in {} seconds'.format(ctx_name, len(ctx_data), stop_time - start_time)) lock.release() diff --git a/realtime/rt/util.py b/realtime/rt/util.py index a7333bbd..8cd41a1e 100644 --- a/realtime/rt/util.py +++ b/realtime/rt/util.py @@ -10,6 +10,7 @@ SA_INI_FILES = set(( 'a_file', 'lex_file', 'precompute_file', + 'bilex_file', )) class FIFOLock: |