From 40eac315f63b018eec10da4124b801869cd788f5 Mon Sep 17 00:00:00 2001 From: Michael Denkowski Date: Wed, 4 Sep 2013 12:27:22 -0700 Subject: Infrastructure for HPYPLM, config file management. --- realtime/rt/aligner.py | 6 +++++ realtime/rt/decoder.py | 3 +++ realtime/rt/rt.py | 36 +++++++++++++++++++++----- realtime/rt/util.py | 68 ++++++++++++++++++++++++++++++-------------------- 4 files changed, 80 insertions(+), 33 deletions(-) (limited to 'realtime/rt') diff --git a/realtime/rt/aligner.py b/realtime/rt/aligner.py index 4a0ace48..3c6ea144 100644 --- a/realtime/rt/aligner.py +++ b/realtime/rt/aligner.py @@ -1,3 +1,4 @@ +import logging import os import sys import subprocess @@ -19,8 +20,13 @@ class ForceAligner: rev_cmd = [fast_align, '-i', '-', '-d', '-T', rev_T, '-m', rev_m, '-f', rev_params, '-r'] tools_cmd = [atools, '-i', '-', '-j', '-', '-c', 'grow-diag-final-and'] + logging.info('Executing: {}'.format(' '.join(fwd_cmd))) self.fwd_align = util.popen_io(fwd_cmd) + + logging.info('Executing: {}'.format(' '.join(rev_cmd))) self.rev_align = util.popen_io(rev_cmd) + + logging.info('Executing: {}'.format(' '.join(tools_cmd))) self.tools = util.popen_io(tools_cmd) def align(self, source, target): diff --git a/realtime/rt/decoder.py b/realtime/rt/decoder.py index 6bbef6f2..0a202fae 100644 --- a/realtime/rt/decoder.py +++ b/realtime/rt/decoder.py @@ -1,3 +1,4 @@ +import logging import os import subprocess @@ -19,6 +20,7 @@ class CdecDecoder(Decoder): cdec_root = os.path.dirname(os.path.dirname(os.path.dirname(__file__))) decoder = os.path.join(cdec_root, 'decoder', 'cdec') decoder_cmd = [decoder, '-c', config, '-w', weights] + logging.info('Executing: {}'.format(' '.join(decoder_cmd))) self.decoder = util.popen_io(decoder_cmd) class MIRADecoder(Decoder): @@ -28,6 +30,7 @@ class MIRADecoder(Decoder): mira = os.path.join(cdec_root, 'training', 'mira', 'kbest_cut_mira') # optimizer=2 step=0.001 best=500, k=500, uniq, stream mira_cmd = [mira, '-c', config, '-w', weights, '-o', '2', '-C', '0.001', '-b', '500', '-k', '500', '-u', '-t'] + logging.info('Executing: {}'.format(' '.join(mira_cmd))) self.decoder = util.popen_io(mira_cmd) def update(self, sentence, grammar, reference): diff --git a/realtime/rt/rt.py b/realtime/rt/rt.py index fc6e3929..b04b4ed5 100644 --- a/realtime/rt/rt.py +++ b/realtime/rt/rt.py @@ -10,11 +10,12 @@ import subprocess import tempfile import time -import cdec.configobj +from cdec.configobj import ConfigObj import cdec.sa import aligner import decoder +import util class RealtimeDecoder: @@ -32,21 +33,38 @@ class RealtimeDecoder: self.aligner = aligner.ForceAligner(fwd_params, fwd_err, rev_params, rev_err) # Grammar extractor - sa_config = os.path.join(configdir, 'sa.ini') - self.extractor = cdec.sa.GrammarExtractor(sa_config, online=True) + sa_config = ConfigObj(os.path.join(configdir, 'sa.ini'), unrepr=True) + sa_config.filename = os.path.join(self.tmp, 'sa.ini') + util.sa_ini_for_realtime(sa_config, os.path.abspath(configdir)) + sa_config.write() + self.extractor = cdec.sa.GrammarExtractor(sa_config.filename, online=True) self.grammar_files = collections.deque() self.grammar_dict = {} self.cache_size = cache_size + # HPYPLM reference stream + ref_fifo_file = os.path.join(self.tmp, 'ref.fifo') + os.mkfifo(ref_fifo_file) + self.ref_fifo = open(ref_fifo_file, 'w+') + # Start with empty line (do not learn prior to first input) + self.ref_fifo.write('\n') + self.ref_fifo.flush() + # Decoder - decoder_config = os.path.join(configdir, 'cdec.ini') + decoder_config = [[f.strip() for f in line.split('=')] for line in open(os.path.join(configdir, 'cdec.ini'))] + util.cdec_ini_for_realtime(decoder_config, os.path.abspath(configdir), ref_fifo_file) + decoder_config_file = os.path.join(self.tmp, 'cdec.ini') + with open(decoder_config_file, 'w') as output: + for (k, v) in decoder_config: + output.write('{}={}\n'.format(k, v)) decoder_weights = os.path.join(configdir, 'weights.final') - self.decoder = decoder.MIRADecoder(decoder_config, decoder_weights) + self.decoder = decoder.MIRADecoder(decoder_config_file, decoder_weights) def close(self): logging.info('Closing processes') self.aligner.close() self.decoder.close() + self.ref_fifo.close() logging.info('Deleting {}'.format(self.tmp)) shutil.rmtree(self.tmp) @@ -75,6 +93,9 @@ class RealtimeDecoder: grammar_file = self.grammar(sentence) start_time = time.time() hyp = self.decoder.decode(sentence, grammar_file) + # Empty reference: HPYPLM does not learn prior to next translation + self.ref_fifo.write('\n') + self.ref_fifo.flush() stop_time = time.time() logging.info('Translation time: {} seconds'.format(stop_time - start_time)) return hyp @@ -91,4 +112,7 @@ class RealtimeDecoder: # Clear (old) cached grammar rm_grammar = self.grammar_dict.pop(source) os.remove(rm_grammar) - # TODO: Add to LM by writing to fifo + # Add to HPYPLM by writing to fifo (read on next translation) + logging.info('Adding to HPYPLM: {}'.format(target)) + self.ref_fifo.write('{}\n'.format(target)) + self.ref_fifo.flush() diff --git a/realtime/rt/util.py b/realtime/rt/util.py index 885298e6..10e94909 100644 --- a/realtime/rt/util.py +++ b/realtime/rt/util.py @@ -13,12 +13,49 @@ SA_INI_FILES = set(( 'precompute_file', )) +def cdec_ini_for_config(config): + cdec_ini_handle(config, os.path.basename, hpyplm_rm_ref) + +def cdec_ini_for_realtime(config, path, ref_fifo): + cdec_ini_handle(config, lambda x: os.path.join(path, x), lambda x: hpyplm_add_ref(x, ref_fifo)) + +def cdec_ini_handle(config, path_fn, hpyplm_fn): + # This is a list of (k, v), not a ConfigObj or dict + for i in range(len(config)): + if config[i][0] == 'feature_function': + if config[i][1].startswith('KLanguageModel'): + f = config[i][1].split() + f[-1] = path_fn(f[-1]) + config[i][1] = ' '.join(f) + elif config[i][1].startswith('External'): + f = config[i][1].split() + if f[1].endswith('libcdec_ff_hpyplm.so'): + # Modify paths + for j in range(1, len(f)): + if not f[j].startswith('-'): + f[j] = path_fn(f[j]) + # Modify hpyplm args + hpyplm_fn(f) + config[i][1] = ' '.join(f) + def consume_stream(stream): def consume(s): for _ in s: pass threading.Thread(target=consume, args=(stream,)).start() +def hpyplm_add_ref(f, ref): + f.append('-r') + f.append(ref) + f.append('-t') + +def hpyplm_rm_ref(f): + for i in range(1, len(f)): + if f[i] == '-r': + f.pop(i) + f.pop(i) + return + def popen_io(cmd): p = subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE) consume_stream(p.stderr) @@ -29,35 +66,12 @@ def popen_io_v(cmd): p = subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE) return p -def sa_ini_addpath(config, path): +def sa_ini_for_config(config): for key in config: if key in SA_INI_FILES: - config[key] = os.path.join(path, config[key]) + config[key] = os.path.join('sa', os.path.basename(config[key])) -def sa_ini_basename(config): +def sa_ini_for_realtime(config, path): for key in config: if key in SA_INI_FILES: - config[key] = os.path.join('sa', os.path.basename(config[key])) - -def cdec_ini_addpath(config, path): - cdec_ini_fn(config, lambda x: os.path.join(path, x)) - -def cdec_ini_basename(config): - cdec_ini_fn(config, os.path.basename) - -def cdec_ini_fn(config, fn): - # This is a list of (k, v), not a ConfigObj or dict - for i in range(len(config)): - if config[i][0] == 'feature_function': - if config[i][1].startswith('KLanguageModel'): - f = config[i][1].split() - f[-1] = fn(f[-1]) - config[i][1] = ' '.join(f) - elif config[i][1].startswith('External'): - f = config[i][1].split() - if f[1].endswith('libcdec_ff_hpyplm.so'): - for j in range(1, len(f)): - if not f[j].startswith('-'): - f[j] = fn(f[j]) - config[i][1] = ' '.join(f) - + config[key] = os.path.join(path, config[key]) -- cgit v1.2.3