From ac469cdf4c70154a1c2cedce9edf5cdc3bdb2d61 Mon Sep 17 00:00:00 2001 From: Michael Denkowski Date: Mon, 19 Aug 2013 08:23:42 -0700 Subject: Realtime translation (mostly a cdec wrapper for now) --- realtime/README.md | 1 + realtime/realtime.py | 103 ++++++++++++++++++++++++++++++++++++++++++++ realtime/rt/__init__.py | 2 + realtime/rt/aligner.py | 50 +++++++++++++++++++++ realtime/rt/decoder.py | 23 ++++++++++ realtime/rt/util.py | 13 ++++++ word-aligner/force_align.py | 46 ++++---------------- 7 files changed, 200 insertions(+), 38 deletions(-) create mode 100644 realtime/README.md create mode 100755 realtime/realtime.py create mode 100644 realtime/rt/__init__.py create mode 100644 realtime/rt/aligner.py create mode 100644 realtime/rt/decoder.py create mode 100644 realtime/rt/util.py diff --git a/realtime/README.md b/realtime/README.md new file mode 100644 index 00000000..b37dddc8 --- /dev/null +++ b/realtime/README.md @@ -0,0 +1 @@ +More to come. diff --git a/realtime/realtime.py b/realtime/realtime.py new file mode 100755 index 00000000..a6b3ed52 --- /dev/null +++ b/realtime/realtime.py @@ -0,0 +1,103 @@ +#!/usr/bin/env python +import argparse +import os +import logging +import cdec.configobj +import cdec.sa +import shutil +import sys +import subprocess +import tempfile +import time + +from rt import ForceAligner +from rt import CdecDecoder + +class RealtimeDecoder: + + def __init__(self, configdir, tmpdir='/tmp'): + + # Temporary work dir + self.tmp = tempfile.mkdtemp(dir=tmpdir) + logging.info('Using temp dir {}'.format(self.tmp)) + + # Word aligner + fwd_params = os.path.join(configdir, 'a.fwd_params') + fwd_err = os.path.join(configdir, 'a.fwd_err') + rev_params = os.path.join(configdir, 'a.rev_params') + rev_err = os.path.join(configdir, 'a.rev_err') + self.aligner = ForceAligner(fwd_params, fwd_err, rev_params, rev_err) + + # Grammar extractor + sa_config = os.path.join(configdir, 'sa.ini') + self.extractor = cdec.sa.GrammarExtractor(sa_config) + + # Decoder + decoder_config = os.path.join(configdir, 'cdec.ini') + decoder_weights = os.path.join(configdir, 'weights.final') + self.decoder = CdecDecoder(decoder_config, decoder_weights) + + def close(self): + logging.info('Closing processes') + self.aligner.close() + self.decoder.close() + logging.info('Deleting {}'.format(self.tmp)) + shutil.rmtree(self.tmp) + + def grammar(self, sentence): + grammar_file = tempfile.mkstemp(dir=self.tmp)[1] + with open(grammar_file, 'w') as output: + for rule in self.extractor.grammar(sentence): + output.write(str(rule) + '\n') + return grammar_file + + def decode(self, sentence): + grammar_file = self.grammar(sentence) + start_time = time.time() + hyp = self.decoder.decode(sentence, grammar_file) + stop_time = time.time() + logging.info('Translation time: {} seconds'.format(stop_time - start_time)) + os.remove(grammar_file) + return hyp + + def learn(self, source, target): + pass + +def main(): + + parser = argparse.ArgumentParser(description='Real-time adaptive translation with cdec.') + parser.add_argument('-c', '--config', required=True, help='Config directory (see README.md)') + parser.add_argument('-T', '--temp', help='Temp directory (default /tmp)', default='/tmp') + parser.add_argument('-v', '--verbose', help='Info to stderr', action='store_true') + args = parser.parse_args() + + if not args.config: + parser.error('specify a configuration directory') + + if args.verbose: + logging.basicConfig(level=logging.INFO) + + rtd = RealtimeDecoder(args.config) + + try: + for line in sys.stdin: + input = [f.strip() for f in line.split('|||')] + if len(input) == 1: + hyp = rtd.decode(input[0]) + sys.stdout.write('{}\n'.format(hyp)) + elif len(input) == 2: + rtd.learn(*input) + + # Clean exit on ctrl+c + except KeyboardInterrupt: + logging.info('Caught KeyboardInterrupt, exiting') + + # Cleanup + rtd.close() + + +def mkconfig(): + pass + +if __name__ == '__main__': + main() diff --git a/realtime/rt/__init__.py b/realtime/rt/__init__.py new file mode 100644 index 00000000..7a1aeda7 --- /dev/null +++ b/realtime/rt/__init__.py @@ -0,0 +1,2 @@ +from aligner import * +from decoder import * diff --git a/realtime/rt/aligner.py b/realtime/rt/aligner.py new file mode 100644 index 00000000..d94dbda0 --- /dev/null +++ b/realtime/rt/aligner.py @@ -0,0 +1,50 @@ +import os +import sys +import subprocess + +import util + +class ForceAligner: + + def __init__(self, fwd_params, fwd_err, rev_params, rev_err): + + cdec_root = os.path.dirname(os.path.dirname(os.path.dirname(__file__))) + fast_align = os.path.join(cdec_root, 'word-aligner', 'fast_align') + atools = os.path.join(cdec_root, 'utils', 'atools') + + (fwd_T, fwd_m) = self.read_err(fwd_err) + (rev_T, rev_m) = self.read_err(rev_err) + + fwd_cmd = [fast_align, '-i', '-', '-d', '-T', fwd_T, '-m', fwd_m, '-f', fwd_params] + rev_cmd = [fast_align, '-i', '-', '-d', '-T', rev_T, '-m', rev_m, '-f', rev_params, '-r'] + tools_cmd = [atools, '-i', '-', '-j', '-', '-c', 'grow-diag-final-and'] + + self.fwd_align = util.popen_io(fwd_cmd) + self.rev_align = util.popen_io(rev_cmd) + self.tools = util.popen_io(tools_cmd) + + def align(self, line): + self.fwd_align.stdin.write('{}\n'.format(line)) + self.rev_align.stdin.write('{}\n'.format(line)) + # f words ||| e words ||| links ||| score + fwd_line = self.fwd_align.stdout.readline().split('|||')[2].strip() + rev_line = self.rev_align.stdout.readline().split('|||')[2].strip() + self.tools.stdin.write('{}\n'.format(fwd_line)) + self.tools.stdin.write('{}\n'.format(rev_line)) + return self.tools.stdout.readline().strip() + + def close(self): + self.fwd_align.stdin.close() + self.rev_align.stdin.close() + self.tools.stdin.close() + + def read_err(self, err): + (T, m) = ('', '') + for line in open(err): + # expected target length = source length * N + if 'expected target length' in line: + m = line.split()[-1] + # final tension: N + elif 'final tension' in line: + T = line.split()[-1] + return (T, m) diff --git a/realtime/rt/decoder.py b/realtime/rt/decoder.py new file mode 100644 index 00000000..f4fea0e2 --- /dev/null +++ b/realtime/rt/decoder.py @@ -0,0 +1,23 @@ +import os +import subprocess + +import util + +class Decoder: + + def close(self): + self.decoder.stdin.close() + +class CdecDecoder(Decoder): + + def __init__(self, config, weights): + cdec_root = os.path.dirname(os.path.dirname(os.path.dirname(__file__))) + decoder = os.path.join(cdec_root, 'decoder', 'cdec') + decoder_cmd = [decoder, '-c', config, '-w', weights] + self.decoder = util.popen_io(decoder_cmd) + + def decode(self, sentence, grammar): + input = '{s}\n'.format(i=id, s=sentence, g=grammar) + self.decoder.stdin.write(input) + return self.decoder.stdout.readline().strip() + diff --git a/realtime/rt/util.py b/realtime/rt/util.py new file mode 100644 index 00000000..7f877161 --- /dev/null +++ b/realtime/rt/util.py @@ -0,0 +1,13 @@ +import subprocess +import threading + +def popen_io(cmd): + p = subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + consume_stream(p.stderr) + return p + +def consume_stream(stream): + def consume(s): + for _ in s: + pass + threading.Thread(target=consume, args=(stream,)).start() diff --git a/word-aligner/force_align.py b/word-aligner/force_align.py index a0c1aad7..ad6d95fa 100755 --- a/word-aligner/force_align.py +++ b/word-aligner/force_align.py @@ -2,7 +2,6 @@ import os import sys -import subprocess def main(): @@ -15,45 +14,16 @@ def main(): sys.stderr.write(' {} fwd_params fwd_err rev_params rev_err out.f-e.gdfa\n'.format(sys.argv[0])) sys.exit(2) - (f_p, f_err, r_p, r_err) = sys.argv[1:] - - (f_T, f_m) = find_Tm(f_err) - (r_T, r_m) = find_Tm(r_err) - - fast_align = os.path.join(os.path.dirname(__file__), 'fast_align') - f_cmd = [fast_align, '-i', '-', '-d', '-T', f_T, '-m', f_m, '-f', f_p] - r_cmd = [fast_align, '-i', '-', '-d', '-T', r_T, '-m', r_m, '-f', r_p, '-r'] - - atools = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'utils', 'atools') - tools_cmd = [atools, '-i', '-', '-j', '-', '-c', 'grow-diag-final-and'] - - sys.stderr.write('running: {}\n'.format(' '.join(f_cmd))) - sys.stderr.write('running: {}\n'.format(' '.join(r_cmd))) - sys.stderr.write('running: {}\n'.format(' '.join(tools_cmd))) - - f_a = subprocess.Popen(f_cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE) - r_a = subprocess.Popen(r_cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE) - tools = subprocess.Popen(tools_cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE) + # Hook into realtime + sys.path.append(os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'realtime')) + from rt import ForceAligner + aligner = ForceAligner(*sys.argv[1:]) + for line in sys.stdin: - f_a.stdin.write(line) - r_a.stdin.write(line) - # f words ||| e words ||| links ||| score - f_line = f_a.stdout.readline().split('|||')[2].strip() - r_line = r_a.stdout.readline().split('|||')[2].strip() - tools.stdin.write('{}\n'.format(f_line)) - tools.stdin.write('{}\n'.format(r_line)) - sys.stdout.write(tools.stdout.readline()) - -def find_Tm(err): - (T, m) = ('', '') - for line in open(err): - # expected target length = source length * N - if 'expected target length' in line: - m = line.split()[-1] - elif 'final tension' in line: - T = line.split()[-1] - return (T, m) + sys.stdout.write('{}\n'.format(aligner.align(line.strip()))) + aligner.close() + if __name__ == '__main__': main() -- cgit v1.2.3