diff options
| -rw-r--r-- | realtime/README.md | 1 | ||||
| -rwxr-xr-x | realtime/realtime.py | 103 | ||||
| -rw-r--r-- | realtime/rt/__init__.py | 2 | ||||
| -rw-r--r-- | realtime/rt/aligner.py | 50 | ||||
| -rw-r--r-- | realtime/rt/decoder.py | 23 | ||||
| -rw-r--r-- | realtime/rt/util.py | 13 | ||||
| -rwxr-xr-x | word-aligner/force_align.py | 46 | 
7 files changed, 200 insertions, 38 deletions
# realtime/realtime.py
#
# Relies on the module-level imports of that file: argparse, logging, os,
# shutil, sys, tempfile, time, cdec.sa, and ForceAligner / CdecDecoder from
# the local `rt` package.


class RealtimeDecoder(object):
    """Sentence-at-a-time translator built from a word aligner, a grammar
    extractor and a cdec decoder.

    ``configdir`` must contain the files this class looks up by name:
    ``a.fwd_params``, ``a.fwd_err``, ``a.rev_params``, ``a.rev_err`` (word
    aligner), ``sa.ini`` (grammar extractor), ``cdec.ini`` and
    ``weights.final`` (decoder).
    """

    def __init__(self, configdir, tmpdir='/tmp'):
        """Create a private work directory and start the components.

        configdir -- directory holding the configuration files listed above
        tmpdir    -- parent directory in which the work directory is created
        """
        # Temporary work dir (removed again by close())
        self.tmp = tempfile.mkdtemp(dir=tmpdir)
        logging.info('Using temp dir %s', self.tmp)

        # Word aligner
        fwd_params = os.path.join(configdir, 'a.fwd_params')
        fwd_err = os.path.join(configdir, 'a.fwd_err')
        rev_params = os.path.join(configdir, 'a.rev_params')
        rev_err = os.path.join(configdir, 'a.rev_err')
        self.aligner = ForceAligner(fwd_params, fwd_err, rev_params, rev_err)

        # Grammar extractor
        sa_config = os.path.join(configdir, 'sa.ini')
        self.extractor = cdec.sa.GrammarExtractor(sa_config)

        # Decoder
        decoder_config = os.path.join(configdir, 'cdec.ini')
        decoder_weights = os.path.join(configdir, 'weights.final')
        self.decoder = CdecDecoder(decoder_config, decoder_weights)

    def close(self):
        """Shut down the aligner and decoder and delete the work directory.

        The nested try/finally blocks make sure a failure while closing one
        component neither skips the other nor leaves the directory behind.
        """
        logging.info('Closing processes')
        try:
            try:
                self.aligner.close()
            finally:
                self.decoder.close()
        finally:
            logging.info('Deleting %s', self.tmp)
            shutil.rmtree(self.tmp)

    def grammar(self, sentence):
        """Write the extractor's rules for `sentence` to a new file in the
        work directory, one rule per line, and return the file's path."""
        # mkstemp returns an already-open OS-level descriptor; wrap it so it
        # is closed with the file instead of leaking one descriptor per call.
        fd, grammar_file = tempfile.mkstemp(dir=self.tmp)
        with os.fdopen(fd, 'w') as output:
            for rule in self.extractor.grammar(sentence):
                output.write(str(rule) + '\n')
        return grammar_file

    def decode(self, sentence):
        """Translate `sentence` with a freshly extracted grammar and return
        the decoder's hypothesis. The grammar file is always removed."""
        grammar_file = self.grammar(sentence)
        try:
            start_time = time.time()
            hyp = self.decoder.decode(sentence, grammar_file)
            stop_time = time.time()
        finally:
            # Remove the per-sentence grammar even when decoding fails.
            os.remove(grammar_file)
        logging.info('Translation time: %s seconds', stop_time - start_time)
        return hyp

    def learn(self, source, target):
        """Placeholder for learning from a sentence pair; does nothing yet."""
        pass


def main():
    """Read lines from stdin until EOF or ctrl+c.

    A line without '|||' is translated and the hypothesis is printed to
    stdout; a line of the form 'source ||| target' is passed to
    RealtimeDecoder.learn(). Lines with more than two fields are ignored.
    """
    parser = argparse.ArgumentParser(description='Real-time adaptive translation with cdec.')
    parser.add_argument('-c', '--config', required=True, help='Config directory (see README.md)')
    parser.add_argument('-T', '--temp', help='Temp directory (default /tmp)', default='/tmp')
    parser.add_argument('-v', '--verbose', help='Info to stderr', action='store_true')
    # --config is required=True, so argparse itself rejects a missing value.
    args = parser.parse_args()

    if args.verbose:
        logging.basicConfig(level=logging.INFO)

    # Honor -T/--temp (it was parsed but previously never used).
    rtd = RealtimeDecoder(args.config, tmpdir=args.temp)

    try:
        # readline() instead of iterating the file object: Python 2 file
        # iteration reads ahead, which delays lines arriving on a pipe.
        for line in iter(sys.stdin.readline, ''):
            fields = [f.strip() for f in line.split('|||')]
            if len(fields) == 1:
                hyp = rtd.decode(fields[0])
                sys.stdout.write('{}\n'.format(hyp))
                # Deliver each hypothesis immediately, even through a pipe.
                sys.stdout.flush()
            elif len(fields) == 2:
                rtd.learn(*fields)

    # Clean exit on ctrl+c
    except KeyboardInterrupt:
        logging.info('Caught KeyboardInterrupt, exiting')

    # Cleanup, also when an unexpected error ends the loop
    finally:
        rtd.close()


def mkconfig():
    """Placeholder; not implemented yet."""
    pass


if __name__ == '__main__':
    main()
# realtime/rt/aligner.py
#
# Relies on that file's module-level imports: os and the sibling `util` module.


class ForceAligner(object):
    """Align lines with a forward and a reverse fast_align process and
    combine the two link sets with atools (grow-diag-final-and)."""

    def __init__(self, fwd_params, fwd_err, rev_params, rev_err):
        """Start the three helper processes.

        fwd_params, rev_params -- parameter files passed to fast_align via -f
        fwd_err, rev_err       -- fast_align stderr logs; the -T and -m
                                  values are read back from them (read_err)
        """
        # abspath first: __file__ can be a relative path, in which case the
        # tool locations would otherwise depend on the working directory.
        cdec_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
        fast_align = os.path.join(cdec_root, 'word-aligner', 'fast_align')
        atools = os.path.join(cdec_root, 'utils', 'atools')

        (fwd_T, fwd_m) = self.read_err(fwd_err)
        (rev_T, rev_m) = self.read_err(rev_err)

        fwd_cmd = [fast_align, '-i', '-', '-d', '-T', fwd_T, '-m', fwd_m, '-f', fwd_params]
        rev_cmd = [fast_align, '-i', '-', '-d', '-T', rev_T, '-m', rev_m, '-f', rev_params, '-r']
        tools_cmd = [atools, '-i', '-', '-j', '-', '-c', 'grow-diag-final-and']

        self.fwd_align = util.popen_io(fwd_cmd)
        self.rev_align = util.popen_io(rev_cmd)
        self.tools = util.popen_io(tools_cmd)

    def align(self, line):
        """Send `line` to both aligners and return the combined links.

        Raises IndexError if an aligner replies with fewer than three
        '|||'-separated fields (for example because the process has died).
        """
        request = '{}\n'.format(line)
        for proc in (self.fwd_align, self.rev_align):
            proc.stdin.write(request)
            # Push the line through now; a buffered pipe would leave the
            # readline() below waiting for output that never comes.
            proc.stdin.flush()
        # f words ||| e words ||| links ||| score
        fwd_line = self.fwd_align.stdout.readline().split('|||')[2].strip()
        rev_line = self.rev_align.stdout.readline().split('|||')[2].strip()
        self.tools.stdin.write('{}\n'.format(fwd_line))
        self.tools.stdin.write('{}\n'.format(rev_line))
        self.tools.stdin.flush()
        return self.tools.stdout.readline().strip()

    def close(self):
        """Close the helpers' stdin, wait for them to exit and release the
        output pipes, so no zombie processes or open descriptors remain."""
        procs = (self.fwd_align, self.rev_align, self.tools)
        # Closing stdin is what signals end of input to each process.
        for proc in procs:
            proc.stdin.close()
        for proc in procs:
            proc.wait()
            proc.stdout.close()

    def read_err(self, err):
        """Return (T, m) as strings parsed from the log file `err`.

        m is the last token of the line containing 'expected target length',
        T the last token of the line containing 'final tension'. A value
        whose line is missing is returned as ''.
        """
        (T, m) = ('', '')
        with open(err) as err_file:
            for line in err_file:
                # expected target length = source length * N
                if 'expected target length' in line:
                    m = line.split()[-1]
                # final tension: N
                elif 'final tension' in line:
                    T = line.split()[-1]
        return (T, m)
# realtime/rt/decoder.py
#
# Relies on that file's module-level imports: os and the sibling `util` module.


class Decoder(object):
    """Base class for decoders that run as a child process stored in
    ``self.decoder``."""

    def close(self):
        """Close the child's stdin, wait for it to exit and release its
        output pipe, so no zombie process or open descriptor remains."""
        self.decoder.stdin.close()
        self.decoder.wait()
        self.decoder.stdout.close()


class CdecDecoder(Decoder):
    """Decoder backed by the `cdec` binary from this source tree."""

    def __init__(self, config, weights):
        """Start `decoder/cdec` with `-c config -w weights`."""
        # abspath first: __file__ can be a relative path, in which case the
        # binary's location would otherwise depend on the working directory.
        cdec_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
        decoder = os.path.join(cdec_root, 'decoder', 'cdec')
        decoder_cmd = [decoder, '-c', config, '-w', weights]
        self.decoder = util.popen_io(decoder_cmd)

    def decode(self, sentence, grammar):
        """Send `sentence` wrapped in a <seg> element naming the `grammar`
        file and return the stripped line the decoder prints in reply."""
        # The stray `i=id` argument (the builtin id) is gone: the template
        # has no {i} field.
        request = '<seg grammar="{g}">{s}</seg>\n'.format(s=sentence, g=grammar)
        self.decoder.stdin.write(request)
        # Push the segment through now; a buffered pipe would leave the
        # readline() below waiting for output that never comes.
        self.decoder.stdin.flush()
        return self.decoder.stdout.readline().strip()


# realtime/rt/util.py
#
# Relies on that file's module-level imports: subprocess and threading.


def popen_io(cmd):
    """Start `cmd` with stdin and stdout connected to pipes and return the
    Popen object. stderr is piped too and drained in the background."""
    p = subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    consume_stream(p.stderr)
    return p


def consume_stream(stream):
    """Read `stream` to its end on a background thread, discarding the data.

    popen_io() uses this so a child that writes a lot to stderr cannot
    block on a full pipe.
    """
    def consume(s):
        for _ in s:
            pass
    drainer = threading.Thread(target=consume, args=(stream,))
    # Daemon thread: if a child is never shut down (e.g. start-up failed
    # half-way), the still-open stream must not keep the interpreter alive.
    drainer.daemon = True
    drainer.start()
r_p, '-r'] - -    atools = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'utils', 'atools') -    tools_cmd = [atools, '-i', '-', '-j', '-', '-c', 'grow-diag-final-and'] - -    sys.stderr.write('running: {}\n'.format(' '.join(f_cmd))) -    sys.stderr.write('running: {}\n'.format(' '.join(r_cmd))) -    sys.stderr.write('running: {}\n'.format(' '.join(tools_cmd))) - -    f_a = subprocess.Popen(f_cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE) -    r_a = subprocess.Popen(r_cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE) -    tools = subprocess.Popen(tools_cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE) +    # Hook into realtime +    sys.path.append(os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'realtime')) +    from rt import ForceAligner +    aligner = ForceAligner(*sys.argv[1:]) +          for line in sys.stdin: -        f_a.stdin.write(line) -        r_a.stdin.write(line) -        # f words ||| e words ||| links ||| score -        f_line = f_a.stdout.readline().split('|||')[2].strip() -        r_line = r_a.stdout.readline().split('|||')[2].strip() -        tools.stdin.write('{}\n'.format(f_line)) -        tools.stdin.write('{}\n'.format(r_line)) -        sys.stdout.write(tools.stdout.readline()) - -def find_Tm(err): -    (T, m) = ('', '') -    for line in open(err): -        # expected target length = source length * N -        if 'expected target length' in line: -            m = line.split()[-1] -        elif 'final tension' in line: -            T = line.split()[-1] -    return (T, m) +        sys.stdout.write('{}\n'.format(aligner.align(line.strip()))) +    aligner.close() +      if __name__ == '__main__':      main() | 
