summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Michael Denkowski <mdenkows@cs.cmu.edu> 2013-08-19 08:23:42 -0700
committer: Michael Denkowski <mdenkows@cs.cmu.edu> 2013-08-19 08:23:42 -0700
commit: ac469cdf4c70154a1c2cedce9edf5cdc3bdb2d61 (patch)
tree: 3e8bcbc6b00533e7a79e3cf28c2ac6aa4bdadd8d
parent: f4a3a2547316ca5d31366e6808858fe94981415c (diff)
Realtime translation (mostly a cdec wrapper for now)
-rw-r--r-- realtime/README.md | 1
-rwxr-xr-x realtime/realtime.py | 103
-rw-r--r-- realtime/rt/__init__.py | 2
-rw-r--r-- realtime/rt/aligner.py | 50
-rw-r--r-- realtime/rt/decoder.py | 23
-rw-r--r-- realtime/rt/util.py | 13
-rwxr-xr-x word-aligner/force_align.py | 46
7 files changed, 200 insertions, 38 deletions
diff --git a/realtime/README.md b/realtime/README.md
new file mode 100644
index 00000000..b37dddc8
--- /dev/null
+++ b/realtime/README.md
@@ -0,0 +1 @@
+More to come.
diff --git a/realtime/realtime.py b/realtime/realtime.py
new file mode 100755
index 00000000..a6b3ed52
--- /dev/null
+++ b/realtime/realtime.py
@@ -0,0 +1,103 @@
+#!/usr/bin/env python
+import argparse
+import os
+import logging
+import cdec.configobj
+import cdec.sa
+import shutil
+import sys
+import subprocess
+import tempfile
+import time
+
+from rt import ForceAligner
+from rt import CdecDecoder
+
+class RealtimeDecoder:
+    """Translate sentences with cdec using a grammar extracted per sentence.
+
+    Bundles a ForceAligner, a cdec.sa.GrammarExtractor and a CdecDecoder,
+    all configured from files found in one config directory.
+    """
+
+    def __init__(self, configdir, tmpdir='/tmp'):
+        """Create the work dir and start the aligner, extractor and decoder.
+
+        configdir -- directory containing a.fwd_params, a.fwd_err,
+                     a.rev_params, a.rev_err, sa.ini, cdec.ini and
+                     weights.final
+        tmpdir    -- parent directory in which a private work dir is created
+        """
+
+        # Temporary work dir
+        self.tmp = tempfile.mkdtemp(dir=tmpdir)
+        logging.info('Using temp dir {}'.format(self.tmp))
+
+        # Word aligner
+        fwd_params = os.path.join(configdir, 'a.fwd_params')
+        fwd_err = os.path.join(configdir, 'a.fwd_err')
+        rev_params = os.path.join(configdir, 'a.rev_params')
+        rev_err = os.path.join(configdir, 'a.rev_err')
+        self.aligner = ForceAligner(fwd_params, fwd_err, rev_params, rev_err)
+
+        # Grammar extractor
+        sa_config = os.path.join(configdir, 'sa.ini')
+        self.extractor = cdec.sa.GrammarExtractor(sa_config)
+
+        # Decoder
+        decoder_config = os.path.join(configdir, 'cdec.ini')
+        decoder_weights = os.path.join(configdir, 'weights.final')
+        self.decoder = CdecDecoder(decoder_config, decoder_weights)
+
+    def close(self):
+        """Close the aligner and decoder, then delete the work dir."""
+        logging.info('Closing processes')
+        self.aligner.close()
+        self.decoder.close()
+        logging.info('Deleting {}'.format(self.tmp))
+        shutil.rmtree(self.tmp)
+
+    def grammar(self, sentence):
+        """Write the grammar extracted for sentence to a new file in the
+        work dir, one rule per line, and return the file's path."""
+        (fd, grammar_file) = tempfile.mkstemp(dir=self.tmp)
+        # mkstemp hands back an already-open descriptor; write through it so
+        # it is closed with the file instead of leaking one fd per sentence.
+        with os.fdopen(fd, 'w') as output:
+            for rule in self.extractor.grammar(sentence):
+                output.write(str(rule) + '\n')
+        return grammar_file
+
+    def decode(self, sentence):
+        """Translate sentence and return the decoder's hypothesis.
+
+        A grammar file is extracted for the sentence, passed to the decoder
+        and removed again, also when decoding raises.
+        """
+        grammar_file = self.grammar(sentence)
+        try:
+            start_time = time.time()
+            hyp = self.decoder.decode(sentence, grammar_file)
+            stop_time = time.time()
+            logging.info('Translation time: {} seconds'.format(stop_time - start_time))
+        finally:
+            os.remove(grammar_file)
+        return hyp
+
+    def learn(self, source, target):
+        """Placeholder for learning from a source/target pair; does nothing."""
+        pass
+
+def main():
+    """Run the realtime decoder over stdin.
+
+    Each input line is split on '|||'.  A line with a single field is
+    translated and the hypothesis is written to stdout; a line with two
+    fields (source ||| target) is passed to RealtimeDecoder.learn().  Lines
+    with more fields are ignored.
+    """
+
+    parser = argparse.ArgumentParser(description='Real-time adaptive translation with cdec.')
+    parser.add_argument('-c', '--config', required=True, help='Config directory (see README.md)')
+    parser.add_argument('-T', '--temp', help='Temp directory (default /tmp)', default='/tmp')
+    parser.add_argument('-v', '--verbose', help='Info to stderr', action='store_true')
+    args = parser.parse_args()
+
+    # required=True still lets an empty string through (-c '')
+    if not args.config:
+        parser.error('specify a configuration directory')
+
+    if args.verbose:
+        logging.basicConfig(level=logging.INFO)
+
+    # Honor -T/--temp instead of always falling back to the default
+    rtd = RealtimeDecoder(args.config, tmpdir=args.temp)
+
+    try:
+        for line in sys.stdin:
+            fields = [f.strip() for f in line.split('|||')]
+            if len(fields) == 1:
+                hyp = rtd.decode(fields[0])
+                sys.stdout.write('{}\n'.format(hyp))
+                # Hand each hypothesis to the reader right away, even when
+                # stdout is a pipe
+                sys.stdout.flush()
+            elif len(fields) == 2:
+                rtd.learn(*fields)
+
+    # Clean exit on ctrl+c
+    except KeyboardInterrupt:
+        logging.info('Caught KeyboardInterrupt, exiting')
+
+    # Cleanup, also when an unexpected error ends the loop
+    finally:
+        rtd.close()
+
+
+def mkconfig():
+    """Placeholder: not implemented yet, does nothing and returns None."""
+    return None
+
+# Run main() only when this file is executed as a script, not when imported.
+if __name__ == '__main__':
+ main()
diff --git a/realtime/rt/__init__.py b/realtime/rt/__init__.py
new file mode 100644
index 00000000..7a1aeda7
--- /dev/null
+++ b/realtime/rt/__init__.py
@@ -0,0 +1,2 @@
+from aligner import *
+from decoder import *
diff --git a/realtime/rt/aligner.py b/realtime/rt/aligner.py
new file mode 100644
index 00000000..d94dbda0
--- /dev/null
+++ b/realtime/rt/aligner.py
@@ -0,0 +1,50 @@
+import os
+import sys
+import subprocess
+
+import util
+
+class ForceAligner:
+    """Word-align sentence pairs using fast_align and atools subprocesses."""
+
+    def __init__(self, fwd_params, fwd_err, rev_params, rev_err):
+        """Start the forward and reverse fast_align processes and atools.
+
+        fwd_params, rev_params -- parameter files passed to fast_align (-f)
+        fwd_err, rev_err       -- files from which the -T and -m values are
+                                  read (see read_err)
+        """
+
+        cdec_root = os.path.dirname(os.path.dirname(os.path.dirname(__file__)))
+        fast_align = os.path.join(cdec_root, 'word-aligner', 'fast_align')
+        atools = os.path.join(cdec_root, 'utils', 'atools')
+
+        (fwd_T, fwd_m) = self.read_err(fwd_err)
+        (rev_T, rev_m) = self.read_err(rev_err)
+
+        fwd_cmd = [fast_align, '-i', '-', '-d', '-T', fwd_T, '-m', fwd_m, '-f', fwd_params]
+        rev_cmd = [fast_align, '-i', '-', '-d', '-T', rev_T, '-m', rev_m, '-f', rev_params, '-r']
+        tools_cmd = [atools, '-i', '-', '-j', '-', '-c', 'grow-diag-final-and']
+
+        self.fwd_align = util.popen_io(fwd_cmd)
+        self.rev_align = util.popen_io(rev_cmd)
+        self.tools = util.popen_io(tools_cmd)
+
+    def align(self, line):
+        """Align one input line and return the stripped line read from atools.
+
+        The line is sent to both fast_align processes; the third '|||' field
+        of each reply is forwarded to atools.
+        """
+        self.fwd_align.stdin.write('{}\n'.format(line))
+        self.rev_align.stdin.write('{}\n'.format(line))
+        # Push the request out before blocking on the replies
+        self.fwd_align.stdin.flush()
+        self.rev_align.stdin.flush()
+        # f words ||| e words ||| links ||| score
+        fwd_line = self.fwd_align.stdout.readline().split('|||')[2].strip()
+        rev_line = self.rev_align.stdout.readline().split('|||')[2].strip()
+        self.tools.stdin.write('{}\n'.format(fwd_line))
+        self.tools.stdin.write('{}\n'.format(rev_line))
+        self.tools.stdin.flush()
+        return self.tools.stdout.readline().strip()
+
+    def close(self):
+        """Close the stdin pipes of all three subprocesses.
+
+        The processes are not waited for.
+        """
+        self.fwd_align.stdin.close()
+        self.rev_align.stdin.close()
+        self.tools.stdin.close()
+
+    def read_err(self, err):
+        """Return (T, m) as strings read from the file err.
+
+        m is the last token of the last line containing
+        'expected target length', T the last token of the last line
+        containing 'final tension'.  A value that is never found stays ''.
+        """
+        (T, m) = ('', '')
+        # Close the file deterministically instead of leaving it to the GC
+        with open(err) as err_file:
+            for line in err_file:
+                # expected target length = source length * N
+                if 'expected target length' in line:
+                    m = line.split()[-1]
+                # final tension: N
+                elif 'final tension' in line:
+                    T = line.split()[-1]
+        return (T, m)
diff --git a/realtime/rt/decoder.py b/realtime/rt/decoder.py
new file mode 100644
index 00000000..f4fea0e2
--- /dev/null
+++ b/realtime/rt/decoder.py
@@ -0,0 +1,23 @@
+import os
+import subprocess
+
+import util
+
+class Decoder:
+    """Base class for decoders that keep their subprocess in self.decoder."""
+
+    def close(self):
+        """Close the stdin pipe of the wrapped decoder process."""
+        decoder_stdin = self.decoder.stdin
+        decoder_stdin.close()
+
+class CdecDecoder(Decoder):
+    """Decoder backed by a cdec subprocess."""
+
+    def __init__(self, config, weights):
+        """Start cdec with the given config file (-c) and weights file (-w)."""
+        cdec_root = os.path.dirname(os.path.dirname(os.path.dirname(__file__)))
+        decoder = os.path.join(cdec_root, 'decoder', 'cdec')
+        decoder_cmd = [decoder, '-c', config, '-w', weights]
+        self.decoder = util.popen_io(decoder_cmd)
+
+    def decode(self, sentence, grammar):
+        """Send sentence to cdec and return the stripped line it answers with.
+
+        sentence -- text to translate
+        grammar  -- path of the grammar file, placed in the grammar
+                    attribute of the <seg> element
+        """
+        # Only the grammar path and the sentence appear in the segment; the
+        # stray i=id keyword (the builtin id) was never used by the template.
+        seg = '<seg grammar="{g}">{s}</seg>\n'.format(s=sentence, g=grammar)
+        self.decoder.stdin.write(seg)
+        # Push the segment out before blocking on the reply
+        self.decoder.stdin.flush()
+        return self.decoder.stdout.readline().strip()
+
diff --git a/realtime/rt/util.py b/realtime/rt/util.py
new file mode 100644
index 00000000..7f877161
--- /dev/null
+++ b/realtime/rt/util.py
@@ -0,0 +1,13 @@
+import subprocess
+import threading
+
+def popen_io(cmd):
+    """Start cmd with stdin, stdout and stderr piped; return the Popen object.
+
+    The process's stderr is handed to consume_stream().
+    """
+    pipe = subprocess.PIPE
+    proc = subprocess.Popen(cmd, stdin=pipe, stdout=pipe, stderr=pipe)
+    consume_stream(proc.stderr)
+    return proc
+
+def consume_stream(stream):
+    """Read and discard everything from stream in a background thread.
+
+    Returns immediately; the thread ends when iteration over stream ends.
+    """
+    def consume(s):
+        for _ in s:
+            pass
+    drain = threading.Thread(target=consume, args=(stream,))
+    # Daemon thread: a stream that never reaches EOF must not keep the
+    # interpreter alive once the main thread is done.
+    drain.daemon = True
+    drain.start()
diff --git a/word-aligner/force_align.py b/word-aligner/force_align.py
index a0c1aad7..ad6d95fa 100755
--- a/word-aligner/force_align.py
+++ b/word-aligner/force_align.py
@@ -2,7 +2,6 @@
import os
import sys
-import subprocess
def main():
@@ -15,45 +14,16 @@ def main():
sys.stderr.write(' {} fwd_params fwd_err rev_params rev_err <in.f-e >out.f-e.gdfa\n'.format(sys.argv[0]))
sys.exit(2)
- (f_p, f_err, r_p, r_err) = sys.argv[1:]
-
- (f_T, f_m) = find_Tm(f_err)
- (r_T, r_m) = find_Tm(r_err)
-
- fast_align = os.path.join(os.path.dirname(__file__), 'fast_align')
- f_cmd = [fast_align, '-i', '-', '-d', '-T', f_T, '-m', f_m, '-f', f_p]
- r_cmd = [fast_align, '-i', '-', '-d', '-T', r_T, '-m', r_m, '-f', r_p, '-r']
-
- atools = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'utils', 'atools')
- tools_cmd = [atools, '-i', '-', '-j', '-', '-c', 'grow-diag-final-and']
-
- sys.stderr.write('running: {}\n'.format(' '.join(f_cmd)))
- sys.stderr.write('running: {}\n'.format(' '.join(r_cmd)))
- sys.stderr.write('running: {}\n'.format(' '.join(tools_cmd)))
-
- f_a = subprocess.Popen(f_cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
- r_a = subprocess.Popen(r_cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
- tools = subprocess.Popen(tools_cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
+ # Hook into realtime
+ sys.path.append(os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'realtime'))
+ from rt import ForceAligner
+ aligner = ForceAligner(*sys.argv[1:])
+
for line in sys.stdin:
- f_a.stdin.write(line)
- r_a.stdin.write(line)
- # f words ||| e words ||| links ||| score
- f_line = f_a.stdout.readline().split('|||')[2].strip()
- r_line = r_a.stdout.readline().split('|||')[2].strip()
- tools.stdin.write('{}\n'.format(f_line))
- tools.stdin.write('{}\n'.format(r_line))
- sys.stdout.write(tools.stdout.readline())
-
-def find_Tm(err):
- (T, m) = ('', '')
- for line in open(err):
- # expected target length = source length * N
- if 'expected target length' in line:
- m = line.split()[-1]
- elif 'final tension' in line:
- T = line.split()[-1]
- return (T, m)
+ sys.stdout.write('{}\n'.format(aligner.align(line.strip())))
+ aligner.close()
+
if __name__ == '__main__':
main()