From eec45e082f4261871bb6547a14511d2c722e3f59 Mon Sep 17 00:00:00 2001 From: Michael Denkowski Date: Tue, 3 Sep 2013 12:13:03 -0700 Subject: MIRA updates in realtime.py --- realtime/realtime.py | 47 ++++++++++++++++++++++++++++++++++++++--------- realtime/rt/aligner.py | 5 ++++- realtime/rt/decoder.py | 7 ++++++- realtime/rt/util.py | 6 ++++++ 4 files changed, 54 insertions(+), 11 deletions(-) diff --git a/realtime/realtime.py b/realtime/realtime.py index c169ce4c..1f67bed7 100755 --- a/realtime/realtime.py +++ b/realtime/realtime.py @@ -1,21 +1,24 @@ #!/usr/bin/env python + import argparse -import os +import collections import logging -import cdec.configobj -import cdec.sa +import os import shutil import sys import subprocess import tempfile import time +import cdec.configobj +import cdec.sa + from rt import ForceAligner from rt import MIRADecoder class RealtimeDecoder: - def __init__(self, configdir, tmpdir='/tmp'): + def __init__(self, configdir, tmpdir='/tmp', cache_size=5): # Temporary work dir self.tmp = tempfile.mkdtemp(dir=tmpdir) @@ -31,6 +34,9 @@ class RealtimeDecoder: # Grammar extractor sa_config = os.path.join(configdir, 'sa.ini') self.extractor = cdec.sa.GrammarExtractor(sa_config, online=True) + self.grammar_files = collections.deque() + self.grammar_dict = {} + self.cache_size = cache_size # Decoder decoder_config = os.path.join(configdir, 'cdec.ini') @@ -45,10 +51,24 @@ class RealtimeDecoder: shutil.rmtree(self.tmp) def grammar(self, sentence): + grammar_file = self.grammar_dict.get(sentence, None) + # Cache hit + if grammar_file: + logging.info('Grammar cache hit') + return grammar_file + # Extract and cache grammar_file = tempfile.mkstemp(dir=self.tmp)[1] with open(grammar_file, 'w') as output: for rule in self.extractor.grammar(sentence): output.write(str(rule) + '\n') + if len(self.grammar_files) == self.cache_size: + rm_sent = self.grammar_files.popleft() + # If not already removed by learn method + if rm_sent in self.grammar_dict: + rm_grammar = self.grammar_dict.pop(rm_sent) + os.remove(rm_grammar) + self.grammar_files.append(sentence) + self.grammar_dict[sentence] = grammar_file return grammar_file def decode(self, sentence): @@ -57,15 +77,21 @@ class RealtimeDecoder: hyp = self.decoder.decode(sentence, grammar_file) stop_time = time.time() logging.info('Translation time: {} seconds'.format(stop_time - start_time)) - os.remove(grammar_file) return hyp def learn(self, source, target): - alignment = self.aligner.align('{} ||| {}'.format(source, target)) + # MIRA update before adding data to grammar extractor + grammar_file = self.grammar(source) + mira_log = self.decoder.update(source, grammar_file, target) + logging.info('MIRA: {}'.format(mira_log)) + # Add aligned sentence pair to grammar extractor + alignment = self.aligner.align(source, target) logging.info('Adding instance: {} ||| {} ||| {}'.format(source, target, alignment)) self.extractor.add_instance(source, target, alignment) - # TODO: Add to LM - # TODO: MIRA update + # Clear (old) cached grammar + rm_grammar = self.grammar_dict.pop(source) + os.remove(rm_grammar) + # TODO: Add to LM by writing to fifo def main(): @@ -84,7 +110,10 @@ def main(): rtd = RealtimeDecoder(args.config) try: - for line in sys.stdin: + while True: + line = sys.stdin.readline() + if not line: + break input = [f.strip() for f in line.split('|||')] if len(input) == 1: hyp = rtd.decode(input[0]) diff --git a/realtime/rt/aligner.py b/realtime/rt/aligner.py index d94dbda0..4a0ace48 100644 --- a/realtime/rt/aligner.py +++ b/realtime/rt/aligner.py @@ -23,7 +23,10 @@ class ForceAligner: self.rev_align = util.popen_io(rev_cmd) self.tools = util.popen_io(tools_cmd) - def align(self, line): + def align(self, source, target): + return self.align_formatted('{} ||| {}'.format(source, target)) + + def align_formatted(self, line): self.fwd_align.stdin.write('{}\n'.format(line)) self.rev_align.stdin.write('{}\n'.format(line)) # f words ||| e words ||| links ||| score diff --git a/realtime/rt/decoder.py b/realtime/rt/decoder.py index 786bc07a..6bbef6f2 100644 --- a/realtime/rt/decoder.py +++ b/realtime/rt/decoder.py @@ -9,7 +9,7 @@ class Decoder: self.decoder.stdin.close() def decode(self, sentence, grammar): - input = '{s}\n'.format(i=id, s=sentence, g=grammar) + input = '{s}\n'.format(s=sentence, g=grammar) self.decoder.stdin.write(input) return self.decoder.stdout.readline().strip() @@ -29,3 +29,8 @@ class MIRADecoder(Decoder): # optimizer=2 step=0.001 best=500, k=500, uniq, stream mira_cmd = [mira, '-c', config, '-w', weights, '-o', '2', '-C', '0.001', '-b', '500', '-k', '500', '-u', '-t'] self.decoder = util.popen_io(mira_cmd) + + def update(self, sentence, grammar, reference): + input = '{s} ||| {r}\n'.format(s=sentence, g=grammar, r=reference) + self.decoder.stdin.write(input) + return self.decoder.stdout.readline().strip() diff --git a/realtime/rt/util.py b/realtime/rt/util.py index 7f877161..263e33fb 100644 --- a/realtime/rt/util.py +++ b/realtime/rt/util.py @@ -1,4 +1,5 @@ import subprocess +import sys import threading def popen_io(cmd): @@ -6,6 +7,11 @@ def popen_io(cmd): consume_stream(p.stderr) return p +def popen_io_v(cmd): + sys.stderr.write('{}\n'.format(' '.join(cmd))) + p = subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE) + return p + def consume_stream(stream): def consume(s): for _ in s: -- cgit v1.2.3