From eec45e082f4261871bb6547a14511d2c722e3f59 Mon Sep 17 00:00:00 2001
From: Michael Denkowski
Date: Tue, 3 Sep 2013 12:13:03 -0700
Subject: MIRA updates in realtime.py

---
 realtime/realtime.py | 47 ++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 38 insertions(+), 9 deletions(-)

(limited to 'realtime/realtime.py')

diff --git a/realtime/realtime.py b/realtime/realtime.py
index c169ce4c..1f67bed7 100755
--- a/realtime/realtime.py
+++ b/realtime/realtime.py
@@ -1,21 +1,24 @@
 #!/usr/bin/env python
+
 import argparse
-import os
+import collections
 import logging
-import cdec.configobj
-import cdec.sa
+import os
 import shutil
 import sys
 import subprocess
 import tempfile
 import time
 
+import cdec.configobj
+import cdec.sa
+
 from rt import ForceAligner
 from rt import MIRADecoder
 
 class RealtimeDecoder:
 
-    def __init__(self, configdir, tmpdir='/tmp'):
+    def __init__(self, configdir, tmpdir='/tmp', cache_size=5):
 
         # Temporary work dir
         self.tmp = tempfile.mkdtemp(dir=tmpdir)
@@ -31,6 +34,9 @@ class RealtimeDecoder:
         # Grammar extractor
         sa_config = os.path.join(configdir, 'sa.ini')
         self.extractor = cdec.sa.GrammarExtractor(sa_config, online=True)
+        self.grammar_files = collections.deque()
+        self.grammar_dict = {}
+        self.cache_size = cache_size
 
         # Decoder
         decoder_config = os.path.join(configdir, 'cdec.ini')
@@ -45,10 +51,24 @@ class RealtimeDecoder:
         shutil.rmtree(self.tmp)
 
     def grammar(self, sentence):
+        grammar_file = self.grammar_dict.get(sentence, None)
+        # Cache hit
+        if grammar_file:
+            logging.info('Grammar cache hit')
+            return grammar_file
+        # Extract and cache
         grammar_file = tempfile.mkstemp(dir=self.tmp)[1]
         with open(grammar_file, 'w') as output:
             for rule in self.extractor.grammar(sentence):
                 output.write(str(rule) + '\n')
+        if len(self.grammar_files) == self.cache_size:
+            rm_sent = self.grammar_files.popleft()
+            # If not already removed by learn method
+            if rm_sent in self.grammar_dict:
+                rm_grammar = self.grammar_dict.pop(rm_sent)
+                os.remove(rm_grammar)
+        self.grammar_files.append(sentence)
+        self.grammar_dict[sentence] = grammar_file
         return grammar_file
 
     def decode(self, sentence):
@@ -57,15 +77,21 @@ class RealtimeDecoder:
         hyp = self.decoder.decode(sentence, grammar_file)
         stop_time = time.time()
         logging.info('Translation time: {} seconds'.format(stop_time - start_time))
-        os.remove(grammar_file)
         return hyp
 
     def learn(self, source, target):
-        alignment = self.aligner.align('{} ||| {}'.format(source, target))
+        # MIRA update before adding data to grammar extractor
+        grammar_file = self.grammar(source)
+        mira_log = self.decoder.update(source, grammar_file, target)
+        logging.info('MIRA: {}'.format(mira_log))
+        # Add aligned sentence pair to grammar extractor
+        alignment = self.aligner.align(source, target)
         logging.info('Adding instance: {} ||| {} ||| {}'.format(source, target, alignment))
         self.extractor.add_instance(source, target, alignment)
-        # TODO: Add to LM
-        # TODO: MIRA update
+        # Clear (old) cached grammar
+        rm_grammar = self.grammar_dict.pop(source)
+        os.remove(rm_grammar)
+        # TODO: Add to LM by writing to fifo
 
 
 def main():
@@ -84,7 +110,10 @@ def main():
     rtd = RealtimeDecoder(args.config)
 
     try:
-        for line in sys.stdin:
+        while True:
+            line = sys.stdin.readline()
+            if not line:
+                break
             input = [f.strip() for f in line.split('|||')]
             if len(input) == 1:
                 hyp = rtd.decode(input[0])
-- 
cgit v1.2.3