MIRA updates in realtime.py

author: Michael Denkowski <mdenkows@cs.cmu.edu> 2013-09-03 12:13:03 -0700
committer: Michael Denkowski <mdenkows@cs.cmu.edu> 2013-09-03 12:13:03 -0700
commit: def48bde959fa932cbe87228dc84afc8e635b49b (patch)
tree: c4e028f3934ac0a581bc0c21e41204a80e385ddb
parent: e6f15583081547a4adc2fca7f2ed96cb515a48f5 (diff)
4 files changed, 54 insertions, 11 deletions
diff --git a/realtime/realtime.py b/realtime/realtime.py
index c169ce4c..1f67bed7 100755
--- a/realtime/realtime.py
+++ b/realtime/realtime.py
@@ -1,21 +1,24 @@
 #!/usr/bin/env python
+
 import argparse
-import os
+import collections
 import logging
-import cdec.configobj
-import cdec.sa
+import os
 import shutil
 import sys
 import subprocess
 import tempfile
 import time
 
+import cdec.configobj
+import cdec.sa
+
 from rt import ForceAligner
 from rt import MIRADecoder
 
 class RealtimeDecoder:
 
-    def __init__(self, configdir, tmpdir='/tmp'):
+    def __init__(self, configdir, tmpdir='/tmp', cache_size=5):
 
         # Temporary work dir
         self.tmp = tempfile.mkdtemp(dir=tmpdir)
@@ -31,6 +34,9 @@ class RealtimeDecoder:
         # Grammar extractor
         sa_config = os.path.join(configdir, 'sa.ini')
         self.extractor = cdec.sa.GrammarExtractor(sa_config, online=True)
+        self.grammar_files = collections.deque()
+        self.grammar_dict = {}
+        self.cache_size = cache_size
 
         # Decoder
         decoder_config = os.path.join(configdir, 'cdec.ini')
@@ -45,10 +51,24 @@ class RealtimeDecoder:
         shutil.rmtree(self.tmp)
 
     def grammar(self, sentence):
+        grammar_file = self.grammar_dict.get(sentence, None)
+        # Cache hit
+        if grammar_file:
+            logging.info('Grammar cache hit')
+            return grammar_file
+        # Extract and cache
         grammar_file = tempfile.mkstemp(dir=self.tmp)[1]
         with open(grammar_file, 'w') as output:
             for rule in self.extractor.grammar(sentence):
                 output.write(str(rule) + '\n')
+        if len(self.grammar_files) == self.cache_size:
+            rm_sent = self.grammar_files.popleft()
+            # If not already removed by learn method
+            if rm_sent in self.grammar_dict:
+                rm_grammar = self.grammar_dict.pop(rm_sent)
+                os.remove(rm_grammar)
+        self.grammar_files.append(sentence)
+        self.grammar_dict[sentence] = grammar_file
         return grammar_file
         
     def decode(self, sentence):
@@ -57,15 +77,21 @@ class RealtimeDecoder:
         hyp = self.decoder.decode(sentence, grammar_file)
         stop_time = time.time()
         logging.info('Translation time: {} seconds'.format(stop_time - start_time))
-        os.remove(grammar_file)
         return hyp
 
     def learn(self, source, target):
-        alignment = self.aligner.align('{} ||| {}'.format(source, target))
+        # MIRA update before adding data to grammar extractor
+        grammar_file = self.grammar(source)
+        mira_log = self.decoder.update(source, grammar_file, target)
+        logging.info('MIRA: {}'.format(mira_log))
+        # Add aligned sentence pair to grammar extractor
+        alignment = self.aligner.align(source, target)
         logging.info('Adding instance: {} ||| {} ||| {}'.format(source, target, alignment))
         self.extractor.add_instance(source, target, alignment)
-        # TODO: Add to LM
-        # TODO: MIRA update
+        # Clear (old) cached grammar
+        rm_grammar = self.grammar_dict.pop(source)
+        os.remove(rm_grammar)
+        # TODO: Add to LM by writing to fifo
 
 def main():
 
@@ -84,7 +110,10 @@ def main():
     rtd = RealtimeDecoder(args.config)
 
     try:
-        for line in sys.stdin:
+        while True:
+            line = sys.stdin.readline()
+            if not line:
+                break
             input = [f.strip() for f in line.split('|||')]
             if len(input) == 1:
                 hyp = rtd.decode(input[0])
diff --git a/realtime/rt/aligner.py b/realtime/rt/aligner.py
index d94dbda0..4a0ace48 100644
--- a/realtime/rt/aligner.py
+++ b/realtime/rt/aligner.py
@@ -23,7 +23,10 @@ class ForceAligner:
         self.rev_align = util.popen_io(rev_cmd)
         self.tools = util.popen_io(tools_cmd)
 
-    def align(self, line):
+    def align(self, source, target):
+        return self.align_formatted('{} ||| {}'.format(source, target))
+
+    def align_formatted(self, line):
         self.fwd_align.stdin.write('{}\n'.format(line))
         self.rev_align.stdin.write('{}\n'.format(line))
         # f words ||| e words ||| links ||| score
diff --git a/realtime/rt/decoder.py b/realtime/rt/decoder.py
index 786bc07a..6bbef6f2 100644
--- a/realtime/rt/decoder.py
+++ b/realtime/rt/decoder.py
@@ -9,7 +9,7 @@ class Decoder:
         self.decoder.stdin.close()
 
     def decode(self, sentence, grammar):
-        input = '<seg grammar="{g}">{s}</seg>\n'.format(i=id, s=sentence, g=grammar)
+        input = '<seg grammar="{g}">{s}</seg>\n'.format(s=sentence, g=grammar)
         self.decoder.stdin.write(input)
         return self.decoder.stdout.readline().strip()
 
@@ -29,3 +29,8 @@ class MIRADecoder(Decoder):
         #                                              optimizer=2 step=0.001    best=500,    k=500,       uniq, stream
         mira_cmd = [mira, '-c', config, '-w', weights, '-o', '2', '-C', '0.001', '-b', '500', '-k', '500', '-u', '-t']
         self.decoder = util.popen_io(mira_cmd)
+
+    def update(self, sentence, grammar, reference):
+        input = '<seg grammar="{g}">{s}</seg> ||| {r}\n'.format(s=sentence, g=grammar, r=reference)
+        self.decoder.stdin.write(input)
+        return self.decoder.stdout.readline().strip()
diff --git a/realtime/rt/util.py b/realtime/rt/util.py
index 7f877161..263e33fb 100644
--- a/realtime/rt/util.py
+++ b/realtime/rt/util.py
@@ -1,4 +1,5 @@
 import subprocess
+import sys
 import threading
 
 def popen_io(cmd):
@@ -6,6 +7,11 @@ def popen_io(cmd):
     consume_stream(p.stderr)
     return p
 
+def popen_io_v(cmd):
+    sys.stderr.write('{}\n'.format(' '.join(cmd)))
+    p = subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
+    return p
+
 def consume_stream(stream):
     def consume(s):
         for _ in s:
author	Michael Denkowski <mdenkows@cs.cmu.edu>	2013-09-03 12:13:03 -0700
committer	Michael Denkowski <mdenkows@cs.cmu.edu>	2013-09-03 12:13:03 -0700
commit	def48bde959fa932cbe87228dc84afc8e635b49b (patch)
tree	c4e028f3934ac0a581bc0c21e41204a80e385ddb
parent	e6f15583081547a4adc2fca7f2ed96cb515a48f5 (diff)