Merge remote-tracking branch 'upstream/master'

author: Patrick Simianer <p@simianer.de> 2014-06-12 13:56:42 +0200
committer: Patrick Simianer <p@simianer.de> 2014-06-12 13:56:42 +0200
commit: 244971287003d079e46193b8a209c28955f90134 (patch)
tree: 8beaae6b12b913acb213fc7f2415fd63886192f9 /realtime/rt/rt.py
parent: 5250fd67a4b8f242068cff87f0a6a4211f8b0fcf (diff)
parent: b66e838ed52decc0be1eb5817b2a77c3840db2c5 (diff)
1 files changed, 91 insertions, 27 deletions
diff --git a/realtime/rt/rt.py b/realtime/rt/rt.py
index c0aec410..70ed0c3c 100644
--- a/realtime/rt/rt.py
+++ b/realtime/rt/rt.py
@@ -5,6 +5,7 @@ import collections
 import logging
 import os
 import shutil
+import signal
 import StringIO
 import subprocess
 import sys
@@ -20,40 +21,94 @@ import util
 # Dummy input token that is unlikely to appear in normalized data (but no fatal errors if it does)
 LIKELY_OOV = '(OOV)'
 
+# For parsing rt.ini
+TRUE = ('true', 'True', 'TRUE')
+
 logger = logging.getLogger('rt')
 
+class ExtractorWrapper:
+    '''Wrap cdec.sa.GrammarExtractor.  Used to keep multiple instances of the extractor from causing Python to segfault.
+       Do not use directly unless you know what you're doing.'''
+
+    def __init__(self, config):
+        # Make sure pycdec is on PYTHONPATH
+        cdec_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+        pycdec = os.path.join(cdec_root, 'python')
+        env = os.environ.copy()
+        python_path = env.get('PYTHONPATH', '')
+        if 'cdec/python' not in python_path:
+            python_path = '{}:{}'.format(python_path, pycdec) if len(python_path) > 0 else pycdec
+            env['PYTHONPATH'] = python_path
+        # Start grammar extractor as separate process using stdio
+        cmd = ['python', '-m', 'cdec.sa.extract', '-o', '-z', '-c', config, '-t']
+        logger.info('Executing: {}'.format(' '.join(cmd)))
+        self.p = subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=env)
+        util.consume_stream(self.p.stderr)
+        self.lock = util.FIFOLock()
+
+    def close(self, force=False):
+        if not force:
+            self.lock.acquire()
+            self.p.stdin.close()
+            self.p.wait()
+            self.lock.release()
+        else:
+            os.kill(self.p.pid, signal.SIGTERM)
+            
+
+    def drop_ctx(self, ctx_name):
+        self.lock.acquire()
+        self.p.stdin.write('{} ||| drop\n'.format(ctx_name))
+        self.p.stdout.readline()
+        self.lock.release()
+
+    def grammar(self, sentence, grammar_file, ctx_name):
+        self.lock.acquire()
+        self.p.stdin.write('{} ||| {} ||| {}\n'.format(ctx_name, sentence, grammar_file))
+        self.p.stdout.readline()
+        self.lock.release()
+
+    def add_instance(self, source, target, alignment, ctx_name):
+        self.lock.acquire()
+        self.p.stdin.write('{} ||| {} ||| {} ||| {}\n'.format(ctx_name, source, target, alignment))
+        self.p.stdout.readline()
+        self.lock.release()
+
 class RealtimeDecoder:
     '''Do not use directly unless you know what you're doing.  Use RealtimeTranslator.'''
 
-    def __init__(self, configdir, tmpdir):
-
+    def __init__(self, configdir, tmpdir, hpyplm=False, metric='ibm_bleu'):
+    
         cdec_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 
         self.tmp = tmpdir
         os.mkdir(self.tmp)
 
         # HPYPLM reference stream
-        ref_fifo_file = os.path.join(self.tmp, 'ref.fifo')
-        os.mkfifo(ref_fifo_file)
-        self.ref_fifo = open(ref_fifo_file, 'w+')
-        # Start with empty line (do not learn prior to first input)
-        self.ref_fifo.write('\n')
-        self.ref_fifo.flush()
+        self.hpyplm = hpyplm
+        if self.hpyplm:
+            ref_fifo_file = os.path.join(self.tmp, 'ref.fifo')
+            os.mkfifo(ref_fifo_file)
+            self.ref_fifo = open(ref_fifo_file, 'w+')
+            # Start with empty line (do not learn prior to first input)
+            self.ref_fifo.write('\n')
+            self.ref_fifo.flush()
 
         # Decoder
         decoder_config = [[f.strip() for f in line.split('=')] for line in open(os.path.join(configdir, 'cdec.ini'))]
-        util.cdec_ini_for_realtime(decoder_config, os.path.abspath(configdir), ref_fifo_file)
+        util.cdec_ini_for_realtime(decoder_config, os.path.abspath(configdir), ref_fifo_file if self.hpyplm else None)
         decoder_config_file = os.path.join(self.tmp, 'cdec.ini')
         with open(decoder_config_file, 'w') as output:
             for (k, v) in decoder_config:
                 output.write('{}={}\n'.format(k, v))
         decoder_weights = os.path.join(configdir, 'weights.final')
-        self.decoder = decoder.MIRADecoder(decoder_config_file, decoder_weights)
+        self.decoder = decoder.MIRADecoder(decoder_config_file, decoder_weights, metric=metric)
 
     def close(self, force=False):
         logger.info('Closing decoder and removing {}'.format(self.tmp))
         self.decoder.close(force)
-        self.ref_fifo.close()
+        if self.hpyplm:
+            self.ref_fifo.close()
         shutil.rmtree(self.tmp)
 
 class RealtimeTranslator:
@@ -73,6 +128,11 @@ class RealtimeTranslator:
         
         cdec_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 
+        # rt.ini options
+        ini = dict(line.strip().split('=') for line in open(os.path.join(configdir, 'rt.ini')))
+        self.hpyplm = (ini.get('hpyplm', 'false') in TRUE)
+        self.metric = ini.get('metric', 'ibm_bleu')
+        
         ### Single instance for all contexts
 
         self.config = configdir
@@ -100,7 +160,7 @@ class RealtimeTranslator:
         sa_config.filename = os.path.join(self.tmp, 'sa.ini')
         util.sa_ini_for_realtime(sa_config, os.path.abspath(configdir))
         sa_config.write()
-        self.extractor = cdec.sa.GrammarExtractor(sa_config.filename, online=True)
+        self.extractor = ExtractorWrapper(sa_config.filename)
         self.cache_size = cache_size
 
         ### One instance per context
@@ -131,10 +191,13 @@ class RealtimeTranslator:
         '''Cleanup'''
         if force:
             logger.info('Forced shutdown: stopping immediately')
-        for ctx_name in list(self.ctx_names):
-            self.drop_ctx(ctx_name, force)
+        # Drop contexts before closing processes unless forced
+        if not force:
+            for ctx_name in list(self.ctx_names):
+                self.drop_ctx(ctx_name, force)
         logger.info('Closing processes')
         self.aligner.close(force)
+        self.extractor.close(force)
         if self.norm:
             if not force:
                 self.tokenizer_lock.acquire()
@@ -160,7 +223,7 @@ class RealtimeTranslator:
         self.grammar_files[ctx_name] = collections.deque()
         self.grammar_dict[ctx_name] = {}
         tmpdir = os.path.join(self.tmp, 'decoder.{}'.format(ctx_name))
-        self.decoders[ctx_name] = RealtimeDecoder(self.config, tmpdir)
+        self.decoders[ctx_name] = RealtimeDecoder(self.config, tmpdir, hpyplm=self.hpyplm, metric=self.metric)
 
     def drop_ctx(self, ctx_name=None, force=False):
         '''Delete a context (inc stopping the decoder)
@@ -202,11 +265,9 @@ class RealtimeTranslator:
             self.extractor_lock.release()
             return grammar_file
         # Extract and cache
-        (fid, grammar_file) = tempfile.mkstemp(dir=self.decoders[ctx_name].tmp, prefix='grammar.')
+        (fid, grammar_file) = tempfile.mkstemp(dir=self.decoders[ctx_name].tmp, prefix='grammar.', suffix='.gz')
         os.close(fid)
-        with open(grammar_file, 'w') as output:
-            for rule in self.extractor.grammar(sentence, ctx_name):
-                output.write('{}\n'.format(str(rule)))
+        self.extractor.grammar(sentence, grammar_file, ctx_name)
         grammar_files = self.grammar_files[ctx_name]
         if len(grammar_files) == self.cache_size:
             rm_sent = grammar_files.popleft()
@@ -239,8 +300,9 @@ class RealtimeTranslator:
         stop_time = time.time()
         logger.info('({}) Translation time: {} seconds'.format(ctx_name, stop_time - start_time))
         # Empty reference: HPYPLM does not learn prior to next translation
-        decoder.ref_fifo.write('\n')
-        decoder.ref_fifo.flush()
+        if self.hpyplm:
+            decoder.ref_fifo.write('\n')
+            decoder.ref_fifo.flush()
         if self.norm:
             logger.info('({}) Normalized translation: {}'.format(ctx_name, hyp))
             hyp = self.detokenize(hyp)
@@ -301,9 +363,10 @@ class RealtimeTranslator:
         mira_log = decoder.decoder.update(source, grammar_file, target)
         logger.info('({}) MIRA HBF: {}'.format(ctx_name, mira_log))
         # Add to HPYPLM by writing to fifo (read on next translation)
-        logger.info('({}) Adding to HPYPLM: {}'.format(ctx_name, target))
-        decoder.ref_fifo.write('{}\n'.format(target))
-        decoder.ref_fifo.flush()
+        if self.hpyplm:
+            logger.info('({}) Adding to HPYPLM: {}'.format(ctx_name, target))
+            decoder.ref_fifo.write('{}\n'.format(target))
+            decoder.ref_fifo.flush()
         # Store incremental data for save/load
         self.ctx_data[ctx_name].append((source, target, alignment))
         # Add aligned sentence pair to grammar extractor
@@ -381,9 +444,10 @@ class RealtimeTranslator:
                 # Extractor
                 self.extractor.add_instance(source, target, alignment, ctx_name)
                 # HPYPLM
-                hyp = decoder.decoder.decode(LIKELY_OOV)
-                decoder.ref_fifo.write('{}\n'.format(target))
-                decoder.ref_fifo.flush()
+                if self.hpyplm:
+                    hyp = decoder.decoder.decode(LIKELY_OOV)
+                    decoder.ref_fifo.write('{}\n'.format(target))
+                    decoder.ref_fifo.flush()
             stop_time = time.time()
             logger.info('({}) Loaded state with {} sentences in {} seconds'.format(ctx_name, len(ctx_data), stop_time - start_time))
             lock.release()
author	Patrick Simianer <p@simianer.de>	2014-06-12 13:56:42 +0200
committer	Patrick Simianer <p@simianer.de>	2014-06-12 13:56:42 +0200
commit	244971287003d079e46193b8a209c28955f90134 (patch)
tree	8beaae6b12b913acb213fc7f2415fd63886192f9 /realtime/rt/rt.py
parent	5250fd67a4b8f242068cff87f0a6a4211f8b0fcf (diff)
parent	b66e838ed52decc0be1eb5817b2a77c3840db2c5 (diff)