summaryrefslogtreecommitdiff
path: root/realtime
diff options
context:
space:
mode:
Diffstat (limited to 'realtime')
-rwxr-xr-xrealtime/mkconfig.py25
-rw-r--r--realtime/rt/decoder.py6
-rw-r--r--realtime/rt/rt.py118
-rw-r--r--realtime/rt/util.py1
4 files changed, 115 insertions, 35 deletions
diff --git a/realtime/mkconfig.py b/realtime/mkconfig.py
index f193c57a..f9938051 100755
--- a/realtime/mkconfig.py
+++ b/realtime/mkconfig.py
@@ -12,11 +12,14 @@ from cdec.configobj import ConfigObj
def main():
- if len(sys.argv[1:]) != 12:
- sys.stderr.write('usage: {} a.fwd_params a.fwd_err a.rev_params a.rev_err sa sa.ini mono.klm libcdec_ff_hpyplm.so corpus.hpyplm cdec.ini weights.final output.d\n'.format(sys.argv[0]))
+ if len(sys.argv[1:]) not in (10, 12):
+ sys.stderr.write('usage: {} a.fwd_params a.fwd_err a.rev_params a.rev_err sa sa.ini mono.klm cdec.ini weights.final output.d [libcdec_ff_hpyplm.so corpus.hpyplm]\n'.format(sys.argv[0]))
sys.exit(2)
- (a_fwd_params, a_fwd_err, a_rev_params, a_rev_err, sa, sa_ini, mono_klm, libcdec_ff_hpyplm_so, corpus_hpyplm, cdec_ini, weights_final, output_d) = sys.argv[1:]
+ (a_fwd_params, a_fwd_err, a_rev_params, a_rev_err, sa, sa_ini, mono_klm, cdec_ini, weights_final, output_d) = sys.argv[1:11]
+
+ # Optional
+ (libcdec_ff_hpyplm_so, corpus_hpyplm) = sys.argv[11:13] if len(sys.argv[1:]) == 12 else (None, None)
if os.path.exists(output_d):
sys.stderr.write('Directory {} exists, exiting.\n'.format(output_d))
@@ -40,8 +43,10 @@ def main():
# language models
shutil.copy(mono_klm, os.path.join(output_d, 'mono.klm'))
- shutil.copy(libcdec_ff_hpyplm_so, os.path.join(output_d, 'libcdec_ff_hpyplm.so'))
- shutil.copy(corpus_hpyplm, os.path.join(output_d, 'corpus.hpyplm'))
+ if libcdec_ff_hpyplm_so:
+ shutil.copy(libcdec_ff_hpyplm_so, os.path.join(output_d, 'libcdec_ff_hpyplm.so'))
+ if corpus_hpyplm:
+ shutil.copy(corpus_hpyplm, os.path.join(output_d, 'corpus.hpyplm'))
# decoder config
config = [[f.strip() for f in line.split('=')] for line in open(cdec_ini)]
@@ -52,6 +57,16 @@ def main():
# weights
shutil.copy(weights_final, os.path.join(output_d, 'weights.final'))
+
+ # other options
+ rt_ini = os.path.join(output_d, 'rt.ini')
+ with open(rt_ini, 'w') as out:
+ if libcdec_ff_hpyplm_so and corpus_hpyplm:
+ out.write('hpyplm=true\n')
+ else:
+ out.write('hpyplm=false\n')
+ out.write('metric=ibm_bleu\n')
+ sys.stderr.write('IMPORTANT: add any additional options such as metric=meteor to {}\n'.format(rt_ini))
if __name__ == '__main__':
main()
diff --git a/realtime/rt/decoder.py b/realtime/rt/decoder.py
index ed45c248..1bdd3f1f 100644
--- a/realtime/rt/decoder.py
+++ b/realtime/rt/decoder.py
@@ -38,11 +38,11 @@ class CdecDecoder(Decoder):
class MIRADecoder(Decoder):
- def __init__(self, config, weights):
+ def __init__(self, config, weights, metric='ibm_bleu'):
cdec_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
mira = os.path.join(cdec_root, 'training', 'mira', 'kbest_cut_mira')
- # optimizer=2 step=0.001 best=500, k=500, uniq, stream
- mira_cmd = [mira, '-c', config, '-w', weights, '-o', '2', '-C', '0.001', '-b', '500', '-k', '500', '-u', '-t']
+ # optimizer=2 step=0.001 best=500, k=500, uniq, stream, metric
+ mira_cmd = [mira, '-c', config, '-w', weights, '-o', '2', '-C', '0.001', '-b', '500', '-k', '500', '-u', '-t', '-m', metric]
logger.info('Executing: {}'.format(' '.join(mira_cmd)))
self.decoder = util.popen_io(mira_cmd)
self.lock = util.FIFOLock()
diff --git a/realtime/rt/rt.py b/realtime/rt/rt.py
index c0aec410..70ed0c3c 100644
--- a/realtime/rt/rt.py
+++ b/realtime/rt/rt.py
@@ -5,6 +5,7 @@ import collections
import logging
import os
import shutil
+import signal
import StringIO
import subprocess
import sys
@@ -20,40 +21,94 @@ import util
# Dummy input token that is unlikely to appear in normalized data (but no fatal errors if it does)
LIKELY_OOV = '(OOV)'
+# For parsing rt.ini
+TRUE = ('true', 'True', 'TRUE')
+
logger = logging.getLogger('rt')
+class ExtractorWrapper:
+ '''Wrap cdec.sa.GrammarExtractor. Used to keep multiple instances of the extractor from causing Python to segfault.
+ Do not use directly unless you know what you're doing.'''
+
+ def __init__(self, config):
+ # Make sure pycdec is on PYTHONPATH
+ cdec_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+ pycdec = os.path.join(cdec_root, 'python')
+ env = os.environ.copy()
+ python_path = env.get('PYTHONPATH', '')
+ if 'cdec/python' not in python_path:
+ python_path = '{}:{}'.format(python_path, pycdec) if len(python_path) > 0 else pycdec
+ env['PYTHONPATH'] = python_path
+ # Start grammar extractor as separate process using stdio
+ cmd = ['python', '-m', 'cdec.sa.extract', '-o', '-z', '-c', config, '-t']
+ logger.info('Executing: {}'.format(' '.join(cmd)))
+ self.p = subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=env)
+ util.consume_stream(self.p.stderr)
+ self.lock = util.FIFOLock()
+
+ def close(self, force=False):
+ if not force:
+ self.lock.acquire()
+ self.p.stdin.close()
+ self.p.wait()
+ self.lock.release()
+ else:
+ os.kill(self.p.pid, signal.SIGTERM)
+
+
+ def drop_ctx(self, ctx_name):
+ self.lock.acquire()
+ self.p.stdin.write('{} ||| drop\n'.format(ctx_name))
+ self.p.stdout.readline()
+ self.lock.release()
+
+ def grammar(self, sentence, grammar_file, ctx_name):
+ self.lock.acquire()
+ self.p.stdin.write('{} ||| {} ||| {}\n'.format(ctx_name, sentence, grammar_file))
+ self.p.stdout.readline()
+ self.lock.release()
+
+ def add_instance(self, source, target, alignment, ctx_name):
+ self.lock.acquire()
+ self.p.stdin.write('{} ||| {} ||| {} ||| {}\n'.format(ctx_name, source, target, alignment))
+ self.p.stdout.readline()
+ self.lock.release()
+
class RealtimeDecoder:
'''Do not use directly unless you know what you're doing. Use RealtimeTranslator.'''
- def __init__(self, configdir, tmpdir):
-
+ def __init__(self, configdir, tmpdir, hpyplm=False, metric='ibm_bleu'):
+
cdec_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
self.tmp = tmpdir
os.mkdir(self.tmp)
# HPYPLM reference stream
- ref_fifo_file = os.path.join(self.tmp, 'ref.fifo')
- os.mkfifo(ref_fifo_file)
- self.ref_fifo = open(ref_fifo_file, 'w+')
- # Start with empty line (do not learn prior to first input)
- self.ref_fifo.write('\n')
- self.ref_fifo.flush()
+ self.hpyplm = hpyplm
+ if self.hpyplm:
+ ref_fifo_file = os.path.join(self.tmp, 'ref.fifo')
+ os.mkfifo(ref_fifo_file)
+ self.ref_fifo = open(ref_fifo_file, 'w+')
+ # Start with empty line (do not learn prior to first input)
+ self.ref_fifo.write('\n')
+ self.ref_fifo.flush()
# Decoder
decoder_config = [[f.strip() for f in line.split('=')] for line in open(os.path.join(configdir, 'cdec.ini'))]
- util.cdec_ini_for_realtime(decoder_config, os.path.abspath(configdir), ref_fifo_file)
+ util.cdec_ini_for_realtime(decoder_config, os.path.abspath(configdir), ref_fifo_file if self.hpyplm else None)
decoder_config_file = os.path.join(self.tmp, 'cdec.ini')
with open(decoder_config_file, 'w') as output:
for (k, v) in decoder_config:
output.write('{}={}\n'.format(k, v))
decoder_weights = os.path.join(configdir, 'weights.final')
- self.decoder = decoder.MIRADecoder(decoder_config_file, decoder_weights)
+ self.decoder = decoder.MIRADecoder(decoder_config_file, decoder_weights, metric=metric)
def close(self, force=False):
logger.info('Closing decoder and removing {}'.format(self.tmp))
self.decoder.close(force)
- self.ref_fifo.close()
+ if self.hpyplm:
+ self.ref_fifo.close()
shutil.rmtree(self.tmp)
class RealtimeTranslator:
@@ -73,6 +128,11 @@ class RealtimeTranslator:
cdec_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+ # rt.ini options
+ ini = dict(line.strip().split('=') for line in open(os.path.join(configdir, 'rt.ini')))
+ self.hpyplm = (ini.get('hpyplm', 'false') in TRUE)
+ self.metric = ini.get('metric', 'ibm_bleu')
+
### Single instance for all contexts
self.config = configdir
@@ -100,7 +160,7 @@ class RealtimeTranslator:
sa_config.filename = os.path.join(self.tmp, 'sa.ini')
util.sa_ini_for_realtime(sa_config, os.path.abspath(configdir))
sa_config.write()
- self.extractor = cdec.sa.GrammarExtractor(sa_config.filename, online=True)
+ self.extractor = ExtractorWrapper(sa_config.filename)
self.cache_size = cache_size
### One instance per context
@@ -131,10 +191,13 @@ class RealtimeTranslator:
'''Cleanup'''
if force:
logger.info('Forced shutdown: stopping immediately')
- for ctx_name in list(self.ctx_names):
- self.drop_ctx(ctx_name, force)
+ # Drop contexts before closing processes unless forced
+ if not force:
+ for ctx_name in list(self.ctx_names):
+ self.drop_ctx(ctx_name, force)
logger.info('Closing processes')
self.aligner.close(force)
+ self.extractor.close(force)
if self.norm:
if not force:
self.tokenizer_lock.acquire()
@@ -160,7 +223,7 @@ class RealtimeTranslator:
self.grammar_files[ctx_name] = collections.deque()
self.grammar_dict[ctx_name] = {}
tmpdir = os.path.join(self.tmp, 'decoder.{}'.format(ctx_name))
- self.decoders[ctx_name] = RealtimeDecoder(self.config, tmpdir)
+ self.decoders[ctx_name] = RealtimeDecoder(self.config, tmpdir, hpyplm=self.hpyplm, metric=self.metric)
def drop_ctx(self, ctx_name=None, force=False):
'''Delete a context (inc stopping the decoder)
@@ -202,11 +265,9 @@ class RealtimeTranslator:
self.extractor_lock.release()
return grammar_file
# Extract and cache
- (fid, grammar_file) = tempfile.mkstemp(dir=self.decoders[ctx_name].tmp, prefix='grammar.')
+ (fid, grammar_file) = tempfile.mkstemp(dir=self.decoders[ctx_name].tmp, prefix='grammar.', suffix='.gz')
os.close(fid)
- with open(grammar_file, 'w') as output:
- for rule in self.extractor.grammar(sentence, ctx_name):
- output.write('{}\n'.format(str(rule)))
+ self.extractor.grammar(sentence, grammar_file, ctx_name)
grammar_files = self.grammar_files[ctx_name]
if len(grammar_files) == self.cache_size:
rm_sent = grammar_files.popleft()
@@ -239,8 +300,9 @@ class RealtimeTranslator:
stop_time = time.time()
logger.info('({}) Translation time: {} seconds'.format(ctx_name, stop_time - start_time))
# Empty reference: HPYPLM does not learn prior to next translation
- decoder.ref_fifo.write('\n')
- decoder.ref_fifo.flush()
+ if self.hpyplm:
+ decoder.ref_fifo.write('\n')
+ decoder.ref_fifo.flush()
if self.norm:
logger.info('({}) Normalized translation: {}'.format(ctx_name, hyp))
hyp = self.detokenize(hyp)
@@ -301,9 +363,10 @@ class RealtimeTranslator:
mira_log = decoder.decoder.update(source, grammar_file, target)
logger.info('({}) MIRA HBF: {}'.format(ctx_name, mira_log))
# Add to HPYPLM by writing to fifo (read on next translation)
- logger.info('({}) Adding to HPYPLM: {}'.format(ctx_name, target))
- decoder.ref_fifo.write('{}\n'.format(target))
- decoder.ref_fifo.flush()
+ if self.hpyplm:
+ logger.info('({}) Adding to HPYPLM: {}'.format(ctx_name, target))
+ decoder.ref_fifo.write('{}\n'.format(target))
+ decoder.ref_fifo.flush()
# Store incremental data for save/load
self.ctx_data[ctx_name].append((source, target, alignment))
# Add aligned sentence pair to grammar extractor
@@ -381,9 +444,10 @@ class RealtimeTranslator:
# Extractor
self.extractor.add_instance(source, target, alignment, ctx_name)
# HPYPLM
- hyp = decoder.decoder.decode(LIKELY_OOV)
- decoder.ref_fifo.write('{}\n'.format(target))
- decoder.ref_fifo.flush()
+ if self.hpyplm:
+ hyp = decoder.decoder.decode(LIKELY_OOV)
+ decoder.ref_fifo.write('{}\n'.format(target))
+ decoder.ref_fifo.flush()
stop_time = time.time()
logger.info('({}) Loaded state with {} sentences in {} seconds'.format(ctx_name, len(ctx_data), stop_time - start_time))
lock.release()
diff --git a/realtime/rt/util.py b/realtime/rt/util.py
index a7333bbd..8cd41a1e 100644
--- a/realtime/rt/util.py
+++ b/realtime/rt/util.py
@@ -10,6 +10,7 @@ SA_INI_FILES = set((
'a_file',
'lex_file',
'precompute_file',
+ 'bilex_file',
))
class FIFOLock: