summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Michael Denkowski <mdenkows@cs.cmu.edu> 2013-09-04 12:27:22 -0700
committer Michael Denkowski <mdenkows@cs.cmu.edu> 2013-09-04 12:27:22 -0700
commit 40eac315f63b018eec10da4124b801869cd788f5 (patch)
tree 54ba29aa4b5ed47ee305d8238f8a52a91ecdc521
parent 7c583986ab4774480d45ada79a812c9c8853296b (diff)
Infrastructure for HPYPLM, config file management.
-rwxr-xr-x realtime/mkconfig.py 6
-rw-r--r-- realtime/rt/aligner.py 6
-rw-r--r-- realtime/rt/decoder.py 3
-rw-r--r-- realtime/rt/rt.py 36
-rw-r--r-- realtime/rt/util.py 68
5 files changed, 83 insertions, 36 deletions
diff --git a/realtime/mkconfig.py b/realtime/mkconfig.py
index 954283ce..32388978 100755
--- a/realtime/mkconfig.py
+++ b/realtime/mkconfig.py
@@ -31,9 +31,9 @@ def main():
# grammar extractor
shutil.copytree(sa, os.path.join(output_d, 'sa'))
- config = ConfigObj(sa_ini)
+ config = ConfigObj(sa_ini, unrepr=True)
config.filename = os.path.join(output_d, 'sa.ini')
- rt.util.sa_ini_basename(config)
+ rt.util.sa_ini_for_config(config)
config.write()
# language models
@@ -43,7 +43,7 @@ def main():
# decoder config
config = [[f.strip() for f in line.split('=')] for line in open(cdec_ini)]
- rt.util.cdec_ini_basename(config)
+ rt.util.cdec_ini_for_config(config)
with open(os.path.join(output_d, 'cdec.ini'), 'w') as output:
for (k, v) in config:
output.write('{}={}\n'.format(k, v))
diff --git a/realtime/rt/aligner.py b/realtime/rt/aligner.py
index 4a0ace48..3c6ea144 100644
--- a/realtime/rt/aligner.py
+++ b/realtime/rt/aligner.py
@@ -1,3 +1,4 @@
+import logging
import os
import sys
import subprocess
@@ -19,8 +20,13 @@ class ForceAligner:
rev_cmd = [fast_align, '-i', '-', '-d', '-T', rev_T, '-m', rev_m, '-f', rev_params, '-r']
tools_cmd = [atools, '-i', '-', '-j', '-', '-c', 'grow-diag-final-and']
+ logging.info('Executing: {}'.format(' '.join(fwd_cmd)))
self.fwd_align = util.popen_io(fwd_cmd)
+
+ logging.info('Executing: {}'.format(' '.join(rev_cmd)))
self.rev_align = util.popen_io(rev_cmd)
+
+ logging.info('Executing: {}'.format(' '.join(tools_cmd)))
self.tools = util.popen_io(tools_cmd)
def align(self, source, target):
diff --git a/realtime/rt/decoder.py b/realtime/rt/decoder.py
index 6bbef6f2..0a202fae 100644
--- a/realtime/rt/decoder.py
+++ b/realtime/rt/decoder.py
@@ -1,3 +1,4 @@
+import logging
import os
import subprocess
@@ -19,6 +20,7 @@ class CdecDecoder(Decoder):
cdec_root = os.path.dirname(os.path.dirname(os.path.dirname(__file__)))
decoder = os.path.join(cdec_root, 'decoder', 'cdec')
decoder_cmd = [decoder, '-c', config, '-w', weights]
+ logging.info('Executing: {}'.format(' '.join(decoder_cmd)))
self.decoder = util.popen_io(decoder_cmd)
class MIRADecoder(Decoder):
@@ -28,6 +30,7 @@ class MIRADecoder(Decoder):
mira = os.path.join(cdec_root, 'training', 'mira', 'kbest_cut_mira')
# optimizer=2 step=0.001 best=500, k=500, uniq, stream
mira_cmd = [mira, '-c', config, '-w', weights, '-o', '2', '-C', '0.001', '-b', '500', '-k', '500', '-u', '-t']
+ logging.info('Executing: {}'.format(' '.join(mira_cmd)))
self.decoder = util.popen_io(mira_cmd)
def update(self, sentence, grammar, reference):
diff --git a/realtime/rt/rt.py b/realtime/rt/rt.py
index fc6e3929..b04b4ed5 100644
--- a/realtime/rt/rt.py
+++ b/realtime/rt/rt.py
@@ -10,11 +10,12 @@ import subprocess
import tempfile
import time
-import cdec.configobj
+from cdec.configobj import ConfigObj
import cdec.sa
import aligner
import decoder
+import util
class RealtimeDecoder:
@@ -32,21 +33,38 @@ class RealtimeDecoder:
self.aligner = aligner.ForceAligner(fwd_params, fwd_err, rev_params, rev_err)
# Grammar extractor
- sa_config = os.path.join(configdir, 'sa.ini')
- self.extractor = cdec.sa.GrammarExtractor(sa_config, online=True)
+ sa_config = ConfigObj(os.path.join(configdir, 'sa.ini'), unrepr=True)
+ sa_config.filename = os.path.join(self.tmp, 'sa.ini')
+ util.sa_ini_for_realtime(sa_config, os.path.abspath(configdir))
+ sa_config.write()
+ self.extractor = cdec.sa.GrammarExtractor(sa_config.filename, online=True)
self.grammar_files = collections.deque()
self.grammar_dict = {}
self.cache_size = cache_size
+ # HPYPLM reference stream
+ ref_fifo_file = os.path.join(self.tmp, 'ref.fifo')
+ os.mkfifo(ref_fifo_file)
+ self.ref_fifo = open(ref_fifo_file, 'w+')
+ # Start with empty line (do not learn prior to first input)
+ self.ref_fifo.write('\n')
+ self.ref_fifo.flush()
+
# Decoder
- decoder_config = os.path.join(configdir, 'cdec.ini')
+ decoder_config = [[f.strip() for f in line.split('=')] for line in open(os.path.join(configdir, 'cdec.ini'))]
+ util.cdec_ini_for_realtime(decoder_config, os.path.abspath(configdir), ref_fifo_file)
+ decoder_config_file = os.path.join(self.tmp, 'cdec.ini')
+ with open(decoder_config_file, 'w') as output:
+ for (k, v) in decoder_config:
+ output.write('{}={}\n'.format(k, v))
decoder_weights = os.path.join(configdir, 'weights.final')
- self.decoder = decoder.MIRADecoder(decoder_config, decoder_weights)
+ self.decoder = decoder.MIRADecoder(decoder_config_file, decoder_weights)
def close(self):
logging.info('Closing processes')
self.aligner.close()
self.decoder.close()
+ self.ref_fifo.close()
logging.info('Deleting {}'.format(self.tmp))
shutil.rmtree(self.tmp)
@@ -75,6 +93,9 @@ class RealtimeDecoder:
grammar_file = self.grammar(sentence)
start_time = time.time()
hyp = self.decoder.decode(sentence, grammar_file)
+ # Empty reference: HPYPLM does not learn prior to next translation
+ self.ref_fifo.write('\n')
+ self.ref_fifo.flush()
stop_time = time.time()
logging.info('Translation time: {} seconds'.format(stop_time - start_time))
return hyp
@@ -91,4 +112,7 @@ class RealtimeDecoder:
# Clear (old) cached grammar
rm_grammar = self.grammar_dict.pop(source)
os.remove(rm_grammar)
- # TODO: Add to LM by writing to fifo
+ # Add to HPYPLM by writing to fifo (read on next translation)
+ logging.info('Adding to HPYPLM: {}'.format(target))
+ self.ref_fifo.write('{}\n'.format(target))
+ self.ref_fifo.flush()
diff --git a/realtime/rt/util.py b/realtime/rt/util.py
index 885298e6..10e94909 100644
--- a/realtime/rt/util.py
+++ b/realtime/rt/util.py
@@ -13,12 +13,49 @@ SA_INI_FILES = set((
'precompute_file',
))
+def cdec_ini_for_config(config):
+ cdec_ini_handle(config, os.path.basename, hpyplm_rm_ref)
+
+def cdec_ini_for_realtime(config, path, ref_fifo):
+ cdec_ini_handle(config, lambda x: os.path.join(path, x), lambda x: hpyplm_add_ref(x, ref_fifo))
+
+def cdec_ini_handle(config, path_fn, hpyplm_fn):
+ # This is a list of (k, v), not a ConfigObj or dict
+ for i in range(len(config)):
+ if config[i][0] == 'feature_function':
+ if config[i][1].startswith('KLanguageModel'):
+ f = config[i][1].split()
+ f[-1] = path_fn(f[-1])
+ config[i][1] = ' '.join(f)
+ elif config[i][1].startswith('External'):
+ f = config[i][1].split()
+ if f[1].endswith('libcdec_ff_hpyplm.so'):
+ # Modify paths
+ for j in range(1, len(f)):
+ if not f[j].startswith('-'):
+ f[j] = path_fn(f[j])
+ # Modify hpyplm args
+ hpyplm_fn(f)
+ config[i][1] = ' '.join(f)
+
def consume_stream(stream):
def consume(s):
for _ in s:
pass
threading.Thread(target=consume, args=(stream,)).start()
+def hpyplm_add_ref(f, ref):
+ f.append('-r')
+ f.append(ref)
+ f.append('-t')
+
+def hpyplm_rm_ref(f):
+ for i in range(1, len(f)):
+ if f[i] == '-r':
+ f.pop(i)
+ f.pop(i)
+ return
+
def popen_io(cmd):
p = subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
consume_stream(p.stderr)
@@ -29,35 +66,12 @@ def popen_io_v(cmd):
p = subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
return p
-def sa_ini_addpath(config, path):
+def sa_ini_for_config(config):
for key in config:
if key in SA_INI_FILES:
- config[key] = os.path.join(path, config[key])
+ config[key] = os.path.join('sa', os.path.basename(config[key]))
-def sa_ini_basename(config):
+def sa_ini_for_realtime(config, path):
for key in config:
if key in SA_INI_FILES:
- config[key] = os.path.join('sa', os.path.basename(config[key]))
-
-def cdec_ini_addpath(config, path):
- cdec_ini_fn(config, lambda x: os.path.join(path, x))
-
-def cdec_ini_basename(config):
- cdec_ini_fn(config, os.path.basename)
-
-def cdec_ini_fn(config, fn):
- # This is a list of (k, v), not a ConfigObj or dict
- for i in range(len(config)):
- if config[i][0] == 'feature_function':
- if config[i][1].startswith('KLanguageModel'):
- f = config[i][1].split()
- f[-1] = fn(f[-1])
- config[i][1] = ' '.join(f)
- elif config[i][1].startswith('External'):
- f = config[i][1].split()
- if f[1].endswith('libcdec_ff_hpyplm.so'):
- for j in range(1, len(f)):
- if not f[j].startswith('-'):
- f[j] = fn(f[j])
- config[i][1] = ' '.join(f)
-
+ config[key] = os.path.join(path, config[key])