summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Michael Denkowski <mdenkows@cs.cmu.edu> 2013-09-05 11:27:08 -0700
committer Michael Denkowski <mdenkows@cs.cmu.edu> 2013-09-05 11:27:08 -0700
commit 208fabfbbe19c1ba2ee744e9d16b54805ec8b141 (patch)
tree 7c670e317b060cbaa8c0622094fe0c3804674c7b
parent 76c26e382a9d5e2c95064488f060107e95470055 (diff)
Option for text normalization
-rwxr-xr-x realtime/realtime.py 3
-rw-r--r-- realtime/rt/rt.py 41
2 files changed, 39 insertions, 5 deletions
diff --git a/realtime/realtime.py b/realtime/realtime.py
index eeaca0f4..dff7e90c 100755
--- a/realtime/realtime.py
+++ b/realtime/realtime.py
@@ -17,6 +17,7 @@ def main():
parser = Parser(description='Real-time adaptive translation with cdec.')
parser.add_argument('-c', '--config', required=True, help='Config directory (see README.md)')
+ parser.add_argument('-n', '--normalize', help='Normalize text (tokenize, translate, detokenize)', action='store_true')
parser.add_argument('-T', '--temp', help='Temp directory (default /tmp)', default='/tmp')
parser.add_argument('-a', '--cache', help='Grammar cache size (default 5)', default='5')
parser.add_argument('-v', '--verbose', help='Info to stderr', action='store_true')
@@ -25,7 +26,7 @@ def main():
if args.verbose:
logging.basicConfig(level=logging.INFO)
- rtd = rt.RealtimeDecoder(args.config, tmpdir=args.temp, cache_size=int(args.cache))
+ rtd = rt.RealtimeDecoder(args.config, tmpdir=args.temp, cache_size=int(args.cache), norm=args.normalize)
try:
while True:
diff --git a/realtime/rt/rt.py b/realtime/rt/rt.py
index b04b4ed5..2930c212 100644
--- a/realtime/rt/rt.py
+++ b/realtime/rt/rt.py
@@ -19,12 +19,20 @@ import util
class RealtimeDecoder:
- def __init__(self, configdir, tmpdir='/tmp', cache_size=5):
+ def __init__(self, configdir, tmpdir='/tmp', cache_size=5, norm=False):
+
+ cdec_root = os.path.dirname(os.path.dirname(os.path.dirname(__file__)))
# Temporary work dir
self.tmp = tempfile.mkdtemp(dir=tmpdir, prefix='realtime.')
logging.info('Using temp dir {}'.format(self.tmp))
+ # Normalization
+ self.norm = norm
+ if self.norm:
+ self.tokenizer = util.popen_io([os.path.join(cdec_root, 'corpus', 'tokenize-anything.sh'), '-u'])
+ self.detokenizer = util.popen_io([os.path.join(cdec_root, 'corpus', 'untok.pl')])
+
# Word aligner
fwd_params = os.path.join(configdir, 'a.fwd_params')
fwd_err = os.path.join(configdir, 'a.fwd_err')
@@ -65,6 +73,9 @@ class RealtimeDecoder:
self.aligner.close()
self.decoder.close()
self.ref_fifo.close()
+ if self.norm:
+ self.tokenizer.stdin.close()
+ self.detokenizer.stdin.close()
logging.info('Deleting {}'.format(self.tmp))
shutil.rmtree(self.tmp)
@@ -78,7 +89,7 @@ class RealtimeDecoder:
grammar_file = tempfile.mkstemp(dir=self.tmp, prefix='grammar.')[1]
with open(grammar_file, 'w') as output:
for rule in self.extractor.grammar(sentence):
- output.write(str(rule) + '\n')
+ output.write('{}\n'.format(str(rule)))
if len(self.grammar_files) == self.cache_size:
rm_sent = self.grammar_files.popleft()
# If not already removed by learn method
@@ -90,17 +101,39 @@ class RealtimeDecoder:
return grammar_file
def decode(self, sentence):
+ # Empty in, empty out
+ if sentence.strip() == '':
+ return ''
+ if self.norm:
+ sentence = self.tokenize(sentence)
+ logging.info('Normalized input: {}'.format(sentence))
grammar_file = self.grammar(sentence)
start_time = time.time()
hyp = self.decoder.decode(sentence, grammar_file)
+ stop_time = time.time()
+ logging.info('Translation time: {} seconds'.format(stop_time - start_time))
# Empty reference: HPYPLM does not learn prior to next translation
self.ref_fifo.write('\n')
self.ref_fifo.flush()
- stop_time = time.time()
- logging.info('Translation time: {} seconds'.format(stop_time - start_time))
+ if self.norm:
+ hyp = self.detokenize(hyp)
return hyp
+ def tokenize(self, line):
+ self.tokenizer.stdin.write('{}\n'.format(line))
+ return self.tokenizer.stdout.readline().strip()
+
+ def detokenize(self, line):
+ self.detokenizer.stdin.write('{}\n'.format(line))
+ return self.detokenizer.stdout.readline().strip()
+
def learn(self, source, target):
+ if '' in (source.strip(), target.strip()):
+ logging.info('Error empty source or target: {} ||| {}'.format(source, target))
+ return
+ if self.norm:
+ source = self.tokenize(source)
+ target = self.tokenize(target)
# MIRA update before adding data to grammar extractor
grammar_file = self.grammar(source)
mira_log = self.decoder.update(source, grammar_file, target)