From 40688c6c8ac48c809e1b4f1fa10d93144620dead Mon Sep 17 00:00:00 2001
From: Michael Denkowski
Date: Thu, 5 Sep 2013 11:27:08 -0700
Subject: Option for text normalization

---
 realtime/realtime.py |  3 ++-
 realtime/rt/rt.py    | 41 +++++++++++++++++++++++++++++++++++++----
 2 files changed, 39 insertions(+), 5 deletions(-)

diff --git a/realtime/realtime.py b/realtime/realtime.py
index eeaca0f4..dff7e90c 100755
--- a/realtime/realtime.py
+++ b/realtime/realtime.py
@@ -17,6 +17,7 @@ def main():
 
     parser = Parser(description='Real-time adaptive translation with cdec.')
     parser.add_argument('-c', '--config', required=True, help='Config directory (see README.md)')
+    parser.add_argument('-n', '--normalize', help='Normalize text (tokenize, translate, detokenize)', action='store_true')
     parser.add_argument('-T', '--temp', help='Temp directory (default /tmp)', default='/tmp')
     parser.add_argument('-a', '--cache', help='Grammar cache size (default 5)', default='5')
     parser.add_argument('-v', '--verbose', help='Info to stderr', action='store_true')
@@ -25,7 +26,7 @@ def main():
     if args.verbose:
         logging.basicConfig(level=logging.INFO)
 
-    rtd = rt.RealtimeDecoder(args.config, tmpdir=args.temp, cache_size=int(args.cache))
+    rtd = rt.RealtimeDecoder(args.config, tmpdir=args.temp, cache_size=int(args.cache), norm=args.normalize)
 
     try:
         while True:
diff --git a/realtime/rt/rt.py b/realtime/rt/rt.py
index b04b4ed5..2930c212 100644
--- a/realtime/rt/rt.py
+++ b/realtime/rt/rt.py
@@ -19,12 +19,20 @@ import util
 
 class RealtimeDecoder:
 
-    def __init__(self, configdir, tmpdir='/tmp', cache_size=5):
+    def __init__(self, configdir, tmpdir='/tmp', cache_size=5, norm=False):
+
+        cdec_root = os.path.dirname(os.path.dirname(os.path.dirname(__file__)))
 
         # Temporary work dir
         self.tmp = tempfile.mkdtemp(dir=tmpdir, prefix='realtime.')
         logging.info('Using temp dir {}'.format(self.tmp))
 
+        # Normalization
+        self.norm = norm
+        if self.norm:
+            self.tokenizer = util.popen_io([os.path.join(cdec_root, 'corpus', 'tokenize-anything.sh'), '-u'])
+            self.detokenizer = util.popen_io([os.path.join(cdec_root, 'corpus', 'untok.pl')])
+
         # Word aligner
         fwd_params = os.path.join(configdir, 'a.fwd_params')
         fwd_err = os.path.join(configdir, 'a.fwd_err')
@@ -65,6 +73,9 @@ class RealtimeDecoder:
         self.aligner.close()
         self.decoder.close()
         self.ref_fifo.close()
+        if self.norm:
+            self.tokenizer.stdin.close()
+            self.detokenizer.stdin.close()
         logging.info('Deleting {}'.format(self.tmp))
         shutil.rmtree(self.tmp)
 
@@ -78,7 +89,7 @@ class RealtimeDecoder:
         grammar_file = tempfile.mkstemp(dir=self.tmp, prefix='grammar.')[1]
         with open(grammar_file, 'w') as output:
             for rule in self.extractor.grammar(sentence):
-                output.write(str(rule) + '\n')
+                output.write('{}\n'.format(str(rule)))
         if len(self.grammar_files) == self.cache_size:
             rm_sent = self.grammar_files.popleft()
             # If not already removed by learn method
@@ -90,17 +101,39 @@ class RealtimeDecoder:
         return grammar_file
 
     def decode(self, sentence):
+        # Empty in, empty out
+        if sentence.strip() == '':
+            return ''
+        if self.norm:
+            sentence = self.tokenize(sentence)
+            logging.info('Normalized input: {}'.format(sentence))
         grammar_file = self.grammar(sentence)
         start_time = time.time()
         hyp = self.decoder.decode(sentence, grammar_file)
+        stop_time = time.time()
+        logging.info('Translation time: {} seconds'.format(stop_time - start_time))
         # Empty reference: HPYPLM does not learn prior to next translation
         self.ref_fifo.write('\n')
         self.ref_fifo.flush()
-        stop_time = time.time()
-        logging.info('Translation time: {} seconds'.format(stop_time - start_time))
+        if self.norm:
+            hyp = self.detokenize(hyp)
         return hyp
 
+    def tokenize(self, line):
+        self.tokenizer.stdin.write('{}\n'.format(line))
+        return self.tokenizer.stdout.readline().strip()
+
+    def detokenize(self, line):
+        self.detokenizer.stdin.write('{}\n'.format(line))
+        return self.detokenizer.stdout.readline().strip()
+
     def learn(self, source, target):
+        if '' in (source.strip(), target.strip()):
+            logging.info('Error empty source or target: {} ||| {}'.format(source, target))
+            return
+        if self.norm:
+            source = self.tokenize(source)
+            target = self.tokenize(target)
         # MIRA update before adding data to grammar extractor
         grammar_file = self.grammar(source)
         mira_log = self.decoder.update(source, grammar_file, target)
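
How the normalization round trip works: with -n/--normalize set, RealtimeDecoder keeps one long-lived tokenizer process and one detokenizer process, writes exactly one line to each per call, and reads exactly one line back. util.popen_io comes from the util module imported at the top of rt.py and is not part of this patch; the sketch below is a minimal stand-in, assuming it wraps subprocess.Popen with piped stdin/stdout (Python 2-era string I/O, matching the surrounding code):

    import subprocess

    def popen_io(cmd):
        # Assumed behavior of util.popen_io: launch a long-lived filter
        # process with pipes on stdin and stdout.
        return subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE)

    # One-line-in, one-line-out round trip, mirroring RealtimeDecoder.tokenize()
    tokenizer = popen_io(['corpus/tokenize-anything.sh', '-u'])
    tokenizer.stdin.write('Hello, world!\n')
    tokenizer.stdin.flush()  # the filter must see the complete line before replying
    print(tokenizer.stdout.readline().strip())

Because each helper reads back exactly one line per line written, the tokenizer and detokenizer scripts must emit one output line per input line. Under this flag, decode() tokenizes the input, translates, and detokenizes the hypothesis, while learn() tokenizes both source and target before updating, e.g. ./realtime.py -c config_dir -n -v.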