| field | value | date |
|---|---|---|
| author | Michael Denkowski <mdenkows@cs.cmu.edu> | 2013-09-05 11:27:08 -0700 |
| committer | Michael Denkowski <mdenkows@cs.cmu.edu> | 2013-09-05 11:27:08 -0700 |
| commit | 208fabfbbe19c1ba2ee744e9d16b54805ec8b141 | |
| tree | 7c670e317b060cbaa8c0622094fe0c3804674c7b /realtime/rt | |
| parent | 76c26e382a9d5e2c95064488f060107e95470055 | |
Option for text normalization
Diffstat (limited to 'realtime/rt')
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | realtime/rt/rt.py | 41 |

1 file changed, 37 insertions, 4 deletions
```diff
diff --git a/realtime/rt/rt.py b/realtime/rt/rt.py
index b04b4ed5..2930c212 100644
--- a/realtime/rt/rt.py
+++ b/realtime/rt/rt.py
@@ -19,12 +19,20 @@ import util
 
 class RealtimeDecoder:
 
-    def __init__(self, configdir, tmpdir='/tmp', cache_size=5):
+    def __init__(self, configdir, tmpdir='/tmp', cache_size=5, norm=False):
+
+        cdec_root = os.path.dirname(os.path.dirname(os.path.dirname(__file__)))
 
         # Temporary work dir
         self.tmp = tempfile.mkdtemp(dir=tmpdir, prefix='realtime.')
         logging.info('Using temp dir {}'.format(self.tmp))
 
+        # Normalization
+        self.norm = norm
+        if self.norm:
+            self.tokenizer = util.popen_io([os.path.join(cdec_root, 'corpus', 'tokenize-anything.sh'), '-u'])
+            self.detokenizer = util.popen_io([os.path.join(cdec_root, 'corpus', 'untok.pl')])
+
         # Word aligner
         fwd_params = os.path.join(configdir, 'a.fwd_params')
         fwd_err = os.path.join(configdir, 'a.fwd_err')
@@ -65,6 +73,9 @@ class RealtimeDecoder:
         self.aligner.close()
         self.decoder.close()
         self.ref_fifo.close()
+        if self.norm:
+            self.tokenizer.stdin.close()
+            self.detokenizer.stdin.close()
         logging.info('Deleting {}'.format(self.tmp))
         shutil.rmtree(self.tmp)
 
@@ -78,7 +89,7 @@ class RealtimeDecoder:
         grammar_file = tempfile.mkstemp(dir=self.tmp, prefix='grammar.')[1]
         with open(grammar_file, 'w') as output:
             for rule in self.extractor.grammar(sentence):
-                output.write(str(rule) + '\n')
+                output.write('{}\n'.format(str(rule)))
         if len(self.grammar_files) == self.cache_size:
             rm_sent = self.grammar_files.popleft()
             # If not already removed by learn method
@@ -90,17 +101,39 @@ class RealtimeDecoder:
         return grammar_file
 
     def decode(self, sentence):
+        # Empty in, empty out
+        if sentence.strip() == '':
+            return ''
+        if self.norm:
+            sentence = self.tokenize(sentence)
+            logging.info('Normalized input: {}'.format(sentence))
         grammar_file = self.grammar(sentence)
         start_time = time.time()
         hyp = self.decoder.decode(sentence, grammar_file)
+        stop_time = time.time()
+        logging.info('Translation time: {} seconds'.format(stop_time - start_time))
         # Empty reference: HPYPLM does not learn prior to next translation
         self.ref_fifo.write('\n')
         self.ref_fifo.flush()
-        stop_time = time.time()
-        logging.info('Translation time: {} seconds'.format(stop_time - start_time))
+        if self.norm:
+            hyp = self.detokenize(hyp)
         return hyp
 
+    def tokenize(self, line):
+        self.tokenizer.stdin.write('{}\n'.format(line))
+        return self.tokenizer.stdout.readline().strip()
+
+    def detokenize(self, line):
+        self.detokenizer.stdin.write('{}\n'.format(line))
+        return self.detokenizer.stdout.readline().strip()
+
     def learn(self, source, target):
+        if '' in (source.strip(), target.strip()):
+            logging.info('Error empty source or target: {} ||| {}'.format(source, target))
+            return
+        if self.norm:
+            source = self.tokenize(source)
+            target = self.tokenize(target)
         # MIRA update before adding data to grammar extractor
         grammar_file = self.grammar(source)
         mira_log = self.decoder.update(source, grammar_file, target)
```
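With this change a caller can pass raw, untokenized text straight to the decoder. A minimal usage sketch follows; the import path, config directory, and sentence pair are hypothetical, and the cleanup method whose body appears in the second hunk is assumed here to be named `close()` (its name is not visible in this diff):

```python
# Import path assumed; adjust to wherever realtime/rt/rt.py sits on PYTHONPATH.
from rt import RealtimeDecoder

# Hypothetical: a config directory produced by the realtime training pipeline.
rtd = RealtimeDecoder('/path/to/configdir', norm=True)
try:
    # Input is tokenized before grammar extraction and decoding;
    # the hypothesis is detokenized before it is returned.
    print(rtd.decode('est-ce que ca marche ?'))
    # The new guard short-circuits empty input: empty in, empty out.
    assert rtd.decode('') == ''
    # Post-edited feedback: both sides are normalized before the MIRA
    # update and before the data reaches the grammar extractor.
    rtd.learn('est-ce que ca marche ?', 'does this work ?')
finally:
    rtd.close()  # assumed name for the cleanup method shown above
```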

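The new `tokenize`/`detokenize` helpers rely on `util.popen_io` keeping each normalization script alive as a long-running child process and speaking a strict one-line-in, one-line-out protocol over its pipes; the child must flush output per line or `readline()` would block. `util.popen_io` itself is not part of this diff. Below is a minimal sketch of such a wrapper, assuming it wraps `subprocess.Popen` with piped stdin/stdout and drains stderr on a background thread; the real helper in cdec's `realtime/rt/util.py` may differ.

```python
import subprocess
import threading

def popen_io(cmd):
    """Sketch of a popen_io-style helper (assumed behavior, not cdec's source).

    Starts cmd with piped stdin/stdout so a caller can write one line and
    read one line back, as RealtimeDecoder.tokenize/detokenize do above.
    """
    p = subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE, universal_newlines=True)

    def consume(stream):
        # Drain stderr so the child can never block on a full pipe buffer.
        for _ in stream:
            pass

    t = threading.Thread(target=consume, args=(p.stderr,))
    t.daemon = True
    t.start()
    return p
```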