diff options
author | Chris Dyer <cdyer@allegro.clab.cs.cmu.edu> | 2012-11-14 23:14:34 -0500 |
---|---|---|
committer | Chris Dyer <cdyer@allegro.clab.cs.cmu.edu> | 2012-11-14 23:14:34 -0500 |
commit | 1ba06b3f515eccfe5dfe791ae3c8c3b9799c9b27 (patch) | |
tree | 0585c7caf7dc0e9e8a8904ed9ffe40acc3df24ea | |
parent | f8d9ff4aaeb1d1f773bacfe9ee75d1d1778ec26b (diff) |
deal with references
-rw-r--r-- | python/pkg/cdec/sa/extract.py | 8 |
1 file changed, 7 insertions, 1 deletion
```diff
diff --git a/python/pkg/cdec/sa/extract.py b/python/pkg/cdec/sa/extract.py
index 10a81556..b7d2fe6e 100644
--- a/python/pkg/cdec/sa/extract.py
+++ b/python/pkg/cdec/sa/extract.py
@@ -3,6 +3,7 @@ import sys
 import os
 import argparse
 import logging
+import re
 import multiprocessing as mp
 import signal
 import cdec.sa
@@ -27,12 +28,17 @@ def extract(inp):
     global extractor, prefix
     i, sentence = inp
     sentence = sentence[:-1]
+    fields = re.split('\s*\|\|\|\s*', sentence)
+    suffix = ''
+    if len(fields) > 1:
+        sentence = fields[0]
+        suffix = ' ||| ' + ' ||| '.join(fields[1:])
     grammar_file = os.path.join(prefix, 'grammar.{0}'.format(i))
     with open(grammar_file, 'w') as output:
         for rule in extractor.grammar(sentence):
             output.write(str(rule)+'\n')
     grammar_file = os.path.abspath(grammar_file)
-    return '<seg grammar="{0}" id="{1}">{2}</seg>'.format(grammar_file, i, sentence)
+    return '<seg grammar="{0}" id="{1}"> {2} </seg>{3}'.format(grammar_file, i, sentence, suffix)
 
 def main():
     logging.basicConfig(level=logging.INFO)
```