summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Chris Dyer <cdyer@allegro.clab.cs.cmu.edu> 2012-11-14 23:14:34 -0500
committer Chris Dyer <cdyer@allegro.clab.cs.cmu.edu> 2012-11-14 23:14:34 -0500
commit 1ba06b3f515eccfe5dfe791ae3c8c3b9799c9b27 (patch)
tree 0585c7caf7dc0e9e8a8904ed9ffe40acc3df24ea
parent f8d9ff4aaeb1d1f773bacfe9ee75d1d1778ec26b (diff)
deal with references
-rw-r--r-- python/pkg/cdec/sa/extract.py 8
1 files changed, 7 insertions, 1 deletions
diff --git a/python/pkg/cdec/sa/extract.py b/python/pkg/cdec/sa/extract.py
index 10a81556..b7d2fe6e 100644
--- a/python/pkg/cdec/sa/extract.py
+++ b/python/pkg/cdec/sa/extract.py
@@ -3,6 +3,7 @@ import sys
import os
import argparse
import logging
+import re
import multiprocessing as mp
import signal
import cdec.sa
@@ -27,12 +28,17 @@ def extract(inp):
global extractor, prefix
i, sentence = inp
sentence = sentence[:-1]
+ fields = re.split('\s*\|\|\|\s*', sentence)
+ suffix = ''
+ if len(fields) > 1:
+ sentence = fields[0]
+ suffix = ' ||| ' + ' ||| '.join(fields[1:])
grammar_file = os.path.join(prefix, 'grammar.{0}'.format(i))
with open(grammar_file, 'w') as output:
for rule in extractor.grammar(sentence):
output.write(str(rule)+'\n')
grammar_file = os.path.abspath(grammar_file)
- return '<seg grammar="{0}" id="{1}">{2}</seg>'.format(grammar_file, i, sentence)
+ return '<seg grammar="{0}" id="{1}"> {2} </seg>{3}'.format(grammar_file, i, sentence, suffix)
def main():
logging.basicConfig(level=logging.INFO)