"""Write train/tune/test files for the 'geo', 'robo' and 'atis' corpora.

This module targets Python 2: it relies on the ``print >>file`` statement
and on ``str.encode`` returning a plain ``str``.
"""

import os
import re
import xml.etree.ElementTree as ET

from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import GermanStemmer

import util


class IdStemmer:
    """Identity stemmer: ``stem`` returns every word unchanged."""

    def stem(self, word):
        return word


class Extractor:
    """Read a corpus from ``config.data_dir`` and write split files.

    Every ``run_*`` method creates the files listed in ``_OUTPUT_NAMES``
    inside ``config.experiment_dir`` (``run_geo`` also creates
    ``unlabeled.nl``) and fills them one example per line.

    The ``config`` object is expected to provide the attributes read in
    this class: ``stem``, ``lang``, ``corpus``, ``run``, ``data_dir``,
    ``experiment_dir``, and, depending on the corpus, ``fold``, ``lfrac``
    and ``np_type``.
    """

    # Number of times each noun-phrase pair is repeated in the robo and
    # geo outputs.
    NP_WEIGHT = 50

    # Output files created by every run_* method.
    _OUTPUT_NAMES = (
        'train.nl', 'train.nl.lm', 'train.np.nl',
        'train.mrl', 'train.mrl.lm', 'train.np.mrl', 'train.fun',
        'tune.nl', 'tune.mrl',
        'test.nl', 'test.mrl', 'test.fun',
    )

    # One rule line of the robo 'names' file; group 2 is the MRL token.
    _ROBO_NP_RULE = re.compile(
        r'^\*n:(Num|Unum|Ident) -> \(\{ (\S+) \}\)$')

    def __init__(self, config):
        self.config = config
        # Always provide a stemmer so preprocess_nl can never hit a
        # missing attribute; it is only consulted when config.stem is set.
        self.stemmer = IdStemmer()
        if config.stem:
            if config.lang == 'en':
                self.stemmer = PorterStemmer()
            elif config.lang == 'de':
                self.stemmer = GermanStemmer()

    def run(self):
        """Dispatch to the extraction routine for ``config.corpus``.

        Raises:
            ValueError: if ``config.corpus`` is not 'geo', 'robo' or 'atis'.
        """
        corpus = self.config.corpus
        if corpus == 'geo':
            self.run_geo()
        elif corpus == 'robo':
            self.run_robo()
        elif corpus == 'atis':
            self.run_atis()
        else:
            # Raised explicitly (rather than ``assert False``) so the check
            # is not stripped when Python runs with -O.
            raise ValueError('unknown corpus: %r' % (corpus,))

    # ------------------------------------------------------------------
    # Output handling shared by all corpora
    # ------------------------------------------------------------------

    def _open_outputs(self, extra_names=()):
        """Open every output file for writing; return {file name: handle}."""
        handles = {}
        try:
            for name in self._OUTPUT_NAMES + tuple(extra_names):
                path = '%s/%s' % (self.config.experiment_dir, name)
                handles[name] = open(path, 'w')
        except Exception:
            # Do not leak the handles opened before the failure.
            self._close_outputs(handles)
            raise
        return handles

    def _close_outputs(self, handles):
        """Close every handle returned by ``_open_outputs``."""
        for handle in handles.values():
            handle.close()

    def _emit_train(self, out, nl, mrl, fun):
        """Write one labeled training example to all train.* files."""
        print >>out['train.nl'], nl
        print >>out['train.mrl'], mrl
        print >>out['train.fun'], fun
        print >>out['train.np.nl'], nl
        print >>out['train.np.mrl'], mrl
        # The empty first and last fields make print surround the text
        # with a single space on each side.
        # NOTE(review): the empty strings look like placeholders for
        # sentence-boundary markers -- confirm against the consumer of
        # the .lm files.
        print >>out['train.nl.lm'], '', nl, ''
        print >>out['train.mrl.lm'], '', mrl, ''

    def _emit_tune(self, out, nl, mrl):
        """Write one example to the tune.* files."""
        print >>out['tune.nl'], nl
        print >>out['tune.mrl'], mrl

    def _emit_test(self, out, nl, mrl, fun):
        """Write one example to the test.* files."""
        print >>out['test.nl'], nl
        print >>out['test.mrl'], mrl
        print >>out['test.fun'], fun

    def _emit_np(self, out, nl, mrl, copies=1):
        """Write a noun-phrase pair ``copies`` times to the np/lm files."""
        # NOTE(review): all four writes are repeated ``copies`` times;
        # confirm that the .lm files are meant to be weighted as well.
        for _ in range(copies):
            print >>out['train.np.nl'], nl
            print >>out['train.np.mrl'], mrl
            print >>out['train.nl.lm'], nl
            print >>out['train.mrl.lm'], mrl

    # ------------------------------------------------------------------
    # ATIS
    # ------------------------------------------------------------------

    def run_atis(self):
        """Extract the ATIS corpus.

        With ``config.run == 'debug'`` the lines of ``atis-train.sem`` are
        dealt round-robin: of every four lines, two go to train, one to
        tune and one to test.  With 'dev' the training file is also used
        for tuning and ``atis-dev.sem`` for testing; with 'test' the dev
        file is used for tuning and ``atis-test.sem`` for testing.
        Afterwards every double-quoted name found in the files under
        ``<data_dir>/db`` is added as a noun-phrase pair.

        Raises:
            ValueError: if ``config.run`` is not 'debug', 'dev' or 'test'.
        """
        data_dir = self.config.data_dir
        run_mode = self.config.run
        train_path = '%s/atis-train.sem' % data_dir
        # Resolve the inputs before any output file is truncated.
        if run_mode == 'debug':
            tune_path = test_path = None
        elif run_mode == 'dev':
            tune_path = train_path
            test_path = '%s/atis-dev.sem' % data_dir
        elif run_mode == 'test':
            tune_path = '%s/atis-dev.sem' % data_dir
            test_path = '%s/atis-test.sem' % data_dir
        else:
            raise ValueError('unsupported run mode for atis: %r' % (run_mode,))

        out = self._open_outputs()
        try:
            if run_mode == 'debug':
                self._extract_atis_debug(out, train_path)
            else:
                self._extract_atis_split(out, train_path, tune_path, test_path)
            self._extract_atis_nps(out)
        finally:
            self._close_outputs(out)

    def _parse_atis_line(self, line):
        """Split one '<=>'-separated line into (nl, mrl, fun)."""
        nl, slot = line.split('<=>', 1)
        nl = self.preprocess_nl(nl)
        fun = self.slot_to_fun(self.replace_specials(slot))
        mrl = util.fun_to_mrl(fun, True)
        return nl, mrl, fun

    def _extract_atis_debug(self, out, data_path):
        """Deal the lines of one file into train/train/tune/test."""
        with open(data_path) as data_file:
            for index, line in enumerate(data_file):
                nl, mrl, fun = self._parse_atis_line(line)
                slot_in_cycle = index % 4
                if slot_in_cycle in (0, 1):
                    self._emit_train(out, nl, mrl, fun)
                elif slot_in_cycle == 2:
                    self._emit_tune(out, nl, mrl)
                else:
                    self._emit_test(out, nl, mrl, fun)

    def _extract_atis_split(self, out, train_path, tune_path, test_path):
        """Write train, tune and test sets from three separate files."""
        with open(train_path) as train_file:
            for line in train_file:
                nl, mrl, fun = self._parse_atis_line(line)
                self._emit_train(out, nl, mrl, fun)
        with open(tune_path) as tune_file:
            for line in tune_file:
                nl, mrl, _ = self._parse_atis_line(line)
                self._emit_tune(out, nl, mrl)
        with open(test_path) as test_file:
            for line in test_file:
                nl, mrl, fun = self._parse_atis_line(line)
                self._emit_test(out, nl, mrl, fun)

    def _extract_atis_nps(self, out):
        """Add every double-quoted name under <data_dir>/db as an NP pair."""
        db_dir = '%s/db' % self.config.data_dir
        for np_name in os.listdir(db_dir):
            np_path = '%s/db/%s' % (self.config.data_dir, np_name)
            with open(np_path) as np_file:
                for line in np_file:
                    for name in re.findall(r'"([^"]+)"', line):
                        # The NL side is the raw name; the MRL side is the
                        # escaped name with spaces as '_' and an '@s' suffix.
                        mrl = self.replace_specials(name).replace(' ', '_')
                        self._emit_np(out, name, mrl + '@s')

    # ------------------------------------------------------------------
    # RoboCup
    # ------------------------------------------------------------------

    def run_robo(self):
        """Extract the robo corpus from ``corpus.xml`` and ``names``.

        The test fold doubles as the tune set, so an example whose id is
        in the test fold is written to both the tune and the test files.

        Raises:
            ValueError: if a rule line in the ``names`` file does not match
                the expected pattern, or (via ``get_folds``) if the run
                mode is unsupported.
        """
        train_ids, _, test_ids = self.get_folds()
        tune_ids = test_ids

        out = self._open_outputs()
        try:
            self._extract_robo_examples(out, train_ids, tune_ids, test_ids)
            self._extract_robo_nps(out)
        finally:
            self._close_outputs(out)

    def _extract_robo_examples(self, out, train_ids, tune_ids, test_ids):
        """Route every <example> of corpus.xml to its output files."""
        corpus_root = ET.parse('%s/corpus.xml' % self.config.data_dir).getroot()
        nl_query = "nl[@lang='%s']" % self.config.lang
        for node in corpus_root.findall('example'):
            nl = self.preprocess_nl(node.find(nl_query).text)
            clang = node.find("mrl[@lang='robocup-clang']").text
            fun = self.clang_to_fun(self.replace_specials(clang))
            mrl = util.fun_to_mrl(fun)
            eid = int(node.attrib['id'])

            if eid in tune_ids:
                self._emit_tune(out, nl, mrl)
            elif eid in train_ids:
                self._emit_train(out, nl, mrl, fun)
            # Deliberately a separate test, not part of the chain above:
            # tune and test ids are the same set here.
            if eid in test_ids:
                self._emit_test(out, nl, mrl, fun)

    def _extract_robo_nps(self, out):
        """Add the entries of the 'names' file as weighted NP pairs.

        Each record is read as: one line that is ignored, one line whose
        text after the first three characters is the NL phrase, two more
        ignored lines, then rule lines up to the next blank line.
        """
        with open('%s/names' % self.config.data_dir) as nps_file:
            while True:
                if not nps_file.readline():
                    break  # end of file
                nl = self.preprocess_nl(nps_file.readline().strip()[3:])
                nps_file.readline()
                nps_file.readline()
                while True:
                    line = nps_file.readline().strip()
                    if line == '':
                        break  # blank line (or end of file) ends the record
                    match = self._ROBO_NP_RULE.match(line)
                    if match is None:
                        raise ValueError(
                            'unrecognised rule line in names file: %r'
                            % (line,))
                    self._emit_np(out, nl, match.group(2) + '@0',
                                  self.NP_WEIGHT)

    # ------------------------------------------------------------------
    # Geoquery
    # ------------------------------------------------------------------

    def run_geo(self):
        """Extract the geo corpus from ``corpus-true.xml`` and ``nps-true.xml``.

        Training examples are additionally split into labeled ones and ones
        whose NL side alone is written to ``unlabeled.nl``; the split is
        controlled by ``config.lfrac``.
        """
        train_ids, tune_ids, test_ids = self.get_folds()

        out = self._open_outputs(('unlabeled.nl',))
        try:
            self._extract_geo_examples(out, train_ids, tune_ids, test_ids)
            self._extract_geo_nps(out)
        finally:
            self._close_outputs(out)

    def _extract_geo_examples(self, out, train_ids, tune_ids, test_ids):
        """Route every <example> of corpus-true.xml to its output files."""
        corpus_root = ET.parse(
            '%s/corpus-true.xml' % self.config.data_dir).getroot()
        nl_query = "nl[@lang='%s']" % self.config.lang
        # Position of the current example within a repeating cycle of ten;
        # it advances for every example, whichever set the example is in.
        counter = 0
        for node in corpus_root.findall('example'):
            nl = self.preprocess_nl(node.find(nl_query).text)
            fun = self.preprocess_fun(node.find("mrl[@lang='geo-funql']").text)
            mrl = util.fun_to_mrl(fun)
            eid = int(node.attrib['id'])

            unlabel_this = counter >= 10 * self.config.lfrac
            counter = (counter + 1) % 10

            if eid in tune_ids:
                self._emit_tune(out, nl, mrl)
            elif eid in train_ids:
                if unlabel_this:
                    print >>out['unlabeled.nl'], nl
                else:
                    self._emit_train(out, nl, mrl, fun)
            elif eid in test_ids:
                self._emit_test(out, nl, mrl, fun)

    def _extract_geo_nps(self, out):
        """Add the entries of nps-true.xml as weighted NP pairs.

        An entry counts as "big" when its MRL has more than one
        whitespace-separated token; ``config.np_type`` of 'big' or 'small'
        keeps only that kind, any other value keeps both.
        """
        nplist_root = ET.parse(
            '%s/nps-true.xml' % self.config.data_dir).getroot()
        np_type = self.config.np_type
        nl_query = "nl[@lang='%s']" % self.config.lang
        for node in nplist_root.findall('example'):
            fun = self.preprocess_fun(node.find("mrl[@lang='geo-funql']").text)
            mrl = util.fun_to_mrl(fun)
            big_np = len(mrl.split()) > 1
            if (np_type == 'big' and not big_np) or \
                    (np_type == 'small' and big_np):
                continue
            for nl_node in node.findall(nl_query):
                nl = self.preprocess_nl(nl_node.text)
                self._emit_np(out, nl, mrl, self.NP_WEIGHT)

    # ------------------------------------------------------------------
    # Folds
    # ------------------------------------------------------------------

    def get_folds(self):
        """Return ``(train_ids, tune_ids, test_ids)`` as sets of ints.

        The id files are chosen from ``config.corpus``, ``config.run`` and
        ``config.fold``.  ``tune_ids`` is empty when the selected split has
        no tune file.

        Raises:
            ValueError: if the corpus/run combination has no fold files
                defined here.
        """
        data_dir = self.config.data_dir
        corpus = self.config.corpus
        run_mode = self.config.run
        train_ids_file = tune_ids_file = test_ids_file = None

        if corpus == 'geo':
            if run_mode in ('debug', 'dev'):
                train_ids_file = '%s/folds600/fold-%d-train.ids' \
                    % (data_dir, self.config.fold)
                test_ids_file = '%s/folds600/fold-%d-test.ids' \
                    % (data_dir, self.config.fold)
            elif run_mode == 'test':
                train_ids_file = '%s/split880/fold-0-train.ids' % data_dir
                tune_ids_file = '%s/split880/fold-0-tune.ids' % data_dir
                test_ids_file = '%s/split880/fold-0-test.ids' % data_dir
        elif corpus == 'robo':
            if run_mode in ('debug', 'dev'):
                train_ids_file = '%s/split-300/run-0/fold-%d/train-N270' \
                    % (data_dir, self.config.fold)
                test_ids_file = '%s/split-300/run-0/fold-%d/test' \
                    % (data_dir, self.config.fold)

        if train_ids_file is None:
            # Covers both an unknown corpus and a run mode for which the
            # corpus defines no split.
            raise ValueError(
                'no folds defined for corpus %r with run mode %r'
                % (corpus, run_mode))

        train_ids = self._read_ids(train_ids_file)
        tune_ids = self._read_ids(tune_ids_file) if tune_ids_file else set()
        test_ids = self._read_ids(test_ids_file)
        return train_ids, tune_ids, test_ids

    def _read_ids(self, path):
        """Read a file holding one integer id per line into a set."""
        with open(path) as fold_file:
            return set(int(line) for line in fold_file)

    # ------------------------------------------------------------------
    # Text normalisation
    # ------------------------------------------------------------------

    def preprocess_nl(self, nl):
        """Normalise a natural-language sentence.

        Strips and lower-cases the text, encodes it (ASCII when stemming
        German, UTF-8 otherwise, dropping characters that cannot be
        encoded), removes a trailing ' .' or ' ?', and stems every
        whitespace-separated token when ``config.stem`` is set.
        """
        nl = nl.strip().lower()
        if self.config.stem and self.config.lang == 'de':
            # German stemmer can't handle UTF-8
            nl = nl.encode('ascii', 'ignore')
        else:
            nl = nl.encode('utf-8', 'ignore')
        if nl[-2:] in (' .', ' ?'):
            nl = nl[:-2]
        if self.config.stem:
            nl = ' '.join(self.stemmer.stem(tok) for tok in nl.split())
        return nl

    def preprocess_fun(self, fun):
        """Return ``fun`` without leading or trailing whitespace."""
        return fun.strip()

    def replace_specials(self, mrl):
        """Escape '.', "'" and '/' as 'xxd', 'xxq' and 'xxs' respectively."""
        mrl = mrl.replace('.', 'xxd')
        mrl = mrl.replace("'", 'xxq')
        mrl = mrl.replace('/', 'xxs')
        return mrl

    def clang_to_fun(self, clang):
        """Rewrite a parenthesised CLang expression in functional form.

        The steps are applied in order: collapse whitespace, turn
        ``{a b}`` groups into ``(set a b)``, move each head symbol in
        front of its opening parenthesis, drop the first bare parenthesis
        pair, remove empty ``()``, turn spaces into commas, drop double
        quotes, and remove the first two arguments of ``definerule``.

        Raises:
            ValueError: propagated from ``strip_bare_parens`` when the
                parentheses are unbalanced.
        """
        clang = clang.strip()
        clang = re.sub(r'\s+', ' ', clang)
        clang = re.sub(r'\{([\d|X]+( [\d|X]+)*)\}', r'(set \1)', clang)
        clang = re.sub(r'\(([\w.-]+) ?', r'\1(', clang)
        clang = self.strip_bare_parens(clang)
        clang = clang.replace('()', '')
        clang = clang.replace(' ', ',')
        clang = clang.replace('"', '')
        clang = re.sub(r'definerule\([^,]+,[^,]+,', r'definerule(', clang)
        return clang

    def strip_bare_parens(self, clang):
        """Remove the first parenthesis pair that is preceded by a space.

        Only the first ' (' occurrence is handled; the opening parenthesis
        and its matching closing parenthesis are dropped and everything
        between them is kept.  The input is returned unchanged when it
        contains no ' ('.

        Raises:
            ValueError: if the opening parenthesis is never closed (this
                input previously made the scan loop forever).
        """
        try:
            start = clang.index(' (') + 1
        except ValueError:
            return clang

        depth = 0  # parentheses opened after ``start`` and not yet closed
        for end in range(start + 1, len(clang)):
            char = clang[end]
            if char == '(':
                depth += 1
            elif char == ')':
                if depth == 0:
                    return clang[:start] + clang[start + 1:end] + clang[end + 1:]
                depth -= 1
        raise ValueError('unbalanced parentheses in %r' % (clang,))

    def slot_to_fun(self, slot):
        """Rewrite ``key="text"`` slot notation as ``key('text')``.

        The replacements run in a fixed order: 'value' is first wrapped in
        double quotes so the quote rewrites below also apply to it, and
        the resulting "'value'" is unwrapped again at the end.
        """
        slot = slot.strip()
        slot = slot.replace('value', '"value"')
        slot = slot.replace('="', "('")
        slot = slot.replace('",', "'),")
        slot = slot.replace('")', "'))")
        slot = slot.replace("'value'", 'value')
        return slot