From b0bff4f48b2de88560199be09e5a29feecaa267c Mon Sep 17 00:00:00 2001 From: Patrick Simianer Date: Mon, 14 Apr 2014 16:05:40 +0200 Subject: smt-semparse --- .gitignore | 2 - cfg.rb | 12 +- data/geoquery/README | 2 + data/geoquery/smt-semparse/decode_sentence.py | 36 ++++++ data/geoquery/smt-semparse/functionalizer.py | 143 +++++++++++++++++++++ data/geoquery/smt-semparse/moses.py | 173 ++++++++++++++++++++++++++ 6 files changed, 360 insertions(+), 8 deletions(-) delete mode 100644 .gitignore create mode 100644 data/geoquery/smt-semparse/decode_sentence.py create mode 100644 data/geoquery/smt-semparse/functionalizer.py create mode 100644 data/geoquery/smt-semparse/moses.py diff --git a/.gitignore b/.gitignore deleted file mode 100644 index 1509677..0000000 --- a/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -prototype -proper diff --git a/cfg.rb b/cfg.rb index 5d789c5..7063730 100644 --- a/cfg.rb +++ b/cfg.rb @@ -1,12 +1,12 @@ # edit here to change the parser -SMT_SEMPARSE = 'python /workspace/grounded/smt-semparse-cp/decode_sentence.py /workspace/grounded/smt-semparse-cp/working/full_dataset' -# this is the 'fixed' version of eval.pl -EVAL_PL = '/workspace/grounded/wasp-1.0/data/geo-funql/eval/eval.pl' +SMT_SEMPARSE = 'python /path/to/decode_sentence.py/of/smt-semparse /path/to/smt-semparse/workdir' +# this should be a 'fixed' (one that doesn't abbreviate its output) version of eval.pl +EVAL_PL = '/path/to/wasp-1.0/data/geo-funql/eval/eval.pl' # set to true to ignore zombie eval.pl procs ACCEPT_ZOMBIES = true -TIMEOUT=60 +TIMEOUT = 60 # cdec binary -CDEC_BIN = '/toolbox/cdec-dtrain/decoder/cdec' +CDEC_BIN = '/path/to/cdec' # memcached has to be running -$cache = Memcached.new('localhost:11211') +$cache = Memcached.new('localhost:31337') diff --git a/data/geoquery/README b/data/geoquery/README index 54bdf58..86d6e0a 100644 --- a/data/geoquery/README +++ b/data/geoquery/README @@ -14,3 +14,5 @@ split880.train.ids : 880 train/test split train ids ../stopwords.en : English 
# Command-line entry point.
# input: the experiment directory and one English sentence (both required).
# output: the functionalized parse of the sentence on stdout (an empty line
# if no hypothesis could be functionalized).
if __name__ == '__main__':
    if len(sys.argv) != 3:
        # sys.exit with a string prints it to stderr and exits with status 1,
        # the same status the failed assertion used here before produced.
        sys.exit('usage: %s <experiment_dir> <sentence>' % sys.argv[0])
    experiment_dir = sys.argv[1]
    sentence = sys.argv[2]

    # load config
    # NOTE(review): both paths are hard-coded to one machine's layout and must
    # be edited to point at the local smt-semparse checkout.
    config = SMTSemparseConfig(
        '/workspace/grounded/smt-semparse-cp/settings.yaml',
        '/workspace/grounded/smt-semparse-cp/dependencies.yaml')

    # stem / preprocess the sentence
    sentence = Extractor(config).preprocess_nl(sentence)

    # scratch directory handed to the decoder and the functionalizer
    temp_dir = tempfile.mkdtemp()
    try:
        # decode
        Moses(config).decode_sentence(experiment_dir, sentence, temp_dir)

        # convert to bracket structure
        print(Functionalizer(config).run_sentence(experiment_dir, temp_dir))
    finally:
        # delete tmp files, also when decoding or conversion failed
        shutil.rmtree(temp_dir)
def run_sentence(self, experiment_dir, temp_dir):
    """Return the functional form of the first usable n-best hypothesis.

    Reads ``nbest.tmp`` from ``temp_dir``.  Each line holds fields separated
    by ``|||``; the second field is the hypothesis.  Hypotheses are tried in
    file order and the first non-empty result of ``self.functionalize`` is
    returned.  If no hypothesis can be functionalized, the empty string is
    returned.

    ``experiment_dir`` is not used; it is kept so existing callers keep
    working.
    """
    # The n-best file is already ordered sentence by sentence and best-first
    # within a sentence, so scanning it top to bottom visits hypotheses in
    # the same order as grouping them by sentence id first would.
    with open('%s/nbest.tmp' % temp_dir, 'r') as hyp_file:
        for line in hyp_file:
            if not line.strip():
                # tolerate blank lines instead of crashing on them
                continue
            hyp = line.split('|||')[1].strip()
            fun = self.functionalize(hyp)
            if fun:
                return fun
    return ""
class Moses:
    """Runs the Moses training, tuning and decoding tools as subprocesses.

    Every tool location and experiment path is read from the ``config``
    object given to the constructor.  The methods only assemble command
    lines, run them, and redirect their output to files.
    """

    def __init__(self, config):
        self.config = config

    # ------------------------------------------------------------------
    # private helpers
    # ------------------------------------------------------------------

    def _run(self, args, log_name, stdin=None, stdout=None):
        """Run ``args`` to completion with stderr sent to a log file.

        ``log_name`` is created inside the experiment directory.  If no
        ``stdout`` target is given the child's standard output is discarded.
        It is deliberately not attached to a pipe: nothing reads it, and a
        child blocked on a full pipe would make ``wait()`` hang forever.
        A non-zero exit status is logged but not raised.
        """
        log_path = '%s/%s' % (self.config.experiment_dir, log_name)
        with open(log_path, 'w') as log, open(os.devnull, 'w') as devnull:
            p = subprocess.Popen(args, stdin=stdin,
                                 stdout=devnull if stdout is None else stdout,
                                 stderr=log)
            p.wait()
        if p.returncode != 0:
            logging.warning('%s exited with status %s (see %s)',
                            args[0], p.returncode, log_path)

    def _decoder_args(self):
        """Return a new argument list holding the decoder for the model."""
        if self.config.model == 'phrase':
            return [self.config.moses_decode_phrase]
        if self.config.model == 'hier':
            return [self.config.moses_decode_hier]
        # raised explicitly so the check also survives ``python -O``
        raise AssertionError('unknown model: %r' % (self.config.model,))

    def _ini_path(self, experiment_dir):
        """Return the moses.ini to decode with for the configured run.

        'test' and 'all' runs use the file under ``mert-work``; every other
        run uses the one under ``model``.
        """
        if self.config.run in ('test', 'all'):
            return '%s/mert-work/moses.ini' % experiment_dir
        return '%s/model/moses.ini' % experiment_dir

    # ------------------------------------------------------------------
    # training
    # ------------------------------------------------------------------

    def run_train(self):
        """Run the training script; its stderr goes to ``train.log``."""
        cfg = self.config
        args = [cfg.moses_train,
                '--root-dir', cfg.experiment_dir,
                '--corpus', '%s/%s' % (cfg.experiment_dir, cfg.train_name),
                '--f', cfg.src,
                '--e', cfg.tgt,
                '--lm', '0:3:%s/%s.arpa' % (cfg.experiment_dir, cfg.tgt),
                '--alignment', cfg.symm,
                '-external-bin-dir', cfg.giza]
        if cfg.model == 'hier':
            args += ['-hierarchical', '-glue-grammar']

        logging.info(' '.join(args))
        self._run(args, 'train.log')

    def run_retrain(self):
        """Append the tuning data to the training data and train again.

        The original ``.nl`` / ``.mrl`` training files are kept with a
        ``.notune`` suffix.  The extract files and the phrase or rule table
        from the previous training run are deleted before retraining.
        """
        exp = self.config.experiment_dir
        parts = []
        for ext in ('nl', 'mrl'):
            train = '%s/%s.%s' % (exp, self.config.train_name, ext)
            parts.append((train, '%s.notune' % train,
                          '%s/tune.%s' % (exp, ext)))

        # move both originals aside first, then rebuild each as train + tune
        for train, moved, _ in parts:
            os.rename(train, moved)
        for train, moved, tune in parts:
            with open(train, 'w') as merged:
                subprocess.call(['cat', moved, tune], stdout=merged)

        if self.config.model == 'hier':
            table = 'rule-table.gz'
        else:
            table = 'phrase-table.gz'
        for name in ('extract.inv.gz', 'extract.gz', table):
            os.remove('%s/model/%s' % (exp, name))

        self.run_train()

    # ------------------------------------------------------------------
    # phrase-table filtering
    # ------------------------------------------------------------------

    def parens_ok(self, line):
        """Check the arity suffixes on the second field of a table line.

        ``line`` is split on ``' ||| '`` and the second field is scanned.
        Only tokens whose second-to-last character is ``'@'`` are considered;
        their last character is the arity, with ``'s'`` counted as 0.  Other
        tokens, including tokens shorter than two characters, are ignored.

        Returns False if an arity-0 token completes every open function
        while more arity tokens still follow; otherwise returns True.
        Incomplete prefixes are accepted.  Raises AssertionError on arity
        ``'*'``.
        """
        mrl_part = line.split(' ||| ')[1]
        # the length check keeps one-character tokens from raising IndexError
        tokens = [t[-1] for t in mrl_part.split()
                  if len(t) >= 2 and t[-2] == '@']
        tokens.reverse()
        stack = []
        while tokens:
            t = tokens.pop()
            assert t != '*'
            arity = 0 if t == 's' else int(t)
            if arity > 0:
                stack.append(arity)
                continue
            # a leaf fills one argument slot; close every function it completes
            while stack:
                top = stack.pop()
                if top > 1:
                    stack.append(top - 1)
                    break
            if tokens and not stack:
                return False
        return True

    def filter_phrase_table(self):
        """Rewrite the phrase/rule table keeping lines that pass parens_ok.

        The existing table is renamed to ``<name>-table.old.gz`` and the
        filtered copy is written under the original name.
        """
        table_name = 'phrase' if self.config.model == 'phrase' else 'rule'
        oldname = '%s/model/%s-table.gz' % (self.config.experiment_dir,
                                            table_name)
        newname = '%s/model/%s-table.old.gz' % (self.config.experiment_dir,
                                                table_name)
        os.rename(oldname, newname)

        with gzip.open(oldname, 'w') as filtered_table_f:
            with gzip.open(newname, 'r') as old_table_f:
                for line in old_table_f:
                    if self.parens_ok(line):
                        # lines keep their own newline, so write them as-is
                        filtered_table_f.write(line)

    # ------------------------------------------------------------------
    # tuning and decoding
    # ------------------------------------------------------------------

    def run_tune(self):
        """Run the tuning script from inside the experiment directory.

        stderr goes to ``tune.log``.  The previous working directory is
        restored afterwards, also when running the tool raises.
        """
        cfg = self.config
        wd = os.getcwd()
        os.chdir(cfg.experiment_dir)
        try:
            args = [cfg.moses_tune,
                    '%s/tune.%s' % (cfg.experiment_dir, cfg.src),
                    '%s/tune.%s' % (cfg.experiment_dir, cfg.tgt)]
            if cfg.model == 'hier':
                args += [cfg.moses_decode_hier]
            else:
                args += [cfg.moses_decode_phrase]
            args += ['%s/model/moses.ini' % cfg.experiment_dir,
                     '--mertdir', '%s/bin' % cfg.moses]
            if cfg.model == 'hier':
                args += ['--filtercmd',
                         '%s/scripts/training/filter-model-given-input.pl --Hierarchical'
                         % cfg.moses]
            self._run(args, 'tune.log')
        finally:
            os.chdir(wd)

    def run_decode(self):
        """Decode ``test.<src>`` into ``hyp.<tgt>`` and ``hyp.<tgt>.nbest``.

        stderr goes to ``decode.log``.  Raises AssertionError if the
        configured model is neither 'phrase' nor 'hier'.
        """
        cfg = self.config
        args = self._decoder_args()
        args += ['-f', self._ini_path(cfg.experiment_dir)]
        args += ['-drop-unknown',
                 '-n-best-list',
                 '%s/hyp.%s.nbest' % (cfg.experiment_dir, cfg.tgt),
                 str(cfg.nbest), 'distinct',
                 '-threads', '3']

        in_path = '%s/test.%s' % (cfg.experiment_dir, cfg.src)
        out_path = '%s/hyp.%s' % (cfg.experiment_dir, cfg.tgt)
        with open(in_path) as infile, open(out_path, 'w') as outfile:
            self._run(args, 'decode.log', stdin=infile, stdout=outfile)

    def decode_sentence(self, experiment_dir, sentence, temp_dir):
        """Decode one sentence, leaving the n-best list in ``temp_dir``.

        The sentence is written to ``sent.tmp`` in ``temp_dir`` and fed to
        the decoder on stdin; the n-best list is written to ``nbest.tmp`` in
        the same directory.  The decoder's stdout and stderr are discarded.
        Returns None.  Raises AssertionError if the configured model is
        neither 'phrase' nor 'hier'.
        """
        args = self._decoder_args()
        args += ['-f', self._ini_path(experiment_dir)]
        args += ['-drop-unknown',
                 '-n-best-list', '%s/nbest.tmp' % temp_dir,
                 str(self.config.nbest), 'distinct',
                 '-threads', '1']

        sent_path = '%s/sent.tmp' % temp_dir
        # the ``with`` block guarantees the sentence is flushed and the file
        # closed before the decoder is started on it
        with open(sent_path, 'w') as sent_file:
            sent_file.write('%s\n' % sentence)

        with open(sent_path, 'r') as infile, \
                open(os.devnull, 'w') as nullfile:
            p = subprocess.Popen(args, stdin=infile,
                                 stdout=nullfile, stderr=nullfile)
            p.wait()