summaryrefslogtreecommitdiff
path: root/data
diff options
context:
space:
mode:
authorPatrick Simianer <simianer@cl.uni-heidelberg.de>2014-04-14 16:05:40 +0200
committerPatrick Simianer <simianer@cl.uni-heidelberg.de>2014-04-14 16:05:40 +0200
commitb0bff4f48b2de88560199be09e5a29feecaa267c (patch)
tree448db9b6a6964223c2c5ec59a233885e01d22b74 /data
parent1bf772018b77e68137614a11add9f9f2f43ad344 (diff)
smt-semparse
Diffstat (limited to 'data')
-rw-r--r--data/geoquery/README2
-rw-r--r--data/geoquery/smt-semparse/decode_sentence.py36
-rw-r--r--data/geoquery/smt-semparse/functionalizer.py143
-rw-r--r--data/geoquery/smt-semparse/moses.py173
4 files changed, 354 insertions, 0 deletions
diff --git a/data/geoquery/README b/data/geoquery/README
index 54bdf58..86d6e0a 100644
--- a/data/geoquery/README
+++ b/data/geoquery/README
@@ -14,3 +14,5 @@ split880.train.ids : 880 train/test split train ids
../stopwords.en : English stopwords file
../weights.init : initial weights
+smt-semparse/ : slightly adapted code for Andreas' smt-semparse
+
diff --git a/data/geoquery/smt-semparse/decode_sentence.py b/data/geoquery/smt-semparse/decode_sentence.py
new file mode 100644
index 0000000..1914734
--- /dev/null
+++ b/data/geoquery/smt-semparse/decode_sentence.py
@@ -0,0 +1,36 @@
+import sys
+import os
+import tempfile, shutil
+from src.extractor import Extractor
+from src.smt_semparse_config import SMTSemparseConfig
+from src.moses import Moses
+from src.functionalizer import Functionalizer
+
# Input: an experiment directory and one English sentence (two command-line
# arguments). Output: the functionalized MRL for that sentence on stdout.
def main(argv):
    """Decode a single English sentence and print its functionalized MRL.

    argv -- full argument vector: [program, experiment_dir, sentence].

    Returns the process exit status: 0 on success, 2 when the wrong number
    of arguments was given (a usage line is written to stderr).
    """
    if len(argv) != 3:
        prog = argv[0] if argv else 'decode_sentence.py'
        sys.stderr.write('usage: %s <experiment_dir> <sentence>\n' % prog)
        return 2
    experiment_dir, sentence = argv[1], argv[2]

    # load config
    config = SMTSemparseConfig(
        '/workspace/grounded/smt-semparse-cp/settings.yaml',
        '/workspace/grounded/smt-semparse-cp/dependencies.yaml')

    # preprocess the sentence (the original comment calls this "stem")
    sentence = Extractor(config).preprocess_nl(sentence)

    # the decoder exchanges its input sentence and n-best list through files
    temp_dir = tempfile.mkdtemp()
    try:
        # decode
        Moses(config).decode_sentence(experiment_dir, sentence, temp_dir)

        # convert to bracket structure
        result = Functionalizer(config).run_sentence(experiment_dir, temp_dir)
        sys.stdout.write('%s\n' % result)
    finally:
        # delete tmp files even when decoding or functionalizing fails
        shutil.rmtree(temp_dir)
    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv))
+
diff --git a/data/geoquery/smt-semparse/functionalizer.py b/data/geoquery/smt-semparse/functionalizer.py
new file mode 100644
index 0000000..782b4e5
--- /dev/null
+++ b/data/geoquery/smt-semparse/functionalizer.py
@@ -0,0 +1,143 @@
+import logging
+import util
+import sys
+
class Functionalizer:
    """Convert flat, arity-annotated MRL token strings into bracketed terms.

    N-best lists are read in the ``id ||| hypothesis ||| features ||| score``
    layout; for every sentence the first hypothesis that can be turned into
    a well-formed term is used.
    """

    def __init__(self, config):
        self.config = config

    @staticmethod
    def _read_hypsets(lines):
        """Group n-best lines into one list of (hyp, score) pairs per id.

        A new group is started whenever the leading id differs from the
        previous line's id (ids are compared against 0 for the first line),
        so the returned list always contains at least one, possibly empty,
        group.
        """
        hypsets = []
        hypset = []
        last_eid = 0
        for line in lines:
            parts = line.split('|||')
            eid = int(parts[0])
            if eid != last_eid:
                hypsets.append(hypset)
                hypset = []
                last_eid = eid
            score = parts[2] + ' ||| ' + parts[3].strip()
            hyp = parts[1].strip()
            hypset.append((hyp, score))
        hypsets.append(hypset)
        return hypsets

    def _first_valid(self, hypset):
        """Return (fun, score) for the first functionalizable hypothesis.

        Hypotheses are tried in file order; None is returned when none of
        them yields a non-empty term.
        """
        for hyp, score in hypset:
            fun = self.functionalize(hyp)
            if fun:
                return fun, score
        return None

    def run(self):
        """Functionalize ``hyp.mrl.nbest`` of the experiment into ``hyp.fun``.

        One ``index ||| term ||| score`` line is written per hypothesis
        group that has a valid term; groups without one are skipped but
        still consume an index.
        """
        experiment_dir = self.config.experiment_dir
        with open('%s/hyp.mrl.nbest' % experiment_dir) as hyp_file:
            hypsets = self._read_hypsets(hyp_file)

        with open('%s/hyp.fun' % experiment_dir, 'w') as fun_file:
            for counter, hypset in enumerate(hypsets):
                best = self._first_valid(hypset)
                if best is not None:
                    fun_file.write('%s ||| %s ||| %s\n'
                                   % (counter, best[0], best[1]))

    def run_sentence(self, experiment_dir, temp_dir):
        """Return the best term for the n-best list in ``temp_dir/nbest.tmp``.

        experiment_dir -- unused; kept so existing callers keep working.
        temp_dir       -- directory holding ``nbest.tmp``.

        Returns the first valid term found, or "" when there is none.
        """
        with open('%s/nbest.tmp' % temp_dir, 'r') as hyp_file:
            hypsets = self._read_hypsets(hyp_file)

        for hypset in hypsets:
            best = self._first_valid(hypset)
            if best is not None:
                return best[0]
        return ""

    def functionalize(self, mrl):
        """Turn a whitespace-separated MRL token string into a bracketed term.

        Each token is expected to look like ``name<ARITY_SEP>arity`` where
        the arity is an integer, ``util.ARITY_STR`` (a string constant,
        emitted in single quotes with '_' replaced by ' ') or
        ``util.ARITY_ANY`` (a variable number of arguments). Tokens without
        a separator are logged and treated as string constants.

        Returns the term as a string, or None when the tokens do not form
        exactly one well-formed term or one of the '_' sanity checks at the
        end rejects it.
        """
        stack = []  # outstanding argument counts of the open functions
        r = []      # output fragments, joined at the end
        tokens = list(reversed(mrl.split()))

        while tokens:
            it = tokens.pop()
            if util.ARITY_SEP not in it:
                token = it
                arity = util.ARITY_STR
                logging.warning('unrecognized token: %s', it)
            else:
                # split on the LAST separator only, so a name that itself
                # contains the separator does not break the unpacking
                token, arity = it.rsplit(util.ARITY_SEP, 1)

            arity_str = False
            if arity == util.ARITY_STR:
                arity = 0
                arity_str = True
            elif not (arity == util.ARITY_ANY):
                arity = int(arity)

            if arity == util.ARITY_ANY or arity > 0:
                # function symbol: open a bracket and remember its arity
                r.append(token)
                r.append('(')
                stack.append(arity)
            else:
                assert arity == 0
                if arity_str:
                    r.append("'%s'" % token.replace('_', ' '))
                else:
                    r.append(token)

                # a leaf completes one argument: emit ',' if the innermost
                # open function expects more, otherwise close it and go on
                # with its parent
                while stack:
                    top = stack.pop()
                    if top == util.ARITY_ANY and tokens:
                        r.append(',')
                        stack.append(util.ARITY_ANY)
                        break
                    elif top != util.ARITY_ANY and top > 1:
                        r.append(',')
                        stack.append(top - 1)
                        break
                    else:
                        r.append(')')

                # the term is complete but tokens are left over
                if not stack and tokens:
                    return None

        # some function is still waiting for arguments
        if stack:
            return None

        r = ''.join(r)

        # nasty hacks to fix misplaced _
        if '(_' in r:
            return None
        if ',_' in r and not ('cityid' in r):
            return None
        if '_),_)' in r:
            return None

        return r
diff --git a/data/geoquery/smt-semparse/moses.py b/data/geoquery/smt-semparse/moses.py
new file mode 100644
index 0000000..9a159c3
--- /dev/null
+++ b/data/geoquery/smt-semparse/moses.py
@@ -0,0 +1,173 @@
+import logging
+import os
+import subprocess
+import gzip
+
+from subprocess import Popen, PIPE, STDOUT
+
class Moses:
    """Driver for the external Moses training, tuning and decoding tools.

    Tool locations, the experiment directory and the model settings are all
    read from the ``config`` object passed to the constructor.
    """

    def __init__(self, config):
        self.config = config

    def _run_logged(self, args, log_path):
        """Run ``args`` and wait; stderr goes to ``log_path``, stdout is dropped.

        stdout is sent to the null device rather than an unread pipe, so the
        child cannot block on a full pipe buffer while we wait for it.
        """
        with open(log_path, 'w') as log:
            with open(os.devnull, 'w') as devnull:
                p = subprocess.Popen(args, stdout=devnull, stderr=log)
                p.wait()

    def _decoder_args(self, experiment_dir):
        """Return the decoder binary plus its ``-f <moses.ini>`` arguments.

        Raises ValueError when ``config.model`` is neither 'phrase' nor
        'hier'.
        """
        model = self.config.model
        if model == 'phrase':
            args = [self.config.moses_decode_phrase]
        elif model == 'hier':
            args = [self.config.moses_decode_hier]
        else:
            raise ValueError('unknown model type: %r' % (model,))

        # test/all runs use the ini under mert-work, everything else the one
        # under model
        if self.config.run == 'test' or self.config.run == 'all':
            args += ['-f', '%s/mert-work/moses.ini' % experiment_dir]
        else:
            args += ['-f', '%s/model/moses.ini' % experiment_dir]
        return args

    def run_train(self):
        """Run the Moses training script; stderr goes to ``train.log``."""
        args = [self.config.moses_train,
                '--root-dir', self.config.experiment_dir,
                '--corpus', '%s/%s' % (self.config.experiment_dir,
                                       self.config.train_name),
                '--f', self.config.src,
                '--e', self.config.tgt,
                '--lm', '0:3:%s/%s.arpa' % (self.config.experiment_dir,
                                            self.config.tgt),
                #'-score-options', "'--OnlyDirect --NoPhraseCount'"
                '--alignment', self.config.symm,
                '-external-bin-dir', self.config.giza]
        if self.config.model == 'hier':
            args += ['-hierarchical', '-glue-grammar']

        logging.info(' '.join(args))

        self._run_logged(args, '%s/train.log' % self.config.experiment_dir)

    def run_retrain(self):
        """Retrain on training + tuning data.

        The current training files are renamed to ``*.notune``, new training
        files are built by concatenating them with the tuning files, the
        extract and phrase/rule table files are removed, and training is
        run again.
        """
        old_train_nl = '%s/%s.nl' % (self.config.experiment_dir,
                                     self.config.train_name)
        old_train_mrl = '%s/%s.mrl' % (self.config.experiment_dir,
                                       self.config.train_name)
        moved_train_nl = '%s.notune' % old_train_nl
        moved_train_mrl = '%s.notune' % old_train_mrl
        tune_nl = '%s/tune.nl' % self.config.experiment_dir
        tune_mrl = '%s/tune.mrl' % self.config.experiment_dir
        os.rename(old_train_nl, moved_train_nl)
        os.rename(old_train_mrl, moved_train_mrl)
        with open(old_train_nl, 'w') as rt_train_nl:
            subprocess.call(['cat', moved_train_nl, tune_nl],
                            stdout=rt_train_nl)
        with open(old_train_mrl, 'w') as rt_train_mrl:
            subprocess.call(['cat', moved_train_mrl, tune_mrl],
                            stdout=rt_train_mrl)

        os.remove('%s/model/extract.inv.gz' % self.config.experiment_dir)
        os.remove('%s/model/extract.gz' % self.config.experiment_dir)
        if self.config.model == 'hier':
            os.remove('%s/model/rule-table.gz' % self.config.experiment_dir)
        else:
            os.remove('%s/model/phrase-table.gz' % self.config.experiment_dir)

        self.run_train()

    def parens_ok(self, line):
        """Check that the MRL side of a table line never over-closes.

        The second ' ||| '-separated field is scanned; for every token whose
        second-to-last character is '@', the last character is taken as its
        arity ('s' counts as 0, '*' is not allowed here). Returns False as
        soon as a complete term is followed by further arity-tagged tokens,
        True otherwise (an unfinished term is accepted).
        """
        mrl_part = line.split(' ||| ')[1]
        # the length guard keeps one-character tokens from raising IndexError
        tokens = [t[-1] for t in mrl_part.split()
                  if len(t) >= 2 and t[-2] == '@']
        tokens.reverse()
        stack = []
        while tokens:
            t = tokens.pop()
            assert t != '*'
            if t == 's':
                t = 0
            t = int(t)
            if t > 0:
                stack.append(t)
            else:
                # a leaf fills one argument slot of the innermost open
                # function; functions that become complete are popped
                while stack:
                    top = stack.pop()
                    if top > 1:
                        stack.append(top - 1)
                        break
                if tokens and not stack:
                    return False
        return True

    def filter_phrase_table(self):
        """Drop table entries whose MRL side fails ``parens_ok``.

        The existing table is renamed to ``*-table.old.gz`` and a filtered
        copy is written under the original name.
        """
        table_name = 'phrase' if self.config.model == 'phrase' else 'rule'
        oldname = '%s/model/%s-table.gz' % (self.config.experiment_dir,
                                            table_name)
        newname = '%s/model/%s-table.old.gz' % (self.config.experiment_dir,
                                                table_name)
        os.rename(oldname, newname)

        with gzip.open(oldname, 'w') as filtered_table_f:
            with gzip.open(newname, 'r') as old_table_f:
                for line in old_table_f:
                    if self.parens_ok(line):
                        # lines keep their own newline, write them verbatim
                        filtered_table_f.write(line)

    def run_tune(self):
        """Run the Moses tuning script from inside the experiment directory.

        stderr goes to ``tune.log``; the previous working directory is
        restored afterwards, also when the run fails.
        """
        wd = os.getcwd()
        os.chdir(self.config.experiment_dir)
        try:
            args = [self.config.moses_tune,
                    '%s/tune.%s' % (self.config.experiment_dir,
                                    self.config.src),
                    '%s/tune.%s' % (self.config.experiment_dir,
                                    self.config.tgt)]
            if self.config.model == 'hier':
                args += [self.config.moses_decode_hier]
            else:
                args += [self.config.moses_decode_phrase]
            args += ['%s/model/moses.ini' % self.config.experiment_dir,
                     '--mertdir', '%s/bin' % self.config.moses]
            if self.config.model == 'hier':
                args += ['--filtercmd',
                         '%s/scripts/training/filter-model-given-input.pl --Hierarchical'
                         % self.config.moses]

            self._run_logged(args, '%s/tune.log' % self.config.experiment_dir)
        finally:
            os.chdir(wd)

    def run_decode(self):
        """Decode ``test.<src>`` of the experiment directory.

        The decoder's stdout is written to ``hyp.<tgt>``, its n-best list to
        ``hyp.<tgt>.nbest`` and its stderr to ``decode.log``.

        Raises ValueError for an unknown ``config.model``.
        """
        experiment_dir = self.config.experiment_dir
        args = self._decoder_args(experiment_dir)
        args += ['-drop-unknown',
                 '-n-best-list',
                 '%s/hyp.%s.nbest' % (experiment_dir, self.config.tgt),
                 str(self.config.nbest), 'distinct',
                 '-threads', '3']

        with open('%s/test.%s' % (experiment_dir, self.config.src)) as infile:
            with open('%s/hyp.%s' % (experiment_dir, self.config.tgt),
                      'w') as outfile:
                with open('%s/decode.log' % experiment_dir, 'w') as log:
                    p = subprocess.Popen(args, stdin=infile, stdout=outfile,
                                         stderr=log)
                    p.wait()

    def decode_sentence(self, experiment_dir, sentence, temp_dir):
        """Decode one sentence, leaving the n-best list in ``temp_dir``.

        experiment_dir -- directory holding the trained model.
        sentence       -- input sentence; written to ``temp_dir/sent.tmp``.
        temp_dir       -- scratch directory; the n-best list is written to
                          ``temp_dir/nbest.tmp``.

        Decoder stdout and stderr are discarded. Raises ValueError for an
        unknown ``config.model``.
        """
        args = self._decoder_args(experiment_dir)
        args += ['-drop-unknown',
                 '-n-best-list', '%s/nbest.tmp' % temp_dir,
                 str(self.config.nbest), 'distinct',
                 '-threads', '1']

        sent_path = '%s/sent.tmp' % temp_dir
        # close (and thereby flush) the sentence file before the decoder
        # reads it
        with open(sent_path, 'w') as sent_file:
            sent_file.write('%s\n' % sentence)

        with open(sent_path, 'r') as infile:
            with open(os.devnull, 'w') as nullfile:
                p = subprocess.Popen(args, stdin=infile, stdout=nullfile,
                                     stderr=nullfile)
                p.wait()
        return
+