diff options
-rw-r--r-- | README.md | 39 | ||||
-rw-r--r-- | data/geoquery/README | 3 | ||||
-rw-r--r-- | data/geoquery/smt-semparse/decode_sentence.py | 36 | ||||
-rw-r--r-- | data/geoquery/smt-semparse/functionalizer.py | 143 | ||||
-rw-r--r-- | data/geoquery/smt-semparse/moses.py | 173 | ||||
-rw-r--r-- | data/geoquery/wasp-1.0/eval.pl (renamed from data/geoquery/wasp/eval.pl) | 0 | ||||
-rw-r--r-- | data/geoquery/wasp-1.0/geoquery.pl (renamed from data/geoquery/wasp/geoquery.pl) | 0 | ||||
-rw-r--r-- | example/cfg.rb | 7 | ||||
-rwxr-xr-x | example/run.sh (renamed from example/example.sh) | 10 | ||||
-rw-r--r-- | hopefear.rb | 18 | ||||
-rwxr-xr-x | rebol.rb | 10 |
11 files changed, 61 insertions, 378 deletions
@@ -1,8 +1,41 @@ rebol ===== -code for grounded SMT +Code for grounded SMT on geoquery data. +(This has nothing to do with the programming language REBOL! http://www.rebol.com/ ) -This has nothing to do with the programming language REBOL -http://www.rebol.com/ + +Dependencies +------------ + +*WASP*-1.0 includes the geoquery knowledge base and scripts for querying it. +The evaluation scripts were slightly modified to include the full output. +These scripts are in data/geoquery/wasp-1.0/, they go into wasp-1.0/data/geo-funql/eval/. +WASP-1.0 can be downloaded from here [1]. + +You'll also need some *Prolog* environment, e.g. SWI-Prolog [2]. + +We use a slightly modified implementation of *smt-semparse*, +as described in 'Semantic parsing as machine translation' (Andreas et al, ACL 2013). +Our fork can be found here [3]. This depends on more stuff, e.g. the Moses decoder +and SRILM. + +For translation we use the *cdec* toolkit, [4]. + +As semantic parsing is quite slow and rebol does it quite often, +results are cached with *memcached* [5]. 
+ +You'll also need the following *ruby gems*: + * https://rubygems.org/gems/memcached + * http://rubygems.org/gems/nlp_ruby + * http://trollop.rubyforge.org/ + + + +--- +[1] http://www.cs.utexas.edu/~ml/wasp/wasp-1.0.tar.bz2 +[2] http://www.swi-prolog.org/ +[3] https://github.com/pks/smt-semparse +[4] https://github.com/redpony/cdec +[5] http://memcached.org/ diff --git a/data/geoquery/README b/data/geoquery/README index 8e147fa..05067ee 100644 --- a/data/geoquery/README +++ b/data/geoquery/README @@ -13,6 +13,5 @@ split880.train.ids : 880 train/test split train ids ../stopwords.en : English stopwords file ../weights.init : initial weights -smt-semparse/ : slightly adapted code for Andreas' smt-semparse -wasp/ : adapted stuff from wasp-1.0 +wasp-1.0/ : modified stuff from wasp-1.0 diff --git a/data/geoquery/smt-semparse/decode_sentence.py b/data/geoquery/smt-semparse/decode_sentence.py deleted file mode 100644 index 1914734..0000000 --- a/data/geoquery/smt-semparse/decode_sentence.py +++ /dev/null @@ -1,36 +0,0 @@ -import sys
-import os
-import tempfile, shutil
-from src.extractor import Extractor
-from src.smt_semparse_config import SMTSemparseConfig
-from src.moses import Moses
-from src.functionalizer import Functionalizer
-
-#input: English sentence
-if __name__ == '__main__':
- sentence = ''
- if len(sys.argv) == 3:
- experiment_dir = sys.argv[1]
- sentence = sys.argv[2]
- else:
- assert False
-
- # load config
- config = SMTSemparseConfig('/workspace/grounded/smt-semparse-cp/settings.yaml', '/workspace/grounded/smt-semparse-cp/dependencies.yaml')
-
- #stem
- sentence = Extractor(config).preprocess_nl(sentence)
-
- # we need a temp dir!
- temp_dir = tempfile.mkdtemp()
-
- #decode
- moses = Moses(config)
- moses.decode_sentence(experiment_dir, sentence, temp_dir)
-
- #convert to bracket structure
- print Functionalizer(config).run_sentence(experiment_dir, temp_dir)
-
- #delete tmp files
- shutil.rmtree(temp_dir)
-
diff --git a/data/geoquery/smt-semparse/functionalizer.py b/data/geoquery/smt-semparse/functionalizer.py deleted file mode 100644 index 782b4e5..0000000 --- a/data/geoquery/smt-semparse/functionalizer.py +++ /dev/null @@ -1,143 +0,0 @@ -import logging -import util -import sys - -class Functionalizer: - - def __init__(self, config): - self.config = config - - def run(self): - hyp_file = open('%s/hyp.mrl.nbest' % self.config.experiment_dir) - fun_file = open('%s/hyp.fun' % self.config.experiment_dir, 'w') - - hypsets = [] - hypset = [] - last_eid = 0 - for line in hyp_file: - parts = line.split('|||') - eid = int(parts[0]) - if eid != last_eid: - hypsets.append(hypset) - hypset = [] - last_eid = eid - score = parts[2] + ' ||| ' + parts[3].strip() - hyp = parts[1].strip() - hypset.append((hyp,score)) - hypsets.append(hypset) - - counter = 0 - for hypset in hypsets: - hypset = list(reversed(hypset)) - while hypset: - hyp, score = hypset.pop() - fun = self.functionalize(hyp) - if fun: - print >>fun_file, counter, '|||', fun, '|||', score - break - counter += 1 - - def run_sentence(self, experiment_dir, temp_dir): - hyp_file = open('%s/nbest.tmp' % temp_dir, 'r') - - hypsets = [] - hypset = [] - last_eid = 0 - for line in hyp_file: - parts = line.split('|||') - eid = int(parts[0]) - if eid != last_eid: - hypsets.append(hypset) - hypset = [] - last_eid = eid - score = parts[2] + ' ||| ' + parts[3].strip() - hyp = parts[1].strip() - hypset.append((hyp,score)) - hypsets.append(hypset) - hyp_file.close() - - counter = 0 - for hypset in hypsets: - hypset = list(reversed(hypset)) - while hypset: - hyp, score = hypset.pop() - fun = self.functionalize(hyp) - if fun: - return fun - break - counter += 1 - return "" - - #xc = 0 - def functionalize(self, mrl): - - #if '_@0' in mrl and 'cityid@2' in mrl: - # #print '===' - # #print mrl - # self.xc += 1 - # if self.xc > 5: - # exit() - - stack = [] - r = [] - tokens = list(reversed(mrl.split())) - - #print tokens - - while tokens: - 
it = tokens.pop() - #print it - if util.ARITY_SEP not in it: - token = it - arity = util.ARITY_STR - logging.warn('unrecognized token: %s', it) - else: - token, arity = it.rsplit(util.ARITY_SEP) - if arity == util.ARITY_STR: - arity = 0 - arity_str = True - elif not (arity == util.ARITY_ANY): - arity = int(arity) - arity_str = False - - if arity == util.ARITY_ANY or arity > 0: - r.append(token) - r.append('(') - stack.append(arity) - else: - assert arity == 0 - if arity_str: - r.append("'%s'" % token.replace('_', ' ')) - else: - r.append(token) - #print r - while stack: - top = stack.pop() - if top == util.ARITY_ANY and tokens: - r.append(',') - stack.append(util.ARITY_ANY) - break - elif top != util.ARITY_ANY and top > 1: - r.append(',') - stack.append(top - 1) - break - else: - r.append(')') - - if not stack and tokens: - return None - - if stack: - return None - - r = ''.join(r) - - # nasty hacks to fix misplaced _ - if '(_' in r: - return None - if ',_' in r and not ('cityid' in r): - return None - if '_),_)' in r: - return None - - return r diff --git a/data/geoquery/smt-semparse/moses.py b/data/geoquery/smt-semparse/moses.py deleted file mode 100644 index 9a159c3..0000000 --- a/data/geoquery/smt-semparse/moses.py +++ /dev/null @@ -1,173 +0,0 @@ -import logging -import os -import subprocess -import gzip - -from subprocess import Popen, PIPE, STDOUT - -class Moses: - - def __init__(self, config): - self.config = config - - def run_train(self): - args = [self.config.moses_train, - '--root-dir', self.config.experiment_dir, - '--corpus', '%s/%s' % (self.config.experiment_dir, - self.config.train_name), - '--f', self.config.src, - '--e', self.config.tgt, - '--lm', '0:3:%s/%s.arpa' % (self.config.experiment_dir, self.config.tgt), - #'-score-options', "'--OnlyDirect --NoPhraseCount'" - '--alignment', self.config.symm, - '-external-bin-dir', self.config.giza] - if self.config.model == 'hier': - args += ['-hierarchical', '-glue-grammar'] - - logging.info(' 
'.join(args)) - - log = open('%s/train.log' % self.config.experiment_dir, 'w') - p = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=log) - p.wait() - log.close() - - def run_retrain(self): - old_train_nl = '%s/%s.nl' % (self.config.experiment_dir, - self.config.train_name) - old_train_mrl = '%s/%s.mrl' % (self.config.experiment_dir, - self.config.train_name) - moved_train_nl = '%s.notune' % old_train_nl - moved_train_mrl = '%s.notune' % old_train_mrl - tune_nl = '%s/tune.nl' % self.config.experiment_dir - tune_mrl = '%s/tune.mrl' % self.config.experiment_dir - os.rename(old_train_nl, moved_train_nl) - os.rename(old_train_mrl, moved_train_mrl) - with open(old_train_nl, 'w') as rt_train_nl: - subprocess.call(['cat', moved_train_nl, tune_nl], stdout=rt_train_nl) - with open(old_train_mrl, 'w') as rt_train_mrl: - subprocess.call(['cat', moved_train_mrl, tune_mrl], stdout=rt_train_mrl) - - os.remove('%s/model/extract.inv.gz' % self.config.experiment_dir) - os.remove('%s/model/extract.gz' % self.config.experiment_dir) - if self.config.model == 'hier': - os.remove('%s/model/rule-table.gz' % self.config.experiment_dir) - else: - os.remove('%s/model/phrase-table.gz' % self.config.experiment_dir) - - self.run_train() - - def parens_ok(self, line): - mrl_part = line.split(' ||| ')[1] - tokens = [t[-1] for t in mrl_part.split() if t[-2] == '@'] - tokens.reverse() - stack = [] - while tokens: - t = tokens.pop() - assert t != '*' - if t == 's': - t = 0 - t = int(t) - if t > 0: - stack.append(t) - else: - while stack: - top = stack.pop() - if top > 1: - stack.append(top - 1) - break - if tokens and not stack: - return False - return True - - def filter_phrase_table(self): - table_name = 'phrase' if self.config.model == 'phrase' else 'rule' - oldname = '%s/model/%s-table.gz' % (self.config.experiment_dir, table_name) - newname = '%s/model/%s-table.old.gz' % (self.config.experiment_dir, table_name) - os.rename(oldname, newname) - - with gzip.open(oldname, 'w') as 
filtered_table_f: - with gzip.open(newname, 'r') as old_table_f: - for line in old_table_f: - if self.parens_ok(line): - print >>filtered_table_f, line, - - def run_tune(self): - wd = os.getcwd() - os.chdir(self.config.experiment_dir) - args = [self.config.moses_tune, - '%s/tune.%s' % (self.config.experiment_dir, self.config.src), - '%s/tune.%s' % (self.config.experiment_dir, self.config.tgt)] - if self.config.model == 'hier': - args += [self.config.moses_decode_hier] - else: - args += [self.config.moses_decode_phrase] - args += ['%s/model/moses.ini' % self.config.experiment_dir, - '--mertdir', '%s/bin' % self.config.moses] - if self.config.model == 'hier': - args += ['--filtercmd', - '%s/scripts/training/filter-model-given-input.pl --Hierarchical'\ - % self.config.moses] - - log = open('%s/tune.log' % self.config.experiment_dir, 'w') - p = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=log) - p.wait() - log.close() - os.chdir(wd) - - def run_decode(self): - if self.config.model == 'phrase': - args = [self.config.moses_decode_phrase] - elif self.config.model == 'hier': - args = [self.config.moses_decode_hier] - else: - assert False - - if self.config.run == 'test' or self.config.run == 'all': - args += ['-f', '%s/mert-work/moses.ini' % self.config.experiment_dir] - else: - args += ['-f', '%s/model/moses.ini' % self.config.experiment_dir] - #args += ['-f', '%s/model/moses.ini' % self.config.experiment_dir] - - args += ['-drop-unknown', - '-n-best-list', '%s/hyp.%s.nbest' % (self.config.experiment_dir, self.config.tgt), - str(self.config.nbest), 'distinct', - '-threads', '3'] - - #nullfile = open(os.devnull, 'w') - infile = open('%s/test.%s' % (self.config.experiment_dir, self.config.src)) - outfile = open('%s/hyp.%s' % (self.config.experiment_dir, self.config.tgt), 'w') - log = open('%s/decode.log' % self.config.experiment_dir, 'w') - p = subprocess.Popen(args, stdin=infile, stdout=outfile, stderr=log) - p.wait() - infile.close() - log.close() - 
outfile.close() - - def decode_sentence(self, experiment_dir, sentence, temp_dir): - if self.config.model == 'phrase': - args = [self.config.moses_decode_phrase] - elif self.config.model == 'hier': - args = [self.config.moses_decode_hier] - else: - assert False - - if self.config.run == 'test' or self.config.run == 'all': - args += ['-f', '%s/mert-work/moses.ini' % experiment_dir] - else: - args += ['-f', '%s/model/moses.ini' % experiment_dir] - - args += ['-drop-unknown', - '-n-best-list', '%s/nbest.tmp' % temp_dir, - str(self.config.nbest), 'distinct', - '-threads', '1'] - - infile = open('%s/sent.tmp' % temp_dir, 'w') - print >>infile, sentence - infile.close - infile = open('%s/sent.tmp' % temp_dir, 'r') - nullfile = open(os.devnull, 'w') - p = subprocess.Popen(args, stdin=infile, stdout=nullfile, stderr=nullfile) - p.wait() - infile.close() - return - diff --git a/data/geoquery/wasp/eval.pl b/data/geoquery/wasp-1.0/eval.pl index e00a067..e00a067 100644 --- a/data/geoquery/wasp/eval.pl +++ b/data/geoquery/wasp-1.0/eval.pl diff --git a/data/geoquery/wasp/geoquery.pl b/data/geoquery/wasp-1.0/geoquery.pl index 5d5d9bc..5d5d9bc 100644 --- a/data/geoquery/wasp/geoquery.pl +++ b/data/geoquery/wasp-1.0/geoquery.pl diff --git a/example/cfg.rb b/example/cfg.rb index fbf8e7f..94fef2e 100644 --- a/example/cfg.rb +++ b/example/cfg.rb @@ -1,7 +1,8 @@ -SMT_SEMPARSE = 'python /workspace/grounded/smt-semparse-cp/decode_sentence.py /workspace/grounded/smt-semparse-cp/working/full_dataset' -EVAL_PL = '/workspace/grounded/wasp-1.0/data/geo-funql/eval/eval.pl' +_PATH = '/workspace/grounded/test' +SMT_SEMPARSE = "python #{_PATH}/smt-semparse/decode_sentence.py /workspace/grounded/test/smt-semparse/work/full_dataset" +EVAL_PL = "#{_PATH}/wasp-1.0/data/geo-funql/eval/eval.pl" ACCEPT_ZOMBIES = true TIMEOUT = 60 -CDEC_BIN = '/toolbox/cdec-dtrain/decoder/cdec' +CDEC_BIN = '/toolbox/cdec/decoder/cdec' $cache = Memcached.new('localhost:31337') diff --git a/example/example.sh 
b/example/run.sh index b359dfd..fba3931 100755 --- a/example/example.sh +++ b/example/run.sh @@ -1,8 +1,10 @@ #!/bin/bash -# memcached has to be running! `memcached -p 31337` +# memcached has to be running! +#memcached -p 31337 + +CDEC=/toolbox/cdec -# run rebol with rampion variant for 1 epoch over 10 examples (data.*) ../rebol.rb \ -k 100 \ -i $(pwd)/data.in \ @@ -17,10 +19,10 @@ -l \ -e 0.01 \ -j 1 \ - -v rampion 2>output.stderr > output.stdout + -v rebol 2>output.stderr > output.stdout # translate test -/toolbox/cdec-dtrain/decoder/cdec \ +$CDEC/decoder/cdec \ -c cdec.ini \ -w output-weights 2>/dev/null \ < data.in \ diff --git a/hopefear.rb b/hopefear.rb index aed0c9c..93534b6 100644 --- a/hopefear.rb +++ b/hopefear.rb @@ -2,11 +2,11 @@ def hope_and_fear kbest, action max = -1.0/0 max_idx = -1 kbest.each_with_index { |k,i| - if action=='hope' && k.scores[:decoder] + k.scores[:psb] > max - max_idx = i; max = k.scores[:decoder] + k.scores[:psb] + if action=='hope' && k.scores[:decoder] + k.scores[:per_sentence_bleu] > max + max_idx = i; max = k.scores[:decoder] + k.scores[:per_sentence_bleu] end - if action=='fear' && k.scores[:decoder] - k.scores[:psb] > max - max_idx = i; max = k.scores[:decoder] - k.scores[:psb] + if action=='fear' && k.scores[:decoder] - k.scores[:per_sentence_bleu] > max + max_idx = i; max = k.scores[:decoder] - k.scores[:per_sentence_bleu] end } return max_idx @@ -18,10 +18,10 @@ def gethopefear_rebol kbest, feedback, gold, max, own_reference=nil if feedback == true # hope hope = kbest[0] - new_reference = hope - kbest.each { |k| k.scores[:psb] = BLEU::per_sentence_bleu k.s, new_reference } + new_reference = hope.s + kbest.each { |k| k.scores[:per_sentence_bleu] = BLEU::per_sentence_bleu k.s, new_reference } # fear - kbest.sort_by { |k| -(k.scores[:model]-k.score[:psb]) }.each_with_index { |k,i| + kbest.sort_by { |k| -(k.scores[:decoder]-k.scores[:per_sentence_bleu]) }.each_with_index { |k,i| break if i==max if !exec(k.s, gold, true)[0] 
fear = k @@ -33,7 +33,7 @@ def gethopefear_rebol kbest, feedback, gold, max, own_reference=nil # fear fear = kbest[0] # hope - kbest.sort_by { |k| -(k.scores[:model]+k.score[:psb]) }.each_with_index { |k,i| + kbest.sort_by { |k| -(k.scores[:decoder]+k.scores[:per_sentence_bleu]) }.each_with_index { |k,i| break if i==max if exec(k.s, gold, true)[0] hope = k @@ -67,7 +67,7 @@ def gethopefear_exec kbest, feedback, gold, max, own_reference=nil type1 = type2 = false if feedback == true hope = kbest[0] - new_reference = hope + new_reference = hope.s type1 = true elsif own_reference hope = own_reference @@ -185,7 +185,7 @@ def main end # get per-sentence BLEU scores - kbest.each { |k| k.scores[:psb] = BLEU::per_sentence_bleu k.s, references[j] } + kbest.each { |k| k.scores[:per_sentence_bleu] = BLEU::per_sentence_bleu k.s, references[j] } # map decoder scores to [0,1] adjust_model_scores kbest, cfg[:scale_model] @@ -224,7 +224,7 @@ def main end if new_reference - own_references[j] = new_reference.s + own_references[j] = new_reference if new_reference!=references[j] end type1_updates+=1 if type1 @@ -304,11 +304,11 @@ def main eos - STDERR.write "<<< #{own_references.size} OWN REFERENCES" + STDERR.write "<<< #{own_references.reject{|i|!i}.size} OWN REFERENCES\n" own_references.each_with_index { |i,j| - STDERR.write "#{j} '#{i}'" if i + STDERR.write "#{j} '#{i}'\n" if i } - STDERR.write ">>>" + STDERR.write ">>>\n" } end |