summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPatrick Simianer <simianer@cl.uni-heidelberg.de>2014-04-25 18:21:11 +0200
committerPatrick Simianer <simianer@cl.uni-heidelberg.de>2014-04-25 18:21:11 +0200
commit5f04f17907ce37a8e8dd6cff9cb6dc026b30b8f6 (patch)
treeefebc9d2ff2a6cda1bb0c1da60bcbfc9b001ad3d
parent4e3260df76571ee3be531a6d7c0c1b5c93a056a4 (diff)
better readme, example and code
-rw-r--r--README.md39
-rw-r--r--data/geoquery/README3
-rw-r--r--data/geoquery/smt-semparse/decode_sentence.py36
-rw-r--r--data/geoquery/smt-semparse/functionalizer.py143
-rw-r--r--data/geoquery/smt-semparse/moses.py173
-rw-r--r--data/geoquery/wasp-1.0/eval.pl (renamed from data/geoquery/wasp/eval.pl)0
-rw-r--r--data/geoquery/wasp-1.0/geoquery.pl (renamed from data/geoquery/wasp/geoquery.pl)0
-rw-r--r--example/cfg.rb7
-rwxr-xr-xexample/run.sh (renamed from example/example.sh)10
-rw-r--r--hopefear.rb18
-rwxr-xr-xrebol.rb10
11 files changed, 61 insertions, 378 deletions
diff --git a/README.md b/README.md
index f43f61d..182195e 100644
--- a/README.md
+++ b/README.md
@@ -1,8 +1,41 @@
rebol
=====
-code for grounded SMT
+Code for grounded SMT on geoquery data.
+(This has nothing to do with the programming language REBOL! http://www.rebol.com/ )
-This has nothing to do with the programming language REBOL
-http://www.rebol.com/
+
+Dependencies
+------------
+
+*WASP*-1.0 includes the geoquery knowledge base and scripts for querying it.
+The evaluation scripts were slightly modified to include the full output.
+These scripts are in data/geoquery/wasp-1.0/; they go into wasp-1.0/data/geo-funql/eval/.
+WASP-1.0 can be downloaded from here [1].
+
+You'll also need some *Prolog* environment, e.g. SWI-Prolog [2].
+
+We use a slightly modified implementation of *smt-semparse*,
+as described in 'Semantic parsing as machine translation' (Andreas et al., ACL 2013).
+Our fork can be found here [3]. This depends on more stuff, e.g. the Moses decoder
+and SRILM.
+
+For translation we use the *cdec* toolkit [4].
+
+As semantic parsing is quite slow and rebol does it quite often,
+results are cached with *memcached* [5].
+
+You'll also need the following *ruby gems*:
+ * https://rubygems.org/gems/memcached
+ * http://rubygems.org/gems/nlp_ruby
+ * http://trollop.rubyforge.org/
+
+
+
+---
+[1] http://www.cs.utexas.edu/~ml/wasp/wasp-1.0.tar.bz2
+[2] http://www.swi-prolog.org/
+[3] https://github.com/pks/smt-semparse
+[4] https://github.com/redpony/cdec
+[5] http://memcached.org/
diff --git a/data/geoquery/README b/data/geoquery/README
index 8e147fa..05067ee 100644
--- a/data/geoquery/README
+++ b/data/geoquery/README
@@ -13,6 +13,5 @@ split880.train.ids : 880 train/test split train ids
../stopwords.en : English stopwords file
../weights.init : initial weights
-smt-semparse/ : slightly adapted code for Andreas' smt-semparse
-wasp/ : adapted stuff from wasp-1.0
+wasp-1.0/ : modified stuff from wasp-1.0
diff --git a/data/geoquery/smt-semparse/decode_sentence.py b/data/geoquery/smt-semparse/decode_sentence.py
deleted file mode 100644
index 1914734..0000000
--- a/data/geoquery/smt-semparse/decode_sentence.py
+++ /dev/null
@@ -1,36 +0,0 @@
-import sys
-import os
-import tempfile, shutil
-from src.extractor import Extractor
-from src.smt_semparse_config import SMTSemparseConfig
-from src.moses import Moses
-from src.functionalizer import Functionalizer
-
-#input: English sentence
-if __name__ == '__main__':
- sentence = ''
- if len(sys.argv) == 3:
- experiment_dir = sys.argv[1]
- sentence = sys.argv[2]
- else:
- assert False
-
- # load config
- config = SMTSemparseConfig('/workspace/grounded/smt-semparse-cp/settings.yaml', '/workspace/grounded/smt-semparse-cp/dependencies.yaml')
-
- #stem
- sentence = Extractor(config).preprocess_nl(sentence)
-
- # we need a temp dir!
- temp_dir = tempfile.mkdtemp()
-
- #decode
- moses = Moses(config)
- moses.decode_sentence(experiment_dir, sentence, temp_dir)
-
- #convert to bracket structure
- print Functionalizer(config).run_sentence(experiment_dir, temp_dir)
-
- #delete tmp files
- shutil.rmtree(temp_dir)
-
diff --git a/data/geoquery/smt-semparse/functionalizer.py b/data/geoquery/smt-semparse/functionalizer.py
deleted file mode 100644
index 782b4e5..0000000
--- a/data/geoquery/smt-semparse/functionalizer.py
+++ /dev/null
@@ -1,143 +0,0 @@
-import logging
-import util
-import sys
-
-class Functionalizer:
-
- def __init__(self, config):
- self.config = config
-
- def run(self):
- hyp_file = open('%s/hyp.mrl.nbest' % self.config.experiment_dir)
- fun_file = open('%s/hyp.fun' % self.config.experiment_dir, 'w')
-
- hypsets = []
- hypset = []
- last_eid = 0
- for line in hyp_file:
- parts = line.split('|||')
- eid = int(parts[0])
- if eid != last_eid:
- hypsets.append(hypset)
- hypset = []
- last_eid = eid
- score = parts[2] + ' ||| ' + parts[3].strip()
- hyp = parts[1].strip()
- hypset.append((hyp,score))
- hypsets.append(hypset)
-
- counter = 0
- for hypset in hypsets:
- hypset = list(reversed(hypset))
- while hypset:
- hyp, score = hypset.pop()
- fun = self.functionalize(hyp)
- if fun:
- print >>fun_file, counter, '|||', fun, '|||', score
- break
- counter += 1
-
- def run_sentence(self, experiment_dir, temp_dir):
- hyp_file = open('%s/nbest.tmp' % temp_dir, 'r')
-
- hypsets = []
- hypset = []
- last_eid = 0
- for line in hyp_file:
- parts = line.split('|||')
- eid = int(parts[0])
- if eid != last_eid:
- hypsets.append(hypset)
- hypset = []
- last_eid = eid
- score = parts[2] + ' ||| ' + parts[3].strip()
- hyp = parts[1].strip()
- hypset.append((hyp,score))
- hypsets.append(hypset)
- hyp_file.close()
-
- counter = 0
- for hypset in hypsets:
- hypset = list(reversed(hypset))
- while hypset:
- hyp, score = hypset.pop()
- fun = self.functionalize(hyp)
- if fun:
- return fun
- break
- counter += 1
- return ""
-
- #xc = 0
- def functionalize(self, mrl):
-
- #if '_@0' in mrl and 'cityid@2' in mrl:
- # #print '==='
- # #print mrl
- # self.xc += 1
- # if self.xc > 5:
- # exit()
-
- stack = []
- r = []
- tokens = list(reversed(mrl.split()))
-
- #print tokens
-
- while tokens:
- it = tokens.pop()
- #print it
- if util.ARITY_SEP not in it:
- token = it
- arity = util.ARITY_STR
- logging.warn('unrecognized token: %s', it)
- else:
- token, arity = it.rsplit(util.ARITY_SEP)
- if arity == util.ARITY_STR:
- arity = 0
- arity_str = True
- elif not (arity == util.ARITY_ANY):
- arity = int(arity)
- arity_str = False
-
- if arity == util.ARITY_ANY or arity > 0:
- r.append(token)
- r.append('(')
- stack.append(arity)
- else:
- assert arity == 0
- if arity_str:
- r.append("'%s'" % token.replace('_', ' '))
- else:
- r.append(token)
- #print r
- while stack:
- top = stack.pop()
- if top == util.ARITY_ANY and tokens:
- r.append(',')
- stack.append(util.ARITY_ANY)
- break
- elif top != util.ARITY_ANY and top > 1:
- r.append(',')
- stack.append(top - 1)
- break
- else:
- r.append(')')
-
- if not stack and tokens:
- return None
-
- if stack:
- return None
-
- r = ''.join(r)
-
- # nasty hacks to fix misplaced _
- if '(_' in r:
- return None
- if ',_' in r and not ('cityid' in r):
- return None
- if '_),_)' in r:
- return None
-
- return r
diff --git a/data/geoquery/smt-semparse/moses.py b/data/geoquery/smt-semparse/moses.py
deleted file mode 100644
index 9a159c3..0000000
--- a/data/geoquery/smt-semparse/moses.py
+++ /dev/null
@@ -1,173 +0,0 @@
-import logging
-import os
-import subprocess
-import gzip
-
-from subprocess import Popen, PIPE, STDOUT
-
-class Moses:
-
- def __init__(self, config):
- self.config = config
-
- def run_train(self):
- args = [self.config.moses_train,
- '--root-dir', self.config.experiment_dir,
- '--corpus', '%s/%s' % (self.config.experiment_dir,
- self.config.train_name),
- '--f', self.config.src,
- '--e', self.config.tgt,
- '--lm', '0:3:%s/%s.arpa' % (self.config.experiment_dir, self.config.tgt),
- #'-score-options', "'--OnlyDirect --NoPhraseCount'"
- '--alignment', self.config.symm,
- '-external-bin-dir', self.config.giza]
- if self.config.model == 'hier':
- args += ['-hierarchical', '-glue-grammar']
-
- logging.info(' '.join(args))
-
- log = open('%s/train.log' % self.config.experiment_dir, 'w')
- p = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=log)
- p.wait()
- log.close()
-
- def run_retrain(self):
- old_train_nl = '%s/%s.nl' % (self.config.experiment_dir,
- self.config.train_name)
- old_train_mrl = '%s/%s.mrl' % (self.config.experiment_dir,
- self.config.train_name)
- moved_train_nl = '%s.notune' % old_train_nl
- moved_train_mrl = '%s.notune' % old_train_mrl
- tune_nl = '%s/tune.nl' % self.config.experiment_dir
- tune_mrl = '%s/tune.mrl' % self.config.experiment_dir
- os.rename(old_train_nl, moved_train_nl)
- os.rename(old_train_mrl, moved_train_mrl)
- with open(old_train_nl, 'w') as rt_train_nl:
- subprocess.call(['cat', moved_train_nl, tune_nl], stdout=rt_train_nl)
- with open(old_train_mrl, 'w') as rt_train_mrl:
- subprocess.call(['cat', moved_train_mrl, tune_mrl], stdout=rt_train_mrl)
-
- os.remove('%s/model/extract.inv.gz' % self.config.experiment_dir)
- os.remove('%s/model/extract.gz' % self.config.experiment_dir)
- if self.config.model == 'hier':
- os.remove('%s/model/rule-table.gz' % self.config.experiment_dir)
- else:
- os.remove('%s/model/phrase-table.gz' % self.config.experiment_dir)
-
- self.run_train()
-
- def parens_ok(self, line):
- mrl_part = line.split(' ||| ')[1]
- tokens = [t[-1] for t in mrl_part.split() if t[-2] == '@']
- tokens.reverse()
- stack = []
- while tokens:
- t = tokens.pop()
- assert t != '*'
- if t == 's':
- t = 0
- t = int(t)
- if t > 0:
- stack.append(t)
- else:
- while stack:
- top = stack.pop()
- if top > 1:
- stack.append(top - 1)
- break
- if tokens and not stack:
- return False
- return True
-
- def filter_phrase_table(self):
- table_name = 'phrase' if self.config.model == 'phrase' else 'rule'
- oldname = '%s/model/%s-table.gz' % (self.config.experiment_dir, table_name)
- newname = '%s/model/%s-table.old.gz' % (self.config.experiment_dir, table_name)
- os.rename(oldname, newname)
-
- with gzip.open(oldname, 'w') as filtered_table_f:
- with gzip.open(newname, 'r') as old_table_f:
- for line in old_table_f:
- if self.parens_ok(line):
- print >>filtered_table_f, line,
-
- def run_tune(self):
- wd = os.getcwd()
- os.chdir(self.config.experiment_dir)
- args = [self.config.moses_tune,
- '%s/tune.%s' % (self.config.experiment_dir, self.config.src),
- '%s/tune.%s' % (self.config.experiment_dir, self.config.tgt)]
- if self.config.model == 'hier':
- args += [self.config.moses_decode_hier]
- else:
- args += [self.config.moses_decode_phrase]
- args += ['%s/model/moses.ini' % self.config.experiment_dir,
- '--mertdir', '%s/bin' % self.config.moses]
- if self.config.model == 'hier':
- args += ['--filtercmd',
- '%s/scripts/training/filter-model-given-input.pl --Hierarchical'\
- % self.config.moses]
-
- log = open('%s/tune.log' % self.config.experiment_dir, 'w')
- p = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=log)
- p.wait()
- log.close()
- os.chdir(wd)
-
- def run_decode(self):
- if self.config.model == 'phrase':
- args = [self.config.moses_decode_phrase]
- elif self.config.model == 'hier':
- args = [self.config.moses_decode_hier]
- else:
- assert False
-
- if self.config.run == 'test' or self.config.run == 'all':
- args += ['-f', '%s/mert-work/moses.ini' % self.config.experiment_dir]
- else:
- args += ['-f', '%s/model/moses.ini' % self.config.experiment_dir]
- #args += ['-f', '%s/model/moses.ini' % self.config.experiment_dir]
-
- args += ['-drop-unknown',
- '-n-best-list', '%s/hyp.%s.nbest' % (self.config.experiment_dir, self.config.tgt),
- str(self.config.nbest), 'distinct',
- '-threads', '3']
-
- #nullfile = open(os.devnull, 'w')
- infile = open('%s/test.%s' % (self.config.experiment_dir, self.config.src))
- outfile = open('%s/hyp.%s' % (self.config.experiment_dir, self.config.tgt), 'w')
- log = open('%s/decode.log' % self.config.experiment_dir, 'w')
- p = subprocess.Popen(args, stdin=infile, stdout=outfile, stderr=log)
- p.wait()
- infile.close()
- log.close()
- outfile.close()
-
- def decode_sentence(self, experiment_dir, sentence, temp_dir):
- if self.config.model == 'phrase':
- args = [self.config.moses_decode_phrase]
- elif self.config.model == 'hier':
- args = [self.config.moses_decode_hier]
- else:
- assert False
-
- if self.config.run == 'test' or self.config.run == 'all':
- args += ['-f', '%s/mert-work/moses.ini' % experiment_dir]
- else:
- args += ['-f', '%s/model/moses.ini' % experiment_dir]
-
- args += ['-drop-unknown',
- '-n-best-list', '%s/nbest.tmp' % temp_dir,
- str(self.config.nbest), 'distinct',
- '-threads', '1']
-
- infile = open('%s/sent.tmp' % temp_dir, 'w')
- print >>infile, sentence
- infile.close
- infile = open('%s/sent.tmp' % temp_dir, 'r')
- nullfile = open(os.devnull, 'w')
- p = subprocess.Popen(args, stdin=infile, stdout=nullfile, stderr=nullfile)
- p.wait()
- infile.close()
- return
-
diff --git a/data/geoquery/wasp/eval.pl b/data/geoquery/wasp-1.0/eval.pl
index e00a067..e00a067 100644
--- a/data/geoquery/wasp/eval.pl
+++ b/data/geoquery/wasp-1.0/eval.pl
diff --git a/data/geoquery/wasp/geoquery.pl b/data/geoquery/wasp-1.0/geoquery.pl
index 5d5d9bc..5d5d9bc 100644
--- a/data/geoquery/wasp/geoquery.pl
+++ b/data/geoquery/wasp-1.0/geoquery.pl
diff --git a/example/cfg.rb b/example/cfg.rb
index fbf8e7f..94fef2e 100644
--- a/example/cfg.rb
+++ b/example/cfg.rb
@@ -1,7 +1,8 @@
-SMT_SEMPARSE = 'python /workspace/grounded/smt-semparse-cp/decode_sentence.py /workspace/grounded/smt-semparse-cp/working/full_dataset'
-EVAL_PL = '/workspace/grounded/wasp-1.0/data/geo-funql/eval/eval.pl'
+_PATH = '/workspace/grounded/test'
+SMT_SEMPARSE = "python #{_PATH}/smt-semparse/decode_sentence.py /workspace/grounded/test/smt-semparse/work/full_dataset"
+EVAL_PL = "#{_PATH}/wasp-1.0/data/geo-funql/eval/eval.pl"
ACCEPT_ZOMBIES = true
TIMEOUT = 60
-CDEC_BIN = '/toolbox/cdec-dtrain/decoder/cdec'
+CDEC_BIN = '/toolbox/cdec/decoder/cdec'
$cache = Memcached.new('localhost:31337')
diff --git a/example/example.sh b/example/run.sh
index b359dfd..fba3931 100755
--- a/example/example.sh
+++ b/example/run.sh
@@ -1,8 +1,10 @@
#!/bin/bash
-# memcached has to be running! `memcached -p 31337`
+# memcached has to be running!
+#memcached -p 31337
+
+CDEC=/toolbox/cdec
-# run rebol with rampion variant for 1 epoch over 10 examples (data.*)
../rebol.rb \
-k 100 \
-i $(pwd)/data.in \
@@ -17,10 +19,10 @@
-l \
-e 0.01 \
-j 1 \
- -v rampion 2>output.stderr > output.stdout
+ -v rebol 2>output.stderr > output.stdout
# translate test
-/toolbox/cdec-dtrain/decoder/cdec \
+$CDEC/decoder/cdec \
-c cdec.ini \
-w output-weights 2>/dev/null \
< data.in \
diff --git a/hopefear.rb b/hopefear.rb
index aed0c9c..93534b6 100644
--- a/hopefear.rb
+++ b/hopefear.rb
@@ -2,11 +2,11 @@ def hope_and_fear kbest, action
max = -1.0/0
max_idx = -1
kbest.each_with_index { |k,i|
- if action=='hope' && k.scores[:decoder] + k.scores[:psb] > max
- max_idx = i; max = k.scores[:decoder] + k.scores[:psb]
+ if action=='hope' && k.scores[:decoder] + k.scores[:per_sentence_bleu] > max
+ max_idx = i; max = k.scores[:decoder] + k.scores[:per_sentence_bleu]
end
- if action=='fear' && k.scores[:decoder] - k.scores[:psb] > max
- max_idx = i; max = k.scores[:decoder] - k.scores[:psb]
+ if action=='fear' && k.scores[:decoder] - k.scores[:per_sentence_bleu] > max
+ max_idx = i; max = k.scores[:decoder] - k.scores[:per_sentence_bleu]
end
}
return max_idx
@@ -18,10 +18,10 @@ def gethopefear_rebol kbest, feedback, gold, max, own_reference=nil
if feedback == true
# hope
hope = kbest[0]
- new_reference = hope
- kbest.each { |k| k.scores[:psb] = BLEU::per_sentence_bleu k.s, new_reference }
+ new_reference = hope.s
+ kbest.each { |k| k.scores[:per_sentence_bleu] = BLEU::per_sentence_bleu k.s, new_reference }
# fear
- kbest.sort_by { |k| -(k.scores[:model]-k.score[:psb]) }.each_with_index { |k,i|
+ kbest.sort_by { |k| -(k.scores[:decoder]-k.scores[:per_sentence_bleu]) }.each_with_index { |k,i|
break if i==max
if !exec(k.s, gold, true)[0]
fear = k
@@ -33,7 +33,7 @@ def gethopefear_rebol kbest, feedback, gold, max, own_reference=nil
# fear
fear = kbest[0]
# hope
- kbest.sort_by { |k| -(k.scores[:model]+k.score[:psb]) }.each_with_index { |k,i|
+ kbest.sort_by { |k| -(k.scores[:decoder]+k.scores[:per_sentence_bleu]) }.each_with_index { |k,i|
break if i==max
if exec(k.s, gold, true)[0]
hope = k
@@ -67,7 +67,7 @@ def gethopefear_exec kbest, feedback, gold, max, own_reference=nil
type1 = type2 = false
if feedback == true
hope = kbest[0]
- new_reference = hope
+ new_reference = hope.s
type1 = true
elsif own_reference
hope = own_reference
diff --git a/rebol.rb b/rebol.rb
index 3c54a3c..37d4a17 100755
--- a/rebol.rb
+++ b/rebol.rb
@@ -185,7 +185,7 @@ def main
end
# get per-sentence BLEU scores
- kbest.each { |k| k.scores[:psb] = BLEU::per_sentence_bleu k.s, references[j] }
+ kbest.each { |k| k.scores[:per_sentence_bleu] = BLEU::per_sentence_bleu k.s, references[j] }
# map decoder scores to [0,1]
adjust_model_scores kbest, cfg[:scale_model]
@@ -224,7 +224,7 @@ def main
end
if new_reference
- own_references[j] = new_reference.s
+ own_references[j] = new_reference if new_reference!=references[j]
end
type1_updates+=1 if type1
@@ -304,11 +304,11 @@ def main
eos
- STDERR.write "<<< #{own_references.size} OWN REFERENCES"
+ STDERR.write "<<< #{own_references.reject{|i|!i}.size} OWN REFERENCES\n"
own_references.each_with_index { |i,j|
- STDERR.write "#{j} '#{i}'" if i
+ STDERR.write "#{j} '#{i}'\n" if i
}
- STDERR.write ">>>"
+ STDERR.write ">>>\n"
}
end