From 7120911494711094c6dcd9dc0da741d686207aab Mon Sep 17 00:00:00 2001 From: Patrick Simianer Date: Fri, 25 Apr 2014 18:40:18 +0200 Subject: modifications --- decode_sentence.py | 37 +++++++++++++++++++++++++++++++++++++ dependencies.yaml | 9 +++++---- settings.yaml | 6 +++--- smt-semparse-full-dataset.tar.gz | Bin 0 -> 14324575 bytes src/functionalizer.py | 31 +++++++++++++++++++++++++++++++ src/moses.py | 38 +++++++++++++++++++++++++++++++++++--- src/smt_semparse_config.py | 4 ++-- 7 files changed, 113 insertions(+), 12 deletions(-) create mode 100644 decode_sentence.py create mode 100644 smt-semparse-full-dataset.tar.gz diff --git a/decode_sentence.py b/decode_sentence.py new file mode 100644 index 0000000..36f059b --- /dev/null +++ b/decode_sentence.py @@ -0,0 +1,37 @@ +import sys +import os +import tempfile, shutil +from src.extractor import Extractor +from src.smt_semparse_config import SMTSemparseConfig +from src.moses import Moses +from src.functionalizer import Functionalizer + +#input: English sentence +if __name__ == '__main__': + sentence = '' + if len(sys.argv) == 3: + experiment_dir = sys.argv[1] + sentence = sys.argv[2] + else: + assert False + + # load config + _dir = os.path.dirname(os.path.abspath(__file__)) + config = SMTSemparseConfig(_dir+'/settings.yaml', _dir+'/dependencies.yaml') + + #stem + sentence = Extractor(config).preprocess_nl(sentence) + + # we need a temp dir! 
+ temp_dir = tempfile.mkdtemp() + + #decode + moses = Moses(config) + moses.decode_sentence(experiment_dir, sentence, temp_dir) + + #convert to bracket structure + print Functionalizer(config).run_sentence(experiment_dir, temp_dir) + + #delete tmp files + shutil.rmtree(temp_dir) + diff --git a/dependencies.yaml b/dependencies.yaml index 37083df..f98ae6b 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -1,8 +1,9 @@ -smt_semparse: /home/jacob/src/smt-semparse +smt_semparse: /workspace/grounded/test/smt-semparse -moses: /home/jacob/src/3p/mosesdecoder -srilm: /home/jacob/src/3p/srilm1.6.0 +moses: /workspace/grounded/test/mosesdecoder +giza: /workspace/grounded/test/mosesdecoder/tools +srilm: /workspace/grounded/test/srilm prolog: /usr/bin/swipl -wasp: /home/jacob/src/3p/wasp-1.0 +wasp: /workspace/grounded/test/wasp-1.0 srilm_arch: i686-m64 diff --git a/settings.yaml b/settings.yaml index 3d0669d..40bb44f 100644 --- a/settings.yaml +++ b/settings.yaml @@ -1,12 +1,12 @@ nbest: 100 # how many entries in the nbest list? corpus: geo # which corpus? [geo, robo] lang: en # which language? [en, de, el, th] -stem: false # run the stemmer? -symm: srctotgt # which symmetrization? [e.g. srctotgt, tgttosrc, grow, ...] +stem: true # run the stemmer? +symm: tgttosrc # which symmetrization? [e.g. srctotgt, tgttosrc, grow, ...] np: true # use NP list? np_type: all # what version of NP list? model: hier # which machine translation model? [phrase, hier] -run: test # which experiment? [dev, test, debug] +run: test # which experiment? [dev, test, debug, all] workdir: work # where? 
# experimental, and unrelated to published work diff --git a/smt-semparse-full-dataset.tar.gz b/smt-semparse-full-dataset.tar.gz new file mode 100644 index 0000000..08086eb Binary files /dev/null and b/smt-semparse-full-dataset.tar.gz differ diff --git a/src/functionalizer.py b/src/functionalizer.py index 66325a0..782b4e5 100644 --- a/src/functionalizer.py +++ b/src/functionalizer.py @@ -37,6 +37,37 @@ class Functionalizer: break counter += 1 + def run_sentence(self, experiment_dir, temp_dir): + hyp_file = open('%s/nbest.tmp' % temp_dir, 'r') + + hypsets = [] + hypset = [] + last_eid = 0 + for line in hyp_file: + parts = line.split('|||') + eid = int(parts[0]) + if eid != last_eid: + hypsets.append(hypset) + hypset = [] + last_eid = eid + score = parts[2] + ' ||| ' + parts[3].strip() + hyp = parts[1].strip() + hypset.append((hyp,score)) + hypsets.append(hypset) + hyp_file.close() + + counter = 0 + for hypset in hypsets: + hypset = list(reversed(hypset)) + while hypset: + hyp, score = hypset.pop() + fun = self.functionalize(hyp) + if fun: + return fun + break + counter += 1 + return "" + #xc = 0 def functionalize(self, mrl): diff --git a/src/moses.py b/src/moses.py index 857ddbf..9a159c3 100644 --- a/src/moses.py +++ b/src/moses.py @@ -3,6 +3,8 @@ import os import subprocess import gzip +from subprocess import Popen, PIPE, STDOUT + class Moses: def __init__(self, config): @@ -17,7 +19,8 @@ class Moses: '--e', self.config.tgt, '--lm', '0:3:%s/%s.arpa' % (self.config.experiment_dir, self.config.tgt), #'-score-options', "'--OnlyDirect --NoPhraseCount'" - '--alignment', self.config.symm] + '--alignment', self.config.symm, + '-external-bin-dir', self.config.giza] if self.config.model == 'hier': args += ['-hierarchical', '-glue-grammar'] @@ -99,7 +102,7 @@ class Moses: else: args += [self.config.moses_decode_phrase] args += ['%s/model/moses.ini' % self.config.experiment_dir, - '--mertdir', '%s/dist/bin' % self.config.moses] + '--mertdir', '%s/bin' % self.config.moses] if 
self.config.model == 'hier': args += ['--filtercmd', '%s/scripts/training/filter-model-given-input.pl --Hierarchical'\ @@ -119,7 +122,7 @@ class Moses: else: assert False - if self.config.run == 'test': + if self.config.run == 'test' or self.config.run == 'all': args += ['-f', '%s/mert-work/moses.ini' % self.config.experiment_dir] else: args += ['-f', '%s/model/moses.ini' % self.config.experiment_dir] @@ -139,3 +142,32 @@ class Moses: infile.close() log.close() outfile.close() + + def decode_sentence(self, experiment_dir, sentence, temp_dir): + if self.config.model == 'phrase': + args = [self.config.moses_decode_phrase] + elif self.config.model == 'hier': + args = [self.config.moses_decode_hier] + else: + assert False + + if self.config.run == 'test' or self.config.run == 'all': + args += ['-f', '%s/mert-work/moses.ini' % experiment_dir] + else: + args += ['-f', '%s/model/moses.ini' % experiment_dir] + + args += ['-drop-unknown', + '-n-best-list', '%s/nbest.tmp' % temp_dir, + str(self.config.nbest), 'distinct', + '-threads', '1'] + + infile = open('%s/sent.tmp' % temp_dir, 'w') + print >>infile, sentence + infile.close() + infile = open('%s/sent.tmp' % temp_dir, 'r') + nullfile = open(os.devnull, 'w') + p = subprocess.Popen(args, stdin=infile, stdout=nullfile, stderr=nullfile) + p.wait() + infile.close() + return + diff --git a/src/smt_semparse_config.py b/src/smt_semparse_config.py index 71eaf24..6bf50d7 100644 --- a/src/smt_semparse_config.py +++ b/src/smt_semparse_config.py @@ -17,8 +17,8 @@ class SMTSemparseConfig(Config): self.put('moses_train', '%s/scripts/training/train-model.perl' % self.moses) self.put('moses_tune', '%s/scripts/training/mert-moses.pl' % self.moses) - self.put('moses_decode_phrase', '%s/dist/bin/moses' % self.moses) - self.put('moses_decode_hier', '%s/dist/bin/moses_chart' % self.moses) + self.put('moses_decode_phrase', '%s/bin/moses' % self.moses) + self.put('moses_decode_hier', '%s/bin/moses_chart' % self.moses) self.put('bleu_eval', 
'%s/scripts/generic/multi-bleu.perl' % self.moses) self.put('wasp_eval', '%s/data/geo-funql/eval/eval.pl' % self.wasp) -- cgit v1.2.3