From 5f04f17907ce37a8e8dd6cff9cb6dc026b30b8f6 Mon Sep 17 00:00:00 2001
From: Patrick Simianer <simianer@cl.uni-heidelberg.de>
Date: Fri, 25 Apr 2014 18:21:11 +0200
Subject: better readme, example and code

---
 data/geoquery/smt-semparse/decode_sentence.py |  36 ------
 data/geoquery/smt-semparse/functionalizer.py  | 143 ---------------------
 data/geoquery/smt-semparse/moses.py           | 173 --------------------------
 3 files changed, 352 deletions(-)
 delete mode 100644 data/geoquery/smt-semparse/decode_sentence.py
 delete mode 100644 data/geoquery/smt-semparse/functionalizer.py
 delete mode 100644 data/geoquery/smt-semparse/moses.py

(limited to 'data/geoquery/smt-semparse')

diff --git a/data/geoquery/smt-semparse/decode_sentence.py b/data/geoquery/smt-semparse/decode_sentence.py
deleted file mode 100644
index 1914734..0000000
--- a/data/geoquery/smt-semparse/decode_sentence.py
+++ /dev/null
@@ -1,36 +0,0 @@
-import sys
-import os
-import tempfile, shutil
-from src.extractor import Extractor
-from src.smt_semparse_config import SMTSemparseConfig
-from src.moses import Moses
-from src.functionalizer import Functionalizer
-
-#input: English sentence
-if __name__ == '__main__':
-  sentence = ''
-  if len(sys.argv) == 3:
-    experiment_dir = sys.argv[1]
-    sentence = sys.argv[2]
-  else:
-    assert False
-	
-  # load config
-  config = SMTSemparseConfig('/workspace/grounded/smt-semparse-cp/settings.yaml', '/workspace/grounded/smt-semparse-cp/dependencies.yaml')
-
-  #stem
-  sentence = Extractor(config).preprocess_nl(sentence)
-
-  # we need a temp dir!
-  temp_dir = tempfile.mkdtemp()
-
-  #decode
-  moses = Moses(config)
-  moses.decode_sentence(experiment_dir, sentence, temp_dir)
-
-  #convert to bracket structure
-  print Functionalizer(config).run_sentence(experiment_dir, temp_dir)
-
-  #delete tmp files
-  shutil.rmtree(temp_dir)
-
diff --git a/data/geoquery/smt-semparse/functionalizer.py b/data/geoquery/smt-semparse/functionalizer.py
deleted file mode 100644
index 782b4e5..0000000
--- a/data/geoquery/smt-semparse/functionalizer.py
+++ /dev/null
@@ -1,143 +0,0 @@
-import logging
-import util
-import sys
-
-class Functionalizer:
-
-  def __init__(self, config):
-    self.config = config
-
-  def run(self):
-    hyp_file = open('%s/hyp.mrl.nbest' % self.config.experiment_dir)
-    fun_file = open('%s/hyp.fun' % self.config.experiment_dir, 'w')
-
-    hypsets = []
-    hypset = []
-    last_eid = 0
-    for line in hyp_file:
-      parts = line.split('|||')
-      eid = int(parts[0])
-      if eid != last_eid:
-        hypsets.append(hypset)
-        hypset = []
-        last_eid = eid
-      score = parts[2] + ' ||| ' + parts[3].strip()
-      hyp = parts[1].strip()
-      hypset.append((hyp,score))
-    hypsets.append(hypset)
-
-    counter = 0
-    for hypset in hypsets:
-      hypset = list(reversed(hypset))
-      while hypset:
-        hyp, score = hypset.pop()
-        fun = self.functionalize(hyp)
-        if fun:
-          print >>fun_file, counter, '|||', fun, '|||', score
-          break
-      counter += 1
-
-  def run_sentence(self, experiment_dir, temp_dir):
-    hyp_file = open('%s/nbest.tmp' % temp_dir, 'r')
-
-    hypsets = []
-    hypset = []
-    last_eid = 0
-    for line in hyp_file:
-      parts = line.split('|||')
-      eid = int(parts[0])
-      if eid != last_eid:
-        hypsets.append(hypset)
-        hypset = []
-        last_eid = eid
-      score = parts[2] + ' ||| ' + parts[3].strip()
-      hyp = parts[1].strip()
-      hypset.append((hyp,score))
-    hypsets.append(hypset)
-    hyp_file.close()
-
-    counter = 0
-    for hypset in hypsets:
-      hypset = list(reversed(hypset))
-      while hypset:
-        hyp, score = hypset.pop()
-        fun = self.functionalize(hyp)
-        if fun:
-          return fun
-          break
-      counter += 1
-    return ""
-
-  #xc = 0
-  def functionalize(self, mrl):
-
-    #if '_@0' in mrl and 'cityid@2' in mrl:
-    #  #print '==='
-    #  #print mrl
-    #  self.xc += 1
-    #  if self.xc > 5:
-    #    exit()
-
-    stack = []
-    r = []
-    tokens = list(reversed(mrl.split()))
-
-    #print tokens
-
-    while tokens:
-      it = tokens.pop()
-      #print it
-      if util.ARITY_SEP not in it:
-        token = it
-        arity = util.ARITY_STR
-        logging.warn('unrecognized token: %s', it)
-      else:
-        token, arity = it.rsplit(util.ARITY_SEP)
-      if arity == util.ARITY_STR:
-        arity = 0
-        arity_str = True
-      elif not (arity == util.ARITY_ANY):
-        arity = int(arity)
-        arity_str = False
-      
-      if arity == util.ARITY_ANY or arity > 0:
-        r.append(token)
-        r.append('(')
-        stack.append(arity)
-      else:
-        assert arity == 0
-        if arity_str:
-          r.append("'%s'" % token.replace('_', ' '))
-        else:
-          r.append(token)
-          #print r
-        while stack:
-          top = stack.pop()
-          if top == util.ARITY_ANY and tokens:
-            r.append(',')
-            stack.append(util.ARITY_ANY)
-            break
-          elif top != util.ARITY_ANY and top > 1:
-            r.append(',')
-            stack.append(top - 1)
-            break
-          else:
-            r.append(')')
-
-        if not stack and tokens:
-          return None
-
-    if stack:
-      return None
-
-    r = ''.join(r)
-
-    # nasty hacks to fix misplaced _
-    if '(_' in r:
-      return None
-    if ',_' in r and not ('cityid' in r):
-      return None
-    if '_),_)' in r:
-      return None
-
-    return r
diff --git a/data/geoquery/smt-semparse/moses.py b/data/geoquery/smt-semparse/moses.py
deleted file mode 100644
index 9a159c3..0000000
--- a/data/geoquery/smt-semparse/moses.py
+++ /dev/null
@@ -1,173 +0,0 @@
-import logging
-import os
-import subprocess
-import gzip
-
-from subprocess import Popen, PIPE, STDOUT
-
-class Moses:
-
-  def __init__(self, config):
-    self.config = config
-
-  def run_train(self):
-    args = [self.config.moses_train,
-            '--root-dir', self.config.experiment_dir,
-            '--corpus', '%s/%s' % (self.config.experiment_dir,
-                                   self.config.train_name),
-            '--f', self.config.src,
-            '--e', self.config.tgt,
-            '--lm', '0:3:%s/%s.arpa' % (self.config.experiment_dir, self.config.tgt),
-            #'-score-options', "'--OnlyDirect --NoPhraseCount'"
-            '--alignment', self.config.symm,
-			'-external-bin-dir', self.config.giza]
-    if self.config.model == 'hier':
-      args += ['-hierarchical', '-glue-grammar']
-
-    logging.info(' '.join(args))
-
-    log = open('%s/train.log' % self.config.experiment_dir, 'w')
-    p = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=log)
-    p.wait()
-    log.close()
-
-  def run_retrain(self):
-    old_train_nl = '%s/%s.nl' % (self.config.experiment_dir,
-        self.config.train_name)
-    old_train_mrl = '%s/%s.mrl' % (self.config.experiment_dir,
-        self.config.train_name)
-    moved_train_nl = '%s.notune' % old_train_nl
-    moved_train_mrl = '%s.notune' % old_train_mrl
-    tune_nl = '%s/tune.nl' % self.config.experiment_dir
-    tune_mrl = '%s/tune.mrl' % self.config.experiment_dir
-    os.rename(old_train_nl, moved_train_nl)
-    os.rename(old_train_mrl, moved_train_mrl)
-    with open(old_train_nl, 'w') as rt_train_nl:
-      subprocess.call(['cat', moved_train_nl, tune_nl], stdout=rt_train_nl)
-    with open(old_train_mrl, 'w') as rt_train_mrl:
-      subprocess.call(['cat', moved_train_mrl, tune_mrl], stdout=rt_train_mrl)
-
-    os.remove('%s/model/extract.inv.gz' % self.config.experiment_dir)
-    os.remove('%s/model/extract.gz' % self.config.experiment_dir)
-    if self.config.model == 'hier':
-      os.remove('%s/model/rule-table.gz' % self.config.experiment_dir)
-    else:
-      os.remove('%s/model/phrase-table.gz' % self.config.experiment_dir)
-
-    self.run_train()
-
-  def parens_ok(self, line):
-    mrl_part = line.split(' ||| ')[1]
-    tokens = [t[-1] for t in mrl_part.split() if t[-2] == '@']
-    tokens.reverse()
-    stack = []
-    while tokens:
-      t = tokens.pop()
-      assert t != '*'
-      if t == 's':
-        t = 0
-      t = int(t)
-      if t > 0:
-        stack.append(t)
-      else:
-        while stack:
-          top = stack.pop()
-          if top > 1:
-            stack.append(top - 1)
-            break
-        if tokens and not stack:
-          return False
-    return True
-
-  def filter_phrase_table(self):
-    table_name = 'phrase' if self.config.model == 'phrase' else 'rule'
-    oldname = '%s/model/%s-table.gz' % (self.config.experiment_dir, table_name)
-    newname = '%s/model/%s-table.old.gz' % (self.config.experiment_dir, table_name)
-    os.rename(oldname, newname)
-
-    with gzip.open(oldname, 'w') as filtered_table_f:
-      with gzip.open(newname, 'r') as old_table_f:
-        for line in old_table_f:
-          if self.parens_ok(line):
-            print >>filtered_table_f, line,
-
-  def run_tune(self):
-    wd = os.getcwd()
-    os.chdir(self.config.experiment_dir)
-    args = [self.config.moses_tune,
-            '%s/tune.%s' % (self.config.experiment_dir, self.config.src),
-            '%s/tune.%s' % (self.config.experiment_dir, self.config.tgt)]
-    if self.config.model == 'hier':
-      args += [self.config.moses_decode_hier]
-    else:
-      args += [self.config.moses_decode_phrase]
-    args += ['%s/model/moses.ini' % self.config.experiment_dir,
-             '--mertdir', '%s/bin' % self.config.moses]
-    if self.config.model == 'hier':
-      args += ['--filtercmd', 
-               '%s/scripts/training/filter-model-given-input.pl --Hierarchical'\
-                   % self.config.moses]
-
-    log = open('%s/tune.log' % self.config.experiment_dir, 'w')
-    p = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=log)
-    p.wait()
-    log.close()
-    os.chdir(wd)
-
-  def run_decode(self):
-    if self.config.model == 'phrase':
-      args = [self.config.moses_decode_phrase]
-    elif self.config.model == 'hier':
-      args = [self.config.moses_decode_hier]
-    else:
-      assert False
-
-    if self.config.run == 'test' or self.config.run == 'all':
-      args += ['-f', '%s/mert-work/moses.ini' % self.config.experiment_dir]
-    else:
-      args += ['-f', '%s/model/moses.ini' % self.config.experiment_dir]
-    #args += ['-f', '%s/model/moses.ini' % self.config.experiment_dir]
-
-    args += ['-drop-unknown',
-             '-n-best-list', '%s/hyp.%s.nbest' % (self.config.experiment_dir, self.config.tgt),
-                             str(self.config.nbest), 'distinct',
-             '-threads', '3']
-
-    #nullfile = open(os.devnull, 'w')
-    infile = open('%s/test.%s' % (self.config.experiment_dir, self.config.src))
-    outfile = open('%s/hyp.%s' % (self.config.experiment_dir, self.config.tgt), 'w')
-    log = open('%s/decode.log' % self.config.experiment_dir, 'w')
-    p = subprocess.Popen(args, stdin=infile, stdout=outfile, stderr=log)
-    p.wait()
-    infile.close()
-    log.close()
-    outfile.close()
-
-  def decode_sentence(self, experiment_dir, sentence, temp_dir):
-    if self.config.model == 'phrase':
-      args = [self.config.moses_decode_phrase]
-    elif self.config.model == 'hier':
-      args = [self.config.moses_decode_hier]
-    else:
-      assert False
-
-    if self.config.run == 'test' or self.config.run == 'all':
-      args += ['-f', '%s/mert-work/moses.ini' % experiment_dir]
-    else:
-      args += ['-f', '%s/model/moses.ini' % experiment_dir]
-
-    args += ['-drop-unknown',
-             '-n-best-list', '%s/nbest.tmp' % temp_dir,
-                             str(self.config.nbest), 'distinct',
-             '-threads', '1']
-
-    infile = open('%s/sent.tmp' % temp_dir, 'w')
-    print >>infile, sentence
-    infile.close
-    infile = open('%s/sent.tmp' % temp_dir, 'r')
-    nullfile = open(os.devnull, 'w')
-    p = subprocess.Popen(args, stdin=infile, stdout=nullfile, stderr=nullfile)
-    p.wait()
-    infile.close()
-    return
-
-- 
cgit v1.2.3