From 925087356b853e2099c1b60d8b757d7aa02121a9 Mon Sep 17 00:00:00 2001
From: Chris Dyer <cdyer@cab.ark.cs.cmu.edu>
Date: Tue, 2 Oct 2012 00:19:43 -0400
Subject: cdec cleanup, remove bayesian stuff, parsing stuff

---
 gi/pyp-topics/scripts/contexts2documents.py    |  37 -------
 gi/pyp-topics/scripts/extract_contexts.py      | 144 -------------------------
 gi/pyp-topics/scripts/extract_contexts_test.py |  72 -------------
 gi/pyp-topics/scripts/extract_leaves.py        |  49 ---------
 gi/pyp-topics/scripts/map-documents.py         |  20 ----
 gi/pyp-topics/scripts/map-terms.py             |  20 ----
 gi/pyp-topics/scripts/run.sh                   |  13 ---
 gi/pyp-topics/scripts/score-mkcls.py           |  61 -----------
 gi/pyp-topics/scripts/score-topics.py          |  64 -----------
 gi/pyp-topics/scripts/spans2labels.py          | 137 -----------------------
 gi/pyp-topics/scripts/tokens2classes.py        |  27 -----
 gi/pyp-topics/scripts/topics.py                |  20 ----
 12 files changed, 664 deletions(-)
 delete mode 100755 gi/pyp-topics/scripts/contexts2documents.py
 delete mode 100755 gi/pyp-topics/scripts/extract_contexts.py
 delete mode 100755 gi/pyp-topics/scripts/extract_contexts_test.py
 delete mode 100755 gi/pyp-topics/scripts/extract_leaves.py
 delete mode 100755 gi/pyp-topics/scripts/map-documents.py
 delete mode 100755 gi/pyp-topics/scripts/map-terms.py
 delete mode 100644 gi/pyp-topics/scripts/run.sh
 delete mode 100755 gi/pyp-topics/scripts/score-mkcls.py
 delete mode 100755 gi/pyp-topics/scripts/score-topics.py
 delete mode 100755 gi/pyp-topics/scripts/spans2labels.py
 delete mode 100755 gi/pyp-topics/scripts/tokens2classes.py
 delete mode 100755 gi/pyp-topics/scripts/topics.py

(limited to 'gi/pyp-topics/scripts')

diff --git a/gi/pyp-topics/scripts/contexts2documents.py b/gi/pyp-topics/scripts/contexts2documents.py
deleted file mode 100755
index 9be4ebbb..00000000
--- a/gi/pyp-topics/scripts/contexts2documents.py
+++ /dev/null
@@ -1,37 +0,0 @@
-#!/usr/bin/python
-
-import sys
-from operator import itemgetter
-
-if len(sys.argv) > 3:
-  print "Usage: contexts2documents.py [contexts_index_out] [phrases_index_out]"
-  exit(1)
-
-context_index = {} 
-phrase_index = {}
-for line in sys.stdin:
-  phrase, line_tail = line.split('\t')
-
-  raw_contexts = line_tail.split('|||')
-  contexts = [c.strip() for x,c in enumerate(raw_contexts) if x%2 == 0]
-  counts   = [int(c.split('=')[1].strip()) for x,c in enumerate(raw_contexts) if x%2 != 0]
-  phrase_index.setdefault(phrase, len(phrase_index))
-  print len(contexts),
-  for context,count in zip(contexts,counts): 
-    c = context_index.setdefault(context, len(context_index))
-    print "%d:%d" % (c,count),
-  print
-if 1 < len(sys.argv) < 4:
-  contexts_out = open(sys.argv[1],'w')
-  contexts = context_index.items()
-  contexts.sort(key = itemgetter(1))
-  for context in contexts: 
-    print >>contexts_out, context[0]
-  contexts_out.close()
-if len(sys.argv) == 3:
-  phrases_out = open(sys.argv[2],'w')
-  phrases = phrase_index.items()
-  phrases.sort(key = itemgetter(1))
-  for phrase in phrases: 
-    print >>phrases_out, phrase[0]
-  phrases_out.close()
diff --git a/gi/pyp-topics/scripts/extract_contexts.py b/gi/pyp-topics/scripts/extract_contexts.py
deleted file mode 100755
index b2723f2a..00000000
--- a/gi/pyp-topics/scripts/extract_contexts.py
+++ /dev/null
@@ -1,144 +0,0 @@
-#!/usr/bin/python
-
-import sys,collections
-
-def extract_backoff(context_list, order):
-  assert len(context_list) == (2*order)
-  backoffs = []
-  for i in range(1,order+1):
-    if i == order:
-      backoffs.append(([context_list[i-1]+"|"], ["|"+context_list[i]]))
-    else:
-      right_limit = 2*order-i
-      core = context_list[i:right_limit]
-      left = [context_list[i-1]+"|"*(order-i+1)]
-      right = ["|"*(order-i+1)+context_list[right_limit]]
-      backoffs.append((core, left, right))
-# print context_list, backoffs
-  return backoffs
-
-def tuple_to_str(t):
-  s=""
-  for i,x in enumerate(t):
-    if i > 0: s += "|"
-    s += str(x)
-  return s
-
-if len(sys.argv) < 3:
-  print "Usage: extract-contexts.py output_filename order cutoff lowercase"
-  exit(1)
-
-output_filename = sys.argv[1]
-order = int(sys.argv[2])
-cutoff = 0
-if len(sys.argv) > 3:
-  cutoff = int(sys.argv[3])
-lowercase = False
-if len(sys.argv) > 4:
-  lowercase = bool(sys.argv[4])
-
-contexts_dict={}
-contexts_list=[]
-contexts_freq=collections.defaultdict(int)
-contexts_backoff={}
-
-token_dict={}
-token_list=[]
-documents_dict=collections.defaultdict(dict)
-
-contexts_at_order = [i for i in range(order+1)]
-
-prefix = ["<s%d>|<s>"%i for i in range(order)]
-suffix = ["</s%d>|</s>"%i for i in range(order)]
-
-for line in sys.stdin:
-  tokens = list(prefix)
-  tokens.extend(line.split())
-  tokens.extend(suffix)
-  if lowercase:
-    tokens = map(lambda x: x.lower(), tokens)
-
-  for i in range(order, len(tokens)-order):
-    context_list = []
-    term=""
-    for j in range(i-order, i+order+1):
-      token,tag = tokens[j].rsplit('|',2)
-      if j != i:
-        context_list.append(token)
-      else:
-        if token not in token_dict: 
-          token_dict[token] = len(token_dict)
-          token_list.append(token)
-        term = token_dict[token] 
-
-    context = tuple_to_str(tuple(context_list))
-
-    if context not in contexts_dict: 
-      context_index = len(contexts_dict)
-      contexts_dict[context] = context_index
-      contexts_list.append(context)
-      contexts_at_order[0] += 1
-
-      # handle backoff
-      backoff_contexts = extract_backoff(context_list, order)
-      bo_indexes=[(context_index,)]
-#     bo_indexes=[(context,)]
-      for i,bo in enumerate(backoff_contexts):
-        factor_indexes=[]
-        for factor in bo:
-          bo_tuple = tuple_to_str(tuple(factor))
-          if bo_tuple not in contexts_dict:
-            contexts_dict[bo_tuple] = len(contexts_dict)
-            contexts_list.append(bo_tuple)
-            contexts_at_order[i+1] += 1
-#         factor_indexes.append(bo_tuple)
-          factor_indexes.append(contexts_dict[bo_tuple])
-        bo_indexes.append(tuple(factor_indexes))
-      
-      for i in range(len(bo_indexes)-1):
-        contexts_backoff[bo_indexes[i][0]] = bo_indexes[i+1]
-
-    context_index = contexts_dict[context]
-    contexts_freq[context_index] += 1
-
-    if context_index not in documents_dict[term]:
-      documents_dict[term][context_index] = 1
-    else:
-      documents_dict[term][context_index] += 1
-
-term_file = open(output_filename+".terms",'w')
-for t in token_list: print >>term_file, t
-term_file.close()
-
-contexts_file = open(output_filename+".contexts",'w')
-for c in contexts_list: 
-  print >>contexts_file, c
-contexts_file.close()
-
-data_file = open(output_filename+".data",'w')
-for t in range(len(token_list)): 
-  line=""
-  num_active=0
-  for c in documents_dict[t]:
-    count = documents_dict[t][c]
-    if contexts_freq[c] >= cutoff:
-      line += (' ' + str(c) + ':' + str(count))
-      num_active += 1
-  if num_active > 0:
-    print >>data_file, "%d%s" % (num_active,line)
-data_file.close()
-
-contexts_backoff_file = open(output_filename+".contexts_backoff",'w')
-print >>contexts_backoff_file, len(contexts_list), order,
-#for x in contexts_at_order: 
-#  print >>contexts_backoff_file, x,
-#print >>contexts_backoff_file
-for x in range(order-1):
-  print >>contexts_backoff_file, 3,
-print >>contexts_backoff_file, 2
-
-for x in contexts_backoff: 
-  print >>contexts_backoff_file, x, 
-  for y in contexts_backoff[x]: print >>contexts_backoff_file, y,
-  print >>contexts_backoff_file 
-contexts_backoff_file.close()
diff --git a/gi/pyp-topics/scripts/extract_contexts_test.py b/gi/pyp-topics/scripts/extract_contexts_test.py
deleted file mode 100755
index 693b6e0b..00000000
--- a/gi/pyp-topics/scripts/extract_contexts_test.py
+++ /dev/null
@@ -1,72 +0,0 @@
-#!/usr/bin/python
-
-import sys,collections
-
-def tuple_to_str(t):
-  s=""
-  for i,x in enumerate(t):
-    if i > 0: s += "|"
-    s += str(x)
-  return s
-
-if len(sys.argv) < 5:
-  print "Usage: extract-contexts_test.py output_filename vocab contexts order lowercase"
-  exit(1)
-
-output_filename = sys.argv[1]
-output = open(output_filename+".test_data",'w')
-
-unk_term="-UNK-"
-vocab_dict={}
-for i,x in enumerate(file(sys.argv[2], 'r').readlines()): 
-  vocab_dict[x.strip()]=i
-
-contexts_dict={}
-contexts_list=[]
-for i,x in enumerate(file(sys.argv[3], 'r').readlines()): 
-  contexts_dict[x.strip()]=i
-  contexts_list.append(x.strip())
-
-order = int(sys.argv[4])
-
-lowercase = False
-if len(sys.argv) > 5:
-  lowercase = bool(sys.argv[5])
-if lowercase: unk_term = unk_term.lower()
-
-prefix = ["<s%d>|<s>"%i for i in range(order)]
-suffix = ["</s%d>|</s>"%i for i in range(order)]
-
-assert unk_term in vocab_dict
-for line in sys.stdin:
-  tokens = list(prefix)
-  tokens.extend(line.split())
-  tokens.extend(suffix)
-  if lowercase:
-    tokens = map(lambda x: x.lower(), tokens)
-
-  for i in range(order, len(tokens)-order):
-    context_list=[]
-    term=""
-    for j in range(i-order, i+order+1):
-      token,tag = tokens[j].rsplit('|',2)
-      if j != i:
-        context_list.append(token)
-      else:
-        if token not in vocab_dict: 
-          term = vocab_dict[unk_term] 
-        else:
-          term = vocab_dict[token] 
-    context = tuple_to_str(context_list)
-    if context not in contexts_dict: 
-      contexts_dict[context] = len(contexts_dict)
-      contexts_list.append(context)
-    context_index = contexts_dict[context]
-    print >>output, "%d:%d" % (term,context_index),
-  print >>output
-output.close()
-
-contexts_file = open(output_filename+".test_contexts",'w')
-for c in contexts_list: 
-  print >>contexts_file, c
-contexts_file.close()
diff --git a/gi/pyp-topics/scripts/extract_leaves.py b/gi/pyp-topics/scripts/extract_leaves.py
deleted file mode 100755
index 14783b36..00000000
--- a/gi/pyp-topics/scripts/extract_leaves.py
+++ /dev/null
@@ -1,49 +0,0 @@
-#!/usr/bin/python
-
-import nltk
-import nltk.probability
-import sys
-import getopt 
-
-lexicalise=False
-rm_traces=False
-cutoff=100
-length_cutoff=10000
-try:                                
-  opts, args = getopt.getopt(sys.argv[1:], "hs:c:l", ["help", "lexicalise", "cutoff","sentence-length","remove-traces"])
-except getopt.GetoptError:          
-  print "Usage: extract_leaves.py [-lsc]"                        
-  sys.exit(2)                     
-for opt, arg in opts:                
-  if opt in ("-h", "--help"):      
-    print "Usage: extract_leaves.py [-lsc]"                        
-    sys.exit()                  
-  elif opt in ("-l", "--lexicalise"):                
-    lexicalise = True                 
-  elif opt in ("-c", "--cutoff"):                
-    cutoff = int(arg) 
-  elif opt in ("-s", "--sentence-length"):                
-    length_cutoff = int(arg) 
-  elif opt in ("--remove-traces"):                
-    rm_traces = True                 
-
-token_freq = nltk.probability.FreqDist()
-lines = []
-for line in sys.stdin:
-  t = nltk.Tree.parse(line)
-  pos = t.pos()
-  if len(pos) <= length_cutoff:
-    lines.append(pos)
-    for token, tag in pos:
-      token_freq.inc(token)  
-
-for line in lines:
-  for token,tag in line:
-    if not (rm_traces and tag == "-NONE-"):
-      if lexicalise:
-        if token_freq[token] < cutoff:
-          token = '-UNK-'
-        print '%s|%s' % (token,tag),
-      else:
-        print '%s' % tag,
-  print
diff --git a/gi/pyp-topics/scripts/map-documents.py b/gi/pyp-topics/scripts/map-documents.py
deleted file mode 100755
index 703de312..00000000
--- a/gi/pyp-topics/scripts/map-documents.py
+++ /dev/null
@@ -1,20 +0,0 @@
-#!/usr/bin/python
-
-import sys
-
-if len(sys.argv) != 2:
-  print "Usage: map-documents.py vocab-file"
-  exit(1)
-
-vocab = file(sys.argv[1], 'r').readlines()
-term_dict = map(lambda x: x.strip(), vocab)
-
-for line in sys.stdin:
-  tokens = line.split()
-  for token in tokens:
-    elements = token.split(':')
-    if len(elements) == 1:
-      print "%s" % (term_dict[int(elements[0])]),
-    else:
-      print "%s:%s" % (term_dict[int(elements[0])], elements[1]),
-  print
diff --git a/gi/pyp-topics/scripts/map-terms.py b/gi/pyp-topics/scripts/map-terms.py
deleted file mode 100755
index eb0298d7..00000000
--- a/gi/pyp-topics/scripts/map-terms.py
+++ /dev/null
@@ -1,20 +0,0 @@
-#!/usr/bin/python
-
-import sys
-
-if len(sys.argv) != 2:
-  print "Usage: map-terms.py vocab-file"
-  exit(1)
-
-vocab = file(sys.argv[1], 'r').readlines()
-term_dict = map(lambda x: x.strip().replace(' ','_'), vocab)
-
-for line in sys.stdin:
-  tokens = line.split()
-  for token in tokens:
-    elements = token.split(':')
-    if len(elements) == 1:
-      print "%s" % (term_dict[int(elements[0])]),
-    else:
-      print "%s:%s" % (term_dict[int(elements[0])], elements[1]),
-  print
diff --git a/gi/pyp-topics/scripts/run.sh b/gi/pyp-topics/scripts/run.sh
deleted file mode 100644
index 19e625b1..00000000
--- a/gi/pyp-topics/scripts/run.sh
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/bin/sh
-
-
-./simple-extract-context.sh ~/workspace/clsp2010/jhuws2010/data/btec/split.zh-en.al 1 | ~/workspace/pyp-topics/scripts/contexts2documents.py > split.zh-en.data
-
-~/workspace/pyp-topics/bin/pyp-topics-train -d split.zh-en.data -t 50 -s 100 -o split.zh-en.documents.gz -w split.zh-en.topics.gz
-gunzip split.zh-en.documents.gz
-
-~/workspace/cdec/extools/extractor -i ../jhuws2010/data/btec/split.zh-en.al -S 1 -c 500000 -L 12 --base_phrase_spans | ~/workspace/pyp-topics/scripts/spans2labels.py split.zh-en.phrases split.zh-en.contexts split.zh-en.documents > corpus.zh-en.labelled_spans
-
-paste -d " " ~/workspace/clsp2010/jhuws2010/data/btec/split.zh-en.al corpus.labelled_spans > split.zh-en.labelled_spans
-
-./simple-extract.sh ~/workspace/clsp2010/scratch/split.zh-en.labelled_spans
diff --git a/gi/pyp-topics/scripts/score-mkcls.py b/gi/pyp-topics/scripts/score-mkcls.py
deleted file mode 100755
index 6bd33fc5..00000000
--- a/gi/pyp-topics/scripts/score-mkcls.py
+++ /dev/null
@@ -1,61 +0,0 @@
-#!/usr/bin/python
-
-import sys
-from collections import defaultdict
-
-def dict_max(d):
-  max_val=-1
-  max_key=None
-  for k in d:
-    if d[k] > max_val: 
-      max_val = d[k]
-      max_key = k
-  assert max_key
-  return max_key
-
-if len(sys.argv) != 3:
-  print "Usage: score-mkcls.py gold classes"
-  exit(1)
-
-gold_file=open(sys.argv[1],'r')
-
-term_to_topics = {}
-for line in open(sys.argv[2],'r'):
-  term,cls = line.split()
-  term_to_topics[term] = cls
-
-gold_to_topics = defaultdict(dict)
-topics_to_gold = defaultdict(dict)
-
-for gold_line in gold_file:
-  gold_tokens = gold_line.split()
-  for gold_token in gold_tokens:
-    gold_term,gold_tag = gold_token.rsplit('|',1)
-    pred_token = term_to_topics[gold_term]
-    gold_to_topics[gold_tag][pred_token] \
-      = gold_to_topics[gold_tag].get(pred_token, 0) + 1
-    topics_to_gold[pred_token][gold_tag] \
-      = topics_to_gold[pred_token].get(gold_tag, 0) + 1
-
-pred=0
-correct=0
-gold_file=open(sys.argv[1],'r')
-for gold_line in gold_file:
-  gold_tokens = gold_line.split()
-
-  for gold_token in gold_tokens:
-    gold_term,gold_tag = gold_token.rsplit('|',1)
-    pred_token = term_to_topics[gold_term]
-    print "%s|%s|%s" % (gold_token, pred_token, dict_max(topics_to_gold[pred_token])),
-    pred += 1
-    if gold_tag == dict_max(topics_to_gold[pred_token]):
-      correct += 1
-  print
-print >>sys.stderr, "Many-to-One Accuracy = %f" % (float(correct) / pred)
-#for x in gold_to_topics: 
-#  print x,dict_max(gold_to_topics[x])
-#print "###################################################"
-#for x in range(len(topics_to_gold)): 
-#  print x,dict_max(topics_to_gold[str(x)])
-#  print x,topics_to_gold[str(x)]
-#print term_to_topics
diff --git a/gi/pyp-topics/scripts/score-topics.py b/gi/pyp-topics/scripts/score-topics.py
deleted file mode 100755
index 1d8a1fcd..00000000
--- a/gi/pyp-topics/scripts/score-topics.py
+++ /dev/null
@@ -1,64 +0,0 @@
-#!/usr/bin/python
-
-import sys
-from collections import defaultdict
-
-def dict_max(d):
-  max_val=-1
-  max_key=None
-  for k in d:
-    if d[k] > max_val: 
-      max_val = d[k]
-      max_key = k
-  assert max_key
-  return max_key
-
-if len(sys.argv) != 3:
-  print "Usage: score-topics.py gold pred"
-  exit(1)
-
-gold_file=open(sys.argv[1],'r')
-pred_file=open(sys.argv[2],'r')
-
-gold_to_topics = defaultdict(dict)
-topics_to_gold = defaultdict(dict)
-term_to_topics = defaultdict(dict)
-
-for gold_line,pred_line in zip(gold_file,pred_file):
-  gold_tokens = gold_line.split()
-  pred_tokens = pred_line.split()
-  assert len(gold_tokens) == len(pred_tokens)
-
-  for gold_token,pred_token in zip(gold_tokens,pred_tokens):
-    gold_term,gold_tag = gold_token.rsplit('|',1)
-    gold_to_topics[gold_tag][pred_token] \
-      = gold_to_topics[gold_tag].get(pred_token, 0) + 1
-    term_to_topics[gold_term][pred_token] \
-      = term_to_topics[gold_term].get(pred_token, 0) + 1
-    topics_to_gold[pred_token][gold_tag] \
-      = topics_to_gold[pred_token].get(gold_tag, 0) + 1
-
-pred=0
-correct=0
-gold_file=open(sys.argv[1],'r')
-pred_file=open(sys.argv[2],'r')
-for gold_line,pred_line in zip(gold_file,pred_file):
-  gold_tokens = gold_line.split()
-  pred_tokens = pred_line.split()
-
-  for gold_token,pred_token in zip(gold_tokens,pred_tokens):
-    gold_term,gold_tag = gold_token.rsplit('|',1)
-#   print "%s|%s" % (gold_token, dict_max(gold_to_topics[gold_tag])),
-    print "%s|%s|%s" % (gold_token, pred_token, dict_max(topics_to_gold[pred_token])),
-    pred += 1
-    if gold_tag == dict_max(topics_to_gold[pred_token]):
-      correct += 1
-  print
-print >>sys.stderr, "Many-to-One Accuracy = %f" % (float(correct) / pred)
-#for x in gold_to_topics: 
-#  print x,dict_max(gold_to_topics[x])
-#print "###################################################"
-#for x in range(len(topics_to_gold)): 
-#  print x,dict_max(topics_to_gold[str(x)])
-#  print x,topics_to_gold[str(x)]
-#print term_to_topics
diff --git a/gi/pyp-topics/scripts/spans2labels.py b/gi/pyp-topics/scripts/spans2labels.py
deleted file mode 100755
index 50fa8106..00000000
--- a/gi/pyp-topics/scripts/spans2labels.py
+++ /dev/null
@@ -1,137 +0,0 @@
-#!/usr/bin/python
-
-import sys
-from operator import itemgetter
-
-if len(sys.argv) <= 2:
-  print "Usage: spans2labels.py phrase_context_index [order] [threshold] [languages={s,t,b}{s,t,b}] [type={tag,tok,both},{tag,tok,both}]"
-  exit(1)
-
-order=1
-threshold = 0
-cutoff_cat = "<UNK>"
-if len(sys.argv) > 2:
-  order = int(sys.argv[2])
-if len(sys.argv) > 3:
-  threshold = float(sys.argv[3])
-phr=ctx='t'
-if len(sys.argv) > 4:
-  phr, ctx = sys.argv[4]
-  assert phr in 'stb'
-  assert ctx in 'stb'
-phr_typ = ctx_typ = 'both'
-if len(sys.argv) > 5:
-  phr_typ, ctx_typ = sys.argv[5].split(',')
-  assert phr_typ in ('tag', 'tok', 'both')
-  assert ctx_typ in ('tag', 'tok', 'both')
-
-#print >>sys.stderr, "Loading phrase index"
-phrase_context_index = {}
-for line in file(sys.argv[1], 'r'):
-  phrase,tail= line.split('\t')
-  contexts = tail.split(" ||| ")
-  try: # remove Phil's bizarre integer pair
-       x,y = contexts[0].split()
-       x=int(x); y=int(y)
-       contexts = contexts[1:]
-  except:
-       pass
-  if len(contexts) == 1: continue
-  assert len(contexts) % 2 == 0
-  for i in range(0, len(contexts), 2):
-    #parse contexts[i+1] = " C=1 P=0.8 ... "
-    features=dict([ keyval.split('=') for keyval in contexts[i+1].split()])
-    category = features['C']    
-    if features.has_key('P') and float(features['P']) < threshold:
-	category = cutoff_cat
-    
-    phrase_context_index[(phrase,contexts[i])] = category 
-    #print (phrase,contexts[i]), category
-
-#print >>sys.stderr, "Labelling spans"
-for line in sys.stdin:
-  #print >>sys.stderr, "line", line.strip()
-  line_segments = line.split(' ||| ')
-  assert len(line_segments) >= 3
-  source = ['<s>' for x in range(order)] + line_segments[0].split() + ['</s>' for x in range(order)]
-  target = ['<s>' for x in range(order)] + line_segments[1].split() + ['</s>' for x in range(order)]
-  phrases = [ [int(i) for i in x.split('-')] for x in line_segments[2].split()]
-
-  if phr_typ != 'both' or ctx_typ != 'both':
-    if phr in 'tb' or ctx in 'tb':
-        target_toks = ['<s>' for x in range(order)] + map(lambda x: x.rsplit('_', 1)[0], line_segments[1].split()) + ['</s>' for x in range(order)]
-        target_tags = ['<s>' for x in range(order)] + map(lambda x: x.rsplit('_', 1)[-1], line_segments[1].split()) + ['</s>' for x in range(order)]
-
-        if phr in 'tb':
-            if phr_typ == 'tok':
-                targetP = target_toks
-            elif phr_typ == 'tag':
-                targetP = target_tags
-        if ctx in 'tb':
-            if ctx_typ == 'tok':
-                targetC = target_toks
-            elif ctx_typ == 'tag':
-                targetC = target_tags
-
-    if phr in 'sb' or ctx in 'sb':
-        source_toks = ['<s>' for x in range(order)] + map(lambda x: x.rsplit('_', 1)[0], line_segments[0].split()) + ['</s>' for x in range(order)]
-        source_tags = ['<s>' for x in range(order)] + map(lambda x: x.rsplit('_', 1)[-1], line_segments[0].split()) + ['</s>' for x in range(order)]
-
-        if phr in 'sb':
-            if phr_typ == 'tok':
-                sourceP = source_toks
-            elif phr_typ == 'tag':
-                sourceP = source_tags
-        if ctx in 'sb':
-            if ctx_typ == 'tok':
-                sourceC = source_toks
-            elif ctx_typ == 'tag':
-                sourceC = source_tags
-  else:
-    sourceP = sourceC = source
-    targetP = targetC = target
-
-  #print >>sys.stderr, "line", source, '---', target, 'phrases', phrases
-
-  print "|||",
-
-  for s1,s2,t1,t2 in phrases:
-    s1 += order
-    s2 += order
-    t1 += order
-    t2 += order
-
-    phraset = phrases = contextt = contexts = ''
-    if phr in 'tb':
-        phraset = reduce(lambda x, y: x+y+" ", targetP[t1:t2], "").strip()
-    if phr in 'sb':
-        phrases = reduce(lambda x, y: x+y+" ", sourceP[s1:s2], "").strip()
-
-    if ctx in 'tb':
-        left_context = reduce(lambda x, y: x+y+" ", targetC[t1-order:t1], "")
-        right_context = reduce(lambda x, y: x+y+" ", targetC[t2:t2+order], "").strip()
-        contextt = "%s<PHRASE> %s" % (left_context, right_context)
-    if ctx in 'sb':
-        left_context = reduce(lambda x, y: x+y+" ", sourceC[s1-order:s1], "")
-        right_context = reduce(lambda x, y: x+y+" ", sourceC[s2:s2+order], "").strip()
-        contexts = "%s<PHRASE> %s" % (left_context, right_context)
-
-    if phr == 'b':
-        phrase = phraset + ' <SPLIT> ' + phrases
-    elif phr == 's':
-        phrase = phrases
-    else:
-        phrase = phraset
-
-    if ctx == 'b':
-        context = contextt + ' <SPLIT> ' + contexts
-    elif ctx == 's':
-        context = contexts
-    else:
-        context = contextt
-
-    #print "%d-%d-%d-%d looking up" % (s1-order,s2-order,t1-order,t2-order), (phrase, context)
-    label = phrase_context_index.get((phrase,context), cutoff_cat)
-    if label != cutoff_cat: #cutoff'd spans are left unlabelled
-      print "%d-%d-%d-%d:X%s" % (s1-order,s2-order,t1-order,t2-order,label),
-  print
diff --git a/gi/pyp-topics/scripts/tokens2classes.py b/gi/pyp-topics/scripts/tokens2classes.py
deleted file mode 100755
index 33df255f..00000000
--- a/gi/pyp-topics/scripts/tokens2classes.py
+++ /dev/null
@@ -1,27 +0,0 @@
-#!/usr/bin/python
-
-import sys
-
-if len(sys.argv) != 3:
-  print "Usage: tokens2classes.py source_classes target_classes"
-  exit(1)
-
-source_to_topics = {}
-for line in open(sys.argv[1],'r'):
-  term,cls = line.split()
-  source_to_topics[term] = cls
-
-target_to_topics = {}
-for line in open(sys.argv[2],'r'):
-  term,cls = line.split()
-  target_to_topics[term] = cls
-
-for line in sys.stdin:
-  source, target, tail = line.split(" ||| ")
-
-  for token in source.split():
-    print source_to_topics[token],
-  print "|||",
-  for token in target.split():
-    print target_to_topics[token],
-  print "|||", tail,
diff --git a/gi/pyp-topics/scripts/topics.py b/gi/pyp-topics/scripts/topics.py
deleted file mode 100755
index 0db1af71..00000000
--- a/gi/pyp-topics/scripts/topics.py
+++ /dev/null
@@ -1,20 +0,0 @@
-#!/usr/bin/python
-
-import sys
-
-if len(sys.argv) != 2:
-  print "Usage: topics.py words-per-topic"
-  exit(1)
-
-for t,line in enumerate(sys.stdin):
-  tokens = line.split()
-  terms = []
-  for token in tokens:
-    elements = token.rsplit(':',1)
-    terms.append((int(elements[1]),elements[0]))
-  terms.sort()
-  terms.reverse()
-
-  print "Topic %d:" % t
-  map(lambda (x,y) : sys.stdout.write("   %s:%s\n" % (y,x)), terms[:int(sys.argv[1])])
-  print
-- 
cgit v1.2.3