cdec cleanup, remove bayesian stuff, parsing stuff

author: Chris Dyer <cdyer@cab.ark.cs.cmu.edu> 2012-10-02 00:19:43 -0400
committer: Chris Dyer <cdyer@cab.ark.cs.cmu.edu> 2012-10-02 00:19:43 -0400
commit: e26434979adc33bd949566ba7bf02dff64e80a3e (patch)
tree: d1c72495e3af6301bd28e7e66c42de0c7a944d1f /gi/pyp-topics/scripts/extract_contexts.py
parent: 0870d4a1f5e14cc7daf553b180d599f09f6614a2 (diff)
1 files changed, 0 insertions, 144 deletions
diff --git a/gi/pyp-topics/scripts/extract_contexts.py b/gi/pyp-topics/scripts/extract_contexts.py
deleted file mode 100755
index b2723f2a..00000000
--- a/gi/pyp-topics/scripts/extract_contexts.py
+++ /dev/null
@@ -1,144 +0,0 @@
-#!/usr/bin/python
-
-import sys,collections
-
-def extract_backoff(context_list, order):
-  assert len(context_list) == (2*order)
-  backoffs = []
-  for i in range(1,order+1):
-    if i == order:
-      backoffs.append(([context_list[i-1]+"|"], ["|"+context_list[i]]))
-    else:
-      right_limit = 2*order-i
-      core = context_list[i:right_limit]
-      left = [context_list[i-1]+"|"*(order-i+1)]
-      right = ["|"*(order-i+1)+context_list[right_limit]]
-      backoffs.append((core, left, right))
-# print context_list, backoffs
-  return backoffs
-
-def tuple_to_str(t):
-  s=""
-  for i,x in enumerate(t):
-    if i > 0: s += "|"
-    s += str(x)
-  return s
-
-if len(sys.argv) < 3:
-  print "Usage: extract-contexts.py output_filename order cutoff lowercase"
-  exit(1)
-
-output_filename = sys.argv[1]
-order = int(sys.argv[2])
-cutoff = 0
-if len(sys.argv) > 3:
-  cutoff = int(sys.argv[3])
-lowercase = False
-if len(sys.argv) > 4:
-  lowercase = bool(sys.argv[4])
-
-contexts_dict={}
-contexts_list=[]
-contexts_freq=collections.defaultdict(int)
-contexts_backoff={}
-
-token_dict={}
-token_list=[]
-documents_dict=collections.defaultdict(dict)
-
-contexts_at_order = [i for i in range(order+1)]
-
-prefix = ["<s%d>|<s>"%i for i in range(order)]
-suffix = ["</s%d>|</s>"%i for i in range(order)]
-
-for line in sys.stdin:
-  tokens = list(prefix)
-  tokens.extend(line.split())
-  tokens.extend(suffix)
-  if lowercase:
-    tokens = map(lambda x: x.lower(), tokens)
-
-  for i in range(order, len(tokens)-order):
-    context_list = []
-    term=""
-    for j in range(i-order, i+order+1):
-      token,tag = tokens[j].rsplit('|',2)
-      if j != i:
-        context_list.append(token)
-      else:
-        if token not in token_dict: 
-          token_dict[token] = len(token_dict)
-          token_list.append(token)
-        term = token_dict[token] 
-
-    context = tuple_to_str(tuple(context_list))
-
-    if context not in contexts_dict: 
-      context_index = len(contexts_dict)
-      contexts_dict[context] = context_index
-      contexts_list.append(context)
-      contexts_at_order[0] += 1
-
-      # handle backoff
-      backoff_contexts = extract_backoff(context_list, order)
-      bo_indexes=[(context_index,)]
-#     bo_indexes=[(context,)]
-      for i,bo in enumerate(backoff_contexts):
-        factor_indexes=[]
-        for factor in bo:
-          bo_tuple = tuple_to_str(tuple(factor))
-          if bo_tuple not in contexts_dict:
-            contexts_dict[bo_tuple] = len(contexts_dict)
-            contexts_list.append(bo_tuple)
-            contexts_at_order[i+1] += 1
-#         factor_indexes.append(bo_tuple)
-          factor_indexes.append(contexts_dict[bo_tuple])
-        bo_indexes.append(tuple(factor_indexes))
-      
-      for i in range(len(bo_indexes)-1):
-        contexts_backoff[bo_indexes[i][0]] = bo_indexes[i+1]
-
-    context_index = contexts_dict[context]
-    contexts_freq[context_index] += 1
-
-    if context_index not in documents_dict[term]:
-      documents_dict[term][context_index] = 1
-    else:
-      documents_dict[term][context_index] += 1
-
-term_file = open(output_filename+".terms",'w')
-for t in token_list: print >>term_file, t
-term_file.close()
-
-contexts_file = open(output_filename+".contexts",'w')
-for c in contexts_list: 
-  print >>contexts_file, c
-contexts_file.close()
-
-data_file = open(output_filename+".data",'w')
-for t in range(len(token_list)): 
-  line=""
-  num_active=0
-  for c in documents_dict[t]:
-    count = documents_dict[t][c]
-    if contexts_freq[c] >= cutoff:
-      line += (' ' + str(c) + ':' + str(count))
-      num_active += 1
-  if num_active > 0:
-    print >>data_file, "%d%s" % (num_active,line)
-data_file.close()
-
-contexts_backoff_file = open(output_filename+".contexts_backoff",'w')
-print >>contexts_backoff_file, len(contexts_list), order,
-#for x in contexts_at_order: 
-#  print >>contexts_backoff_file, x,
-#print >>contexts_backoff_file
-for x in range(order-1):
-  print >>contexts_backoff_file, 3,
-print >>contexts_backoff_file, 2
-
-for x in contexts_backoff: 
-  print >>contexts_backoff_file, x, 
-  for y in contexts_backoff[x]: print >>contexts_backoff_file, y,
-  print >>contexts_backoff_file 
-contexts_backoff_file.close()
author	Chris Dyer <cdyer@cab.ark.cs.cmu.edu>	2012-10-02 00:19:43 -0400
committer	Chris Dyer <cdyer@cab.ark.cs.cmu.edu>	2012-10-02 00:19:43 -0400
commit	e26434979adc33bd949566ba7bf02dff64e80a3e (patch)
tree	d1c72495e3af6301bd28e7e66c42de0c7a944d1f /gi/pyp-topics/scripts/extract_contexts.py
parent	0870d4a1f5e14cc7daf553b180d599f09f6614a2 (diff)