summaryrefslogtreecommitdiff
path: root/gi/pyp-topics/scripts/extract_contexts.py
diff options
context:
space:
mode:
authorChris Dyer <cdyer@cab.ark.cs.cmu.edu>2012-10-02 00:19:43 -0400
committerChris Dyer <cdyer@cab.ark.cs.cmu.edu>2012-10-02 00:19:43 -0400
commit925087356b853e2099c1b60d8b757d7aa02121a9 (patch)
tree579925c5c9d3da51f43018a5c6d1c4dfbb72b089 /gi/pyp-topics/scripts/extract_contexts.py
parentea79e535d69f6854d01c62e3752971fb6730d8e7 (diff)
cdec cleanup, remove bayesian stuff, parsing stuff
Diffstat (limited to 'gi/pyp-topics/scripts/extract_contexts.py')
-rwxr-xr-xgi/pyp-topics/scripts/extract_contexts.py144
1 files changed, 0 insertions, 144 deletions
diff --git a/gi/pyp-topics/scripts/extract_contexts.py b/gi/pyp-topics/scripts/extract_contexts.py
deleted file mode 100755
index b2723f2a..00000000
--- a/gi/pyp-topics/scripts/extract_contexts.py
+++ /dev/null
@@ -1,144 +0,0 @@
-#!/usr/bin/python
-
-import sys,collections
-
def extract_backoff(context_list, order):
    """Return successively backed-off versions of a full context window.

    context_list holds the 2*order tokens surrounding a target position
    (order tokens to the left, order to the right).  For each backoff
    level 1..order the outermost surviving tokens are padded with '|'
    markers; intermediate levels yield a (core, left, right) triple,
    the innermost level only a (left, right) pair.
    """
    assert len(context_list) == (2 * order)
    levels = []
    for level in range(1, order + 1):
        pad = "|" * (order - level + 1)
        if level == order:
            # innermost backoff: only the two tokens adjacent to the target
            levels.append(([context_list[level - 1] + "|"],
                           ["|" + context_list[level]]))
        else:
            end = 2 * order - level
            levels.append((context_list[level:end],
                           [context_list[level - 1] + pad],
                           [pad + context_list[end]]))
    return levels
-
def tuple_to_str(t):
    """Join the elements of t with '|' separators into a single string."""
    return "|".join(str(x) for x in t)
-
-if len(sys.argv) < 3:
- print "Usage: extract-contexts.py output_filename order cutoff lowercase"
- exit(1)
-
-output_filename = sys.argv[1]
-order = int(sys.argv[2])
-cutoff = 0
-if len(sys.argv) > 3:
- cutoff = int(sys.argv[3])
-lowercase = False
-if len(sys.argv) > 4:
- lowercase = bool(sys.argv[4])
-
contexts_dict = {}           # context string -> integer id
contexts_list = []           # id -> context string
contexts_freq = collections.defaultdict(int)    # context id -> occurrence count
contexts_backoff = {}        # context id -> tuple of next-level backoff ids

token_dict = {}              # term string -> integer id
token_list = []              # id -> term string
documents_dict = collections.defaultdict(dict)  # term id -> {context id: count}

# Number of distinct contexts created at each backoff level (index 0 is the
# full-order context).  These are counters and must start at zero; the
# original seeded them with range(order+1), i.e. [0, 1, 2, ...], skewing
# every level's count by its own index.
contexts_at_order = [0] * (order + 1)

# Sentence-boundary padding tokens in the same "word|tag" shape as input.
prefix = ["<s%d>|<s>" % i for i in range(order)]
suffix = ["</s%d>|</s>" % i for i in range(order)]
-
# Stream sentences from stdin; for every token position collect the
# surrounding context of `order` words on each side, interning terms and
# contexts to integer ids and counting (term, context) co-occurrences.
for line in sys.stdin:
    tokens = list(prefix)
    tokens.extend(line.split())
    tokens.extend(suffix)
    if lowercase:
        tokens = [x.lower() for x in tokens]

    for i in range(order, len(tokens) - order):
        context_list = []
        term = ""
        for j in range(i - order, i + order + 1):
            # Tokens look like "word|tag"; split on the LAST bar only so a
            # word that itself contains '|' cannot produce three fields and
            # crash the two-way unpack (the original used maxsplit=2).
            token, tag = tokens[j].rsplit('|', 1)
            if j != i:
                context_list.append(token)
            else:
                # centre position: intern the target term itself
                if token not in token_dict:
                    token_dict[token] = len(token_dict)
                    token_list.append(token)
                term = token_dict[token]

        context = tuple_to_str(tuple(context_list))

        if context not in contexts_dict:
            context_index = len(contexts_dict)
            contexts_dict[context] = context_index
            contexts_list.append(context)
            contexts_at_order[0] += 1

            # Intern every backoff factor of this new context and record
            # the chain: each context id maps to the tuple of factor ids
            # one backoff level further out.  (Loop variables renamed so
            # they no longer shadow the outer `i`.)
            backoff_contexts = extract_backoff(context_list, order)
            bo_indexes = [(context_index,)]
            for level, bo in enumerate(backoff_contexts):
                factor_indexes = []
                for factor in bo:
                    bo_tuple = tuple_to_str(tuple(factor))
                    if bo_tuple not in contexts_dict:
                        contexts_dict[bo_tuple] = len(contexts_dict)
                        contexts_list.append(bo_tuple)
                        contexts_at_order[level + 1] += 1
                    factor_indexes.append(contexts_dict[bo_tuple])
                bo_indexes.append(tuple(factor_indexes))

            for k in range(len(bo_indexes) - 1):
                contexts_backoff[bo_indexes[k][0]] = bo_indexes[k + 1]

        context_index = contexts_dict[context]
        contexts_freq[context_index] += 1

        # co-occurrence count of this term with this context
        if context_index not in documents_dict[term]:
            documents_dict[term][context_index] = 1
        else:
            documents_dict[term][context_index] += 1
# Term vocabulary: term id -> term string, one per line.
with open(output_filename + ".terms", 'w') as term_file:
    for t in token_list:
        term_file.write("%s\n" % t)

# Context vocabulary: context id -> context string, one per line.
with open(output_filename + ".contexts", 'w') as contexts_file:
    for c in contexts_list:
        contexts_file.write("%s\n" % c)

# Per-term sparse count vectors: "<num_active> id:count id:count ...".
# Contexts below the frequency cutoff are dropped; terms with no surviving
# context produce no line at all.
with open(output_filename + ".data", 'w') as data_file:
    for t in range(len(token_list)):
        cells = []
        for c in documents_dict[t]:
            count = documents_dict[t][c]
            if contexts_freq[c] >= cutoff:
                cells.append(' ' + str(c) + ':' + str(count))
        if cells:
            data_file.write("%d%s\n" % (len(cells), ''.join(cells)))
-
# Backoff table.  Header line: total context count, order, then the number
# of factors per backoff level (3 for each intermediate level, 2 for the
# innermost).  Body: one line per context id followed by its backoff ids.
# (The original built each line from comma-terminated Python 2 prints,
# whose softspace behaviour space-joins the values exactly as done here.)
with open(output_filename + ".contexts_backoff", 'w') as backoff_file:
    header = [len(contexts_list), order] + [3] * (order - 1) + [2]
    backoff_file.write(' '.join(str(v) for v in header) + '\n')
    for x in contexts_backoff:
        row = [x] + list(contexts_backoff[x])
        backoff_file.write(' '.join(str(v) for v in row) + '\n')