From e26434979adc33bd949566ba7bf02dff64e80a3e Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Tue, 2 Oct 2012 00:19:43 -0400 Subject: cdec cleanup, remove bayesian stuff, parsing stuff --- gi/pyp-topics/scripts/extract_contexts.py | 144 ------------------------------ 1 file changed, 144 deletions(-) delete mode 100755 gi/pyp-topics/scripts/extract_contexts.py (limited to 'gi/pyp-topics/scripts/extract_contexts.py') diff --git a/gi/pyp-topics/scripts/extract_contexts.py b/gi/pyp-topics/scripts/extract_contexts.py deleted file mode 100755 index b2723f2a..00000000 --- a/gi/pyp-topics/scripts/extract_contexts.py +++ /dev/null @@ -1,144 +0,0 @@ -#!/usr/bin/python - -import sys,collections - -def extract_backoff(context_list, order): - assert len(context_list) == (2*order) - backoffs = [] - for i in range(1,order+1): - if i == order: - backoffs.append(([context_list[i-1]+"|"], ["|"+context_list[i]])) - else: - right_limit = 2*order-i - core = context_list[i:right_limit] - left = [context_list[i-1]+"|"*(order-i+1)] - right = ["|"*(order-i+1)+context_list[right_limit]] - backoffs.append((core, left, right)) -# print context_list, backoffs - return backoffs - -def tuple_to_str(t): - s="" - for i,x in enumerate(t): - if i > 0: s += "|" - s += str(x) - return s - -if len(sys.argv) < 3: - print "Usage: extract-contexts.py output_filename order cutoff lowercase" - exit(1) - -output_filename = sys.argv[1] -order = int(sys.argv[2]) -cutoff = 0 -if len(sys.argv) > 3: - cutoff = int(sys.argv[3]) -lowercase = False -if len(sys.argv) > 4: - lowercase = bool(sys.argv[4]) - -contexts_dict={} -contexts_list=[] -contexts_freq=collections.defaultdict(int) -contexts_backoff={} - -token_dict={} -token_list=[] -documents_dict=collections.defaultdict(dict) - -contexts_at_order = [i for i in range(order+1)] - -prefix = ["|"%i for i in range(order)] -suffix = ["|"%i for i in range(order)] - -for line in sys.stdin: - tokens = list(prefix) - tokens.extend(line.split()) - tokens.extend(suffix) - if lowercase: - tokens = map(lambda x: x.lower(), tokens) - - for i in range(order, len(tokens)-order): - context_list = [] - term="" - for j in range(i-order, i+order+1): - token,tag = tokens[j].rsplit('|',2) - if j != i: - context_list.append(token) - else: - if token not in token_dict: - token_dict[token] = len(token_dict) - token_list.append(token) - term = token_dict[token] - - context = tuple_to_str(tuple(context_list)) - - if context not in contexts_dict: - context_index = len(contexts_dict) - contexts_dict[context] = context_index - contexts_list.append(context) - contexts_at_order[0] += 1 - - # handle backoff - backoff_contexts = extract_backoff(context_list, order) - bo_indexes=[(context_index,)] -# bo_indexes=[(context,)] - for i,bo in enumerate(backoff_contexts): - factor_indexes=[] - for factor in bo: - bo_tuple = tuple_to_str(tuple(factor)) - if bo_tuple not in contexts_dict: - contexts_dict[bo_tuple] = len(contexts_dict) - contexts_list.append(bo_tuple) - contexts_at_order[i+1] += 1 -# factor_indexes.append(bo_tuple) - factor_indexes.append(contexts_dict[bo_tuple]) - bo_indexes.append(tuple(factor_indexes)) - - for i in range(len(bo_indexes)-1): - contexts_backoff[bo_indexes[i][0]] = bo_indexes[i+1] - - context_index = contexts_dict[context] - contexts_freq[context_index] += 1 - - if context_index not in documents_dict[term]: - documents_dict[term][context_index] = 1 - else: - documents_dict[term][context_index] += 1 - -term_file = open(output_filename+".terms",'w') -for t in token_list: print >>term_file, t -term_file.close() - -contexts_file = open(output_filename+".contexts",'w') -for c in contexts_list: - print >>contexts_file, c -contexts_file.close() - -data_file = open(output_filename+".data",'w') -for t in range(len(token_list)): - line="" - num_active=0 - for c in documents_dict[t]: - count = documents_dict[t][c] - if contexts_freq[c] >= cutoff: - line += (' ' + str(c) + ':' + str(count)) - num_active += 1 - if num_active > 0: - print >>data_file, "%d%s" % (num_active,line) -data_file.close() - -contexts_backoff_file = open(output_filename+".contexts_backoff",'w') -print >>contexts_backoff_file, len(contexts_list), order, -#for x in contexts_at_order: -# print >>contexts_backoff_file, x, -#print >>contexts_backoff_file -for x in range(order-1): - print >>contexts_backoff_file, 3, -print >>contexts_backoff_file, 2 - -for x in contexts_backoff: - print >>contexts_backoff_file, x, - for y in contexts_backoff[x]: print >>contexts_backoff_file, y, - print >>contexts_backoff_file -contexts_backoff_file.close() -- cgit v1.2.3