#!/usr/bin/python
"""Extract term/context co-occurrence counts from a tagged corpus.

Usage: extract-contexts.py output_filename order [cutoff [lowercase]]

Standard input holds one sentence per line, each whitespace-separated item
being ``word|tag`` (the tag is discarded).  For every word, the ``order``
words on each side form its context; sentences are padded with ``<sN>`` /
``</sN>`` markers so that every word has a full window.

Four files are written, all named after ``output_filename``:

``.terms``
    One distinct target word per line, in first-seen order.
``.contexts``
    One context string per line, in index order.  This holds both full
    contexts and the backoff factors derived from them.
``.data``
    One line per term: the number of listed contexts followed by
    ``context_index:count`` pairs.  Only contexts seen at least ``cutoff``
    times in total are listed, and terms with no such context are skipped.
``.contexts_backoff``
    A header ``num_contexts order 3 ... 3 2`` (``order - 1`` threes, which
    matches the number of factors produced at each backoff level), then one
    line per context that has a backoff: its index followed by the indexes
    of its backoff factors.

The code avoids print-to-file syntax so that it runs unchanged on Python 2.6+
and Python 3.
"""

import collections
import sys

# Spellings of the ``lowercase`` argument that switch lowercasing off.  Every
# other non-empty value switches it on, as it always did.
_FALSE_FLAGS = ("", "0", "false", "no", "off")


def extract_backoff(context_list, order):
    """Return the successive backoff factorisations of a context window.

    ``context_list`` holds the ``order`` words left of the target followed by
    the ``order`` words right of it, so it must have ``2 * order`` entries.

    One entry is produced per level ``i`` in ``1..order``:

    * for ``i < order`` a triple ``(core, left, right)``: ``core`` is the
      window with ``i`` words trimmed from each end, ``left`` is the word just
      trimmed on the left followed by ``order - i + 1`` pipes, and ``right`` is
      the same number of pipes followed by the word just trimmed on the right;
    * for ``i == order`` a pair ``(left, right)`` built from the two innermost
      words, each with a single pipe.

    ``left`` and ``right`` are one-element lists so that every factor can be
    passed to ``tuple_to_str``.

    Raises AssertionError if ``context_list`` has the wrong length.
    """
    assert len(context_list) == (2 * order)
    backoffs = []
    for i in range(1, order + 1):
        if i == order:
            backoffs.append(
                ([context_list[i - 1] + "|"], ["|" + context_list[i]]))
        else:
            right_limit = 2 * order - i
            pad = "|" * (order - i + 1)
            core = context_list[i:right_limit]
            left = [context_list[i - 1] + pad]
            right = [pad + context_list[right_limit]]
            backoffs.append((core, left, right))
    return backoffs


def tuple_to_str(t):
    """Join the items of ``t`` with ``|`` after converting each to a string."""
    return "|".join(str(x) for x in t)


def _parse_flag(text):
    """Interpret a command-line string as a boolean switch."""
    return text.strip().lower() not in _FALSE_FLAGS


def _split_word(item):
    """Return the word part of a ``word|tag`` item.

    Only the last pipe separates the tag, so words that themselves contain
    pipes are kept whole.  Raises ValueError when there is no pipe at all.
    """
    parts = item.rsplit("|", 1)
    if len(parts) != 2:
        raise ValueError("expected a word|tag token, got %r" % (item,))
    return parts[0]


def _read_corpus(lines, order, lowercase):
    """Tally terms, contexts and backoff links over all sentences.

    Returns ``(token_list, contexts_list, contexts_freq, contexts_backoff,
    documents_dict)`` where ``documents_dict[term][context]`` is the number of
    times the term was seen in that full context.
    """
    contexts_dict = {}
    contexts_list = []
    contexts_freq = collections.defaultdict(int)
    contexts_backoff = {}
    token_dict = {}
    token_list = []
    documents_dict = collections.defaultdict(dict)

    prefix = ["<s%d>|<s>" % i for i in range(order)]
    suffix = ["</s%d>|</s>" % i for i in range(order)]

    for line in lines:
        tokens = prefix + line.split() + suffix
        if lowercase:
            tokens = [tok.lower() for tok in tokens]
        # Strip the tags once per sentence instead of once per window.
        words = [_split_word(tok) for tok in tokens]

        for pos in range(order, len(words) - order):
            word = words[pos]
            if word not in token_dict:
                token_dict[word] = len(token_dict)
                token_list.append(word)
            term = token_dict[word]

            context_words = words[pos - order:pos] + words[pos + 1:pos + order + 1]
            context = tuple_to_str(context_words)

            if context not in contexts_dict:
                context_index = len(contexts_dict)
                contexts_dict[context] = context_index
                contexts_list.append(context)

                # Register every backoff factor and remember, level by level,
                # which factors the previous level backs off to.
                chain = [(context_index,)]
                for factors in extract_backoff(context_words, order):
                    factor_indexes = []
                    for factor in factors:
                        key = tuple_to_str(factor)
                        if key not in contexts_dict:
                            contexts_dict[key] = len(contexts_dict)
                            contexts_list.append(key)
                        factor_indexes.append(contexts_dict[key])
                    chain.append(tuple(factor_indexes))
                for current, fallback in zip(chain, chain[1:]):
                    contexts_backoff[current[0]] = fallback

            context_index = contexts_dict[context]
            contexts_freq[context_index] += 1
            seen = documents_dict[term]
            seen[context_index] = seen.get(context_index, 0) + 1

    return (token_list, contexts_list, contexts_freq, contexts_backoff,
            documents_dict)


def _write_lines(path, items):
    """Write each item of ``items`` on its own line of ``path``."""
    with open(path, "w") as handle:
        for item in items:
            handle.write("%s\n" % (item,))


def _write_outputs(output_filename, order, cutoff, token_list, contexts_list,
                   contexts_freq, contexts_backoff, documents_dict):
    """Write the .terms, .contexts, .data and .contexts_backoff files."""
    _write_lines(output_filename + ".terms", token_list)
    _write_lines(output_filename + ".contexts", contexts_list)

    # NOTE(review): a term whose contexts all fall below the cutoff gets no
    # line here, so .data does not line up one-to-one with .terms in that
    # case - confirm that the consumer expects this.
    with open(output_filename + ".data", "w") as data_file:
        for term in range(len(token_list)):
            active = ["%s:%s" % (index, count)
                      for index, count in documents_dict[term].items()
                      if contexts_freq[index] >= cutoff]
            if active:
                data_file.write("%d %s\n" % (len(active), " ".join(active)))

    with open(output_filename + ".contexts_backoff", "w") as backoff_file:
        header = [len(contexts_list), order] + [3] * (order - 1) + [2]
        backoff_file.write(" ".join(str(v) for v in header) + "\n")
        for index, fallback in contexts_backoff.items():
            row = (index,) + tuple(fallback)
            backoff_file.write(" ".join(str(v) for v in row) + "\n")


def main(argv=None, lines=None):
    """Run the extraction and return the process exit status.

    ``argv`` defaults to ``sys.argv`` and ``lines`` (an iterable of sentence
    strings) defaults to ``sys.stdin``.  Returns 1 after printing the usage
    message when too few arguments are given, otherwise 0.
    """
    if argv is None:
        argv = sys.argv
    if lines is None:
        lines = sys.stdin

    if len(argv) < 3:
        print("Usage: extract-contexts.py output_filename order cutoff lowercase")
        return 1

    output_filename = argv[1]
    order = int(argv[2])
    cutoff = int(argv[3]) if len(argv) > 3 else 0
    lowercase = _parse_flag(argv[4]) if len(argv) > 4 else False

    (token_list, contexts_list, contexts_freq, contexts_backoff,
     documents_dict) = _read_corpus(lines, order, lowercase)
    _write_outputs(output_filename, order, cutoff, token_list, contexts_list,
                   contexts_freq, contexts_backoff, documents_dict)
    return 0


if __name__ == "__main__":
    sys.exit(main())