diff options
author | philblunsom@gmail.com <philblunsom@gmail.com@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-06-22 20:34:00 +0000 |
---|---|---|
committer | philblunsom@gmail.com <philblunsom@gmail.com@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-06-22 20:34:00 +0000 |
commit | 2f2ba42a1453f4a3a08f9c1ecfc53c1b1c83d550 (patch) | |
tree | 646e81b6325280f64a72771b5eeadf5118e465a9 /gi/pyp-topics/scripts/extract_contexts.py | |
parent | 2f2e36ca3060e7e9853c3d611f6cc5e112a76ddd (diff) |
Initial ci of gi dir
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@5 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'gi/pyp-topics/scripts/extract_contexts.py')
-rwxr-xr-x | gi/pyp-topics/scripts/extract_contexts.py | 144 |
1 file changed, 144 insertions, 0 deletions
diff --git a/gi/pyp-topics/scripts/extract_contexts.py b/gi/pyp-topics/scripts/extract_contexts.py new file mode 100755 index 00000000..b2723f2a --- /dev/null +++ b/gi/pyp-topics/scripts/extract_contexts.py @@ -0,0 +1,144 @@ +#!/usr/bin/python + +import sys,collections + +def extract_backoff(context_list, order): + assert len(context_list) == (2*order) + backoffs = [] + for i in range(1,order+1): + if i == order: + backoffs.append(([context_list[i-1]+"|"], ["|"+context_list[i]])) + else: + right_limit = 2*order-i + core = context_list[i:right_limit] + left = [context_list[i-1]+"|"*(order-i+1)] + right = ["|"*(order-i+1)+context_list[right_limit]] + backoffs.append((core, left, right)) +# print context_list, backoffs + return backoffs + +def tuple_to_str(t): + s="" + for i,x in enumerate(t): + if i > 0: s += "|" + s += str(x) + return s + +if len(sys.argv) < 3: + print "Usage: extract-contexts.py output_filename order cutoff lowercase" + exit(1) + +output_filename = sys.argv[1] +order = int(sys.argv[2]) +cutoff = 0 +if len(sys.argv) > 3: + cutoff = int(sys.argv[3]) +lowercase = False +if len(sys.argv) > 4: + lowercase = bool(sys.argv[4]) + +contexts_dict={} +contexts_list=[] +contexts_freq=collections.defaultdict(int) +contexts_backoff={} + +token_dict={} +token_list=[] +documents_dict=collections.defaultdict(dict) + +contexts_at_order = [i for i in range(order+1)] + +prefix = ["<s%d>|<s>"%i for i in range(order)] +suffix = ["</s%d>|</s>"%i for i in range(order)] + +for line in sys.stdin: + tokens = list(prefix) + tokens.extend(line.split()) + tokens.extend(suffix) + if lowercase: + tokens = map(lambda x: x.lower(), tokens) + + for i in range(order, len(tokens)-order): + context_list = [] + term="" + for j in range(i-order, i+order+1): + token,tag = tokens[j].rsplit('|',2) + if j != i: + context_list.append(token) + else: + if token not in token_dict: + token_dict[token] = len(token_dict) + token_list.append(token) + term = token_dict[token] + + context = 
tuple_to_str(tuple(context_list)) + + if context not in contexts_dict: + context_index = len(contexts_dict) + contexts_dict[context] = context_index + contexts_list.append(context) + contexts_at_order[0] += 1 + + # handle backoff + backoff_contexts = extract_backoff(context_list, order) + bo_indexes=[(context_index,)] +# bo_indexes=[(context,)] + for i,bo in enumerate(backoff_contexts): + factor_indexes=[] + for factor in bo: + bo_tuple = tuple_to_str(tuple(factor)) + if bo_tuple not in contexts_dict: + contexts_dict[bo_tuple] = len(contexts_dict) + contexts_list.append(bo_tuple) + contexts_at_order[i+1] += 1 +# factor_indexes.append(bo_tuple) + factor_indexes.append(contexts_dict[bo_tuple]) + bo_indexes.append(tuple(factor_indexes)) + + for i in range(len(bo_indexes)-1): + contexts_backoff[bo_indexes[i][0]] = bo_indexes[i+1] + + context_index = contexts_dict[context] + contexts_freq[context_index] += 1 + + if context_index not in documents_dict[term]: + documents_dict[term][context_index] = 1 + else: + documents_dict[term][context_index] += 1 + +term_file = open(output_filename+".terms",'w') +for t in token_list: print >>term_file, t +term_file.close() + +contexts_file = open(output_filename+".contexts",'w') +for c in contexts_list: + print >>contexts_file, c +contexts_file.close() + +data_file = open(output_filename+".data",'w') +for t in range(len(token_list)): + line="" + num_active=0 + for c in documents_dict[t]: + count = documents_dict[t][c] + if contexts_freq[c] >= cutoff: + line += (' ' + str(c) + ':' + str(count)) + num_active += 1 + if num_active > 0: + print >>data_file, "%d%s" % (num_active,line) +data_file.close() + +contexts_backoff_file = open(output_filename+".contexts_backoff",'w') +print >>contexts_backoff_file, len(contexts_list), order, +#for x in contexts_at_order: +# print >>contexts_backoff_file, x, +#print >>contexts_backoff_file +for x in range(order-1): + print >>contexts_backoff_file, 3, +print >>contexts_backoff_file, 2 + +for x in 
contexts_backoff: + print >>contexts_backoff_file, x, + for y in contexts_backoff[x]: print >>contexts_backoff_file, y, + print >>contexts_backoff_file +contexts_backoff_file.close() |