From 925087356b853e2099c1b60d8b757d7aa02121a9 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Tue, 2 Oct 2012 00:19:43 -0400 Subject: cdec cleanup, remove bayesian stuff, parsing stuff --- gi/pyp-topics/scripts/extract_contexts_test.py | 72 -------------------------- 1 file changed, 72 deletions(-) delete mode 100755 gi/pyp-topics/scripts/extract_contexts_test.py (limited to 'gi/pyp-topics/scripts/extract_contexts_test.py') diff --git a/gi/pyp-topics/scripts/extract_contexts_test.py b/gi/pyp-topics/scripts/extract_contexts_test.py deleted file mode 100755 index 693b6e0b..00000000 --- a/gi/pyp-topics/scripts/extract_contexts_test.py +++ /dev/null @@ -1,72 +0,0 @@ -#!/usr/bin/python - -import sys,collections - -def tuple_to_str(t): - s="" - for i,x in enumerate(t): - if i > 0: s += "|" - s += str(x) - return s - -if len(sys.argv) < 5: - print "Usage: extract-contexts_test.py output_filename vocab contexts order lowercase" - exit(1) - -output_filename = sys.argv[1] -output = open(output_filename+".test_data",'w') - -unk_term="-UNK-" -vocab_dict={} -for i,x in enumerate(file(sys.argv[2], 'r').readlines()): - vocab_dict[x.strip()]=i - -contexts_dict={} -contexts_list=[] -for i,x in enumerate(file(sys.argv[3], 'r').readlines()): - contexts_dict[x.strip()]=i - contexts_list.append(x.strip()) - -order = int(sys.argv[4]) - -lowercase = False -if len(sys.argv) > 5: - lowercase = bool(sys.argv[5]) -if lowercase: unk_term = unk_term.lower() - -prefix = ["|"%i for i in range(order)] -suffix = ["|"%i for i in range(order)] - -assert unk_term in vocab_dict -for line in sys.stdin: - tokens = list(prefix) - tokens.extend(line.split()) - tokens.extend(suffix) - if lowercase: - tokens = map(lambda x: x.lower(), tokens) - - for i in range(order, len(tokens)-order): - context_list=[] - term="" - for j in range(i-order, i+order+1): - token,tag = tokens[j].rsplit('|',2) - if j != i: - context_list.append(token) - else: - if token not in vocab_dict: - term = vocab_dict[unk_term] - else: - term = vocab_dict[token] - context = tuple_to_str(context_list) - if context not in contexts_dict: - contexts_dict[context] = len(contexts_dict) - contexts_list.append(context) - context_index = contexts_dict[context] - print >>output, "%d:%d" % (term,context_index), - print >>output -output.close() - -contexts_file = open(output_filename+".test_contexts",'w') -for c in contexts_list: - print >>contexts_file, c -contexts_file.close() -- cgit v1.2.3