diff options
Diffstat (limited to 'gi/pyp-topics/scripts')
-rwxr-xr-x | gi/pyp-topics/scripts/contexts2documents.py | 29 | ||||
-rwxr-xr-x | gi/pyp-topics/scripts/extract_contexts.py | 144 | ||||
-rwxr-xr-x | gi/pyp-topics/scripts/extract_contexts_test.py | 72 | ||||
-rwxr-xr-x | gi/pyp-topics/scripts/extract_leaves.py | 49 | ||||
-rwxr-xr-x | gi/pyp-topics/scripts/map-documents.py | 20 | ||||
-rwxr-xr-x | gi/pyp-topics/scripts/map-terms.py | 20 | ||||
-rwxr-xr-x | gi/pyp-topics/scripts/score-mkcls.py | 61 | ||||
-rwxr-xr-x | gi/pyp-topics/scripts/score-topics.py | 64 | ||||
-rwxr-xr-x | gi/pyp-topics/scripts/spans2labels.py | 46 | ||||
-rwxr-xr-x | gi/pyp-topics/scripts/topics.py | 20 |
10 files changed, 525 insertions, 0 deletions
#!/usr/bin/python
"""Pre- and post-processing helpers for pyp-topics (gi/pyp-topics/scripts).

Reconstruction of six scripts whose sources were flattened together:

    contexts2documents.py   extract_contexts.py   extract_contexts_test.py
    extract_leaves.py       map-documents.py      map-terms.py

Every script is exposed as a ``*_main(argv, ...)`` function returning its exit
status, and ``main()`` dispatches on the name the program was invoked as, so
each original command line keeps working.  Output is produced with
``write()`` rather than ``print`` so the code runs under Python 2.7 and 3.
"""

import collections
import getopt
import os
import sys


# --------------------------------------------------------------------------
# Shared helpers
# --------------------------------------------------------------------------

def parse_flag(text):
    """Interpret a command-line word as a boolean.

    ``bool(sys.argv[n])`` is true for *every* non-empty string, so passing
    ``False`` or ``0`` used to switch the option on.  Here the usual "off"
    spellings are false and anything else is true.
    """
    return text.strip().lower() not in ("", "0", "false", "no", "off")


def tuple_to_str(t):
    """Join the items of *t* with '|' after converting each with ``str``."""
    return "|".join(str(x) for x in t)


def split_token(token):
    """Split ``word|tag`` at the LAST '|' and return ``(word, tag)``.

    Splitting once keeps words that themselves contain '|' intact; the
    previous ``rsplit('|', 2)`` produced three fields for them and crashed.
    Raises ValueError when *token* contains no '|' at all.
    """
    word, tag = token.rsplit("|", 1)
    return word, tag


def read_stripped_lines(path):
    """Return the lines of the file at *path*, each stripped of whitespace."""
    with open(path, "r") as handle:
        return [line.strip() for line in handle]


def padded_words(line, order, lowercase):
    """Return the words of a ``word|tag ...`` line with sentence padding.

    *order* boundary markers (``<s0>``, ``<s1>``, ... / ``</s0>``, ...) are
    added on each side.  Lower-casing is applied to the whole ``word|tag``
    token before the tag is dropped, as the original scripts did.
    """
    tokens = ["<s%d>|<s>" % i for i in range(order)]
    tokens.extend(line.split())
    tokens.extend("</s%d>|</s>" % i for i in range(order))
    if lowercase:
        tokens = [token.lower() for token in tokens]
    return [split_token(token)[0] for token in tokens]


def window_context(words, centre, order):
    """Return the *order* words on each side of ``words[centre]`` as a list."""
    return words[centre - order:centre] + words[centre + 1:centre + order + 1]


# --------------------------------------------------------------------------
# contexts2documents.py
# --------------------------------------------------------------------------

def parse_context_counts(line_tail):
    """Parse ``ctx ||| C=n ||| ctx ||| C=n ...`` into ``(contexts, counts)``.

    Fields at even positions are contexts; fields at odd positions carry the
    count after an '=' sign.
    """
    fields = line_tail.split("|||")
    contexts = [field.strip() for field in fields[0::2]]
    counts = [int(field.split("=")[1].strip()) for field in fields[1::2]]
    return contexts, counts


def contexts2documents(lines, out):
    """Write one ``N id:count ...`` document per input line to *out*.

    Each input line is ``phrase<TAB>contexts``.  Contexts are numbered in
    order of first appearance; the resulting ``{context: id}`` dict is
    returned.
    """
    context_index = {}
    for line in lines:
        # Split on the first tab only, so tabs inside the context list do
        # not break the unpacking.
        _phrase, line_tail = line.split("\t", 1)
        contexts, counts = parse_context_counts(line_tail)
        fields = [str(len(contexts))]
        for context, count in zip(contexts, counts):
            cid = context_index.setdefault(context, len(context_index))
            fields.append("%d:%d" % (cid, count))
        out.write(" ".join(fields) + "\n")
    return context_index


def contexts2documents_main(argv, stdin=None, stdout=None):
    """Entry point of contexts2documents.py; returns the exit status."""
    stdin = sys.stdin if stdin is None else stdin
    stdout = sys.stdout if stdout is None else stdout
    if len(argv) > 2:
        stdout.write("Usage: contexts2documents.py [contexts_index_out]\n")
        return 1
    context_index = contexts2documents(stdin, stdout)
    if len(argv) == 2:
        # One context per line, line number == context id.
        with open(argv[1], "w") as contexts_out:
            for context in sorted(context_index, key=context_index.get):
                contexts_out.write(context + "\n")
    return 0


# --------------------------------------------------------------------------
# extract_contexts.py
# --------------------------------------------------------------------------

ContextData = collections.namedtuple(
    "ContextData",
    "token_list contexts_list documents contexts_freq contexts_backoff")


def extract_backoff(context_list, order):
    """Return the successively shorter backoff factorisations of a context.

    *context_list* holds the *order* words left of the term followed by the
    *order* words right of it.  Level ``i`` (1-based) yields
    ``(core, left, right)``: the inner words plus the outermost remaining
    word on each side, padded with '|' so factors from different levels
    never collide.  The last level has no core and yields ``(left, right)``.
    """
    assert len(context_list) == (2 * order)
    backoffs = []
    for i in range(1, order + 1):
        if i == order:
            backoffs.append(([context_list[i - 1] + "|"],
                             ["|" + context_list[i]]))
        else:
            right_limit = 2 * order - i
            pad = "|" * (order - i + 1)
            core = context_list[i:right_limit]
            left = [context_list[i - 1] + pad]
            right = [pad + context_list[right_limit]]
            backoffs.append((core, left, right))
    return backoffs


def extract_contexts(lines, order, lowercase=False):
    """Collect terms, contexts, co-occurrence counts and backoff links.

    *lines* are sentences of ``word|tag`` tokens.  Returns a ``ContextData``:

    * ``token_list``       - terms, indexed by term id
    * ``contexts_list``    - context strings, indexed by context id
    * ``documents``        - ``{term id: {context id: count}}``
    * ``contexts_freq``    - ``{context id: total count}`` (full contexts)
    * ``contexts_backoff`` - ``{context id: tuple of backoff factor ids}``
    """
    token_ids = {}
    token_list = []
    context_ids = {}
    contexts_list = []
    contexts_freq = collections.defaultdict(int)
    contexts_backoff = {}
    documents = collections.defaultdict(dict)

    def intern_context(text):
        """Return the id of *text*, allocating the next free id if new."""
        if text not in context_ids:
            context_ids[text] = len(context_ids)
            contexts_list.append(text)
        return context_ids[text]

    for line in lines:
        words = padded_words(line, order, lowercase)
        for centre in range(order, len(words) - order):
            word = words[centre]
            if word not in token_ids:
                token_ids[word] = len(token_ids)
                token_list.append(word)
            term = token_ids[word]

            context_list = window_context(words, centre, order)
            context = tuple_to_str(context_list)

            if context not in context_ids:
                # NOTE(review): the flattened source lost its indentation;
                # backoff extraction is placed inside this branch because
                # outside it the code would read a stale context_index.
                levels = [(intern_context(context),)]
                for factors in extract_backoff(context_list, order):
                    levels.append(tuple(intern_context(tuple_to_str(factor))
                                        for factor in factors))
                # Link the first factor of each level to the next level.
                for finer, coarser in zip(levels, levels[1:]):
                    contexts_backoff[finer[0]] = coarser

            context_index = context_ids[context]
            contexts_freq[context_index] += 1
            counts = documents[term]
            counts[context_index] = counts.get(context_index, 0) + 1

    return ContextData(token_list, contexts_list, documents,
                       contexts_freq, contexts_backoff)


def write_context_data(output_filename, data, order, cutoff):
    """Write the ``.terms``, ``.contexts``, ``.data`` and
    ``.contexts_backoff`` files for *data* next to *output_filename*."""
    with open(output_filename + ".terms", "w") as term_file:
        for term in data.token_list:
            term_file.write("%s\n" % term)

    with open(output_filename + ".contexts", "w") as contexts_file:
        for context in data.contexts_list:
            contexts_file.write("%s\n" % context)

    with open(output_filename + ".data", "w") as data_file:
        for term in range(len(data.token_list)):
            active = [(c, n) for c, n in data.documents.get(term, {}).items()
                      if data.contexts_freq[c] >= cutoff]
            # Terms whose contexts all fall below the cutoff get no line.
            if active:
                data_file.write("%d%s\n" % (
                    len(active), "".join(" %d:%d" % pair for pair in active)))

    with open(output_filename + ".contexts_backoff", "w") as backoff_file:
        # Header: context count, order, then the number of factors at each
        # backoff level (3 for every level but the last, which has 2).
        header = [len(data.contexts_list), order] + [3] * (order - 1) + [2]
        backoff_file.write(" ".join(str(x) for x in header) + "\n")
        for context, parents in data.contexts_backoff.items():
            row = (context,) + tuple(parents)
            backoff_file.write(" ".join(str(x) for x in row) + "\n")


def extract_contexts_main(argv, stdin=None, stdout=None):
    """Entry point of extract_contexts.py; returns the exit status."""
    stdin = sys.stdin if stdin is None else stdin
    stdout = sys.stdout if stdout is None else stdout
    if len(argv) < 3:
        stdout.write("Usage: extract-contexts.py output_filename order "
                     "cutoff lowercase\n")
        return 1
    output_filename = argv[1]
    order = int(argv[2])
    cutoff = int(argv[3]) if len(argv) > 3 else 0
    lowercase = parse_flag(argv[4]) if len(argv) > 4 else False

    data = extract_contexts(stdin, order, lowercase)
    write_context_data(output_filename, data, order, cutoff)
    return 0


# --------------------------------------------------------------------------
# extract_contexts_test.py
# --------------------------------------------------------------------------

def extract_test_contexts(lines, vocab, context_ids, contexts_list, order,
                          lowercase=False, unk_term="-UNK-"):
    """Yield, per input line, the list of ``(term id, context id)`` pairs.

    Words missing from *vocab* map to the id of *unk_term* (lower-cased when
    *lowercase* is set).  Unseen contexts are appended to *context_ids* and
    *contexts_list*, which are modified in place.

    Raises ValueError if *unk_term* is not in *vocab*.
    """
    if lowercase:
        unk_term = unk_term.lower()
    if unk_term not in vocab:
        raise ValueError("vocabulary has no %r entry" % unk_term)
    unk_id = vocab[unk_term]

    for line in lines:
        words = padded_words(line, order, lowercase)
        pairs = []
        for centre in range(order, len(words) - order):
            term = vocab.get(words[centre], unk_id)
            context = tuple_to_str(window_context(words, centre, order))
            if context not in context_ids:
                context_ids[context] = len(context_ids)
                contexts_list.append(context)
            pairs.append((term, context_ids[context]))
        yield pairs


def extract_contexts_test_main(argv, stdin=None, stdout=None):
    """Entry point of extract_contexts_test.py; returns the exit status."""
    stdin = sys.stdin if stdin is None else stdin
    stdout = sys.stdout if stdout is None else stdout
    if len(argv) < 5:
        stdout.write("Usage: extract-contexts_test.py output_filename vocab "
                     "contexts order lowercase\n")
        return 1
    output_filename = argv[1]
    vocab = dict((word, i)
                 for i, word in enumerate(read_stripped_lines(argv[2])))
    contexts_list = read_stripped_lines(argv[3])
    context_ids = dict((c, i) for i, c in enumerate(contexts_list))
    order = int(argv[4])
    lowercase = parse_flag(argv[5]) if len(argv) > 5 else False

    with open(output_filename + ".test_data", "w") as output:
        for pairs in extract_test_contexts(stdin, vocab, context_ids,
                                           contexts_list, order, lowercase):
            output.write(" ".join("%d:%d" % pair for pair in pairs) + "\n")

    with open(output_filename + ".test_contexts", "w") as contexts_file:
        for context in contexts_list:
            contexts_file.write("%s\n" % context)
    return 0


# --------------------------------------------------------------------------
# extract_leaves.py
# --------------------------------------------------------------------------

def format_leaves(sentences, token_freq, lexicalise=False, cutoff=100,
                  rm_traces=False):
    """Yield one output line (no newline) per sentence of (token, tag) pairs.

    Without *lexicalise* only the tags are emitted.  With it, ``token|tag``
    is emitted and tokens whose ``token_freq`` count is below *cutoff*
    become ``-UNK-``.  *rm_traces* drops leaves tagged ``-NONE-``.
    """
    for sentence in sentences:
        fields = []
        for token, tag in sentence:
            if rm_traces and tag == "-NONE-":
                continue
            if lexicalise:
                if token_freq[token] < cutoff:
                    token = "-UNK-"
                fields.append("%s|%s" % (token, tag))
            else:
                fields.append("%s" % tag)
        yield " ".join(fields)


def extract_leaves_main(argv, stdin=None, stdout=None):
    """Entry point of extract_leaves.py; returns the exit status."""
    stdin = sys.stdin if stdin is None else stdin
    stdout = sys.stdout if stdout is None else stdout
    usage = "Usage: extract_leaves.py [-lsc]\n"

    lexicalise = False
    rm_traces = False
    cutoff = 100
    length_cutoff = 10000
    try:
        # Long options that take a value need a trailing '='; without it
        # --cutoff / --sentence-length could never receive their argument.
        opts, _args = getopt.getopt(
            argv[1:], "hs:c:l",
            ["help", "lexicalise", "cutoff=", "sentence-length=",
             "remove-traces"])
    except getopt.GetoptError:
        stdout.write(usage)
        return 2
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            stdout.write(usage)
            return 0
        elif opt in ("-l", "--lexicalise"):
            lexicalise = True
        elif opt in ("-c", "--cutoff"):
            cutoff = int(arg)
        elif opt in ("-s", "--sentence-length"):
            length_cutoff = int(arg)
        elif opt == "--remove-traces":
            rm_traces = True

    # Third-party dependency, imported lazily so the other scripts in this
    # module do not require it.
    import nltk
    # NLTK 3 renamed Tree.parse to Tree.fromstring; accept either.
    parse_tree = getattr(nltk.Tree, "fromstring", None) or nltk.Tree.parse

    token_freq = collections.Counter()
    sentences = []
    for line in stdin:
        pos = parse_tree(line).pos()
        if len(pos) <= length_cutoff:
            sentences.append(pos)
            # NOTE(review): counting only the kept sentences is a guess at
            # the original indentation, which the flattened source lost.
            token_freq.update(token for token, _tag in pos)

    for text in format_leaves(sentences, token_freq, lexicalise, cutoff,
                              rm_traces):
        stdout.write(text + "\n")
    return 0


# --------------------------------------------------------------------------
# map-documents.py / map-terms.py
# --------------------------------------------------------------------------

def map_ids(lines, names):
    """Yield each line with the id of every ``id`` / ``id:value`` token
    replaced by ``names[id]``; tokens are re-joined with single spaces."""
    for line in lines:
        fields = []
        for token in line.split():
            elements = token.split(":")
            name = names[int(elements[0])]
            if len(elements) == 1:
                fields.append("%s" % name)
            else:
                fields.append("%s:%s" % (name, elements[1]))
        yield " ".join(fields)


def _map_main(argv, usage, normalise, stdin, stdout):
    """Shared driver for the two id-mapping scripts."""
    stdin = sys.stdin if stdin is None else stdin
    stdout = sys.stdout if stdout is None else stdout
    if len(argv) != 2:
        stdout.write(usage)
        return 1
    names = [normalise(name) for name in read_stripped_lines(argv[1])]
    for text in map_ids(stdin, names):
        stdout.write(text + "\n")
    return 0


def map_documents_main(argv, stdin=None, stdout=None):
    """Entry point of map-documents.py; returns the exit status."""
    return _map_main(argv, "Usage: map-documents.py vocab-file\n",
                     lambda name: name, stdin, stdout)


def map_terms_main(argv, stdin=None, stdout=None):
    """Entry point of map-terms.py (spaces in names become '_')."""
    return _map_main(argv, "Usage: map-terms.py vocab-file\n",
                     lambda name: name.replace(" ", "_"), stdin, stdout)


# --------------------------------------------------------------------------
# Dispatch
# --------------------------------------------------------------------------

ENTRY_POINTS = {
    "contexts2documents.py": contexts2documents_main,
    "extract_contexts.py": extract_contexts_main,
    "extract_contexts_test.py": extract_contexts_test_main,
    "extract_leaves.py": extract_leaves_main,
    "map-documents.py": map_documents_main,
    "map-terms.py": map_terms_main,
}


def main(argv=None):
    """Run the script selected by the program name in ``argv[0]``."""
    argv = sys.argv if argv is None else argv
    entry = ENTRY_POINTS.get(os.path.basename(argv[0]))
    if entry is None:
        sys.stderr.write("Invoke as one of: %s\n"
                         % ", ".join(sorted(ENTRY_POINTS)))
        return 2
    return entry(argv)


if __name__ == "__main__":
    sys.exit(main())
#!/usr/bin/python
"""score-mkcls.py: many-to-one accuracy of a word clustering against gold tags.

Usage: score-mkcls.py gold classes

``gold`` holds sentences of ``word|tag`` tokens; ``classes`` holds one
``word class`` pair per line.  Every class is mapped to the gold tag it
co-occurs with most often.  Each gold token is echoed to stdout as
``word|tag|class|mapped_tag`` and the accuracy is reported on stderr.
Runs under Python 2.7 and Python 3.
"""

import sys
from collections import defaultdict


def dict_max(d):
    """Return the key of *d* with the largest value (first key wins ties).

    Raises ValueError for an empty mapping.  The previous ``assert max_key``
    also rejected perfectly valid falsy keys such as ``0`` or ``""``.
    """
    if not d:
        raise ValueError("dict_max() of an empty mapping")
    return max(d, key=d.get)


def read_classes(lines):
    """Parse ``word class`` lines into a ``{word: class}`` dict.

    Blank lines are skipped; a later entry for the same word overrides an
    earlier one.
    """
    term_to_topics = {}
    for line in lines:
        if not line.strip():
            continue
        term, cls = line.split()
        term_to_topics[term] = cls
    return term_to_topics


def many_to_one_accuracy(correct, pred):
    """Return ``correct / pred`` as a float, or 0.0 when nothing was scored."""
    if not pred:
        return 0.0
    return float(correct) / pred


def score_mkcls(gold_lines, term_to_topics, out):
    """Write the annotated gold corpus to *out*; return ``(correct, total)``.

    *gold_lines* is traversed twice (once to count class/tag co-occurrences,
    once to score), so it is materialised as a list first.  Raises KeyError
    for a gold word that has no class.
    """
    gold_lines = list(gold_lines)

    # Pass 1: how often each class co-occurs with each gold tag.
    topics_to_gold = defaultdict(dict)
    for gold_line in gold_lines:
        for gold_token in gold_line.split():
            gold_term, gold_tag = gold_token.rsplit("|", 1)
            tags = topics_to_gold[term_to_topics[gold_term]]
            tags[gold_tag] = tags.get(gold_tag, 0) + 1

    # The winning tag per class is fixed from here on; compute it once
    # instead of twice per token.
    best_tag = dict((topic, dict_max(tags))
                    for topic, tags in topics_to_gold.items())

    # Pass 2: emit the mapping and count the hits.
    pred = 0
    correct = 0
    for gold_line in gold_lines:
        fields = []
        for gold_token in gold_line.split():
            gold_term, gold_tag = gold_token.rsplit("|", 1)
            pred_token = term_to_topics[gold_term]
            mapped = best_tag[pred_token]
            fields.append("%s|%s|%s" % (gold_token, pred_token, mapped))
            pred += 1
            if gold_tag == mapped:
                correct += 1
        out.write(" ".join(fields) + "\n")
    return correct, pred


def main(argv):
    """Command-line entry point; returns the process exit status."""
    if len(argv) != 3:
        sys.stdout.write("Usage: score-mkcls.py gold classes\n")
        return 1
    with open(argv[2], "r") as classes_file:
        term_to_topics = read_classes(classes_file)
    with open(argv[1], "r") as gold_file:
        gold_lines = gold_file.readlines()

    correct, pred = score_mkcls(gold_lines, term_to_topics, sys.stdout)
    sys.stderr.write("Many-to-One Accuracy = %f\n"
                     % many_to_one_accuracy(correct, pred))
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))
#!/usr/bin/python
"""score-topics.py: many-to-one accuracy of predicted topics against gold tags.

Usage: score-topics.py gold pred

``gold`` holds sentences of ``word|tag`` tokens and ``pred`` holds, line for
line and token for token, the predicted topic of each word.  Every topic is
mapped to the gold tag it co-occurs with most often.  Each token is echoed
to stdout as ``word|tag|topic|mapped_tag`` and the accuracy is reported on
stderr.  Runs under Python 2.7 and Python 3.
"""

import sys
from collections import defaultdict


def dict_max(d):
    """Return the key of *d* with the largest value (first key wins ties).

    Raises ValueError for an empty mapping.  The previous ``assert max_key``
    also rejected perfectly valid falsy keys such as ``0`` or ``""``.
    """
    if not d:
        raise ValueError("dict_max() of an empty mapping")
    return max(d, key=d.get)


def many_to_one_accuracy(correct, pred):
    """Return ``correct / pred`` as a float, or 0.0 when nothing was scored."""
    if not pred:
        return 0.0
    return float(correct) / pred


def align_tokens(gold_lines, pred_lines):
    """Return, per sentence, the list of ``(gold_token, gold_tag, topic)``.

    Only the lines both inputs have in common are aligned.  Raises
    ValueError when a gold line and its prediction line differ in token
    count (previously a bare ``assert``, which ``python -O`` strips).
    """
    sentences = []
    for number, (gold_line, pred_line) in enumerate(
            zip(gold_lines, pred_lines), 1):
        gold_tokens = gold_line.split()
        pred_tokens = pred_line.split()
        if len(gold_tokens) != len(pred_tokens):
            raise ValueError(
                "line %d: %d gold tokens but %d predictions"
                % (number, len(gold_tokens), len(pred_tokens)))
        sentence = []
        for gold_token, pred_token in zip(gold_tokens, pred_tokens):
            gold_tag = gold_token.rsplit("|", 1)[1]
            sentence.append((gold_token, gold_tag, pred_token))
        sentences.append(sentence)
    return sentences


def score_topics(gold_lines, pred_lines, out):
    """Write the annotated gold corpus to *out*; return ``(correct, total)``."""
    sentences = align_tokens(gold_lines, pred_lines)

    # How often each topic co-occurs with each gold tag.
    topics_to_gold = defaultdict(dict)
    for sentence in sentences:
        for _gold_token, gold_tag, pred_token in sentence:
            tags = topics_to_gold[pred_token]
            tags[gold_tag] = tags.get(gold_tag, 0) + 1

    # The winning tag per topic is fixed from here on; compute it once
    # instead of twice per token.
    best_tag = dict((topic, dict_max(tags))
                    for topic, tags in topics_to_gold.items())

    pred = 0
    correct = 0
    for sentence in sentences:
        fields = []
        for gold_token, gold_tag, pred_token in sentence:
            mapped = best_tag[pred_token]
            fields.append("%s|%s|%s" % (gold_token, pred_token, mapped))
            pred += 1
            if gold_tag == mapped:
                correct += 1
        out.write(" ".join(fields) + "\n")
    return correct, pred


def main(argv):
    """Command-line entry point; returns the process exit status."""
    if len(argv) != 3:
        sys.stdout.write("Usage: score-topics.py gold pred\n")
        return 1
    with open(argv[1], "r") as gold_file:
        gold_lines = gold_file.readlines()
    with open(argv[2], "r") as pred_file:
        pred_lines = pred_file.readlines()

    if len(gold_lines) != len(pred_lines):
        # zip() stops at the shorter input; say so instead of silently
        # scoring a truncated corpus.
        sys.stderr.write(
            "Warning: %d gold lines but %d prediction lines; "
            "scoring the common prefix only\n"
            % (len(gold_lines), len(pred_lines)))

    correct, pred = score_topics(gold_lines, pred_lines, sys.stdout)
    sys.stderr.write("Many-to-One Accuracy = %f\n"
                     % many_to_one_accuracy(correct, pred))
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))
#!/usr/bin/python
"""spans2labels.py and topics.py from gi/pyp-topics/scripts.

Reconstruction of two scripts whose sources were flattened together.  Each
is exposed as a ``*_main(argv, ...)`` function returning its exit status,
and ``main()`` dispatches on the name the program was invoked as, so both
original command lines keep working.  Runs under Python 2.7 and Python 3.
"""

import os
import sys


# --------------------------------------------------------------------------
# spans2labels.py
# --------------------------------------------------------------------------

def build_index(lines):
    """Map each stripped line to its 0-based line number (last one wins)."""
    return dict((line.strip(), i) for i, line in enumerate(lines))


def read_phrase_context_labels(lines):
    """Parse the phrase/context label file.

    Line ``i`` describes phrase ``i``: the first field is skipped and every
    further ``context:label`` field sets ``labels[(i, context)] = label``.
    Labels are kept as strings.
    """
    labels = {}
    for i, line in enumerate(lines):
        for field in line.split()[1:]:
            context, label = field.split(":")
            labels[(i, int(context))] = label
    return labels


def label_spans(line, phrase_index, context_index, phrase_context_index):
    """Return the labelled target spans of one extraction line.

    *line* is ``source ||| target ||| s1-s2-t1-t2 ...``.  For every span the
    target phrase ``target[t1:t2]`` and its context (the words immediately
    left and right, ``<s>`` / ``</s>`` at the sentence edges) are looked up
    and reported as ``t1-t2:label``.  The result starts with ``|||``.

    Raises KeyError for an unknown phrase, context or phrase/context pair.
    """
    segments = line.split("|||")
    # Padding shifts every target position right by one.
    target = ["<s>"] + segments[1].split() + ["</s>"]
    fields = ["|||"]
    for span in segments[2].split():
        _s1, _s2, t1, t2 = [int(i) for i in span.split("-")]
        t1 += 1
        t2 += 1
        phrase = " ".join(target[t1:t2])
        context = "%s <PHRASE> %s" % (target[t1 - 1], target[t2])
        pi = phrase_index[phrase]
        ci = context_index[context]
        label = phrase_context_index[(pi, ci)]
        # Report the span in the caller's original, unpadded coordinates.
        fields.append("%s-%s:%s" % (t1 - 1, t2 - 1, label))
    return " ".join(fields)


def spans2labels_main(argv, stdin=None, stdout=None):
    """Entry point of spans2labels.py; returns the exit status."""
    stdin = sys.stdin if stdin is None else stdin
    stdout = sys.stdout if stdout is None else stdout
    if len(argv) != 4:
        stdout.write("Usage: spans2labels.py phrase_index context_index "
                     "phrase_context_index\n")
        return 1
    with open(argv[1], "r") as handle:
        phrase_index = build_index(handle)
    with open(argv[2], "r") as handle:
        context_index = build_index(handle)
    with open(argv[3], "r") as handle:
        phrase_context_index = read_phrase_context_labels(handle)

    for line in stdin:
        stdout.write(label_spans(line, phrase_index, context_index,
                                 phrase_context_index) + "\n")
    return 0


# --------------------------------------------------------------------------
# topics.py
# --------------------------------------------------------------------------

def format_topic(topic_id, line, words_per_topic):
    """Return the report for one topic line of ``term:count`` tokens.

    The report is a ``Topic N:`` header, the *words_per_topic* highest
    counts (ties broken by term, descending) as `` term:count`` rows, and a
    blank line.
    """
    terms = []
    for token in line.split():
        term, count = token.rsplit(":", 1)
        terms.append((int(count), term))
    terms.sort(reverse=True)

    rows = ["Topic %d:\n" % topic_id]
    rows.extend(" %s:%s\n" % (term, count)
                for count, term in terms[:words_per_topic])
    rows.append("\n")
    return "".join(rows)


def topics_main(argv, stdin=None, stdout=None):
    """Entry point of topics.py; returns the exit status."""
    stdin = sys.stdin if stdin is None else stdin
    stdout = sys.stdout if stdout is None else stdout
    if len(argv) != 2:
        stdout.write("Usage: topics.py words-per-topic\n")
        return 1
    words_per_topic = int(argv[1])  # parsed once, not once per topic
    for topic_id, line in enumerate(stdin):
        stdout.write(format_topic(topic_id, line, words_per_topic))
    return 0


# --------------------------------------------------------------------------
# Dispatch
# --------------------------------------------------------------------------

ENTRY_POINTS = {
    "spans2labels.py": spans2labels_main,
    "topics.py": topics_main,
}


def main(argv=None):
    """Run the script selected by the program name in ``argv[0]``."""
    argv = sys.argv if argv is None else argv
    entry = ENTRY_POINTS.get(os.path.basename(argv[0]))
    if entry is None:
        sys.stderr.write("Invoke as one of: %s\n"
                         % ", ".join(sorted(ENTRY_POINTS)))
        return 2
    return entry(argv)


if __name__ == "__main__":
    sys.exit(main())