diff options
author | philblunsom@gmail.com <philblunsom@gmail.com@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-06-22 20:34:00 +0000 |
---|---|---|
committer | philblunsom@gmail.com <philblunsom@gmail.com@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-06-22 20:34:00 +0000 |
commit | 2f2ba42a1453f4a3a08f9c1ecfc53c1b1c83d550 (patch) | |
tree | 646e81b6325280f64a72771b5eeadf5118e465a9 /gi/pyp-topics/scripts/extract_contexts_test.py | |
parent | 2f2e36ca3060e7e9853c3d611f6cc5e112a76ddd (diff) |
Initial ci of gi dir
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@5 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'gi/pyp-topics/scripts/extract_contexts_test.py')
-rwxr-xr-x | gi/pyp-topics/scripts/extract_contexts_test.py | 72 |
1 files changed, 72 insertions, 0 deletions
#!/usr/bin/python
# Source: gi/pyp-topics/scripts/extract_contexts_test.py (new file, mode 100755)
"""Convert tagged test sentences into "term:context" index pairs.

Usage:
    extract-contexts_test.py output_filename vocab contexts order [lowercase]

Reads whitespace-separated ``token|tag`` items from stdin, one sentence per
line.  For every token it looks up the token's index in the vocab file (line
number, falling back to the ``-UNK-`` entry) and the index of its context:
the ``order`` tokens on each side, joined with ``|``.  Contexts not present
in the contexts file are appended to the in-memory list.

Writes two files:
    <output_filename>.test_data      one line per input line holding
                                     space-separated "term:context" pairs
    <output_filename>.test_contexts  every context, one per line, including
                                     the ones discovered while reading stdin
"""

import sys
import collections

UNK_TERM = "-UNK-"

# Spellings of the optional "lowercase" argument that switch lowercasing off.
FALSE_STRINGS = ("", "0", "false", "no", "off")


def tuple_to_str(t):
    """Return the items of *t* converted with str() and joined by '|'."""
    return "|".join(str(x) for x in t)


def parse_bool(text):
    """Interpret a command-line flag string as a boolean.

    bool() on a string is True for anything non-empty, so "0" and "False"
    would enable the flag.  Here the strings in FALSE_STRINGS (compared
    case-insensitively, ignoring surrounding whitespace) mean False and
    every other value means True.
    """
    return text.strip().lower() not in FALSE_STRINGS


def read_index(path):
    """Load *path* and return ``(index, entries)``.

    ``entries`` is the list of stripped lines in file order; ``index`` maps
    each stripped line to its 0-based line number (the last occurrence wins
    when a line is repeated).
    """
    index = {}
    entries = []
    with open(path, 'r') as handle:
        for line_number, line in enumerate(handle):
            entry = line.strip()
            index[entry] = line_number
            entries.append(entry)
    return index, entries


def split_token(item):
    """Return the token part of a ``token|tag`` item.

    Only the last '|' separates token from tag, so tokens that themselves
    contain '|' are kept intact.

    Raises:
        ValueError: if *item* contains no '|' at all.
    """
    parts = item.rsplit('|', 1)
    if len(parts) != 2:
        raise ValueError("expected 'token|tag' but got %r" % (item,))
    return parts[0]


def encode_line(line, order, vocab_dict, contexts_dict, contexts_list,
                unk_term=UNK_TERM, lowercase=False):
    """Encode one input line as a list of "term:context" strings.

    The line is padded with *order* start and end markers.  Unseen contexts
    are added to *contexts_dict* and *contexts_list* in place.

    Args:
        line: whitespace-separated ``token|tag`` items.
        order: number of context tokens taken on each side of a token.
        vocab_dict: maps token -> vocabulary index; must contain *unk_term*
            if any token of the line is out of vocabulary.
        contexts_dict: maps context string -> context index (mutated).
        contexts_list: context strings in index order (mutated).
        unk_term: vocabulary entry used for out-of-vocabulary tokens.
        lowercase: lowercase every item before lookup.

    Returns:
        A list with one "term:context" string per token of *line*.

    Raises:
        ValueError: if an item of *line* has no '|' separator.
    """
    items = ["<s%d>|<s>" % i for i in range(order)]
    items.extend(line.split())
    items.extend("</s%d>|</s>" % i for i in range(order))
    if lowercase:
        items = [item.lower() for item in items]

    # Split once per item instead of once per window position.
    words = [split_token(item) for item in items]

    pairs = []
    for i in range(order, len(words) - order):
        word = words[i]
        term = vocab_dict[word] if word in vocab_dict else vocab_dict[unk_term]
        context = tuple_to_str(words[i - order:i] + words[i + 1:i + order + 1])
        if context not in contexts_dict:
            # Use the list length so the index always equals the row the
            # context occupies in the written contexts file, even when the
            # loaded contexts file contained duplicate lines.
            contexts_dict[context] = len(contexts_list)
            contexts_list.append(context)
        pairs.append("%d:%d" % (term, contexts_dict[context]))
    return pairs


def main(argv=None):
    """Run the extraction; return the process exit status."""
    if argv is None:
        argv = sys.argv
    if len(argv) < 5:
        print("Usage: extract-contexts_test.py output_filename vocab contexts order lowercase")
        return 1

    output_filename = argv[1]
    vocab_dict, _ = read_index(argv[2])
    contexts_dict, contexts_list = read_index(argv[3])
    order = int(argv[4])
    lowercase = len(argv) > 5 and parse_bool(argv[5])
    unk_term = UNK_TERM.lower() if lowercase else UNK_TERM

    # Explicit check instead of `assert`, which is stripped under -O.
    if unk_term not in vocab_dict:
        sys.stderr.write("Error: unknown-word term %r is missing from vocab file %s\n"
                         % (unk_term, argv[2]))
        return 1

    with open(output_filename + ".test_data", 'w') as output:
        for line in sys.stdin:
            pairs = encode_line(line, order, vocab_dict, contexts_dict,
                                contexts_list, unk_term, lowercase)
            output.write(" ".join(pairs) + "\n")

    with open(output_filename + ".test_contexts", 'w') as contexts_file:
        for context in contexts_list:
            contexts_file.write(context + "\n")
    return 0


if __name__ == "__main__":
    sys.exit(main())