diff options
author | Chris Dyer <cdyer@cs.cmu.edu> | 2012-10-11 14:06:32 -0400 |
---|---|---|
committer | Chris Dyer <cdyer@cs.cmu.edu> | 2012-10-11 14:06:32 -0400 |
commit | 07ea7b64b6f85e5798a8068453ed9fd2b97396db (patch) | |
tree | 644496a1690d84d82a396bbc1e39160788beb2cd /gi/pyp-topics/scripts/extract_contexts_test.py | |
parent | 37b9e45e5cb29d708f7249dbe0b0fb27685282a0 (diff) | |
parent | a36fcc5d55c1de84ae68c1091ebff2b1c32dc3b7 (diff) |
Merge branch 'master' of https://github.com/redpony/cdec
Diffstat (limited to 'gi/pyp-topics/scripts/extract_contexts_test.py')
-rwxr-xr-x | gi/pyp-topics/scripts/extract_contexts_test.py | 72 |
1 files changed, 0 insertions, 72 deletions
```diff
diff --git a/gi/pyp-topics/scripts/extract_contexts_test.py b/gi/pyp-topics/scripts/extract_contexts_test.py
deleted file mode 100755
index 693b6e0b..00000000
--- a/gi/pyp-topics/scripts/extract_contexts_test.py
+++ /dev/null
@@ -1,72 +0,0 @@
-#!/usr/bin/python
-
-import sys,collections
-
-def tuple_to_str(t):
-  s=""
-  for i,x in enumerate(t):
-    if i > 0: s += "|"
-    s += str(x)
-  return s
-
-if len(sys.argv) < 5:
-  print "Usage: extract-contexts_test.py output_filename vocab contexts order lowercase"
-  exit(1)
-
-output_filename = sys.argv[1]
-output = open(output_filename+".test_data",'w')
-
-unk_term="-UNK-"
-vocab_dict={}
-for i,x in enumerate(file(sys.argv[2], 'r').readlines()):
-  vocab_dict[x.strip()]=i
-
-contexts_dict={}
-contexts_list=[]
-for i,x in enumerate(file(sys.argv[3], 'r').readlines()):
-  contexts_dict[x.strip()]=i
-  contexts_list.append(x.strip())
-
-order = int(sys.argv[4])
-
-lowercase = False
-if len(sys.argv) > 5:
-  lowercase = bool(sys.argv[5])
-if lowercase: unk_term = unk_term.lower()
-
-prefix = ["<s%d>|<s>"%i for i in range(order)]
-suffix = ["</s%d>|</s>"%i for i in range(order)]
-
-assert unk_term in vocab_dict
-for line in sys.stdin:
-  tokens = list(prefix)
-  tokens.extend(line.split())
-  tokens.extend(suffix)
-  if lowercase:
-    tokens = map(lambda x: x.lower(), tokens)
-
-  for i in range(order, len(tokens)-order):
-    context_list=[]
-    term=""
-    for j in range(i-order, i+order+1):
-      token,tag = tokens[j].rsplit('|',2)
-      if j != i:
-        context_list.append(token)
-      else:
-        if token not in vocab_dict:
-          term = vocab_dict[unk_term]
-        else:
-          term = vocab_dict[token]
-    context = tuple_to_str(context_list)
-    if context not in contexts_dict:
-      contexts_dict[context] = len(contexts_dict)
-      contexts_list.append(context)
-    context_index = contexts_dict[context]
-    print >>output, "%d:%d" % (term,context_index),
-  print >>output
-output.close()
-
-contexts_file = open(output_filename+".test_contexts",'w')
-for c in contexts_list:
-  print >>contexts_file, c
-contexts_file.close()
```