summary | refs | log | tree | commit | diff
path: root/gi/pyp-topics/scripts
diff options
context:
space:
mode:
author bothameister <bothameister@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-07-13 23:37:29 +0000
committer bothameister <bothameister@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-07-13 23:37:29 +0000
commit c29321deae3bc178e9ea0501f598a40894c6bc98 (patch)
tree f5f84e6554272c580eaef5a1f42643949809093f /gi/pyp-topics/scripts
parent 1975a182d76171fee56faf671bedcbf13b9dc9ba (diff)
added thresholding for span labelling
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@247 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'gi/pyp-topics/scripts')
-rwxr-xr-x gi/pyp-topics/scripts/spans2labels.py 18
1 files changed, 14 insertions, 4 deletions
diff --git a/gi/pyp-topics/scripts/spans2labels.py b/gi/pyp-topics/scripts/spans2labels.py
index 0560af39..f990582e 100755
--- a/gi/pyp-topics/scripts/spans2labels.py
+++ b/gi/pyp-topics/scripts/spans2labels.py
@@ -4,12 +4,16 @@ import sys
from operator import itemgetter
if len(sys.argv) <= 2:
- print "Usage: spans2labels.py phrase_context_index [order]"
+ print "Usage: spans2labels.py phrase_context_index [order] [threshold]"
exit(1)
order=1
+threshold = 0
+cutoff_cat = "<UNK>"
if len(sys.argv) > 2:
order = int(sys.argv[2])
+if len(sys.argv) > 3:
+ threshold = float(sys.argv[3])
phrase_context_index = {}
for line in file(sys.argv[1], 'r'):
@@ -24,9 +28,15 @@ for line in file(sys.argv[1], 'r'):
if len(contexts) == 1: continue
assert len(contexts) % 2 == 0
for i in range(0, len(contexts), 2):
- category = contexts[i+1].split("=")[1].strip()
- phrase_context_index[(phrase,contexts[i])] = category
- #print (phrase,contexts[i]), category
+ #parse contexts[i+1] = " C=1 P=0.8 "
+ features = contexts[i+1].split()
+ category = features[0].split("=")[1].strip()
+ prob = float(features[1].split("=")[1].strip())
+ if prob >= threshold:
+ phrase_context_index[(phrase,contexts[i])] = category
+ else:
+ phrase_context_index[(phrase,contexts[i])] = cutoff_cat
+# print (phrase,contexts[i]), category, prob
for line in sys.stdin:
line_segments = line.split('|||')