summary | refs | log | tree | commit | diff
path: root/gi/pyp-topics/scripts/contexts2documents.py
diff options
context:
space:
mode:
author: philblunsom@gmail.com <philblunsom@gmail.com@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-06-22 20:34:00 +0000
committer: philblunsom@gmail.com <philblunsom@gmail.com@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-06-22 20:34:00 +0000
commit: 2f2ba42a1453f4a3a08f9c1ecfc53c1b1c83d550 (patch)
tree: 646e81b6325280f64a72771b5eeadf5118e465a9 /gi/pyp-topics/scripts/contexts2documents.py
parent: 2f2e36ca3060e7e9853c3d611f6cc5e112a76ddd (diff)
Initial ci of gi dir
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@5 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'gi/pyp-topics/scripts/contexts2documents.py')
-rwxr-xr-x gi/pyp-topics/scripts/contexts2documents.py 29
1 files changed, 29 insertions, 0 deletions
diff --git a/gi/pyp-topics/scripts/contexts2documents.py b/gi/pyp-topics/scripts/contexts2documents.py
new file mode 100755
index 00000000..c625d17d
--- /dev/null
+++ b/gi/pyp-topics/scripts/contexts2documents.py
@@ -0,0 +1,50 @@
+#!/usr/bin/python
+"""Convert phrase/context count lines on stdin into indexed documents.
+
+Each input line is expected to look like
+
+    phrase<TAB>context ||| key=count ||| context ||| key=count ...
+
+For every input line one line is written to stdout: the number of
+contexts, followed by space-separated ``index:count`` pairs, where
+``index`` is the 0-based id given to a context string the first time it
+is seen.  If a file name is passed as the only argument, the distinct
+context strings are written to it, one per line, in index order.
+"""
+
+import sys
+from operator import itemgetter
+
+if len(sys.argv) > 2:
+    # Diagnostics go to stderr so they never mix with the document output.
+    sys.stderr.write("Usage: contexts2documents.py [contexts_index_out]\n")
+    sys.exit(1)
+
+context_index = {}  # context string -> id, assigned in first-seen order
+for line in sys.stdin:
+    if not line.strip():
+        continue  # tolerate blank lines instead of failing on the unpack
+
+    # Split on the first tab only; the phrase itself is not used.
+    _phrase, tab, line_tail = line.partition('\t')
+    if not tab:
+        raise ValueError("expected 'phrase<TAB>contexts', got: %r" % line)
+
+    # Fields alternate: even positions are contexts, odd are 'key=<count>'.
+    raw_contexts = line_tail.split('|||')
+    contexts = [c.strip() for c in raw_contexts[0::2]]
+    counts = [int(c.split('=')[1].strip()) for c in raw_contexts[1::2]]
+
+    fields = [str(len(contexts))]
+    for context, count in zip(contexts, counts):
+        c = context_index.setdefault(context, len(context_index))
+        fields.append("%d:%d" % (c, count))
+    sys.stdout.write(" ".join(fields) + "\n")
+
+if len(sys.argv) == 2:
+    # sorted() accepts any iterable, so items() may be a list or a view.
+    contexts = sorted(context_index.items(), key=itemgetter(1))
+    # 'with' closes the index file even if a write fails.
+    with open(sys.argv[1], 'w') as contexts_out:
+        for context in contexts:
+            contexts_out.write(context[0] + "\n")