summary refs log tree commit diff
path: root/gi/pyp-topics
diff options
context:
space:
mode:
author redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-06-23 22:07:34 +0000
committer redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-06-23 22:07:34 +0000
commit 8cfda7b7677801f30ef15e319f6ac49847a5a6c9 (patch)
tree 78e7339506c965a080bf361ff8bbf9452374c606 /gi/pyp-topics
parent 7c26e270a555d524c4e6eebf572e115213ed2695 (diff)
very simple local grammar induction pipeline
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@16 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'gi/pyp-topics')
-rwxr-xr-x gi/pyp-topics/scripts/contexts2documents.py 16
-rwxr-xr-x gi/pyp-topics/scripts/spans2labels.py 2
2 files changed, 13 insertions, 5 deletions
diff --git a/gi/pyp-topics/scripts/contexts2documents.py b/gi/pyp-topics/scripts/contexts2documents.py
index c625d17d..9be4ebbb 100755
--- a/gi/pyp-topics/scripts/contexts2documents.py
+++ b/gi/pyp-topics/scripts/contexts2documents.py
@@ -3,27 +3,35 @@
import sys
from operator import itemgetter
-if len(sys.argv) > 2:
- print "Usage: contexts2documents.py [contexts_index_out]"
+if len(sys.argv) > 3:
+ print "Usage: contexts2documents.py [contexts_index_out] [phrases_index_out]"
exit(1)
context_index = {}
+phrase_index = {}
for line in sys.stdin:
phrase, line_tail = line.split('\t')
raw_contexts = line_tail.split('|||')
contexts = [c.strip() for x,c in enumerate(raw_contexts) if x%2 == 0]
counts = [int(c.split('=')[1].strip()) for x,c in enumerate(raw_contexts) if x%2 != 0]
-
+ phrase_index.setdefault(phrase, len(phrase_index))
print len(contexts),
for context,count in zip(contexts,counts):
c = context_index.setdefault(context, len(context_index))
print "%d:%d" % (c,count),
print
-if len(sys.argv) == 2:
+if 1 < len(sys.argv) < 4:
contexts_out = open(sys.argv[1],'w')
contexts = context_index.items()
contexts.sort(key = itemgetter(1))
for context in contexts:
print >>contexts_out, context[0]
contexts_out.close()
+if len(sys.argv) == 3:
+ phrases_out = open(sys.argv[2],'w')
+ phrases = phrase_index.items()
+ phrases.sort(key = itemgetter(1))
+ for phrase in phrases:
+ print >>phrases_out, phrase[0]
+ phrases_out.close()
diff --git a/gi/pyp-topics/scripts/spans2labels.py b/gi/pyp-topics/scripts/spans2labels.py
index b523e191..409fda92 100755
--- a/gi/pyp-topics/scripts/spans2labels.py
+++ b/gi/pyp-topics/scripts/spans2labels.py
@@ -40,7 +40,7 @@ for line in sys.stdin:
pi = phrase_index[phrase]
ci = context_index[context]
label = phrase_context_index[(pi,ci)]
- print "%s-%s:%s" % (t1-1,t2-1,label),
+ print "%s-%s:X%s" % (t1-1,t2-1,label),
# print phrase, pi, context, ci
# print phrase_context_index[(pi,ci)]
print