diff options
author | redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-06-23 22:07:34 +0000 |
---|---|---|
committer | redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-06-23 22:07:34 +0000 |
commit | 5ed01d87524dc4471e4fe601e528b2753f0038b6 (patch) | |
tree | 509ffc2d2691d9e26bfab40590933337e1870f19 /gi/pyp-topics/scripts | |
parent | cf2f68eca737c60f2490d81ea0fde9ef714123c3 (diff) |
very simple local grammar induction pipeline
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@16 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'gi/pyp-topics/scripts')
-rwxr-xr-x | gi/pyp-topics/scripts/contexts2documents.py | 16 | ||||
-rwxr-xr-x | gi/pyp-topics/scripts/spans2labels.py | 2 |
2 files changed, 13 insertions, 5 deletions
```diff
diff --git a/gi/pyp-topics/scripts/contexts2documents.py b/gi/pyp-topics/scripts/contexts2documents.py
index c625d17d..9be4ebbb 100755
--- a/gi/pyp-topics/scripts/contexts2documents.py
+++ b/gi/pyp-topics/scripts/contexts2documents.py
@@ -3,27 +3,35 @@
 import sys
 from operator import itemgetter
 
-if len(sys.argv) > 2:
-  print "Usage: contexts2documents.py [contexts_index_out]"
+if len(sys.argv) > 3:
+  print "Usage: contexts2documents.py [contexts_index_out] [phrases_index_out]"
   exit(1)
 
 context_index = {}
+phrase_index = {}
 for line in sys.stdin:
   phrase, line_tail = line.split('\t')
 
   raw_contexts = line_tail.split('|||')
   contexts = [c.strip() for x,c in enumerate(raw_contexts) if x%2 == 0]
   counts = [int(c.split('=')[1].strip()) for x,c in enumerate(raw_contexts) if x%2 != 0]
-
+  phrase_index.setdefault(phrase, len(phrase_index))
   print len(contexts),
   for context,count in zip(contexts,counts):
     c = context_index.setdefault(context, len(context_index))
     print "%d:%d" % (c,count),
   print
-if len(sys.argv) == 2:
+if 1 < len(sys.argv) < 4:
   contexts_out = open(sys.argv[1],'w')
   contexts = context_index.items()
   contexts.sort(key = itemgetter(1))
   for context in contexts:
     print >>contexts_out, context[0]
   contexts_out.close()
+if len(sys.argv) == 3:
+  phrases_out = open(sys.argv[2],'w')
+  phrases = phrase_index.items()
+  phrases.sort(key = itemgetter(1))
+  for phrase in phrases:
+    print >>phrases_out, phrase[0]
+  phrases_out.close()
diff --git a/gi/pyp-topics/scripts/spans2labels.py b/gi/pyp-topics/scripts/spans2labels.py
index b523e191..409fda92 100755
--- a/gi/pyp-topics/scripts/spans2labels.py
+++ b/gi/pyp-topics/scripts/spans2labels.py
@@ -40,7 +40,7 @@ for line in sys.stdin:
     pi = phrase_index[phrase]
     ci = context_index[context]
     label = phrase_context_index[(pi,ci)]
-    print "%s-%s:%s" % (t1-1,t2-1,label),
+    print "%s-%s:X%s" % (t1-1,t2-1,label),
 #    print phrase, pi, context, ci
 #    print phrase_context_index[(pi,ci)]
   print
```