summary | refs | log | tree | commit | diff
path: root/gi/morf-segmentation/morfsegment.py
diff options
context:
space:
mode:
Diffstat (limited to 'gi/morf-segmentation/morfsegment.py')
-rwxr-xr-x gi/morf-segmentation/morfsegment.py 50
1 files changed, 50 insertions, 0 deletions
diff --git a/gi/morf-segmentation/morfsegment.py b/gi/morf-segmentation/morfsegment.py
new file mode 100755
index 00000000..e5597c0b
--- /dev/null
+++ b/gi/morf-segmentation/morfsegment.py
@@ -0,0 +1,50 @@
+#!/usr/bin/python
+
+import sys
+import gzip
+
+#usage: morfsegment.py inputvocab.gz segmentation.ready [marker]
+# stdin: the data to segment
+# stdout: the segmented data
+
+if len(sys.argv) < 3:
+ print "usage: morfsegment.py inputvocab.gz segmentation.ready [marker]"
+ print " stdin: the data to segment"
+ print " stdout: the segmented data"
+ sys.exit()
+
+#read index:
+split_index={}
+
+marker="#"
+
+if len(sys.argv) > 3:
+ marker=sys.argv[3]
+
+word_vocab=gzip.open(sys.argv[1], 'rb') #inputvocab.gz
+seg_vocab=open(sys.argv[2], 'r') #segm.ready..
+
+for seg in seg_vocab:
+ #seg = ver# #wonder\n
+ #wordline = 1 verwonder\n
+ word = word_vocab.readline().strip().split(' ')
+ assert(len(word) == 2)
+ word = word[1]
+ seg=seg.strip()
+
+ if seg != word:
+ split_index[word] = seg
+
+word_vocab.close()
+seg_vocab.close()
+
+for line in sys.stdin:
+ words = line.strip().split()
+
+ newsent = []
+ for word in words:
+ splitword = split_index.get(word, word)
+ newsent.append(splitword)
+
+ print ' '.join(newsent)
+