From c57c05d19fb306f7f50cc02516a8a2901c920cca Mon Sep 17 00:00:00 2001
From: bothameister <bothameister@ec762483-ff6d-05da-a07a-a48fb63a330f>
Date: Fri, 23 Jul 2010 18:03:47 +0000
Subject: Adding morphology-segmentation stuff. Changes include:
 local-gi-pipeline (--morf arg), eval-pipeline (--oov-grammar, --lmorder)

git-svn-id: https://ws10smt.googlecode.com/svn/trunk@382 ec762483-ff6d-05da-a07a-a48fb63a330f
---
 gi/morf-segmentation/morfsegment.py | 50 +++++++++++++++++++++++++++++++++++++
 1 file changed, 50 insertions(+)
 create mode 100755 gi/morf-segmentation/morfsegment.py

(limited to 'gi/morf-segmentation/morfsegment.py')

diff --git a/gi/morf-segmentation/morfsegment.py b/gi/morf-segmentation/morfsegment.py
new file mode 100755
index 00000000..e5597c0b
--- /dev/null
+++ b/gi/morf-segmentation/morfsegment.py
@@ -0,0 +1,50 @@
+#!/usr/bin/python
+
+import sys
+import gzip
+
+#usage: morfsegment.py inputvocab.gz segmentation.ready
+#  stdin: the data to segment
+#  stdout: the segmented data
+
+if len(sys.argv) < 3:
+  print "usage: morfsegment.py inputvocab.gz segmentation.ready [marker]"
+  print "  stdin: the data to segment"
+  print "  stdout: the segmented data"
+  sys.exit()
+
+#read index:
+split_index={}
+
+marker="#"
+
+if len(sys.argv) > 3:
+  marker=sys.argv[3]
+
+word_vocab=gzip.open(sys.argv[1], 'rb') #inputvocab.gz
+seg_vocab=open(sys.argv[2], 'r') #segm.ready..
+
+for seg in seg_vocab:
+  #seg = ver# #wonder\n
+  #wordline = 1 verwonder\n
+  word = word_vocab.readline().strip().split(' ')
+  assert(len(word) == 2)
+  word = word[1]
+  seg=seg.strip()
+
+  if seg != word:
+    split_index[word] = seg
+
+word_vocab.close()
+seg_vocab.close()
+
+for line in sys.stdin:
+  words = line.strip().split()
+
+  newsent = []
+  for word in words:
+    splitword = split_index.get(word, word)
+    newsent.append(splitword)
+
+  print ' '.join(newsent)
+
-- 
cgit v1.2.3