summaryrefslogtreecommitdiff
path: root/gi/morf-segmentation/morfsegment.py
diff options
context:
space:
mode:
authorKenneth Heafield <github@kheafield.com>2012-10-22 12:07:20 +0100
committerKenneth Heafield <github@kheafield.com>2012-10-22 12:07:20 +0100
commit5f98fe5c4f2a2090eeb9d30c030305a70a8347d1 (patch)
tree9b6002f850e6dea1e3400c6b19bb31a9cdf3067f /gi/morf-segmentation/morfsegment.py
parentcf9994131993b40be62e90e213b1e11e6b550143 (diff)
parent21825a09d97c2e0afd20512f306fb25fed55e529 (diff)
Merge remote branch 'upstream/master'
Conflicts: Jamroot bjam decoder/Jamfile decoder/cdec.cc dpmert/Jamfile jam-files/sanity.jam klm/lm/Jamfile klm/util/Jamfile mira/Jamfile
Diffstat (limited to 'gi/morf-segmentation/morfsegment.py')
-rwxr-xr-xgi/morf-segmentation/morfsegment.py50
1 files changed, 0 insertions, 50 deletions
diff --git a/gi/morf-segmentation/morfsegment.py b/gi/morf-segmentation/morfsegment.py
deleted file mode 100755
index 85b9d4fb..00000000
--- a/gi/morf-segmentation/morfsegment.py
+++ /dev/null
@@ -1,50 +0,0 @@
-#!/usr/bin/python
-
-import sys
-import gzip
-
-#usage: morfsegment.py inputvocab.gz segmentation.ready
-# stdin: the data to segment
-# stdout: the segmented data
-
-if len(sys.argv) < 3:
- print "usage: morfsegment.py inputvocab.gz segmentation.ready [marker]"
- print " stdin: the data to segment"
- print " stdout: the segmented data"
- sys.exit()
-
-#read index:
-split_index={}
-
-marker="##"
-
-if len(sys.argv) > 3:
- marker=sys.argv[3]
-
-word_vocab=gzip.open(sys.argv[1], 'rb') #inputvocab.gz
-seg_vocab=open(sys.argv[2], 'r') #segm.ready..
-
-for seg in seg_vocab:
- #seg = ver# #wonder\n
- #wordline = 1 verwonder\n
- word = word_vocab.readline().strip().split(' ')
- assert(len(word) == 2)
- word = word[1]
- seg=seg.strip()
-
- if seg != word:
- split_index[word] = seg
-
-word_vocab.close()
-seg_vocab.close()
-
-for line in sys.stdin:
- words = line.strip().split()
-
- newsent = []
- for word in words:
- splitword = split_index.get(word, word)
- newsent.append(splitword)
-
- print ' '.join(newsent)
-