diff options
author | Kenneth Heafield <github@kheafield.com> | 2012-10-22 12:07:20 +0100 |
---|---|---|
committer | Kenneth Heafield <github@kheafield.com> | 2012-10-22 12:07:20 +0100 |
commit | 5f98fe5c4f2a2090eeb9d30c030305a70a8347d1 (patch) | |
tree | 9b6002f850e6dea1e3400c6b19bb31a9cdf3067f /gi/morf-segmentation/morfsegment.py | |
parent | cf9994131993b40be62e90e213b1e11e6b550143 (diff) | |
parent | 21825a09d97c2e0afd20512f306fb25fed55e529 (diff) |
Merge remote branch 'upstream/master'
Conflicts:
Jamroot
bjam
decoder/Jamfile
decoder/cdec.cc
dpmert/Jamfile
jam-files/sanity.jam
klm/lm/Jamfile
klm/util/Jamfile
mira/Jamfile
Diffstat (limited to 'gi/morf-segmentation/morfsegment.py')
-rwxr-xr-x | gi/morf-segmentation/morfsegment.py | 50 |
1 files changed, 0 insertions, 50 deletions
diff --git a/gi/morf-segmentation/morfsegment.py b/gi/morf-segmentation/morfsegment.py deleted file mode 100755 index 85b9d4fb..00000000 --- a/gi/morf-segmentation/morfsegment.py +++ /dev/null @@ -1,50 +0,0 @@ -#!/usr/bin/python - -import sys -import gzip - -#usage: morfsegment.py inputvocab.gz segmentation.ready -# stdin: the data to segment -# stdout: the segmented data - -if len(sys.argv) < 3: - print "usage: morfsegment.py inputvocab.gz segmentation.ready [marker]" - print " stdin: the data to segment" - print " stdout: the segmented data" - sys.exit() - -#read index: -split_index={} - -marker="##" - -if len(sys.argv) > 3: - marker=sys.argv[3] - -word_vocab=gzip.open(sys.argv[1], 'rb') #inputvocab.gz -seg_vocab=open(sys.argv[2], 'r') #segm.ready.. - -for seg in seg_vocab: - #seg = ver# #wonder\n - #wordline = 1 verwonder\n - word = word_vocab.readline().strip().split(' ') - assert(len(word) == 2) - word = word[1] - seg=seg.strip() - - if seg != word: - split_index[word] = seg - -word_vocab.close() -seg_vocab.close() - -for line in sys.stdin: - words = line.strip().split() - - newsent = [] - for word in words: - splitword = split_index.get(word, word) - newsent.append(splitword) - - print ' '.join(newsent) - |