#!/usr/bin/python
# Provenance: gi/morf-segmentation/morfsegment.py, added in ws10smt svn
# trunk@382 by bothameister (2010-07-23, "Adding morphology-segmentation stuff").
"""Replace words in a text stream by their precomputed segmentation.

usage: morfsegment.py inputvocab.gz segmentation.ready [marker]
 stdin: the data to segment
 stdout: the segmented data

The two input files are read in lockstep, one line each per word:

* inputvocab.gz (gzip): two space-separated fields per line with the word
  in the second field, e.g. ``1 verwonder``.
* segmentation.ready (plain text): the segmentation of that word, e.g.
  ``ver# #wonder``.

The optional ``marker`` argument is accepted for command-line
compatibility but is not used: segmentations are inserted verbatim.
"""

import sys
import gzip

USAGE = (
    "usage: morfsegment.py inputvocab.gz segmentation.ready [marker]\n"
    " stdin: the data to segment\n"
    " stdout: the segmented data\n"
)


def _as_text(raw):
    """Return *raw* as a native string.

    gzip files opened in binary mode yield ``bytes`` lines on Python 3;
    those are decoded (assumed UTF-8 -- confirm against the data). On
    Python 2 ``bytes`` is ``str`` and the value is returned untouched.
    """
    if isinstance(raw, bytes) and not isinstance(raw, str):
        return raw.decode("utf-8")
    return raw


def _load_split_index(word_vocab, seg_vocab):
    """Build the word -> segmentation map from two line-aligned files.

    Only words whose segmentation differs from the word itself are stored,
    so unsegmented words cost no memory and fall through at lookup time.

    Raises:
        ValueError: if a vocabulary line does not have exactly two
            space-separated fields (this includes the vocabulary file
            running out of lines before the segmentation file does).
    """
    split_index = {}
    for lineno, seg in enumerate(seg_vocab, 1):
        # seg      = ver# #wonder\n
        # wordline = 1 verwonder\n
        fields = _as_text(word_vocab.readline()).strip().split(' ')
        if len(fields) != 2:
            # Explicit check instead of `assert`, which vanishes under -O.
            raise ValueError(
                "vocabulary line %d: expected 2 space-separated fields, "
                "got %r" % (lineno, fields)
            )
        word = fields[1]
        seg = _as_text(seg).strip()
        if seg != word:
            split_index[word] = seg
    return split_index


def main(argv=None, stdin=None, stdout=None):
    """Run the segmenter and return a process exit status.

    Args:
        argv: argument vector including the program name; defaults to
            ``sys.argv``.
        stdin: iterable of input lines; defaults to ``sys.stdin``.
        stdout: object with a ``write`` method; defaults to ``sys.stdout``.

    Returns:
        0 on success, 1 when too few arguments were given.
    """
    argv = sys.argv if argv is None else argv
    stdin = sys.stdin if stdin is None else stdin
    stdout = sys.stdout if stdout is None else stdout

    if len(argv) < 3:
        # Usage goes to stderr with a failing status so that a calling
        # pipeline cannot mistake it for segmented data.
        sys.stderr.write(USAGE)
        return 1

    # argv[3] (marker) is intentionally ignored; see the module docstring.
    with gzip.open(argv[1], 'rb') as word_vocab:  # inputvocab.gz
        with open(argv[2], 'r') as seg_vocab:  # segmentation.ready
            split_index = _load_split_index(word_vocab, seg_vocab)

    for line in stdin:
        words = line.strip().split()
        # Words without an entry are emitted unchanged.
        stdout.write(' '.join(split_index.get(w, w) for w in words) + '\n')
    return 0


if __name__ == "__main__":
    sys.exit(main())