diff options
author | bothameister <bothameister@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-07-23 18:03:47 +0000 |
---|---|---|
committer | bothameister <bothameister@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-07-23 18:03:47 +0000 |
commit | c57c05d19fb306f7f50cc02516a8a2901c920cca (patch) | |
tree | 1120643e63ea2b46d6a3bc0b338fb225682c9dd7 /gi/morf-segmentation/linestripper.py | |
parent | 58681ee5816d13c04002ca8aebe23c2768da4e5b (diff) |
Adding morphology-segmentation stuff. Changes include: local-gi-pipeline (--morf arg), eval-pipeline (--oov-grammar, --lmorder)
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@382 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'gi/morf-segmentation/linestripper.py')
-rwxr-xr-x | gi/morf-segmentation/linestripper.py | 40 |
1 files changed, 40 insertions, 0 deletions
#!/usr/bin/python
"""Filter two line-aligned files down to the pairs that fit a word limit.

Usage: linestripper file1 file2 maxlen [numlines]

The two files are read in lockstep.  A pair of lines is kept only when
both lines contain at most ``maxlen`` whitespace-separated tokens.  Kept
lines from file1 go to stdout and the matching lines from file2 go to
stderr, so the two output streams stay line-aligned.  When ``numlines``
is given and positive, processing stops after that many pairs are kept.
"""

import sys


def filter_parallel_lines(lines1, lines2, maxlen, numlines=0):
    """Yield the (line1, line2) pairs whose token counts are within maxlen.

    lines1, lines2 -- iterables of lines, consumed in lockstep.
    maxlen         -- maximum number of whitespace-separated tokens
                      allowed on each side of a pair.
    numlines       -- stop after this many kept pairs; 0 (or any
                      non-positive value) means no limit.

    Iteration ends as soon as either input runs out.  Stopping when the
    second input is exhausted keeps the outputs aligned: otherwise the
    remaining lines of the first input would be paired with nothing.
    """
    kept = 0
    second = iter(lines2)
    for line1 in lines1:
        line2 = next(second, None)
        if line2 is None:
            # Second input is shorter than the first; no partner line.
            break

        # split() with no argument already ignores leading/trailing
        # whitespace, including the line terminator.
        if len(line1.split()) <= maxlen and len(line2.split()) <= maxlen:
            kept += 1
            yield line1, line2
            if numlines > 0 and kept >= numlines:
                break


def main(argv):
    """Run the command-line tool; return the process exit status.

    argv -- full argument vector, argv[0] being the program name.

    Returns 1 after printing usage when a required argument is missing,
    0 otherwise.  Raises ValueError if maxlen or numlines is not an
    integer, and IOError/OSError if an input file cannot be opened.
    """
    # file1, file2 and maxlen are all mandatory, hence four entries.
    if len(argv) < 4:
        sys.stdout.write("linestripper file1 file2 maxlen [numlines]\n")
        sys.stdout.write(" outputs subset of file1 to stdout, ..of file2 to stderr\n")
        return 1

    maxlen = int(argv[3])
    numlines = int(argv[4]) if len(argv) > 4 else 0

    # Nested with-blocks (rather than one combined statement) so that
    # both files are closed on any exit path, even on Python 2.6.
    with open(argv[1], 'r') as f1:
        with open(argv[2], 'r') as f2:
            for line1, line2 in filter_parallel_lines(f1, f2, maxlen, numlines):
                sys.stdout.write(line1)
                sys.stderr.write(line2)
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))