summaryrefslogtreecommitdiff
path: root/gi/morf-segmentation/linestripper.py
diff options
context:
space:
mode:
author bothameister <bothameister@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-07-23 18:03:47 +0000
committer bothameister <bothameister@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-07-23 18:03:47 +0000
commit c57c05d19fb306f7f50cc02516a8a2901c920cca (patch)
tree 1120643e63ea2b46d6a3bc0b338fb225682c9dd7 /gi/morf-segmentation/linestripper.py
parent 58681ee5816d13c04002ca8aebe23c2768da4e5b (diff)
Adding morphology-segmentation stuff. Changes include: local-gi-pipeline (--morf arg), eval-pipeline (--oov-grammar, --lmorder)
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@382 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'gi/morf-segmentation/linestripper.py')
-rwxr-xr-x gi/morf-segmentation/linestripper.py 40
1 files changed, 40 insertions, 0 deletions
diff --git a/gi/morf-segmentation/linestripper.py b/gi/morf-segmentation/linestripper.py
new file mode 100755
index 00000000..04e9044a
--- /dev/null
+++ b/gi/morf-segmentation/linestripper.py
@@ -0,0 +1,40 @@
+#!/usr/bin/python
+
+import sys
+
+# Usage: linestripper file1 file2 maxlen [numlines]
+# Filters two line-aligned files in parallel: a pair of lines is kept only
+# when both have at most maxlen whitespace-separated tokens.  Kept lines of
+# file1 go to stdout, the matching lines of file2 go to stderr.  A positive
+# numlines stops the output after that many pairs.
+
+# maxlen (argv[3]) is mandatory, so at least four argv entries are required.
+if len(sys.argv) < 4:
+    print("linestripper file1 file2 maxlen [numlines]")
+    print(" outputs subset of file1 to stdout, ..of file2 to stderr")
+    sys.exit(1)
+
+maxlen = int(sys.argv[3])
+# numlines == 0 means no limit on the number of pairs written.
+numlines = int(sys.argv[4]) if len(sys.argv) > 4 else 0
+
+f1 = open(sys.argv[1], 'r')
+f2 = open(sys.argv[2], 'r')
+try:
+    count = 0
+    for line1 in f1:
+        # readline() returns '' once file2 is exhausted, which counts as
+        # zero tokens below.
+        line2 = f2.readline()
+        w1 = len(line1.split())
+        w2 = len(line2.split())
+        if w1 <= maxlen and w2 <= maxlen:
+            count += 1
+            sys.stdout.write(line1)
+            sys.stderr.write(line2)
+        if numlines > 0 and count >= numlines:
+            break
+finally:
+    # Close both inputs even if a read or write fails part-way through.
+    f1.close()
+    f2.close()