summaryrefslogtreecommitdiff
path: root/gi/pyp-topics/scripts/extract_leaves.py
diff options
context:
space:
mode:
authorphilblunsom@gmail.com <philblunsom@gmail.com@ec762483-ff6d-05da-a07a-a48fb63a330f>2010-06-22 20:34:00 +0000
committerphilblunsom@gmail.com <philblunsom@gmail.com@ec762483-ff6d-05da-a07a-a48fb63a330f>2010-06-22 20:34:00 +0000
commit2f2ba42a1453f4a3a08f9c1ecfc53c1b1c83d550 (patch)
tree646e81b6325280f64a72771b5eeadf5118e465a9 /gi/pyp-topics/scripts/extract_leaves.py
parent2f2e36ca3060e7e9853c3d611f6cc5e112a76ddd (diff)
Initial ci of gi dir
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@5 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'gi/pyp-topics/scripts/extract_leaves.py')
-rwxr-xr-xgi/pyp-topics/scripts/extract_leaves.py49
1 files changed, 49 insertions, 0 deletions
diff --git a/gi/pyp-topics/scripts/extract_leaves.py b/gi/pyp-topics/scripts/extract_leaves.py
new file mode 100755
index 00000000..14783b36
--- /dev/null
+++ b/gi/pyp-topics/scripts/extract_leaves.py
@@ -0,0 +1,49 @@
+#!/usr/bin/python
+
+import nltk
+import nltk.probability
+import sys
+import getopt
+
+lexicalise=False
+rm_traces=False
+cutoff=100
+length_cutoff=10000
+try:
+ opts, args = getopt.getopt(sys.argv[1:], "hs:c:l", ["help", "lexicalise", "cutoff","sentence-length","remove-traces"])
+except getopt.GetoptError:
+ print "Usage: extract_leaves.py [-lsc]"
+ sys.exit(2)
+for opt, arg in opts:
+ if opt in ("-h", "--help"):
+ print "Usage: extract_leaves.py [-lsc]"
+ sys.exit()
+ elif opt in ("-l", "--lexicalise"):
+ lexicalise = True
+ elif opt in ("-c", "--cutoff"):
+ cutoff = int(arg)
+ elif opt in ("-s", "--sentence-length"):
+ length_cutoff = int(arg)
+ elif opt in ("--remove-traces"):
+ rm_traces = True
+
+token_freq = nltk.probability.FreqDist()
+lines = []
+for line in sys.stdin:
+ t = nltk.Tree.parse(line)
+ pos = t.pos()
+ if len(pos) <= length_cutoff:
+ lines.append(pos)
+ for token, tag in pos:
+ token_freq.inc(token)
+
+for line in lines:
+ for token,tag in line:
+ if not (rm_traces and tag == "-NONE-"):
+ if lexicalise:
+ if token_freq[token] < cutoff:
+ token = '-UNK-'
+ print '%s|%s' % (token,tag),
+ else:
+ print '%s' % tag,
+ print