1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
|
#!/usr/bin/python
import nltk
import nltk.probability
import sys
import getopt
# Reads one bracketed parse tree per line from standard input and writes, for
# each sentence kept, a single line holding the space-separated POS tags of its
# leaves (or "token|tag" pairs when lexicalisation is requested).

# Defaults used when the corresponding option is not given.
DEFAULT_LEXICALISE = False
DEFAULT_RM_TRACES = False
DEFAULT_CUTOFF = 100
DEFAULT_LENGTH_CUTOFF = 10000

USAGE = "Usage: extract_leaves.py [-lsc]"

SHORT_OPTIONS = "hs:c:l"
# A trailing "=" marks a long option that takes a value; without it getopt
# hands back an empty argument for "--cutoff N" / "--sentence-length N".
LONG_OPTIONS = ["help", "lexicalise", "cutoff=", "sentence-length=",
                "remove-traces"]

# Tag identifying trace / empty-element leaves.
TRACE_TAG = "-NONE-"
# Placeholder substituted for tokens seen fewer than `cutoff` times.
UNKNOWN_TOKEN = "-UNK-"


def parse_options(argv):
    """Turn command-line arguments into a settings dictionary.

    argv -- list of argument strings, without the program name.

    Returns a dict with the keys "help", "lexicalise", "rm_traces",
    "cutoff" and "length_cutoff".  Parsing stops as soon as -h/--help is
    seen, so later arguments are not validated in that case.

    Raises getopt.GetoptError for unknown options or missing arguments and
    ValueError when -c/-s is given a non-integer value.
    """
    opts, _ = getopt.getopt(argv, SHORT_OPTIONS, LONG_OPTIONS)
    settings = {
        "help": False,
        "lexicalise": DEFAULT_LEXICALISE,
        "rm_traces": DEFAULT_RM_TRACES,
        "cutoff": DEFAULT_CUTOFF,
        "length_cutoff": DEFAULT_LENGTH_CUTOFF,
    }
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            settings["help"] = True
            return settings
        elif opt in ("-l", "--lexicalise"):
            settings["lexicalise"] = True
        elif opt in ("-c", "--cutoff"):
            settings["cutoff"] = int(arg)
        elif opt in ("-s", "--sentence-length"):
            settings["length_cutoff"] = int(arg)
        elif opt == "--remove-traces":
            # Equality, not `in`: a parenthesised string is not a tuple, so
            # `opt in ("--remove-traces")` would be a substring test.
            settings["rm_traces"] = True
    return settings


def format_line(pos, token_freq, lexicalise=DEFAULT_LEXICALISE,
                cutoff=DEFAULT_CUTOFF, rm_traces=DEFAULT_RM_TRACES):
    """Render one sentence as a space-separated string.

    pos        -- sequence of (token, tag) pairs.
    token_freq -- mapping from token to its count; only consulted when
                  `lexicalise` is true.
    lexicalise -- emit "token|tag" instead of just the tag.
    cutoff     -- tokens whose count is below this become "-UNK-".
    rm_traces  -- drop leaves tagged "-NONE-".

    Returns the formatted line without a trailing newline; an empty string
    if nothing is left to print.
    """
    fields = []
    for token, tag in pos:
        if rm_traces and tag == TRACE_TAG:
            continue
        if lexicalise:
            if token_freq[token] < cutoff:
                token = UNKNOWN_TOKEN
            fields.append("%s|%s" % (token, tag))
        else:
            fields.append("%s" % tag)
    return " ".join(fields)


def main(argv=None):
    """Script entry point: parse options, read trees from stdin, print leaves.

    argv -- argument list without the program name; defaults to
            sys.argv[1:].

    Exits with status 2 on a usage error and 0 after printing help.
    """
    if argv is None:
        argv = sys.argv[1:]
    try:
        settings = parse_options(argv)
    except (getopt.GetoptError, ValueError):
        # Usage errors go to stderr so they do not end up in piped output.
        sys.stderr.write(USAGE + "\n")
        sys.exit(2)
    if settings["help"]:
        sys.stdout.write(USAGE + "\n")
        sys.exit()

    # NLTK 3 renamed Tree.parse to Tree.fromstring; use whichever exists.
    parse_tree = getattr(nltk.Tree, "fromstring", None) or nltk.Tree.parse

    token_freq = nltk.probability.FreqDist()
    sentences = []
    for line in sys.stdin:
        if not line.strip():
            # A blank line holds no tree; skip it rather than fail parsing.
            continue
        pos = parse_tree(line).pos()
        if len(pos) <= settings["length_cutoff"]:
            sentences.append(pos)
            # Only sentences that pass the length filter are counted.
            for token, _tag in pos:
                # Item assignment instead of FreqDist.inc(), which NLTK 3
                # no longer provides.
                token_freq[token] += 1

    # Output is deferred until all counts are known, because the -UNK-
    # decision for a token depends on its total frequency.
    for pos in sentences:
        sys.stdout.write(format_line(pos, token_freq,
                                     lexicalise=settings["lexicalise"],
                                     cutoff=settings["cutoff"],
                                     rm_traces=settings["rm_traces"]) + "\n")


if __name__ == "__main__":
    main()
|