summaryrefslogtreecommitdiff
path: root/gi/pyp-topics/scripts/extract_leaves.py
blob: 14783b3644702771807485472353c49fcb65f966 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
#!/usr/bin/python

import nltk
import nltk.probability
import sys
import getopt 

lexicalise=False
rm_traces=False
cutoff=100
length_cutoff=10000
try:                                
  opts, args = getopt.getopt(sys.argv[1:], "hs:c:l", ["help", "lexicalise", "cutoff","sentence-length","remove-traces"])
except getopt.GetoptError:          
  print "Usage: extract_leaves.py [-lsc]"                        
  sys.exit(2)                     
for opt, arg in opts:                
  if opt in ("-h", "--help"):      
    print "Usage: extract_leaves.py [-lsc]"                        
    sys.exit()                  
  elif opt in ("-l", "--lexicalise"):                
    lexicalise = True                 
  elif opt in ("-c", "--cutoff"):                
    cutoff = int(arg) 
  elif opt in ("-s", "--sentence-length"):                
    length_cutoff = int(arg) 
  elif opt in ("--remove-traces"):                
    rm_traces = True                 

# First pass: read one parse tree per stdin line, keep the (token, tag)
# sequence of every sentence with at most length_cutoff leaves, and count how
# often each token occurs in the kept sentences.
# NLTK 3 renamed Tree.parse to Tree.fromstring; use whichever one exists so
# the script runs against either release.
parse_tree = getattr(nltk.Tree, 'fromstring', None) or nltk.Tree.parse
token_freq = nltk.probability.FreqDist()
lines = []
for line in sys.stdin:
  if not line.strip():
    # A blank line (e.g. at end of input) holds no tree to parse.
    continue
  pos = parse_tree(line).pos()
  if len(pos) <= length_cutoff:
    lines.append(pos)
    for token, tag in pos:
      # FreqDist.inc() was removed in NLTK 3; item assignment works on both
      # the old and the new FreqDist.
      token_freq[token] += 1

# Second pass: write one output line per kept sentence, items separated by a
# single space.  With lexicalise set, each item is "token|tag" and tokens
# whose count is below cutoff are replaced by -UNK-; otherwise each item is
# the tag alone.  With rm_traces set, leaves tagged -NONE- are dropped.
# sys.stdout.write is used instead of the print statement so the same code
# runs on Python 2 and Python 3 with identical output.
for line in lines:
  fields = []
  for token, tag in line:
    if rm_traces and tag == "-NONE-":
      continue
    if lexicalise:
      if token_freq[token] < cutoff:
        token = '-UNK-'
      fields.append('%s|%s' % (token, tag))
    else:
      fields.append('%s' % tag)
  sys.stdout.write(' '.join(fields) + '\n')