summaryrefslogtreecommitdiff
path: root/gi/pyp-topics/scripts/extract_leaves.py
diff options
context:
space:
mode:
Diffstat (limited to 'gi/pyp-topics/scripts/extract_leaves.py')
-rwxr-xr-xgi/pyp-topics/scripts/extract_leaves.py49
1 files changed, 0 insertions, 49 deletions
diff --git a/gi/pyp-topics/scripts/extract_leaves.py b/gi/pyp-topics/scripts/extract_leaves.py
deleted file mode 100755
index 14783b36..00000000
--- a/gi/pyp-topics/scripts/extract_leaves.py
+++ /dev/null
@@ -1,49 +0,0 @@
-#!/usr/bin/python
-
-import nltk
-import nltk.probability
-import sys
-import getopt
-
-lexicalise=False
-rm_traces=False
-cutoff=100
-length_cutoff=10000
-try:
- opts, args = getopt.getopt(sys.argv[1:], "hs:c:l", ["help", "lexicalise", "cutoff","sentence-length","remove-traces"])
-except getopt.GetoptError:
- print "Usage: extract_leaves.py [-lsc]"
- sys.exit(2)
-for opt, arg in opts:
- if opt in ("-h", "--help"):
- print "Usage: extract_leaves.py [-lsc]"
- sys.exit()
- elif opt in ("-l", "--lexicalise"):
- lexicalise = True
- elif opt in ("-c", "--cutoff"):
- cutoff = int(arg)
- elif opt in ("-s", "--sentence-length"):
- length_cutoff = int(arg)
- elif opt in ("--remove-traces"):
- rm_traces = True
-
-token_freq = nltk.probability.FreqDist()
-lines = []
-for line in sys.stdin:
- t = nltk.Tree.parse(line)
- pos = t.pos()
- if len(pos) <= length_cutoff:
- lines.append(pos)
- for token, tag in pos:
- token_freq.inc(token)
-
-for line in lines:
- for token,tag in line:
- if not (rm_traces and tag == "-NONE-"):
- if lexicalise:
- if token_freq[token] < cutoff:
- token = '-UNK-'
- print '%s|%s' % (token,tag),
- else:
- print '%s' % tag,
- print