summaryrefslogtreecommitdiff
path: root/gi/pyp-topics/scripts/extract_leaves.py
diff options
context:
space:
mode:
authorAvneesh Saluja <asaluja@gmail.com>2013-03-28 18:28:16 -0700
committerAvneesh Saluja <asaluja@gmail.com>2013-03-28 18:28:16 -0700
commit5b8253e0e1f1393a509fb9975ba8c1347af758ed (patch)
tree1790470b1d07a0b4973ebce19192e896566ea60b /gi/pyp-topics/scripts/extract_leaves.py
parent2389a5a8a43dda87c355579838559515b0428421 (diff)
parentb203f8c5dc8cff1b9c9c2073832b248fcad0765a (diff)
fixed conflicts
Diffstat (limited to 'gi/pyp-topics/scripts/extract_leaves.py')
-rwxr-xr-xgi/pyp-topics/scripts/extract_leaves.py49
1 files changed, 0 insertions, 49 deletions
diff --git a/gi/pyp-topics/scripts/extract_leaves.py b/gi/pyp-topics/scripts/extract_leaves.py
deleted file mode 100755
index 14783b36..00000000
--- a/gi/pyp-topics/scripts/extract_leaves.py
+++ /dev/null
@@ -1,49 +0,0 @@
-#!/usr/bin/python
-
-import nltk
-import nltk.probability
-import sys
-import getopt
-
-lexicalise=False
-rm_traces=False
-cutoff=100
-length_cutoff=10000
-try:
- opts, args = getopt.getopt(sys.argv[1:], "hs:c:l", ["help", "lexicalise", "cutoff","sentence-length","remove-traces"])
-except getopt.GetoptError:
- print "Usage: extract_leaves.py [-lsc]"
- sys.exit(2)
-for opt, arg in opts:
- if opt in ("-h", "--help"):
- print "Usage: extract_leaves.py [-lsc]"
- sys.exit()
- elif opt in ("-l", "--lexicalise"):
- lexicalise = True
- elif opt in ("-c", "--cutoff"):
- cutoff = int(arg)
- elif opt in ("-s", "--sentence-length"):
- length_cutoff = int(arg)
- elif opt in ("--remove-traces"):
- rm_traces = True
-
-token_freq = nltk.probability.FreqDist()
-lines = []
-for line in sys.stdin:
- t = nltk.Tree.parse(line)
- pos = t.pos()
- if len(pos) <= length_cutoff:
- lines.append(pos)
- for token, tag in pos:
- token_freq.inc(token)
-
-for line in lines:
- for token,tag in line:
- if not (rm_traces and tag == "-NONE-"):
- if lexicalise:
- if token_freq[token] < cutoff:
- token = '-UNK-'
- print '%s|%s' % (token,tag),
- else:
- print '%s' % tag,
- print