diff options
author | philblunsom@gmail.com <philblunsom@gmail.com@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-06-22 20:34:00 +0000 |
---|---|---|
committer | philblunsom@gmail.com <philblunsom@gmail.com@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-06-22 20:34:00 +0000 |
commit | 2f2ba42a1453f4a3a08f9c1ecfc53c1b1c83d550 (patch) | |
tree | 646e81b6325280f64a72771b5eeadf5118e465a9 /gi/pyp-topics/scripts/extract_leaves.py | |
parent | 2f2e36ca3060e7e9853c3d611f6cc5e112a76ddd (diff) |
Initial ci of gi dir
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@5 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'gi/pyp-topics/scripts/extract_leaves.py')
-rwxr-xr-x | gi/pyp-topics/scripts/extract_leaves.py | 49 |
1 files changed, 49 insertions, 0 deletions
#!/usr/bin/python
"""Extract the leaves of bracketed parse trees read from standard input.

Each non-blank input line is parsed as one bracketed tree.  For every tree
whose number of leaves does not exceed the sentence-length cutoff, one output
line is written containing either the part-of-speech tags of its leaves or,
with -l/--lexicalise, ``token|tag`` pairs in which tokens seen fewer than
``cutoff`` times are replaced by ``-UNK-``.

Options:
  -h, --help               print the usage line and exit
  -l, --lexicalise         emit ``token|tag`` instead of just ``tag``
  -c N, --cutoff=N         minimum token frequency to keep a token (default 100)
  -s N, --sentence-length=N
                           skip trees with more than N leaves (default 10000)
  --remove-traces          drop leaves tagged ``-NONE-``
"""

import getopt
import sys

USAGE = "Usage: extract_leaves.py [-lsc]"
UNKNOWN_TOKEN = '-UNK-'
TRACE_TAG = "-NONE-"


def parse_options(argv):
    """Parse command-line arguments (without the program name).

    Returns a tuple ``(lexicalise, rm_traces, cutoff, length_cutoff)``.
    Prints the usage line and exits with status 0 on -h/--help; prints it to
    stderr and exits with status 2 on an unknown option or a non-integer
    value for -c/-s.
    """
    lexicalise = False
    rm_traces = False
    cutoff = 100
    length_cutoff = 10000
    try:
        # Long options that take a value must end in '=' for getopt to
        # accept "--cutoff 5" / "--cutoff=5".
        opts, _ = getopt.getopt(
            argv, "hs:c:l",
            ["help", "lexicalise", "cutoff=", "sentence-length=",
             "remove-traces"])
        for opt, arg in opts:
            if opt in ("-h", "--help"):
                sys.stdout.write(USAGE + "\n")
                sys.exit()
            elif opt in ("-l", "--lexicalise"):
                lexicalise = True
            elif opt in ("-c", "--cutoff"):
                cutoff = int(arg)
            elif opt in ("-s", "--sentence-length"):
                length_cutoff = int(arg)
            elif opt == "--remove-traces":
                rm_traces = True
    except (getopt.GetoptError, ValueError):
        # Keep the usage message out of stdout so it cannot end up in
        # piped output.
        sys.stderr.write(USAGE + "\n")
        sys.exit(2)
    return lexicalise, rm_traces, cutoff, length_cutoff


def read_tagged_sentences(stream, length_cutoff):
    """Read one bracketed tree per line from *stream*.

    Returns ``(sentences, token_freq)`` where *sentences* is a list of
    ``[(token, tag), ...]`` lists for every tree with at most
    *length_cutoff* leaves, and *token_freq* is a frequency distribution of
    the tokens in those retained sentences.  Blank lines are skipped.
    """
    # Imported here so that --help and the pure helpers work even when NLTK
    # is not installed.
    import nltk
    import nltk.probability

    # NLTK 3 renamed Tree.parse to Tree.fromstring; support both.
    parse_tree = getattr(nltk.Tree, "fromstring", None) or nltk.Tree.parse

    token_freq = nltk.probability.FreqDist()
    sentences = []
    for line in stream:
        if not line.strip():
            continue
        pos = parse_tree(line).pos()
        if len(pos) <= length_cutoff:
            sentences.append(pos)
            for token, _tag in pos:
                # FreqDist.inc() was removed in NLTK 3; item assignment
                # works with both old and new releases.
                token_freq[token] += 1
    return sentences, token_freq


def format_sentence(tagged, token_freq, lexicalise, rm_traces, cutoff):
    """Return the space-separated output line for one tagged sentence.

    *tagged* is a sequence of ``(token, tag)`` pairs and *token_freq* maps
    tokens to their counts.  When *rm_traces* is true, leaves tagged
    ``-NONE-`` are omitted.  When *lexicalise* is true each leaf is rendered
    as ``token|tag`` with tokens rarer than *cutoff* replaced by ``-UNK-``;
    otherwise only the tag is emitted.
    """
    fields = []
    for token, tag in tagged:
        if rm_traces and tag == TRACE_TAG:
            continue
        if lexicalise:
            if token_freq[token] < cutoff:
                token = UNKNOWN_TOKEN
            fields.append('%s|%s' % (token, tag))
        else:
            fields.append('%s' % tag)
    return " ".join(fields)


def main(argv=None):
    """Run the script: parse options, read stdin, write one line per tree."""
    if argv is None:
        argv = sys.argv[1:]
    lexicalise, rm_traces, cutoff, length_cutoff = parse_options(argv)
    sentences, token_freq = read_tagged_sentences(sys.stdin, length_cutoff)
    for tagged in sentences:
        sys.stdout.write(
            format_sentence(tagged, token_freq, lexicalise, rm_traces, cutoff)
            + "\n")


if __name__ == "__main__":
    main()