diff options
author | Kenneth Heafield <github@kheafield.com> | 2012-10-22 12:07:20 +0100 |
---|---|---|
committer | Kenneth Heafield <github@kheafield.com> | 2012-10-22 12:07:20 +0100 |
commit | 5f98fe5c4f2a2090eeb9d30c030305a70a8347d1 (patch) | |
tree | 9b6002f850e6dea1e3400c6b19bb31a9cdf3067f /gi/pyp-topics/scripts/extract_leaves.py | |
parent | cf9994131993b40be62e90e213b1e11e6b550143 (diff) | |
parent | 21825a09d97c2e0afd20512f306fb25fed55e529 (diff) |
Merge remote branch 'upstream/master'
Conflicts:
Jamroot
bjam
decoder/Jamfile
decoder/cdec.cc
dpmert/Jamfile
jam-files/sanity.jam
klm/lm/Jamfile
klm/util/Jamfile
mira/Jamfile
Diffstat (limited to 'gi/pyp-topics/scripts/extract_leaves.py')
-rwxr-xr-x | gi/pyp-topics/scripts/extract_leaves.py | 49 |
1 files changed, 0 insertions, 49 deletions
```diff
diff --git a/gi/pyp-topics/scripts/extract_leaves.py b/gi/pyp-topics/scripts/extract_leaves.py
deleted file mode 100755
index 14783b36..00000000
--- a/gi/pyp-topics/scripts/extract_leaves.py
+++ /dev/null
@@ -1,49 +0,0 @@
-#!/usr/bin/python
-
-import nltk
-import nltk.probability
-import sys
-import getopt
-
-lexicalise=False
-rm_traces=False
-cutoff=100
-length_cutoff=10000
-try:
-  opts, args = getopt.getopt(sys.argv[1:], "hs:c:l", ["help", "lexicalise", "cutoff","sentence-length","remove-traces"])
-except getopt.GetoptError:
-  print "Usage: extract_leaves.py [-lsc]"
-  sys.exit(2)
-for opt, arg in opts:
-  if opt in ("-h", "--help"):
-    print "Usage: extract_leaves.py [-lsc]"
-    sys.exit()
-  elif opt in ("-l", "--lexicalise"):
-    lexicalise = True
-  elif opt in ("-c", "--cutoff"):
-    cutoff = int(arg)
-  elif opt in ("-s", "--sentence-length"):
-    length_cutoff = int(arg)
-  elif opt in ("--remove-traces"):
-    rm_traces = True
-
-token_freq = nltk.probability.FreqDist()
-lines = []
-for line in sys.stdin:
-  t = nltk.Tree.parse(line)
-  pos = t.pos()
-  if len(pos) <= length_cutoff:
-    lines.append(pos)
-    for token, tag in pos:
-      token_freq.inc(token)
-
-for line in lines:
-  for token,tag in line:
-    if not (rm_traces and tag == "-NONE-"):
-      if lexicalise:
-        if token_freq[token] < cutoff:
-          token = '-UNK-'
-        print '%s|%s' % (token,tag),
-      else:
-        print '%s' % tag,
-  print
```