blob: 21117805c9a19fc33a7e9993e8ad8b4984e09c43 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
|
require 'zipf'
require 'optimist'
# Loads a stopword list from the file +fn+, one entry per line.
#
# Every line is stripped of surrounding whitespace and used as a key, so the
# result supports fast membership tests via include?/key?.
#
# fn - path handed to ReadFile.new (ReadFile is defined outside this file).
#
# Returns a Hash mapping each stripped line to true.
def read_stopwords_file(fn)
  stopwords = {}
  f = ReadFile.new(fn)
  begin
    while (line = f.gets)
      stopwords[line.strip] = true
    end
  ensure
    # Release the underlying handle even if reading raises; guarded because
    # ReadFile's interface is not visible from this file.
    f.close if f.respond_to?(:close)
  end
  stopwords
end
# Loads a vocabulary (word => count) from +fn+.
#
# If the last dot-separated component of +fn+ is "dbm", the file is opened as
# a DBM database and the open DBM handle is returned as-is. Otherwise +fn+ is
# read through ReadFile (defined outside this file) as text with one
# "<count> <word>" pair per line, split on whitespace.
#
# fn - path of the vocabulary file.
#
# Returns a DBM object for "*.dbm" names, else a Hash of String => Integer.
def read_vocab_file(fn)
  if fn.split(".").last == "dbm"
    # Required only on this branch, so text vocabularies never load dbm.
    require 'dbm'
    return DBM.new(fn)
  end

  vocab = {}
  f = ReadFile.new(fn)
  begin
    while (line = f.gets)
      count, word = line.split
      # Blank or single-field lines carry no (count, word) pair; skipping
      # them avoids inserting a bogus nil => 0 entry.
      next if word.nil?

      vocab[word] = count.to_i
    end
  ensure
    # Release the underlying handle even if reading raises; guarded because
    # ReadFile's interface is not visible from this file.
    f.close if f.respond_to?(:close)
  end
  vocab
end
# Tests whether the whole string +s+ consists only of punctuation characters
# or the brackets < > [ ] { } ( ).
#
# The pattern is anchored with \A and \z so the entire string must match;
# line anchors (^ and $) would accept a multi-line string as soon as any one
# of its lines is pure punctuation.
#
# Returns a MatchData (truthy) on success, nil otherwise (including for "").
def is_punct(s)
  s.match(/\A[[[:punct:]]\<\>\[\]\{\}\(\)]+\z/)
end
# Tests whether the whole string +s+ consists only of digit characters.
#
# The pattern is anchored with \A and \z so the entire string must match;
# line anchors (^ and $) would accept a multi-line string as soon as any one
# of its lines is all digits.
#
# Returns a MatchData (truthy) on success, nil otherwise (including for "").
def is_num(s)
  s.match(/\A[[:digit:]]+\z/)
end
# Tokenizes the string +s+ by delegating to +tokenize+, which is defined
# outside this file (presumably a whitespace-based splitter -- confirm
# against its definition).
#
# Returns whatever +tokenize+ returns for +s+.
def get_tokens(s)
  tokenize(s)
end
# Returns the unique token types of the string +s+, in order of first
# appearance. Only the types are returned, not their counts.
#
# A token is discarded when it is contained in +stopwords+, or when is_punct
# or is_num reports a match for it. If +vocab+ is given, the remaining types
# are further restricted to those that are absent from +vocab+ or whose
# count is at most +rare_threshold+.
#
# s              - String to be tokenized with get_tokens.
# stopwords      - collection responding to include? (e.g. a Hash of words).
# vocab          - optional word => count mapping responding to
#                  fetch(key, default); counts are converted with to_i, so
#                  String counts are accepted as well.
# rare_threshold - largest count a known word may have and still be kept;
#                  defaults to infinity, i.e. no type is dropped for being
#                  frequent.
#
# Returns an Array of unique tokens.
def get_types(s, stopwords, vocab = nil, rare_threshold = Float::INFINITY)
  content_tokens = get_tokens(s).reject do |tok|
    stopwords.include?(tok) || is_punct(tok) || is_num(tok)
  end
  types = content_tokens.uniq
  return types unless vocab

  types.select do |type|
    # Single lookup: a missing (nil) count marks an out-of-vocabulary word.
    count = vocab.fetch(type, nil)
    !count || count.to_i <= rare_threshold
  end
end
|