# frozen_string_literal: true

# util.rb -- helpers for loading stopword / vocabulary files and for
# extracting word types from a string.
#
# Recovered from commit 8adea055298189643a3c7a76e2d529f536a94e11
# ("init", Patrick Simianer, 2019-04-23).
#
# NOTE(review): ReadFile and tokenize are not defined in this file; they are
# presumably provided by the 'zipf' gem -- confirm.

require 'zipf'
require 'trollop'

# Reads a stopword list, one entry per line.
#
# fn:: path handed to ReadFile.new
#
# Returns a Hash mapping each whitespace-stripped line to true.
def read_stopwords_file fn
  stopwords = {}
  f = ReadFile.new fn
  while (line = f.gets)
    stopwords[line.strip] = true
  end

  stopwords
ensure
  # Release the handle even if reading fails; f is nil when ReadFile.new
  # itself raised, in which case there is nothing to close.
  f.close if f.respond_to?(:close)
end

# Loads a vocabulary (word => count).
#
# fn:: path of the vocabulary. If the text after the last "." is "dbm" the
#      file is opened with DBM and the DBM handle is returned as-is;
#      otherwise it is read as text with "<count> <word>" on each line.
#
# Returns a DBM object or a Hash of String => Integer.
def read_vocab_file fn
  if fn.split('.').last == 'dbm'
    # Loaded lazily so that 'dbm' is only needed when a DBM file is used.
    require 'dbm'
    return DBM.new fn
  end

  vocab = {}
  f = ReadFile.new fn
  while (line = f.gets)
    count, word = line.split
    # Blank or single-field lines carry no (count, word) pair; skipping them
    # avoids inserting a bogus nil => 0 entry.
    next if word.nil?

    vocab[word] = count.to_i
  end

  vocab
ensure
  # f is only set on the text path; nil does not respond to close.
  f.close if f.respond_to?(:close)
end

# Tests whether string s is composed only of punctuation or brackets.
#
# Returns a MatchData (truthy) on success, nil otherwise. Anchored with
# \A / \z so that the whole string -- not just one of its lines -- must match.
def is_punct s
  s.match(/\A[[[:punct:]]\<\>\[\]\{\}\(\)]+\z/)
end

# Tests whether string s is all digits.
#
# Returns a MatchData (truthy) on success, nil otherwise. Anchored with
# \A / \z so that the whole string -- not just one of its lines -- must match.
def is_num s
  s.match(/\A[[:digit:]]+\z/)
end

# 'Tokenizer' based on spaces; delegates to tokenize.
def get_tokens s
  tokenize s
end

# Returns the array of unique tokens (types) of string s, in order of first
# occurrence, after dropping stopwords, punctuation-only tokens and
# digit-only tokens.
#
# s::              input string
# stopwords::      collection queried with include? (e.g. the Hash returned
#                  by read_stopwords_file)
# vocab::          optional word => count mapping (Hash or DBM). When given,
#                  only types that are missing from it, or whose count is
#                  <= rare_threshold, are kept.
# rare_threshold:: maximum count for a known word to be kept; the default
#                  (infinity) keeps every word.
def get_types s, stopwords, vocab=nil, rare_threshold=Float::INFINITY
  types = get_tokens(s).reject { |tok|
    stopwords.include?(tok) || is_punct(tok) || is_num(tok)
  }.uniq

  if vocab
    types = types.select { |t|
      count = vocab.fetch(t, nil)
      # to_i because DBM stores its values as strings.
      !count || count.to_i <= rare_threshold
    }
  end

  types
end