From 8adea055298189643a3c7a76e2d529f536a94e11 Mon Sep 17 00:00:00 2001
From: Patrick Simianer <p@simianer.de>
Date: Tue, 23 Apr 2019 13:55:47 +0200
Subject: init

---
 util.rb | 60 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 60 insertions(+)
 create mode 100644 util.rb

(limited to 'util.rb')

diff --git a/util.rb b/util.rb
new file mode 100644
index 0000000..d04f823
--- /dev/null
+++ b/util.rb
@@ -0,0 +1,60 @@
+require 'zipf'
+require 'trollop'
+
+def read_stopwords_file fn # returns a hash mapping every stripped line of fn to true
+  stopwords = {} # hash used as a set: membership is tested via include?
+  f = ReadFile.new fn
+  while (line = f.gets)
+    stopwords[line.strip] = true
+  end
+  f.close # release the file handle; it was previously left open
+  stopwords
+end
+
+def read_vocab_file fn # returns a DBM handle for "*.dbm" names, else a {word => count} hash
+  if fn.end_with?(".dbm") # suffix test: a file named just "dbm" is read as plain text
+    require 'dbm' # loaded lazily, only needed for DBM-backed vocabularies
+    return DBM.new fn
+  end
+
+  vocab = {}
+  f = ReadFile.new fn
+  while (line = f.gets)
+    count, word = line.split # expected line format: "<count> <word>"
+    vocab[word] = count.to_i if word # skip blank/one-field lines instead of storing a nil key
+  end
+  f.close # release the file handle once the vocabulary is loaded
+  vocab
+end
+
+# Returns a MatchData (truthy) if the whole string s consists only of punctuation or brackets, else nil
+def is_punct s
+  s.match(/\A[[[:punct:]]\<\>\[\]\{\}\(\)]+\z/) # \A and \z anchor the whole string; ^ and $ only anchor a line
+end
+
+# Returns a MatchData (truthy) if the whole string s consists only of digits, else nil
+def is_num s
+  s.match(/\A[[:digit:]]+\z/) # \A and \z anchor the whole string; ^ and $ only anchor a line
+end
+
+# Splits s into tokens by delegating to tokenize (not defined in this file; presumably space-based)
+def get_tokens s
+  tokenize(s)
+end
+
+# Returns the unique tokens (types) of s that are not stopwords, punctuation or numbers.
+# If vocab is given, only types missing from it or with a count <= rare_threshold are kept
+# (the default threshold is infinite, so no known type is dropped).
+def get_types s, stopwords, vocab=nil, rare_threshold=1.0/0
+  types = get_tokens(s).reject { |tok|
+    stopwords.include?(tok) || is_punct(tok) || is_num(tok)
+  }.uniq
+  return types unless vocab
+
+  types.select { |t|
+    # Single lookup per type; to_i because counts may be stored as strings.
+    count = vocab.fetch(t, nil)
+    !count || count.to_i <= rare_threshold
+  }
+end
+
-- 
cgit v1.2.3