diff options
Diffstat (limited to 'lib/nlp_ruby/stringutil.rb')
-rw-r--r-- | lib/nlp_ruby/stringutil.rb | 41 |
1 files changed, 4 insertions, 37 deletions
diff --git a/lib/nlp_ruby/stringutil.rb b/lib/nlp_ruby/stringutil.rb index d7381bb..aa9be00 100644 --- a/lib/nlp_ruby/stringutil.rb +++ b/lib/nlp_ruby/stringutil.rb @@ -1,17 +1,7 @@ -# whitespace 'tokenizer' def tokenize s s.strip.split end -def splitpipe s, n=3 - s.strip.split("|"*n) -end - -def downcase? s - s[/[[:lower:]]/] -end - -# iterator over n-grams def ngrams(s, n, fix=false) a = tokenize s a.each_with_index { |tok, i| @@ -22,34 +12,11 @@ def ngrams(s, n, fix=false) } end -# a=1.0 b=2.0 => { 'a' => 1.0, 'b' => 2.0 } -def read_feature_string s - map = SparseVector.new - tokenize(s).each { |i| - key, value = i.split '=' - map[key] = value.to_f - } - return map -end - - -def read_cfg fn - f = ReadFile.new fn - cfg = {} - while line = f.gets - line.strip! - next if /^\s*$/.match line - next if line[0]=='#' - content = line.split('#', 2).first - k, v = content.split(/\s*=\s*/, 2) - k.strip!; v.strip! - cfg[k] = v - end - return cfg -end - def bag_of_words s, stopwords=[] s.strip.split.uniq.sort.reject{ |w| stopwords.include? w } -end +end +def splitpipe s, n=3 + s.strip.split("|"*n) +end |