summary | refs | log | tree | commit | diff
path: root/lib/nlp_ruby/stringutil.rb
diff options
context:
space:
mode:
author Patrick Simianer <p@simianer.de> 2014-02-14 17:14:49 +0100
committer Patrick Simianer <p@simianer.de> 2014-02-14 17:14:49 +0100
commit c0daa3e70cc3187f04f67c2cdc0bd3b3217e8aa6 (patch)
tree f7030f39f20e21148ed817142eee0536e557c0c0 /lib/nlp_ruby/stringutil.rb
parent 7255d33914122e58b031108de49918b8910eebc6 (diff)
=> 0.3; License and README updated; some from_* methods for SparseVector; ttable.rb => Translation.rb; moved some misc. stuff to misc.rb; monkey patched String
Diffstat (limited to 'lib/nlp_ruby/stringutil.rb')
-rw-r--r-- lib/nlp_ruby/stringutil.rb 41
1 files changed, 4 insertions, 37 deletions
diff --git a/lib/nlp_ruby/stringutil.rb b/lib/nlp_ruby/stringutil.rb
index d7381bb..aa9be00 100644
--- a/lib/nlp_ruby/stringutil.rb
+++ b/lib/nlp_ruby/stringutil.rb
@@ -1,17 +1,7 @@
-# whitespace 'tokenizer'
def tokenize s
s.strip.split
end
-def splitpipe s, n=3
- s.strip.split("|"*n)
-end
-
-def downcase? s
- s[/[[:lower:]]/]
-end
-
-# iterator over n-grams
def ngrams(s, n, fix=false)
a = tokenize s
a.each_with_index { |tok, i|
@@ -22,34 +12,11 @@ def ngrams(s, n, fix=false)
}
end
-# a=1.0 b=2.0 => { 'a' => 1.0, 'b' => 2.0 }
-def read_feature_string s
- map = SparseVector.new
- tokenize(s).each { |i|
- key, value = i.split '='
- map[key] = value.to_f
- }
- return map
-end
-
-
-def read_cfg fn
- f = ReadFile.new fn
- cfg = {}
- while line = f.gets
- line.strip!
- next if /^\s*$/.match line
- next if line[0]=='#'
- content = line.split('#', 2).first
- k, v = content.split(/\s*=\s*/, 2)
- k.strip!; v.strip!
- cfg[k] = v
- end
- return cfg
-end
-
def bag_of_words s, stopwords=[]
s.strip.split.uniq.sort.reject{ |w| stopwords.include? w }
-end
+end
+def splitpipe s, n=3
+ s.strip.split("|"*n)
+end