diff options
author | Patrick Simianer <p@simianer.de> | 2014-01-29 19:22:56 +0100 |
---|---|---|
committer | Patrick Simianer <p@simianer.de> | 2014-01-29 19:22:56 +0100 |
commit | d9d72e06db07087aa54401fae8b259f0c4ccd649 (patch) | |
tree | 97f0444314c40d2894ac0892d5559101eda01acf /lib/nlp_ruby/stringutil.rb | |
parent | 22644ed1365e566c8bf806bfff4ecd43c46ce089 (diff) |
first usable version, name change => nlp_ruby
Diffstat (limited to 'lib/nlp_ruby/stringutil.rb')
-rw-r--r-- | lib/nlp_ruby/stringutil.rb | 34 |
1 files changed, 34 insertions, 0 deletions
# lib/nlp_ruby/stringutil.rb

# Whitespace 'tokenizer': strips +s+ and splits it on runs of whitespace.
#
# @param s [String] text to tokenize
# @return [Array<String>] the tokens, in order
def tokenize(s)
  s.strip.split
end

# Splits +s+ on the '|||' separator. Whitespace directly around each
# separator, and at both ends of +s+, is discarded.
#
# @param s [String] text with '|||'-separated fields
# @return [Array<String>] the fields, in order
def splitpipe(s)
  s.strip.split(/\s*\|\|\|\s*/)
end

# Tests whether +s+ contains at least one lowercase character.
# Note: the result is truthy as soon as ANY character is lowercase,
# not only when the whole string is.
#
# @param s [String] text to inspect
# @return [String, nil] the first lowercase character, or nil if none
def downcase?(s)
  s[/[[:lower:]]/]
end

# Iterator over the n-grams of the whitespace-tokenized string +s+.
#
# For every token position, yields the n-grams that start there, shortest
# first. With fix=false all lengths 1..n are yielded (truncated at the end
# of the token list); with fix=true only n-grams of exactly +n+ tokens are.
#
# @param s [String] text to tokenize
# @param n [Integer] maximum (or, with +fix+, exact) n-gram length
# @param fix [Boolean] yield only n-grams of exactly +n+ tokens
# @yield [Array<String>] one n-gram at a time
# @return [Enumerator] when called without a block
def ngrams(s, n, fix=false)
  # Without a block, hand back an Enumerator instead of failing on +yield+.
  return to_enum(:ngrams, s, n, fix) unless block_given?

  tokens = tokenize(s)
  # A non-positive n admits no n-grams at all.
  return tokens if n < 1

  tokens.each_index do |start|
    # Longest n-gram that still fits between +start+ and the last token.
    longest = [n, tokens.size - start].min
    if fix
      yield tokens[start, n] if longest == n
    else
      1.upto(longest) { |len| yield tokens[start, len] }
    end
  end
end

# Parses a feature string into a SparseVector:
# a=1.0 b=2.0 => { 'a' => 1.0, 'b' => 2.0 }
#
# Each whitespace-separated token is split on '='; the part before the first
# '=' is the key and the part after it is converted with +to_f+.
#
# @param s [String] whitespace-separated 'key=value' pairs
# @return [SparseVector] the vector filled with the parsed pairs
def read_feature_string(s)
  map = SparseVector.new
  tokenize(s).each do |pair|
    key, value = pair.split('=')
    map[key] = value.to_f
  end
  map
end