summaryrefslogtreecommitdiff
path: root/lib/nlp_ruby/stringutil.rb
diff options
context:
space:
mode:
authorPatrick Simianer <p@simianer.de>2014-01-29 19:22:56 +0100
committerPatrick Simianer <p@simianer.de>2014-01-29 19:22:56 +0100
commitd9d72e06db07087aa54401fae8b259f0c4ccd649 (patch)
tree97f0444314c40d2894ac0892d5559101eda01acf /lib/nlp_ruby/stringutil.rb
parent22644ed1365e566c8bf806bfff4ecd43c46ce089 (diff)
first usable version, name change => nlp_ruby
Diffstat (limited to 'lib/nlp_ruby/stringutil.rb')
-rw-r--r--lib/nlp_ruby/stringutil.rb34
1 files changed, 34 insertions, 0 deletions
diff --git a/lib/nlp_ruby/stringutil.rb b/lib/nlp_ruby/stringutil.rb
new file mode 100644
index 0000000..e9a3bc9
--- /dev/null
+++ b/lib/nlp_ruby/stringutil.rb
@@ -0,0 +1,34 @@
# Whitespace tokenizer: trims the string, then splits it on runs of
# whitespace. Returns the tokens as an Array of Strings (empty when the
# string contains no tokens).
def tokenize(s)
  trimmed = s.strip
  trimmed.split
end
+
# Splits a string into fields separated by a run of n pipe characters
# ('|||' by default). Whitespace around each delimiter and at both ends of
# the string is discarded.
#
#   splitpipe('a b ||| c ||| d')  # => ['a b', 'c', 'd']
#   splitpipe('a || b', 2)        # => ['a', 'b']
#
# s - the String to split.
# n - number of consecutive pipes that form one delimiter (default 3).
#
# Returns an Array of Strings; as with String#split, trailing empty fields
# are dropped. Raises ArgumentError if n is less than 1.
def splitpipe(s, n = 3)
  raise ArgumentError, "n must be at least 1 (got #{n.inspect})" unless n >= 1

  # Escape the pipes: a bare '|' would mean alternation inside the pattern.
  delimiter = /\s*#{Regexp.escape('|' * n)}\s*/
  s.strip.split(delimiter)
end
+
# Looks for a lowercase letter (POSIX class [[:lower:]]) anywhere in s.
# Returns the first such character as a String, which is truthy, or nil
# when s contains no lowercase letter.
def downcase?(s)
  s.slice(/[[:lower:]]/)
end
+
# Iterates over the n-grams of a whitespace-tokenized string.
#
# For every token position, yields the token sequences that start there,
# from length 1 up to length n (fewer near the end of the string). With fix
# set, only sequences of exactly n tokens are yielded.
#
#   ngrams('a b c', 2) { |g| ... }        # ['a'], ['a','b'], ['b'], ['b','c'], ['c']
#   ngrams('a b c', 2, true) { |g| ... }  # ['a','b'], ['b','c']
#
# s   - the String to tokenize (via tokenize).
# n   - maximum n-gram length; nothing is yielded when n < 1.
# fix - when truthy, yield only n-grams of exactly n tokens.
#
# Yields each n-gram as an Array of token Strings.
# Returns the Array of tokens when a block is given, otherwise an Enumerator
# over the n-grams.
def ngrams(s, n, fix = false)
  # Without a block there is nothing to yield to; hand back an Enumerator
  # instead of failing with LocalJumpError.
  return enum_for(:ngrams, s, n, fix) unless block_given?

  tokens = tokenize(s)
  # In fix mode the only admissible length is n itself; clamping to 1 keeps
  # n < 1 from yielding empty n-grams.
  shortest = fix ? [n, 1].max : 1
  tokens.each_index do |start|
    longest = [n, tokens.size - start].min
    shortest.upto(longest) { |len| yield tokens[start, len] }
  end
  tokens
end
+
# Parses a whitespace-separated list of key=value pairs into a SparseVector:
# a=1.0 b=2.0 => { 'a' => 1.0, 'b' => 2.0 }
# Each value is converted with to_f.
def read_feature_string(s)
  features = SparseVector.new
  tokenize(s).each do |pair|
    name, weight = pair.split('=')
    features[name] = weight.to_f
  end
  features
end
+