summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPatrick Simianer <p@simianer.de>2015-01-31 16:27:03 +0100
committerPatrick Simianer <p@simianer.de>2015-01-31 16:27:03 +0100
commit9071a7f2c15c2eddd925edcb62db2794ccad6c50 (patch)
treefded8e709a059b6c3832189bb7332d425f47dc76
parent382da51dbb316b073da1ea48193139ee1a9f71fd (diff)
tools
-rwxr-xr-xconvert_to_svmlight_format21
-rwxr-xr-xfeature_dict23
2 files changed, 44 insertions, 0 deletions
diff --git a/convert_to_svmlight_format b/convert_to_svmlight_format
new file mode 100755
index 0000000..7e5c538
--- /dev/null
+++ b/convert_to_svmlight_format
@@ -0,0 +1,21 @@
+#!/usr/bin/env ruby
+
+require 'zipf'
+
+# Converts whitespace-separated "name=value" tokens on STDIN to svmlight lines
+# on STDOUT (label is always "+1"). ARGV[0]: Marshal dump of a name => id Hash.
+# NOTE(security): Marshal.load can run arbitrary code; only load trusted dumps.
+fd = Marshal.load ReadFile.read ARGV[0]
+l_i = 1
+while line = STDIN.gets
+  # Progress goes to STDERR so it can never end up inside the data on STDOUT.
+  STDERR.puts l_i if l_i%1000==0
+  s = line.split.map { |i|
+    k,w = i.split '=', 2
+    [fd.fetch(k)+1, w.to_f]  # KeyError names an unknown feature; ids shift by 1
+  }
+  s.sort_by! { |i| i.first }
+  puts "+1 #{s.map{|i| "#{i.first}:#{i[1]}" }.join(' ')}"
+  l_i+= 1
+end
+
diff --git a/feature_dict b/feature_dict
new file mode 100755
index 0000000..d0ebb7a
--- /dev/null
+++ b/feature_dict
@@ -0,0 +1,23 @@
+#!/usr/bin/env ruby
+
+# Builds a feature dictionary from whitespace-separated "name=value" tokens
+# read from STDIN: each distinct feature name gets the next 0-based integer id.
+# The resulting Hash is marshalled to fd.dump in the current directory.
+feature_dict = {}
+l_i = 1
+while line = STDIN.gets
+  STDERR.write "#{l_i}\n"  # progress: number of the line being processed
+  line.split.each { |i|
+    f, _v = i.split('=', 2)
+    # ids are handed out in order of first appearance, so size is the next id
+    feature_dict[f] = feature_dict.size unless feature_dict.has_key? f
+  }
+  l_i += 1
+end
+
+# Marshal output is binary: 'wb' prevents newline/encoding translation, and
+# the block form closes the file even if the write raises.
+File.open("fd.dump", 'wb') { |out| out.write Marshal.dump feature_dict }
+
+STDERR.write "size = #{feature_dict.size}\n"
+