diff options
-rwxr-xr-x | convert_to_svmlight_format | 21 | ||||
-rwxr-xr-x | feature_dict | 23 |
2 files changed, 44 insertions, 0 deletions
# ==========================================================================
# File: convert_to_svmlight_format (new file, mode 100755)
# ==========================================================================
#!/usr/bin/env ruby

# Converts whitespace-separated "feature=value" tokens read from STDIN (one
# example per line) into SVMlight-style lines on STDOUT:
#
#   +1 <index>:<value> <index>:<value> ...
#
# Usage: convert_to_svmlight_format FEATURE_DICT < input > output
#
# FEATURE_DICT is a Marshal dump of a Hash mapping feature name => integer id
# (the feature_dict script writes such a dump, with ids starting at 0). Ids
# are shifted by one on output, pairs are sorted by index, and every example
# is written with the label "+1".

require 'zipf' # provides ReadFile

abort "usage: #{$PROGRAM_NAME} FEATURE_DICT < input > output" if ARGV.empty?

# NOTE(review): Marshal.load can instantiate arbitrary objects -- only feed it
# dumps that come from a trusted source.
fd = Marshal.load ReadFile.read ARGV[0]

l_i = 1
while (line = STDIN.gets)
  # Progress goes to STDERR: STDOUT carries the converted data, and a bare
  # counter line printed there would corrupt it.
  STDERR.puts l_i if (l_i % 1000).zero?

  pairs = []
  line.split.each do |token|
    name, value = token.split '=', 2
    id = fd[name]
    if id.nil?
      # A feature that is missing from the dictionary has no index; skip it
      # with a warning instead of crashing on nil + 1.
      STDERR.puts "line #{l_i}: unknown feature #{name.inspect}, skipped"
      next
    end
    # A token without '=' has no value part; nil.to_f yields 0.0.
    pairs << [id + 1, value.to_f]
  end
  pairs.sort_by!(&:first)

  puts "+1 #{pairs.map { |index, value| "#{index}:#{value}" }.join(' ')}"
  l_i += 1
end

# ==========================================================================
# File: feature_dict (new file, mode 100755)
# ==========================================================================
#!/usr/bin/env ruby

# Builds a feature dictionary from whitespace-separated "feature=value" tokens
# read from STDIN: each distinct feature name is assigned the next free
# integer id (starting at 0) in order of first appearance. The resulting Hash
# is written to "fd.dump" in the current directory as a Marshal dump.
#
# The current line number is reported on STDERR for every input line, and the
# final dictionary size is reported once at the end.

feature_dict = {}
l_i = 1
while (line = STDIN.gets)
  STDERR.write "#{l_i}\n"
  line.split.each do |token|
    name, = token.split('=', 2)
    # The number of entries so far is exactly the next unused id.
    feature_dict[name] = feature_dict.size unless feature_dict.key? name
  end
  l_i += 1
end

# Marshal data is binary, so open in binary mode; the block form closes the
# file even if the write raises.
File.open('fd.dump', 'wb') { |io| Marshal.dump(feature_dict, io) }

STDERR.write "size = #{feature_dict.size}\n"