diff options
Diffstat (limited to 'dtrain/hstreaming')
-rwxr-xr-x | dtrain/hstreaming/rule_count/map.sh | 4 | ||||
-rw-r--r-- | dtrain/hstreaming/rule_count/red.rb | 22 | ||||
-rw-r--r-- | dtrain/hstreaming/rule_count/rulecount.rb | 11 | ||||
-rw-r--r-- | dtrain/hstreaming/rule_count/test | 8 |
4 files changed, 45 insertions, 0 deletions
diff --git a/dtrain/hstreaming/rule_count/map.sh b/dtrain/hstreaming/rule_count/map.sh new file mode 100755 index 00000000..ae75fece --- /dev/null +++ b/dtrain/hstreaming/rule_count/map.sh @@ -0,0 +1,4 @@ +#!/bin/sh + +ruby rulecount.rb | sort | ruby red.rb + diff --git a/dtrain/hstreaming/rule_count/red.rb b/dtrain/hstreaming/rule_count/red.rb new file mode 100644 index 00000000..8f9109cc --- /dev/null +++ b/dtrain/hstreaming/rule_count/red.rb @@ -0,0 +1,22 @@ +STDIN.set_encoding 'utf-8' +STDOUT.set_encoding 'utf-8' + +def output(key, val) + puts "#{key}\t#{val}" +end + +prev_key = nil +sum = 0 +while line = STDIN.gets + key, val = line.strip.split /\t/ + if key != prev_key && sum > 0 + output prev_key, sum + prev_key = key + sum = 0 + elsif !prev_key + prev_key = key + end + sum += val.to_i +end +output prev_key, sum + diff --git a/dtrain/hstreaming/rule_count/rulecount.rb b/dtrain/hstreaming/rule_count/rulecount.rb new file mode 100644 index 00000000..035bdf06 --- /dev/null +++ b/dtrain/hstreaming/rule_count/rulecount.rb @@ -0,0 +1,11 @@ +STDIN.set_encoding 'utf-8' +STDOUT.set_encoding 'utf-8' + +while line = STDIN.gets + a = line.strip.chomp.split "\t" + a[3..a.size].each { |r| + id = r.split("|||")[0..2].join("|||").to_s.strip.gsub("\s", "_") + puts "#{id}\t1" + } +end + diff --git a/dtrain/hstreaming/rule_count/test b/dtrain/hstreaming/rule_count/test new file mode 100644 index 00000000..acd00a5e --- /dev/null +++ b/dtrain/hstreaming/rule_count/test @@ -0,0 +1,8 @@ +a 1 +a 1 +a 1 +b 1 +b 1 +c 1 +d 1 +a 1 |