summaryrefslogtreecommitdiff
path: root/dtrain/hstreaming
diff options
context:
space:
mode:
Diffstat (limited to 'dtrain/hstreaming')
-rwxr-xr-xdtrain/hstreaming/rule_count/map.sh4
-rw-r--r--dtrain/hstreaming/rule_count/red.rb22
-rw-r--r--dtrain/hstreaming/rule_count/rulecount.rb11
-rw-r--r--dtrain/hstreaming/rule_count/test8
4 files changed, 45 insertions, 0 deletions
diff --git a/dtrain/hstreaming/rule_count/map.sh b/dtrain/hstreaming/rule_count/map.sh
new file mode 100755
index 00000000..ae75fece
--- /dev/null
+++ b/dtrain/hstreaming/rule_count/map.sh
@@ -0,0 +1,4 @@
+#!/bin/sh
+
+ruby rulecount.rb | sort | ruby red.rb
+
diff --git a/dtrain/hstreaming/rule_count/red.rb b/dtrain/hstreaming/rule_count/red.rb
new file mode 100644
index 00000000..8f9109cc
--- /dev/null
+++ b/dtrain/hstreaming/rule_count/red.rb
@@ -0,0 +1,22 @@
+STDIN.set_encoding 'utf-8'
+STDOUT.set_encoding 'utf-8'
+
+def output(key, val)
+ puts "#{key}\t#{val}"
+end
+
+prev_key = nil
+sum = 0
+while line = STDIN.gets
+ key, val = line.strip.split /\t/
+ if key != prev_key && sum > 0
+ output prev_key, sum
+ prev_key = key
+ sum = 0
+ elsif !prev_key
+ prev_key = key
+ end
+ sum += val.to_i
+end
+output prev_key, sum
+
diff --git a/dtrain/hstreaming/rule_count/rulecount.rb b/dtrain/hstreaming/rule_count/rulecount.rb
new file mode 100644
index 00000000..035bdf06
--- /dev/null
+++ b/dtrain/hstreaming/rule_count/rulecount.rb
@@ -0,0 +1,11 @@
+STDIN.set_encoding 'utf-8'
+STDOUT.set_encoding 'utf-8'
+
+while line = STDIN.gets
+ a = line.strip.chomp.split "\t"
+ a[3..a.size].each { |r|
+ id = r.split("|||")[0..2].join("|||").to_s.strip.gsub("\s", "_")
+ puts "#{id}\t1"
+ }
+end
+
diff --git a/dtrain/hstreaming/rule_count/test b/dtrain/hstreaming/rule_count/test
new file mode 100644
index 00000000..acd00a5e
--- /dev/null
+++ b/dtrain/hstreaming/rule_count/test
@@ -0,0 +1,8 @@
+a 1
+a 1
+a 1
+b 1
+b 1
+c 1
+d 1
+a 1