summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Patrick Simianer <p@simianer.de> 2015-12-23 13:53:03 +0100
committer Patrick Simianer <p@simianer.de> 2015-12-23 13:53:03 +0100
commit c69080adb7cf6dbf25c0ed1129fe988163bc26fd (patch)
tree 84afcb89ea95169233019387c7f2f8a2f2936794
parent 5cda5278ae5a40be73feb34be7f145c2728be779 (diff)
hadoop_uniq: uniq with hadoop-streaming
-rwxr-xr-x hadoop_uniq 11
1 files changed, 11 insertions, 0 deletions
diff --git a/hadoop_uniq b/hadoop_uniq
new file mode 100755
index 0000000..5052419
--- /dev/null
+++ b/hadoop_uniq
@@ -0,0 +1,11 @@
+#!/bin/zsh
+ # Hadoop-streaming job: takes field 1 of every line under input path `d`, runs uniq over it, and writes the result to `d.uniq`.
+HADOOP_HOME=/usr/lib/hadoop # install prefix, used below to locate bin/hadoop
+ # 98 reduce tasks; mapper keeps the first space-delimited field; reducer is /usr/bin/uniq, which only collapses adjacent duplicates (presumably relies on each reducer receiving its keys sorted; confirm).
+$HADOOP_HOME/bin/hadoop jar /usr/lib/hadoop-mapreduce/hadoop-streaming.jar \
+ -D mapred.reduce.tasks=98 \
+ -input d \
+ -output d.uniq \
+ -mapper 'cut -d " " -f 1' \
+ -reducer /usr/bin/uniq
+