diff options
author | Patrick Simianer <p@simianer.de> | 2015-12-23 13:53:03 +0100 |
---|---|---|
committer | Patrick Simianer <p@simianer.de> | 2015-12-23 13:53:03 +0100 |
commit | c69080adb7cf6dbf25c0ed1129fe988163bc26fd (patch) | |
tree | 84afcb89ea95169233019387c7f2f8a2f2936794 | |
parent | 5cda5278ae5a40be73feb34be7f145c2728be779 (diff) |
hadoop_uniq: uniq with hadoop-streaming
-rwxr-xr-x | hadoop_uniq | 11 |
1 files changed, 11 insertions, 0 deletions
```diff
diff --git a/hadoop_uniq b/hadoop_uniq
new file mode 100755
index 0000000..5052419
--- /dev/null
+++ b/hadoop_uniq
@@ -0,0 +1,11 @@
+#!/bin/zsh
+
+HADOOP_HOME=/usr/lib/hadoop
+
+$HADOOP_HOME/bin/hadoop jar /usr/lib/hadoop-mapreduce/hadoop-streaming.jar \
+  -D mapred.reduce.tasks=98 \
+  -input d \
+  -output d.uniq \
+  -mapper 'cut -d " " -f 1' \
+  -reducer /usr/bin/uniq
+
```