diff options
Diffstat (limited to 'hadoop/cacheArchive')
-rw-r--r-- | hadoop/cacheArchive/input | 8 |
-rw-r--r-- | hadoop/cacheArchive/mapper.py | 15 |
-rw-r--r-- | hadoop/cacheArchive/my_module.zip | bin | 0 -> 524 bytes |
-rw-r--r-- | hadoop/cacheArchive/my_module/__init__.py | 0 |
-rw-r--r-- | hadoop/cacheArchive/my_module/mod_a.py | 3 |
-rw-r--r-- | hadoop/cacheArchive/my_module/mod_b.py | 3 |
-rw-r--r-- | hadoop/cacheArchive/other_module.zip | bin | 0 -> 347 bytes |
-rw-r--r-- | hadoop/cacheArchive/other_module/__init__.py | 0 |
-rw-r--r-- | hadoop/cacheArchive/other_module/other.py | 3 |
-rw-r--r-- | hadoop/cacheArchive/output/part-00000 | 12 |
-rw-r--r-- | hadoop/cacheArchive/output/part-00001 | 12 |
-rwxr-xr-x | hadoop/cacheArchive/streaming.sh | 28 |
12 files changed, 84 insertions, 0 deletions
diff --git a/hadoop/cacheArchive/input b/hadoop/cacheArchive/input
new file mode 100644
index 0000000..71ac1b5
--- /dev/null
+++ b/hadoop/cacheArchive/input
@@ -0,0 +1,8 @@
+a
+b
+c
+d
+e
+f
+g
+h
diff --git a/hadoop/cacheArchive/mapper.py b/hadoop/cacheArchive/mapper.py
new file mode 100644
index 0000000..a7dd9f4
--- /dev/null
+++ b/hadoop/cacheArchive/mapper.py
@@ -0,0 +1,15 @@
+#!/usr/bin/env python2
+
+import sys
+
+sys.path.append('.')
+import my_module, other_module
+from my_module import mod_a
+from my_module import mod_b
+from other_module import other
+
+for line in sys.stdin:
+ mod_a.bla(line)
+ mod_b.blubb(line)
+ other.foo(line)
+
diff --git a/hadoop/cacheArchive/my_module.zip b/hadoop/cacheArchive/my_module.zip
Binary files differ
new file mode 100644
index 0000000..65d960b
--- /dev/null
+++ b/hadoop/cacheArchive/my_module.zip
diff --git a/hadoop/cacheArchive/my_module/__init__.py b/hadoop/cacheArchive/my_module/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/hadoop/cacheArchive/my_module/__init__.py
diff --git a/hadoop/cacheArchive/my_module/mod_a.py b/hadoop/cacheArchive/my_module/mod_a.py
new file mode 100644
index 0000000..ab65e6e
--- /dev/null
+++ b/hadoop/cacheArchive/my_module/mod_a.py
@@ -0,0 +1,3 @@
+def bla(a):
+ print a.strip()
+
diff --git a/hadoop/cacheArchive/my_module/mod_b.py b/hadoop/cacheArchive/my_module/mod_b.py
new file mode 100644
index 0000000..d684220
--- /dev/null
+++ b/hadoop/cacheArchive/my_module/mod_b.py
@@ -0,0 +1,3 @@
+def blubb(a):
+ print a.strip()
+
diff --git a/hadoop/cacheArchive/other_module.zip b/hadoop/cacheArchive/other_module.zip
Binary files differ
new file mode 100644
index 0000000..af99f7d
--- /dev/null
+++ b/hadoop/cacheArchive/other_module.zip
diff --git a/hadoop/cacheArchive/other_module/__init__.py b/hadoop/cacheArchive/other_module/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/hadoop/cacheArchive/other_module/__init__.py
diff --git a/hadoop/cacheArchive/other_module/other.py b/hadoop/cacheArchive/other_module/other.py
new file mode 100644
index 0000000..fa55d0b
--- /dev/null
+++ b/hadoop/cacheArchive/other_module/other.py
@@ -0,0 +1,3 @@
+def foo(a):
+ print a.strip()
+
diff --git a/hadoop/cacheArchive/output/part-00000 b/hadoop/cacheArchive/output/part-00000
new file mode 100644
index 0000000..89955ae
--- /dev/null
+++ b/hadoop/cacheArchive/output/part-00000
@@ -0,0 +1,12 @@
+a
+a
+a
+b
+b
+b
+c
+c
+c
+d
+d
+d
diff --git a/hadoop/cacheArchive/output/part-00001 b/hadoop/cacheArchive/output/part-00001
new file mode 100644
index 0000000..04576d4
--- /dev/null
+++ b/hadoop/cacheArchive/output/part-00001
@@ -0,0 +1,12 @@
+e
+e
+e
+f
+f
+f
+g
+g
+g
+h
+h
+h
diff --git a/hadoop/cacheArchive/streaming.sh b/hadoop/cacheArchive/streaming.sh
new file mode 100755
index 0000000..6bc9cda
--- /dev/null
+++ b/hadoop/cacheArchive/streaming.sh
@@ -0,0 +1,28 @@
+#!/bin/sh
+
+HADOOP_HOME=/usr/lib/hadoop
+HADOOP_VERSION=0.20.2-cdh3u1
+JAR=contrib/streaming/hadoop-streaming-$HADOOP_VERSION.jar
+HSTREAMING="$HADOOP_HOME/bin/hadoop jar $HADOOP_HOME/$JAR"
+
+hadoop dfs -put input input
+hadoop dfs -put my_module.zip my_module.zip
+hadoop dfs -put other_module.zip other_module.zip
+
+IN=input
+OUT=output
+
+$HSTREAMING \
+ -input $IN\
+ -output $OUT \
+ -mapper "python mapper.py" \
+ -reducer "NONE" \
+ -file mapper.py\
+ -cacheArchive 'hdfs:///user/simianer/my_module.zip#my_module' \
+ -cacheArchive 'hdfs:///user/simianer/other_module.zip#other_module' \
+ -jobconf mapred.reduce.tasks=30 #hier mal 30 statt 3
+
+hadoop dfs -get $OUT .
+hadoop dfs -rm $IN
+hadoop dfs -rmr $OUT
+