1. 실행

# Launch a Hadoop Streaming job that uses mapper.py / reducer.py.
# Every continued line must END with a backslash (no blank lines in between),
# otherwise the shell terminates the command early.
# -file names the local mapper.py / reducer.py scripts alongside -mapper / -reducer.
"$HADOOP_PREFIX/bin/hadoop" jar "$HADOOP_PREFIX/share/hadoop/tools/lib/hadoop-streaming-2.2.0.jar" \
    -input myInputDir \
    -output myOutputDir \
    -mapper mapper.py \
    -reducer reducer.py \
    -file mapper.py \
    -file reducer.py



1) mapper 

# NOTE: this section is titled "mapper" but contains the job launch command;
# the mapper.py source itself does not appear here.
# Every continued line must END with a backslash, otherwise the shell
# terminates the command early.
"$HADOOP_PREFIX/bin/hadoop" jar "$HADOOP_PREFIX/share/hadoop/tools/lib/hadoop-streaming-2.2.0.jar" \
    -input myInputDir \
    -output myOutputDir \
    -mapper mapper.py \
    -reducer reducer.py \
    -file mapper.py \
    -file reducer.py


2) reducer

from operator import itemgetter
import sys

def reduce_counts(lines):
    """Sum the counts of consecutive lines that share the same word.

    Each input line is expected to hold two whitespace-separated fields,
    ``word`` and an integer ``count``.  Only *adjacent* lines with the same
    word are merged, so the input must already be grouped by word (Hadoop
    Streaming sorts mapper output by key before it reaches the reducer).

    Lines that do not have exactly two fields, or whose count is not an
    integer, are skipped.

    Args:
        lines: Iterable of text lines (e.g. ``sys.stdin``).

    Yields:
        ``(word, total)`` tuples, one per group, in input order.
    """
    current_word = None
    current_count = 0

    for line in lines:
        try:
            # Both the unpacking (wrong number of fields) and int() raise
            # ValueError, so one handler covers every malformed line.
            word, count = line.strip().split()
            count = int(count)
        except ValueError:
            continue

        if word != current_word:
            # A new group starts: emit the finished one first.
            if current_word is not None:
                yield current_word, current_count
            current_word = word
            current_count = 0

        current_count += count

    # Emit the last group.  Test the word, not the count, so a final group
    # whose total is 0 is reported just like any earlier group would be.
    if current_word is not None:
        yield current_word, current_count


def main():
    """Read mapper output from stdin and print ``word<TAB>total`` lines."""
    for word, total in reduce_counts(sys.stdin):
        print('%s\t%d' % (word, total))


if __name__ == '__main__':
    main()



'NoSQL > Hadoop' 카테고리의 다른 글

yarn 구조  (0) 2017.03.08
hadoop read & write  (0) 2017.03.06
hadoop streaming  (0) 2017.03.06
hadoop locality  (0) 2017.03.06
hadoop distcp  (0) 2017.03.02

+ Recent posts