Hadoop

Prerequisites

  • java >= 7 (OpenJDK is ok)
  • hadoop bin package == 2.7.1
  • ssh server (e.g. openssh-server)
  • rsync (the Hadoop documentation lists it as a requirement)

Hadoop as Pseudo-Distributed Environment - Install and config

Creating hadoop user

sudo su -
groupadd --gid 11001 hadoop
mkdir -p /srv/hadoop/ -m 755
useradd --uid 11001 --gid hadoop --no-user-group --shell /bin/bash --create-home --home-dir /srv/hadoop/home hadoop
chown hadoop:hadoop /srv/hadoop
exit

Creating common ssh authentication for hadoop user

sudo su - hadoop
mkdir .ssh
ssh-keygen -t rsa -P "" -f .ssh/id_rsa
cat .ssh/id_rsa.pub >> .ssh/authorized_keys
exit
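
  • Optionally, check that passwordless ssh to localhost works for the hadoop user (accept the host key the first time); the Hadoop start scripts rely on it:
sudo su - hadoop
ssh localhost 'echo passwordless ssh ok'
exit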

Installing hadoop binary package

sudo su - hadoop
cd /srv/hadoop
wget -c http://www.us.apache.org/dist/hadoop/common/hadoop-2.7.1/hadoop-2.7.1.tar.gz
tar -xzf hadoop-2.7.1.tar.gz
ln -s hadoop-2.7.1 hadoop-current
# quote the heredoc delimiter so $PATH and $HADOOP_HOME are expanded at login time, not now
cat >> ~/.profile << 'EOF'

# Set the Hadoop Related Environment variables
export HADOOP_HOME=/srv/hadoop/hadoop-current
export PATH=$PATH:$HADOOP_HOME/bin:$HADOOP_HOME/sbin

# Set the JAVA_HOME
export JAVA_HOME=/usr/lib/jvm/java-7-openjdk-amd64
EOF
exit
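
  • Optionally, check that the new environment is picked up by the next login shell; hadoop version should report release 2.7.1:
sudo su - hadoop
hadoop version
exit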

Configuring hadoop daemons

sudo su - hadoop
#
cp -a $HADOOP_HOME/etc/hadoop/hadoop-env.sh $HADOOP_HOME/etc/hadoop/hadoop-env.sh.old
cat $HADOOP_HOME/etc/hadoop/hadoop-env.sh.old | sed 's|export JAVA_HOME=${JAVA_HOME}|export JAVA_HOME=/usr/lib/jvm/java-7-openjdk-amd64|g' > $HADOOP_HOME/etc/hadoop/hadoop-env.sh
rm $HADOOP_HOME/etc/hadoop/hadoop-env.sh.old
#
# change log size
nano $HADOOP_HOME/etc/hadoop/log4j.properties
# in section Rolling File Appender set
# hadoop.log.maxfilesize=10MB
# hadoop.log.maxbackupindex=100
#
# from https://support.pivotal.io/hc/en-us/articles/202296718-How-to-change-Hadoop-daemon-log4j-properties
echo >> $HADOOP_HOME/etc/hadoop/hadoop-env.sh
echo "# configure log to console and RFA file" >> $HADOOP_HOME/etc/hadoop/hadoop-env.sh
echo "export HADOOP_ROOT_LOGGER=\"INFO,console,RFA\"" >> $HADOOP_HOME/etc/hadoop/hadoop-env.sh
echo >> $HADOOP_HOME/etc/hadoop/mapred-env.sh
echo "# configure log to console and RFA file" >> $HADOOP_HOME/etc/hadoop/mapred-env.sh
echo "export HADOOP_MAPRED_ROOT_LOGGER=\"INFO,console,RFA\"" >> $HADOOP_HOME/etc/hadoop/mapred-env.sh
echo >> $HADOOP_HOME/etc/hadoop/yarn-env.sh
echo "# configure log to console and RFA file" >> $HADOOP_HOME/etc/hadoop/yarn-env.sh
echo "export YARN_ROOT_LOGGER=\"INFO,console,RFA\"" >> $HADOOP_HOME/etc/hadoop/yarn-env.sh
#
cat > $HADOOP_HOME/etc/hadoop/core-site.xml << EOF
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
    <property>
        <name>fs.defaultFS</name>
        <value>hdfs://localhost:9000</value>
    </property>
    <property>
        <name>hadoop.tmp.dir</name>
        <value>/srv/hadoop/tmp_data</value>
    </property>
</configuration>
EOF
#
cp -a $HADOOP_HOME/etc/hadoop/mapred-site.xml.template $HADOOP_HOME/etc/hadoop/mapred-site.xml
cat > $HADOOP_HOME/etc/hadoop/mapred-site.xml << EOF
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
    <property>
        <name>mapreduce.framework.name</name>
        <value>yarn</value>
    </property>
</configuration>
EOF
#
cat > $HADOOP_HOME/etc/hadoop/hdfs-site.xml << EOF
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
    <property>
        <name>dfs.replication</name>
        <value>1</value>
    </property>
</configuration>
EOF
#
cat > $HADOOP_HOME/etc/hadoop/yarn-site.xml << EOF
<?xml version="1.0"?>
<configuration>
    <property>
        <name>yarn.nodemanager.aux-services</name>
        <value>mapreduce_shuffle</value>
    </property>
    <property>
        <name>yarn.nodemanager.aux-services.mapreduce_shuffle.class</name>
        <value>org.apache.hadoop.mapred.ShuffleHandler</value>
    </property>
    <property>
        <name>yarn.resourcemanager.resource-tracker.address</name>
        <value>localhost:8025</value>
    </property>
    <property>
        <name>yarn.resourcemanager.scheduler.address</name>
        <value>localhost:8030</value>
    </property>
    <property>
        <name>yarn.resourcemanager.address</name>
        <value>localhost:8050</value>
    </property>
</configuration>
EOF
#
exit

Manual startup

sudo su - hadoop
hdfs namenode -format  # format HDFS, only the first time
start-dfs.sh
start-yarn.sh
exit
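
  • If everything started correctly, jps (shipped with the JDK) should list at least NameNode, DataNode, SecondaryNameNode, ResourceManager and NodeManager; the web interfaces are usually reachable on http://localhost:50070 (HDFS) and http://localhost:8088 (YARN):
sudo su - hadoop
jps
exit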

Hadoop as Cluster - Install and config

Define hostnames on all hosts if they are not already well defined (Master and slaves)

  • change /etc/hostname
  • run the following command to apply the new hostname
hostname `cat /etc/hostname`

Define Hadoop node IPs and names on all hosts (Master and Workers)

  • NOTE: configuring Hadoop with raw IP addresses instead of hostnames does not appear to work, so we will map the IPs in /etc/hosts and then use hostnames

- http://stackoverflow.com/questions/28381807/hadoop-slaves-file-regard-ip-as-hostname

- https://raseshmori.wordpress.com/2012/10/14/install-hadoop-nextgen-yarn-multi-node-cluster/ (see "15. Possible errors")

- http://wiki.apache.org/hadoop/UnknownHost


  • However, you may encounter several communication problems. A typical source of trouble is an /etc/hosts file like
127.0.0.1 localhost
127.0.1.1 myhostname

10.1.1.5 myhostname


if you use myhostname in the configuration above, the master server appears to listen only on 127.0.1.1!

so we will add Hadoop-specific hostnames

  • additional warning: do not use the '_' character in hostnames!


  • Add something like the following example at the end of the file /etc/hosts (if not already configured), where MASTER_HOSTNAME or NODE_HOSTNAME_* are the real names used in /etc/hostname:
# Hadoop needs the following specific config
10.1.1.120      MASTER_HOSTNAME     hadoopmaster
10.1.1.121      NODE_HOSTNAME_1     hadoopnode1
10.1.1.122      NODE_HOSTNAME_2     hadoopnode2
...
10.1.1.12N      NODE_HOSTNAME_N     hadoopnodeN
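
  • Optionally, check on every host that the new names resolve to the expected addresses, for example:
getent hosts hadoopmaster
ping -c 1 hadoopnode1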

Commands executed at Master host

Creating hadoop user

sudo groupadd --gid 11001 hadoop
sudo mkdir -p /srv/hadoop -m 755
sudo useradd --uid 11001 --gid hadoop --no-user-group --shell /bin/bash --create-home --home-dir /srv/hadoop/home hadoop

Creating common ssh authentication for hadoop user

  • NOTE: hadoop uses ssh to communicate, so a passwordless key pair is needed
sudo su - hadoop
mkdir .ssh
ssh-keygen -t rsa -P "" -f .ssh/id_rsa                                          # create the key (the same key will be used on all nodes)
cat .ssh/id_rsa.pub >> .ssh/authorized_keys                                     # authorize the key on this host (the authorization will be shared with all nodes)
tar -czf ssh.tgz .ssh
exit
  • copy the file to all nodes
scp /srv/hadoop/home/ssh.tgz NODE_HOSTNAME_1:/tmp                               # copy to node 1
scp /srv/hadoop/home/ssh.tgz NODE_HOSTNAME_2:/tmp                               # copy to node 2
...                                                                             # ...
scp /srv/hadoop/home/ssh.tgz NODE_HOSTNAME_N:/tmp                               # copy to node N
sudo rm /srv/hadoop/home/ssh.tgz                                                # remove the file

Commands executed at any Slave host

Creating hadoop user

sudo groupadd --gid 11001 hadoop
sudo mkdir -p /srv/hadoop -m 755
sudo useradd --uid 11001 --gid hadoop --no-user-group --shell /bin/bash --create-home --home-dir /srv/hadoop/home hadoop

Setting ssh and environment configs

sudo chown -R hadoop:hadoop /tmp/ssh.tgz
sudo mv /tmp/ssh.tgz /srv/hadoop/home
sudo su - hadoop
tar -xzf ssh.tgz
rm -f ssh.tgz
exit
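
  • Back on the Master host, you can check that the hadoop user reaches each node without a password (accept the host keys the first time), for example:
sudo su - hadoop
ssh NODE_HOSTNAME_1 hostname
exit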

Commands executed at Master host

Configuring common hadoop env variables

  • as the hadoop user, add the following variables to ~/.profile
sudo su - hadoop
nano .profile
...
export JAVA_HOME=/usr/lib/jvm/java-7-openjdk-amd64 # or /usr/lib/jvm/java-7-oracle
export HADOOP_PREFIX=/srv/hadoop
export HADOOP_HOME=$HADOOP_PREFIX
export HADOOP_MAPRED_HOME=$HADOOP_PREFIX
export HADOOP_COMMON_HOME=$HADOOP_PREFIX
export HADOOP_HDFS_HOME=$HADOOP_PREFIX
export HADOOP_CONF_DIR=${HADOOP_PREFIX}"/etc/hadoop"
export HADOOP_YARN_HOME=$HADOOP_PREFIX
export PATH=$PATH:$HADOOP_PREFIX/bin
export PATH=$PATH:$HADOOP_PREFIX/sbin
...
exit
  • copy the file to all nodes
sudo su - hadoop
scp .profile NODE_HOSTNAME_1:                                                   # copy to node 1
scp .profile NODE_HOSTNAME_2:                                                   # copy to node 2
...                                                                             # ...
scp .profile NODE_HOSTNAME_N:                                                   # copy to node N
exit

Installing hadoop binary package

sudo su - hadoop
wget -c http://www.us.apache.org/dist/hadoop/common/hadoop-2.7.1/hadoop-2.7.1.tar.gz
exit
cd /srv/hadoop/home/
sudo tar -xzf hadoop-2.7.1.tar.gz -C /srv/
sudo mv /srv/hadoop-2.7.1/* /srv/hadoop/
sudo rmdir /srv/hadoop-2.7.1/
sudo chown -R hadoop:hadoop /srv/hadoop/

Copy binary package to all Worker nodes

sudo su - hadoop
scp hadoop-2.7.1.tar.gz NODE_HOSTNAME_1:                                        # copy to node 1
scp hadoop-2.7.1.tar.gz NODE_HOSTNAME_2:                                        # copy to node 2
...                                                                             # ...
scp hadoop-2.7.1.tar.gz NODE_HOSTNAME_N:                                        # copy to node N
exit

Configuring hadoop servers

Change default JAVA_HOME and HADOOP_PREFIX in /srv/hadoop/etc/hadoop/hadoop-env.sh

# set to the root of your Java installation
export JAVA_HOME=/usr/lib/jvm/java-7-openjdk-amd64 # or /usr/lib/jvm/java-7-oracle

# Assuming your installation directory is /srv/hadoop
export HADOOP_PREFIX=/srv/hadoop
export HADOOP_CONF_DIR=${HADOOP_PREFIX}"/etc/hadoop"

Create Master and Slave directories

sudo su - hadoop
mkdir -pv /srv/hadoop/data/namenode
mkdir -pv /srv/hadoop/data/datanode
mkdir -pv /srv/hadoop/logs
exit

Configure logger

  • Change the log size
nano /srv/hadoop/etc/hadoop/log4j.properties
# in section Rolling File Appender set
# hadoop.log.maxfilesize=10MB
# hadoop.log.maxbackupindex=100
  • Change the log type
# from https://support.pivotal.io/hc/en-us/articles/202296718-How-to-change-Hadoop-daemon-log4j-properties
echo >> /srv/hadoop/etc/hadoop/hadoop-env.sh
echo "# configure log to console and RFA file" >> /srv/hadoop/etc/hadoop/hadoop-env.sh
echo "export HADOOP_ROOT_LOGGER=\"INFO,console,RFA\"" >> /srv/hadoop/etc/hadoop/hadoop-env.sh
echo >> /srv/hadoop/etc/hadoop/mapred-env.sh
echo "# configure log to console and RFA file" >> /srv/hadoop/etc/hadoop/mapred-env.sh
echo "export HADOOP_MAPRED_ROOT_LOGGER=\"INFO,console,RFA\"" >> /srv/hadoop/etc/hadoop/mapred-env.sh
echo >> /srv/hadoop/etc/hadoop/yarn-env.sh
echo "# configure log to console and RFA file" >> /srv/hadoop/etc/hadoop/yarn-env.sh
echo "export YARN_ROOT_LOGGER=\"INFO,console,RFA\"" >> /srv/hadoop/etc/hadoop/yarn-env.sh

Define Master configuration tags /srv/hadoop/etc/hadoop/core-site.xml

    <property>
        <name>fs.defaultFS</name>
        <value>hdfs://hadoopmaster:9000</value>
        <description>NameNode URI</description>
    </property>

Define Common hosts configuration tags /srv/hadoop/etc/hadoop/hdfs-site.xml

    <property>
        <name>dfs.replication</name>
        <value>1</value>
    </property>

    <property>
        <name>dfs.namenode.name.dir</name>
        <value>file:///srv/hadoop/data/namenode</value>
        <description>NameNode directory for namespace and transaction logs storage.</description>
    </property>

    <property>
        <name>dfs.datanode.data.dir</name>
        <value>file:///srv/hadoop/data/datanode</value>
        <description>DataNode directory</description>
    </property>

    <property>
        <name>dfs.permissions</name>
        <value>false</value>
    </property>

    <property>
        <name>dfs.datanode.use.datanode.hostname</name>
        <value>false</value>
    </property>

    <property>
        <name>dfs.namenode.datanode.registration.ip-hostname-check</name>
        <value>false</value>
    </property>

Define the Slave host names in /srv/hadoop/etc/hadoop/slaves

NODE_HOSTNAME_1
NODE_HOSTNAME_2
...
NODE_HOSTNAME_N

Define MapReduce configuration tags /srv/hadoop/etc/hadoop/mapred-site.xml

sudo su - hadoop
cp /srv/hadoop/etc/hadoop/mapred-site.xml.template /srv/hadoop/etc/hadoop/mapred-site.xml
exit
    <property>
        <name>mapreduce.framework.name</name>
        <value>yarn</value>
    </property>

    <property>
        <name>mapred.task.timeout</name>
        <value>0</value>
    </property>

    <property>
        <name>mapreduce.map.memory.mb</name>
        <value>1536</value>
    </property>

    <property>
        <name>mapreduce.map.java.opts</name>
        <value>-Xmx1024M</value>
    </property>

    <property>
        <name>mapreduce.reduce.memory.mb</name>
        <value>3072</value>
    </property>

    <property>
        <name>mapreduce.reduce.java.opts</name>
        <value>-Xmx2560M</value>
    </property>

Define YARN scheduler configuration tags /srv/hadoop/etc/hadoop/yarn-site.xml

    <property>
        <name>yarn.nodemanager.aux-services</name>
        <value>mapreduce_shuffle</value>
    </property>

    <property>
        <name>yarn.nodemanager.aux-services.mapreduce_shuffle.class</name>
        <value>org.apache.hadoop.mapred.ShuffleHandler</value>
    </property>

    <property>
        <name>yarn.nodemanager.log.retain-seconds</name>
        <value>259200</value>
    </property>

    <property>
        <name>yarn.resourcemanager.scheduler.address</name>
        <value>hadoopmaster:8030</value>
    </property>

    <property>
        <name>yarn.resourcemanager.resource-tracker.address</name>
        <value>hadoopmaster:8031</value>
    </property>

    <property>
        <name>yarn.resourcemanager.address</name>
        <value>hadoopmaster:8032</value>
    </property>

    <property>
        <name>yarn.scheduler.maximum-allocation-mb</name>
        <value>4096</value>
    </property>

Share configs with all Slave hosts

sudo su - hadoop
scp -r ../etc/hadoop NODE_HOSTNAME_1:etc_hadoop                                 # copy to node 1
scp -r ../etc/hadoop NODE_HOSTNAME_2:etc_hadoop                                 # copy to node 2
...                                                                             # ...
scp -r ../etc/hadoop NODE_HOSTNAME_N:etc_hadoop                                 # copy to node N
exit

Commands executed at any Worker host

Install Hadoop and apply configs

sudo tar -xzf /srv/hadoop/home/hadoop-2.7.1.tar.gz -C /srv/
sudo mv /srv/hadoop-2.7.1/* /srv/hadoop/
sudo rmdir /srv/hadoop-2.7.1/
sudo rm -rf /srv/hadoop/etc/hadoop/
sudo chown -R hadoop:hadoop /srv/hadoop/
sudo su - hadoop
mv etc_hadoop ../etc/hadoop
exit

Starting/Stopping Hadoop

  • You can start Hadoop manually or configure it as a daemon service started at boot

Option 1: Manual startup

Start

Setting up hadoop from Master host
sudo su - hadoop
# Format a new distributed filesystem (execute only first time)
hdfs namenode -format
#
# Start the HDFS with the following command, run on the designated NameNode:
hadoop-daemon.sh --config $HADOOP_CONF_DIR --script hdfs start namenode
#
# Start the YARN with the following command, run on the designated ResourceManager:
yarn-daemon.sh --config $HADOOP_CONF_DIR start resourcemanager
exit
Setting up Slave hosts (this must be executed on all node hosts)
sudo su - hadoop
# Run a script to start DataNodes on all slaves:
hadoop-daemon.sh --config $HADOOP_CONF_DIR --script hdfs start datanode
#
# Run a script to start NodeManagers on all slaves:
yarn-daemon.sh --config $HADOOP_CONF_DIR start nodemanager
exit
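
  • To check that the cluster is up, the following commands (run as the hadoop user on the Master host) should report one live DataNode and one NodeManager per slave:
sudo su - hadoop
hdfs dfsadmin -report
yarn node -list
exit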

Stop

When needed, stop hadoop from Master host
sudo su - hadoop
# Stop the NameNode with the following command, run on the designated NameNode:
hadoop-daemon.sh --config $HADOOP_CONF_DIR --script hdfs stop namenode
#
# Stop the ResourceManager with the following command, run on the designated ResourceManager:
yarn-daemon.sh --config $HADOOP_CONF_DIR stop resourcemanager
#
# this can solve some problems
#rm -rf /srv/hadoop/data/namenode/current/
#rm -rf /srv/hadoop/logs/*
exit
And stop the hadoop services on all node hosts
sudo su - hadoop
# Run a script to stop DataNodes on all slaves:
hadoop-daemon.sh --config $HADOOP_CONF_DIR --script hdfs stop datanode
#
# Run a script to stop NodeManagers on all slaves:
yarn-daemon.sh --config $HADOOP_CONF_DIR stop nodemanager
#
# this can solve some problems
#rm -rf /srv/hadoop/data/datanode/current/
#rm -rf /srv/hadoop/logs/*
exit

Option 2: Automatic boot startup

On Master host

  • Create the script /etc/init.d/hadoop_master.sh at Master host
#!/bin/bash
#
# hadoop_master.sh
# version 1.0
# from http://www.campisano.org/wiki/en/Script_chroot.sh



### BEGIN INIT INFO
# Provides:          hadoop_master.sh
# Required-Start:    $remote_fs $syslog $time
# Required-Stop:     $remote_fs $syslog $time
# Should-Start:      $network
# Should-Stop:       $network
# Default-Start:     2 3 4 5
# Default-Stop:      0 1 6
# Short-Description: Hadoop Master Node
# Description:       Hadoop Master Node
### END INIT INFO



PATH=/sbin:/bin:/usr/sbin:/usr/bin;

OWNER[1]="hadoop";
MSG[1]="Hadoop (Master) namenode daemon ...";
START_CMD[1]="\$HADOOP_HOME/sbin/hadoop-daemon.sh --config \$HADOOP_CONF_DIR --script hdfs start namenode";
STOP_CMD[1]="\$HADOOP_HOME/sbin/hadoop-daemon.sh --config \$HADOOP_CONF_DIR --script hdfs stop namenode";

OWNER[2]="hadoop";
MSG[2]="Hadoop (Master) yarn resourcemanager daemon ...";
START_CMD[2]="\$HADOOP_HOME/sbin/yarn-daemon.sh --config \$HADOOP_CONF_DIR start resourcemanager";
STOP_CMD[2]="\$HADOOP_HOME/sbin/yarn-daemon.sh --config \$HADOOP_CONF_DIR stop resourcemanager";



function start()
{
    N=1;
    while test -n "${OWNER[$N]}"; do
        echo -ne "Starting ${MSG[$N]} ...";
        su "${OWNER[$N]}" -l -c "${START_CMD[$N]}";
        echo "Done.";
        N=$(($N+1));
    done;
}

function stop()
{
    N=${#OWNER[@]};
    while test -n "${OWNER[$N]}"; do
        echo -ne "Stopping ${MSG[$N]} ...";
        su "${OWNER[$N]}" -l -c "${STOP_CMD[$N]}" || break;
        echo "Done.";
        N=$(($N-1));
    done;
}

function usage()
{
    echo "Usage: $0 {start|stop}"
    exit 0
}



case "$1" in
    start)
        start
        ;;

    stop)
        stop
        ;;

    *)
        usage
esac;



# End

  • Configure as a startup script (Debian example)
#remember to do this the first time: hdfs namenode -format
sudo chmod 755 /etc/init.d/hadoop_master.sh
sudo update-rc.d hadoop_master.sh defaults
# Format a new distributed filesystem (execute only first time)
sudo su - hadoop
hdfs namenode -format
exit
sudo service hadoop_master.sh start

On all slave hosts

  • Create the script /etc/init.d/hadoop_slave.sh at all slave hosts
#!/bin/bash
#
# hadoop_slave.sh
# version 1.0
# from http://www.campisano.org/wiki/en/Script_chroot.sh



### BEGIN INIT INFO
# Provides:          hadoop_slave.sh
# Required-Start:    $remote_fs $syslog $time
# Required-Stop:     $remote_fs $syslog $time
# Should-Start:      $network
# Should-Stop:       $network
# Default-Start:     2 3 4 5
# Default-Stop:      0 1 6
# Short-Description: Hadoop Slave Node
# Description:       Hadoop Slave Node
### END INIT INFO



PATH=/sbin:/bin:/usr/sbin:/usr/bin;

OWNER[1]="hadoop";
MSG[1]="Hadoop (Slave) datanode daemon ...";
START_CMD[1]="\$HADOOP_HOME/sbin/hadoop-daemon.sh --config \$HADOOP_CONF_DIR --script hdfs start datanode";
STOP_CMD[1]="\$HADOOP_HOME/sbin/hadoop-daemon.sh --config \$HADOOP_CONF_DIR --script hdfs stop datanode";

OWNER[2]="hadoop";
MSG[2]="Hadoop (Slave) yarn nodemanager daemon ...";
START_CMD[2]="\$HADOOP_HOME/sbin/yarn-daemon.sh --config \$HADOOP_CONF_DIR start nodemanager";
STOP_CMD[2]="\$HADOOP_HOME/sbin/yarn-daemon.sh --config \$HADOOP_CONF_DIR stop nodemanager";



function start()
{
    N=1;
    while test -n "${OWNER[$N]}"; do
        echo -ne "Starting ${MSG[$N]} ...";
        su "${OWNER[$N]}" -l -c "${START_CMD[$N]}";
        echo "Done.";
        N=$(($N+1));
    done;
}

function stop()
{
    N=${#OWNER[@]};
    while test -n "${OWNER[$N]}"; do
        echo -ne "Stopping ${MSG[$N]} ...";
        su "${OWNER[$N]}" -l -c "${STOP_CMD[$N]}" || break;
        echo "Done.";
        N=$(($N-1));
    done;
}

function usage()
{
    echo "Usage: $0 {start|stop}"
    exit 0
}



case "$1" in
    start)
        start
        ;;

    stop)
        stop
        ;;

    *)
        usage
esac;



# End

  • Configure as startup script
sudo chmod 755 /etc/init.d/hadoop_slave.sh
sudo update-rc.d hadoop_slave.sh defaults
sudo service hadoop_slave.sh start

Hadoop WEB interface

  • The YARN ResourceManager interface usually runs on http://MASTER_HOST:8088. If you have any problem accessing this port (e.g. blocked by a firewall), you can tunnel it to your local computer with the following command:
ssh -C -N -L 8088:localhost:8088 -p MASTER_SSH_PORT root@MASTER_HOST


NOTE: the HDFS interface usually runs on http://MASTER_HOST:50070
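
The same kind of tunnel works for the HDFS interface, for example:

ssh -C -N -L 50070:localhost:50070 -p MASTER_SSH_PORT root@MASTER_HOST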

Run

First ready-made example

sudo su - hadoop
# hdfs namenode -format   # only needed if HDFS was not already formatted in the startup section
hdfs dfs -mkdir -p /user/hadoop
hdfs dfs -put /srv/hadoop/etc/hadoop /user/hadoop/input
hadoop jar /srv/hadoop/share/hadoop/mapreduce/hadoop-mapreduce-examples-2.7.1.jar grep input output 'dfs[a-z.]+'
exit
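
  • The result of the grep job can be read directly from HDFS, for example:
sudo su - hadoop
hdfs dfs -cat output/*
exit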

First Java example

sudo su - hadoop
mkdir java_mapreduce_example
cd java_mapreduce_example
mkdir txt_books
wget http://www.gutenberg.org/ebooks/20417.txt.utf-8 -P txt_books
wget http://www.gutenberg.org/files/5000/5000-8.txt -P txt_books
wget http://www.gutenberg.org/ebooks/4300.txt.utf-8 -P txt_books
hdfs dfs -mkdir -p /user/hadoop/java_mapreduce_example/txt_books
hdfs dfs -put txt_books/* /user/hadoop/java_mapreduce_example/txt_books
# create file WordCount.java
export HADOOP_CLASSPATH=${JAVA_HOME}/lib/tools.jar
hadoop com.sun.tools.javac.Main WordCount.java
jar cf wc.jar WordCount*.class
hadoop jar wc.jar WordCount /user/hadoop/java_mapreduce_example/txt_books /user/hadoop/java_mapreduce_example/output
exit
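
  • As before, the word counts can be read directly from HDFS, for example:
sudo su - hadoop
hdfs dfs -cat /user/hadoop/java_mapreduce_example/output/*
exit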

Creating a python example

sudo su - hadoop
mkdir python_mapreduce_example
cd python_mapreduce_example
  • create file mapper.py, mode 755
#!/usr/bin/env python
#
# from http://www.michael-noll.com/tutorials/\
#     writing-an-hadoop-mapreduce-program-in-python/

import sys

# input comes from STDIN (standard input)
for line in sys.stdin:
    # remove leading and trailing whitespace
    line = line.strip()
    # split the line into words
    words = line.split()
    # increase counters
    for word in words:
        # write the results to STDOUT (standard output);
        # what we output here will be the input for the
        # Reduce step, i.e. the input for reducer.py
        #
        # tab-delimited; the trivial word count is 1
        print '%s\t%s' % (word, 1)
  • create file reducer.py, mode 755
#!/usr/bin/env python
#
# from http://www.michael-noll.com/tutorials/\
#     writing-an-hadoop-mapreduce-program-in-python/

import sys

current_word = None
current_count = 0
word = None

# input comes from STDIN
for line in sys.stdin:
    # remove leading and trailing whitespace
    line = line.strip()

    # parse the input we got from mapper.py
    word, count = line.split('\t', 1)

    # convert count (currently a string) to int
    try:
        count = int(count)
    except ValueError:
        # count was not a number, so silently
        # ignore/discard this line
        continue

    # this IF-switch only works because Hadoop sorts map output
    # by key (here: word) before it is passed to the reducer
    if current_word == word:
        current_count += count
    else:
        if current_word:
            # write result to STDOUT
            print '%s\t%s' % (current_word, current_count)
        current_count = count
        current_word = word

# do not forget to output the last word if needed!
if current_word == word:
    print '%s\t%s' % (current_word, current_count)
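  • before running on Hadoop, you can test the two scripts locally on a small input; the sort step simulates the shuffle phase between map and reduce, for example:
echo "foo foo quux labs foo bar quux" | ./mapper.py | sort -k1,1 | ./reducer.py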
  • getting input data
mkdir txt_books
wget http://www.gutenberg.org/ebooks/20417.txt.utf-8 -P txt_books
wget http://www.gutenberg.org/files/5000/5000-8.txt -P txt_books
wget http://www.gutenberg.org/ebooks/4300.txt.utf-8 -P txt_books
  • putting data into hdfs, run and cleanup
hdfs dfs -mkdir -p /user/hadoop/python_mapreduce_example/txt_books
hdfs dfs -put txt_books/* /user/hadoop/python_mapreduce_example/txt_books
hadoop jar ~/../share/hadoop/tools/lib/hadoop-streaming-2.7.1.jar \
    -files mapper.py,reducer.py -mapper mapper.py -reducer reducer.py \
    -input /user/hadoop/python_mapreduce_example/txt_books \
    -output /user/hadoop/python_mapreduce_example/output
hadoop fs -copyToLocal /user/hadoop/python_mapreduce_example/output
hadoop fs -rm -r -f /user/hadoop/python_mapreduce_example
exit

Creating the R equivalent example

sudo su - hadoop
mkdir r_mapreduce_example
cd r_mapreduce_example
  • create file mapper.r, mode 755
#!/usr/bin/env Rscript

options(warn=-1)            # disables warnings
sink("/dev/null")           # redirect all R output to /dev/null

# input comes from STDIN (standard input)
input <- file("stdin", "r")

while(length(line <- readLines(input, n=1, warn=FALSE)) > 0) {

    # remove leading and trailing whitespace
    line <- gsub("^\\s+|\\s+$", "", line)

    # split the line into words
    words <- strsplit(line, "[ \t\n\r\f]")[[1]]
    words <- words[! words == ""]

    # increase counters
    for (word in words){

        sink()              # reenable normal output

        # write the results to STDOUT (standard output);
        # what we output here will be the input for the
        # Reduce step, i.e. the input for reducer.r
        #
        # tab-delimited; the trivial word count is 1
        cat(word, 1, "\n", sep="\t")

        sink("/dev/null")   # redirect all R output to /dev/null
    }
}

close(input)

  • create file reducer.r, mode 755
#!/usr/bin/env Rscript

options(warn=-1)            # disables warnings
sink("/dev/null")           # redirect all R output to /dev/null

# input comes from STDIN (standard input)
input <- file("stdin", "r")

current_word <- NULL
current_count <- 0
word <- NULL

while(length(line <- readLines(input, n=1, warn=FALSE)) > 0) {

    # remove leading and trailing whitespace
    line <- gsub("^\\s+|\\s+$", "", line)

    # split the line into words
    tuples <- strsplit(line, "\t", fixed=TRUE)[[1]]

    if(length(tuples) != 2 || is.na(suppressWarnings(as.integer(tuples[2])))) {
        next
    }

    word <- tuples[1]
    count <- as.integer(tuples[2])

    if((!is.null(current_word)) && current_word == word) {
        current_count <- current_count + count
    } else {

        if(!is.null(current_word)) {
            # write result to STDOUT
            sink()              # reenable normal output
            cat(current_word, current_count, "\n", sep="\t")
            sink("/dev/null")   # redirect all R output to /dev/null
        }

        current_count <- count
        current_word <- word
    }
}

# do not forget to output the last word if needed!
if((! is.null(current_word)) && current_word == word) {
    sink()              # reenable normal output
    cat(current_word, current_count, "\n", sep="\t")
    sink("/dev/null")   # redirect all R output to /dev/null
}
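
  • as with the python version, you can test the scripts locally before running them on Hadoop (assuming Rscript is available on the host), for example:
echo "foo foo quux labs foo bar quux" | ./mapper.r | sort -k1,1 | ./reducer.r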

  • getting input data
mkdir txt_books
wget http://www.gutenberg.org/ebooks/20417.txt.utf-8 -P txt_books
wget http://www.gutenberg.org/files/5000/5000-8.txt -P txt_books
wget http://www.gutenberg.org/ebooks/4300.txt.utf-8 -P txt_books
  • putting data into hdfs, run and cleanup
hdfs dfs -mkdir -p /user/hadoop/r_mapreduce_example/txt_books
hdfs dfs -put txt_books/* /user/hadoop/r_mapreduce_example/txt_books
hadoop jar ~/../share/hadoop/tools/lib/hadoop-streaming-2.7.1.jar \
    -files mapper.r,reducer.r -mapper mapper.r -reducer reducer.r \
    -input /user/hadoop/r_mapreduce_example/txt_books \
    -output /user/hadoop/r_mapreduce_example/output
hadoop fs -copyToLocal /user/hadoop/r_mapreduce_example/output
hadoop fs -rm -r -f /user/hadoop/r_mapreduce_example
exit
