diff --git a/hadoop/2.7.4/Dockerfile b/hadoop/2.7.4/Dockerfile
new file mode 100644
index 0000000..bd78419
--- /dev/null
+++ b/hadoop/2.7.4/Dockerfile
@@ -0,0 +1,57 @@
+FROM alpine:3.6
+
+LABEL maintainer="Newnius"
+
+USER root
+
+# Prerequisites
+RUN apk add --no-cache openssh openssl openjdk8-jre rsync bash procps
+
+ENV JAVA_HOME /usr/lib/jvm/java-1.8-openjdk
+ENV PATH $PATH:$JAVA_HOME/bin
+
+# Passwordless SSH
+RUN ssh-keygen -q -N "" -t dsa -f /etc/ssh/ssh_host_dsa_key
+RUN ssh-keygen -q -N "" -t rsa -f /etc/ssh/ssh_host_rsa_key
+RUN ssh-keygen -q -N "" -t rsa -f /root/.ssh/id_rsa
+RUN cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys
+
+COPY ssh_config /root/.ssh/config
+RUN chmod 600 /root/.ssh/config
+RUN chown root:root /root/.ssh/config
+
+RUN echo "Port 2122" >> /etc/ssh/sshd_config
+
+# Install Hadoop
+RUN wget -O hadoop.tar.gz https://archive.apache.org/dist/hadoop/common/hadoop-2.7.4/hadoop-2.7.4.tar.gz && \
+tar -xzf hadoop.tar.gz -C /usr/local/ && rm hadoop.tar.gz
+
+RUN ln -s /usr/local/hadoop-2.7.4 /usr/local/hadoop
+
+ENV HADOOP_HOME /usr/local/hadoop
+ENV PATH $PATH:$HADOOP_HOME/bin:$HADOOP_HOME/sbin
+
+ENV HADOOP_PREFIX $HADOOP_HOME
+ENV HADOOP_COMMON_HOME $HADOOP_HOME
+ENV HADOOP_HDFS_HOME $HADOOP_HOME
+ENV HADOOP_MAPRED_HOME $HADOOP_HOME
+ENV HADOOP_YARN_HOME $HADOOP_HOME
+ENV HADOOP_CONF_DIR $HADOOP_HOME/etc/hadoop
+ENV YARN_CONF_DIR $HADOOP_PREFIX/etc/hadoop
+
+# Default Conf Files
+COPY core-site.xml $HADOOP_HOME/etc/hadoop/core-site.xml
+COPY hdfs-site.xml $HADOOP_HOME/etc/hadoop/hdfs-site.xml
+COPY mapred-site.xml $HADOOP_HOME/etc/hadoop/mapred-site.xml
+COPY yarn-site.xml $HADOOP_HOME/etc/hadoop/yarn-site.xml
+COPY slaves $HADOOP_HOME/etc/hadoop/slaves
+
+RUN sed -i "/^export JAVA_HOME/ s:.*:export JAVA_HOME=${JAVA_HOME}\nexport HADOOP_HOME=${HADOOP_HOME}\nexport HADOOP_PREFIX=${HADOOP_PREFIX}:" ${HADOOP_HOME}/etc/hadoop/hadoop-env.sh
+
+RUN sed -i '/^export HADOOP_CONF_DIR/ s:.*:export HADOOP_CONF_DIR=$HADOOP_PREFIX/etc/hadoop/:' $HADOOP_PREFIX/etc/hadoop/hadoop-env.sh
+
+WORKDIR $HADOOP_HOME
+
+ADD bootstrap.sh /etc/bootstrap.sh
+
+CMD ["/etc/bootstrap.sh", "-d"]
diff --git a/hadoop/2.7.4/README b/hadoop/2.7.4/README
new file mode 100644
index 0000000..567e1ed
--- /dev/null
+++ b/hadoop/2.7.4/README
@@ -0,0 +1,87 @@
+# based on sequenceiq/hadoop-docker
+
+## Create a hadoop cluster in swarm mode
+
+`--hostname` requires Docker 1.13 or higher
+
+```
+docker service create \
+--name hadoop-master \
+--network swarm-net \
+--hostname hadoop-master \
+--replicas 1 \
+--endpoint-mode dnsrr \
+newnius/hadoop
+```
+
+```
+docker service create \
+--name hadoop-slave1 \
+--network swarm-net \
+--hostname hadoop-slave1 \
+--replicas 1 \
+--endpoint-mode dnsrr \
+newnius/hadoop
+```
+
+```
+docker service create \
+--name hadoop-slave2 \
+--network swarm-net \
+--hostname hadoop-slave2 \
+--replicas 1 \
+--endpoint-mode dnsrr \
+newnius/hadoop
+```
+
+```
+docker service create \
+--name hadoop-slave3 \
+--network swarm-net \
+--hostname hadoop-slave3 \
+--replicas 1 \
+--endpoint-mode dnsrr \
+newnius/hadoop
+```
+
+## Init && Test
+
+On the first deployment, format HDFS first
+
+### stop cluster (in master)
+`sbin/stop-yarn.sh`
+`sbin/stop-dfs.sh`
+
+### remove previous data (in all nodes)
+clear all data in /tmp in all nodes
+
+### format hdfs (in master)
+```
+bin/hadoop namenode -format
+```
+
+### start cluster (in master)
+`sbin/start-dfs.sh`
+`sbin/start-yarn.sh`
+
+### Run a test job
+
+bin/hadoop jar share/hadoop/mapreduce/hadoop-mapreduce-examples-2.7.4.jar grep input output 'dfs[a-z.]+'
+
+Note: there is no such directory by default, you can add data by
+
+```
+bin/hadoop dfs -mkdir -p /user/root/input
+```
+and
+```
+bin/hadoop dfs -put etc/hadoop/* /user/root/input
+```
+
+### monitor cluster in browser
+
+YARN: hadoop-master:8088
+
+HDFS: hadoop-master:50070
+
+_Proxy needed, e.g. newnius/docker-proxy_
diff --git a/hadoop/2.7.4/bootstrap.sh b/hadoop/2.7.4/bootstrap.sh
new file mode 100755
index 0000000..42f4f75
--- /dev/null
+++ b/hadoop/2.7.4/bootstrap.sh
@@ -0,0 +1,34 @@
+#!/bin/bash
+
+: ${HADOOP_PREFIX:=/usr/local/hadoop}
+
+# Source (do not execute) hadoop-env.sh so its exported variables
+# (JAVA_HOME etc.) take effect in this shell instead of a subshell.
+. $HADOOP_PREFIX/etc/hadoop/hadoop-env.sh
+
+# -f: do not fail on the very first boot, when no pid files exist yet
+rm -f /tmp/*.pid
+
+# installing libraries if any - (resource urls added comma separated to the ACP system variable)
+cd $HADOOP_PREFIX/share/hadoop/common ; for cp in ${ACP//,/ }; do echo == $cp; curl -LO $cp ; done; cd -
+
+# replace config
+cp /mnt/hadoop-config/* $HADOOP_PREFIX/etc/hadoop/
+
+/usr/sbin/sshd
+
+## stop all in case master starts far behind
+$HADOOP_PREFIX/sbin/stop-yarn.sh
+$HADOOP_PREFIX/sbin/stop-dfs.sh
+
+$HADOOP_PREFIX/sbin/start-dfs.sh
+$HADOOP_PREFIX/sbin/start-yarn.sh
+$HADOOP_PREFIX/sbin/mr-jobhistory-daemon.sh start historyserver
+
+if [[ $1 == "-d" ]]; then
+    while true; do sleep 1000; done
+fi
+
+if [[ $1 == "-bash" ]]; then
+    /bin/bash
+fi
diff --git a/hadoop/2.7.4/core-site.xml b/hadoop/2.7.4/core-site.xml
new file mode 100644
index 0000000..da536c3
--- /dev/null
+++ b/hadoop/2.7.4/core-site.xml
@@ -0,0 +1,13 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
+<configuration>
+    <property>
+        <name>fs.defaultFS</name>
+        <value>hdfs://hadoop-master:8020</value>
+    </property>
+    <!-- deprecated alias of fs.defaultFS, kept for older clients -->
+    <property>
+        <name>fs.default.name</name>
+        <value>hdfs://hadoop-master:8020</value>
+    </property>
+</configuration>
diff --git a/hadoop/2.7.4/hdfs-site.xml b/hadoop/2.7.4/hdfs-site.xml
new file mode 100644
index 0000000..af4e13d
--- /dev/null
+++ b/hadoop/2.7.4/hdfs-site.xml
@@ -0,0 +1,25 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
+<configuration>
+    <!-- permission checks disabled for convenience inside the trusted overlay network -->
+    <property>
+        <name>dfs.permissions</name>
+        <value>false</value>
+    </property>
+    <property>
+        <name>dfs.namenode.secondary.http-address</name>
+        <value>hadoop-slave1:50090</value>
+    </property>
+    <property>
+        <name>dfs.namenode.http-address</name>
+        <value>hadoop-master:50070</value>
+    </property>
+    <property>
+        <name>dfs.datanode.max.transfer.threads</name>
+        <value>8192</value>
+    </property>
+    <property>
+        <name>dfs.replication</name>
+        <value>3</value>
+    </property>
+</configuration>
diff --git a/hadoop/2.7.4/mapred-site.xml b/hadoop/2.7.4/mapred-site.xml
new file mode 100644
index 0000000..cad0cee
--- /dev/null
+++ b/hadoop/2.7.4/mapred-site.xml
@@ -0,0 +1,16 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
+<configuration>
+    <property>
+        <name>mapreduce.framework.name</name>
+        <value>yarn</value>
+    </property>
+    <property>
+        <name>mapreduce.jobhistory.address</name>
+        <value>hadoop-master:10020</value>
+    </property>
+    <property>
+        <name>mapreduce.jobhistory.webapp.address</name>
+        <value>hadoop-master:19888</value>
+    </property>
+</configuration>
diff --git a/hadoop/2.7.4/slaves b/hadoop/2.7.4/slaves
new file mode 100644
index 0000000..3bb91be
--- /dev/null
+++ b/hadoop/2.7.4/slaves
@@ -0,0 +1,3 @@
+hadoop-slave1
+hadoop-slave2
+hadoop-slave3
diff --git a/hadoop/2.7.4/ssh_config b/hadoop/2.7.4/ssh_config
new file mode 100644
index 0000000..535f9d3
--- /dev/null
+++ b/hadoop/2.7.4/ssh_config
@@ -0,0 +1,5 @@
+Host *
+    UserKnownHostsFile /dev/null
+    StrictHostKeyChecking no
+    LogLevel quiet
+    Port 2122
diff --git a/hadoop/2.7.4/yarn-site.xml b/hadoop/2.7.4/yarn-site.xml
new file mode 100644
index 0000000..bf318f4
--- /dev/null
+++ b/hadoop/2.7.4/yarn-site.xml
@@ -0,0 +1,38 @@
+<?xml version="1.0"?>
+<configuration>
+    <!-- Make the Hadoop jars visible to YARN containers -->
+    <property>
+        <name>yarn.application.classpath</name>
+        <value>/usr/local/hadoop/etc/hadoop, /usr/local/hadoop/share/hadoop/common/*, /usr/local/hadoop/share/hadoop/common/lib/*, /usr/local/hadoop/share/hadoop/hdfs/*, /usr/local/hadoop/share/hadoop/hdfs/lib/*, /usr/local/hadoop/share/hadoop/mapreduce/*, /usr/local/hadoop/share/hadoop/mapreduce/lib/*, /usr/local/hadoop/share/hadoop/yarn/*, /usr/local/hadoop/share/hadoop/yarn/lib/*</value>
+    </property>
+    <property>
+        <name>yarn.resourcemanager.hostname</name>
+        <value>hadoop-master</value>
+    </property>
+    <property>
+        <name>yarn.nodemanager.aux-services</name>
+        <value>mapreduce_shuffle</value>
+    </property>
+    <property>
+        <name>yarn.log-aggregation-enable</name>
+        <value>true</value>
+    </property>
+    <!-- keep aggregated logs for 7 days -->
+    <property>
+        <name>yarn.log-aggregation.retain-seconds</name>
+        <value>604800</value>
+    </property>
+    <!-- Per-NodeManager resources; tune to the host size -->
+    <property>
+        <name>yarn.nodemanager.resource.memory-mb</name>
+        <value>2048</value>
+    </property>
+    <property>
+        <name>yarn.nodemanager.resource.cpu-vcores</name>
+        <value>2</value>
+    </property>
+    <property>
+        <name>yarn.scheduler.minimum-allocation-mb</name>
+        <value>1024</value>
+    </property>
+</configuration>
diff --git a/hadoop/2.8.1/Dockerfile b/hadoop/2.8.1/Dockerfile
index 1fec60a..50aabfd 100644
--- a/hadoop/2.8.1/Dockerfile
+++ b/hadoop/2.8.1/Dockerfile
@@ -26,6 +26,10 @@ RUN echo "Port 2122" >> /etc/ssh/sshd_config
 RUN wget -O hadoop.tar.gz https://archive.apache.org/dist/hadoop/common/hadoop-2.8.1/hadoop-2.8.1.tar.gz && \
 tar -xzf hadoop.tar.gz -C /usr/local/ && rm hadoop.tar.gz
 
+# Link the 2.8.1 tree (was hadoop-2.7.4, a copy/paste error that would leave
+# /usr/local/hadoop a dangling symlink: this image only extracts hadoop-2.8.1)
+RUN ln -s /usr/local/hadoop-2.8.1 /usr/local/hadoop
+
 ENV HADOOP_HOME /usr/local/hadoop
 ENV PATH $PATH:$HADOOP_HOME/bin:$HADOOP_HOME/sbin