From 8e119766eea71143808e9673f05e4d902e2900bc Mon Sep 17 00:00:00 2001 From: Newnius Date: Mon, 6 Aug 2018 17:14:20 +0800 Subject: [PATCH] add more hadoop versions --- hadoop/2.7.1/Dockerfile | 66 +++++++++++++--- hadoop/2.7.1/README.md | 146 +++++++++++++++++++++-------------- hadoop/2.7.1/bootstrap.sh | 10 ++- hadoop/2.7.1/core-site.xml | 17 ++-- hadoop/2.7.1/hdfs-site.xml | 22 ++---- hadoop/2.7.1/mapred-site.xml | 12 ++- hadoop/2.7.1/ssh_config | 5 ++ hadoop/2.7.1/yarn-site.xml | 24 ++---- hadoop/2.8.1/Dockerfile | 15 ++-- hadoop/2.8.1/README.md | 146 +++++++++++++++++++++-------------- hadoop/2.8.1/bootstrap.sh | 8 +- hadoop/2.8.4/Dockerfile | 59 ++++++++++++++ hadoop/2.8.4/README.md | 115 +++++++++++++++++++++++++++ hadoop/2.8.4/bootstrap.sh | 35 +++++++++ hadoop/2.8.4/core-site.xml | 28 +++++++ hadoop/2.8.4/hdfs-site.xml | 40 ++++++++++ hadoop/2.8.4/mapred-site.xml | 32 ++++++++ hadoop/2.8.4/slaves | 3 + hadoop/2.8.4/ssh_config | 5 ++ hadoop/2.8.4/yarn-site.xml | 49 ++++++++++++ hadoop/2.9.1/Dockerfile | 4 +- 21 files changed, 655 insertions(+), 186 deletions(-) create mode 100644 hadoop/2.7.1/ssh_config create mode 100644 hadoop/2.8.4/Dockerfile create mode 100644 hadoop/2.8.4/README.md create mode 100755 hadoop/2.8.4/bootstrap.sh create mode 100644 hadoop/2.8.4/core-site.xml create mode 100644 hadoop/2.8.4/hdfs-site.xml create mode 100644 hadoop/2.8.4/mapred-site.xml create mode 100644 hadoop/2.8.4/slaves create mode 100644 hadoop/2.8.4/ssh_config create mode 100644 hadoop/2.8.4/yarn-site.xml diff --git a/hadoop/2.7.1/Dockerfile b/hadoop/2.7.1/Dockerfile index 9c7ad0f..52896a6 100644 --- a/hadoop/2.7.1/Dockerfile +++ b/hadoop/2.7.1/Dockerfile @@ -1,17 +1,59 @@ -FROM sequenceiq/hadoop-docker:2.7.1 +FROM alpine:3.8 -MAINTAINER Newnius +MAINTAINER Newnius + +USER root + +# Prerequisites +RUN apk add --no-cache openssh openssl openjdk8-jre rsync bash procps + +ENV JAVA_HOME /usr/lib/jvm/java-1.8-openjdk +ENV PATH $PATH:$JAVA_HOME/bin + +# Passwordless SSH +RUN ssh-keygen -q -N "" -t dsa -f /etc/ssh/ssh_host_dsa_key +RUN ssh-keygen -q -N "" -t rsa -f /etc/ssh/ssh_host_rsa_key +RUN ssh-keygen -q -N "" -t rsa -f /root/.ssh/id_rsa +RUN cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys + +ADD ssh_config /root/.ssh/config +RUN chmod 600 /root/.ssh/config +RUN chown root:root /root/.ssh/config + +RUN echo "Port 2122" >> /etc/ssh/sshd_config + +# Install Hadoop +ENV HADOOP_VER 2.7.1 + +RUN wget -O hadoop.tar.gz https://archive.apache.org/dist/hadoop/common/hadoop-$HADOOP_VER/hadoop-$HADOOP_VER.tar.gz && \ + tar -xzf hadoop.tar.gz -C /usr/local/ && rm hadoop.tar.gz + +RUN ln -s /usr/local/hadoop-$HADOOP_VER /usr/local/hadoop + +ENV HADOOP_HOME /usr/local/hadoop +ENV PATH $PATH:$HADOOP_HOME/bin:$HADOOP_HOME/sbin + +ENV HADOOP_PREFIX $HADOOP_HOME +ENV HADOOP_COMMON_HOME $HADOOP_HOME +ENV HADOOP_HDFS_HOME $HADOOP_HOME +ENV HADOOP_MAPRED_HOME $HADOOP_HOME +ENV HADOOP_YARN_HOME $HADOOP_HOME +ENV HADOOP_CONF_DIR $HADOOP_HOME/etc/hadoop +ENV YARN_CONF_DIR $HADOOP_PREFIX/etc/hadoop + +# Default Conf Files +ADD core-site.xml $HADOOP_HOME/etc/hadoop/core-site.xml +ADD hdfs-site.xml $HADOOP_HOME/etc/hadoop/hdfs-site.xml +ADD mapred-site.xml $HADOOP_HOME/etc/hadoop/mapred-site.xml +ADD yarn-site.xml $HADOOP_HOME/etc/hadoop/yarn-site.xml +ADD slaves $HADOOP_HOME/etc/hadoop/slaves + +RUN sed -i "/^export JAVA_HOME/ s:.*:export JAVA_HOME=${JAVA_HOME}\nexport HADOOP_HOME=${HADOOP_HOME}\nexport HADOOP_PREFIX=${HADOOP_PREFIX}:" ${HADOOP_HOME}/etc/hadoop/hadoop-env.sh + +RUN sed -i '/^export HADOOP_CONF_DIR/ 
s:.*:export HADOOP_CONF_DIR=$HADOOP_PREFIX/etc/hadoop/:' $HADOOP_PREFIX/etc/hadoop/hadoop-env.sh
+
+WORKDIR $HADOOP_HOME
 
 ADD bootstrap.sh /etc/bootstrap.sh
 
-RUN mkdir -p /mnt/hadoop-config
-
-ADD core-site.xml /mnt/hadoop-config
-ADD yarn-site.xml /mnt/hadoop-config
-ADD mapred-site.xml /mnt/hadoop-config
-ADD hdfs-site.xml /mnt/hadoop-config
-ADD slaves /mnt/hadoop-config
-
-WORKDIR /usr/local/hadoop
-
 CMD ["/etc/bootstrap.sh", "-d"]
diff --git a/hadoop/2.7.1/README.md b/hadoop/2.7.1/README.md
index e1ae28c..91ad353 100644
--- a/hadoop/2.7.1/README.md
+++ b/hadoop/2.7.1/README.md
@@ -1,87 +1,115 @@
-# based on sequenceiq/hadoop-docker
+# Deploy a Hadoop cluster with Docker
 
-## Create a hadoop cluster in swarm mode
+## Start Master
 
-`--hostname` needs 1.13 or higher
-
-```
+```bash
 docker service create \
---name hadoop-master \
---network swarm-net \
---hostname hadoop-master \
---replicas 1 \
---endpoint-mode dnsrr \
-newnius/hadoop
+  --name hadoop-master \
+  --hostname hadoop-master \
+  --network swarm-net \
+  --replicas 1 \
+  --detach=true \
+  --endpoint-mode dnsrr \
+  --mount type=bind,source=/etc/localtime,target=/etc/localtime \
+  newnius/hadoop:2.7.1
 ```
 
-```
+## Start slaves
+
+```bash
 docker service create \
---name hadoop-slave1 \
---network swarm-net \
---hostname hadoop-slave1 \
---replicas 1 \
---endpoint-mode dnsrr \
-newnius/hadoop
+  --name hadoop-slave1 \
+  --hostname hadoop-slave1 \
+  --network swarm-net \
+  --replicas 1 \
+  --detach=true \
+  --endpoint-mode dnsrr \
+  --mount type=bind,source=/etc/localtime,target=/etc/localtime \
+  newnius/hadoop:2.7.1
 ```
 
-```
+```bash
 docker service create \
---name hadoop-slave2 \
---network swarm-net \
---hostname hadoop-slave2 \
---replicas 1 \
---endpoint-mode dnsrr \
-newnius/hadoop
+  --name hadoop-slave2 \
+  --network swarm-net \
+  --hostname hadoop-slave2 \
+  --replicas 1 \
+  --detach=true \
+  --endpoint-mode dnsrr \
+  --mount type=bind,source=/etc/localtime,target=/etc/localtime \
+  newnius/hadoop:2.7.1
 ```
 
-```
+```bash
 docker service create \
---name hadoop-slave3 \
---network swarm-net \
---hostname hadoop-slave3 \
---replicas 1 \
---endpoint-mode dnsrr \
-newnius/hadoop
+  --name hadoop-slave3 \
+  --hostname hadoop-slave3 \
+  --network swarm-net \
+  --replicas 1 \
+  --detach=true \
+  --endpoint-mode dnsrr \
+  --mount type=bind,source=/etc/localtime,target=/etc/localtime \
+  newnius/hadoop:2.7.1
 ```
 
-## Init && Test
+## Init for the first time
 
-In the first deploy, format dfs first
+#### Format HDFS first
+Run these commands on the master node.
 
-### stop cluster (in master)
-`sbin/stop-yarn.sh`
-`sbin/stop-dfs.sh`
+```bash
+# stop HDFS services
+sbin/stop-dfs.sh
 
-### remove previous data (in all nodes)
-clear all data in /tmp in all nodes
-
-### format hdfs (in master)
-```
+# format HDFS meta data
 bin/hadoop namenode -format
+
+# restart HDFS services
+sbin/start-dfs.sh
 ```
 
-### start cluster (in master)
-`sbin/start-dfs.sh`
-`sbin/start-yarn.sh`
+## Run a test job
+To make sure you have successfully set up the Hadoop cluster, run the following commands and check that the job executes correctly.
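+
+Before running the job, you can do a quick sanity check on the master (a sketch; `dfsadmin -report` lists the DataNodes that registered with the NameNode, `yarn node -list` the NodeManagers):
+
+```bash
+# expect one live datanode per slave
+bin/hadoop dfsadmin -report
+
+# expect one entry per NodeManager
+bin/yarn node -list
+```
+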
-### Run a test job
-
-bin/hadoop jar share/hadoop/mapreduce/hadoop-mapreduce-examples-2.7.1.jar grep input output 'dfs[a-z.]+'
-
-Note: there is no such dir default, you can add data by
-
-```
+```bash
+# prepare input data
 bin/hadoop dfs -mkdir -p /user/root/input
-```
-and
-```
+
+# copy files to input path
 bin/hadoop dfs -put etc/hadoop/* /user/root/input
+
+# submit the job
+bin/hadoop jar share/hadoop/mapreduce/hadoop-mapreduce-examples-2.7.1.jar grep input output 'dfs[a-z.]+'
 ```
 
-### monitor cluster in browser
+## Browse the web UI
+You can expose the ports in the script, but I'd rather not, since the slaves would occupy the same ports.
 
-YARN: hadoop-master:8088
+To access the web UI, deploy another (socks5) proxy to route the traffic.
 
-HDFS: hadoop-master:50070
+If you don't have one, try [newnius/docker-proxy](https://hub.docker.com/r/newnius/docker-proxy/), which is rather easy to use.
 
-_Proxy needed, e.g. newnius/docker-proxy_
+Visit [hadoop-master:8088](hadoop-master:8088) for the YARN web UI.
+
+Visit [hadoop-master:50070](hadoop-master:50070) for the HDFS web UI.
+
+## Custom configuration
+
+To persist data or modify the conf files, refer to the following script.
+
+The `/config/hadoop` path is where replacement conf files go; you don't have to put all the files, only the ones you want to override.
+
+```bash
+docker service create \
+  --name hadoop-master \
+  --hostname hadoop-master \
+  --network swarm-net \
+  --replicas 1 \
+  --detach=true \
+  --endpoint-mode dnsrr \
+  --mount type=bind,source=/etc/localtime,target=/etc/localtime \
+  --mount type=bind,source=/data/hadoop/config,target=/config/hadoop \
+  --mount type=bind,source=/data/hadoop/hdfs/master,target=/tmp/hadoop-root \
+  --mount type=bind,source=/data/hadoop/logs/master,target=/usr/local/hadoop/logs \
+  newnius/hadoop:2.7.1
+```
diff --git a/hadoop/2.7.1/bootstrap.sh b/hadoop/2.7.1/bootstrap.sh
index 802c4f7..4289858 100755
--- a/hadoop/2.7.1/bootstrap.sh
+++ b/hadoop/2.7.1/bootstrap.sh
@@ -9,10 +9,14 @@ rm /tmp/*.pid
 # installing libraries if any - (resource urls added comma separated to the ACP system variable)
 cd $HADOOP_PREFIX/share/hadoop/common ; for cp in ${ACP//,/ }; do echo == $cp; curl -LO $cp ; done; cd -
 
-# replace config
-cp /mnt/hadoop-config/* $HADOOP_PREFIX/etc/hadoop/
+## replace config
+: ${EXTRA_CONF_DIR:=/config/hadoop}
 
-service sshd start
+if [ -d "$EXTRA_CONF_DIR" ]; then
+  cp $EXTRA_CONF_DIR/* $HADOOP_PREFIX/etc/hadoop/
+fi
+
+/usr/sbin/sshd
 
 ## stop all in case master starts far behind
 $HADOOP_PREFIX/sbin/stop-yarn.sh
diff --git a/hadoop/2.7.1/core-site.xml b/hadoop/2.7.1/core-site.xml
index da536c3..7bd76e9 100644
--- a/hadoop/2.7.1/core-site.xml
+++ b/hadoop/2.7.1/core-site.xml
@@ -17,13 +17,12 @@
 <configuration>
-
-    <property>
-        <name>fs.defaultFS</name>
-        <value>hdfs://hadoop-master:8020</value>
-    </property>
-    <property>
-        <name>fs.default.name</name>
-        <value>hdfs://hadoop-master:8020</value>
-    </property>
+	<property>
+		<name>fs.defaultFS</name>
+		<value>hdfs://hadoop-master:8020</value>
+	</property>
+	<property>
+		<name>fs.default.name</name>
+		<value>hdfs://hadoop-master:8020</value>
+	</property>
 </configuration>
diff --git a/hadoop/2.7.1/hdfs-site.xml b/hadoop/2.7.1/hdfs-site.xml
index af4e13d..ba3613a 100644
--- a/hadoop/2.7.1/hdfs-site.xml
+++ b/hadoop/2.7.1/hdfs-site.xml
@@ -17,30 +17,24 @@
 <configuration>
-
 	<property>
 		<name>dfs.permissions</name>
 		<value>false</value>
 	</property>
-
 	<property>
 		<name>dfs.namenode.secondary.http-address</name>
 		<value>hadoop-slave1:50090</value>
 	</property>
-
-	<property>
+	<property>
 		<name>dfs.namenode.http-address</name>
 		<value>hadoop-master:50070</value>
 	</property>
-
-	<property>
-		<name>dfs.datanode.max.transfer.threads</name>
-		<value>8192</value>
+	<property>
+		<name>dfs.datanode.max.transfer.threads</name>
+		<value>8192</value>
+	</property>
+	<property>
+		<name>dfs.replication</name>
+		<value>3</value>
 	</property>
-
-	<property>
-		<name>dfs.replication</name>
-		<value>3</value>
-	</property>
-
-
 </configuration>
diff --git a/hadoop/2.7.1/mapred-site.xml b/hadoop/2.7.1/mapred-site.xml
index ce18519..00fc7b8 100644
--- a/hadoop/2.7.1/mapred-site.xml
+++ b/hadoop/2.7.1/mapred-site.xml
@@ -17,10 +17,16 @@
 <configuration>
-
-	<property>
+	<property>
 		<name>mapreduce.framework.name</name>
 		<value>yarn</value>
-	</property>
+	</property>
+	<property>
+		<name>mapreduce.jobhistory.address</name>
+		<value>hadoop-master:10020</value>
+	</property>
+	<property>
+		<name>mapreduce.jobhistory.webapp.address</name>
+		<value>hadoop-master:19888</value>
+	</property>
 </configuration>
diff --git a/hadoop/2.7.1/ssh_config b/hadoop/2.7.1/ssh_config
new file mode 100644
index 0000000..535f9d3
--- /dev/null
+++ b/hadoop/2.7.1/ssh_config
@@ -0,0 +1,5 @@
+Host *
+  UserKnownHostsFile /dev/null
+  StrictHostKeyChecking no
+  LogLevel quiet
+  Port 2122
diff --git a/hadoop/2.7.1/yarn-site.xml b/hadoop/2.7.1/yarn-site.xml
index f3acd92..c3fec7b 100644
--- a/hadoop/2.7.1/yarn-site.xml
+++ b/hadoop/2.7.1/yarn-site.xml
@@ -14,34 +14,26 @@
 -->
 <configuration>
-
-    <property>
-        <name>yarn.application.classpath</name>
-        <value>/usr/local/hadoop/etc/hadoop, /usr/local/hadoop/share/hadoop/common/*, /usr/local/hadoop/share/hadoop/common/lib/*, /usr/local/hadoop/share/hadoop/hdfs/*, /usr/local/hadoop/share/hadoop/hdfs/lib/*, /usr/local/hadoop/share/hadoop/mapreduce/*, /usr/local/hadoop/share/hadoop/mapreduce/lib/*, /usr/local/hadoop/share/hadoop/yarn/*, /usr/local/hadoop/share/hadoop/yarn/lib/*</value>
-    </property>
-
-
+	<property>
+		<name>yarn.application.classpath</name>
+		<value>/usr/local/hadoop/etc/hadoop, /usr/local/hadoop/share/hadoop/common/*, /usr/local/hadoop/share/hadoop/common/lib/*, /usr/local/hadoop/share/hadoop/hdfs/*, /usr/local/hadoop/share/hadoop/hdfs/lib/*, /usr/local/hadoop/share/hadoop/mapreduce/*, /usr/local/hadoop/share/hadoop/mapreduce/lib/*, /usr/local/hadoop/share/hadoop/yarn/*, /usr/local/hadoop/share/hadoop/yarn/lib/*</value>
+	</property>
 	<property>
 		<name>yarn.resourcemanager.hostname</name>
 		<value>hadoop-master</value>
 	</property>
-
 	<property>
 		<name>yarn.nodemanager.aux-services</name>
 		<value>mapreduce_shuffle</value>
 	</property>
-
 	<property>
 		<name>yarn.log-aggregation-enable</name>
 		<value>true</value>
 	</property>
-
 	<property>
 		<name>yarn.log-aggregation.retain-seconds</name>
 		<value>604800</value>
 	</property>
-
-
 	<property>
 		<name>yarn.nodemanager.resource.memory-mb</name>
 		<value>2048</value>
 	</property>
 	<property>
 		<name>yarn.nodemanager.resource.cpu-vcores</name>
 		<value>2</value>
 	</property>
-
-	<property>
-		<name>yarn.scheduler.minimum-allocation-mb</name>
-		<value>1024</value>
-	</property>
+	<property>
+		<name>yarn.scheduler.minimum-allocation-mb</name>
+		<value>1024</value>
+	</property>
 </configuration>
diff --git a/hadoop/2.8.1/Dockerfile b/hadoop/2.8.1/Dockerfile
index ff0ffa1..5af05ab 100644
--- a/hadoop/2.8.1/Dockerfile
+++ b/hadoop/2.8.1/Dockerfile
@@ -1,4 +1,4 @@
-FROM alpine:3.6
+FROM alpine:3.8
 
 MAINTAINER Newnius
 
@@ -23,10 +23,12 @@ RUN chown root:root /root/.ssh/config
 RUN echo "Port 2122" >> /etc/ssh/sshd_config
 
 # Install Hadoop
-RUN wget -O hadoop.tar.gz https://archive.apache.org/dist/hadoop/common/hadoop-2.8.1/hadoop-2.8.1.tar.gz && \
-tar -xzf hadoop.tar.gz -C /usr/local/ && rm hadoop.tar.gz
+ENV HADOOP_VER 2.8.1
 
-RUN ln -s /usr/local/hadoop-2.8.1 /usr/local/hadoop
+RUN wget -O hadoop.tar.gz https://archive.apache.org/dist/hadoop/common/hadoop-$HADOOP_VER/hadoop-$HADOOP_VER.tar.gz && \
+  tar -xzf hadoop.tar.gz -C /usr/local/ && rm hadoop.tar.gz
+
+RUN ln -s /usr/local/hadoop-$HADOOP_VER /usr/local/hadoop
 
 ENV HADOOP_HOME /usr/local/hadoop
 ENV PATH $PATH:$HADOOP_HOME/bin:$HADOOP_HOME/sbin
@@ -46,10 +48,9 @@ ADD mapred-site.xml $HADOOP_HOME/etc/hadoop/mapred-site.xml
 ADD yarn-site.xml $HADOOP_HOME/etc/hadoop/yarn-site.xml
 ADD slaves $HADOOP_HOME/etc/hadoop/slaves
 
+RUN sed -i "/^export JAVA_HOME/ s:.*:export JAVA_HOME=${JAVA_HOME}\nexport HADOOP_HOME=${HADOOP_HOME}\nexport HADOOP_PREFIX=${HADOOP_PREFIX}:" ${HADOOP_HOME}/etc/hadoop/hadoop-env.sh
 
-RUN sed -i "s|export JAVA_HOME=.*|export JAVA_HOME=${JAVA_HOME}\nexport HADOOP_HOME=${HADOOP_HOME}\nexport HADOOP_PREFIX=${HADOOP_PREFIX}|g" ${HADOOP_PREFIX}/etc/hadoop/hadoop-env.sh
-
-RUN sed -i "s|export HADOOP_CONF_DIR=.*|export HADOOP_CONF_DIR=${HADOOP_PREFIX}/etc/hadoop/|g" ${HADOOP_PREFIX}/etc/hadoop/hadoop-env.sh
+RUN sed -i '/^export HADOOP_CONF_DIR/ s:.*:export HADOOP_CONF_DIR=$HADOOP_PREFIX/etc/hadoop/:' $HADOOP_PREFIX/etc/hadoop/hadoop-env.sh
 
 WORKDIR $HADOOP_HOME
diff --git a/hadoop/2.8.1/README.md b/hadoop/2.8.1/README.md
index fde23bc..b5c483f 100644
--- a/hadoop/2.8.1/README.md
+++ b/hadoop/2.8.1/README.md
@@ -1,87 +1,115 @@
-# based on alpine
+# Deploy a Hadoop cluster with Docker
 
-## Create a hadoop cluster in swarm mode
+## Start Master
 
-`--hostname` needs docker 1.13 or higher
-
-```
+```bash
 docker service create \
---name hadoop-master \
---network swarm-net \
---hostname hadoop-master \
---replicas 1 \
---endpoint-mode dnsrr \
-newnius/hadoop
+  --name hadoop-master \
+  --hostname hadoop-master \
+  --network swarm-net \
+  --replicas 1 \
+  --detach=true \
+  --endpoint-mode dnsrr \
+  --mount type=bind,source=/etc/localtime,target=/etc/localtime \
+  newnius/hadoop:2.8.1
 ```
 
-```
+## Start slaves
+
+```bash
 docker service create \
---name hadoop-slave1 \
---network swarm-net \
---hostname hadoop-slave1 \
---replicas 1 \
---endpoint-mode dnsrr \
-newnius/hadoop
+  --name hadoop-slave1 \
+  --hostname hadoop-slave1 \
+  --network swarm-net \
+  --replicas 1 \
+  --detach=true \
+  --endpoint-mode dnsrr \
+  --mount type=bind,source=/etc/localtime,target=/etc/localtime \
+  newnius/hadoop:2.8.1
 ```
 
-```
+```bash
 docker service create \
---name hadoop-slave2 \
---network swarm-net \
---hostname hadoop-slave2 \
---replicas 1 \
---endpoint-mode dnsrr \
-newnius/hadoop
+  --name hadoop-slave2 \
+  --network swarm-net \
+  --hostname hadoop-slave2 \
+  --replicas 1 \
+  --detach=true \
+  --endpoint-mode dnsrr \
+  --mount type=bind,source=/etc/localtime,target=/etc/localtime \
+  newnius/hadoop:2.8.1
 ```
 
-```
+```bash
 docker service create \
---name hadoop-slave3 \
---network swarm-net \
---hostname hadoop-slave3 \
---replicas 1 \
---endpoint-mode dnsrr \
-newnius/hadoop
+  --name hadoop-slave3 \
+  --hostname hadoop-slave3 \
+  --network swarm-net \
+  --replicas 1 \
+  --detach=true \
+  --endpoint-mode dnsrr \
+  --mount type=bind,source=/etc/localtime,target=/etc/localtime \
+  newnius/hadoop:2.8.1
 ```
 
-## Init && Test
+## Init for the first time
 
-In the first deploy, format dfs first
+#### Format HDFS first
+Run these commands on the master node.
 
-### stop cluster (in master)
-`sbin/stop-yarn.sh`
-`sbin/stop-dfs.sh`
+```bash
+# stop HDFS services
+sbin/stop-dfs.sh
 
-### remove previous data (in all nodes)
-clear all data in /tmp in all nodes
-
-### format hdfs (in master)
-```
+# format HDFS meta data
 bin/hadoop namenode -format
+
+# restart HDFS services
+sbin/start-dfs.sh
 ```
 
-### start cluster (in master)
-`sbin/start-dfs.sh`
-`sbin/start-yarn.sh`
+## Run a test job
+To make sure you have successfully set up the Hadoop cluster, run the following commands and check that the job executes correctly.
 
-### Run a test job
-
-bin/hadoop jar share/hadoop/mapreduce/hadoop-mapreduce-examples-2.7.4.jar grep input output 'dfs[a-z.]+'
-
-Note: there is no such dir default, you can add data by
-
-```
+```bash
+# prepare input data
 bin/hadoop dfs -mkdir -p /user/root/input
-```
-and
-```
+
+# copy files to input path
 bin/hadoop dfs -put etc/hadoop/* /user/root/input
+
+# submit the job
+bin/hadoop jar share/hadoop/mapreduce/hadoop-mapreduce-examples-2.8.1.jar grep input output 'dfs[a-z.]+'
 ```
 
-### monitor cluster in browser
+## Browse the web UI
+You can expose the ports in the script, but I'd rather not, since the slaves would occupy the same ports.
 
-YARN: hadoop-master:8088
+To access the web UI, deploy another (socks5) proxy to route the traffic.
+
+If you don't have one, try [newnius/docker-proxy](https://hub.docker.com/r/newnius/docker-proxy/), which is rather easy to use.
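+
+A minimal sketch of such a proxy service (the published port 7001 is an assumption here; check the image docs for the actual one):
+
+```bash
+docker service create \
+  --name proxy \
+  --network swarm-net \
+  --replicas 1 \
+  --detach=true \
+  --publish 7001:7001 \
+  newnius/docker-proxy
+```
+
+Point your browser at the Docker host's port 7001 as a SOCKS5 proxy, so that names like `hadoop-master` resolve inside `swarm-net`.
+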
-HDFS: hadoop-master:50070
 
-_Proxy needed: newnius/docker-proxy_
+Visit [hadoop-master:8088](hadoop-master:8088) for the YARN web UI.
+
+Visit [hadoop-master:50070](hadoop-master:50070) for the HDFS web UI.
+
+## Custom configuration
+
+To persist data or modify the conf files, refer to the following script.
+
+The `/config/hadoop` path is where replacement conf files go; you don't have to put all the files, only the ones you want to override.
+
+```bash
+docker service create \
+  --name hadoop-master \
+  --hostname hadoop-master \
+  --network swarm-net \
+  --replicas 1 \
+  --detach=true \
+  --endpoint-mode dnsrr \
+  --mount type=bind,source=/etc/localtime,target=/etc/localtime \
+  --mount type=bind,source=/data/hadoop/config,target=/config/hadoop \
+  --mount type=bind,source=/data/hadoop/hdfs/master,target=/tmp/hadoop-root \
+  --mount type=bind,source=/data/hadoop/logs/master,target=/usr/local/hadoop/logs \
+  newnius/hadoop:2.8.1
+```
diff --git a/hadoop/2.8.1/bootstrap.sh b/hadoop/2.8.1/bootstrap.sh
index 42f4f75..4289858 100755
--- a/hadoop/2.8.1/bootstrap.sh
+++ b/hadoop/2.8.1/bootstrap.sh
@@ -9,8 +9,12 @@ rm /tmp/*.pid
 # installing libraries if any - (resource urls added comma separated to the ACP system variable)
 cd $HADOOP_PREFIX/share/hadoop/common ; for cp in ${ACP//,/ }; do echo == $cp; curl -LO $cp ; done; cd -
 
-# replace config
-cp /mnt/hadoop-config/* $HADOOP_PREFIX/etc/hadoop/
+## replace config
+: ${EXTRA_CONF_DIR:=/config/hadoop}
+
+if [ -d "$EXTRA_CONF_DIR" ]; then
+  cp $EXTRA_CONF_DIR/* $HADOOP_PREFIX/etc/hadoop/
+fi
 
 /usr/sbin/sshd
diff --git a/hadoop/2.8.4/Dockerfile b/hadoop/2.8.4/Dockerfile
new file mode 100644
index 0000000..cab1c11
--- /dev/null
+++ b/hadoop/2.8.4/Dockerfile
@@ -0,0 +1,59 @@
+FROM alpine:3.8
+
+MAINTAINER Newnius
+
+USER root
+
+# Prerequisites
+RUN apk add --no-cache openssh openssl openjdk8-jre rsync bash procps
+
+ENV JAVA_HOME /usr/lib/jvm/java-1.8-openjdk
+ENV PATH $PATH:$JAVA_HOME/bin
+
+# Passwordless SSH
+RUN ssh-keygen -q -N "" -t dsa -f /etc/ssh/ssh_host_dsa_key
+RUN ssh-keygen -q -N "" -t rsa -f /etc/ssh/ssh_host_rsa_key
+RUN ssh-keygen -q -N "" -t rsa -f /root/.ssh/id_rsa
+RUN cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys
+
+ADD ssh_config /root/.ssh/config
+RUN chmod 600 /root/.ssh/config
+RUN chown root:root /root/.ssh/config
+
+RUN echo "Port 2122" >> /etc/ssh/sshd_config
+
+# Install Hadoop
+ENV HADOOP_VER 2.8.4
+
+RUN wget -O hadoop.tar.gz https://archive.apache.org/dist/hadoop/common/hadoop-$HADOOP_VER/hadoop-$HADOOP_VER.tar.gz && \
+  tar -xzf hadoop.tar.gz -C /usr/local/ && rm hadoop.tar.gz
+
+RUN ln -s /usr/local/hadoop-$HADOOP_VER /usr/local/hadoop
+
+ENV HADOOP_HOME /usr/local/hadoop
+ENV PATH $PATH:$HADOOP_HOME/bin:$HADOOP_HOME/sbin
+
+ENV HADOOP_PREFIX $HADOOP_HOME
+ENV HADOOP_COMMON_HOME $HADOOP_HOME
+ENV HADOOP_HDFS_HOME $HADOOP_HOME
+ENV HADOOP_MAPRED_HOME $HADOOP_HOME
+ENV HADOOP_YARN_HOME $HADOOP_HOME
+ENV HADOOP_CONF_DIR $HADOOP_HOME/etc/hadoop
+ENV YARN_CONF_DIR $HADOOP_PREFIX/etc/hadoop
+
+# Default Conf Files
+ADD core-site.xml $HADOOP_HOME/etc/hadoop/core-site.xml
+ADD hdfs-site.xml $HADOOP_HOME/etc/hadoop/hdfs-site.xml
+ADD mapred-site.xml $HADOOP_HOME/etc/hadoop/mapred-site.xml
+ADD yarn-site.xml $HADOOP_HOME/etc/hadoop/yarn-site.xml
+ADD slaves $HADOOP_HOME/etc/hadoop/slaves
+
+RUN sed -i "/^export JAVA_HOME/ s:.*:export JAVA_HOME=${JAVA_HOME}\nexport HADOOP_HOME=${HADOOP_HOME}\nexport HADOOP_PREFIX=${HADOOP_PREFIX}:" ${HADOOP_HOME}/etc/hadoop/hadoop-env.sh
+
+RUN sed -i '/^export HADOOP_CONF_DIR/ s:.*:export HADOOP_CONF_DIR=$HADOOP_PREFIX/etc/hadoop/:' $HADOOP_PREFIX/etc/hadoop/hadoop-env.sh
+
+WORKDIR $HADOOP_HOME
+
+ADD bootstrap.sh /etc/bootstrap.sh
+
+CMD ["/etc/bootstrap.sh", "-d"]
diff --git a/hadoop/2.8.4/README.md b/hadoop/2.8.4/README.md
new file mode 100644
index 0000000..54fbcba
--- /dev/null
+++ b/hadoop/2.8.4/README.md
@@ -0,0 +1,115 @@
+# Deploy a Hadoop cluster with Docker
+
+## Start Master
+
+```bash
+docker service create \
+  --name hadoop-master \
+  --hostname hadoop-master \
+  --network swarm-net \
+  --replicas 1 \
+  --detach=true \
+  --endpoint-mode dnsrr \
+  --mount type=bind,source=/etc/localtime,target=/etc/localtime \
+  newnius/hadoop:2.8.4
+```
+
+## Start slaves
+
+```bash
+docker service create \
+  --name hadoop-slave1 \
+  --hostname hadoop-slave1 \
+  --network swarm-net \
+  --replicas 1 \
+  --detach=true \
+  --endpoint-mode dnsrr \
+  --mount type=bind,source=/etc/localtime,target=/etc/localtime \
+  newnius/hadoop:2.8.4
+```
+
+```bash
+docker service create \
+  --name hadoop-slave2 \
+  --network swarm-net \
+  --hostname hadoop-slave2 \
+  --replicas 1 \
+  --detach=true \
+  --endpoint-mode dnsrr \
+  --mount type=bind,source=/etc/localtime,target=/etc/localtime \
+  newnius/hadoop:2.8.4
+```
+
+```bash
+docker service create \
+  --name hadoop-slave3 \
+  --hostname hadoop-slave3 \
+  --network swarm-net \
+  --replicas 1 \
+  --detach=true \
+  --endpoint-mode dnsrr \
+  --mount type=bind,source=/etc/localtime,target=/etc/localtime \
+  newnius/hadoop:2.8.4
+```
+
+## Init for the first time
+
+#### Format HDFS first
+Run these commands on the master node.
+
+```bash
+# stop HDFS services
+sbin/stop-dfs.sh
+
+# format HDFS meta data
+bin/hadoop namenode -format
+
+# restart HDFS services
+sbin/start-dfs.sh
+```
+
+## Run a test job
+To make sure you have successfully set up the Hadoop cluster, run the following commands and check that the job executes correctly.
+
+```bash
+# prepare input data
+bin/hadoop dfs -mkdir -p /user/root/input
+
+# copy files to input path
+bin/hadoop dfs -put etc/hadoop/* /user/root/input
+
+# submit the job
+bin/hadoop jar share/hadoop/mapreduce/hadoop-mapreduce-examples-2.8.4.jar grep input output 'dfs[a-z.]+'
+```
+
+## Browse the web UI
+You can expose the ports in the script, but I'd rather not, since the slaves would occupy the same ports.
+
+To access the web UI, deploy another (socks5) proxy to route the traffic.
+
+If you don't have one, try [newnius/docker-proxy](https://hub.docker.com/r/newnius/docker-proxy/), which is rather easy to use.
+
+Visit [hadoop-master:8088](hadoop-master:8088) for the YARN web UI.
+
+Visit [hadoop-master:50070](hadoop-master:50070) for the HDFS web UI.
+
+## Custom configuration
+
+To persist data or modify the conf files, refer to the following script.
+
+The `/config/hadoop` path is where replacement conf files go; you don't have to put all the files, only the ones you want to override.
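+
+For example, to override just the HDFS settings, drop an `hdfs-site.xml` like this sketch into the host directory mounted at `/config/hadoop`. Note that the bootstrap script copies the file over the shipped one as a whole, so keep every property from the default file that you still need:
+
+```xml
+<?xml version="1.0"?>
+<configuration>
+	<!-- assumed example: lower the replication factor from the shipped 3 -->
+	<property>
+		<name>dfs.replication</name>
+		<value>2</value>
+	</property>
+</configuration>
+```
+
+Then start the service with the mounts shown below.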
+
+```bash
+docker service create \
+  --name hadoop-master \
+  --hostname hadoop-master \
+  --network swarm-net \
+  --replicas 1 \
+  --detach=true \
+  --endpoint-mode dnsrr \
+  --mount type=bind,source=/etc/localtime,target=/etc/localtime \
+  --mount type=bind,source=/data/hadoop/config,target=/config/hadoop \
+  --mount type=bind,source=/data/hadoop/hdfs/master,target=/tmp/hadoop-root \
+  --mount type=bind,source=/data/hadoop/logs/master,target=/usr/local/hadoop/logs \
+  newnius/hadoop:2.8.4
+```
diff --git a/hadoop/2.8.4/bootstrap.sh b/hadoop/2.8.4/bootstrap.sh
new file mode 100755
index 0000000..4289858
--- /dev/null
+++ b/hadoop/2.8.4/bootstrap.sh
@@ -0,0 +1,35 @@
+#!/bin/bash
+
+: ${HADOOP_PREFIX:=/usr/local/hadoop}
+
+$HADOOP_PREFIX/etc/hadoop/hadoop-env.sh
+
+rm /tmp/*.pid
+
+# installing libraries if any - (resource urls added comma separated to the ACP system variable)
+cd $HADOOP_PREFIX/share/hadoop/common ; for cp in ${ACP//,/ }; do echo == $cp; curl -LO $cp ; done; cd -
+
+## replace config
+: ${EXTRA_CONF_DIR:=/config/hadoop}
+
+if [ -d "$EXTRA_CONF_DIR" ]; then
+  cp $EXTRA_CONF_DIR/* $HADOOP_PREFIX/etc/hadoop/
+fi
+
+/usr/sbin/sshd
+
+## stop all in case master starts far behind
+$HADOOP_PREFIX/sbin/stop-yarn.sh
+$HADOOP_PREFIX/sbin/stop-dfs.sh
+
+$HADOOP_PREFIX/sbin/start-dfs.sh
+$HADOOP_PREFIX/sbin/start-yarn.sh
+$HADOOP_PREFIX/sbin/mr-jobhistory-daemon.sh start historyserver
+
+if [[ $1 == "-d" ]]; then
+  while true; do sleep 1000; done
+fi
+
+if [[ $1 == "-bash" ]]; then
+  /bin/bash
+fi
diff --git a/hadoop/2.8.4/core-site.xml b/hadoop/2.8.4/core-site.xml
new file mode 100644
index 0000000..7bd76e9
--- /dev/null
+++ b/hadoop/2.8.4/core-site.xml
@@ -0,0 +1,28 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
+<!--
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License. See accompanying LICENSE file.
+-->
+
+<configuration>
+	<property>
+		<name>fs.defaultFS</name>
+		<value>hdfs://hadoop-master:8020</value>
+	</property>
+	<property>
+		<name>fs.default.name</name>
+		<value>hdfs://hadoop-master:8020</value>
+	</property>
+</configuration>
diff --git a/hadoop/2.8.4/hdfs-site.xml b/hadoop/2.8.4/hdfs-site.xml
new file mode 100644
index 0000000..ba3613a
--- /dev/null
+++ b/hadoop/2.8.4/hdfs-site.xml
@@ -0,0 +1,40 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
+<!--
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License. See accompanying LICENSE file.
+-->
+
+<configuration>
+	<property>
+		<name>dfs.permissions</name>
+		<value>false</value>
+	</property>
+	<property>
+		<name>dfs.namenode.secondary.http-address</name>
+		<value>hadoop-slave1:50090</value>
+	</property>
+	<property>
+		<name>dfs.namenode.http-address</name>
+		<value>hadoop-master:50070</value>
+	</property>
+	<property>
+		<name>dfs.datanode.max.transfer.threads</name>
+		<value>8192</value>
+	</property>
+	<property>
+		<name>dfs.replication</name>
+		<value>3</value>
+	</property>
+</configuration>
diff --git a/hadoop/2.8.4/mapred-site.xml b/hadoop/2.8.4/mapred-site.xml
new file mode 100644
index 0000000..00fc7b8
--- /dev/null
+++ b/hadoop/2.8.4/mapred-site.xml
@@ -0,0 +1,32 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
+<!--
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License. See accompanying LICENSE file.
+-->
+
+<configuration>
+	<property>
+		<name>mapreduce.framework.name</name>
+		<value>yarn</value>
+	</property>
+	<property>
+		<name>mapreduce.jobhistory.address</name>
+		<value>hadoop-master:10020</value>
+	</property>
+	<property>
+		<name>mapreduce.jobhistory.webapp.address</name>
+		<value>hadoop-master:19888</value>
+	</property>
+</configuration>
diff --git a/hadoop/2.8.4/slaves b/hadoop/2.8.4/slaves
new file mode 100644
index 0000000..3bb91be
--- /dev/null
+++ b/hadoop/2.8.4/slaves
@@ -0,0 +1,3 @@
+hadoop-slave1
+hadoop-slave2
+hadoop-slave3
diff --git a/hadoop/2.8.4/ssh_config b/hadoop/2.8.4/ssh_config
new file mode 100644
index 0000000..535f9d3
--- /dev/null
+++ b/hadoop/2.8.4/ssh_config
@@ -0,0 +1,5 @@
+Host *
+  UserKnownHostsFile /dev/null
+  StrictHostKeyChecking no
+  LogLevel quiet
+  Port 2122
diff --git a/hadoop/2.8.4/yarn-site.xml b/hadoop/2.8.4/yarn-site.xml
new file mode 100644
index 0000000..c3fec7b
--- /dev/null
+++ b/hadoop/2.8.4/yarn-site.xml
@@ -0,0 +1,49 @@
+<?xml version="1.0"?>
+<!--
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License. See accompanying LICENSE file.
+-->
+
+<configuration>
+	<property>
+		<name>yarn.application.classpath</name>
+		<value>/usr/local/hadoop/etc/hadoop, /usr/local/hadoop/share/hadoop/common/*, /usr/local/hadoop/share/hadoop/common/lib/*, /usr/local/hadoop/share/hadoop/hdfs/*, /usr/local/hadoop/share/hadoop/hdfs/lib/*, /usr/local/hadoop/share/hadoop/mapreduce/*, /usr/local/hadoop/share/hadoop/mapreduce/lib/*, /usr/local/hadoop/share/hadoop/yarn/*, /usr/local/hadoop/share/hadoop/yarn/lib/*</value>
+	</property>
+	<property>
+		<name>yarn.resourcemanager.hostname</name>
+		<value>hadoop-master</value>
+	</property>
+	<property>
+		<name>yarn.nodemanager.aux-services</name>
+		<value>mapreduce_shuffle</value>
+	</property>
+	<property>
+		<name>yarn.log-aggregation-enable</name>
+		<value>true</value>
+	</property>
+	<property>
+		<name>yarn.log-aggregation.retain-seconds</name>
+		<value>604800</value>
+	</property>
+	<property>
+		<name>yarn.nodemanager.resource.memory-mb</name>
+		<value>2048</value>
+	</property>
+	<property>
+		<name>yarn.nodemanager.resource.cpu-vcores</name>
+		<value>2</value>
+	</property>
+	<property>
+		<name>yarn.scheduler.minimum-allocation-mb</name>
+		<value>1024</value>
+	</property>
+</configuration>
diff --git a/hadoop/2.9.1/Dockerfile b/hadoop/2.9.1/Dockerfile
index 7610cc0..b6341b7 100644
--- a/hadoop/2.9.1/Dockerfile
+++ b/hadoop/2.9.1/Dockerfile
@@ -1,4 +1,4 @@
-FROM alpine:3.6
+FROM alpine:3.8
 
 MAINTAINER Newnius
 
@@ -25,7 +25,7 @@ RUN echo "Port 2122" >> /etc/ssh/sshd_config
 # Install Hadoop
 ENV HADOOP_VER 2.9.1
 
-RUN wget -O hadoop.tar.gz http://archive.apache.org/dist/hadoop/common/hadoop-$HADOOP_VER/hadoop-$HADOOP_VER.tar.gz && \
+RUN wget -O hadoop.tar.gz https://archive.apache.org/dist/hadoop/common/hadoop-$HADOOP_VER/hadoop-$HADOOP_VER.tar.gz && \
   tar -xzf hadoop.tar.gz -C /usr/local/ && rm hadoop.tar.gz
 
 RUN ln -s /usr/local/hadoop-$HADOOP_VER /usr/local/hadoop