From b27aa6080f82f70d51239d892afd6c9834f4ff71 Mon Sep 17 00:00:00 2001
From: Newnius
Date: Mon, 6 Aug 2018 16:57:42 +0800
Subject: [PATCH] add hadoop:2.9.1

---
 hadoop/2.9.1/Dockerfile      |  59 ++++++++++++++++++
 hadoop/2.9.1/README.md       | 115 +++++++++++++++++++++++++++++++++++
 hadoop/2.9.1/bootstrap.sh    |  35 +++++++++++
 hadoop/2.9.1/core-site.xml   |  28 +++++++++
 hadoop/2.9.1/hdfs-site.xml   |  40 ++++++++++++
 hadoop/2.9.1/mapred-site.xml |  32 ++++++++++
 hadoop/2.9.1/slaves          |   3 +
 hadoop/2.9.1/ssh_config      |   5 ++
 hadoop/2.9.1/yarn-site.xml   |  49 +++++++++++++++
 9 files changed, 366 insertions(+)
 create mode 100644 hadoop/2.9.1/Dockerfile
 create mode 100644 hadoop/2.9.1/README.md
 create mode 100755 hadoop/2.9.1/bootstrap.sh
 create mode 100644 hadoop/2.9.1/core-site.xml
 create mode 100644 hadoop/2.9.1/hdfs-site.xml
 create mode 100644 hadoop/2.9.1/mapred-site.xml
 create mode 100644 hadoop/2.9.1/slaves
 create mode 100644 hadoop/2.9.1/ssh_config
 create mode 100644 hadoop/2.9.1/yarn-site.xml

diff --git a/hadoop/2.9.1/Dockerfile b/hadoop/2.9.1/Dockerfile
new file mode 100644
index 0000000..cd15c86
--- /dev/null
+++ b/hadoop/2.9.1/Dockerfile
@@ -0,0 +1,59 @@
+# Single-image Hadoop 2.9.1 node (master or slave) on Alpine.
+FROM alpine:3.6
+
+# MAINTAINER is deprecated (hadolint DL4000); LABEL is the modern equivalent.
+LABEL maintainer="Newnius"
+
+USER root
+
+# Prerequisites: JRE for Hadoop, sshd for start-dfs/start-yarn remote control,
+# rsync/bash/procps used by the Hadoop sbin scripts.
+RUN apk add --no-cache openssh openssl openjdk8-jre rsync bash procps
+
+ENV JAVA_HOME=/usr/lib/jvm/java-1.8-openjdk
+ENV PATH=$PATH:$JAVA_HOME/bin
+
+# Passwordless SSH: host keys plus a root key pair authorized for itself,
+# generated in one layer instead of four.
+RUN ssh-keygen -q -N "" -t dsa -f /etc/ssh/ssh_host_dsa_key && \
+    ssh-keygen -q -N "" -t rsa -f /etc/ssh/ssh_host_rsa_key && \
+    ssh-keygen -q -N "" -t rsa -f /root/.ssh/id_rsa && \
+    cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys
+
+# COPY preferred over ADD for plain local files (hadolint DL3020).
+COPY ssh_config /root/.ssh/config
+RUN chmod 600 /root/.ssh/config && chown root:root /root/.ssh/config
+
+# Non-default SSH port so the containers do not clash with host sshd.
+RUN echo "Port 2122" >> /etc/ssh/sshd_config
+
+# Install Hadoop
+ENV HADOOP_VER=2.9.1
+
+# Download, unpack and delete the tarball in the same layer to keep the image small.
+RUN wget -O hadoop.tar.gz https://archive.apache.org/dist/hadoop/common/hadoop-$HADOOP_VER/hadoop-$HADOOP_VER.tar.gz && \
+    tar -xzf hadoop.tar.gz -C /usr/local/ && rm hadoop.tar.gz
+
+RUN ln -s /usr/local/hadoop-$HADOOP_VER /usr/local/hadoop
+
+ENV HADOOP_HOME=/usr/local/hadoop
+ENV PATH=$PATH:$HADOOP_HOME/bin:$HADOOP_HOME/sbin
+
+ENV HADOOP_PREFIX=$HADOOP_HOME
+ENV HADOOP_COMMON_HOME=$HADOOP_HOME
+ENV HADOOP_HDFS_HOME=$HADOOP_HOME
+ENV HADOOP_MAPRED_HOME=$HADOOP_HOME
+ENV HADOOP_YARN_HOME=$HADOOP_HOME
+ENV HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop
+ENV YARN_CONF_DIR=$HADOOP_PREFIX/etc/hadoop
+
+# Default Conf Files
+COPY core-site.xml $HADOOP_HOME/etc/hadoop/core-site.xml
+COPY hdfs-site.xml $HADOOP_HOME/etc/hadoop/hdfs-site.xml
+COPY mapred-site.xml $HADOOP_HOME/etc/hadoop/mapred-site.xml
+COPY yarn-site.xml $HADOOP_HOME/etc/hadoop/yarn-site.xml
+COPY slaves $HADOOP_HOME/etc/hadoop/slaves
+
+# Bake JAVA_HOME / HADOOP_HOME into hadoop-env.sh since sshd sessions do not
+# inherit the image ENV.
+RUN sed -i "/^export JAVA_HOME/ s:.*:export JAVA_HOME=${JAVA_HOME}\nexport HADOOP_HOME=${HADOOP_HOME}\nexport HADOOP_PREFIX=${HADOOP_PREFIX}:" ${HADOOP_HOME}/etc/hadoop/hadoop-env.sh
+
+RUN sed -i '/^export HADOOP_CONF_DIR/ s:.*:export HADOOP_CONF_DIR=$HADOOP_PREFIX/etc/hadoop/:' $HADOOP_PREFIX/etc/hadoop/hadoop-env.sh
+
+WORKDIR $HADOOP_HOME
+
+COPY bootstrap.sh /etc/bootstrap.sh
+
+# Documentation only: YARN UI 8088, HDFS NameNode UI 50070, SSH 2122.
+EXPOSE 8088 50070 2122
+
+CMD ["/etc/bootstrap.sh", "-d"]
diff --git a/hadoop/2.9.1/README.md b/hadoop/2.9.1/README.md
new file mode 100644
index 0000000..7cb8d25
--- /dev/null
+++ b/hadoop/2.9.1/README.md
@@ -0,0 +1,115 @@
+# Deploy one Hadoop Cluster with docker
+
+## Start Master
+
+```bash
+docker service create \
+  --name hadoop-master \
+  --hostname hadoop-master \
+  --network swarm-net \
+  --replicas 1 \
+  --detach=true \
+  --endpoint-mode dnsrr \
+  --mount type=bind,source=/etc/localtime,target=/etc/localtime \
+  newnius/hadoop:2.9.1
+```
+
+## Start slaves
+
+```bash
+docker service create \
+  --name hadoop-slave1 \
+  --hostname hadoop-slave1 \
+  --network swarm-net \
+  --replicas 1 \
+  --detach=true \
+  --endpoint-mode dnsrr \
+  --mount type=bind,source=/etc/localtime,target=/etc/localtime \
+  newnius/hadoop:2.9.1
+```
+
+```bash
+docker service create \
+  --name hadoop-slave2 \
+
--network swarm-net \ + --hostname hadoop-slave2 \ + --replicas 1 \ + --detach=true \ + --endpoint-mode dnsrr \ + --mount type=bind,source=/etc/localtime,target=/etc/localtime \ + newnius/hadoop:2.9.1 +``` + +```bash +docker service create \ + --name hadoop-slave3 \ + --hostname hadoop-slave3 \ + --network swarm-net \ + --replicas 1 \ + --detach=true \ + --endpoint-mode dnsrr \ + --mount type=bind,source=/etc/localtime,target=/etc/localtime \ + newnius/hadoop:2.9.1 +``` + +## Init for the first time + +#### format dfs first +Run these commands on the master node. + +```bash +# stop HDFS services +sbin/stop-dfs.sh + +# format HDFS meta data +bin/hadoop namenode -format + +# restart HDFS services +sbin/start-dfs.sh +``` + +## Run a test job +To make sure you have successfully set up the Hadoop cluster, just run the following commands to see if it is executed well. + +```bash +# prepare input data +bin/hadoop dfs -mkdir -p /user/root/input + +# copy files to input path +bin/hadoop dfs -put etc/hadoop/* /user/root/input + +# submit the job +bin/hadoop jar share/hadoop/mapreduce/hadoop-mapreduce-examples-2.9.1.jar grep input output 'dfs[a-z.]+' +``` + +## Browse the web UI +You can expose the ports in the script, but I'd rather not since the slaves should occupy the same ports. + +To access the web UI, deploy another (socks5) proxy to route the traffic. + +If you don't have one, try [newnius/docker-proxy](https://hub.docker.com/r/newnius/docker-proxy/), it is rather easy to use. + +Visit [hadoop-master:8088](hadoop-master:8088) for YARN pages. + +Visit [hadoop-master:50070](hadoop-master:50070) for HDFS (NameNode) pages. + +## Custom configuration + +To persist data or modify the conf files, refer to the following script. + +The `/config/hadoop` path is where new conf files are placed; you don't have to put all the files.
+
+```bash
+docker service create \
+  --name hadoop-master \
+  --hostname hadoop-master \
+  --network swarm-net \
+  --replicas 1 \
+  --detach=true \
+  --endpoint-mode dnsrr \
+  --mount type=bind,source=/etc/localtime,target=/etc/localtime \
+  --mount type=bind,source=/data/hadoop/config,target=/config/hadoop \
+  --mount type=bind,source=/data/hadoop/hdfs/master,target=/tmp/hadoop-root \
+  --mount type=bind,source=/data/hadoop/logs/master,target=/usr/local/hadoop/logs \
+  newnius/hadoop:2.9.1
+```
diff --git a/hadoop/2.9.1/bootstrap.sh b/hadoop/2.9.1/bootstrap.sh
new file mode 100755
index 0000000..4289858
--- /dev/null
+++ b/hadoop/2.9.1/bootstrap.sh
@@ -0,0 +1,35 @@
+#!/bin/bash
+# Container entry point: overlay extra config if mounted, start sshd,
+# then (re)start HDFS, YARN and the MapReduce job-history server.
+
+: ${HADOOP_PREFIX:=/usr/local/hadoop}
+
+# Source (not execute) hadoop-env.sh so its exports land in THIS shell;
+# running it as a child process would silently discard them.
+. $HADOOP_PREFIX/etc/hadoop/hadoop-env.sh
+
+# -f: a fresh container has no stale pid files; plain rm would error out.
+rm -f /tmp/*.pid
+
+# installing libraries if any - (resource urls added comma separated to the ACP system variable)
+cd $HADOOP_PREFIX/share/hadoop/common ; for cp in ${ACP//,/ }; do echo == $cp; curl -LO $cp ; done; cd -
+
+## replace config: any files bind-mounted under /config/hadoop override the defaults
+: ${EXTRA_CONF_DIR:=/config/hadoop}
+
+if [ -d "$EXTRA_CONF_DIR" ]; then
+	cp $EXTRA_CONF_DIR/* $HADOOP_PREFIX/etc/hadoop/
+fi
+
+# sshd is required by the start/stop-*.sh scripts to reach the slaves.
+/usr/sbin/sshd
+
+## stop all in case master starts far behind
+$HADOOP_PREFIX/sbin/stop-yarn.sh
+$HADOOP_PREFIX/sbin/stop-dfs.sh
+
+$HADOOP_PREFIX/sbin/start-dfs.sh
+$HADOOP_PREFIX/sbin/start-yarn.sh
+$HADOOP_PREFIX/sbin/mr-jobhistory-daemon.sh start historyserver
+
+# -d: keep the container alive (daemons run in the background).
+if [[ $1 == "-d" ]]; then
+	while true; do sleep 1000; done
+fi
+
+if [[ $1 == "-bash" ]]; then
+	/bin/bash
+fi
diff --git a/hadoop/2.9.1/core-site.xml b/hadoop/2.9.1/core-site.xml
new file mode 100644
index 0000000..7bd76e9
--- /dev/null
+++ b/hadoop/2.9.1/core-site.xml
@@ -0,0 +1,28 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
+<configuration>
+    <property>
+        <name>fs.defaultFS</name>
+        <value>hdfs://hadoop-master:8020</value>
+    </property>
+    <property>
+        <name>fs.default.name</name>
+        <value>hdfs://hadoop-master:8020</value>
+    </property>
+</configuration>
diff --git a/hadoop/2.9.1/hdfs-site.xml b/hadoop/2.9.1/hdfs-site.xml
new file mode 100644
index 0000000..ba3613a
--- /dev/null
+++ b/hadoop/2.9.1/hdfs-site.xml
@@ -0,0 +1,40 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
+<configuration>
+    <property>
+        <name>dfs.permissions</name>
+        <value>false</value>
+    </property>
+    <property>
+        <name>dfs.namenode.secondary.http-address</name>
+        <value>hadoop-slave1:50090</value>
+    </property>
+    <property>
+        <name>dfs.namenode.http-address</name>
+        <value>hadoop-master:50070</value>
+    </property>
+    <property>
+        <name>dfs.datanode.max.transfer.threads</name>
+        <value>8192</value>
+    </property>
+    <property>
+        <name>dfs.replication</name>
+        <value>3</value>
+    </property>
+</configuration>
diff --git a/hadoop/2.9.1/mapred-site.xml b/hadoop/2.9.1/mapred-site.xml
new file mode 100644
index 0000000..00fc7b8
--- /dev/null
+++ b/hadoop/2.9.1/mapred-site.xml
@@ -0,0 +1,32 @@
+<?xml version="1.0"?>
+<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
+<configuration>
+    <property>
+        <name>mapreduce.framework.name</name>
+        <value>yarn</value>
+    </property>
+    <property>
+        <name>mapreduce.jobhistory.address</name>
+        <value>hadoop-master:10020</value>
+    </property>
+    <property>
+        <name>mapreduce.jobhistory.webapp.address</name>
+        <value>hadoop-master:19888</value>
+    </property>
+</configuration>
diff --git a/hadoop/2.9.1/slaves b/hadoop/2.9.1/slaves
new file mode 100644
index 0000000..3bb91be
--- /dev/null
+++ b/hadoop/2.9.1/slaves
@@ -0,0 +1,3 @@
+hadoop-slave1
+hadoop-slave2
+hadoop-slave3
diff --git a/hadoop/2.9.1/ssh_config b/hadoop/2.9.1/ssh_config
new file mode 100644
index 0000000..535f9d3
--- /dev/null
+++ b/hadoop/2.9.1/ssh_config
@@ -0,0 +1,5 @@
+Host *
+  UserKnownHostsFile /dev/null
+  StrictHostKeyChecking no
+  LogLevel quiet
+  Port 2122
diff --git a/hadoop/2.9.1/yarn-site.xml b/hadoop/2.9.1/yarn-site.xml
new file mode 100644
index 0000000..c3fec7b
--- /dev/null
+++ b/hadoop/2.9.1/yarn-site.xml
@@ -0,0 +1,49 @@
+<?xml version="1.0"?>
+<configuration>
+    <property>
+        <name>yarn.application.classpath</name>
+        <value>/usr/local/hadoop/etc/hadoop, /usr/local/hadoop/share/hadoop/common/*, /usr/local/hadoop/share/hadoop/common/lib/*, /usr/local/hadoop/share/hadoop/hdfs/*, /usr/local/hadoop/share/hadoop/hdfs/lib/*, /usr/local/hadoop/share/hadoop/mapreduce/*, /usr/local/hadoop/share/hadoop/mapreduce/lib/*, /usr/local/hadoop/share/hadoop/yarn/*, /usr/local/hadoop/share/hadoop/yarn/lib/*</value>
+    </property>
+    <property>
+        <name>yarn.resourcemanager.hostname</name>
+        <value>hadoop-master</value>
+    </property>
+    <property>
+        <name>yarn.nodemanager.aux-services</name>
+        <value>mapreduce_shuffle</value>
+    </property>
+    <property>
+        <name>yarn.log-aggregation-enable</name>
+        <value>true</value>
+    </property>
+    <property>
+        <name>yarn.log-aggregation.retain-seconds</name>
+        <value>604800</value>
+    </property>
+    <property>
+        <name>yarn.nodemanager.resource.memory-mb</name>
+        <value>2048</value>
+    </property>
+    <property>
+        <name>yarn.nodemanager.resource.cpu-vcores</name>
+        <value>2</value>
+    </property>
+    <property>
+        <name>yarn.scheduler.minimum-allocation-mb</name>
+        <value>1024</value>
+    </property>
+</configuration>