diff --git a/hadoop/2.7.4/Dockerfile b/hadoop/2.7.4/Dockerfile
index bd78419..e26c6a3 100644
--- a/hadoop/2.7.4/Dockerfile
+++ b/hadoop/2.7.4/Dockerfile
@@ -23,10 +23,12 @@ RUN chown root:root /root/.ssh/config
 RUN echo "Port 2122" >> /etc/ssh/sshd_config
 
 # Install Hadoop
-RUN wget -O hadoop.tar.gz https://archive.apache.org/dist/hadoop/common/hadoop-2.7.4/hadoop-2.7.4.tar.gz && \
-tar -xzf hadoop.tar.gz -C /usr/local/ && rm hadoop.tar.gz
+ENV HADOOP_VER 2.7.4
 
-RUN ln -s /usr/local/hadoop-2.7.4 /usr/local/hadoop
+RUN wget -O hadoop.tar.gz https://archive.apache.org/dist/hadoop/common/hadoop-$HADOOP_VER/hadoop-$HADOOP_VER.tar.gz && \
+  tar -xzf hadoop.tar.gz -C /usr/local/ && rm hadoop.tar.gz
+
+RUN ln -s /usr/local/hadoop-$HADOOP_VER /usr/local/hadoop
 
 ENV HADOOP_HOME /usr/local/hadoop
 ENV PATH $PATH:$HADOOP_HOME/bin:$HADOOP_HOME/sbin
diff --git a/hadoop/2.7.4/README.md b/hadoop/2.7.4/README.md
index 89ef602..17f7b37 100644
--- a/hadoop/2.7.4/README.md
+++ b/hadoop/2.7.4/README.md
@@ -1,85 +1,115 @@
-# based on alpine
+# Deploy a Hadoop Cluster with Docker
 
-## Create a hadoop cluster in swarm mode
-
-`--hostname` needs docker 1.13 or higher
+## Start the master
 
 ```bash
 docker service create \
---name hadoop-master \
---hostname hadoop-master \
---network swarm-net \
---replicas 1 \
---endpoint-mode dnsrr \
-newnius/hadoop
+  --name hadoop-master \
+  --hostname hadoop-master \
+  --network swarm-net \
+  --replicas 1 \
+  --detach=true \
+  --endpoint-mode dnsrr \
+  --mount type=bind,source=/etc/localtime,target=/etc/localtime \
+  newnius/hadoop:2.7.4
+```
+
+## Start the slaves
+
+```bash
+docker service create \
+  --name hadoop-slave1 \
+  --hostname hadoop-slave1 \
+  --network swarm-net \
+  --replicas 1 \
+  --detach=true \
+  --endpoint-mode dnsrr \
+  --mount type=bind,source=/etc/localtime,target=/etc/localtime \
+  newnius/hadoop:2.7.4
 ```
 
 ```bash
 docker service create \
---name hadoop-slave1 \
---hostname hadoop-slave1 \
---network swarm-net \
---replicas 1 \
---endpoint-mode dnsrr \
-newnius/hadoop
+  --name hadoop-slave2 \
+  --hostname hadoop-slave2 \
+  --network swarm-net \
+  --replicas 1 \
+  --detach=true \
+  --endpoint-mode dnsrr \
+  --mount type=bind,source=/etc/localtime,target=/etc/localtime \
+  newnius/hadoop:2.7.4
 ```
 
 ```bash
 docker service create \
---name hadoop-slave2 \
---hostname hadoop-slave2 \
---network swarm-net \
---replicas 1 \
---endpoint-mode dnsrr \
-newnius/hadoop
+  --name hadoop-slave3 \
+  --hostname hadoop-slave3 \
+  --network swarm-net \
+  --replicas 1 \
+  --detach=true \
+  --endpoint-mode dnsrr \
+  --mount type=bind,source=/etc/localtime,target=/etc/localtime \
+  newnius/hadoop:2.7.4
 ```
 
+## Init for the first time
+
+### Format HDFS first
+Run these commands on the master node.
+
 ```bash
-docker service create \
---name hadoop-slave3 \
---hostname hadoop-slave3 \
---network swarm-net \
---replicas 1 \
---endpoint-mode dnsrr \
-newnius/hadoop
-```
-
-## Init && Test
-
-In the first deploy, format dfs first
-
-### stop cluster (in master)
-```bash
+# stop HDFS services
 sbin/stop-dfs.sh
-```
-### format hdfs (in master)
-```bash
+# format HDFS metadata
 bin/hadoop namenode -format
-```
-### start cluster (in master)
-```bash
+# restart HDFS services
 sbin/start-dfs.sh
 ```
 
-### Run a test job
+## Run a test job
+To make sure you have successfully set up the Hadoop cluster, run the following commands and check that the job executes well.
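+
+Optionally, confirm first that all three slaves have registered as DataNodes. This is only a sanity check, not part of the setup; `dfsadmin -report` ships with Hadoop 2.x (deprecated in favor of `bin/hdfs dfsadmin` but still functional), and you should see a line like `Live datanodes (3)` in its output:
+
+```bash
+# list the DataNodes currently known to the NameNode
+bin/hadoop dfsadmin -report
+```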
 ```bash
 # prepare input data
 bin/hadoop dfs -mkdir -p /user/root/input
+# copy files to the input path
 bin/hadoop dfs -put etc/hadoop/* /user/root/input
-```
-```bash
+
+# submit the job
 bin/hadoop jar share/hadoop/mapreduce/hadoop-mapreduce-examples-2.7.4.jar grep input output 'dfs[a-z.]+'
-
 ```
 
-### monitor cluster in browser
+## Browse the web UI
+You can expose the ports in the scripts above, but I'd rather not, since the slaves would all have to publish the same ports.
 
-YARN: hadoop-master:8088
+To access the web UI, deploy another (SOCKS5) proxy to route the traffic.
 
-HDFS: hadoop-master:50070
+If you don't have one, try [newnius/docker-proxy](https://hub.docker.com/r/newnius/docker-proxy/); it is rather easy to use.
 
-_Proxy needed: newnius/docker-proxy_
+Visit [hadoop-master:8088](hadoop-master:8088) for the YARN pages.
+
+Visit [hadoop-master:50070](hadoop-master:50070) for the HDFS pages.
+
+## Custom configuration
+
+To persist data or modify the conf files, refer to the following script.
+
+The `/config/hadoop` path is where replacement conf files go; you don't have to put all the files there, only the ones you want to override.
+
+```bash
+docker service create \
+  --name hadoop-master \
+  --hostname hadoop-master \
+  --network swarm-net \
+  --replicas 1 \
+  --detach=true \
+  --endpoint-mode dnsrr \
+  --mount type=bind,source=/etc/localtime,target=/etc/localtime \
+  --mount type=bind,source=/data/hadoop/config,target=/config/hadoop \
+  --mount type=bind,source=/data/hadoop/hdfs/master,target=/tmp/hadoop-root \
+  --mount type=bind,source=/data/hadoop/logs/master,target=/usr/local/hadoop/logs \
+  newnius/hadoop:2.7.4
+```
diff --git a/hadoop/2.7.4/bootstrap.sh b/hadoop/2.7.4/bootstrap.sh
index 42f4f75..4289858 100755
--- a/hadoop/2.7.4/bootstrap.sh
+++ b/hadoop/2.7.4/bootstrap.sh
@@ -9,8 +9,12 @@ rm /tmp/*.pid
 # installing libraries if any - (resource urls added comma separated to the ACP system variable)
 cd $HADOOP_PREFIX/share/hadoop/common ; for cp in ${ACP//,/ }; do echo == $cp; curl -LO $cp ; done; cd -
 
-# replace config
-cp /mnt/hadoop-config/* $HADOOP_PREFIX/etc/hadoop/
+# replace config with any files provided in EXTRA_CONF_DIR
+: ${EXTRA_CONF_DIR:=/config/hadoop}
+
+if [ -d "$EXTRA_CONF_DIR" ]; then
+  cp $EXTRA_CONF_DIR/* $HADOOP_PREFIX/etc/hadoop/
+fi
 
 /usr/sbin/sshd
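
As a usage sketch of the new `EXTRA_CONF_DIR` hook (not part of this diff): drop only the conf files you want to override into the directory that is bind-mounted to `/config/hadoop`. The host path `/data/hadoop/config` follows the README example above, and the `hdfs://hadoop-master:8020` address is an assumption; use whatever NameNode address the image's stock `core-site.xml` declares.

```bash
# hypothetical override, created on the swarm node that runs the master;
# at startup, bootstrap.sh copies every file found here over $HADOOP_PREFIX/etc/hadoop/
mkdir -p /data/hadoop/config
cat > /data/hadoop/config/core-site.xml <<'EOF'
<?xml version="1.0"?>
<configuration>
  <property>
    <name>fs.defaultFS</name>
    <!-- assumption: adjust to the NameNode address the image actually uses -->
    <value>hdfs://hadoop-master:8020</value>
  </property>
</configuration>
EOF
```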