update spark

2026-05-07 10:35:08 +00:00 · 2018-08-07 20:04:47 +08:00
parent 0582327307
commit 94228e4944
13 changed files with 98 additions and 125 deletions
--- a/spark/2.2.1-yarn/Dockerfile
+++ b/spark/2.2.1-yarn/Dockerfile
@@ -0,0 +1,26 @@
+FROM newnius/hadoop:2.8.1
+
+MAINTAINER Newnius <newnius.cn@gmail.com>
+
+RUN wget http://mirrors.ocf.berkeley.edu/apache/spark/spark-2.2.1/spark-2.2.1-bin-hadoop2.7.tgz && \
+	tar -xvf spark-2.2.1-bin-hadoop2.7.tgz -C /usr/local && \
+	rm spark-2.2.1-bin-hadoop2.7.tgz
+
+RUN ln -s /usr/local/spark-2.2.1-bin-hadoop2.7 /usr/local/spark
+
+ENV SPARK_HOME /usr/local/spark
+
+ENV PATH $PATH:$SPARK_HOME/bin
+
+# Default Conf Files
+ADD core-site.xml $HADOOP_HOME/etc/hadoop/core-site.xml
+ADD hdfs-site.xml $HADOOP_HOME/etc/hadoop/hdfs-site.xml
+ADD mapred-site.xml $HADOOP_HOME/etc/hadoop/mapred-site.xml
+ADD yarn-site.xml $HADOOP_HOME/etc/hadoop/yarn-site.xml
+ADD slaves $HADOOP_HOME/etc/hadoop/slaves
+
+ADD bootstrap.sh /etc/bootstrap-spark.sh
+
+WORKDIR /usr/local/spark
+
+CMD ["/etc/bootstrap-spark.sh", "-d"]
--- a/spark/2.2.1-yarn/README.md
+++ b/spark/2.2.1-yarn/README.md
@@ -2,6 +2,8 @@

 ## Create a spark cluster in swarm mode

+`--hostname` requires Docker 1.13 or higher
+
 ```bash
 docker service create \
 --name spark-master \
--- a/spark/2.2.1-yarn/bootstrap.sh
+++ b/spark/2.2.1-yarn/bootstrap.sh
@@ -0,0 +1,6 @@
+#! /bin/bash
+# replace config
+
+cp /mnt/config/spark/* $SPARK_HOME/conf
+
+bash -c "/etc/bootstrap.sh -d"
--- a/spark/2.2.1-yarn/core-site.xml
+++ b/spark/2.2.1-yarn/core-site.xml
--- a/spark/2.2.1-yarn/hdfs-site.xml
+++ b/spark/2.2.1-yarn/hdfs-site.xml
--- a/spark/2.2.1-yarn/mapred-site.xml
+++ b/spark/2.2.1-yarn/mapred-site.xml
--- a/spark/2.2.1-yarn/slaves
+++ b/spark/2.2.1-yarn/slaves
--- a/spark/2.2.1-yarn/yarn-site.xml
+++ b/spark/2.2.1-yarn/yarn-site.xml
--- a/spark/2.2.1/Dockerfile
+++ b/spark/2.2.1/Dockerfile
@@ -1,26 +1,30 @@
-FROM newnius/hadoop:2.8.1
+FROM alpine:3.8

 MAINTAINER Newnius <newnius.cn@gmail.com>

-RUN wget http://mirrors.ocf.berkeley.edu/apache/spark/spark-2.2.1/spark-2.2.1-bin-hadoop2.7.tgz && \
-	tar -xvf spark-2.2.1-bin-hadoop2.7.tgz -C /usr/local && \
-	rm spark-2.2.1-bin-hadoop2.7.tgz
+USER root

-RUN ln -s /usr/local/spark-2.2.1-bin-hadoop2.7 /usr/local/spark
+# Prerequisites
+RUN apk add --no-cache openssh openssl openjdk8-jre rsync bash procps coreutils
+
+ENV JAVA_HOME /usr/lib/jvm/java-1.8-openjdk
+
+ENV PATH $PATH:$JAVA_HOME/bin
+
+ENV SPARK_VER 2.2.1
+
+RUN wget https://archive.apache.org/dist/spark/spark-$SPARK_VER/spark-$SPARK_VER-bin-hadoop2.7.tgz && \
+	tar -xvf spark-$SPARK_VER-bin-hadoop2.7.tgz -C /usr/local && \
+	rm spark-$SPARK_VER-bin-hadoop2.7.tgz
+
+RUN ln -s /usr/local/spark-$SPARK_VER-bin-hadoop2.7 /usr/local/spark

 ENV SPARK_HOME /usr/local/spark

 ENV PATH $PATH:$SPARK_HOME/bin

-# Default Conf Files
-ADD core-site.xml $HADOOP_HOME/etc/hadoop/core-site.xml
-ADD hdfs-site.xml $HADOOP_HOME/etc/hadoop/hdfs-site.xml
-ADD mapred-site.xml $HADOOP_HOME/etc/hadoop/mapred-site.xml
-ADD yarn-site.xml $HADOOP_HOME/etc/hadoop/yarn-site.xml
-ADD slaves $HADOOP_HOME/etc/hadoop/slaves
-
-ADD bootstrap.sh /etc/bootstrap-spark.sh
+ADD bootstrap.sh /etc/bootstrap.sh

 WORKDIR /usr/local/spark

-CMD ["/etc/bootstrap-spark.sh", "-d"]
+ENTRYPOINT ["/etc/bootstrap.sh"]
--- a/spark/2.2.1/README.md
+++ b/spark/2.2.1/README.md
@@ -1,93 +1,64 @@
-# Spark on yarn
+# Deploy a Spark cluster in standalone mode

-## Create a spark cluster in swarm mode
-
-`--hostname` needs 1.13 or higher
+## Master

 ```bash
 docker service create \
 	--name spark-master \
 	--hostname spark-master \
--detach true \
 	--network swarm-net \
 	--replicas 1 \
+	--detach true \
 	--endpoint-mode dnsrr \
-newnius/spark:2.2.1
+	newnius/spark:2.2.1 master
 ```

+## Slaves
+
 ```bash
 docker service create \
--name spark-slave1 \
--hostname spark-slave1 \
--detach true \
+	--name spark-slave \
 	--network swarm-net \
--replicas 1 \
--endpoint-mode dnsrr \
-newnius/spark:2.2.1
-```
-
-```bash
-docker service create \
--name spark-slave2 \
--hostname spark-slave2 \
+	--replicas 5 \
 	--detach true \
--network swarm-net \
--replicas 1 \
 	--endpoint-mode dnsrr \
-newnius/spark:2.2.1
+	newnius/spark:2.2.1 slave spark://spark-master:7077
 ```

-## Init && Test
+## Validate installation

-In the first deploy, format hdfs
+#### spark-submit PI

-### Stop HDFS (in master)
-```bash
-sbin/stop-dfs.sh
-```
-
-### Format HDFS (in master)
-```
-bin/hadoop namenode -format
-```
-
-### Start HDFS (in master)
-```bash
-sbin/start-dfs.sh
-```
-
-### Run Hello World
 ```bash
 spark-submit \
-	--master yarn \
+	--master spark://spark-master:7077 \
 	--deploy-mode cluster \
 	--class org.apache.spark.examples.JavaSparkPi \
 	./examples/jars/spark-examples_2.11-2.2.1.jar 100
 ```

-### UI
+#### spark-shell HDFS wordcount

-YARN: spark-master:8088
+Run `spark-shell --master spark://spark-master:7077` to open the Spark shell.

-HDFS: spark-master:50070
+```scala
+val lines = sc.textFile("hdfs://hadoop-master:8020/user/root/input")

-_Proxy needed, e.g. [newnius/docker-proxy](https://hub.docker.com/r/newnius/docker-proxy/)_
+val words = lines.flatMap(_.split("\\s+"))

-## customized config
+val wc = words.map(word => (word, 1)).reduceByKey(_ + _)

-```bash
-docker service create \
--name spark-master \
--hostname spark-master \
--detach=true \
--network swarm-net \
--replicas 1 \
--mount type=bind,source=/mnt/data/spark/hdfs/master,target=/tmp/hadoop-root \
--mount type=bind,source=/mnt/data/spark/logs/master,target=/usr/local/hadoop/logs \
--mount type=bind,source=/mnt/data/spark/config/hadoop,target=/mnt/config/hadoop \
--mount type=bind,source=/mnt/data/spark/config/spark,target=/mnt/config/spark \
--endpoint-mode dnsrr \
-newnius/spark:2.2.1
+wc.collect()
+
+val cnt = words.map(word => 1).reduce(_ + _)
 ```

-You dont't need to put all files in dir, only add files to be replaced.
+## Browse the web UI
+
+You can expose the ports in the script, but I'd rather not, since the slaves should occupy the same ports.
+
+To access the web UI, deploy another (socks5) proxy to route the traffic.
+
+If you don't have one, try [newnius/docker-proxy](https://hub.docker.com/r/newnius/docker-proxy/); it is rather easy to use.
+
+Visit [spark-master:8080](http://spark-master:8080) to view the cluster.
--- a/spark/2.2.1/bootstrap.sh
+++ b/spark/2.2.1/bootstrap.sh
@@ -1,6 +1,11 @@
 #! /bin/bash
-# replace config

-cp /mnt/config/spark/* $SPARK_HOME/conf
+if [[ $1 == "master" ]]; then
+	/usr/local/spark/sbin/start-master.sh
+fi

-bash -c "/etc/bootstrap.sh -d"
+if [[ $1 == "slave" ]]; then
+	/usr/local/spark/sbin/start-slave.sh $2
+fi
+
+while true; do sleep 1000; done
--- a/spark/test/Dockerfile
+++ b/spark/test/Dockerfile
@@ -1,30 +0,0 @@
-FROM alpine:3.8
-
-MAINTAINER Newnius <newnius.cn@gmail.com>
-
-USER root
-
-# Prerequisites
-RUN apk add --no-cache openssh openssl openjdk8-jre rsync bash procps coreutils
-
-ENV JAVA_HOME /usr/lib/jvm/java-1.8-openjdk
-
-ENV PATH $PATH:$JAVA_HOME/bin
-
-ENV SPARK_VER 2.2.1
-
-RUN wget https://archive.apache.org/dist/spark/spark-$SPARK_VER/spark-$SPARK_VER-bin-hadoop2.7.tgz && \
-	tar -xvf spark-$SPARK_VER-bin-hadoop2.7.tgz -C /usr/local && \
-	rm spark-$SPARK_VER-bin-hadoop2.7.tgz
-
-RUN ln -s /usr/local/spark-$SPARK_VER-bin-hadoop2.7 /usr/local/spark
-
-ENV SPARK_HOME /usr/local/spark
-
-ENV PATH $PATH:$SPARK_HOME/bin
-
-ADD bootstrap.sh /etc/bootstrap.sh
-
-WORKDIR /usr/local/spark
-
-ENTRYPOINT ["/etc/bootstrap.sh"]
--- a/spark/test/bootstrap.sh
+++ b/spark/test/bootstrap.sh
@@ -1,11 +0,0 @@
-#! /bin/bash
-
-if [[ $1 == "master" ]]; then
-	/usr/local/spark/sbin/start_master.sh
-fi
-
-if [[ $1 == "slave" ]]; then
-	/usr/local/spark/sbin/start_slave.sh $2
-fi
-
-while true; do sleep 1000; done