update spark

Newnius 2018-08-08 12:16:24 +08:00
parent c7b252fe79
commit 702f667281
16 changed files with 52 additions and 335 deletions

Dockerfile

@@ -1,26 +1,34 @@
-FROM newnius/hadoop:2.8.1
+FROM alpine:3.8
 MAINTAINER Newnius <newnius.cn@gmail.com>
-RUN wget http://mirrors.ocf.berkeley.edu/apache/spark/spark-2.2.1/spark-2.2.1-bin-hadoop2.7.tgz && \
-    tar -xvf spark-2.2.1-bin-hadoop2.7.tgz -C /usr/local && \
-    rm spark-2.2.1-bin-hadoop2.7.tgz
-RUN ln -s /usr/local/spark-2.2.1-bin-hadoop2.7 /usr/local/spark
+USER root
+# Prerequisites
+RUN apk add --no-cache openssh openssl openjdk8-jre rsync bash procps coreutils
+ENV JAVA_HOME /usr/lib/jvm/java-1.8-openjdk
+ENV PATH $PATH:$JAVA_HOME/bin
+ENV SPARK_VER 2.2.1
+RUN wget https://archive.apache.org/dist/spark/spark-$SPARK_VER/spark-$SPARK_VER-bin-hadoop2.7.tgz && \
+    tar -xvf spark-$SPARK_VER-bin-hadoop2.7.tgz -C /usr/local && \
+    rm spark-$SPARK_VER-bin-hadoop2.7.tgz
+RUN ln -s /usr/local/spark-$SPARK_VER-bin-hadoop2.7 /usr/local/spark
 ENV SPARK_HOME /usr/local/spark
+ADD config/* /usr/local/hadoop/etc/hadoop/
+ENV HADOOP_CONF_DIR /usr/local/hadoop/etc/hadoop
 ENV PATH $PATH:$SPARK_HOME/bin
-# Default Conf Files
-ADD core-site.xml $HADOOP_HOME/etc/hadoop/core-site.xml
-ADD hdfs-site.xml $HADOOP_HOME/etc/hadoop/hdfs-site.xml
-ADD mapred-site.xml $HADOOP_HOME/etc/hadoop/mapred-site.xml
-ADD yarn-site.xml $HADOOP_HOME/etc/hadoop/yarn-site.xml
-ADD slaves $HADOOP_HOME/etc/hadoop/slaves
-ADD bootstrap.sh /etc/bootstrap-spark.sh
+ADD bootstrap.sh /etc/bootstrap.sh
 WORKDIR /usr/local/spark
-CMD ["/etc/bootstrap-spark.sh", "-d"]
+CMD ["/etc/bootstrap.sh"]

README.md

@@ -1,93 +1,45 @@
-# Spark on yarn
-## Create a spark cluster in swarm mode
-`--hostname` needs Docker 1.13 or higher
-```bash
-docker service create \
---name spark-master \
---hostname spark-master \
---detach true \
---network swarm-net \
---replicas 1 \
---endpoint-mode dnsrr \
-newnius/spark:2.2.1
-```
-```bash
-docker service create \
---name spark-slave1 \
---hostname spark-slave1 \
---detach true \
---network swarm-net \
---replicas 1 \
---endpoint-mode dnsrr \
-newnius/spark:2.2.1
-```
-```bash
-docker service create \
---name spark-slave2 \
---hostname spark-slave2 \
---detach true \
---network swarm-net \
---replicas 1 \
---endpoint-mode dnsrr \
-newnius/spark:2.2.1
-```
-## Init && Test
-In the first deploy, format hdfs
-### Stop HDFS (in master)
-```bash
-sbin/stop-dfs.sh
-```
-### Format HDFS (in master)
-```
-bin/hadoop namenode -format
-```
-### Start HDFS (in master)
-```bash
-sbin/start-dfs.sh
-```
-### Run Hello World
-```bash
-spark-submit \
---master yarn \
---deploy-mode cluster \
---class org.apache.spark.examples.JavaSparkPi \
-./examples/jars/spark-examples_2.11-2.2.1.jar 100
-```
-### UI
-YARN: spark-master:8088
-HDFS: spark-master:50070
-_Proxy needed, e.g. [newnius/docker-proxy](https://hub.docker.com/r/newnius/docker-proxy/)_
-## customized config
-```bash
-docker service create \
---name spark-master \
---hostname spark-master \
---detach=true \
---network swarm-net \
---replicas 1 \
---mount type=bind,source=/mnt/data/spark/hdfs/master,target=/tmp/hadoop-root \
---mount type=bind,source=/mnt/data/spark/logs/master,target=/usr/local/hadoop/logs \
---mount type=bind,source=/mnt/data/spark/config/hadoop,target=/mnt/config/hadoop \
---mount type=bind,source=/mnt/data/spark/config/spark,target=/mnt/config/spark \
---endpoint-mode dnsrr \
-newnius/spark:2.2.1
-```
-You don't need to put all files in the dir, only add files to be replaced.
+# Deploy Spark On Yarn
+## Client
+```bash
+docker service create \
+--name spark-client \
+--hostname spark-client \
+--network swarm-net \
+--replicas 1 \
+--detach true \
+newnius/spark:2.2.1-yarn
+```
+## Validate installation
+#### spark-submit PI
+```bash
+spark-submit \
+--master yarn \
+--deploy-mode cluster \
+--class org.apache.spark.examples.JavaSparkPi \
+./examples/jars/spark-examples*.jar 100
+```
+#### spark-shell HDFS wordcount
+Run `spark-shell --master yarn` to enter the shell.
+```shell
+val lines = sc.textFile("hdfs://hadoop-master:8020/user/root/input")
+val words = lines.flatMap(_.split("\\s+"))
+val wc = words.map(word => (word, 1)).reduceByKey(_ + _)
+wc.collect()
+val cnt = words.map(word => 1).reduce(_ + _)
+```
+## Browse the web UI
+In Spark on YARN mode, Spark jobs appear in the YARN web UI.
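The new wordcount example assumes data already exists at `hdfs://hadoop-master:8020/user/root/input`; if it does not, one could seed it from the same `spark-shell` session (a sketch, with hypothetical sample text):

```shell
// Hypothetical sample input; the target path matches the wordcount snippet
sc.parallelize(Seq("hello spark", "hello yarn")).saveAsTextFile("hdfs://hadoop-master:8020/user/root/input")
```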

bootstrap.sh

@@ -1,6 +1,10 @@
 #! /bin/bash
-# replace config
-cp /mnt/config/spark/* $SPARK_HOME/conf
-bash -c "/etc/bootstrap.sh -d"
+## replace config
+: ${EXTRA_CONF_DIR:=/config/hadoop}
+if [ -d "$EXTRA_CONF_DIR" ]; then
+  cp $EXTRA_CONF_DIR/* /usr/local/hadoop/etc/hadoop/
+fi
+while true; do sleep 1000; done
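With the new `EXTRA_CONF_DIR` hook, site-specific Hadoop config files can be bind-mounted rather than baked into the image. A hedged sketch of such a deployment (the host path is illustrative; `/config/hadoop` is the script's default target):

```bash
# Files mounted at /config/hadoop are copied over the Hadoop conf at startup
docker service create \
  --name spark-client \
  --hostname spark-client \
  --network swarm-net \
  --replicas 1 \
  --detach true \
  --mount type=bind,source=/mnt/config/hadoop,target=/config/hadoop \
  newnius/spark:2.2.1-yarn
```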

core-site.xml

@@ -1,28 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!--
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. See accompanying LICENSE file.
-->
<!-- Put site-specific property overrides in this file. -->
<configuration>
<property>
<name>fs.defaultFS</name>
<value>hdfs://spark-master:8020</value>
</property>
<property>
<name>fs.default.name</name>
<value>hdfs://spark-master:8020</value>
</property>
</configuration>

hdfs-site.xml

@@ -1,40 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!--
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. See accompanying LICENSE file.
-->
<!-- Put site-specific property overrides in this file. -->
<configuration>
<property>
<name>dfs.permissions</name>
<value>false</value>
</property>
<property>
<name>dfs.namenode.secondary.http-address</name>
<value>spark-slave1:50090</value>
</property>
<property>
<name>dfs.namenode.http-address</name>
<value>spark-master:50070</value>
</property>
<property>
<name>dfs.datanode.max.transfer.threads</name>
<value>8192</value>
</property>
<property>
<name>dfs.replication</name>
<value>2</value>
</property>
</configuration>

mapred-site.xml

@@ -1,32 +0,0 @@
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!--
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. See accompanying LICENSE file.
-->
<!-- Put site-specific property overrides in this file. -->
<configuration>
<property>
<name>mapreduce.framework.name</name>
<value>yarn</value>
</property>
<property>
<name>mapreduce.jobhistory.address</name>
<value>spark-master:10020</value>
</property>
<property>
<name>mapreduce.jobhistory.webapp.address</name>
<value>spark-master:19888</value>
</property>
</configuration>

slaves

@@ -1,3 +0,0 @@
spark-slave1
spark-slave2
spark-slave3

yarn-site.xml

@@ -1,49 +0,0 @@
<?xml version="1.0"?>
<!--
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. See accompanying LICENSE file.
-->
<!-- Site specific YARN configuration properties -->
<configuration>
<property>
<name>yarn.application.classpath</name>
<value>/usr/local/hadoop/etc/hadoop, /usr/local/hadoop/share/hadoop/common/*, /usr/local/hadoop/share/hadoop/common/lib/*, /usr/local/hadoop/share/hadoop/hdfs/*, /usr/local/hadoop/share/hadoop/hdfs/lib/*, /usr/local/hadoop/share/hadoop/mapreduce/*, /usr/local/hadoop/share/hadoop/mapreduce/lib/*, /usr/local/hadoop/share/hadoop/yarn/*, /usr/local/hadoop/share/hadoop/yarn/lib/*</value>
</property>
<property>
<name>yarn.resourcemanager.hostname</name>
<value>spark-master</value>
</property>
<property>
<name>yarn.nodemanager.aux-services</name>
<value>mapreduce_shuffle</value>
</property>
<property>
<name>yarn.log-aggregation-enable</name>
<value>true</value>
</property>
<property>
<name>yarn.log-aggregation.retain-seconds</name>
<value>604800</value>
</property>
<property>
<name>yarn.nodemanager.resource.memory-mb</name>
<value>2048</value>
</property>
<property>
<name>yarn.nodemanager.resource.cpu-vcores</name>
<value>2</value>
</property>
<property>
<name>yarn.scheduler.minimum-allocation-mb</name>
<value>1024</value>
</property>
</configuration>
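These NodeManager limits bounded what jobs could request: 2048 MB and 2 vcores per node, with 1024 MB minimum allocations. A hedged sketch of a submit sized to fit such limits (standard `spark-submit` flags; the values are illustrative):

```bash
# 1 GB executors plus default overhead stay within the 2048 MB NodeManager limit
spark-submit \
  --master yarn \
  --deploy-mode cluster \
  --executor-memory 1g \
  --executor-cores 1 \
  --class org.apache.spark.examples.JavaSparkPi \
  ./examples/jars/spark-examples*.jar 100
```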

Dockerfile

@@ -1,34 +0,0 @@
FROM alpine:3.8
MAINTAINER Newnius <newnius.cn@gmail.com>
USER root
# Prerequisites
RUN apk add --no-cache openssh openssl openjdk8-jre rsync bash procps coreutils
ENV JAVA_HOME /usr/lib/jvm/java-1.8-openjdk
ENV PATH $PATH:$JAVA_HOME/bin
ENV SPARK_VER 2.2.1
RUN wget https://archive.apache.org/dist/spark/spark-$SPARK_VER/spark-$SPARK_VER-bin-hadoop2.7.tgz && \
tar -xvf spark-$SPARK_VER-bin-hadoop2.7.tgz -C /usr/local && \
rm spark-$SPARK_VER-bin-hadoop2.7.tgz
RUN ln -s /usr/local/spark-$SPARK_VER-bin-hadoop2.7 /usr/local/spark
ENV SPARK_HOME /usr/local/spark
ADD config/* /usr/local/hadoop/etc/hadoop
ENV HADOOP_CONF_DIR /usr/local/hadoop/etc/hadoop
ENV PATH $PATH:$SPARK_HOME/bin
ADD bootstrap.sh /etc/bootstrap.sh
WORKDIR /usr/local/spark
CMD ["/etc/bootstrap.sh", "-d"]

README.md

@@ -1,45 +0,0 @@
# Deploy Spark On Yarn
## Client
```bash
docker service create \
--name spark-client \
--hostname spark-client \
--network swarm-net \
--replicas 1 \
--detach true \
newnius/spark:2.2.1-yarn
```
## Validate installation
#### spark-submit PI
```bash
spark-submit \
--master yarn \
--deploy-mode cluster \
--class org.apache.spark.examples.JavaSparkPi \
./examples/jars/spark-examples*.jar 100
```
#### spark-shell HDFS wordcount
Run `spark-shell --master yarn` to enter the shell.
```shell
val lines = sc.textFile("hdfs://hadoop-master:8020/user/root/input")
val words = lines.flatMap(_.split("\\s+"))
val wc = words.map(word => (word, 1)).reduceByKey(_ + _)
wc.collect()
val cnt = words.map(word => 1).reduce(_ + _)
```
## Browse the web UI
In Spark on YARN mode, Spark jobs appear in the YARN web UI.

bootstrap.sh

@@ -1,16 +0,0 @@
#! /bin/bash
## replace config
: ${EXTRA_CONF_DIR:=/config/hadoop}
if [ -d "$EXTRA_CONF_DIR" ]; then
cp $EXTRA_CONF_DIR/* /usr/local/hadoop/etc/hadoop/
fi
if [[ $1 == "-d" ]]; then
while true; do sleep 1000; done
fi
if [[ $1 == "-bash" ]]; then
/bin/bash
fi