update spark, add more version

Newnius 2018-08-07 20:11:26 +08:00
parent 94228e4944
commit 5d78cbb18c
13 changed files with 73 additions and 361 deletions

View File: Dockerfile

@@ -1,13 +1,30 @@
-FROM sequenceiq/spark:1.6.0
+FROM alpine:3.8
-MAINTAINER Newnius <docker@newnius.com>
+MAINTAINER Newnius <newnius.cn@gmail.com>
USER root
+# Prerequisites
+RUN apk add --no-cache openssh openssl openjdk8-jre rsync bash procps coreutils
+ENV JAVA_HOME /usr/lib/jvm/java-1.8-openjdk
+ENV PATH $PATH:$JAVA_HOME/bin
+ENV SPARK_VER 1.6.0
+RUN wget https://archive.apache.org/dist/spark/spark-$SPARK_VER/spark-$SPARK_VER-bin-hadoop2.6.tgz && \
+tar -xvf spark-$SPARK_VER-bin-hadoop2.6.tgz -C /usr/local && \
+rm spark-$SPARK_VER-bin-hadoop2.6.tgz
+RUN ln -s /usr/local/spark-$SPARK_VER-bin-hadoop2.6 /usr/local/spark
+ENV SPARK_HOME /usr/local/spark
+ENV PATH $PATH:$SPARK_HOME/bin
ADD bootstrap.sh /etc/bootstrap.sh
-COPY config/hadoop /mnt/config/hadoop
COPY config/spark /mnt/config/spark
-COPY config/spark-yarn-remote-client /mnt/config/spark-yarn-remote-client
-WORKDIR /usr/local/hadoop
+WORKDIR /usr/local/spark
ENTRYPOINT ["/etc/bootstrap.sh"]
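The build context is the directory holding the Dockerfile, `bootstrap.sh` and `config/`. A minimal sketch of rebuilding the image locally; the tag mirrors `SPARK_VER`, and pushing assumes rights on the Hub repository:
```bash
# Build from the directory containing the Dockerfile, bootstrap.sh and config/
docker build -t newnius/spark:1.6.0 .
# Optional: publish the image (assumes push rights on the Hub repository)
docker push newnius/spark:1.6.0
```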

View File: README.md

@@ -1,82 +1,64 @@
-# based on sequenceiq/spark
+# Deploy Spark Cluster of standalone mode
## Create a spark cluster in swarm mode
+`--hostname` requires Docker 1.13 or higher.
## Master
```bash
docker service create \
---name spark-master \
---network swarm-net \
---replicas 1 \
---endpoint-mode dnsrr \
-newnius/spark
+--name spark-master \
+--hostname spark-master \
+--network swarm-net \
+--replicas 1 \
+--detach true \
+--endpoint-mode dnsrr \
+newnius/spark:1.6.0 master
```
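A quick sanity check that the master service came up, using the plain Docker CLI (`docker service logs` needs Docker 17.06 or newer):
```bash
docker service ps spark-master      # task should reach the Running state
docker service logs spark-master    # look for "I have been elected leader"
```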
## Slaves
```bash
docker service create \
---name spark-slave1 \
---network swarm-net \
---replicas 1 \
---endpoint-mode dnsrr \
-newnius/spark
+--name spark-slave \
+--network swarm-net \
+--replicas 5 \
+--detach true \
+--endpoint-mode dnsrr \
+newnius/spark:1.6.0 slave spark://spark-master:7077
```
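Since the workers are stateless replicas of one service, the cluster can be rescaled at any time:
```bash
# Grow from 5 to 10 workers; they register with the master automatically
docker service scale spark-slave=10
```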
+## Validate installation
+#### spark-submit PI
```bash
-docker service create \
---name spark-slave2 \
---network swarm-net \
---replicas 1 \
---endpoint-mode dnsrr \
-newnius/spark
+spark-submit \
+--master spark://spark-master:7077 \
+--deploy-mode cluster \
+--class org.apache.spark.examples.JavaSparkPi \
+./examples/jars/spark-examples_2.11-1.6.0.jar 100
```
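In cluster deploy-mode the driver runs on a worker and `spark-submit` returns after printing a driver ID, which can then be polled or killed through the master. The ID below is a made-up example:
```bash
# Query the state of a submitted driver (the ID is hypothetical)
spark-submit --master spark://spark-master:7077 --status driver-20180807201126-0000
# Kill it if needed
spark-submit --master spark://spark-master:7077 --kill driver-20180807201126-0000
```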
-## Init && Test
-In the first deploy, format dfs first
-### stop cluster (in master)
-`sbin/stop-yarn.sh`
-`sbin/stop-dfs.sh`
-`../spark/sbin/stop-all.sh`
-### remove previous data (in all nodes)
-clear all data in /tmp in all nodes
-### format hdfs (in master)
-```
-bin/hadoop namenode -format
-```
-### start cluster (in master)
-`sbin/start-dfs.sh`
-`sbin/start-yarn.sh`
-`../spark/sbin/start-all.sh`
+#### spark-shell HDFS wordcount
+Enter `spark-shell --master spark://spark-master:7077` to open the shell.
+```scala
+val lines = sc.textFile("hdfs://hadoop-master:8020/user/root/input")
+val words = lines.flatMap(_.split("\\s+"))
+val wc = words.map(word => (word, 1)).reduceByKey(_ + _)
+wc.collect()
+val cnt = words.map(word => 1).reduce(_ + _)
+```
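The word count reads from a separate HDFS cluster reachable as `hadoop-master`. A minimal sketch of preparing the input, assuming such a cluster exists on the same network and a Hadoop client is at hand:
```bash
# Assumes an external HDFS at hadoop-master:8020; the sample file is arbitrary
hadoop fs -mkdir -p hdfs://hadoop-master:8020/user/root/input
hadoop fs -put ./README.md hdfs://hadoop-master:8020/user/root/input/
```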
+## Browse the web UI
-### monitor cluster in browser
-YARN: spark-master:8088
-HDFS: spark-master:50070
-SPARK: spark-master:8080
+You can expose the ports in the script, but I'd rather not, since the slaves should occupy the same ports.
+To access the web UI, deploy another (socks5) proxy to route the traffic.
+If you don't have one, try [newnius/docker-proxy](https://hub.docker.com/r/newnius/docker-proxy/); it is rather easy to use.
+_Proxy needed, e.g. newnius/docker-proxy_
-## customized config
-```bash
-docker service create \
---name spark-master \
---network swarm-net \
---replicas 1 \
---mount type=bind,source=/mnt/data/spark/hdfs/master,target=/tmp/hadoop-root \
---mount type=bind,source=/mnt/data/spark/logs/master,target=/usr/local/hadoop/logs \
---mount type=bind,source=/mnt/data/spark/config/hadoop,target=/mnt/config/hadoop \
---mount type=bind,source=/mnt/data/spark/config/spark,target=/mnt/config/spark \
---mount type=bind,source=/mnt/data/spark/config/spark-yarn-remote-client,target=/mnt/config/spark-yarn-remote-client \
---endpoint-mode dnsrr \
-newnius/spark
-```
-You don't need to put all the files in the directory; only add the files that need to be modified.
+Visit [spark-master:8080](http://spark-master:8080) to view the cluster.
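The image still copies `/mnt/config/spark` into `$SPARK_HOME/conf` at startup (see `bootstrap.sh` below), so a customized config can be bind-mounted much as the removed section did. A sketch, with the host path being an assumption; only the files you actually modified need to be present in the mounted directory:
```bash
docker service create \
--name spark-master \
--hostname spark-master \
--network swarm-net \
--replicas 1 \
--detach true \
--endpoint-mode dnsrr \
--mount type=bind,source=/mnt/data/spark/config,target=/mnt/config/spark \
newnius/spark:1.6.0 master
```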

View File: bootstrap.sh

@@ -1,35 +1,11 @@
#! /bin/bash
-: ${HADOOP_PREFIX:=/usr/local/hadoop}
-$HADOOP_PREFIX/etc/hadoop/hadoop-env.sh
-rm /tmp/*.pid
-# installing libraries if any - (resource urls added comma separated to the ACP system variable)
-cd $HADOOP_PREFIX/share/hadoop/common ; for cp in ${ACP//,/ }; do echo == $cp; curl -LO $cp ; done; cd -
-cp -a /mnt/config/hadoop/. /usr/local/hadoop/etc/hadoop/
cp -a /mnt/config/spark/. /usr/local/spark/conf/
-cp -a /mnt/config/spark-yarn-remote-client/. /usr/local/spark/yarn-remote-client/
-service sshd start
-## stop all in case master starts far behind
-$HADOOP_PREFIX/sbin/stop-yarn.sh
-$HADOOP_PREFIX/sbin/stop-dfs.sh
-$HADOOP_PREFIX/sbin/start-dfs.sh
-$HADOOP_PREFIX/sbin/start-yarn.sh
-$HADOOP_PREFIX/sbin/mr-jobhistory-daemon.sh start historyserver
-$SPARK_HOME/sbin/start-all.sh
-if [[ $1 == "-bash" ]]; then
-/bin/bash
-else
-while true; do sleep 1000; done
-fi
+if [[ $1 == "master" ]]; then
+/usr/local/spark/sbin/start-master.sh
+fi
+if [[ $1 == "slave" ]]; then
+/usr/local/spark/sbin/start-slave.sh $2
+fi
+while true; do sleep 1000; done
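Because the `ENTRYPOINT` forwards its arguments to `bootstrap.sh`, the same roles can be exercised outside swarm mode with plain `docker run`; a sketch assuming `swarm-net` is an attachable overlay (or any user-defined network):
```bash
docker run -d --net swarm-net --name spark-master newnius/spark:1.6.0 master
docker run -d --net swarm-net --name spark-slave1 newnius/spark:1.6.0 slave spark://spark-master:7077
```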

View File: config/hadoop/core-site.xml (deleted)

@@ -1,29 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!--
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. See accompanying LICENSE file.
-->
<!-- Put site-specific property overrides in this file. -->
<configuration>
<property>
<name>fs.defaultFS</name>
<value>hdfs://spark-master:8020</value>
</property>
<property>
<name>fs.default.name</name>
<value>hdfs://spark-master:8020</value>
</property>
</configuration>

View File: config/hadoop/hdfs-site.xml (deleted)

@@ -1,46 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!--
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. See accompanying LICENSE file.
-->
<!-- Put site-specific property overrides in this file. -->
<configuration>
<property>
<name>dfs.permissions</name>
<value>false</value>
</property>
<property>
<name>dfs.namenode.secondary.http-address</name>
<value>spark-slave1:50090</value>
</property>
<property>
<name>dfs.namenode.http-address</name>
<value>spark-master:50070</value>
</property>
<property>
<name>dfs.datanode.max.transfer.threads</name>
<value>8192</value>
</property>
<property>
<name>dfs.replication</name>
<value>3</value>
</property>
</configuration>

View File: config/hadoop/mapred-site.xml (deleted)

@@ -1,26 +0,0 @@
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!--
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. See accompanying LICENSE file.
-->
<!-- Put site-specific property overrides in this file. -->
<configuration>
<property>
<name>mapreduce.framework.name</name>
<value>yarn</value>
</property>
</configuration>

View File: config/hadoop/slaves (deleted)

@@ -1,2 +0,0 @@
spark-slave1
spark-slave2

View File: config/hadoop/yarn-site.xml (deleted)

@@ -1,57 +0,0 @@
<?xml version="1.0"?>
<!--
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. See accompanying LICENSE file.
-->
<!-- Site specific YARN configuration properties -->
<configuration>
<property>
<name>yarn.application.classpath</name>
<value>/usr/local/hadoop/etc/hadoop, /usr/local/hadoop/share/hadoop/common/*, /usr/local/hadoop/share/hadoop/common/lib/*, /usr/local/hadoop/share/hadoop/hdfs/*, /usr/local/hadoop/share/hadoop/hdfs/lib/*, /usr/local/hadoop/share/hadoop/mapreduce/*, /usr/local/hadoop/share/hadoop/mapreduce/lib/*, /usr/local/hadoop/share/hadoop/yarn/*, /usr/local/hadoop/share/hadoop/yarn/lib/*</value>
</property>
<property>
<name>yarn.resourcemanager.hostname</name>
<value>spark-master</value>
</property>
<property>
<name>yarn.nodemanager.aux-services</name>
<value>mapreduce_shuffle</value>
</property>
<property>
<name>yarn.log-aggregation-enable</name>
<value>true</value>
</property>
<property>
<name>yarn.log-aggregation.retain-seconds</name>
<value>604800</value>
</property>
<property>
<name>yarn.nodemanager.resource.memory-mb</name>
<value>2048</value>
</property>
<property>
<name>yarn.nodemanager.resource.cpu-vcores</name>
<value>2</value>
</property>
<property>
<name>yarn.scheduler.minimum-allocation-mb</name>
<value>1024</value>
</property>
</configuration>

View File: config/spark-yarn-remote-client/core-site.xml (deleted)

@@ -1,12 +0,0 @@
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
<property>
<name>fs.default.name</name>
<value>hdfs://spark-master:8020</value>
</property>
<property>
<name>dfs.client.use.legacy.blockreader</name>
<value>true</value>
</property>
</configuration>

View File: config/spark-yarn-remote-client/yarn-site.xml (deleted)

@@ -1,26 +0,0 @@
<configuration>
<property>
<name>yarn.resourcemanager.scheduler.address</name>
<value>spark-master:8030</value>
</property>
<property>
<name>yarn.resourcemanager.address</name>
<value>spark-master:8032</value>
</property>
<property>
<name>yarn.resourcemanager.webapp.address</name>
<value>spark-master:8088</value>
</property>
<property>
<name>yarn.resourcemanager.resource-tracker.address</name>
<value>spark-master:8031</value>
</property>
<property>
<name>yarn.resourcemanager.admin.address</name>
<value>spark-master:8033</value>
</property>
<property>
<name>yarn.application.classpath</name>
<value>/usr/local/hadoop/etc/hadoop, /usr/local/hadoop/share/hadoop/common/*, /usr/local/hadoop/share/hadoop/common/lib/*, /usr/local/hadoop/share/hadoop/hdfs/*, /usr/local/hadoop/share/hadoop/hdfs/lib/*, /usr/local/hadoop/share/hadoop/mapreduce/*, /usr/local/hadoop/share/hadoop/mapreduce/lib/*, /usr/local/hadoop/share/hadoop/yarn/*, /usr/local/hadoop/share/hadoop/yarn/lib/*, /usr/local/hadoop/share/spark/*</value>
</property>
</configuration>

View File: config/spark/slaves (deleted)

@@ -1,3 +0,0 @@
# A Spark Worker will be started on each of the machines listed below.
spark-slave1
spark-slave2

View File: config/spark/spark-defaults.conf (deleted)

@@ -1,10 +0,0 @@
# Default system properties included when running spark-submit.
# This is useful for setting default environmental settings.
# Example:
spark.master spark://spark-master:7077
spark.eventLog.enabled true
spark.eventLog.dir hdfs://spark-master:/var/log/spark
spark.serializer org.apache.spark.serializer.KryoSerializer
# spark.driver.memory 5g
# spark.executor.extraJavaOptions -XX:+PrintGCDetails -Dkey=value -Dnumbers="one two three"
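With `spark.eventLog.enabled`, the log directory has to exist before the first application starts. A sketch of preparing it and serving the logs, assuming `fs.defaultFS` points at `hdfs://spark-master:8020` as in the core-site.xml above:
```bash
# Create the event-log directory on HDFS (path from spark.eventLog.dir above)
hadoop fs -mkdir -p /var/log/spark
# The history server configured in spark-env.sh (next file) serves the same
# directory on port 18080
$SPARK_HOME/sbin/start-history-server.sh
```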

View File: config/spark/spark-env.sh (deleted)

@@ -1,52 +0,0 @@
#!/usr/bin/env bash
# This file is sourced when running various Spark programs.
# Copy it as spark-env.sh and edit that to configure Spark for your site.
# Options read when launching programs locally with
# ./bin/run-example or ./bin/spark-submit
# - HADOOP_CONF_DIR, to point Spark towards Hadoop configuration files
# - SPARK_LOCAL_IP, to set the IP address Spark binds to on this node
# - SPARK_PUBLIC_DNS, to set the public dns name of the driver program
# - SPARK_CLASSPATH, default classpath entries to append
# Options read by executors and drivers running inside the cluster
# - SPARK_LOCAL_IP, to set the IP address Spark binds to on this node
# - SPARK_PUBLIC_DNS, to set the public DNS name of the driver program
# - SPARK_CLASSPATH, default classpath entries to append
# - SPARK_LOCAL_DIRS, storage directories to use on this node for shuffle and RDD data
# - MESOS_NATIVE_LIBRARY, to point to your libmesos.so if you use Mesos
# Options read in YARN client mode
# - HADOOP_CONF_DIR, to point Spark towards Hadoop configuration files
# - SPARK_EXECUTOR_INSTANCES, Number of workers to start (Default: 2)
# - SPARK_EXECUTOR_CORES, Number of cores for the workers (Default: 1).
# - SPARK_EXECUTOR_MEMORY, Memory per Worker (e.g. 1000M, 2G) (Default: 1G)
# - SPARK_DRIVER_MEMORY, Memory for Master (e.g. 1000M, 2G) (Default: 512 Mb)
# - SPARK_YARN_APP_NAME, The name of your application (Default: Spark)
# - SPARK_YARN_QUEUE, The hadoop queue to use for allocation requests (Default: default)
# - SPARK_YARN_DIST_FILES, Comma separated list of files to be distributed with the job.
# - SPARK_YARN_DIST_ARCHIVES, Comma separated list of archives to be distributed with the job.
# Options for the daemons used in the standalone deploy mode:
# - SPARK_MASTER_IP, to bind the master to a different IP address or hostname
# - SPARK_MASTER_PORT / SPARK_MASTER_WEBUI_PORT, to use non-default ports for the master
# - SPARK_MASTER_OPTS, to set config properties only for the master (e.g. "-Dx=y")
# - SPARK_WORKER_CORES, to set the number of cores to use on this machine
# - SPARK_WORKER_MEMORY, to set how much total memory workers have to give executors (e.g. 1000m, 2g)
# - SPARK_WORKER_PORT / SPARK_WORKER_WEBUI_PORT, to use non-default ports for the worker
# - SPARK_WORKER_INSTANCES, to set the number of worker processes per node
# - SPARK_WORKER_DIR, to set the working directory of worker processes
# - SPARK_WORKER_OPTS, to set config properties only for the worker (e.g. "-Dx=y")
# - SPARK_HISTORY_OPTS, to set config properties only for the history server (e.g. "-Dx=y")
# - SPARK_DAEMON_JAVA_OPTS, to set config properties for all daemons (e.g. "-Dx=y")
# - SPARK_PUBLIC_DNS, to set the public dns name of the master or workers
export JAVA_HOME=/usr/java/default
export SPARK_MASTER_IP=spark-master
export SPARK_WORKER_CORES=1
export SPARK_WORKER_INSTANCES=1
export SPARK_MASTER_PORT=7077
export SPARK_WORKER_MEMORY=2g
export MASTER=spark://${SPARK_MASTER_IP}:${SPARK_MASTER_PORT}
export SPARK_HISTORY_OPTS="-Dspark.history.ui.port=18080 -Dspark.history.retainedApplications=3 -Dspark.history.fs.logDirectory=hdfs://spark-master:/var/log/spark"