update spark, add more version

Newnius 2018-08-07 20:11:26 +08:00
parent 94228e4944
commit 5d78cbb18c
13 changed files with 73 additions and 361 deletions

View File: Dockerfile

@@ -1,13 +1,30 @@
-FROM sequenceiq/spark:1.6.0
+FROM alpine:3.8
-MAINTAINER Newnius <docker@newnius.com>
+MAINTAINER Newnius <newnius.cn@gmail.com>
USER root
+# Prerequisites
+RUN apk add --no-cache openssh openssl openjdk8-jre rsync bash procps coreutils
+ENV JAVA_HOME /usr/lib/jvm/java-1.8-openjdk
+ENV PATH $PATH:$JAVA_HOME/bin
+ENV SPARK_VER 1.6.0
+RUN wget https://archive.apache.org/dist/spark/spark-$SPARK_VER/spark-$SPARK_VER-bin-hadoop2.6.tgz && \
+tar -xvf spark-$SPARK_VER-bin-hadoop2.6.tgz -C /usr/local && \
+rm spark-$SPARK_VER-bin-hadoop2.6.tgz
+RUN ln -s /usr/local/spark-$SPARK_VER-bin-hadoop2.6 /usr/local/spark
+ENV SPARK_HOME /usr/local/spark
+ENV PATH $PATH:$SPARK_HOME/bin
ADD bootstrap.sh /etc/bootstrap.sh
-COPY config/hadoop /mnt/config/hadoop
COPY config/spark /mnt/config/spark
-COPY config/spark-yarn-remote-client /mnt/config/spark-yarn-remote-client
-WORKDIR /usr/local/hadoop
+WORKDIR /usr/local/spark
ENTRYPOINT ["/etc/bootstrap.sh"]
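The build context is the directory holding the Dockerfile, `bootstrap.sh` and `config/`. A minimal sketch of rebuilding the image locally; the tag mirrors `SPARK_VER`, and pushing assumes rights on the Hub repository:
```bash
# Build from the directory containing the Dockerfile, bootstrap.sh and config/
docker build -t newnius/spark:1.6.0 .
# Optional: publish the image (assumes push rights on the Hub repository)
docker push newnius/spark:1.6.0
```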

View File: README.md

@@ -1,82 +1,64 @@
-# based on sequenceiq/spark
+# Deploy Spark Cluster of standalone mode
## Create a spark cluster in swarm mode
+`--hostname` requires Docker 1.13 or higher.
## Master
```bash
docker service create \
---name spark-master \
---network swarm-net \
---replicas 1 \
---endpoint-mode dnsrr \
-newnius/spark
+--name spark-master \
+--hostname spark-master \
+--network swarm-net \
+--replicas 1 \
+--detach true \
+--endpoint-mode dnsrr \
+newnius/spark:1.6.0 master
```
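A quick sanity check that the master service came up, using the plain Docker CLI (`docker service logs` needs Docker 17.06 or newer):
```bash
docker service ps spark-master      # task should reach the Running state
docker service logs spark-master    # look for "I have been elected leader"
```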
## Slaves
```bash
docker service create \
---name spark-slave1 \
---network swarm-net \
---replicas 1 \
---endpoint-mode dnsrr \
-newnius/spark
+--name spark-slave \
+--network swarm-net \
+--replicas 5 \
+--detach true \
+--endpoint-mode dnsrr \
+newnius/spark:1.6.0 slave spark://spark-master:7077
```
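Since the workers are stateless replicas of one service, the cluster can be rescaled at any time:
```bash
# Grow from 5 to 10 workers; they register with the master automatically
docker service scale spark-slave=10
```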
+## Validate installation
+#### spark-submit PI
```bash
-docker service create \
---name spark-slave2 \
---network swarm-net \
---replicas 1 \
---endpoint-mode dnsrr \
-newnius/spark
+spark-submit \
+--master spark://spark-master:7077 \
+--deploy-mode cluster \
+--class org.apache.spark.examples.JavaSparkPi \
+./examples/jars/spark-examples_2.11-1.6.0.jar 100
```
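In cluster deploy-mode the driver runs on a worker and `spark-submit` returns after printing a driver ID, which can then be polled or killed through the master. The ID below is a made-up example:
```bash
# Query the state of a submitted driver (the ID is hypothetical)
spark-submit --master spark://spark-master:7077 --status driver-20180807201126-0000
# Kill it if needed
spark-submit --master spark://spark-master:7077 --kill driver-20180807201126-0000
```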
-## Init && Test
-In the first deploy, format dfs first
-### stop cluster (in master)
-`sbin/stop-yarn.sh`
-`sbin/stop-dfs.sh`
-`../spark/sbin/stop-all.sh`
-### remove previous data (in all nodes)
-clear all data in /tmp in all nodes
-### format hdfs (in master)
-```
-bin/hadoop namenode -format
-```
-### start cluster (in master)
-`sbin/start-dfs.sh`
-`sbin/start-yarn.sh`
-`../spark/sbin/start-all.sh`
+#### spark-shell HDFS wordcount
+Enter `spark-shell --master spark://spark-master:7077` to open the shell.
+```scala
+val lines = sc.textFile("hdfs://hadoop-master:8020/user/root/input")
+val words = lines.flatMap(_.split("\\s+"))
+val wc = words.map(word => (word, 1)).reduceByKey(_ + _)
+wc.collect()
+val cnt = words.map(word => 1).reduce(_ + _)
+```
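The word count reads from a separate HDFS cluster reachable as `hadoop-master`. A minimal sketch of preparing the input, assuming such a cluster exists on the same network and a Hadoop client is at hand:
```bash
# Assumes an external HDFS at hadoop-master:8020; the sample file is arbitrary
hadoop fs -mkdir -p hdfs://hadoop-master:8020/user/root/input
hadoop fs -put ./README.md hdfs://hadoop-master:8020/user/root/input/
```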
+## Browse the web UI
-### monitor cluster in browser
-YARN: spark-master:8088
-HDFS: spark-master:50070
-SPARK: spark-master:8080
+You can expose the ports in the script, but I'd rather not, since the slaves should occupy the same ports.
+To access the web UI, deploy another (socks5) proxy to route the traffic.
+If you don't have one, try [newnius/docker-proxy](https://hub.docker.com/r/newnius/docker-proxy/); it is rather easy to use.
+_Proxy needed, e.g. newnius/docker-proxy_
-## customized config
-```bash
-docker service create \
---name spark-master \
---network swarm-net \
---replicas 1 \
---mount type=bind,source=/mnt/data/spark/hdfs/master,target=/tmp/hadoop-root \
---mount type=bind,source=/mnt/data/spark/logs/master,target=/usr/local/hadoop/logs \
---mount type=bind,source=/mnt/data/spark/config/hadoop,target=/mnt/config/hadoop \
---mount type=bind,source=/mnt/data/spark/config/spark,target=/mnt/config/spark \
---mount type=bind,source=/mnt/data/spark/config/spark-yarn-remote-client,target=/mnt/config/spark-yarn-remote-client \
---endpoint-mode dnsrr \
-newnius/spark
-```
-You don't need to put all the files in the directory; only add the files that need to be modified.
+Visit [spark-master:8080](http://spark-master:8080) to view the cluster.
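The image still copies `/mnt/config/spark` into `$SPARK_HOME/conf` at startup (see `bootstrap.sh` below), so a customized config can be bind-mounted much as the removed section did. A sketch, with the host path being an assumption; only the files you actually modified need to be present in the mounted directory:
```bash
docker service create \
--name spark-master \
--hostname spark-master \
--network swarm-net \
--replicas 1 \
--detach true \
--endpoint-mode dnsrr \
--mount type=bind,source=/mnt/data/spark/config,target=/mnt/config/spark \
newnius/spark:1.6.0 master
```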

View File: bootstrap.sh

@@ -1,35 +1,11 @@
#! /bin/bash
-: ${HADOOP_PREFIX:=/usr/local/hadoop}
-$HADOOP_PREFIX/etc/hadoop/hadoop-env.sh
-rm /tmp/*.pid
-# installing libraries if any - (resource urls added comma separated to the ACP system variable)
-cd $HADOOP_PREFIX/share/hadoop/common ; for cp in ${ACP//,/ }; do echo == $cp; curl -LO $cp ; done; cd -
-cp -a /mnt/config/hadoop/. /usr/local/hadoop/etc/hadoop/
cp -a /mnt/config/spark/. /usr/local/spark/conf/
-cp -a /mnt/config/spark-yarn-remote-client/. /usr/local/spark/yarn-remote-client/
-service sshd start
-## stop all in case master starts far behind
-$HADOOP_PREFIX/sbin/stop-yarn.sh
-$HADOOP_PREFIX/sbin/stop-dfs.sh
-$HADOOP_PREFIX/sbin/start-dfs.sh
-$HADOOP_PREFIX/sbin/start-yarn.sh
-$HADOOP_PREFIX/sbin/mr-jobhistory-daemon.sh start historyserver
-$SPARK_HOME/sbin/start-all.sh
-if [[ $1 == "-bash" ]]; then
-/bin/bash
-else
-while true; do sleep 1000; done
-fi
+if [[ $1 == "master" ]]; then
+/usr/local/spark/sbin/start-master.sh
+fi
+if [[ $1 == "slave" ]]; then
+/usr/local/spark/sbin/start-slave.sh $2
+fi
+while true; do sleep 1000; done
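Because the `ENTRYPOINT` forwards its arguments to `bootstrap.sh`, the same roles can be exercised outside swarm mode with plain `docker run`; a sketch assuming `swarm-net` is an attachable overlay (or any user-defined network):
```bash
docker run -d --net swarm-net --name spark-master newnius/spark:1.6.0 master
docker run -d --net swarm-net --name spark-slave1 newnius/spark:1.6.0 slave spark://spark-master:7077
```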

View File: config/hadoop/core-site.xml (deleted)

@@ -1,29 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!--
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. See accompanying LICENSE file.
-->
<!-- Put site-specific property overrides in this file. -->
<configuration>
<property>
<name>fs.defaultFS</name>
<value>hdfs://spark-master:8020</value>
</property>
<property>
<name>fs.default.name</name>
<value>hdfs://spark-master:8020</value>
</property>
</configuration>

View File: config/hadoop/hdfs-site.xml (deleted)

@@ -1,46 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!--
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. See accompanying LICENSE file.
-->
<!-- Put site-specific property overrides in this file. -->
<configuration>
<property>
<name>dfs.permissions</name>
<value>false</value>
</property>
<property>
<name>dfs.namenode.secondary.http-address</name>
<value>spark-slave1:50090</value>
</property>
<property>
<name>dfs.namenode.http-address</name>
<value>spark-master:50070</value>
</property>
<property>
<name>dfs.datanode.max.transfer.threads</name>
<value>8192</value>
</property>
<property>
<name>dfs.replication</name>
<value>3</value>
</property>
</configuration>

View File: config/hadoop/mapred-site.xml (deleted)

@@ -1,26 +0,0 @@
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!--
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. See accompanying LICENSE file.
-->
<!-- Put site-specific property overrides in this file. -->
<configuration>
<property>
<name>mapreduce.framework.name</name>
<value>yarn</value>
</property>
</configuration>

View File: config/hadoop/slaves (deleted)

@@ -1,2 +0,0 @@
spark-slave1
spark-slave2

View File: config/hadoop/yarn-site.xml (deleted)

@@ -1,57 +0,0 @@
<?xml version="1.0"?>
<!--
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. See accompanying LICENSE file.
-->
<!-- Site specific YARN configuration properties -->
<configuration>
<property>
<name>yarn.application.classpath</name>
<value>/usr/local/hadoop/etc/hadoop, /usr/local/hadoop/share/hadoop/common/*, /usr/local/hadoop/share/hadoop/common/lib/*, /usr/local/hadoop/share/hadoop/hdfs/*, /usr/local/hadoop/share/hadoop/hdfs/lib/*, /usr/local/hadoop/share/hadoop/mapreduce/*, /usr/local/hadoop/share/hadoop/mapreduce/lib/*, /usr/local/hadoop/share/hadoop/yarn/*, /usr/local/hadoop/share/hadoop/yarn/lib/*</value>
</property>
<property>
<name>yarn.resourcemanager.hostname</name>
<value>spark-master</value>
</property>
<property>
<name>yarn.nodemanager.aux-services</name>
<value>mapreduce_shuffle</value>
</property>
<property>
<name>yarn.log-aggregation-enable</name>
<value>true</value>
</property>
<property>
<name>yarn.log-aggregation.retain-seconds</name>
<value>604800</value>
</property>
<property>
<name>yarn.nodemanager.resource.memory-mb</name>
<value>2048</value>
</property>
<property>
<name>yarn.nodemanager.resource.cpu-vcores</name>
<value>2</value>
</property>
<property>
<name>yarn.scheduler.minimum-allocation-mb</name>
<value>1024</value>
</property>
</configuration>

View File: config/spark-yarn-remote-client/core-site.xml (deleted)

@@ -1,12 +0,0 @@
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
<property>
<name>fs.default.name</name>
<value>hdfs://spark-master:8020</value>
</property>
<property>
<name>dfs.client.use.legacy.blockreader</name>
<value>true</value>
</property>
</configuration>

View File: config/spark-yarn-remote-client/yarn-site.xml (deleted)

@@ -1,26 +0,0 @@
<configuration>
<property>
<name>yarn.resourcemanager.scheduler.address</name>
<value>spark-master:8030</value>
</property>
<property>
<name>yarn.resourcemanager.address</name>
<value>spark-master:8032</value>
</property>
<property>
<name>yarn.resourcemanager.webapp.address</name>
<value>spark-master:8088</value>
</property>
<property>
<name>yarn.resourcemanager.resource-tracker.address</name>
<value>spark-master:8031</value>
</property>
<property>
<name>yarn.resourcemanager.admin.address</name>
<value>spark-master:8033</value>
</property>
<property>
<name>yarn.application.classpath</name>
<value>/usr/local/hadoop/etc/hadoop, /usr/local/hadoop/share/hadoop/common/*, /usr/local/hadoop/share/hadoop/common/lib/*, /usr/local/hadoop/share/hadoop/hdfs/*, /usr/local/hadoop/share/hadoop/hdfs/lib/*, /usr/local/hadoop/share/hadoop/mapreduce/*, /usr/local/hadoop/share/hadoop/mapreduce/lib/*, /usr/local/hadoop/share/hadoop/yarn/*, /usr/local/hadoop/share/hadoop/yarn/lib/*, /usr/local/hadoop/share/spark/*</value>
</property>
</configuration>

View File: config/spark/slaves (deleted)

@@ -1,3 +0,0 @@
# A Spark Worker will be started on each of the machines listed below.
spark-slave1
spark-slave2

View File: config/spark/spark-defaults.conf (deleted)

@@ -1,10 +0,0 @@
# Default system properties included when running spark-submit.
# This is useful for setting default environmental settings.
# Example:
spark.master spark://spark-master:7077
spark.eventLog.enabled true
spark.eventLog.dir hdfs://spark-master:/var/log/spark
spark.serializer org.apache.spark.serializer.KryoSerializer
# spark.driver.memory 5g
# spark.executor.extraJavaOptions -XX:+PrintGCDetails -Dkey=value -Dnumbers="one two three"
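With `spark.eventLog.enabled`, the log directory has to exist before the first application starts. A sketch of preparing it and serving the logs, assuming `fs.defaultFS` points at `hdfs://spark-master:8020` as in the core-site.xml above:
```bash
# Create the event-log directory on HDFS (path from spark.eventLog.dir above)
hadoop fs -mkdir -p /var/log/spark
# The history server configured in spark-env.sh (next file) serves the same
# directory on port 18080
$SPARK_HOME/sbin/start-history-server.sh
```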

View File: config/spark/spark-env.sh (deleted)

@@ -1,52 +0,0 @@
#!/usr/bin/env bash
# This file is sourced when running various Spark programs.
# Copy it as spark-env.sh and edit that to configure Spark for your site.
# Options read when launching programs locally with
# ./bin/run-example or ./bin/spark-submit
# - HADOOP_CONF_DIR, to point Spark towards Hadoop configuration files
# - SPARK_LOCAL_IP, to set the IP address Spark binds to on this node
# - SPARK_PUBLIC_DNS, to set the public dns name of the driver program
# - SPARK_CLASSPATH, default classpath entries to append
# Options read by executors and drivers running inside the cluster
# - SPARK_LOCAL_IP, to set the IP address Spark binds to on this node
# - SPARK_PUBLIC_DNS, to set the public DNS name of the driver program
# - SPARK_CLASSPATH, default classpath entries to append
# - SPARK_LOCAL_DIRS, storage directories to use on this node for shuffle and RDD data
# - MESOS_NATIVE_LIBRARY, to point to your libmesos.so if you use Mesos
# Options read in YARN client mode
# - HADOOP_CONF_DIR, to point Spark towards Hadoop configuration files
# - SPARK_EXECUTOR_INSTANCES, Number of workers to start (Default: 2)
# - SPARK_EXECUTOR_CORES, Number of cores for the workers (Default: 1).
# - SPARK_EXECUTOR_MEMORY, Memory per Worker (e.g. 1000M, 2G) (Default: 1G)
# - SPARK_DRIVER_MEMORY, Memory for Master (e.g. 1000M, 2G) (Default: 512 Mb)
# - SPARK_YARN_APP_NAME, The name of your application (Default: Spark)
# - SPARK_YARN_QUEUE, The hadoop queue to use for allocation requests (Default: default)
# - SPARK_YARN_DIST_FILES, Comma separated list of files to be distributed with the job.
# - SPARK_YARN_DIST_ARCHIVES, Comma separated list of archives to be distributed with the job.
# Options for the daemons used in the standalone deploy mode:
# - SPARK_MASTER_IP, to bind the master to a different IP address or hostname
# - SPARK_MASTER_PORT / SPARK_MASTER_WEBUI_PORT, to use non-default ports for the master
# - SPARK_MASTER_OPTS, to set config properties only for the master (e.g. "-Dx=y")
# - SPARK_WORKER_CORES, to set the number of cores to use on this machine
# - SPARK_WORKER_MEMORY, to set how much total memory workers have to give executors (e.g. 1000m, 2g)
# - SPARK_WORKER_PORT / SPARK_WORKER_WEBUI_PORT, to use non-default ports for the worker
# - SPARK_WORKER_INSTANCES, to set the number of worker processes per node
# - SPARK_WORKER_DIR, to set the working directory of worker processes
# - SPARK_WORKER_OPTS, to set config properties only for the worker (e.g. "-Dx=y")
# - SPARK_HISTORY_OPTS, to set config properties only for the history server (e.g. "-Dx=y")
# - SPARK_DAEMON_JAVA_OPTS, to set config properties for all daemons (e.g. "-Dx=y")
# - SPARK_PUBLIC_DNS, to set the public dns name of the master or workers
export JAVA_HOME=/usr/java/default
export SPARK_MASTER_IP=spark-master
export SPARK_WORKER_CORES=1
export SPARK_WORKER_INSTANCES=1
export SPARK_MASTER_PORT=7077
export SPARK_WORKER_MEMORY=2g
export MASTER=spark://${SPARK_MASTER_IP}:${SPARK_MASTER_PORT}
export SPARK_HISTORY_OPTS="-Dspark.history.ui.port=18080 -Dspark.history.retainedApplications=3 -Dspark.history.fs.logDirectory=hdfs://spark-master:/var/log/spark"