mirror of
https://github.com/newnius/Dockerfiles.git
synced 2025-06-06 16:11:55 +00:00
update spark, add more versions
This commit is contained in:
parent 94228e4944
commit 5d78cbb18c
@@ -1,13 +1,30 @@
FROM sequenceiq/spark:1.6.0
FROM alpine:3.8

MAINTAINER Newnius <docker@newnius.com>
MAINTAINER Newnius <newnius.cn@gmail.com>

USER root

# Prerequisites
RUN apk add --no-cache openssh openssl openjdk8-jre rsync bash procps coreutils

ENV JAVA_HOME /usr/lib/jvm/java-1.8-openjdk
ENV PATH $PATH:$JAVA_HOME/bin

ENV SPARK_VER 1.6.0

RUN wget https://archive.apache.org/dist/spark/spark-$SPARK_VER/spark-$SPARK_VER-bin-hadoop2.6.tgz && \
  tar -xvf spark-$SPARK_VER-bin-hadoop2.6.tgz -C /usr/local && \
  rm spark-$SPARK_VER-bin-hadoop2.6.tgz

RUN ln -s /usr/local/spark-$SPARK_VER-bin-hadoop2.6 /usr/local/spark

ENV SPARK_HOME /usr/local/spark
ENV PATH $PATH:$SPARK_HOME/bin

ADD bootstrap.sh /etc/bootstrap.sh

COPY config/hadoop /mnt/config/hadoop
COPY config/spark /mnt/config/spark
COPY config/spark-yarn-remote-client /mnt/config/spark-yarn-remote-client

WORKDIR /usr/local/hadoop
WORKDIR /usr/local/spark

ENTRYPOINT ["/etc/bootstrap.sh"]
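For reference, a minimal sketch of building the updated image from this directory (the tag is an assumption; adjust as needed):

```bash
# Build the image next to the Dockerfile and bootstrap.sh
docker build -t newnius/spark:1.6.0 .
```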
@@ -1,82 +1,64 @@
# based on sequenceiq/spark
# Deploy a Spark cluster in standalone mode

## Create a Spark cluster in swarm mode

`--hostname` requires Docker 1.13 or higher.

## Master

```bash
docker service create \
  --name spark-master \
  --network swarm-net \
  --replicas 1 \
  --endpoint-mode dnsrr \
  newnius/spark
  --name spark-master \
  --hostname spark-master \
  --network swarm-net \
  --replicas 1 \
  --detach true \
  --endpoint-mode dnsrr \
  newnius/spark:1.6.0 master
```
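One way to confirm the master came up (plain Docker commands, not part of the original instructions; `docker service logs` needs a reasonably recent Docker release):

```bash
# Check task state and read the master's logs
docker service ps spark-master
docker service logs spark-master
```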
## Slaves

```bash
docker service create \
  --name spark-slave1 \
  --network swarm-net \
  --replicas 1 \
  --endpoint-mode dnsrr \
  newnius/spark
  --name spark-slave \
  --network swarm-net \
  --replicas 5 \
  --detach true \
  --endpoint-mode dnsrr \
  newnius/spark:1.6.0 slave spark://spark-master:7077
```
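Since the workers now run as one replicated service, resizing the cluster later is a single command (a sketch; service name as above):

```bash
# Grow or shrink the worker pool by changing the replica count
docker service scale spark-slave=8
```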
## Validate installation

#### spark-submit PI

```bash
docker service create \
  --name spark-slave2 \
  --network swarm-net \
  --replicas 1 \
  --endpoint-mode dnsrr \
  newnius/spark
spark-submit \
  --master spark://spark-master:7077 \
  --deploy-mode cluster \
  --class org.apache.spark.examples.JavaSparkPi \
  ./examples/jars/spark-examples_2.11-1.6.0.jar 100
```
## Init && Test

#### spark-shell HDFS wordcount

On the first deployment, format the DFS first.

Run `spark-shell --master spark://spark-master:7077` to open a shell.

### stop cluster (in master)

`sbin/stop-yarn.sh`
`sbin/stop-dfs.sh`
`../spark/sbin/stop-all.sh`

### remove previous data (in all nodes)

Clear all data in /tmp on all nodes.

### format hdfs (in master)

```
bin/hadoop namenode -format
```

```shell
val lines = sc.textFile("hdfs://hadoop-master:8020/user/root/input")
val words = lines.flatMap(_.split("\\s+"))
val wc = words.map(word => (word, 1)).reduceByKey(_ + _)
wc.collect()
val cnt = words.map(word => 1).reduce(_ + _)
```
### start cluster (in master)

`sbin/start-dfs.sh`
`sbin/start-yarn.sh`
`../spark/sbin/start-all.sh`

## Browse the web UI

### monitor cluster in browser

YARN: spark-master:8088
HDFS: spark-master:50070
SPARK: spark-master:8080

You can expose the ports in the script, but I'd rather not, since the slaves would occupy the same ports.

To access the web UI, deploy another (SOCKS5) proxy to route the traffic.

If you don't have one, try [newnius/docker-proxy](https://hub.docker.com/r/newnius/docker-proxy/); it is rather easy to use.

_Proxy needed, e.g. newnius/docker-proxy_
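A rough sketch of running such a proxy on the same overlay network (the published port and options are assumptions, not taken from the image's documentation):

```bash
# Hypothetical: expose a SOCKS5 proxy that can resolve service names on swarm-net,
# then point the browser at it to reach spark-master:8080 and friends
docker service create \
  --name proxy \
  --network swarm-net \
  --replicas 1 \
  --publish 1080:1080 \
  newnius/docker-proxy
```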
## customized config

```bash
docker service create \
  --name spark-master \
  --network swarm-net \
  --replicas 1 \
  --mount type=bind,source=/mnt/data/spark/hdfs/master,target=/tmp/hadoop-root \
  --mount type=bind,source=/mnt/data/spark/logs/master,target=/usr/local/hadoop/logs \
  --mount type=bind,source=/mnt/data/spark/config/hadoop,target=/mnt/config/hadoop \
  --mount type=bind,source=/mnt/data/spark/config/spark,target=/mnt/config/spark \
  --mount type=bind,source=/mnt/data/spark/config/spark-yarn-remote-client,target=/mnt/config/spark-yarn-remote-client \
  --endpoint-mode dnsrr \
  newnius/spark
```

You don't need to put all the files in the directory; only add the files that need to be modified.

Visit [spark-master:8080](http://spark-master:8080) to view the cluster.
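For instance, to override just one file, bind-mount only that file into the config directory the bootstrap script copies from (a sketch; the host path is hypothetical):

```bash
# Override only spark-defaults.conf; everything else keeps the image defaults
docker service create \
  --name spark-master \
  --network swarm-net \
  --replicas 1 \
  --endpoint-mode dnsrr \
  --mount type=bind,source=/mnt/data/spark/config/spark/spark-defaults.conf,target=/mnt/config/spark/spark-defaults.conf \
  newnius/spark
```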
@@ -1,35 +1,11 @@
#! /bin/bash

: ${HADOOP_PREFIX:=/usr/local/hadoop}

$HADOOP_PREFIX/etc/hadoop/hadoop-env.sh

rm /tmp/*.pid

# installing libraries if any - (resource urls added comma separated to the ACP system variable)
cd $HADOOP_PREFIX/share/hadoop/common ; for cp in ${ACP//,/ }; do echo == $cp; curl -LO $cp ; done; cd -

cp -a /mnt/config/hadoop/. /usr/local/hadoop/etc/hadoop/
cp -a /mnt/config/spark/. /usr/local/spark/conf/
cp -a /mnt/config/spark-yarn-remote-client/. /usr/local/spark/yarn-remote-client/

service sshd start

## stop all in case master starts far behind
$HADOOP_PREFIX/sbin/stop-yarn.sh
$HADOOP_PREFIX/sbin/stop-dfs.sh

$HADOOP_PREFIX/sbin/start-dfs.sh
$HADOOP_PREFIX/sbin/start-yarn.sh
$HADOOP_PREFIX/sbin/mr-jobhistory-daemon.sh start historyserver

$SPARK_HOME/sbin/start-all.sh

if [[ $1 == "-bash" ]]; then
  /bin/bash
else
  while true; do sleep 1000; done
if [[ $1 == "master" ]]; then
  /usr/local/spark/sbin/start-master.sh
fi

if [[ $1 == "slave" ]]; then
  /usr/local/spark/sbin/start-slave.sh $2
fi

while true; do sleep 1000; done
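The new entrypoint only dispatches on its first argument, so it can be smoke-tested without swarm (a sketch; network and container names are assumptions):

```bash
# Spin up a throwaway master and one worker on a local bridge network
docker network create spark-test
docker run -d --name spark-master --hostname spark-master --network spark-test newnius/spark:1.6.0 master
docker run -d --name spark-worker --network spark-test newnius/spark:1.6.0 slave spark://spark-master:7077
```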
@@ -1,29 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!--
  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
  You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

  Unless required by applicable law or agreed to in writing, software
  distributed under the License is distributed on an "AS IS" BASIS,
  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  See the License for the specific language governing permissions and
  limitations under the License. See accompanying LICENSE file.
-->

<!-- Put site-specific property overrides in this file. -->

<configuration>
  <property>
    <name>fs.defaultFS</name>
    <value>hdfs://spark-master:8020</value>
  </property>
  <property>
    <name>fs.default.name</name>
    <value>hdfs://spark-master:8020</value>
  </property>
</configuration>
@@ -1,46 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!--
  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
  You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

  Unless required by applicable law or agreed to in writing, software
  distributed under the License is distributed on an "AS IS" BASIS,
  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  See the License for the specific language governing permissions and
  limitations under the License. See accompanying LICENSE file.
-->

<!-- Put site-specific property overrides in this file. -->

<configuration>
  <property>
    <name>dfs.permissions</name>
    <value>false</value>
  </property>

  <property>
    <name>dfs.namenode.secondary.http-address</name>
    <value>spark-slave1:50090</value>
  </property>
  <property>
    <name>dfs.namenode.http-address</name>
    <value>spark-master:50070</value>
  </property>

  <property>
    <name>dfs.datanode.max.transfer.threads</name>
    <value>8192</value>
  </property>

  <property>
    <name>dfs.replication</name>
    <value>3</value>
  </property>
</configuration>
@@ -1,26 +0,0 @@
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!--
  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
  You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

  Unless required by applicable law or agreed to in writing, software
  distributed under the License is distributed on an "AS IS" BASIS,
  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  See the License for the specific language governing permissions and
  limitations under the License. See accompanying LICENSE file.
-->

<!-- Put site-specific property overrides in this file. -->

<configuration>
  <property>
    <name>mapreduce.framework.name</name>
    <value>yarn</value>
  </property>
</configuration>
@@ -1,2 +0,0 @@
spark-slave1
spark-slave2
@@ -1,57 +0,0 @@
<?xml version="1.0"?>
<!--
  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
  You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

  Unless required by applicable law or agreed to in writing, software
  distributed under the License is distributed on an "AS IS" BASIS,
  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  See the License for the specific language governing permissions and
  limitations under the License. See accompanying LICENSE file.
-->
<!-- Site specific YARN configuration properties -->
<configuration>
  <property>
    <name>yarn.application.classpath</name>
    <value>/usr/local/hadoop/etc/hadoop, /usr/local/hadoop/share/hadoop/common/*, /usr/local/hadoop/share/hadoop/common/lib/*, /usr/local/hadoop/share/hadoop/hdfs/*, /usr/local/hadoop/share/hadoop/hdfs/lib/*, /usr/local/hadoop/share/hadoop/mapreduce/*, /usr/local/hadoop/share/hadoop/mapreduce/lib/*, /usr/local/hadoop/share/hadoop/yarn/*, /usr/local/hadoop/share/hadoop/yarn/lib/*</value>
  </property>

  <property>
    <name>yarn.resourcemanager.hostname</name>
    <value>spark-master</value>
  </property>

  <property>
    <name>yarn.nodemanager.aux-services</name>
    <value>mapreduce_shuffle</value>
  </property>

  <property>
    <name>yarn.log-aggregation-enable</name>
    <value>true</value>
  </property>

  <property>
    <name>yarn.log-aggregation.retain-seconds</name>
    <value>604800</value>
  </property>

  <property>
    <name>yarn.nodemanager.resource.memory-mb</name>
    <value>2048</value>
  </property>
  <property>
    <name>yarn.nodemanager.resource.cpu-vcores</name>
    <value>2</value>
  </property>
  <property>
    <name>yarn.scheduler.minimum-allocation-mb</name>
    <value>1024</value>
  </property>
</configuration>
@@ -1,12 +0,0 @@
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
  <property>
    <name>fs.default.name</name>
    <value>hdfs://spark-master:8020</value>
  </property>
  <property>
    <name>dfs.client.use.legacy.blockreader</name>
    <value>true</value>
  </property>
</configuration>
@@ -1,26 +0,0 @@
<configuration>
  <property>
    <name>yarn.resourcemanager.scheduler.address</name>
    <value>spark-master:8030</value>
  </property>
  <property>
    <name>yarn.resourcemanager.address</name>
    <value>spark-master:8032</value>
  </property>
  <property>
    <name>yarn.resourcemanager.webapp.address</name>
    <value>spark-master:8088</value>
  </property>
  <property>
    <name>yarn.resourcemanager.resource-tracker.address</name>
    <value>spark-master:8031</value>
  </property>
  <property>
    <name>yarn.resourcemanager.admin.address</name>
    <value>spark-master:8033</value>
  </property>
  <property>
    <name>yarn.application.classpath</name>
    <value>/usr/local/hadoop/etc/hadoop, /usr/local/hadoop/share/hadoop/common/*, /usr/local/hadoop/share/hadoop/common/lib/*, /usr/local/hadoop/share/hadoop/hdfs/*, /usr/local/hadoop/share/hadoop/hdfs/lib/*, /usr/local/hadoop/share/hadoop/mapreduce/*, /usr/local/hadoop/share/hadoop/mapreduce/lib/*, /usr/local/hadoop/share/hadoop/yarn/*, /usr/local/hadoop/share/hadoop/yarn/lib/*, /usr/local/hadoop/share/spark/*</value>
  </property>
</configuration>
@@ -1,3 +0,0 @@
# A Spark Worker will be started on each of the machines listed below.
spark-slave1
spark-slave2
@@ -1,10 +0,0 @@
# Default system properties included when running spark-submit.
# This is useful for setting default environmental settings.

# Example:
spark.master                     spark://spark-master:7077
spark.eventLog.enabled           true
spark.eventLog.dir               hdfs://spark-master:/var/log/spark
spark.serializer                 org.apache.spark.serializer.KryoSerializer
# spark.driver.memory            5g
# spark.executor.extraJavaOptions  -XX:+PrintGCDetails -Dkey=value -Dnumbers="one two three"
@@ -1,52 +0,0 @@
#!/usr/bin/env bash

# This file is sourced when running various Spark programs.
# Copy it as spark-env.sh and edit that to configure Spark for your site.

# Options read when launching programs locally with
# ./bin/run-example or ./bin/spark-submit
# - HADOOP_CONF_DIR, to point Spark towards Hadoop configuration files
# - SPARK_LOCAL_IP, to set the IP address Spark binds to on this node
# - SPARK_PUBLIC_DNS, to set the public dns name of the driver program
# - SPARK_CLASSPATH, default classpath entries to append

# Options read by executors and drivers running inside the cluster
# - SPARK_LOCAL_IP, to set the IP address Spark binds to on this node
# - SPARK_PUBLIC_DNS, to set the public DNS name of the driver program
# - SPARK_CLASSPATH, default classpath entries to append
# - SPARK_LOCAL_DIRS, storage directories to use on this node for shuffle and RDD data
# - MESOS_NATIVE_LIBRARY, to point to your libmesos.so if you use Mesos

# Options read in YARN client mode
# - HADOOP_CONF_DIR, to point Spark towards Hadoop configuration files
# - SPARK_EXECUTOR_INSTANCES, Number of workers to start (Default: 2)
# - SPARK_EXECUTOR_CORES, Number of cores for the workers (Default: 1).
# - SPARK_EXECUTOR_MEMORY, Memory per Worker (e.g. 1000M, 2G) (Default: 1G)
# - SPARK_DRIVER_MEMORY, Memory for Master (e.g. 1000M, 2G) (Default: 512 Mb)
# - SPARK_YARN_APP_NAME, The name of your application (Default: Spark)
# - SPARK_YARN_QUEUE, The hadoop queue to use for allocation requests (Default: ‘default’)
# - SPARK_YARN_DIST_FILES, Comma separated list of files to be distributed with the job.
# - SPARK_YARN_DIST_ARCHIVES, Comma separated list of archives to be distributed with the job.

# Options for the daemons used in the standalone deploy mode:
# - SPARK_MASTER_IP, to bind the master to a different IP address or hostname
# - SPARK_MASTER_PORT / SPARK_MASTER_WEBUI_PORT, to use non-default ports for the master
# - SPARK_MASTER_OPTS, to set config properties only for the master (e.g. "-Dx=y")
# - SPARK_WORKER_CORES, to set the number of cores to use on this machine
# - SPARK_WORKER_MEMORY, to set how much total memory workers have to give executors (e.g. 1000m, 2g)
# - SPARK_WORKER_PORT / SPARK_WORKER_WEBUI_PORT, to use non-default ports for the worker
# - SPARK_WORKER_INSTANCES, to set the number of worker processes per node
# - SPARK_WORKER_DIR, to set the working directory of worker processes
# - SPARK_WORKER_OPTS, to set config properties only for the worker (e.g. "-Dx=y")
# - SPARK_HISTORY_OPTS, to set config properties only for the history server (e.g. "-Dx=y")
# - SPARK_DAEMON_JAVA_OPTS, to set config properties for all daemons (e.g. "-Dx=y")
# - SPARK_PUBLIC_DNS, to set the public dns name of the master or workers

export JAVA_HOME=/usr/java/default
export SPARK_MASTER_IP=spark-master
export SPARK_WORKER_CORES=1
export SPARK_WORKER_INSTANCES=1
export SPARK_MORKER_PORT=7077
export SPARK_WORKER_MEMORY=2g
export MASTER=spark://${SPARK_MASTER_IP}:${SPARK_MASTER_PORT}
export SPARK_HISTORY_OPTS="-Dspark.history.ui.port=18080 -Dspark.history.retainedApplications=3 -Dspark.history.fs.logDirectory=hdfs://spark-master:/var/log/spark"