update spark

Newnius 2018-08-08 12:16:24 +08:00
parent c7b252fe79
commit 702f667281
16 changed files with 52 additions and 335 deletions

Dockerfile

@@ -1,26 +1,34 @@
-FROM newnius/hadoop:2.8.1
+FROM alpine:3.8
 MAINTAINER Newnius <newnius.cn@gmail.com>
-RUN wget http://mirrors.ocf.berkeley.edu/apache/spark/spark-2.2.1/spark-2.2.1-bin-hadoop2.7.tgz && \
-    tar -xvf spark-2.2.1-bin-hadoop2.7.tgz -C /usr/local && \
-    rm spark-2.2.1-bin-hadoop2.7.tgz
-RUN ln -s /usr/local/spark-2.2.1-bin-hadoop2.7 /usr/local/spark
+USER root
+# Prerequisites
+RUN apk add --no-cache openssh openssl openjdk8-jre rsync bash procps coreutils
+ENV JAVA_HOME /usr/lib/jvm/java-1.8-openjdk
+ENV PATH $PATH:$JAVA_HOME/bin
+ENV SPARK_VER 2.2.1
+RUN wget https://archive.apache.org/dist/spark/spark-$SPARK_VER/spark-$SPARK_VER-bin-hadoop2.7.tgz && \
+    tar -xvf spark-$SPARK_VER-bin-hadoop2.7.tgz -C /usr/local && \
+    rm spark-$SPARK_VER-bin-hadoop2.7.tgz
+RUN ln -s /usr/local/spark-$SPARK_VER-bin-hadoop2.7 /usr/local/spark
 ENV SPARK_HOME /usr/local/spark
+ADD config/* /usr/local/hadoop/etc/hadoop/
+ENV HADOOP_CONF_DIR /usr/local/hadoop/etc/hadoop
 ENV PATH $PATH:$SPARK_HOME/bin
-# Default Conf Files
-ADD core-site.xml $HADOOP_HOME/etc/hadoop/core-site.xml
-ADD hdfs-site.xml $HADOOP_HOME/etc/hadoop/hdfs-site.xml
-ADD mapred-site.xml $HADOOP_HOME/etc/hadoop/mapred-site.xml
-ADD yarn-site.xml $HADOOP_HOME/etc/hadoop/yarn-site.xml
-ADD slaves $HADOOP_HOME/etc/hadoop/slaves
-ADD bootstrap.sh /etc/bootstrap-spark.sh
+ADD bootstrap.sh /etc/bootstrap.sh
 WORKDIR /usr/local/spark
-CMD ["/etc/bootstrap-spark.sh", "-d"]
+CMD ["/etc/bootstrap.sh"]

README.md

@@ -1,93 +1,45 @@
-# Spark on yarn
-## Create a spark cluster in swarm mode
-`--hostname` needs Docker 1.13 or higher
-```bash
-docker service create \
---name spark-master \
---hostname spark-master \
---detach true \
---network swarm-net \
---replicas 1 \
---endpoint-mode dnsrr \
-newnius/spark:2.2.1
-```
-```bash
-docker service create \
---name spark-slave1 \
---hostname spark-slave1 \
---detach true \
---network swarm-net \
---replicas 1 \
---endpoint-mode dnsrr \
-newnius/spark:2.2.1
-```
-```bash
-docker service create \
---name spark-slave2 \
---hostname spark-slave2 \
---detach true \
---network swarm-net \
---replicas 1 \
---endpoint-mode dnsrr \
-newnius/spark:2.2.1
-```
-## Init && Test
-In the first deploy, format hdfs
-### Stop HDFS (in master)
-```bash
-sbin/stop-dfs.sh
-```
-### Format HDFS (in master)
-```
-bin/hadoop namenode -format
-```
-### Start HDFS (in master)
-```bash
-sbin/start-dfs.sh
-```
-### Run Hello World
-```bash
-spark-submit \
---master yarn \
---deploy-mode cluster \
---class org.apache.spark.examples.JavaSparkPi \
-./examples/jars/spark-examples_2.11-2.2.1.jar 100
-```
-### UI
-YARN: spark-master:8088
-HDFS: spark-master:50070
-_Proxy needed, e.g. [newnius/docker-proxy](https://hub.docker.com/r/newnius/docker-proxy/)_
-## customized config
-```bash
-docker service create \
---name spark-master \
---hostname spark-master \
---detach=true \
---network swarm-net \
---replicas 1 \
---mount type=bind,source=/mnt/data/spark/hdfs/master,target=/tmp/hadoop-root \
---mount type=bind,source=/mnt/data/spark/logs/master,target=/usr/local/hadoop/logs \
---mount type=bind,source=/mnt/data/spark/config/hadoop,target=/mnt/config/hadoop \
---mount type=bind,source=/mnt/data/spark/config/spark,target=/mnt/config/spark \
---endpoint-mode dnsrr \
-newnius/spark:2.2.1
-```
-You don't need to put all files in the dir, only add files to be replaced.
+# Deploy Spark On Yarn
+## Client
+```bash
+docker service create \
+--name spark-client \
+--hostname spark-client \
+--network swarm-net \
+--replicas 1 \
+--detach true \
+newnius/spark:2.2.1-yarn
+```
+## Validate installation
+#### spark-submit PI
+```bash
+spark-submit \
+--master yarn \
+--deploy-mode cluster \
+--class org.apache.spark.examples.JavaSparkPi \
+./examples/jars/spark-examples*.jar 100
+```
+#### spark-shell HDFS wordcount
+Run `spark-shell --master yarn` to enter the shell.
+```shell
+val lines = sc.textFile("hdfs://hadoop-master:8020/user/root/input")
+val words = lines.flatMap(_.split("\\s+"))
+val wc = words.map(word => (word, 1)).reduceByKey(_ + _)
+wc.collect()
+val cnt = words.map(word => 1).reduce(_ + _)
+```
+## Browse the web UI
+In Spark on YARN mode, Spark jobs appear in the YARN web UI.
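The new wordcount example assumes data already exists at `hdfs://hadoop-master:8020/user/root/input`; if it does not, one could seed it from the same `spark-shell` session (a sketch, with hypothetical sample text):

```shell
// Hypothetical sample input; the target path matches the wordcount snippet
sc.parallelize(Seq("hello spark", "hello yarn")).saveAsTextFile("hdfs://hadoop-master:8020/user/root/input")
```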

bootstrap.sh

@@ -1,6 +1,10 @@
 #! /bin/bash
-# replace config
-cp /mnt/config/spark/* $SPARK_HOME/conf
-bash -c "/etc/bootstrap.sh -d"
+## replace config
+: ${EXTRA_CONF_DIR:=/config/hadoop}
+if [ -d "$EXTRA_CONF_DIR" ]; then
+  cp $EXTRA_CONF_DIR/* /usr/local/hadoop/etc/hadoop/
+fi
+while true; do sleep 1000; done
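With the new `EXTRA_CONF_DIR` hook, site-specific Hadoop config files can be bind-mounted rather than baked into the image. A hedged sketch of such a deployment (the host path is illustrative; `/config/hadoop` is the script's default target):

```bash
# Files mounted at /config/hadoop are copied over the Hadoop conf at startup
docker service create \
  --name spark-client \
  --hostname spark-client \
  --network swarm-net \
  --replicas 1 \
  --detach true \
  --mount type=bind,source=/mnt/config/hadoop,target=/config/hadoop \
  newnius/spark:2.2.1-yarn
```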

core-site.xml

@@ -1,28 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!--
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. See accompanying LICENSE file.
-->
<!-- Put site-specific property overrides in this file. -->
<configuration>
<property>
<name>fs.defaultFS</name>
<value>hdfs://spark-master:8020</value>
</property>
<property>
<name>fs.default.name</name>
<value>hdfs://spark-master:8020</value>
</property>
</configuration>

hdfs-site.xml

@@ -1,40 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!--
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. See accompanying LICENSE file.
-->
<!-- Put site-specific property overrides in this file. -->
<configuration>
<property>
<name>dfs.permissions</name>
<value>false</value>
</property>
<property>
<name>dfs.namenode.secondary.http-address</name>
<value>spark-slave1:50090</value>
</property>
<property>
<name>dfs.namenode.http-address</name>
<value>spark-master:50070</value>
</property>
<property>
<name>dfs.datanode.max.transfer.threads</name>
<value>8192</value>
</property>
<property>
<name>dfs.replication</name>
<value>2</value>
</property>
</configuration>

mapred-site.xml

@@ -1,32 +0,0 @@
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!--
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. See accompanying LICENSE file.
-->
<!-- Put site-specific property overrides in this file. -->
<configuration>
<property>
<name>mapreduce.framework.name</name>
<value>yarn</value>
</property>
<property>
<name>mapreduce.jobhistory.address</name>
<value>spark-master:10020</value>
</property>
<property>
<name>mapreduce.jobhistory.webapp.address</name>
<value>spark-master:19888</value>
</property>
</configuration>

slaves

@@ -1,3 +0,0 @@
spark-slave1
spark-slave2
spark-slave3

yarn-site.xml

@@ -1,49 +0,0 @@
<?xml version="1.0"?>
<!--
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. See accompanying LICENSE file.
-->
<!-- Site specific YARN configuration properties -->
<configuration>
<property>
<name>yarn.application.classpath</name>
<value>/usr/local/hadoop/etc/hadoop, /usr/local/hadoop/share/hadoop/common/*, /usr/local/hadoop/share/hadoop/common/lib/*, /usr/local/hadoop/share/hadoop/hdfs/*, /usr/local/hadoop/share/hadoop/hdfs/lib/*, /usr/local/hadoop/share/hadoop/mapreduce/*, /usr/local/hadoop/share/hadoop/mapreduce/lib/*, /usr/local/hadoop/share/hadoop/yarn/*, /usr/local/hadoop/share/hadoop/yarn/lib/*</value>
</property>
<property>
<name>yarn.resourcemanager.hostname</name>
<value>spark-master</value>
</property>
<property>
<name>yarn.nodemanager.aux-services</name>
<value>mapreduce_shuffle</value>
</property>
<property>
<name>yarn.log-aggregation-enable</name>
<value>true</value>
</property>
<property>
<name>yarn.log-aggregation.retain-seconds</name>
<value>604800</value>
</property>
<property>
<name>yarn.nodemanager.resource.memory-mb</name>
<value>2048</value>
</property>
<property>
<name>yarn.nodemanager.resource.cpu-vcores</name>
<value>2</value>
</property>
<property>
<name>yarn.scheduler.minimum-allocation-mb</name>
<value>1024</value>
</property>
</configuration>
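These NodeManager limits bounded what jobs could request: 2048 MB and 2 vcores per node, with 1024 MB minimum allocations. A hedged sketch of a submit sized to fit such limits (standard `spark-submit` flags; the values are illustrative):

```bash
# 1 GB executors plus default overhead stay within the 2048 MB NodeManager limit
spark-submit \
  --master yarn \
  --deploy-mode cluster \
  --executor-memory 1g \
  --executor-cores 1 \
  --class org.apache.spark.examples.JavaSparkPi \
  ./examples/jars/spark-examples*.jar 100
```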

Dockerfile

@@ -1,34 +0,0 @@
FROM alpine:3.8
MAINTAINER Newnius <newnius.cn@gmail.com>
USER root
# Prerequisites
RUN apk add --no-cache openssh openssl openjdk8-jre rsync bash procps coreutils
ENV JAVA_HOME /usr/lib/jvm/java-1.8-openjdk
ENV PATH $PATH:$JAVA_HOME/bin
ENV SPARK_VER 2.2.1
RUN wget https://archive.apache.org/dist/spark/spark-$SPARK_VER/spark-$SPARK_VER-bin-hadoop2.7.tgz && \
tar -xvf spark-$SPARK_VER-bin-hadoop2.7.tgz -C /usr/local && \
rm spark-$SPARK_VER-bin-hadoop2.7.tgz
RUN ln -s /usr/local/spark-$SPARK_VER-bin-hadoop2.7 /usr/local/spark
ENV SPARK_HOME /usr/local/spark
ADD config/* /usr/local/hadoop/etc/hadoop
ENV HADOOP_CONF_DIR /usr/local/hadoop/etc/hadoop
ENV PATH $PATH:$SPARK_HOME/bin
ADD bootstrap.sh /etc/bootstrap.sh
WORKDIR /usr/local/spark
CMD ["/etc/bootstrap.sh", "-d"]

README.md

@@ -1,45 +0,0 @@
# Deploy Spark On Yarn
## Client
```bash
docker service create \
--name spark-client \
--hostname spark-client \
--network swarm-net \
--replicas 1 \
--detach true \
newnius/spark:2.2.1-yarn
```
## Validate installation
#### spark-submit PI
```bash
spark-submit \
--master yarn \
--deploy-mode cluster \
--class org.apache.spark.examples.JavaSparkPi \
./examples/jars/spark-examples*.jar 100
```
#### spark-shell HDFS wordcount
Run `spark-shell --master yarn` to enter the shell.
```shell
val lines = sc.textFile("hdfs://hadoop-master:8020/user/root/input")
val words = lines.flatMap(_.split("\\s+"))
val wc = words.map(word => (word, 1)).reduceByKey(_ + _)
wc.collect()
val cnt = words.map(word => 1).reduce(_ + _)
```
## Browse the web UI
In Spark on YARN mode, Spark jobs appear in the YARN web UI.

bootstrap.sh

@@ -1,16 +0,0 @@
#! /bin/bash
## replace config
: ${EXTRA_CONF_DIR:=/config/hadoop}
if [ -d "$EXTRA_CONF_DIR" ]; then
cp $EXTRA_CONF_DIR/* /usr/local/hadoop/etc/hadoop/
fi
if [[ $1 == "-d" ]]; then
while true; do sleep 1000; done
fi
if [[ $1 == "-bash" ]]; then
/bin/bash
fi