update spark

2018-08-08 12:16:24 +08:00
parent c7b252fe79
commit 702f667281
16 changed files with 52 additions and 335 deletions

Dockerfile

@@ -1,26 +1,34 @@
-FROM newnius/hadoop:2.8.1
+FROM alpine:3.8
 MAINTAINER Newnius <newnius.cn@gmail.com>
-RUN wget http://mirrors.ocf.berkeley.edu/apache/spark/spark-2.2.1/spark-2.2.1-bin-hadoop2.7.tgz && \
-tar -xvf spark-2.2.1-bin-hadoop2.7.tgz -C /usr/local && \
-rm spark-2.2.1-bin-hadoop2.7.tgz
-USER root
-RUN ln -s /usr/local/spark-2.2.1-bin-hadoop2.7 /usr/local/spark
+# Prerequisites
+RUN apk add --no-cache openssh openssl openjdk8-jre rsync bash procps coreutils
+ENV JAVA_HOME /usr/lib/jvm/java-1.8-openjdk
+ENV PATH $PATH:$JAVA_HOME/bin
+ENV SPARK_VER 2.2.1
+RUN wget https://archive.apache.org/dist/spark/spark-$SPARK_VER/spark-$SPARK_VER-bin-hadoop2.7.tgz && \
+tar -xvf spark-$SPARK_VER-bin-hadoop2.7.tgz -C /usr/local && \
+rm spark-$SPARK_VER-bin-hadoop2.7.tgz
+RUN ln -s /usr/local/spark-$SPARK_VER-bin-hadoop2.7 /usr/local/spark
 ENV SPARK_HOME /usr/local/spark
-ADD config/* /usr/local/hadoop/etc/hadoop/
 ENV HADOOP_CONF_DIR /usr/local/hadoop/etc/hadoop
 ENV PATH $PATH:$SPARK_HOME/bin
-ADD bootstrap.sh /etc/bootstrap-spark.sh
+# Default Conf Files
+ADD core-site.xml $HADOOP_HOME/etc/hadoop/core-site.xml
+ADD hdfs-site.xml $HADOOP_HOME/etc/hadoop/hdfs-site.xml
+ADD mapred-site.xml $HADOOP_HOME/etc/hadoop/mapred-site.xml
+ADD yarn-site.xml $HADOOP_HOME/etc/hadoop/yarn-site.xml
+ADD slaves $HADOOP_HOME/etc/hadoop/slaves
+ADD bootstrap.sh /etc/bootstrap.sh
 WORKDIR /usr/local/spark
-CMD ["/etc/bootstrap-spark.sh", "-d"]
+CMD ["/etc/bootstrap.sh"]

README.md

@@ -1,93 +1,45 @@
-# Spark on yarn
+# Deploy Spark On Yarn
-## Create a spark cluster in swarm mode
-`--hostname` needs 1.13 or higher
+## Client
 ```bash
 docker service create \
---name spark-master \
---hostname spark-master \
---detach true \
+--name spark-client \
+--hostname spark-client \
 --network swarm-net \
 --replicas 1 \
---endpoint-mode dnsrr \
-newnius/spark:2.2.1
+--detach true \
+newnius/spark:2.2.1-yarn
 ```
-```bash
-docker service create \
---name spark-slave1 \
---hostname spark-slave1 \
---detach true \
---network swarm-net \
---replicas 1 \
---endpoint-mode dnsrr \
-newnius/spark:2.2.1
-```
-```bash
-docker service create \
---name spark-slave2 \
---hostname spark-slave2 \
---detach true \
---network swarm-net \
---replicas 1 \
---endpoint-mode dnsrr \
-newnius/spark:2.2.1
-```
-## Init && Test
-In the first deploy, format hdfs
-### Stop HDFS (in master)
-```bash
-sbin/stop-dfs.sh
-```
-### Format HDFS (in master)
-```
-bin/hadoop namenode -format
-```
-### Start HDFS (in master)
-```bash
-sbin/start-dfs.sh
-```
-### Run Hello World
+## Validate installation
+#### spark-submit PI
 ```bash
 spark-submit \
 --master yarn \
 --deploy-mode cluster \
 --class org.apache.spark.examples.JavaSparkPi \
-./examples/jars/spark-examples_2.11-2.2.1.jar 100
+./examples/jars/spark-examples*.jar 100
 ```
-### UI
-YARN: spark-master:8088
-HDFS: spark-master:50070
-_Proxy needed, e.g. [newnius/docker-proxy](https://hub.docker.com/r/newnius/docker-proxy/)_
-## customized config
-```bash
-docker service create \
---name spark-master \
---hostname spark-master \
---detach=true \
---network swarm-net \
---replicas 1 \
---mount type=bind,source=/mnt/data/spark/hdfs/master,target=/tmp/hadoop-root \
---mount type=bind,source=/mnt/data/spark/logs/master,target=/usr/local/hadoop/logs \
---mount type=bind,source=/mnt/data/spark/config/hadoop,target=/mnt/config/hadoop \
---mount type=bind,source=/mnt/data/spark/config/spark,target=/mnt/config/spark \
---endpoint-mode dnsrr \
-newnius/spark:2.2.1
-```
-You dont't need to put all files in dir, only add files to be replaced.
+#### spark-shell HDFS wordcount
+Run `spark-shell --master yarn` to enter the shell.
+```scala
+val lines = sc.textFile("hdfs://hadoop-master:8020/user/root/input")
+val words = lines.flatMap(_.split("\\s+"))
+val wc = words.map(word => (word, 1)).reduceByKey(_ + _)
+wc.collect()
+val cnt = words.map(word => 1).reduce(_ + _)
+```
+## Browse the web UI
+In Spark on YARN mode, Spark jobs appear in the YARN web UI.
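The closing UI note is terse: in yarn-cluster mode the job status lives in the ResourceManager, not in a Spark standalone UI. A hedged check, assuming the hadoop-master hostname from the config files in this commit and the standard YARN REST API (busybox wget, since the image installs no curl):

```bash
# List applications the ResourceManager knows about, from any
# container attached to swarm-net (hostname is an assumption).
wget -qO- http://hadoop-master:8088/ws/v1/cluster/apps
```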

bootstrap.sh

@@ -1,6 +1,10 @@
 #! /bin/bash
-# replace config
-cp /mnt/config/spark/* $SPARK_HOME/conf
+## replace config
+: ${EXTRA_CONF_DIR:=/config/hadoop}
-bash -c "/etc/bootstrap.sh -d"
+if [ -d "$EXTRA_CONF_DIR" ]; then
+cp $EXTRA_CONF_DIR/* /usr/local/hadoop/etc/hadoop/
+fi
+while true; do sleep 1000; done
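The rewritten bootstrap no longer chains into the Hadoop image's bootstrap: it overlays whatever is mounted at EXTRA_CONF_DIR (default /config/hadoop) onto the baked-in Hadoop config, then parks the container. A sketch of overriding config at deploy time, reusing the service flags from the README above; the host path is an assumption, and only the files to be replaced need to exist in it:

```bash
docker service create \
  --name spark-client \
  --hostname spark-client \
  --network swarm-net \
  --replicas 1 \
  --detach true \
  --mount type=bind,source=/mnt/data/spark/config,target=/config/hadoop \
  newnius/spark:2.2.1-yarn
```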

core-site.xml

@@ -19,10 +19,10 @@
 <configuration>
 <property>
 <name>fs.defaultFS</name>
-<value>hdfs://spark-master:8020</value>
+<value>hdfs://hadoop-master:8020</value>
 </property>
 <property>
 <name>fs.default.name</name>
-<value>hdfs://spark-master:8020</value>
+<value>hdfs://hadoop-master:8020</value>
 </property>
 </configuration>
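fs.defaultFS is what lets bare paths like /user/root/input resolve against the hadoop-master namenode. A minimal sketch to confirm the client picks it up from HADOOP_CONF_DIR, piping one line into spark-shell (standard Spark 2.x session, no extra flags assumed):

```bash
# Print the effective default filesystem as loaded from core-site.xml.
echo 'println(spark.sparkContext.hadoopConfiguration.get("fs.defaultFS"))' | spark-shell
```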

hdfs-site.xml

@@ -23,11 +23,11 @@
 </property>
 <property>
 <name>dfs.namenode.secondary.http-address</name>
-<value>spark-slave1:50090</value>
+<value>hadoop-slave1:50090</value>
 </property>
 <property>
 <name>dfs.namenode.http-address</name>
-<value>spark-master:50070</value>
+<value>hadoop-master:50070</value>
 </property>
 <property>
 <name>dfs.datanode.max.transfer.threads</name>
@@ -35,6 +35,6 @@
 </property>
 <property>
 <name>dfs.replication</name>
-<value>2</value>
+<value>3</value>
 </property>
 </configuration>
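Note that dfs.replication is applied when a file is written, so raising it from 2 to 3 leaves existing blocks at their old factor. A sketch for checking and re-replicating from any node with the HDFS CLI; the sample path is hypothetical:

```bash
# Show a file's current replication, then bring existing data up to 3 copies.
hdfs dfs -stat "replication=%r" /user/root/input/part-00000
hdfs dfs -setrep -w 3 /user/root/input
```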

mapred-site.xml

@@ -23,10 +23,10 @@
 </property>
 <property>
 <name>mapreduce.jobhistory.address</name>
-<value>spark-master:10020</value>
+<value>hadoop-master:10020</value>
 </property>
 <property>
 <name>mapreduce.jobhistory.webapp.address</name>
-<value>spark-master:19888</value>
+<value>hadoop-master:19888</value>
 </property>
 </configuration>
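These two addresses point MapReduce clients and browsers at the history server on hadoop-master. A hedged probe of the standard JobHistory REST endpoint on the webapp port (busybox wget again):

```bash
# Confirm the JobHistory server answers on its webapp port.
wget -qO- http://hadoop-master:19888/ws/v1/history/info
```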

slaves

@@ -0,0 +1,3 @@
+hadoop-slave1
+hadoop-slave2
+hadoop-slave3

yarn-site.xml

@@ -20,7 +20,7 @@
 </property>
 <property>
 <name>yarn.resourcemanager.hostname</name>
-<value>spark-master</value>
+<value>hadoop-master</value>
 </property>
 <property>
 <name>yarn.nodemanager.aux-services</name>
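yarn.resourcemanager.hostname is the single setting every client derives the RM addresses from; with HADOOP_CONF_DIR baked into the image, `spark-submit --master yarn` reads it from this file. A small sanity check, assuming swarm DNS and the busybox nslookup shipped in the Alpine image:

```bash
# The RM hostname must resolve on the overlay network before jobs can submit.
nslookup hadoop-master
```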

slaves

@@ -1,3 +0,0 @@
-spark-slave1
-spark-slave2
-spark-slave3