update spark

This commit is contained in:
Newnius 2018-08-07 20:04:47 +08:00
parent 0582327307
commit 94228e4944
13 changed files with 98 additions and 125 deletions

View File

@ -0,0 +1,26 @@
FROM newnius/hadoop:2.8.1

# MAINTAINER is deprecated (hadolint DL4000) — use a label instead.
LABEL maintainer="Newnius <newnius.cn@gmail.com>"

# Parameterize the Spark version so a bump touches one line only
# (matches the convention used by the standalone-mode Dockerfile).
ENV SPARK_VER 2.2.1

# Fetch from the stable HTTPS Apache archive (mirrors rotate old releases
# away). Download, extract, delete the tarball and create the version-neutral
# symlink in ONE layer so the archive never persists in the image.
RUN wget https://archive.apache.org/dist/spark/spark-$SPARK_VER/spark-$SPARK_VER-bin-hadoop2.7.tgz && \
    tar -xf spark-$SPARK_VER-bin-hadoop2.7.tgz -C /usr/local && \
    rm spark-$SPARK_VER-bin-hadoop2.7.tgz && \
    ln -s /usr/local/spark-$SPARK_VER-bin-hadoop2.7 /usr/local/spark

ENV SPARK_HOME /usr/local/spark
ENV PATH $PATH:$SPARK_HOME/bin

# Default conf files; COPY is preferred over ADD for plain local files
# (hadolint DL3020). $HADOOP_HOME is inherited from the base image.
COPY core-site.xml $HADOOP_HOME/etc/hadoop/core-site.xml
COPY hdfs-site.xml $HADOOP_HOME/etc/hadoop/hdfs-site.xml
COPY mapred-site.xml $HADOOP_HOME/etc/hadoop/mapred-site.xml
COPY yarn-site.xml $HADOOP_HOME/etc/hadoop/yarn-site.xml
COPY slaves $HADOOP_HOME/etc/hadoop/slaves
COPY bootstrap.sh /etc/bootstrap-spark.sh

WORKDIR /usr/local/spark

# Exec-form CMD so the bootstrap is PID 1 and receives docker-stop signals.
CMD ["/etc/bootstrap-spark.sh", "-d"]

View File

@ -2,6 +2,8 @@
## Create a spark cluster in swarm mode ## Create a spark cluster in swarm mode
`--hostname` needs 1.13 or higher
```bash ```bash
docker service create \ docker service create \
--name spark-master \ --name spark-master \

6
spark/2.2.1-yarn/bootstrap.sh Executable file
View File

@ -0,0 +1,6 @@
#! /bin/bash
# Bootstrap for the Spark-on-YARN image: overlay user-supplied Spark config
# (if a config volume is mounted) onto the defaults, then hand off to the
# Hadoop bootstrap of the base image.

# Only copy when the mount point exists; the bare `cp` printed an error on
# every start without a config volume. Best-effort behavior is preserved.
if [ -d /mnt/config/spark ]; then
	cp /mnt/config/spark/* "$SPARK_HOME/conf"
fi

# exec replaces this shell so the Hadoop bootstrap becomes PID 1 and receives
# signals (docker stop) directly; `bash -c "…"` left a useless shell between
# Docker and the real process.
exec /etc/bootstrap.sh -d

View File

@ -1,26 +1,30 @@
FROM newnius/hadoop:2.8.1 FROM alpine:3.8
MAINTAINER Newnius <newnius.cn@gmail.com> MAINTAINER Newnius <newnius.cn@gmail.com>
RUN wget http://mirrors.ocf.berkeley.edu/apache/spark/spark-2.2.1/spark-2.2.1-bin-hadoop2.7.tgz && \ USER root
tar -xvf spark-2.2.1-bin-hadoop2.7.tgz -C /usr/local && \
rm spark-2.2.1-bin-hadoop2.7.tgz
RUN ln -s /usr/local/spark-2.2.1-bin-hadoop2.7 /usr/local/spark # Prerequisites
RUN apk add --no-cache openssh openssl openjdk8-jre rsync bash procps coreutils
ENV JAVA_HOME /usr/lib/jvm/java-1.8-openjdk
ENV PATH $PATH:$JAVA_HOME/bin
ENV SPARK_VER 2.2.1
RUN wget https://archive.apache.org/dist/spark/spark-$SPARK_VER/spark-$SPARK_VER-bin-hadoop2.7.tgz && \
tar -xvf spark-$SPARK_VER-bin-hadoop2.7.tgz -C /usr/local && \
rm spark-$SPARK_VER-bin-hadoop2.7.tgz
RUN ln -s /usr/local/spark-$SPARK_VER-bin-hadoop2.7 /usr/local/spark
ENV SPARK_HOME /usr/local/spark ENV SPARK_HOME /usr/local/spark
ENV PATH $PATH:$SPARK_HOME/bin ENV PATH $PATH:$SPARK_HOME/bin
# Default Conf Files ADD bootstrap.sh /etc/bootstrap.sh
ADD core-site.xml $HADOOP_HOME/etc/hadoop/core-site.xml
ADD hdfs-site.xml $HADOOP_HOME/etc/hadoop/hdfs-site.xml
ADD mapred-site.xml $HADOOP_HOME/etc/hadoop/mapred-site.xml
ADD yarn-site.xml $HADOOP_HOME/etc/hadoop/yarn-site.xml
ADD slaves $HADOOP_HOME/etc/hadoop/slaves
ADD bootstrap.sh /etc/bootstrap-spark.sh
WORKDIR /usr/local/spark WORKDIR /usr/local/spark
CMD ["/etc/bootstrap-spark.sh", "-d"] ENTRYPOINT ["/etc/bootstrap.sh"]

View File

@ -1,93 +1,64 @@
# Spark on yarn # Deploy Spark Cluster of standalone mode
## Create a spark cluster in swarm mode ## Master
`--hostname` needs 1.13 or higher
```bash ```bash
docker service create \ docker service create \
--name spark-master \ --name spark-master \
--hostname spark-master \ --hostname spark-master \
--detach true \ --network swarm-net \
--network swarm-net \ --replicas 1 \
--replicas 1 \ --detach true \
--endpoint-mode dnsrr \ --endpoint-mode dnsrr \
newnius/spark:2.2.1 newnius/spark:2.2.1 master
``` ```
## Slaves
```bash ```bash
docker service create \ docker service create \
--name spark-slave1 \ --name spark-slave \
--hostname spark-slave1 \ --network swarm-net \
--detach true \ --replicas 5 \
--network swarm-net \ --detach true \
--replicas 1 \ --endpoint-mode dnsrr \
--endpoint-mode dnsrr \ newnius/spark:2.2.1 slave spark://spark-master:7077
newnius/spark:2.2.1
``` ```
```bash ## Validate installation
docker service create \
--name spark-slave2 \
--hostname spark-slave2 \
--detach true \
--network swarm-net \
--replicas 1 \
--endpoint-mode dnsrr \
newnius/spark:2.2.1
```
## Init && Test #### spark-submit PI
In the first deploy, format hdfs
### Stop HDFS (in master)
```bash
sbin/stop-dfs.sh
```
### Format HDFS (in master)
```
bin/hadoop namenode -format
```
### Start HDFS (in master)
```bash
sbin/start-dfs.sh
```
### Run Hello World
```bash ```bash
spark-submit \ spark-submit \
--master yarn \ --master spark://spark-master:7077 \
--deploy-mode cluster \ --deploy-mode cluster \
--class org.apache.spark.examples.JavaSparkPi \ --class org.apache.spark.examples.JavaSparkPi \
./examples/jars/spark-examples_2.11-2.2.1.jar 100 ./examples/jars/spark-examples_2.11-2.2.1.jar 100
``` ```
### UI #### spark-shell HDFS wordcount
YARN: spark-master:8088 Enter `spark-shell --master spark://spark-master:7077` to enter shell.
HDFS: spark-master:50070 ```shell
val lines = sc.textFile("hdfs://hadoop-master:8020/user/root/input")
_Proxy needed, e.g. [newnius/docker-proxy](https://hub.docker.com/r/newnius/docker-proxy/)_ val words = lines.flatMap(_.split("\\s+"))
## customized config val wc = words.map(word => (word, 1)).reduceByKey(_ + _)
```bash wc.collect()
docker service create \
--name spark-master \ val cnt = words.map(word => 1).reduce(_ + _)
--hostname spark-master \
--detach=true \
--network swarm-net \
--replicas 1 \
--mount type=bind,source=/mnt/data/spark/hdfs/master,target=/tmp/hadoop-root \
--mount type=bind,source=/mnt/data/spark/logs/master,target=/usr/local/hadoop/logs \
--mount type=bind,source=/mnt/data/spark/config/hadoop,target=/mnt/config/hadoop \
--mount type=bind,source=/mnt/data/spark/config/spark,target=/mnt/config/spark \
--endpoint-mode dnsrr \
newnius/spark:2.2.1
``` ```
You don't need to put every file in the directory — only add the files you want to replace.
You can expose the ports in the script, but I'd rather not, since the slaves would occupy the same ports.
To access the web UI, deploy another (socks5) proxy to route the traffic.
If you don't have one, try [newnius/docker-proxy](https://hub.docker.com/r/newnius/docker-proxy/); it is rather easy to use.
Visit [spark-master:8080](http://spark-master:8080) to view the cluster.

View File

@ -1,6 +1,11 @@
#! /bin/bash #! /bin/bash
# replace config
cp /mnt/config/spark/* $SPARK_HOME/conf if [[ $1 == "master" ]]; then
/usr/local/spark/sbin/start-master.sh
fi
bash -c "/etc/bootstrap.sh -d" if [[ $1 == "slave" ]]; then
/usr/local/spark/sbin/start-slave.sh $2
fi
while true; do sleep 1000; done

View File

@ -1,30 +0,0 @@
FROM alpine:3.8

# MAINTAINER is deprecated (hadolint DL4000) — use a label instead.
LABEL maintainer="Newnius <newnius.cn@gmail.com>"

USER root

# Prerequisites: a JRE to run Spark plus bash/procps/coreutils for its launch
# scripts; --no-cache keeps the apk index out of the image layer.
RUN apk add --no-cache openssh openssl openjdk8-jre rsync bash procps coreutils

ENV JAVA_HOME /usr/lib/jvm/java-1.8-openjdk
ENV PATH $PATH:$JAVA_HOME/bin

ENV SPARK_VER 2.2.1

# Download, extract, delete the tarball and create the version-neutral symlink
# in ONE layer so the archive never persists in the image.
RUN wget https://archive.apache.org/dist/spark/spark-$SPARK_VER/spark-$SPARK_VER-bin-hadoop2.7.tgz && \
    tar -xf spark-$SPARK_VER-bin-hadoop2.7.tgz -C /usr/local && \
    rm spark-$SPARK_VER-bin-hadoop2.7.tgz && \
    ln -s /usr/local/spark-$SPARK_VER-bin-hadoop2.7 /usr/local/spark

ENV SPARK_HOME /usr/local/spark
ENV PATH $PATH:$SPARK_HOME/bin

# COPY is preferred over ADD for plain local files (hadolint DL3020).
COPY bootstrap.sh /etc/bootstrap.sh

WORKDIR /usr/local/spark

# Exec-form entrypoint; the role ("master" / "slave <master-url>") is passed
# as container command arguments.
ENTRYPOINT ["/etc/bootstrap.sh"]

View File

@ -1,11 +0,0 @@
#! /bin/bash
# Entrypoint for the standalone-mode Spark image.
# Usage:
#   bootstrap.sh master              — start a standalone master
#   bootstrap.sh slave <master-url>  — start a worker, e.g. spark://spark-master:7077

if [[ $1 == "master" ]]; then
	# BUG FIX: Spark ships sbin/start-master.sh (hyphens); the original called
	# the non-existent start_master.sh and silently started nothing.
	/usr/local/spark/sbin/start-master.sh
fi

if [[ $1 == "slave" ]]; then
	# Same fix (start-slave.sh, not start_slave.sh); quote the master URL so
	# it survives word splitting.
	/usr/local/spark/sbin/start-slave.sh "$2"
fi

# The start-* scripts daemonize, so keep PID 1 alive for the container.
while true; do sleep 1000; done