diff --git a/spark/2.2.1/Dockerfile b/spark/2.2.1/Dockerfile new file mode 100644 index 0000000..c7db7bf --- /dev/null +++ b/spark/2.2.1/Dockerfile @@ -0,0 +1,26 @@ +FROM newnius/hadoop:2.8.1 + +MAINTAINER Newnius + +RUN wget -O spark-2.2.1.tgz http://mirrors.ocf.berkeley.edu/apache/spark/spark-2.2.1/spark-2.2.1-bin-hadoop2.7.tgz && \ + tar -xvf spark-2.2.1-bin-hadoop2.7.tgz -C /usr/local && \ + rm spark-2.2.1-bin-hadoop2.7.tgz + +RUN ln -s /usr/local/spark-2.2.1 /usr/local/spark + +ENV SPARK_HOME /usr/local/spark + +ENV PATH $PATH:$SPARK_HOME/bin + +# Default Conf Files +ADD core-site.xml $HADOOP_HOME/etc/hadoop/core-site.xml +ADD hdfs-site.xml $HADOOP_HOME/etc/hadoop/hdfs-site.xml +ADD mapred-site.xml $HADOOP_HOME/etc/hadoop/mapred-site.xml +ADD yarn-site.xml $HADOOP_HOME/etc/hadoop/yarn-site.xml +ADD slaves $HADOOP_HOME/etc/hadoop/slaves + +ADD bootstrap.sh /etc/bootstrap-spark.sh + +WORKDIR /usr/local/spark + +CMD ["/etc/bootstrap-spark.sh", "-d"] diff --git a/spark/2.2.1/README.md b/spark/2.2.1/README.md new file mode 100644 index 0000000..85505c4 --- /dev/null +++ b/spark/2.2.1/README.md @@ -0,0 +1,92 @@ +# Spark on yarn + +## Create a spark cluster in swarm mode + +`--hostname` needs 1.13 or higher + +```bash +docker service create \ +--name spark-master \ +--hostname spark-master \ +--detach true \ +--network swarm-net \ +--replicas 1 \ +--endpoint-mode dnsrr \ +newnius/spark:2.2.1 +``` + +```bash +docker service create \ +--name spark-slave1 \ +--hostname spark-slave1 \ +--detach true \ +--network swarm-net \ +--replicas 1 \ +--endpoint-mode dnsrr \ +newnius/spark:2.2.1 +``` + +```bash +docker service create \ +--name spark-slave2 \ +--hostname spark-slave2 \ +--detach true \ +--network swarm-net \ +--replicas 1 \ +--endpoint-mode dnsrr \ +newnius/spark:2.2.1 +``` + +## Init && Test + +In the first deploy, format hdfs + +### Stop HDFS (in master) +```bash +sbin/stop-dfs.sh +``` + +### Format HDFS (in master) +``` +bin/hadoop namenode -format +``` + +### 
Start HDFS (in master) ```bash sbin/start-dfs.sh ``` ### Run Hello World ```bash spark-submit \ --master yarn-cluster \ --class org.apache.spark.examples.JavaSparkPi \ ./examples/jars/spark-examples_2.11-2.2.1.jar 100 ``` ### UI YARN: spark-master:8088 HDFS: spark-master:50070 _Proxy needed, e.g. [newnius/docker-proxy](https://hub.docker.com/r/newnius/docker-proxy/)_ ## Customized config ```bash docker service create \ --name spark-master \ --hostname spark-master \ --detach true \ --network swarm-net \ --replicas 1 \ --mount type=bind,source=/mnt/data/spark/hdfs/master,target=/tmp/hadoop-root \ --mount type=bind,source=/mnt/data/spark/logs/master,target=/usr/local/hadoop/logs \ --mount type=bind,source=/mnt/data/spark/config/hadoop,target=/mnt/config/hadoop \ --mount type=bind,source=/mnt/data/spark/config/spark,target=/mnt/config/spark \ --endpoint-mode dnsrr \ newnius/spark:2.2.1 ``` You don't need to put all files in the dir; only add the files to be replaced. diff --git a/spark/2.2.1/bootstrap.sh b/spark/2.2.1/bootstrap.sh new file mode 100644 index 0000000..406cb3e --- /dev/null +++ b/spark/2.2.1/bootstrap.sh @@ -0,0 +1,6 @@ +#!
/bin/bash +# replace config + +cp /mnt/config/spark/* $SPARK_HOME/conf + +bash -c "/etc/bootstrap.sh -d" diff --git a/spark/2.2.1/core-site.xml b/spark/2.2.1/core-site.xml new file mode 100644 index 0000000..78ba8ec --- /dev/null +++ b/spark/2.2.1/core-site.xml @@ -0,0 +1,28 @@ + + + + + + + + + fs.defaultFS + hdfs://spark-master:8020 + + + fs.default.name + hdfs://spark-master:8020 + + diff --git a/spark/2.2.1/hdfs-site.xml b/spark/2.2.1/hdfs-site.xml new file mode 100644 index 0000000..0d0f6fa --- /dev/null +++ b/spark/2.2.1/hdfs-site.xml @@ -0,0 +1,40 @@ + + + + + + + + + dfs.permissions + false + + + dfs.namenode.secondary.http-address + spark-slave1:50090 + + + dfs.namenode.http-address + spark-master:50070 + + + dfs.datanode.max.transfer.threads + 8192 + + + dfs.replication + 2 + + diff --git a/spark/2.2.1/mapred-site.xml b/spark/2.2.1/mapred-site.xml new file mode 100644 index 0000000..3be4742 --- /dev/null +++ b/spark/2.2.1/mapred-site.xml @@ -0,0 +1,32 @@ + + + + + + + + + mapreduce.framework.name + yarn + + + mapreduce.jobhistory.address + spark-master:10020 + + + mapreduce.jobhistory.webapp.address + spark-master:19888 + + diff --git a/spark/2.2.1/slaves b/spark/2.2.1/slaves new file mode 100644 index 0000000..5c51d10 --- /dev/null +++ b/spark/2.2.1/slaves @@ -0,0 +1,3 @@ +spark-slave1 +spark-slave2 +spark-slave3 diff --git a/spark/2.2.1/yarn-site.xml b/spark/2.2.1/yarn-site.xml new file mode 100644 index 0000000..a7475c7 --- /dev/null +++ b/spark/2.2.1/yarn-site.xml @@ -0,0 +1,49 @@ + + + + + + yarn.application.classpath + /usr/local/hadoop/etc/hadoop, /usr/local/hadoop/share/hadoop/common/*, /usr/local/hadoop/share/hadoop/common/lib/*, /usr/local/hadoop/share/hadoop/hdfs/*, /usr/local/hadoop/share/hadoop/hdfs/lib/*, /usr/local/hadoop/share/hadoop/mapreduce/*, /usr/local/hadoop/share/hadoop/mapreduce/lib/*, /usr/local/hadoop/share/hadoop/yarn/*, /usr/local/hadoop/share/hadoop/yarn/lib/* + + + yarn.resourcemanager.hostname + spark-master + + + 
yarn.nodemanager.aux-services + mapreduce_shuffle + + + yarn.log-aggregation-enable + true + + + yarn.log-aggregation.retain-seconds + 604800 + + + yarn.nodemanager.resource.memory-mb + 2048 + + + yarn.nodemanager.resource.cpu-vcores + 2 + + + yarn.scheduler.minimum-allocation-mb + 1024 + +