Run `spark-shell --master yarn` to start an interactive shell.
In Spark on YARN mode, Spark jobs appear in the YARN web UI (served by the ResourceManager) rather than in a standalone Spark UI.
index 0000000..3bb91be --- /dev/null +++ b/spark/2.3.1-yarn/config/slaves @@ -0,0 +1,3 @@ +hadoop-slave1 +hadoop-slave2 +hadoop-slave3 diff --git a/spark/2.3.1-yarn/config/yarn-site.xml b/spark/2.3.1-yarn/config/yarn-site.xml new file mode 100644 index 0000000..c3fec7b --- /dev/null +++ b/spark/2.3.1-yarn/config/yarn-site.xml @@ -0,0 +1,49 @@ + + + + + + yarn.application.classpath + /usr/local/hadoop/etc/hadoop, /usr/local/hadoop/share/hadoop/common/*, /usr/local/hadoop/share/hadoop/common/lib/*, /usr/local/hadoop/share/hadoop/hdfs/*, /usr/local/hadoop/share/hadoop/hdfs/lib/*, /usr/local/hadoop/share/hadoop/mapreduce/*, /usr/local/hadoop/share/hadoop/mapreduce/lib/*, /usr/local/hadoop/share/hadoop/yarn/*, /usr/local/hadoop/share/hadoop/yarn/lib/* + + + yarn.resourcemanager.hostname + hadoop-master + + + yarn.nodemanager.aux-services + mapreduce_shuffle + + + yarn.log-aggregation-enable + true + + + yarn.log-aggregation.retain-seconds + 604800 + + + yarn.nodemanager.resource.memory-mb + 2048 + + + yarn.nodemanager.resource.cpu-vcores + 2 + + + yarn.scheduler.minimum-allocation-mb + 1024 + +