From 10962791c836c0ea8a821688fc1d2a98c5934cb7 Mon Sep 17 00:00:00 2001
From: newnius
Date: Sat, 8 Apr 2017 22:22:34 +0800
Subject: [PATCH] add spark

---
 hadoop/2.7.1/README                                |  2 +
 php/5.6/README.md                                  | 10 +--
 spark/1.6.0/Dockerfile                             | 13 +++
 spark/1.6.0/README                                 | 82 +++++++++++++++++++
 spark/1.6.0/bootstrap.sh                           | 35 ++++++++
 spark/1.6.0/config/hadoop/core-site.xml            | 29 +++++++
 spark/1.6.0/config/hadoop/hdfs-site.xml            | 46 +++++++++++
 spark/1.6.0/config/hadoop/mapred-site.xml          | 26 ++++++
 spark/1.6.0/config/hadoop/slaves                   |  2 +
 spark/1.6.0/config/hadoop/yarn-site.xml            | 57 +++++++++++++
 .../config/spark-yarn-remote-client/core-site.xml  | 12 +++
 .../config/spark-yarn-remote-client/yarn-site.xml  | 26 ++++++
 spark/1.6.0/config/spark/slaves                    |  3 +
 spark/1.6.0/config/spark/spark-defaults.conf       | 10 +++
 spark/1.6.0/config/spark/spark-env.sh              | 52 ++++++++++++
 15 files changed, 400 insertions(+), 5 deletions(-)
 create mode 100644 spark/1.6.0/Dockerfile
 create mode 100644 spark/1.6.0/README
 create mode 100644 spark/1.6.0/bootstrap.sh
 create mode 100755 spark/1.6.0/config/hadoop/core-site.xml
 create mode 100755 spark/1.6.0/config/hadoop/hdfs-site.xml
 create mode 100755 spark/1.6.0/config/hadoop/mapred-site.xml
 create mode 100755 spark/1.6.0/config/hadoop/slaves
 create mode 100755 spark/1.6.0/config/hadoop/yarn-site.xml
 create mode 100755 spark/1.6.0/config/spark-yarn-remote-client/core-site.xml
 create mode 100755 spark/1.6.0/config/spark-yarn-remote-client/yarn-site.xml
 create mode 100755 spark/1.6.0/config/spark/slaves
 create mode 100644 spark/1.6.0/config/spark/spark-defaults.conf
 create mode 100755 spark/1.6.0/config/spark/spark-env.sh

diff --git a/hadoop/2.7.1/README b/hadoop/2.7.1/README
index 3003f14..e1ae28c 100644
--- a/hadoop/2.7.1/README
+++ b/hadoop/2.7.1/README
@@ -83,3 +83,5 @@ bin/hadoop dfs -put etc/hadoop/* /user/root/input
 YARN: hadoop-master:8088
 
 HDFS: hadoop-master:50070
+
+_Proxy needed, e.g. newnius/docker-proxy_
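The `_Proxy needed_` note is terse: the swarm overlay network is not routable from outside the cluster, so the web UIs listed above are only reachable through a proxy attached to that same network. A minimal sketch of what that could look like (the network name and the published/listening port 7001 are assumptions, not something this patch defines):

```bash
# Hypothetical example: run a proxy service on the same overlay network so a
# browser outside the swarm can reach hadoop-master:8088 and hadoop-master:50070.
# Port 7001 is a placeholder; use whatever port the proxy image actually exposes.
docker service create \
  --name proxy \
  --network swarm-net \
  --publish 7001:7001 \
  newnius/docker-proxy
```

Point the browser's proxy settings at the published port and open the addresses above by hostname.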
diff --git a/php/5.6/README.md b/php/5.6/README.md
index 031bd56..ccd2f0c 100644
--- a/php/5.6/README.md
+++ b/php/5.6/README.md
@@ -2,9 +2,9 @@
 ## CHANGELOG
 - remove `Options Indexes` (apache2.conf)
- - remove `Options FollowSymLinks` (apache2.conf)
- - add `Options SymLinksIfOwnerMatch` to support .htaccess (apache2.conf)
- - enable `rewrite`
- - update to `ServerTokens Prod` (security.conf)
+ - remove `Options FollowSymLinks` (apache2.conf)
+ - add `Options SymLinksIfOwnerMatch` to support .htaccess (apache2.conf)
+ - enable `rewrite`
+ - update to `ServerTokens Prod` (security.conf)
 - update to `ServerSignature Off` (security.conf)
- - add `ServerName localhost` (apache2.conf)
+ - add `ServerName localhost` (apache2.conf)
diff --git a/spark/1.6.0/Dockerfile b/spark/1.6.0/Dockerfile
new file mode 100644
index 0000000..e2bb391
--- /dev/null
+++ b/spark/1.6.0/Dockerfile
@@ -0,0 +1,13 @@
+FROM sequenceiq/spark:1.6.0
+
+MAINTAINER Newnius
+
+ADD bootstrap.sh /etc/bootstrap.sh
+
+COPY config/hadoop /mnt/config/hadoop
+COPY config/spark /mnt/config/spark
+COPY config/spark-yarn-remote-client /mnt/config/spark-yarn-remote-client
+
+WORKDIR /usr/local/hadoop
+
+ENTRYPOINT ["/etc/bootstrap.sh"]
diff --git a/spark/1.6.0/README b/spark/1.6.0/README
new file mode 100644
index 0000000..d337a8e
--- /dev/null
+++ b/spark/1.6.0/README
@@ -0,0 +1,82 @@
+# Based on sequenceiq/spark
+
+## Create a spark cluster in swarm mode
+
+`--hostname` requires Docker 1.13 or higher.
+
+```bash
+docker service create \
+--name spark-master \
+--network swarm-net \
+--replicas 1 \
+--endpoint-mode dnsrr \
+newnius/spark
+```
+
+```bash
+docker service create \
+--name spark-slave1 \
+--network swarm-net \
+--replicas 1 \
+--endpoint-mode dnsrr \
+newnius/spark
+```
+
+```bash
+docker service create \
+--name spark-slave2 \
+--network swarm-net \
+--replicas 1 \
+--endpoint-mode dnsrr \
+newnius/spark
+```
+
+## Init && Test
+
+On the first deployment, format HDFS first.
+
+### stop cluster (in master)
+`sbin/stop-yarn.sh`
+`sbin/stop-dfs.sh`
+`../spark/sbin/stop-all.sh`
+
+### remove previous data (in all nodes)
+Clear all data under /tmp on every node.
+
+### format hdfs (in master)
+```
+bin/hadoop namenode -format
+```
+
+### start cluster (in master)
+`sbin/start-dfs.sh`
+`sbin/start-yarn.sh`
+`../spark/sbin/start-all.sh`
+
+### monitor cluster in browser
+
+YARN: spark-master:8088
+
+HDFS: spark-master:50070
+
+SPARK: spark-master:8080
+
+_Proxy needed, e.g. newnius/docker-proxy_
+
+## customized config
+
+```bash
+docker service create \
+--name spark-master \
+--network swarm-net \
+--replicas 1 \
+--mount type=bind,source=/mnt/data/spark/hdfs/master,target=/tmp/hadoop-root \
+--mount type=bind,source=/mnt/data/spark/logs/master,target=/usr/local/hadoop/logs \
+--mount type=bind,source=/mnt/data/spark/config/hadoop,target=/mnt/config/hadoop \
+--mount type=bind,source=/mnt/data/spark/config/spark,target=/mnt/config/spark \
+--mount type=bind,source=/mnt/data/spark/config/spark-yarn-remote-client,target=/mnt/config/spark-yarn-remote-client \
+--endpoint-mode dnsrr \
+newnius/spark
+```
+
+You don't need to put all the files in the directory; only add the files that need to be modified.
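The README covers starting the daemons but not verifying them. A quick smoke test is to submit the bundled SparkPi example to YARN from inside the spark-master container; a sketch assuming the sequenceiq/spark:1.6.0 layout (the examples jar name is an assumption and may differ):

```bash
# Run inside the spark-master container. Paths assume /usr/local/spark and the
# examples jar shipped with the sequenceiq base image.
cd /usr/local/spark
bin/spark-submit \
  --class org.apache.spark.examples.SparkPi \
  --master yarn \
  --deploy-mode client \
  --executor-memory 512m \
  --num-executors 2 \
  lib/spark-examples-1.6.0-hadoop2.6.0.jar 100
```

If the job completes and prints an estimate of Pi, then HDFS, YARN, and Spark are wired together correctly.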
diff --git a/spark/1.6.0/bootstrap.sh b/spark/1.6.0/bootstrap.sh
new file mode 100644
index 0000000..a9d7458
--- /dev/null
+++ b/spark/1.6.0/bootstrap.sh
@@ -0,0 +1,35 @@
+#!/bin/bash
+
+: ${HADOOP_PREFIX:=/usr/local/hadoop}
+
+$HADOOP_PREFIX/etc/hadoop/hadoop-env.sh
+
+rm /tmp/*.pid
+
+# installing libraries if any - (resource urls added comma separated to the ACP system variable)
+cd $HADOOP_PREFIX/share/hadoop/common ; for cp in ${ACP//,/ }; do  echo == $cp; curl -LO $cp ; done; cd -
+
+# overwrite the default configuration with any files provided under /mnt/config
+cp -a /mnt/config/hadoop/. /usr/local/hadoop/etc/hadoop/
+cp -a /mnt/config/spark/. /usr/local/spark/conf/
+cp -a /mnt/config/spark-yarn-remote-client/. /usr/local/spark/yarn-remote-client/
+
+service sshd start
+
+## stop all in case master starts far behind
+$HADOOP_PREFIX/sbin/stop-yarn.sh
+$HADOOP_PREFIX/sbin/stop-dfs.sh
+
+$HADOOP_PREFIX/sbin/start-dfs.sh
+$HADOOP_PREFIX/sbin/start-yarn.sh
+$HADOOP_PREFIX/sbin/mr-jobhistory-daemon.sh start historyserver
+
+$SPARK_HOME/sbin/start-all.sh
+
+# keep the container running unless an interactive shell was requested
+if [[ $1 == "-bash" ]]; then
+  /bin/bash
+else
+  while true; do sleep 1000; done
+fi
diff --git a/spark/1.6.0/config/hadoop/core-site.xml b/spark/1.6.0/config/hadoop/core-site.xml
new file mode 100755
index 0000000..c9155d6
--- /dev/null
+++ b/spark/1.6.0/config/hadoop/core-site.xml
@@ -0,0 +1,29 @@
+<?xml version="1.0"?>
+<configuration>
+  <property>
+    <name>fs.defaultFS</name>
+    <value>hdfs://spark-master:8020</value>
+  </property>
+  <property>
+    <name>fs.default.name</name>
+    <value>hdfs://spark-master:8020</value>
+  </property>
+</configuration>
diff --git a/spark/1.6.0/config/hadoop/hdfs-site.xml b/spark/1.6.0/config/hadoop/hdfs-site.xml
new file mode 100755
index 0000000..541a5b8
--- /dev/null
+++ b/spark/1.6.0/config/hadoop/hdfs-site.xml
@@ -0,0 +1,46 @@
+<?xml version="1.0"?>
+<configuration>
+  <property>
+    <name>dfs.permissions</name>
+    <value>false</value>
+  </property>
+  <property>
+    <name>dfs.namenode.secondary.http-address</name>
+    <value>spark-slave1:50090</value>
+  </property>
+  <property>
+    <name>dfs.namenode.http-address</name>
+    <value>spark-master:50070</value>
+  </property>
+  <property>
+    <name>dfs.datanode.max.transfer.threads</name>
+    <value>8192</value>
+  </property>
+  <property>
+    <name>dfs.replication</name>
+    <value>3</value>
+  </property>
+</configuration>
diff --git a/spark/1.6.0/config/hadoop/mapred-site.xml b/spark/1.6.0/config/hadoop/mapred-site.xml
new file mode 100755
index 0000000..ce18519
--- /dev/null
+++ b/spark/1.6.0/config/hadoop/mapred-site.xml
@@ -0,0 +1,26 @@
+<?xml version="1.0"?>
+<configuration>
+  <property>
+    <name>mapreduce.framework.name</name>
+    <value>yarn</value>
+  </property>
+</configuration>
diff --git a/spark/1.6.0/config/hadoop/slaves b/spark/1.6.0/config/hadoop/slaves
new file mode 100755
index 0000000..2e11201
--- /dev/null
+++ b/spark/1.6.0/config/hadoop/slaves
@@ -0,0 +1,2 @@
+spark-slave1
+spark-slave2
diff --git a/spark/1.6.0/config/hadoop/yarn-site.xml b/spark/1.6.0/config/hadoop/yarn-site.xml
new file mode 100755
index 0000000..1ef9ddf
--- /dev/null
+++ b/spark/1.6.0/config/hadoop/yarn-site.xml
@@ -0,0 +1,57 @@
+<?xml version="1.0"?>
+<configuration>
+  <property>
+    <name>yarn.application.classpath</name>
+    <value>/usr/local/hadoop/etc/hadoop, /usr/local/hadoop/share/hadoop/common/*, /usr/local/hadoop/share/hadoop/common/lib/*, /usr/local/hadoop/share/hadoop/hdfs/*, /usr/local/hadoop/share/hadoop/hdfs/lib/*, /usr/local/hadoop/share/hadoop/mapreduce/*, /usr/local/hadoop/share/hadoop/mapreduce/lib/*, /usr/local/hadoop/share/hadoop/yarn/*, /usr/local/hadoop/share/hadoop/yarn/lib/*</value>
+  </property>
+  <property>
+    <name>yarn.resourcemanager.hostname</name>
+    <value>spark-master</value>
+  </property>
+  <property>
+    <name>yarn.nodemanager.aux-services</name>
+    <value>mapreduce_shuffle</value>
+  </property>
+  <property>
+    <name>yarn.log-aggregation-enable</name>
+    <value>true</value>
+  </property>
+  <property>
+    <name>yarn.log-aggregation.retain-seconds</name>
+    <value>604800</value>
+  </property>
+  <property>
+    <name>yarn.nodemanager.resource.memory-mb</name>
+    <value>2048</value>
+  </property>
+  <property>
+    <name>yarn.nodemanager.resource.cpu-vcores</name>
+    <value>2</value>
+  </property>
+  <property>
+    <name>yarn.scheduler.minimum-allocation-mb</name>
+    <value>1024</value>
+  </property>
+</configuration>
diff --git a/spark/1.6.0/config/spark-yarn-remote-client/core-site.xml b/spark/1.6.0/config/spark-yarn-remote-client/core-site.xml
new file mode 100755
index 0000000..a6bb4ce
--- /dev/null
+++ b/spark/1.6.0/config/spark-yarn-remote-client/core-site.xml
@@ -0,0 +1,12 @@
+<?xml version="1.0"?>
+<configuration>
+  <property>
+    <name>fs.default.name</name>
+    <value>hdfs://spark-master:8020</value>
+  </property>
+  <property>
+    <name>dfs.client.use.legacy.blockreader</name>
+    <value>true</value>
+  </property>
+</configuration>
diff --git a/spark/1.6.0/config/spark-yarn-remote-client/yarn-site.xml b/spark/1.6.0/config/spark-yarn-remote-client/yarn-site.xml
new file mode 100755
index 0000000..1f5c9c5
--- /dev/null
+++ b/spark/1.6.0/config/spark-yarn-remote-client/yarn-site.xml
@@ -0,0 +1,26 @@
+<configuration>
+  <property>
+    <name>yarn.resourcemanager.scheduler.address</name>
+    <value>spark-master:8030</value>
+  </property>
+  <property>
+    <name>yarn.resourcemanager.address</name>
+    <value>spark-master:8032</value>
+  </property>
+  <property>
+    <name>yarn.resourcemanager.webapp.address</name>
+    <value>spark-master:8088</value>
+  </property>
+  <property>
+    <name>yarn.resourcemanager.resource-tracker.address</name>
+    <value>spark-master:8031</value>
+  </property>
+  <property>
+    <name>yarn.resourcemanager.admin.address</name>
+    <value>spark-master:8033</value>
+  </property>
+  <property>
+    <name>yarn.application.classpath</name>
+    <value>/usr/local/hadoop/etc/hadoop, /usr/local/hadoop/share/hadoop/common/*, /usr/local/hadoop/share/hadoop/common/lib/*, /usr/local/hadoop/share/hadoop/hdfs/*, /usr/local/hadoop/share/hadoop/hdfs/lib/*, /usr/local/hadoop/share/hadoop/mapreduce/*, /usr/local/hadoop/share/hadoop/mapreduce/lib/*, /usr/local/hadoop/share/hadoop/yarn/*, /usr/local/hadoop/share/hadoop/yarn/lib/*, /usr/local/hadoop/share/spark/*</value>
+  </property>
+</configuration>
diff --git a/spark/1.6.0/config/spark/slaves b/spark/1.6.0/config/spark/slaves
new file mode 100755
index 0000000..e1020ba
--- /dev/null
+++ b/spark/1.6.0/config/spark/slaves
@@ -0,0 +1,3 @@
+# A Spark Worker will be started on each of the machines listed below.
+spark-slave1
+spark-slave2
diff --git a/spark/1.6.0/config/spark/spark-defaults.conf b/spark/1.6.0/config/spark/spark-defaults.conf
new file mode 100644
index 0000000..0d49e47
--- /dev/null
+++ b/spark/1.6.0/config/spark/spark-defaults.conf
@@ -0,0 +1,10 @@
+# Default system properties included when running spark-submit.
+# This is useful for setting default environmental settings.
+
+# Example:
+ spark.master                     spark://spark-master:7077
+ spark.eventLog.enabled           true
+ spark.eventLog.dir               hdfs://spark-master:/var/log/spark
+ spark.serializer                 org.apache.spark.serializer.KryoSerializer
+# spark.driver.memory              5g
+# spark.executor.extraJavaOptions  -XX:+PrintGCDetails -Dkey=value -Dnumbers="one two three"
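One caveat with the event-log settings above: spark-submit aborts at startup if the directory named by `spark.eventLog.dir` does not already exist on HDFS, so it has to be created once after formatting. A minimal sketch, run on the master with the path taken from the config above:

```bash
# Create the Spark event-log directory referenced in spark-defaults.conf.
/usr/local/hadoop/bin/hadoop fs -mkdir -p /var/log/spark
```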
diff --git a/spark/1.6.0/config/spark/spark-env.sh b/spark/1.6.0/config/spark/spark-env.sh
new file mode 100755
index 0000000..dd563d5
--- /dev/null
+++ b/spark/1.6.0/config/spark/spark-env.sh
@@ -0,0 +1,52 @@
+#!/usr/bin/env bash
+
+# This file is sourced when running various Spark programs.
+# Copy it as spark-env.sh and edit that to configure Spark for your site.
+
+# Options read when launching programs locally with
+# ./bin/run-example or ./bin/spark-submit
+# - HADOOP_CONF_DIR, to point Spark towards Hadoop configuration files
+# - SPARK_LOCAL_IP, to set the IP address Spark binds to on this node
+# - SPARK_PUBLIC_DNS, to set the public dns name of the driver program
+# - SPARK_CLASSPATH, default classpath entries to append
+
+# Options read by executors and drivers running inside the cluster
+# - SPARK_LOCAL_IP, to set the IP address Spark binds to on this node
+# - SPARK_PUBLIC_DNS, to set the public DNS name of the driver program
+# - SPARK_CLASSPATH, default classpath entries to append
+# - SPARK_LOCAL_DIRS, storage directories to use on this node for shuffle and RDD data
+# - MESOS_NATIVE_LIBRARY, to point to your libmesos.so if you use Mesos
+
+# Options read in YARN client mode
+# - HADOOP_CONF_DIR, to point Spark towards Hadoop configuration files
+# - SPARK_EXECUTOR_INSTANCES, Number of workers to start (Default: 2)
+# - SPARK_EXECUTOR_CORES, Number of cores for the workers (Default: 1).
+# - SPARK_EXECUTOR_MEMORY, Memory per Worker (e.g. 1000M, 2G) (Default: 1G)
+# - SPARK_DRIVER_MEMORY, Memory for Master (e.g. 1000M, 2G) (Default: 512 Mb)
+# - SPARK_YARN_APP_NAME, The name of your application (Default: Spark)
+# - SPARK_YARN_QUEUE, The hadoop queue to use for allocation requests (Default: 'default')
+# - SPARK_YARN_DIST_FILES, Comma separated list of files to be distributed with the job.
+# - SPARK_YARN_DIST_ARCHIVES, Comma separated list of archives to be distributed with the job.
+
+# Options for the daemons used in the standalone deploy mode:
+# - SPARK_MASTER_IP, to bind the master to a different IP address or hostname
+# - SPARK_MASTER_PORT / SPARK_MASTER_WEBUI_PORT, to use non-default ports for the master
+# - SPARK_MASTER_OPTS, to set config properties only for the master (e.g. "-Dx=y")
+# - SPARK_WORKER_CORES, to set the number of cores to use on this machine
+# - SPARK_WORKER_MEMORY, to set how much total memory workers have to give executors (e.g. 1000m, 2g)
+# - SPARK_WORKER_PORT / SPARK_WORKER_WEBUI_PORT, to use non-default ports for the worker
+# - SPARK_WORKER_INSTANCES, to set the number of worker processes per node
+# - SPARK_WORKER_DIR, to set the working directory of worker processes
+# - SPARK_WORKER_OPTS, to set config properties only for the worker (e.g. "-Dx=y")
+# - SPARK_HISTORY_OPTS, to set config properties only for the history server (e.g. "-Dx=y")
+# - SPARK_DAEMON_JAVA_OPTS, to set config properties for all daemons (e.g. "-Dx=y")
+# - SPARK_PUBLIC_DNS, to set the public dns name of the master or workers
+
+export JAVA_HOME=/usr/java/default
+export SPARK_MASTER_IP=spark-master
+export SPARK_WORKER_CORES=1
+export SPARK_WORKER_INSTANCES=1
+export SPARK_MASTER_PORT=7077
+export SPARK_WORKER_MEMORY=2g
+export MASTER=spark://${SPARK_MASTER_IP}:${SPARK_MASTER_PORT}
+export SPARK_HISTORY_OPTS="-Dspark.history.ui.port=18080 -Dspark.history.retainedApplications=3 -Dspark.history.fs.logDirectory=hdfs://spark-master:/var/log/spark"
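`SPARK_HISTORY_OPTS` above only takes effect once the history server is actually running, and neither the Dockerfile nor bootstrap.sh starts it. If the web UI on port 18080 is wanted, a sketch of starting it manually from the master container (the path assumes the sequenceiq layout):

```bash
# Start the Spark history server; it picks up SPARK_HISTORY_OPTS from
# spark-env.sh and serves completed applications on port 18080.
/usr/local/spark/sbin/start-history-server.sh
```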