add spark

newnius 2017-04-08 22:22:34 +08:00
parent 342312b75e
commit 10962791c8
15 changed files with 400 additions and 5 deletions


@ -83,3 +83,5 @@ bin/hadoop dfs -put etc/hadoop/* /user/root/input
YARN: hadoop-master:8088
HDFS: hadoop-master:50070
_Proxy needed, e.g. newnius/docker-proxy_


@ -2,9 +2,9 @@
## CHANGELOG
- remove `Options Indexes` (apache2.conf)
- remove `Options FollowSymLinks` (apache2.conf)
- add `Options SymLinksIfOwnerMatch` to support .htaccess (apache2.conf)
- enable `rewrite`
- update to `ServerTokens Prod` (security.conf)
- update to `ServerSignature Off` (security.conf)
- add `ServerName localhost` (apache2.conf)

spark/1.6.0/Dockerfile Normal file

@ -0,0 +1,13 @@
FROM sequenceiq/spark:1.6.0
MAINTAINER Newnius <newnius.cn@gmail.com>
ADD bootstrap.sh /etc/bootstrap.sh
COPY config/hadoop /mnt/config/hadoop
COPY config/spark /mnt/config/spark
COPY config/spark-yarn-remote-client /mnt/config/spark-yarn-remote-client
WORKDIR /usr/local/hadoop
ENTRYPOINT ["/etc/bootstrap.sh"]
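To build the image locally, a minimal sketch assuming it is run from the repository root (the tag is illustrative):
```bash
docker build -t newnius/spark:1.6.0 spark/1.6.0
```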

spark/1.6.0/README Normal file

@ -0,0 +1,82 @@
# based on sequenceiq/spark
## Create a spark cluster in swarm mode
The `--hostname` flag requires Docker 1.13 or higher.
```bash
docker service create \
--name spark-master \
--network swarm-net \
--replicas 1 \
--endpoint-mode dnsrr \
newnius/spark
```
```bash
docker service create \
--name spark-slave1 \
--network swarm-net \
--replicas 1 \
--endpoint-mode dnsrr \
newnius/spark
```
```bash
docker service create \
--name spark-slave2 \
--network swarm-net \
--replicas 1 \
--endpoint-mode dnsrr \
newnius/spark
```
## Init && Test
On the first deploy, format HDFS first.
### stop cluster (in master)
`sbin/stop-yarn.sh`
`sbin/stop-dfs.sh`
`../spark/sbin/stop-all.sh`
### remove previous data (in all nodes)
Clear all data under /tmp on all nodes.
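A minimal sketch, assuming it is run on every swarm node and that the container names contain `spark` (adjust the filter to your naming):
```bash
# wipe HDFS data and stale pid files inside each local spark container
docker ps --filter name=spark -q | xargs -I{} docker exec {} sh -c 'rm -rf /tmp/*'
```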
### format hdfs (in master)
```
bin/hadoop namenode -format
```
### start cluster (in master)
`sbin/start-dfs.sh`
`sbin/start-yarn.sh`
`../spark/sbin/start-all.sh`
### monitor cluster in browser
YARN: spark-master:8088
HDFS: spark-master:50070
SPARK: spark-master:8080
_Proxy needed, e.g. newnius/docker-proxy_
## Customized config
```bash
docker service create \
--name spark-master \
--network swarm-net \
--replicas 1 \
--mount type=bind,source=/mnt/data/spark/hdfs/master,target=/tmp/hadoop-root \
--mount type=bind,source=/mnt/data/spark/logs/master,target=/usr/local/hadoop/logs \
--mount type=bind,source=/mnt/data/spark/config/hadoop,target=/mnt/config/hadoop \
--mount type=bind,source=/mnt/data/spark/config/spark,target=/mnt/config/spark \
--mount type=bind,source=/mnt/data/spark/config/spark-yarn-remote-client,target=/mnt/config/spark-yarn-remote-client \
--endpoint-mode dnsrr \
newnius/spark
```
You don't need to put every file in the directory; add only the files that need to be modified.
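For example, to override only the HDFS replication factor, a sketch assuming the bind-mount layout above (the value is illustrative):
```bash
mkdir -p /mnt/data/spark/config/hadoop
# bootstrap.sh copies this over the image default at startup; files you
# do not provide keep the defaults baked into the image. Note that a
# provided file replaces the whole file, so include every property you need.
cat > /mnt/data/spark/config/hadoop/hdfs-site.xml <<'EOF'
<?xml version="1.0"?>
<configuration>
  <property>
    <name>dfs.replication</name>
    <value>2</value>
  </property>
</configuration>
EOF
```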

spark/1.6.0/bootstrap.sh Normal file

@ -0,0 +1,35 @@
#!/bin/bash
: ${HADOOP_PREFIX:=/usr/local/hadoop}
. $HADOOP_PREFIX/etc/hadoop/hadoop-env.sh
rm -f /tmp/*.pid
# installing libraries if any - (resource urls added comma separated to the ACP system variable)
cd $HADOOP_PREFIX/share/hadoop/common ; for cp in ${ACP//,/ }; do echo == $cp; curl -LO $cp ; done; cd -
cp -a /mnt/config/hadoop/. /usr/local/hadoop/etc/hadoop/
cp -a /mnt/config/spark/. /usr/local/spark/conf/
cp -a /mnt/config/spark-yarn-remote-client/. /usr/local/spark/yarn-remote-client/
service sshd start
## stop all daemons first, in case the master starts far behind the slaves
$HADOOP_PREFIX/sbin/stop-yarn.sh
$HADOOP_PREFIX/sbin/stop-dfs.sh
$HADOOP_PREFIX/sbin/start-dfs.sh
$HADOOP_PREFIX/sbin/start-yarn.sh
$HADOOP_PREFIX/sbin/mr-jobhistory-daemon.sh start historyserver
$SPARK_HOME/sbin/start-all.sh
if [[ $1 == "-bash" ]]; then
/bin/bash
else
while true; do sleep 1000; done
fi
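As a usage sketch of the `ACP` hook above, extra jars can be pulled into the Hadoop classpath at startup; the URL is purely illustrative:
```bash
docker service create \
  --name spark-master \
  --network swarm-net \
  --replicas 1 \
  --endpoint-mode dnsrr \
  -e ACP=http://repo.example.com/mysql-connector-java-5.1.38.jar \
  newnius/spark
```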

spark/1.6.0/config/hadoop/core-site.xml Normal file

@ -0,0 +1,29 @@
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!--
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. See accompanying LICENSE file.
-->
<!-- Put site-specific property overrides in this file. -->
<configuration>
<property>
<name>fs.defaultFS</name>
<value>hdfs://spark-master:8020</value>
</property>
<property>
<name>fs.default.name</name>
<value>hdfs://spark-master:8020</value>
</property>
</configuration>
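`fs.default.name` is just the deprecated alias of `fs.defaultFS`; both point at the same NameNode. Once the cluster is up, a quick check that the NameNode answers on this address (a sketch, run from `/usr/local/hadoop` in the master container):
```bash
bin/hadoop fs -ls hdfs://spark-master:8020/
```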

spark/1.6.0/config/hadoop/hdfs-site.xml Normal file

@ -0,0 +1,46 @@
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!--
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. See accompanying LICENSE file.
-->
<!-- Put site-specific property overrides in this file. -->
<configuration>
<property>
<name>dfs.permissions</name>
<value>false</value>
</property>
<property>
<name>dfs.namenode.secondary.http-address</name>
<value>spark-slave1:50090</value>
</property>
<property>
<name>dfs.namenode.http-address</name>
<value>spark-master:50070</value>
</property>
<property>
<name>dfs.datanode.max.transfer.threads</name>
<value>8192</value>
</property>
<property>
<name>dfs.replication</name>
<value>3</value>
</property>
</configuration>

spark/1.6.0/config/hadoop/mapred-site.xml Normal file

@ -0,0 +1,26 @@
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!--
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. See accompanying LICENSE file.
-->
<!-- Put site-specific property overrides in this file. -->
<configuration>
<property>
<name>mapreduce.framework.name</name>
<value>yarn</value>
</property>
</configuration>

spark/1.6.0/config/hadoop/slaves Normal file

@ -0,0 +1,2 @@
spark-slave1
spark-slave2

spark/1.6.0/config/hadoop/yarn-site.xml Normal file

@ -0,0 +1,57 @@
<?xml version="1.0"?>
<!--
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. See accompanying LICENSE file.
-->
<!-- Site specific YARN configuration properties -->
<configuration>
<property>
<name>yarn.application.classpath</name>
<value>/usr/local/hadoop/etc/hadoop, /usr/local/hadoop/share/hadoop/common/*, /usr/local/hadoop/share/hadoop/common/lib/*, /usr/local/hadoop/share/hadoop/hdfs/*, /usr/local/hadoop/share/hadoop/hdfs/lib/*, /usr/local/hadoop/share/hadoop/mapreduce/*, /usr/local/hadoop/share/hadoop/mapreduce/lib/*, /usr/local/hadoop/share/hadoop/yarn/*, /usr/local/hadoop/share/hadoop/yarn/lib/*</value>
</property>
<property>
<name>yarn.resourcemanager.hostname</name>
<value>spark-master</value>
</property>
<property>
<name>yarn.nodemanager.aux-services</name>
<value>mapreduce_shuffle</value>
</property>
<property>
<name>yarn.log-aggregation-enable</name>
<value>true</value>
</property>
<property>
<name>yarn.log-aggregation.retain-seconds</name>
<value>604800</value>
</property>
<property>
<name>yarn.nodemanager.resource.memory-mb</name>
<value>2048</value>
</property>
<property>
<name>yarn.nodemanager.resource.cpu-vcores</name>
<value>2</value>
</property>
<property>
<name>yarn.scheduler.minimum-allocation-mb</name>
<value>1024</value>
</property>
</configuration>

spark/1.6.0/config/spark-yarn-remote-client/core-site.xml Normal file

@ -0,0 +1,12 @@
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
<property>
<name>fs.default.name</name>
<value>hdfs://spark-master:8020</value>
</property>
<property>
<name>dfs.client.use.legacy.blockreader</name>
<value>true</value>
</property>
</configuration>

spark/1.6.0/config/spark-yarn-remote-client/yarn-site.xml Normal file

@ -0,0 +1,26 @@
<configuration>
<property>
<name>yarn.resourcemanager.scheduler.address</name>
<value>spark-master:8030</value>
</property>
<property>
<name>yarn.resourcemanager.address</name>
<value>spark-master:8032</value>
</property>
<property>
<name>yarn.resourcemanager.webapp.address</name>
<value>spark-master:8088</value>
</property>
<property>
<name>yarn.resourcemanager.resource-tracker.address</name>
<value>spark-master:8031</value>
</property>
<property>
<name>yarn.resourcemanager.admin.address</name>
<value>spark-master:8033</value>
</property>
<property>
<name>yarn.application.classpath</name>
<value>/usr/local/hadoop/etc/hadoop, /usr/local/hadoop/share/hadoop/common/*, /usr/local/hadoop/share/hadoop/common/lib/*, /usr/local/hadoop/share/hadoop/hdfs/*, /usr/local/hadoop/share/hadoop/hdfs/lib/*, /usr/local/hadoop/share/hadoop/mapreduce/*, /usr/local/hadoop/share/hadoop/mapreduce/lib/*, /usr/local/hadoop/share/hadoop/yarn/*, /usr/local/hadoop/share/hadoop/yarn/lib/*, /usr/local/hadoop/share/spark/*</value>
</property>
</configuration>

spark/1.6.0/config/spark/slaves Normal file

@ -0,0 +1,3 @@
# A Spark Worker will be started on each of the machines listed below.
spark-slave1
spark-slave2

spark/1.6.0/config/spark/spark-defaults.conf Normal file

@ -0,0 +1,10 @@
# Default system properties included when running spark-submit.
# This is useful for setting default environmental settings.
# Example:
spark.master spark://spark-master:7077
spark.eventLog.enabled true
spark.eventLog.dir hdfs://spark-master:/var/log/spark
spark.serializer org.apache.spark.serializer.KryoSerializer
# spark.driver.memory 5g
# spark.executor.extraJavaOptions -XX:+PrintGCDetails -Dkey=value -Dnumbers="one two three"
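As a usage sketch, these defaults apply to any plain `spark-submit`. The event-log directory has to exist in HDFS first, and the examples jar name assumes the Spark 1.6.0 / Hadoop 2.6.0 build shipped in this image:
```bash
# create the event log dir once (from /usr/local/hadoop on the master)
bin/hadoop fs -mkdir -p /var/log/spark
# submit the bundled SparkPi example; spark.master etc. come from spark-defaults.conf
/usr/local/spark/bin/spark-submit \
  --class org.apache.spark.examples.SparkPi \
  /usr/local/spark/lib/spark-examples-1.6.0-hadoop2.6.0.jar 10
```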

spark/1.6.0/config/spark/spark-env.sh Normal file

@ -0,0 +1,52 @@
#!/usr/bin/env bash
# This file is sourced when running various Spark programs.
# Copy it as spark-env.sh and edit that to configure Spark for your site.
# Options read when launching programs locally with
# ./bin/run-example or ./bin/spark-submit
# - HADOOP_CONF_DIR, to point Spark towards Hadoop configuration files
# - SPARK_LOCAL_IP, to set the IP address Spark binds to on this node
# - SPARK_PUBLIC_DNS, to set the public dns name of the driver program
# - SPARK_CLASSPATH, default classpath entries to append
# Options read by executors and drivers running inside the cluster
# - SPARK_LOCAL_IP, to set the IP address Spark binds to on this node
# - SPARK_PUBLIC_DNS, to set the public DNS name of the driver program
# - SPARK_CLASSPATH, default classpath entries to append
# - SPARK_LOCAL_DIRS, storage directories to use on this node for shuffle and RDD data
# - MESOS_NATIVE_LIBRARY, to point to your libmesos.so if you use Mesos
# Options read in YARN client mode
# - HADOOP_CONF_DIR, to point Spark towards Hadoop configuration files
# - SPARK_EXECUTOR_INSTANCES, Number of workers to start (Default: 2)
# - SPARK_EXECUTOR_CORES, Number of cores for the workers (Default: 1).
# - SPARK_EXECUTOR_MEMORY, Memory per Worker (e.g. 1000M, 2G) (Default: 1G)
# - SPARK_DRIVER_MEMORY, Memory for Master (e.g. 1000M, 2G) (Default: 512 Mb)
# - SPARK_YARN_APP_NAME, The name of your application (Default: Spark)
# - SPARK_YARN_QUEUE, The hadoop queue to use for allocation requests (Default: default)
# - SPARK_YARN_DIST_FILES, Comma separated list of files to be distributed with the job.
# - SPARK_YARN_DIST_ARCHIVES, Comma separated list of archives to be distributed with the job.
# Options for the daemons used in the standalone deploy mode:
# - SPARK_MASTER_IP, to bind the master to a different IP address or hostname
# - SPARK_MASTER_PORT / SPARK_MASTER_WEBUI_PORT, to use non-default ports for the master
# - SPARK_MASTER_OPTS, to set config properties only for the master (e.g. "-Dx=y")
# - SPARK_WORKER_CORES, to set the number of cores to use on this machine
# - SPARK_WORKER_MEMORY, to set how much total memory workers have to give executors (e.g. 1000m, 2g)
# - SPARK_WORKER_PORT / SPARK_WORKER_WEBUI_PORT, to use non-default ports for the worker
# - SPARK_WORKER_INSTANCES, to set the number of worker processes per node
# - SPARK_WORKER_DIR, to set the working directory of worker processes
# - SPARK_WORKER_OPTS, to set config properties only for the worker (e.g. "-Dx=y")
# - SPARK_HISTORY_OPTS, to set config properties only for the history server (e.g. "-Dx=y")
# - SPARK_DAEMON_JAVA_OPTS, to set config properties for all daemons (e.g. "-Dx=y")
# - SPARK_PUBLIC_DNS, to set the public dns name of the master or workers
export JAVA_HOME=/usr/java/default
export SPARK_MASTER_IP=spark-master
export SPARK_WORKER_CORES=1
export SPARK_WORKER_INSTANCES=1
export SPARK_MASTER_PORT=7077
export SPARK_WORKER_MEMORY=2g
export MASTER=spark://${SPARK_MASTER_IP}:${SPARK_MASTER_PORT}
export SPARK_HISTORY_OPTS="-Dspark.history.ui.port=18080 -Dspark.history.retainedApplications=3 -Dspark.history.fs.logDirectory=hdfs://spark-master:/var/log/spark"
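Given `SPARK_HISTORY_OPTS` above, the history server can then be started and browsed on spark-master:18080; a usage sketch (the HDFS log directory must already exist):
```bash
/usr/local/hadoop/bin/hadoop fs -mkdir -p /var/log/spark
/usr/local/spark/sbin/start-history-server.sh
# then open spark-master:18080 (proxy needed, as in the README)
```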