add spark

2026-02-04 06:45:54 +00:00 · 2017-04-08 22:22:34 +08:00
parent 342312b75e
commit 10962791c8
15 changed files with 400 additions and 5 deletions
--- a/spark/1.6.0/config/hadoop/core-site.xml
+++ b/spark/1.6.0/config/hadoop/core-site.xml
@@ -0,0 +1,29 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
+<!--
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License. See accompanying LICENSE file.
+-->
+
+<!-- Put site-specific property overrides in this file. -->
+
+<configuration>
+
+  <property>		 
+    <name>fs.defaultFS</name>
+    <value>hdfs://spark-master:8020</value>
+  </property>
+  <property>
+    <name>fs.default.name</name>
+    <value>hdfs://spark-master:8020</value>
+  </property>
+</configuration>
--- a/spark/1.6.0/config/hadoop/hdfs-site.xml
+++ b/spark/1.6.0/config/hadoop/hdfs-site.xml
@@ -0,0 +1,46 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
+<!--
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License. See accompanying LICENSE file.
+-->
+
+<!-- Put site-specific property overrides in this file. -->
+
+<configuration>
+
+	<property>
+		<name>dfs.permissions</name>
+		<value>false</value>
+	</property>
+	
+	<property>
+		<name>dfs.namenode.secondary.http-address</name>
+		<value>spark-slave1:50090</value>
+	</property>	
+	<property>
+		<name>dfs.namenode.http-address</name>
+		<value>spark-master:50070</value>
+	</property>
+	
+	<property>   
+		<name>dfs.datanode.max.transfer.threads</name>   
+		<value>8192</value>    
+	</property>
+
+    <!-- The Hadoop slaves file in this config set lists only two hosts, so a factor of 3 could not be met; raise it again if more DataNodes are added. -->
+    <property>
+        <name>dfs.replication</name>
+        <value>2</value>
+    </property>
+
+</configuration>
--- a/spark/1.6.0/config/hadoop/mapred-site.xml
+++ b/spark/1.6.0/config/hadoop/mapred-site.xml
@@ -0,0 +1,26 @@
+<?xml version="1.0"?>
+<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
+<!--
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License. See accompanying LICENSE file.
+-->
+
+<!-- Put site-specific property overrides in this file. -->
+
+<configuration>
+
+	<property>	 	        		
+		<name>mapreduce.framework.name</name>
+		<value>yarn</value>
+	</property>
+	
+</configuration>
--- a/spark/1.6.0/config/hadoop/slaves
+++ b/spark/1.6.0/config/hadoop/slaves
@@ -0,0 +1,2 @@
+spark-slave1
+spark-slave2
--- a/spark/1.6.0/config/hadoop/yarn-site.xml
+++ b/spark/1.6.0/config/hadoop/yarn-site.xml
@@ -0,0 +1,57 @@
+<?xml version="1.0"?>
+<!--
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License. See accompanying LICENSE file.
+-->
+<!-- Site specific YARN configuration properties -->
+<configuration>
+
+    <property>
+      <name>yarn.application.classpath</name>
+      <value>/usr/local/hadoop/etc/hadoop, /usr/local/hadoop/share/hadoop/common/*, /usr/local/hadoop/share/hadoop/common/lib/*, /usr/local/hadoop/share/hadoop/hdfs/*, /usr/local/hadoop/share/hadoop/hdfs/lib/*, /usr/local/hadoop/share/hadoop/mapreduce/*, /usr/local/hadoop/share/hadoop/mapreduce/lib/*, /usr/local/hadoop/share/hadoop/yarn/*, /usr/local/hadoop/share/hadoop/yarn/lib/*</value>
+    </property>
+
+
+	<property>
+		<name>yarn.resourcemanager.hostname</name>
+		<value>spark-master</value>
+	</property>
+	
+	<property>
+		<name>yarn.nodemanager.aux-services</name>
+		<value>mapreduce_shuffle</value>
+	</property>
+	
+	<property>
+		<name>yarn.log-aggregation-enable</name>
+		<value>true</value>
+	</property>
+
+	<property>
+		<name>yarn.log-aggregation.retain-seconds</name>
+		<value>604800</value>
+	</property>
+	
+		
+	<property>
+		<name>yarn.nodemanager.resource.memory-mb</name>
+		<value>2048</value>
+	</property>
+	<property>
+		<name>yarn.nodemanager.resource.cpu-vcores</name>
+		<value>2</value>
+	</property>
+    <property>
+      <name>yarn.scheduler.minimum-allocation-mb</name>
+      <value>1024</value>
+    </property>	
+</configuration>
--- a/spark/1.6.0/config/spark-yarn-remote-client/core-site.xml
+++ b/spark/1.6.0/config/spark-yarn-remote-client/core-site.xml
@@ -0,0 +1,12 @@
+<?xml version="1.0"?>
+<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
+<configuration>
+  <property>
+      <name>fs.default.name</name>
+      <value>hdfs://spark-master:8020</value>
+  </property>
+  <property>
+      <name>dfs.client.use.legacy.blockreader</name>
+      <value>true</value>
+  </property>
+</configuration>
--- a/spark/1.6.0/config/spark-yarn-remote-client/yarn-site.xml
+++ b/spark/1.6.0/config/spark-yarn-remote-client/yarn-site.xml
@@ -0,0 +1,26 @@
+<configuration>
+  <property>
+    <name>yarn.resourcemanager.scheduler.address</name>
+    <value>spark-master:8030</value>
+  </property>
+  <property>
+    <name>yarn.resourcemanager.address</name>
+    <value>spark-master:8032</value>
+  </property>
+  <property>
+    <name>yarn.resourcemanager.webapp.address</name>
+    <value>spark-master:8088</value>
+  </property>
+  <property>
+    <name>yarn.resourcemanager.resource-tracker.address</name>
+    <value>spark-master:8031</value>
+  </property>
+  <property>
+    <name>yarn.resourcemanager.admin.address</name>
+    <value>spark-master:8033</value>
+  </property>
+  <property>
+      <name>yarn.application.classpath</name>
+      <value>/usr/local/hadoop/etc/hadoop, /usr/local/hadoop/share/hadoop/common/*, /usr/local/hadoop/share/hadoop/common/lib/*, /usr/local/hadoop/share/hadoop/hdfs/*, /usr/local/hadoop/share/hadoop/hdfs/lib/*, /usr/local/hadoop/share/hadoop/mapreduce/*, /usr/local/hadoop/share/hadoop/mapreduce/lib/*, /usr/local/hadoop/share/hadoop/yarn/*, /usr/local/hadoop/share/hadoop/yarn/lib/*, /usr/local/hadoop/share/spark/*</value>
+   </property>
+</configuration>
--- a/spark/1.6.0/config/spark/slaves
+++ b/spark/1.6.0/config/spark/slaves
@@ -0,0 +1,3 @@
+# A Spark Worker will be started on each of the machines listed below.
+spark-slave1
+spark-slave2
--- a/spark/1.6.0/config/spark/spark-defaults.conf
+++ b/spark/1.6.0/config/spark/spark-defaults.conf
@@ -0,0 +1,10 @@
+# Default system properties included when running spark-submit.
+# This is useful for setting default environmental settings.
+# Uncommented entries are the active defaults for this cluster; commented ones are examples.
+# The event log URI spells out port 8020 so it matches fs.defaultFS in the Hadoop core-site.xml.
+ spark.master                     spark://spark-master:7077
+ spark.eventLog.enabled           true
+ spark.eventLog.dir               hdfs://spark-master:8020/var/log/spark
+ spark.serializer                 org.apache.spark.serializer.KryoSerializer
+# spark.driver.memory              5g
+# spark.executor.extraJavaOptions  -XX:+PrintGCDetails -Dkey=value -Dnumbers="one two three"
--- a/spark/1.6.0/config/spark/spark-env.sh
+++ b/spark/1.6.0/config/spark/spark-env.sh
@@ -0,0 +1,52 @@
+#!/usr/bin/env bash
+
+# This file is sourced when running various Spark programs.
+# Copy it as spark-env.sh and edit that to configure Spark for your site.
+
+# Options read when launching programs locally with 
+# ./bin/run-example or ./bin/spark-submit
+# - HADOOP_CONF_DIR, to point Spark towards Hadoop configuration files
+# - SPARK_LOCAL_IP, to set the IP address Spark binds to on this node
+# - SPARK_PUBLIC_DNS, to set the public dns name of the driver program
+# - SPARK_CLASSPATH, default classpath entries to append
+
+# Options read by executors and drivers running inside the cluster
+# - SPARK_LOCAL_IP, to set the IP address Spark binds to on this node
+# - SPARK_PUBLIC_DNS, to set the public DNS name of the driver program
+# - SPARK_CLASSPATH, default classpath entries to append
+# - SPARK_LOCAL_DIRS, storage directories to use on this node for shuffle and RDD data
+# - MESOS_NATIVE_LIBRARY, to point to your libmesos.so if you use Mesos
+
+# Options read in YARN client mode
+# - HADOOP_CONF_DIR, to point Spark towards Hadoop configuration files
+# - SPARK_EXECUTOR_INSTANCES, Number of workers to start (Default: 2)
+# - SPARK_EXECUTOR_CORES, Number of cores for the workers (Default: 1).
+# - SPARK_EXECUTOR_MEMORY, Memory per Worker (e.g. 1000M, 2G) (Default: 1G)
+# - SPARK_DRIVER_MEMORY, Memory for Master (e.g. 1000M, 2G) (Default: 512 Mb)
+# - SPARK_YARN_APP_NAME, The name of your application (Default: Spark)
+# - SPARK_YARN_QUEUE, The hadoop queue to use for allocation requests (Default: 'default')
+# - SPARK_YARN_DIST_FILES, Comma separated list of files to be distributed with the job.
+# - SPARK_YARN_DIST_ARCHIVES, Comma separated list of archives to be distributed with the job.
+
+# Options for the daemons used in the standalone deploy mode:
+# - SPARK_MASTER_IP, to bind the master to a different IP address or hostname
+# - SPARK_MASTER_PORT / SPARK_MASTER_WEBUI_PORT, to use non-default ports for the master
+# - SPARK_MASTER_OPTS, to set config properties only for the master (e.g. "-Dx=y")
+# - SPARK_WORKER_CORES, to set the number of cores to use on this machine
+# - SPARK_WORKER_MEMORY, to set how much total memory workers have to give executors (e.g. 1000m, 2g)
+# - SPARK_WORKER_PORT / SPARK_WORKER_WEBUI_PORT, to use non-default ports for the worker
+# - SPARK_WORKER_INSTANCES, to set the number of worker processes per node
+# - SPARK_WORKER_DIR, to set the working directory of worker processes
+# - SPARK_WORKER_OPTS, to set config properties only for the worker (e.g. "-Dx=y")
+# - SPARK_HISTORY_OPTS, to set config properties only for the history server (e.g. "-Dx=y")
+# - SPARK_DAEMON_JAVA_OPTS, to set config properties for all daemons (e.g. "-Dx=y")
+# - SPARK_PUBLIC_DNS, to set the public dns name of the master or workers
+
+export JAVA_HOME=/usr/java/default
+export SPARK_MASTER_IP=spark-master
+export SPARK_WORKER_CORES=1
+export SPARK_WORKER_INSTANCES=1
+export SPARK_MASTER_PORT=7077  # was misspelled SPARK_MORKER_PORT, which left the port in MASTER below empty
+export SPARK_WORKER_MEMORY=2g
+export MASTER="spark://${SPARK_MASTER_IP}:${SPARK_MASTER_PORT}"  # expands to spark://spark-master:7077
+export SPARK_HISTORY_OPTS="-Dspark.history.ui.port=18080 -Dspark.history.retainedApplications=3 -Dspark.history.fs.logDirectory=hdfs://spark-master:8020/var/log/spark"  # port 8020 written out to match fs.defaultFS