From 5a5e090f875167252af8e1c263a975bdf52beeec Mon Sep 17 00:00:00 2001
From: John Sundarraj
Date: Tue, 2 Jul 2024 19:58:01 +0530
Subject: [PATCH] Implement Apache Hadoop 2.10.1 cluster for Data Lake project

- Added required configuration files for Apache Hadoop name and data nodes.
- Included script files to start and stop the Apache Hadoop nodes.
- Wrote Docker Compose YAML configuration to manage the Apache Hadoop cluster.
- Configured Docker Compose environment files for the Data Lake implementation.

Co-authored-by: Divya Priya Muthuvel <143541386+DivyaPriya-Muthuvel@users.noreply.github.com>
---
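Note: a quick way to exercise this Compose file locally, as a sketch only
(not part of the committed scripts; assumes Docker Compose v2 and that
HOME_DIR points at the repository root):

  export HOME_DIR=$PWD
  docker compose --env-file compose/DEV.env \
    -f compose/data-lake/hadoop/main.yml up -d
  docker compose --env-file compose/DEV.env \
    -f compose/data-lake/hadoop/main.yml ps
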
 compose/DEV.env                               |   1 +
 compose/QAA.env                               |   1 +
 compose/QAB.env                               |   1 +
 compose/data-lake/hadoop/main.yml             | 124 ++++++++++++++++++
 compose/data-lake/kafka/main.yml              |  12 +-
 image/hadoop/2.10.1/amazon-linux-2.dockerfile |   6 +
 .../hadoop/2.10.1/data/conf/core-site.xml     |  10 ++
 workload/hadoop/2.10.1/data/conf/env.sh       | 117 +++++++++++++++++
 .../hadoop/2.10.1/data/conf/hdfs-site.xml     |  18 +++
 .../hadoop/2.10.1/data/conf/supervisor.ini    |   6 +
 workload/hadoop/2.10.1/data/script/start.sh   |   3 +
 workload/hadoop/2.10.1/data/script/stop.sh    |   3 +
 .../hadoop/2.10.1/name/conf/core-site.xml     |  10 ++
 workload/hadoop/2.10.1/name/conf/env.sh       | 117 +++++++++++++++++
 .../hadoop/2.10.1/name/conf/hdfs-site.xml     |  18 +++
 .../hadoop/2.10.1/name/conf/supervisor.ini    |   6 +
 workload/hadoop/2.10.1/name/script/start.sh   |   6 +
 workload/hadoop/2.10.1/name/script/stop.sh    |   3 +
 18 files changed, 456 insertions(+), 6 deletions(-)
 create mode 100644 compose/data-lake/hadoop/main.yml
 create mode 100644 workload/hadoop/2.10.1/data/conf/core-site.xml
 create mode 100644 workload/hadoop/2.10.1/data/conf/env.sh
 create mode 100644 workload/hadoop/2.10.1/data/conf/hdfs-site.xml
 create mode 100644 workload/hadoop/2.10.1/data/conf/supervisor.ini
 create mode 100755 workload/hadoop/2.10.1/data/script/start.sh
 create mode 100755 workload/hadoop/2.10.1/data/script/stop.sh
 create mode 100644 workload/hadoop/2.10.1/name/conf/core-site.xml
 create mode 100644 workload/hadoop/2.10.1/name/conf/env.sh
 create mode 100644 workload/hadoop/2.10.1/name/conf/hdfs-site.xml
 create mode 100644 workload/hadoop/2.10.1/name/conf/supervisor.ini
 create mode 100755 workload/hadoop/2.10.1/name/script/start.sh
 create mode 100755 workload/hadoop/2.10.1/name/script/stop.sh

diff --git a/compose/DEV.env b/compose/DEV.env
index f4b8cd1..c57de3f 100644
--- a/compose/DEV.env
+++ b/compose/DEV.env
@@ -22,4 +22,5 @@ CTM_NGINX_IP=14.1.2.30
 CTM_NGINX_PORT=8001

 DATA_LAKE_KAFKA_VERSION=3.2.1
+DATA_LAKE_HADOOP_VERSION=2.10.1
 DATA_LAKE_NETWORK=14.1.3.0/24
diff --git a/compose/QAA.env b/compose/QAA.env
index f63e04c..aa700db 100644
--- a/compose/QAA.env
+++ b/compose/QAA.env
@@ -22,4 +22,5 @@ CTM_NGINX_IP=15.1.2.30
 CTM_NGINX_PORT=8002

 DATA_LAKE_KAFKA_VERSION=3.2.1
+DATA_LAKE_HADOOP_VERSION=2.10.1
 DATA_LAKE_NETWORK=15.1.3.0/24
diff --git a/compose/QAB.env b/compose/QAB.env
index ec61d8b..c5c7995 100644
--- a/compose/QAB.env
+++ b/compose/QAB.env
@@ -22,4 +22,5 @@ CTM_NGINX_IP=16.1.2.30
 CTM_NGINX_PORT=8003

 DATA_LAKE_KAFKA_VERSION=3.2.1
+DATA_LAKE_HADOOP_VERSION=2.10.1
 DATA_LAKE_NETWORK=16.1.3.0/24
diff --git a/compose/data-lake/hadoop/main.yml b/compose/data-lake/hadoop/main.yml
new file mode 100644
index 0000000..b635431
--- /dev/null
+++ b/compose/data-lake/hadoop/main.yml
@@ -0,0 +1,124 @@
+version: '3.8'
+services:
+  hadoop-name-1:
+    image: sloopstash/hadoop:v2.10.1
+    entrypoint: /usr/bin/supervisord
+    command: "-c /etc/supervisord.conf"
+    environment:
+      - JAVA_HOME=/usr/java/jdk1.8.0_131/jre
+      - HADOOP_HOME=/usr/local/lib/hadoop
+      - HADOOP_CONF_DIR=/usr/local/lib/hadoop/etc/hadoop
+      - PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/java/jdk1.8.0_131/jre/bin:/usr/local/lib/hadoop/bin
+    volumes:
+      - hadoop-name-1-data:/opt/hadoop/data
+      - hadoop-name-1-log:/opt/hadoop/log
+      - ${HOME_DIR}/workload/hadoop/${DATA_LAKE_HADOOP_VERSION}/name/script:/opt/hadoop/script
+      - hadoop-name-1-tmp:/opt/hadoop/tmp
+      - ${HOME_DIR}/workload/supervisor/conf/server.conf:/etc/supervisord.conf
+      - ${HOME_DIR}/workload/hadoop/${DATA_LAKE_HADOOP_VERSION}/name/conf/supervisor.ini:/opt/hadoop/system/supervisor.ini
+      - ${HOME_DIR}/workload/hadoop/${DATA_LAKE_HADOOP_VERSION}/name/conf/env.sh:/opt/hadoop/conf/env.sh
+      - ${HOME_DIR}/workload/hadoop/${DATA_LAKE_HADOOP_VERSION}/name/conf/core-site.xml:/opt/hadoop/conf/core-site.xml
+      - ${HOME_DIR}/workload/hadoop/${DATA_LAKE_HADOOP_VERSION}/name/conf/hdfs-site.xml:/opt/hadoop/conf/hdfs-site.xml
+    networks:
+      - common
+  hadoop-data-1:
+    image: sloopstash/hadoop:v2.10.1
+    entrypoint: /usr/bin/supervisord
+    command: "-c /etc/supervisord.conf"
+    environment:
+      - JAVA_HOME=/usr/java/jdk1.8.0_131/jre
+      - HADOOP_HOME=/usr/local/lib/hadoop
+      - HADOOP_CONF_DIR=/usr/local/lib/hadoop/etc/hadoop
+      - PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/java/jdk1.8.0_131/jre/bin:/usr/local/lib/hadoop/bin
+    volumes:
+      - hadoop-data-1-data:/opt/hadoop/data
+      - hadoop-data-1-log:/opt/hadoop/log
+      - ${HOME_DIR}/workload/hadoop/${DATA_LAKE_HADOOP_VERSION}/data/script:/opt/hadoop/script
+      - hadoop-data-1-tmp:/opt/hadoop/tmp
+      - ${HOME_DIR}/workload/supervisor/conf/server.conf:/etc/supervisord.conf
+      - ${HOME_DIR}/workload/hadoop/${DATA_LAKE_HADOOP_VERSION}/data/conf/supervisor.ini:/opt/hadoop/system/supervisor.ini
+      - ${HOME_DIR}/workload/hadoop/${DATA_LAKE_HADOOP_VERSION}/data/conf/env.sh:/opt/hadoop/conf/env.sh
+      - ${HOME_DIR}/workload/hadoop/${DATA_LAKE_HADOOP_VERSION}/data/conf/core-site.xml:/opt/hadoop/conf/core-site.xml
+      - ${HOME_DIR}/workload/hadoop/${DATA_LAKE_HADOOP_VERSION}/data/conf/hdfs-site.xml:/opt/hadoop/conf/hdfs-site.xml
+    depends_on:
+      - hadoop-name-1
+    networks:
+      - common
+  hadoop-data-2:
+    image: sloopstash/hadoop:v2.10.1
+    entrypoint: /usr/bin/supervisord
+    command: "-c /etc/supervisord.conf"
+    environment:
+      - JAVA_HOME=/usr/java/jdk1.8.0_131/jre
+      - HADOOP_HOME=/usr/local/lib/hadoop
+      - HADOOP_CONF_DIR=/usr/local/lib/hadoop/etc/hadoop
+      - PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/java/jdk1.8.0_131/jre/bin:/usr/local/lib/hadoop/bin
+    volumes:
+      - hadoop-data-2-data:/opt/hadoop/data
+      - hadoop-data-2-log:/opt/hadoop/log
+      - ${HOME_DIR}/workload/hadoop/${DATA_LAKE_HADOOP_VERSION}/data/script:/opt/hadoop/script
+      - hadoop-data-2-tmp:/opt/hadoop/tmp
+      - ${HOME_DIR}/workload/supervisor/conf/server.conf:/etc/supervisord.conf
+      - ${HOME_DIR}/workload/hadoop/${DATA_LAKE_HADOOP_VERSION}/data/conf/supervisor.ini:/opt/hadoop/system/supervisor.ini
+      - ${HOME_DIR}/workload/hadoop/${DATA_LAKE_HADOOP_VERSION}/data/conf/env.sh:/opt/hadoop/conf/env.sh
+      - ${HOME_DIR}/workload/hadoop/${DATA_LAKE_HADOOP_VERSION}/data/conf/core-site.xml:/opt/hadoop/conf/core-site.xml
+      - ${HOME_DIR}/workload/hadoop/${DATA_LAKE_HADOOP_VERSION}/data/conf/hdfs-site.xml:/opt/hadoop/conf/hdfs-site.xml
+    depends_on:
+      - hadoop-name-1
+    networks:
+      - common
+  hadoop-data-3:
+    image: sloopstash/hadoop:v2.10.1
+    entrypoint: /usr/bin/supervisord
+    command: "-c /etc/supervisord.conf"
+    environment:
+      - JAVA_HOME=/usr/java/jdk1.8.0_131/jre
+      - HADOOP_HOME=/usr/local/lib/hadoop
+      - HADOOP_CONF_DIR=/usr/local/lib/hadoop/etc/hadoop
+      - PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/java/jdk1.8.0_131/jre/bin:/usr/local/lib/hadoop/bin
+    volumes:
+      - hadoop-data-3-data:/opt/hadoop/data
+      - hadoop-data-3-log:/opt/hadoop/log
+      - ${HOME_DIR}/workload/hadoop/${DATA_LAKE_HADOOP_VERSION}/data/script:/opt/hadoop/script
+      - hadoop-data-3-tmp:/opt/hadoop/tmp
+      - ${HOME_DIR}/workload/supervisor/conf/server.conf:/etc/supervisord.conf
+      - ${HOME_DIR}/workload/hadoop/${DATA_LAKE_HADOOP_VERSION}/data/conf/supervisor.ini:/opt/hadoop/system/supervisor.ini
+      - ${HOME_DIR}/workload/hadoop/${DATA_LAKE_HADOOP_VERSION}/data/conf/env.sh:/opt/hadoop/conf/env.sh
+      - ${HOME_DIR}/workload/hadoop/${DATA_LAKE_HADOOP_VERSION}/data/conf/core-site.xml:/opt/hadoop/conf/core-site.xml
+      - ${HOME_DIR}/workload/hadoop/${DATA_LAKE_HADOOP_VERSION}/data/conf/hdfs-site.xml:/opt/hadoop/conf/hdfs-site.xml
+    depends_on:
+      - hadoop-name-1
+    networks:
+      - common
+volumes:
+  hadoop-name-1-data:
+    driver: local
+  hadoop-name-1-log:
+    driver: local
+  hadoop-name-1-tmp:
+    driver: local
+  hadoop-data-1-data:
+    driver: local
+  hadoop-data-1-log:
+    driver: local
+  hadoop-data-1-tmp:
+    driver: local
+  hadoop-data-2-data:
+    driver: local
+  hadoop-data-2-log:
+    driver: local
+  hadoop-data-2-tmp:
+    driver: local
+  hadoop-data-3-data:
+    driver: local
+  hadoop-data-3-log:
+    driver: local
+  hadoop-data-3-tmp:
+    driver: local
+networks:
+  common:
+    driver: bridge
+    ipam:
+      driver: default
+      config:
+        - subnet: ${DATA_LAKE_NETWORK}
diff --git a/compose/data-lake/kafka/main.yml b/compose/data-lake/kafka/main.yml
index b5f5e4e..110a864 100644
--- a/compose/data-lake/kafka/main.yml
+++ b/compose/data-lake/kafka/main.yml
@@ -9,10 +9,10 @@ services:
     volumes:
       - kafka-controller-1-data:/opt/kafka/data
       - kafka-controller-1-log:/opt/kafka/log
+      - ${HOME_DIR}/workload/kafka/${DATA_LAKE_KAFKA_VERSION}/controller/script:/opt/kafka/script
       - ${HOME_DIR}/workload/supervisor/conf/server.conf:/etc/supervisord.conf
       - ${HOME_DIR}/workload/kafka/${DATA_LAKE_KAFKA_VERSION}/controller/conf/supervisor.ini:/opt/kafka/system/supervisor.ini
       - ${HOME_DIR}/workload/kafka/${DATA_LAKE_KAFKA_VERSION}/controller/conf/server.conf:/opt/kafka/conf/server-reference.conf
-      - ${HOME_DIR}/workload/kafka/${DATA_LAKE_KAFKA_VERSION}/controller/script:/opt/kafka/script
     networks:
       - common
   kafka-controller-2:
@@ -24,10 +24,10 @@ services:
     volumes:
       - kafka-controller-2-data:/opt/kafka/data
       - kafka-controller-2-log:/opt/kafka/log
+      - ${HOME_DIR}/workload/kafka/${DATA_LAKE_KAFKA_VERSION}/controller/script:/opt/kafka/script
       - ${HOME_DIR}/workload/supervisor/conf/server.conf:/etc/supervisord.conf
       - ${HOME_DIR}/workload/kafka/${DATA_LAKE_KAFKA_VERSION}/controller/conf/supervisor.ini:/opt/kafka/system/supervisor.ini
       - ${HOME_DIR}/workload/kafka/${DATA_LAKE_KAFKA_VERSION}/controller/conf/server.conf:/opt/kafka/conf/server-reference.conf
-      - ${HOME_DIR}/workload/kafka/${DATA_LAKE_KAFKA_VERSION}/controller/script:/opt/kafka/script
     networks:
       - common
   kafka-controller-3:
@@ -39,10 +39,10 @@ services:
     volumes:
       - kafka-controller-3-data:/opt/kafka/data
       - kafka-controller-3-log:/opt/kafka/log
+      - ${HOME_DIR}/workload/kafka/${DATA_LAKE_KAFKA_VERSION}/controller/script:/opt/kafka/script
       - ${HOME_DIR}/workload/supervisor/conf/server.conf:/etc/supervisord.conf
       - ${HOME_DIR}/workload/kafka/${DATA_LAKE_KAFKA_VERSION}/controller/conf/supervisor.ini:/opt/kafka/system/supervisor.ini
       - ${HOME_DIR}/workload/kafka/${DATA_LAKE_KAFKA_VERSION}/controller/conf/server.conf:/opt/kafka/conf/server-reference.conf
-      - ${HOME_DIR}/workload/kafka/${DATA_LAKE_KAFKA_VERSION}/controller/script:/opt/kafka/script
     networks:
       - common
   kafka-broker-1:
@@ -54,10 +54,10 @@ services:
     volumes:
       - kafka-broker-1-data:/opt/kafka/data
       - kafka-broker-1-log:/opt/kafka/log
+      - ${HOME_DIR}/workload/kafka/${DATA_LAKE_KAFKA_VERSION}/broker/script:/opt/kafka/script
       - ${HOME_DIR}/workload/supervisor/conf/server.conf:/etc/supervisord.conf
       - ${HOME_DIR}/workload/kafka/${DATA_LAKE_KAFKA_VERSION}/broker/conf/supervisor.ini:/opt/kafka/system/supervisor.ini
       - ${HOME_DIR}/workload/kafka/${DATA_LAKE_KAFKA_VERSION}/broker/conf/server.conf:/opt/kafka/conf/server-reference.conf
-      - ${HOME_DIR}/workload/kafka/${DATA_LAKE_KAFKA_VERSION}/broker/script:/opt/kafka/script
     depends_on:
       - kafka-controller-1
       - kafka-controller-2
@@ -73,10 +73,10 @@ services:
     volumes:
       - kafka-broker-2-data:/opt/kafka/data
       - kafka-broker-2-log:/opt/kafka/log
+      - ${HOME_DIR}/workload/kafka/${DATA_LAKE_KAFKA_VERSION}/broker/script:/opt/kafka/script
       - ${HOME_DIR}/workload/supervisor/conf/server.conf:/etc/supervisord.conf
       - ${HOME_DIR}/workload/kafka/${DATA_LAKE_KAFKA_VERSION}/broker/conf/supervisor.ini:/opt/kafka/system/supervisor.ini
       - ${HOME_DIR}/workload/kafka/${DATA_LAKE_KAFKA_VERSION}/broker/conf/server.conf:/opt/kafka/conf/server-reference.conf
-      - ${HOME_DIR}/workload/kafka/${DATA_LAKE_KAFKA_VERSION}/broker/script:/opt/kafka/script
     depends_on:
       - kafka-controller-1
       - kafka-controller-2
@@ -92,10 +92,10 @@ services:
     volumes:
       - kafka-broker-3-data:/opt/kafka/data
       - kafka-broker-3-log:/opt/kafka/log
+      - ${HOME_DIR}/workload/kafka/${DATA_LAKE_KAFKA_VERSION}/broker/script:/opt/kafka/script
       - ${HOME_DIR}/workload/supervisor/conf/server.conf:/etc/supervisord.conf
       - ${HOME_DIR}/workload/kafka/${DATA_LAKE_KAFKA_VERSION}/broker/conf/supervisor.ini:/opt/kafka/system/supervisor.ini
       - ${HOME_DIR}/workload/kafka/${DATA_LAKE_KAFKA_VERSION}/broker/conf/server.conf:/opt/kafka/conf/server-reference.conf
-      - ${HOME_DIR}/workload/kafka/${DATA_LAKE_KAFKA_VERSION}/broker/script:/opt/kafka/script
     depends_on:
       - kafka-controller-1
       - kafka-controller-2
diff --git a/image/hadoop/2.10.1/amazon-linux-2.dockerfile b/image/hadoop/2.10.1/amazon-linux-2.dockerfile
index 5c63b35..697366a 100644
--- a/image/hadoop/2.10.1/amazon-linux-2.dockerfile
+++ b/image/hadoop/2.10.1/amazon-linux-2.dockerfile
@@ -25,8 +25,14 @@ RUN set -x \
   && mkdir /opt/hadoop/script \
   && mkdir /opt/hadoop/system \
   && mkdir /opt/hadoop/tmp \
+  && touch /opt/hadoop/conf/env.sh \
+  && touch /opt/hadoop/conf/core-site.xml \
+  && touch /opt/hadoop/conf/hdfs-site.xml \
   && touch /opt/hadoop/system/node.pid \
   && touch /opt/hadoop/system/supervisor.ini \
+  && ln -sf /opt/hadoop/conf/env.sh /usr/local/lib/hadoop/etc/hadoop/hadoop-env.sh \
+  && ln -sf /opt/hadoop/conf/core-site.xml /usr/local/lib/hadoop/etc/hadoop/core-site.xml \
+  && ln -sf /opt/hadoop/conf/hdfs-site.xml /usr/local/lib/hadoop/etc/hadoop/hdfs-site.xml \
   && ln -s /opt/hadoop/system/supervisor.ini /etc/supervisord.d/hadoop.ini \
   && history -c

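Note: the ln -sf calls above are what let the bind-mounted files under
/opt/hadoop/conf take effect: Hadoop keeps reading its usual paths under
$HADOOP_CONF_DIR while the content comes from the mounted workload files.
A quick way to confirm the wiring inside a running container (illustrative
command, not committed code):

  docker compose --env-file compose/DEV.env \
    -f compose/data-lake/hadoop/main.yml \
    exec hadoop-name-1 ls -l /usr/local/lib/hadoop/etc/hadoop/core-site.xml
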
diff --git a/workload/hadoop/2.10.1/data/conf/core-site.xml b/workload/hadoop/2.10.1/data/conf/core-site.xml
new file mode 100644
index 0000000..bbc5e9d
--- /dev/null
+++ b/workload/hadoop/2.10.1/data/conf/core-site.xml
@@ -0,0 +1,10 @@
+<configuration>
+  <property>
+    <name>fs.defaultFS</name>
+    <value>hdfs://hadoop-name-1:9000</value>
+  </property>
+  <property>
+    <name>hadoop.tmp.dir</name>
+    <value>/opt/hadoop/tmp</value>
+  </property>
+</configuration>
diff --git a/workload/hadoop/2.10.1/data/conf/env.sh b/workload/hadoop/2.10.1/data/conf/env.sh
new file mode 100644
index 0000000..694df6f
--- /dev/null
+++ b/workload/hadoop/2.10.1/data/conf/env.sh
@@ -0,0 +1,117 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Set Hadoop-specific environment variables here.
+
+# The only required environment variable is JAVA_HOME. All others are
+# optional. When running a distributed configuration it is best to
+# set JAVA_HOME in this file, so that it is correctly defined on
+# remote nodes.
+
+# The java implementation to use.
+export JAVA_HOME=${JAVA_HOME}
+
+# The jsvc implementation to use. Jsvc is required to run secure datanodes
+# that bind to privileged ports to provide authentication of data transfer
+# protocol. Jsvc is not required if SASL is configured for authentication of
+# data transfer protocol using non-privileged ports.
+#export JSVC_HOME=${JSVC_HOME}
+
+export HADOOP_CONF_DIR=${HADOOP_CONF_DIR:-"/etc/hadoop"}
+
+# Extra Java CLASSPATH elements. Automatically insert capacity-scheduler.
+for f in $HADOOP_HOME/contrib/capacity-scheduler/*.jar; do
+  if [ "$HADOOP_CLASSPATH" ]; then
+    export HADOOP_CLASSPATH=$HADOOP_CLASSPATH:$f
+  else
+    export HADOOP_CLASSPATH=$f
+  fi
+done
+
+# The maximum amount of heap to use, in MB. Default is 1000.
+export HADOOP_HEAPSIZE=256
+export HADOOP_NAMENODE_INIT_HEAPSIZE=256
+
+# Enable extra debugging of Hadoop's JAAS binding, used to set up
+# Kerberos security.
+# export HADOOP_JAAS_DEBUG=true
+
+# Extra Java runtime options. Empty by default.
+# For Kerberos debugging, an extended option set logs more information
+# export HADOOP_OPTS="-Djava.net.preferIPv4Stack=true -Dsun.security.krb5.debug=true -Dsun.security.spnego.debug"
+export HADOOP_OPTS="$HADOOP_OPTS -Djava.net.preferIPv4Stack=true"
+
+# Command specific options appended to HADOOP_OPTS when specified
+export HADOOP_NAMENODE_OPTS="-Dhadoop.security.logger=${HADOOP_SECURITY_LOGGER:-INFO,RFAS} -Dhdfs.audit.logger=${HDFS_AUDIT_LOGGER:-INFO,NullAppender} $HADOOP_NAMENODE_OPTS"
+export HADOOP_DATANODE_OPTS="-Dhadoop.security.logger=ERROR,RFAS $HADOOP_DATANODE_OPTS"
+
+export HADOOP_SECONDARYNAMENODE_OPTS="-Dhadoop.security.logger=${HADOOP_SECURITY_LOGGER:-INFO,RFAS} -Dhdfs.audit.logger=${HDFS_AUDIT_LOGGER:-INFO,NullAppender} $HADOOP_SECONDARYNAMENODE_OPTS"
+
+export HADOOP_NFS3_OPTS="$HADOOP_NFS3_OPTS"
+export HADOOP_PORTMAP_OPTS="-Xmx256m $HADOOP_PORTMAP_OPTS"
+
+# The following applies to multiple commands (fs, dfs, fsck, distcp etc)
+export HADOOP_CLIENT_OPTS="$HADOOP_CLIENT_OPTS"
+# set heap args when HADOOP_HEAPSIZE is empty
+if [ "$HADOOP_HEAPSIZE" = "" ]; then
+  export HADOOP_CLIENT_OPTS="-Xmx256m $HADOOP_CLIENT_OPTS"
+fi
+#HADOOP_JAVA_PLATFORM_OPTS="-XX:-UsePerfData $HADOOP_JAVA_PLATFORM_OPTS"
+
+# On secure datanodes, user to run the datanode as after dropping privileges.
+# This **MUST** be uncommented to enable secure HDFS if using privileged ports
+# to provide authentication of data transfer protocol. This **MUST NOT** be
+# defined if SASL is configured for authentication of data transfer protocol
+# using non-privileged ports.
+export HADOOP_SECURE_DN_USER=${HADOOP_SECURE_DN_USER}
+
+# Where log files are stored. $HADOOP_HOME/logs by default.
+#export HADOOP_LOG_DIR=${HADOOP_LOG_DIR}/$USER
+
+# Where log files are stored in the secure data environment.
+#export HADOOP_SECURE_DN_LOG_DIR=${HADOOP_LOG_DIR}/${HADOOP_HDFS_USER}
+
+###
+# HDFS Mover specific parameters
+###
+# Specify the JVM options to be used when starting the HDFS Mover.
+# These options will be appended to the options specified as HADOOP_OPTS
+# and therefore may override any similar flags set in HADOOP_OPTS
+#
+# export HADOOP_MOVER_OPTS=""
+
+###
+# Router-based HDFS Federation specific parameters
+# Specify the JVM options to be used when starting the RBF Routers.
+# These options will be appended to the options specified as HADOOP_OPTS
+# and therefore may override any similar flags set in HADOOP_OPTS
+#
+# export HADOOP_DFSROUTER_OPTS=""
+###
+
+###
+# Advanced Users Only!
+###
+
+# The directory where pid files are stored. /tmp by default.
+# NOTE: this should be set to a directory that can only be written to by
+#       the user that will run the hadoop daemons. Otherwise there is the
+#       potential for a symlink attack.
+export HADOOP_PID_DIR=${HADOOP_PID_DIR}
+export HADOOP_SECURE_DN_PID_DIR=${HADOOP_PID_DIR}
+
+# A string representing this instance of hadoop. $USER by default.
+export HADOOP_IDENT_STRING=$USER
diff --git a/workload/hadoop/2.10.1/data/conf/hdfs-site.xml b/workload/hadoop/2.10.1/data/conf/hdfs-site.xml
new file mode 100644
index 0000000..ffc2ed4
--- /dev/null
+++ b/workload/hadoop/2.10.1/data/conf/hdfs-site.xml
@@ -0,0 +1,18 @@
+<configuration>
+  <property>
+    <name>dfs.datanode.data.dir</name>
+    <value>/opt/hadoop/data</value>
+  </property>
+  <property>
+    <name>dfs.namenode.datanode.registration.ip-hostname-check</name>
+    <value>false</value>
+  </property>
+  <property>
+    <name>dfs.client.use.datanode.hostname</name>
+    <value>false</value>
+  </property>
+  <property>
+    <name>dfs.datanode.use.datanode.hostname</name>
+    <value>false</value>
+  </property>
+</configuration>
diff --git a/workload/hadoop/2.10.1/data/conf/supervisor.ini b/workload/hadoop/2.10.1/data/conf/supervisor.ini
new file mode 100644
index 0000000..4b655de
--- /dev/null
+++ b/workload/hadoop/2.10.1/data/conf/supervisor.ini
@@ -0,0 +1,6 @@
+[program:hadoop]
+command=bash -c "/opt/hadoop/script/start.sh"
+process_name=%(program_name)s
+pidfile=/opt/hadoop/system/node.pid
+numprocs=1
+autorestart=false
diff --git a/workload/hadoop/2.10.1/data/script/start.sh b/workload/hadoop/2.10.1/data/script/start.sh
new file mode 100755
index 0000000..4d9f9ed
--- /dev/null
+++ b/workload/hadoop/2.10.1/data/script/start.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+
+/usr/local/lib/hadoop/sbin/hadoop-daemon.sh start datanode
diff --git a/workload/hadoop/2.10.1/data/script/stop.sh b/workload/hadoop/2.10.1/data/script/stop.sh
new file mode 100755
index 0000000..2c65a68
--- /dev/null
+++ b/workload/hadoop/2.10.1/data/script/stop.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+
+/usr/local/lib/hadoop/sbin/hadoop-daemon.sh stop datanode
diff --git a/workload/hadoop/2.10.1/name/conf/core-site.xml b/workload/hadoop/2.10.1/name/conf/core-site.xml
new file mode 100644
index 0000000..de55f17
--- /dev/null
+++ b/workload/hadoop/2.10.1/name/conf/core-site.xml
@@ -0,0 +1,10 @@
+<configuration>
+  <property>
+    <name>fs.defaultFS</name>
+    <value>hdfs://0.0.0.0:9000</value>
+  </property>
+  <property>
+    <name>hadoop.tmp.dir</name>
+    <value>/opt/hadoop/tmp</value>
+  </property>
+</configuration>
diff --git a/workload/hadoop/2.10.1/name/conf/env.sh b/workload/hadoop/2.10.1/name/conf/env.sh
new file mode 100644
index 0000000..694df6f
--- /dev/null
+++ b/workload/hadoop/2.10.1/name/conf/env.sh
@@ -0,0 +1,117 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Set Hadoop-specific environment variables here.
+
+# The only required environment variable is JAVA_HOME. All others are
+# optional. When running a distributed configuration it is best to
+# set JAVA_HOME in this file, so that it is correctly defined on
+# remote nodes.
+
+# The java implementation to use.
+export JAVA_HOME=${JAVA_HOME}
+
+# The jsvc implementation to use. Jsvc is required to run secure datanodes
+# that bind to privileged ports to provide authentication of data transfer
+# protocol. Jsvc is not required if SASL is configured for authentication of
+# data transfer protocol using non-privileged ports.
+#export JSVC_HOME=${JSVC_HOME}
+
+export HADOOP_CONF_DIR=${HADOOP_CONF_DIR:-"/etc/hadoop"}
+
+# Extra Java CLASSPATH elements. Automatically insert capacity-scheduler.
+for f in $HADOOP_HOME/contrib/capacity-scheduler/*.jar; do
+  if [ "$HADOOP_CLASSPATH" ]; then
+    export HADOOP_CLASSPATH=$HADOOP_CLASSPATH:$f
+  else
+    export HADOOP_CLASSPATH=$f
+  fi
+done
+
+# The maximum amount of heap to use, in MB. Default is 1000.
+export HADOOP_HEAPSIZE=256
+export HADOOP_NAMENODE_INIT_HEAPSIZE=256
+
+# Enable extra debugging of Hadoop's JAAS binding, used to set up
+# Kerberos security.
+# export HADOOP_JAAS_DEBUG=true
+
+# Extra Java runtime options. Empty by default.
+# For Kerberos debugging, an extended option set logs more information
+# export HADOOP_OPTS="-Djava.net.preferIPv4Stack=true -Dsun.security.krb5.debug=true -Dsun.security.spnego.debug"
+export HADOOP_OPTS="$HADOOP_OPTS -Djava.net.preferIPv4Stack=true"
+
+# Command specific options appended to HADOOP_OPTS when specified
+export HADOOP_NAMENODE_OPTS="-Dhadoop.security.logger=${HADOOP_SECURITY_LOGGER:-INFO,RFAS} -Dhdfs.audit.logger=${HDFS_AUDIT_LOGGER:-INFO,NullAppender} $HADOOP_NAMENODE_OPTS"
+export HADOOP_DATANODE_OPTS="-Dhadoop.security.logger=ERROR,RFAS $HADOOP_DATANODE_OPTS"
+
+export HADOOP_SECONDARYNAMENODE_OPTS="-Dhadoop.security.logger=${HADOOP_SECURITY_LOGGER:-INFO,RFAS} -Dhdfs.audit.logger=${HDFS_AUDIT_LOGGER:-INFO,NullAppender} $HADOOP_SECONDARYNAMENODE_OPTS"
+
+export HADOOP_NFS3_OPTS="$HADOOP_NFS3_OPTS"
+export HADOOP_PORTMAP_OPTS="-Xmx256m $HADOOP_PORTMAP_OPTS"
+
+# The following applies to multiple commands (fs, dfs, fsck, distcp etc)
+export HADOOP_CLIENT_OPTS="$HADOOP_CLIENT_OPTS"
+# set heap args when HADOOP_HEAPSIZE is empty
+if [ "$HADOOP_HEAPSIZE" = "" ]; then
+  export HADOOP_CLIENT_OPTS="-Xmx256m $HADOOP_CLIENT_OPTS"
+fi
+#HADOOP_JAVA_PLATFORM_OPTS="-XX:-UsePerfData $HADOOP_JAVA_PLATFORM_OPTS"
+
+# On secure datanodes, user to run the datanode as after dropping privileges.
+# This **MUST** be uncommented to enable secure HDFS if using privileged ports
+# to provide authentication of data transfer protocol. This **MUST NOT** be
+# defined if SASL is configured for authentication of data transfer protocol
+# using non-privileged ports.
+export HADOOP_SECURE_DN_USER=${HADOOP_SECURE_DN_USER}
+
+# Where log files are stored. $HADOOP_HOME/logs by default.
+#export HADOOP_LOG_DIR=${HADOOP_LOG_DIR}/$USER
+
+# Where log files are stored in the secure data environment.
+#export HADOOP_SECURE_DN_LOG_DIR=${HADOOP_LOG_DIR}/${HADOOP_HDFS_USER}
+
+###
+# HDFS Mover specific parameters
+###
+# Specify the JVM options to be used when starting the HDFS Mover.
+# These options will be appended to the options specified as HADOOP_OPTS
+# and therefore may override any similar flags set in HADOOP_OPTS
+#
+# export HADOOP_MOVER_OPTS=""
+
+###
+# Router-based HDFS Federation specific parameters
+# Specify the JVM options to be used when starting the RBF Routers.
+# These options will be appended to the options specified as HADOOP_OPTS
+# and therefore may override any similar flags set in HADOOP_OPTS
+#
+# export HADOOP_DFSROUTER_OPTS=""
+###
+
+###
+# Advanced Users Only!
+###
+
+# The directory where pid files are stored. /tmp by default.
+# NOTE: this should be set to a directory that can only be written to by
+#       the user that will run the hadoop daemons. Otherwise there is the
+#       potential for a symlink attack.
+export HADOOP_PID_DIR=${HADOOP_PID_DIR}
+export HADOOP_SECURE_DN_PID_DIR=${HADOOP_PID_DIR}
+
+# A string representing this instance of hadoop. $USER by default.
+export HADOOP_IDENT_STRING=$USER
diff --git a/workload/hadoop/2.10.1/name/conf/hdfs-site.xml b/workload/hadoop/2.10.1/name/conf/hdfs-site.xml
new file mode 100644
index 0000000..e7364eb
--- /dev/null
+++ b/workload/hadoop/2.10.1/name/conf/hdfs-site.xml
@@ -0,0 +1,18 @@
+<configuration>
+  <property>
+    <name>dfs.namenode.name.dir</name>
+    <value>/opt/hadoop/data</value>
+  </property>
+  <property>
+    <name>dfs.namenode.datanode.registration.ip-hostname-check</name>
+    <value>false</value>
+  </property>
+  <property>
+    <name>dfs.client.use.datanode.hostname</name>
+    <value>false</value>
+  </property>
+  <property>
+    <name>dfs.datanode.use.datanode.hostname</name>
+    <value>false</value>
+  </property>
+</configuration>
diff --git a/workload/hadoop/2.10.1/name/conf/supervisor.ini b/workload/hadoop/2.10.1/name/conf/supervisor.ini
new file mode 100644
index 0000000..4b655de
--- /dev/null
+++ b/workload/hadoop/2.10.1/name/conf/supervisor.ini
@@ -0,0 +1,6 @@
+[program:hadoop]
+command=bash -c "/opt/hadoop/script/start.sh"
+process_name=%(program_name)s
+pidfile=/opt/hadoop/system/node.pid
+numprocs=1
+autorestart=false
diff --git a/workload/hadoop/2.10.1/name/script/start.sh b/workload/hadoop/2.10.1/name/script/start.sh
new file mode 100755
index 0000000..9d4e752
--- /dev/null
+++ b/workload/hadoop/2.10.1/name/script/start.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+
+if [ ! -d '/opt/hadoop/data/current' ]; then
+  /usr/local/lib/hadoop/bin/hdfs namenode -format hadoop-cluster
+fi
+/usr/local/lib/hadoop/sbin/hadoop-daemon.sh start namenode
diff --git a/workload/hadoop/2.10.1/name/script/stop.sh b/workload/hadoop/2.10.1/name/script/stop.sh
new file mode 100755
index 0000000..0dcbb17
--- /dev/null
+++ b/workload/hadoop/2.10.1/name/script/stop.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+
+/usr/local/lib/hadoop/sbin/hadoop-daemon.sh stop namenode
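
Note: a minimal HDFS smoke test once the stack is up, assuming the containers
were started with the Compose invocation shown earlier (illustrative only,
not part of the committed scripts; the report should list three live
datanodes):

  docker compose --env-file compose/DEV.env \
    -f compose/data-lake/hadoop/main.yml \
    exec hadoop-name-1 hdfs dfsadmin -report
  docker compose --env-file compose/DEV.env \
    -f compose/data-lake/hadoop/main.yml \
    exec hadoop-name-1 bash -c 'echo ok | hdfs dfs -put - /smoke-test && hdfs dfs -cat /smoke-test'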