From 29d661e61e7d39cd49b5ed07fbfa0910cad56b79 Mon Sep 17 00:00:00 2001 From: Chenlong Ma Date: Mon, 27 Nov 2023 11:19:56 +0800 Subject: [PATCH] KubeFATE support FATE v2.0.0-beta (#921) * Support FATE v2.0.0-beta Signed-off-by: Chenlong Ma * Update KubeFATE version Signed-off-by: Chenlong Ma * fix docker compose healthcheck Signed-off-by: Chenlong Ma * fix pulsar bug Signed-off-by: Chenlong Ma --------- Signed-off-by: Chenlong Ma --- build/ci/docker-deploy/docker_deploy.sh | 6 +- docker-deploy/.env | 33 ++- docker-deploy/README.md | 12 +- docker-deploy/README_zh.md | 12 +- docker-deploy/docker_deploy.sh | 2 +- docker-deploy/generate_config.sh | 117 ++++---- docker-deploy/parties.conf | 2 +- .../backends/eggroll/conf/README.md | 6 + .../backends/eggroll/conf/eggroll.properties | 4 +- .../backends/eggroll/conf/whitelist.json | 120 +++++++- .../docker-compose-eggroll.yml | 53 ++-- .../docker-compose-spark-slim.yml | 21 +- .../docker-compose-spark.yml | 58 ++-- .../public/fate_flow/conf/service_conf.yaml | 244 +++++++--------- docs/FATE_On_Spark_With_Pulsar.md | 4 +- docs/Manage_FATE_and_FATE-Serving_Version.md | 8 +- ...ster_in_One_Linux_Machine_with_MiniKube.md | 18 +- ...r_in_One_Linux_Machine_with_MiniKube_zh.md | 18 +- helm-charts/FATE/Chart.yaml | 4 +- .../eggroll/clustermanager/deployment.yaml | 2 +- .../eggroll/lb-rollsite/deployment.yaml | 4 +- .../eggroll/nodemanager/statefulSet.yaml | 2 +- .../backends/eggroll/rollsite/deployment.yaml | 2 +- .../backends/spark/hdfs/datanode.yaml | 2 +- .../backends/spark/hdfs/namenode.yaml | 4 +- .../backends/spark/nginx/deployment.yaml | 2 +- .../backends/spark/pulsar/statefulSet.yaml | 2 +- .../backends/spark/rabbitmq/deployment.yaml | 2 +- .../backends/spark/spark/deployment.yaml | 4 +- .../templates/core/client/statefulSet.yaml | 2 +- .../FATE/templates/core/fateboard.yaml | 2 +- .../templates/core/fateflow/configmap.yaml | 251 ++++++++--------- .../templates/core/mysql/statefulSet.yaml | 6 +- .../FATE/templates/core/python-spark.yaml | 29 +- helm-charts/FATE/values-template-example.yaml | 36 ++- helm-charts/FATE/values-template.yaml | 36 ++- helm-charts/FATE/values.yaml | 44 ++- helm-charts/Images_list.md | 2 +- k8s-deploy/README.md | 4 +- k8s-deploy/README_zh.md | 4 +- k8s-deploy/cluster-spark-pulsar.yaml | 263 ++++++++++-------- k8s-deploy/cluster-spark-rabbitmq.yaml | 152 ++++++---- k8s-deploy/cluster-spark-slim.yaml | 222 +++++++++------ k8s-deploy/cluster.yaml | 2 +- .../examples/party-10000/cluster-gpu.yaml | 2 +- .../cluster-spark-local-pulsar.yaml | 2 +- .../party-10000/cluster-spark-pulsar.yaml | 2 +- .../party-10000/cluster-spark-rabbitmq.yaml | 2 +- k8s-deploy/examples/party-10000/cluster.yaml | 2 +- .../examples/party-9999/cluster-gpu.yaml | 2 +- .../cluster-spark-local-pulsar.yaml | 2 +- .../party-9999/cluster-spark-pulsar.yaml | 2 +- .../party-9999/cluster-spark-rabbitmq.yaml | 2 +- k8s-deploy/examples/party-9999/cluster.yaml | 2 +- k8s-deploy/examples/party.config | 4 +- 55 files changed, 1078 insertions(+), 769 deletions(-) create mode 100644 docker-deploy/training_template/backends/eggroll/conf/README.md diff --git a/build/ci/docker-deploy/docker_deploy.sh b/build/ci/docker-deploy/docker_deploy.sh index b9532f916..3f55763fb 100644 --- a/build/ci/docker-deploy/docker_deploy.sh +++ b/build/ci/docker-deploy/docker_deploy.sh @@ -23,7 +23,7 @@ tar -xzf confs-${target_party_id}.tar cd confs-${target_party_id} docker compose down docker volume rm -f confs-${target_party_id}_shared_dir_examples -docker volume rm -f 
confs-${target_party_id}_shared_dir_federatedml +docker volume rm -f confs-${target_party_id}_shared_dir_fate # exclude client service to save time ! docker compose up -d @@ -47,8 +47,8 @@ for ((i = 1; i <= MAX_TRY; i++)); do result=$(docker ps | wc -l) if [ "${result}" -eq ${CONTAINER_NUM} ]; then echo "# containers are ok" - FATE_FLOW_STATUS=$(curl -s -X POST localhost:9380/v1/version/get) - success='"retmsg":"success"' + FATE_FLOW_STATUS=$(curl -s -X GET localhost:9380/v2/server/fateflow) + success='"message":"success"' result=$(echo $FATE_FLOW_STATUS | grep "${success}") if [[ "$result" != "" ]] then diff --git a/docker-deploy/.env b/docker-deploy/.env index 48a9ffbc4..810a3c092 100644 --- a/docker-deploy/.env +++ b/docker-deploy/.env @@ -1,5 +1,4 @@ RegistryURI= -TAG=1.11.2-release SERVING_TAG=2.1.6-release SSH_PORT=22 @@ -7,3 +6,35 @@ SSH_PORT=22 # RegistryURI: address of the local registry # TAG: tag of module images. # SSH_PORT: port of SSH, default 22 + + +KubeFATE_Version=v2.0.0-beta + +# components version + +FATEFlow_IMAGE="federatedai/fateflow" +FATEFlow_IMAGE_TAG="v2.0.0-beta" +FATEBoard_IMAGE="federatedai/fateboard" +FATEBoard_IMAGE_TAG="v2.0.0-beta" +MySQL_IMAGE="mysql" +MySQL_IMAGE_TAG="8.0.28" +Client_IMAGE="federatedai/client" +Client_IMAGE_TAG="v2.0.0-beta" + +EGGRoll_IMAGE="federatedai/eggroll" +EGGRoll_IMAGE_TAG="v2.0.0-beta" + +Nginx_IMAGE="federatedai/nginx" +Nginx_IMAGE_TAG="v2.0.0-beta" +RabbitMQ_IMAGE="federatedai/rabbitmq" +RabbitMQ_IMAGE_TAG="3.8.3-management" +Pulsar_IMAGE="federatedai/pulsar" +Pulsar_IMAGE_TAG="2.10.2" +Hadoop_NameNode_IMAGE="federatedai/hadoop-namenode" +Hadoop_NameNode_IMAGE_TAG="2.0.0-hadoop3.2.1-java8" +Hadoop_DataNode_IMAGE="federatedai/hadoop-datanode" +Hadoop_DataNode_IMAGE_TAG="2.0.0-hadoop3.2.1-java8" +Spark_Master_IMAGE="federatedai/spark-master" +Spark_Master_IMAGE_TAG="v2.0.0-beta" +Spark_Worker_IMAGE="federatedai/spark-worker" +Spark_Worker_IMAGE_TAG="v2.0.0-beta" \ No newline at end of file diff --git a/docker-deploy/README.md b/docker-deploy/README.md index d673cb483..22cde8dff 100644 --- a/docker-deploy/README.md +++ b/docker-deploy/README.md @@ -192,13 +192,13 @@ The output is shown as follows. 
If the status of each component is `Up`, and the ```bash NAME IMAGE COMMAND SERVICE CREATED STATUS PORTS -confs-10000-client-1 federatedai/client:1.11.2-release "bash -c 'pipeline i…" client About a minute ago Up About a minute 0.0.0.0:20000->20000/tcp, :::20000->20000/tcp -confs-10000-clustermanager-1 federatedai/eggroll:1.11.2-release "/tini -- bash -c 'j…" clustermanager About a minute ago Up About a minute 4670/tcp -confs-10000-fateboard-1 federatedai/fateboard:1.11.2-release "/bin/sh -c 'java -D…" fateboard About a minute ago Up About a minute 0.0.0.0:8080->8080/tcp, :::8080->8080/tcp -confs-10000-fateflow-1 federatedai/fateflow:1.11.2-release "/bin/bash -c 'set -…" fateflow About a minute ago Up About a minute (healthy) 0.0.0.0:9360->9360/tcp, :::9360->9360/tcp, 0.0.0.0:9380->9380/tcp, :::9380->9380/tcp +confs-10000-client-1 federatedai/client:v2.0.0-beta "bash -c 'pipeline i…" client About a minute ago Up About a minute 0.0.0.0:20000->20000/tcp, :::20000->20000/tcp +confs-10000-clustermanager-1 federatedai/eggroll:v2.0.0-beta "/tini -- bash -c 'j…" clustermanager About a minute ago Up About a minute 4670/tcp +confs-10000-fateboard-1 federatedai/fateboard:v2.0.0-beta "/bin/sh -c 'java -D…" fateboard About a minute ago Up About a minute 0.0.0.0:8080->8080/tcp, :::8080->8080/tcp +confs-10000-fateflow-1 federatedai/fateflow:v2.0.0-beta "/bin/bash -c 'set -…" fateflow About a minute ago Up About a minute (healthy) 0.0.0.0:9360->9360/tcp, :::9360->9360/tcp, 0.0.0.0:9380->9380/tcp, :::9380->9380/tcp confs-10000-mysql-1 mysql:8.0.28 "docker-entrypoint.s…" mysql About a minute ago Up About a minute 3306/tcp, 33060/tcp -confs-10000-nodemanager-1 federatedai/eggroll:1.11.2-release "/tini -- bash -c 'j…" nodemanager About a minute ago Up About a minute 4671/tcp -confs-10000-rollsite-1 federatedai/eggroll:1.11.2-release "/tini -- bash -c 'j…" rollsite About a minute ago Up About a minute 0.0.0.0:9370->9370/tcp, :::9370->9370/tcp +confs-10000-nodemanager-1 federatedai/eggroll:v2.0.0-beta "/tini -- bash -c 'j…" nodemanager About a minute ago Up About a minute 4671/tcp +confs-10000-rollsite-1 federatedai/eggroll:v2.0.0-beta "/tini -- bash -c 'j…" rollsite About a minute ago Up About a minute 0.0.0.0:9370->9370/tcp, :::9370->9370/tcp ``` ### Verifying the deployment diff --git a/docker-deploy/README_zh.md b/docker-deploy/README_zh.md index 5cabac41a..82b6c2178 100644 --- a/docker-deploy/README_zh.md +++ b/docker-deploy/README_zh.md @@ -231,13 +231,13 @@ docker compose ps ```bash NAME IMAGE COMMAND SERVICE CREATED STATUS PORTS -confs-10000-client-1 federatedai/client:1.11.2-release "bash -c 'pipeline i…" client About a minute ago Up About a minute 0.0.0.0:20000->20000/tcp, :::20000->20000/tcp -confs-10000-clustermanager-1 federatedai/eggroll:1.11.2-release "/tini -- bash -c 'j…" clustermanager About a minute ago Up About a minute 4670/tcp -confs-10000-fateboard-1 federatedai/fateboard:1.11.2-release "/bin/sh -c 'java -D…" fateboard About a minute ago Up About a minute 0.0.0.0:8080->8080/tcp, :::8080->8080/tcp -confs-10000-fateflow-1 federatedai/fateflow:1.11.2-release "/bin/bash -c 'set -…" fateflow About a minute ago Up About a minute (healthy) 0.0.0.0:9360->9360/tcp, :::9360->9360/tcp, 0.0.0.0:9380->9380/tcp, :::9380->9380/tcp +confs-10000-client-1 federatedai/client:v2.0.0-beta "bash -c 'pipeline i…" client About a minute ago Up About a minute 0.0.0.0:20000->20000/tcp, :::20000->20000/tcp +confs-10000-clustermanager-1 federatedai/eggroll:v2.0.0-beta "/tini -- bash -c 'j…" clustermanager About a 
minute ago Up About a minute 4670/tcp +confs-10000-fateboard-1 federatedai/fateboard:v2.0.0-beta "/bin/sh -c 'java -D…" fateboard About a minute ago Up About a minute 0.0.0.0:8080->8080/tcp, :::8080->8080/tcp +confs-10000-fateflow-1 federatedai/fateflow:v2.0.0-beta "/bin/bash -c 'set -…" fateflow About a minute ago Up About a minute (healthy) 0.0.0.0:9360->9360/tcp, :::9360->9360/tcp, 0.0.0.0:9380->9380/tcp, :::9380->9380/tcp confs-10000-mysql-1 mysql:8.0.28 "docker-entrypoint.s…" mysql About a minute ago Up About a minute 3306/tcp, 33060/tcp -confs-10000-nodemanager-1 federatedai/eggroll:1.11.2-release "/tini -- bash -c 'j…" nodemanager About a minute ago Up About a minute 4671/tcp -confs-10000-rollsite-1 federatedai/eggroll:1.11.2-release "/tini -- bash -c 'j…" rollsite About a minute ago Up About a minute 0.0.0.0:9370->9370/tcp, :::9370->9370/tcp +confs-10000-nodemanager-1 federatedai/eggroll:v2.0.0-beta "/tini -- bash -c 'j…" nodemanager About a minute ago Up About a minute 4671/tcp +confs-10000-rollsite-1 federatedai/eggroll:v2.0.0-beta "/tini -- bash -c 'j…" rollsite About a minute ago Up About a minute 0.0.0.0:9370->9370/tcp, :::9370->9370/tcp ``` ### 验证部署 diff --git a/docker-deploy/docker_deploy.sh b/docker-deploy/docker_deploy.sh index ddd4f3b24..d85ad3f23 100755 --- a/docker-deploy/docker_deploy.sh +++ b/docker-deploy/docker_deploy.sh @@ -165,7 +165,7 @@ tar -xzf confs-$target_party_id.tar cd confs-$target_party_id docker compose down docker volume rm -f confs-${target_party_id}_shared_dir_examples -docker volume rm -f confs-${target_party_id}_shared_dir_federatedml +docker volume rm -f confs-${target_party_id}_shared_dir_fate docker volume rm -f confs-${target_party_id}_sdownload_dir docker volume rm -f confs-${target_party_id}_fate_flow_logs diff --git a/docker-deploy/generate_config.sh b/docker-deploy/generate_config.sh index da81a7d4a..bb5d845ab 100755 --- a/docker-deploy/generate_config.sh +++ b/docker-deploy/generate_config.sh @@ -14,19 +14,18 @@ set -e BASEDIR=$(dirname "$0") -cd $BASEDIR +cd "$BASEDIR" WORKINGDIR=$(pwd) deploy_dir=/data/projects/fate # fetch fate-python image -source ${WORKINGDIR}/.env -source ${WORKINGDIR}/parties.conf +source "${WORKINGDIR}"/.env +source "${WORKINGDIR}"/parties.conf echo "Generate Config" echo "Info:" +echo " KubeFATE Version: ${KubeFATE_Version}" echo " RegistryURI: ${RegistryURI}" -echo " Tag: ${TAG}" -echo " Serving_Tag: ${SERVING_TAG}" echo " Computing: ${computing}" echo " Federation: ${federation}" echo " Storage: ${storage}" @@ -59,10 +58,10 @@ function CheckConfig(){ # Check config start computing_list="Eggroll Spark Spark_local" spark_federation_list="RabbitMQ Pulsar" - algorithm_list="Basic NN" + algorithm_list="Basic NN ALL" device_list="CPU IPCL GPU" - if ! `list_include_item "$computing_list" "$computing"`; then + if ! $(list_include_item "$computing_list" "$computing"); then echo "[ERROR]: Please check whether computing is one of $computing_list" exit 1 fi @@ -75,7 +74,7 @@ function CheckConfig(){ fi if [ $computing == "Spark" ]; then - if ! `list_include_item "$spark_federation_list" "$federation"`; then + if ! $(list_include_item "$spark_federation_list" "$federation"); then echo "[ERROR]: If you choose the Spark computing engine, the federation component must be Pulsar or RabbitMQ!" exit 1 fi @@ -85,23 +84,23 @@ function CheckConfig(){ fi fi - if [ $computing == "Spark_local" ]; then - if ! `list_include_item "$spark_federation_list" "$federation"`; then + if [ "$computing" == "Spark_local" ]; then + if ! 
$(list_include_item "$spark_federation_list" "$federation"); then echo "[ERROR]: If you choose the Spark_local computing engine, the federation component must be Pulsar or RabbitMQ!" exit 1 fi - if [ $storage != "LocalFS" ]; then + if [ "$storage" != "LocalFS" ]; then echo "[ERROR]: If you choose the Spark computing engine, the storage component must be LocalFS!" exit 1 fi fi - if ! `list_include_item "$algorithm_list" "$algorithm"`; then + if ! $(list_include_item "$algorithm_list" "$algorithm"); then echo "[ERROR]: Please check whether algorithm is one of $algorithm_list" exit 1 fi - if ! `list_include_item "$device_list" "$device"`; then + if ! $(list_include_item "$device_list" "$device"); then echo "[ERROR]: Please check whether algorithm is one of $device_list" exit 1 fi @@ -139,30 +138,30 @@ GenerateConfig() { eval fateboard_ip=fateboard eval fateboard_port=8080 - eval fateboard_username=${fateboard_username} - eval fateboard_password=${fateboard_password} + eval fateboard_username="${fateboard_username}" + eval fateboard_password="${fateboard_password}" eval fate_flow_ip=fateflow eval fate_flow_grpc_port=9360 eval fate_flow_http_port=9380 eval fml_agent_port=8484 - eval db_ip=${mysql_ip} - eval db_user=${mysql_user} - eval db_password=${mysql_password} - eval db_name=${mysql_db} - eval db_serverTimezone=${serverTimezone} + eval db_ip="${mysql_ip}" + eval db_user="${mysql_user}" + eval db_password="${mysql_password}" + eval db_name="${mysql_db}" + eval db_serverTimezone="${serverTimezone}" eval exchange_ip=${exchangeip} # gpu_count defaulet 1 - eval gpu_count=${gpu_count:-1} + eval gpu_count="${gpu_count:-1}" echo package $party_id start! - rm -rf confs-$party_id/ - mkdir -p confs-$party_id/confs - cp -r training_template/public/* confs-$party_id/confs/ + rm -rf confs-"$party_id"/ + mkdir -p confs-"$party_id"/confs + cp -r training_template/public/* confs-"$party_id"/confs/ # Generate confs packages @@ -206,10 +205,12 @@ GenerateConfig() { # federation if [ "$federation" == "RabbitMQ" ]; then cp -r training_template/backends/spark/rabbitmq confs-$party_id/confs/ - sed -i '200,214d' confs-$party_id/docker-compose.yml + # delete Pulsar spec + sed -i '203,217d' confs-"$party_id"/docker-compose.yml elif [ "$federation" == "Pulsar" ]; then cp -r training_template/backends/spark/pulsar confs-$party_id/confs/ - sed -i '181,198d' confs-$party_id/docker-compose.yml + # delete RabbitMQ spec + sed -i '184,201d' confs-"$party_id"/docker-compose.yml fi fi fi @@ -224,10 +225,10 @@ GenerateConfig() { # federation if [ "$federation" == "RabbitMQ" ]; then cp -r training_template/backends/spark/rabbitmq confs-$party_id/confs/ - sed -i '146,160d' confs-$party_id/docker-compose.yml + sed -i '149,163d' confs-$party_id/docker-compose.yml elif [ "$federation" == "Pulsar" ]; then cp -r training_template/backends/spark/pulsar confs-$party_id/confs/ - sed -i '128,144d' confs-$party_id/docker-compose.yml + sed -i '131,147d' confs-$party_id/docker-compose.yml fi fi fi @@ -261,11 +262,11 @@ GenerateConfig() { # eggroll or spark-worker if [ "$computing" == "Eggroll" ]; then - sed -i "s#image: \"federatedai/fateflow:\${TAG}\"#image: \"federatedai/fateflow${Suffix}:\${TAG}\"#g" ./confs-$party_id/docker-compose.yml - sed -i "s#image: \"federatedai/eggroll:\${TAG}\"#image: \"federatedai/eggroll${Suffix}:\${TAG}\"#g" ./confs-$party_id/docker-compose.yml + sed -i "s#image: \"\${FATEFlow_IMAGE}:\${FATEFlow_IMAGE_TAG}\"#image: \"\${FATEFlow_IMAGE}${Suffix}:\${FATEFlow_IMAGE_TAG}\"#g" 
./confs-"$party_id"/docker-compose.yml + sed -i "s#image: \"\${EGGRoll_IMAGE}:\${EGGRoll_IMAGE_TAG}\"#image: \"\${EGGRoll_IMAGE}${Suffix}:\${EGGRoll_IMAGE_TAG}\"#g" ./confs-"$party_id"/docker-compose.yml elif [ "$computing" == "Spark" ] || [ "$computing" == "Spark_local" ]; then - sed -i "s#image: \"federatedai/fateflow:\${TAG}\"#image: \"federatedai/fateflow-spark${Suffix}:\${TAG}\"#g" ./confs-$party_id/docker-compose.yml - sed -i "s#image: \"federatedai/spark-worker:\${TAG}\"#image: \"federatedai/spark-worker${Suffix}:\${TAG}\"#g" ./confs-$party_id/docker-compose.yml + sed -i "s#image: \"\${FATEFlow_IMAGE}:\${FATEFlow_IMAGE_TAG}\"#image: \"\${FATEFlow_IMAGE}-spark${Suffix}:\${FATEFlow_IMAGE_TAG}\"#g" ./confs-"$party_id"/docker-compose.yml + sed -i "s#image: \"\${Spark_Worker_IMAGE}:\${Spark_Worker_IMAGE_TAG}\"#image: \"\${Spark_Worker_IMAGE}${Suffix}:\${Spark_Worker_IMAGE_TAG}\"#g" ./confs-"$party_id"/docker-compose.yml fi # GPU @@ -287,12 +288,16 @@ GenerateConfig() { devices:\\ - driver: nvidia\\ count: $gpu_count\\ - capabilities: [gpu]" ./confs-$party_id/docker-compose.yml + capabilities: [gpu]" ./confs-"$party_id"/docker-compose.yml fi # RegistryURI if [ "$RegistryURI" != "" ]; then - sed -i 's#federatedai#${RegistryURI}/federatedai#g' ./confs-$party_id/docker-compose.yml - sed -i 's#image: "mysql:8"#image: ${RegistryURI}/federatedai/mysql:8#g' ./confs-$party_id/docker-compose.yml + + if [ "${RegistryURI: -1}" != "/" ]; then + RegistryURI="${RegistryURI}/" + fi + + sed -i "s#RegistryURI=.*#RegistryURI=${RegistryURI}/#g" ./confs-"$party_id"/.env fi # replace namenode in training_template/public/fate_flow/conf/service_conf.yaml @@ -301,33 +306,34 @@ GenerateConfig() { fi # update serving ip - sed -i "s/fate-serving/${serving_ip}/g" ./confs-$party_id/docker-compose.yml + sed -i "s/fate-serving/${serving_ip}/g" ./confs-"$party_id"/docker-compose.yml # update the path of shared_dir shared_dir="confs-${party_id}/shared_dir" # create directories - for value in "examples" "federatedml" "data"; do - mkdir -p ${shared_dir}/${value} + for value in "examples" "fate" "data"; do + mkdir -p "${shared_dir}"/${value} done - sed -i "s||${dir}/${shared_dir}|g" ./confs-$party_id/docker-compose.yml + sed -i "s||${dir}/${shared_dir}|g" ./confs-"$party_id"/docker-compose.yml # Start the general config rendering # fateboard - sed -i "s#^server.port=.*#server.port=${fateboard_port}#g" ./confs-$party_id/confs/fateboard/conf/application.properties - sed -i "s#^fateflow.url=.*#fateflow.url=http://${fate_flow_ip}:${fate_flow_http_port}#g" ./confs-$party_id/confs/fateboard/conf/application.properties - sed -i "s##${fateboard_username}#g" ./confs-$party_id/confs/fateboard/conf/application.properties - sed -i "s##${fateboard_password}#g" ./confs-$party_id/confs/fateboard/conf/application.properties - echo fateboard module of $party_id done! + sed -i "s#^server.port=.*#server.port=${fateboard_port}#g" ./confs-"$party_id"/confs/fateboard/conf/application.properties + sed -i "s#^fateflow.url=.*#fateflow.url=http://${fate_flow_ip}:${fate_flow_http_port}#g" ./confs-"$party_id"/confs/fateboard/conf/application.properties + sed -i "s##${fateboard_username}#g" ./confs-"$party_id"/confs/fateboard/conf/application.properties + sed -i "s##${fateboard_password}#g" ./confs-"$party_id"/confs/fateboard/conf/application.properties + echo fateboard module of "$party_id" done! 
# mysql - echo >./confs-$party_id/confs/mysql/init/insert-node.sql - echo "CREATE DATABASE IF NOT EXISTS ${db_name};" >>./confs-$party_id/confs/mysql/init/insert-node.sql - echo "CREATE DATABASE IF NOT EXISTS fate_flow;" >>./confs-$party_id/confs/mysql/init/insert-node.sql - echo "CREATE USER '${db_user}'@'%' IDENTIFIED BY '${db_password}';" >>./confs-$party_id/confs/mysql/init/insert-node.sql - echo "GRANT ALL ON *.* TO '${db_user}'@'%';" >>./confs-$party_id/confs/mysql/init/insert-node.sql + { + echo "CREATE DATABASE IF NOT EXISTS ${db_name};" + echo "CREATE DATABASE IF NOT EXISTS fate_flow;" + echo "CREATE USER '${db_user}'@'%' IDENTIFIED BY '${db_password}';" + echo "GRANT ALL ON *.* TO '${db_user}'@'%';" + } >> ./confs-"$party_id"/confs/mysql/init/insert-node.sql if [[ "$computing" == "Eggroll" ]]; then echo 'USE `'${db_name}'`;' >>./confs-$party_id/confs/mysql/init/insert-node.sql @@ -339,7 +345,7 @@ GenerateConfig() { echo mysql module of $party_id done! # fate_flow - sed -i "s/party_id:/party_id: ${party_id}/g" ./confs-$party_id/confs/fate_flow/conf/service_conf.yaml + sed -i "s/party_id:/party_id: \"${party_id}\"/g" ./confs-$party_id/confs/fate_flow/conf/service_conf.yaml sed -i "s/name: /name: '${db_name}'/g" ./confs-$party_id/confs/fate_flow/conf/service_conf.yaml sed -i "s/user: /user: '${db_user}'/g" ./confs-$party_id/confs/fate_flow/conf/service_conf.yaml sed -i "s/passwd: /passwd: '${db_password}'/g" ./confs-$party_id/confs/fate_flow/conf/service_conf.yaml @@ -348,13 +354,13 @@ GenerateConfig() { if [[ "$computing" == "Spark" ]] || [[ "$computing" == "Spark_local" ]] ; then - sed -i "s/proxy: rollsite/proxy: nginx/g" ./confs-$party_id/confs/fate_flow/conf/service_conf.yaml + sed -i "s/proxy_name: rollsite/proxy_name: nginx/g" ./confs-$party_id/confs/fate_flow/conf/service_conf.yaml sed -i "s/computing: eggroll/computing: spark/g" ./confs-$party_id/confs/fate_flow/conf/service_conf.yaml fi if [[ "$federation" == "Pulsar" ]]; then - sed -i "s/ federation: eggroll/ federation: pulsar/g" ./confs-$party_id/confs/fate_flow/conf/service_conf.yaml + sed -i "s/ federation: rollsite/ federation: pulsar/g" ./confs-$party_id/confs/fate_flow/conf/service_conf.yaml elif [[ "$federation" == "RabbitMQ" ]]; then - sed -i "s/ federation: eggroll/ federation: RabbitMQ/g" ./confs-$party_id/confs/fate_flow/conf/service_conf.yaml + sed -i "s/ federation: rollsite/ federation: rabbitmq/g" ./confs-$party_id/confs/fate_flow/conf/service_conf.yaml fi if [[ "$storage" == "HDFS" ]]; then @@ -497,11 +503,16 @@ $(for ((j = 0; j < ${#party_list[*]}; j++)); do echo "${party_list[${j}]}: host: ${party_ip_list[${j}]} port: 6650 + sslPort: 6651 + proxy: '' " done) ${party_id}: host: pulsar port: 6650 + sslPort: 6651 + proxy: "" + EOF fi diff --git a/docker-deploy/parties.conf b/docker-deploy/parties.conf index bc4b1fb1a..fd4a9f74e 100644 --- a/docker-deploy/parties.conf +++ b/docker-deploy/parties.conf @@ -19,7 +19,7 @@ algorithm=Basic device=CPU # spark and eggroll -compute_core=16 +compute_core=8 # You only need to configure this parameter when you want to use the GPU, the default value is 1 gpu_count=1 diff --git a/docker-deploy/training_template/backends/eggroll/conf/README.md b/docker-deploy/training_template/backends/eggroll/conf/README.md new file mode 100644 index 000000000..695221aa9 --- /dev/null +++ b/docker-deploy/training_template/backends/eggroll/conf/README.md @@ -0,0 +1,6 @@ +### WhiteList Configuration Description +The whitelist mechanism is for safe deserialization during federated network 
transmission. +Users can configure objects in the whitelist that allow deserialization. +The current community version provides general reference only, +users can maintain their own whitelist according to actual security requirements +by updating whitelist.json under this folder \ No newline at end of file diff --git a/docker-deploy/training_template/backends/eggroll/conf/eggroll.properties b/docker-deploy/training_template/backends/eggroll/conf/eggroll.properties index 4d7bb2f26..414e40d66 100644 --- a/docker-deploy/training_template/backends/eggroll/conf/eggroll.properties +++ b/docker-deploy/training_template/backends/eggroll/conf/eggroll.properties @@ -1,4 +1,5 @@ -# Copyright 2019-2023 VMware, Inc. +# +# Copyright (c) 2019 - now, Eggroll Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -73,7 +74,6 @@ eggroll.rollsite.route.table.path=conf/route_table.json eggroll.rollsite.route.table.key= eggroll.rollsite.route.table.whitelist=127.0.0.1 eggroll.rollsite.jvm.options=-XX:+UseG1GC -XX:+PrintGCDetails -XX:+PrintGCDateStamps -Xloggc:logs/eggroll/rollsite.gc.log - eggroll.rollsite.push.max.retry=3 eggroll.rollsite.push.long.retry=2 eggroll.rollsite.push.batches.per.stream=10 diff --git a/docker-deploy/training_template/backends/eggroll/conf/whitelist.json b/docker-deploy/training_template/backends/eggroll/conf/whitelist.json index 64aed27ca..9a8230fd1 100644 --- a/docker-deploy/training_template/backends/eggroll/conf/whitelist.json +++ b/docker-deploy/training_template/backends/eggroll/conf/whitelist.json @@ -2,7 +2,8 @@ "builtins": [ "int", "list", - "set" + "set", + "slice" ], "collections": [ "defaultdict", @@ -81,10 +82,6 @@ "federatedml.linear_model.linear_model_weight": [ "LinearModelWeights" ], - "federatedml.secureprotol.fate_paillier": [ - "PaillierPublicKey", - "PaillierEncryptedNumber" - ], "federatedml.secureprotol.fixedpoint": [ "FixedPointNumber" ], @@ -127,11 +124,122 @@ "torch._utils": [ "_rebuild_tensor_v2" ], + "torch.storage": [ + "_load_from_bytes" + ], "ipcl_python.bindings.ipcl_bindings": [ "ipclPublicKey" ], "ipcl_python.ipcl_python": [ "PaillierPublicKey", "PaillierEncryptedNumber" - ] + ], + "torch": [ + "Size" + ], + "fate.arch.tensor.storage.local.device.cpu.plain": [ + "_TorchStorage" + ], + "fate.arch.tensor.types._dtype": [ + "dtype" + ], + "fate.arch.tensor.types._shape": [ + "DAxis", + "Shape" + ], + "pandas.core.frame": [ + "DataFrame" + ], + "pandas.core.indexes.base": [ + "Index", + "_new_Index" + ], + "pandas.core.indexes.range": [ + "RangeIndex" + ], + "pandas.core.series": [ + "Series" + ], + "pandas.core.internals.managers": [ + "BlockManager", + "SingleBlockManager" + ], + "fate.arch.dataframe.manager.data_manager": [ + "DataManager" + ], + "fate.arch.dataframe.manager.schema_manager": [ + "SchemaManager", + "Schema" + ], + "fate.arch.dataframe.manager.block_manager":[ + "BlockManager", + "IndexBlock", + "BlockType", + "Int64Block", + "Float32Block", + "Float64Block", + "Int32Block", + "BoolBlock", + "NPObjectBlock", + "PHETensorBlock" + ], + "fate.arch.tensor.inside._op_quantile":[ + "GKSummary" + ], + "fate.arch.protocol.phe.paillier":[ + "Coder", + "SK", + "PK", + "evaluator" + ], + "fate.arch.protocol.phe.ou":[ + "Coder", + "SK", + "PK", + "evaluator" + ], + "fate.arch.tensor.phe._tensor":[ + "PHETensorEncoded", "PHETensor" + ], + "fate.arch.tensor.phe._keypair":[ + "PHETensorCoder" + ], + "fate_utils.quantile":[ + 
"QuantileSummaryStream" + ], + "fate_utils.paillier":[ + "Coder","Coders", "FixedpointVector", "PK", "FixedpointPaillierVector", "CiphertextVector","PlaintextVector" + ], + "fate_utils.ou":[ + "Coder", "Coders", "FixedpointVector", "PK", "FixedpointPaillierVector", "CiphertextVector","PlaintextVector" + ], + "fate.arch.unify._infra_def":[ + "device" + ], + "fate.arch.histogram._histogram_splits": [ + "HistogramSplits" + ], + "fate.arch.histogram.values._values": [ + "HistogramValuesContainer" + ], + "fate.arch.histogram.values._plain": [ + "HistogramPlainValues" + ], + "fate.arch.histogram.values._cipher":[ + "HistogramEncryptedValues" + ], + "fate.arch.protocol.phe.mock": [ + "PK", "SK", "FV", "EV", "Coder", "evaluator" + ], + "fate.arch.histogram.histogram":[ + "HistogramSplits", "HistogramPlainValues", "HistogramEncryptedValues" + ], + "torch":[ + "float32", + "int64", + "int32", + "device", + "float64", + "Size" + ] } diff --git a/docker-deploy/training_template/docker-compose-eggroll.yml b/docker-deploy/training_template/docker-compose-eggroll.yml index 68d53d6f2..8bdbf18da 100644 --- a/docker-deploy/training_template/docker-compose-eggroll.yml +++ b/docker-deploy/training_template/docker-compose-eggroll.yml @@ -26,12 +26,12 @@ volumes: type: none o: bind device: /examples - shared_dir_federatedml: + shared_dir_fate: driver: local driver_opts: type: none o: bind - device: /federatedml + device: /fate shared_dir_data: driver: local driver_opts: @@ -41,28 +41,28 @@ volumes: services: rollsite: - image: "federatedai/eggroll:${TAG}" + image: "${RegistryURI}${EGGRoll_IMAGE}:${EGGRoll_IMAGE_TAG}" restart: always ports: - "9370:9370" environment: PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION: python volumes: - - ./confs/eggroll/conf:/data/projects/fate/eggroll/conf + - ./confs/eggroll/conf:/data/projects/fate/eggroll/conf/ - /etc/localtime:/etc/localtime:ro - - shared_dir_federatedml:/data/projects/fate/fate/python/federatedml + - shared_dir_fate:/data/projects/fate/fate networks: - fate-network command: ["bash", "-c", "java -Dlog4j.configurationFile=$${EGGROLL_HOME}/conf/log4j2.properties -cp $${EGGROLL_HOME}/lib/*:$${EGGROLL_HOME}/conf/ com.webank.eggroll.rollsite.EggSiteBootstrap -c $${EGGROLL_HOME}/conf/eggroll.properties"] fateboard: - image: "federatedai/fateboard:${TAG}" + image: "${FATEBoard_IMAGE}:${FATEBoard_IMAGE_TAG}" restart: always ports: - "8080:8080" volumes: - ./confs/fateboard/conf:/data/projects/fate/fateboard/conf - - fate_flow_logs:/data/projects/fate/fateflow/logs + - fate_flow_logs:/data/projects/fate/fate_flow/logs - /etc/localtime:/etc/localtime:ro networks: - fate-network @@ -70,14 +70,14 @@ services: - fateflow clustermanager: - image: "federatedai/eggroll:${TAG}" + image: "${EGGRoll_IMAGE}:${EGGRoll_IMAGE_TAG}" restart: always expose: - 4670 volumes: - - ./confs/eggroll/conf:/data/projects/fate/eggroll/conf + - ./confs/eggroll/conf/:/data/projects/fate/eggroll/conf/ - /etc/localtime:/etc/localtime:ro - - shared_dir_federatedml:/data/projects/fate/fate/python/federatedml + - shared_dir_fate:/data/projects/fate/fate environment: PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION: python networks: @@ -85,26 +85,28 @@ services: command: ["bash", "-c", "java -Dlog4j.configurationFile=$${EGGROLL_HOME}/conf/log4j2.properties -cp $${EGGROLL_HOME}/lib/*: com.webank.eggroll.core.Bootstrap --bootstraps com.webank.eggroll.core.resourcemanager.ClusterManagerBootstrap -c $${EGGROLL_HOME}/conf/eggroll.properties -p 4670 -s 'EGGROLL_DEAMON'"] nodemanager: - image: "federatedai/eggroll:${TAG}" + 
image: "${EGGRoll_IMAGE}:${EGGRoll_IMAGE_TAG}" restart: always expose: - 4671 volumes: - - ./confs/eggroll/conf:/data/projects/fate/eggroll/conf + - ./confs/eggroll/conf:/data/projects/fate/eggroll/conf/ - ./confs/fate_flow/conf/service_conf.yaml:/data/projects/fate/conf/service_conf.yaml - ./shared_dir/data/nodemanager:/data/projects/fate/eggroll/data - /etc/localtime:/etc/localtime:ro - - shared_dir_federatedml:/data/projects/fate/fate/python/federatedml + - shared_dir_fate:/data/projects/fate/fate depends_on: - clustermanager networks: - fate-network + environment: + PYTHONPATH: /data/projects/fate/fate/python:/data/projects/fate/fate_flow/python:/data/projects/fate/fate_client/python:/data/projects/fate/eggroll/python cap_add: - SYS_PTRACE command: ["bash", "-c", "java -Dlog4j.configurationFile=$${EGGROLL_HOME}/conf/log4j2.properties -cp $${EGGROLL_HOME}/lib/*: com.webank.eggroll.core.Bootstrap --bootstraps com.webank.eggroll.core.resourcemanager.NodeManagerBootstrap -c $${EGGROLL_HOME}/conf/eggroll.properties -p 4671 -s 'EGGROLL_DEAMON'"] fateflow: - image: "federatedai/fateflow:${TAG}" + image: "${FATEFlow_IMAGE}:${FATEFlow_IMAGE_TAG}" environment: FATE_PROJECT_BASE: "/data/projects/fate" FATE_LOG_LEVEL: "DEBUG" @@ -113,15 +115,15 @@ services: - "9380:9380" restart: always volumes: - - shared_dir_federatedml:/data/projects/fate/fate/python/federatedml + - shared_dir_fate:/data/projects/fate/fate - shared_dir_examples:/data/projects/fate/examples - - download_dir:/data/projects/fate/python/download_dir - - fate_flow_logs:/data/projects/fate/fateflow/logs - - ./confs/fate_flow/conf/service_conf.yaml:/data/projects/fate/conf/service_conf.yaml - - ./confs/fate_flow/conf/pulsar_route_table.yaml:/data/projects/fate/conf/pulsar_route_table.yaml - - ./confs/fate_flow/conf/rabbitmq_route_table.yaml:/data/projects/fate/conf/rabbitmq_route_table.yaml + - download_dir:/data/projects/fate/fate/python/download_dir + - fate_flow_logs:/data/projects/fate/fate_flow/logs + - ./confs/fate_flow/conf/service_conf.yaml:/data/projects/fate/fate_flow/conf/service_conf.yaml + - ./confs/fate_flow/conf/pulsar_route_table.yaml:/data/projects/fate/fate_flow/conf/pulsar_route_table.yaml + - ./confs/fate_flow/conf/rabbitmq_route_table.yaml:/data/projects/fate/fate_flow/conf/rabbitmq_route_table.yaml - ./confs/eggroll/conf:/data/projects/fate/eggroll/conf - - ./shared_dir/data/model_local_cache:/data/projects/fate/fateflow/model_local_cache + - ./shared_dir/data/model_local_cache:/data/projects/fate/fate_flow/model_local_cache - /etc/localtime:/etc/localtime:ro depends_on: - mysql @@ -132,7 +134,7 @@ services: fate-network: ipv4_address: 192.167.0.100 healthcheck: - test: ["CMD", "curl", "-f", "-X POST", "http://192.167.0.100:9380/v1/version/get"] + test: ["CMD", "curl", "-f", "-X GET", "http://192.167.0.100:9380/v2/server/fateflow"] interval: 1m30s timeout: 10s retries: 3 @@ -142,10 +144,9 @@ services: - "-c" - | set -x - sleep 5 && python fateflow/python/fate_flow/fate_flow_server.py - + pip install cryptography && sleep 5 && python fate_flow/python/fate_flow/fate_flow_server.py --debug client: - image: "federatedai/client:${TAG}" + image: "${Client_IMAGE}:${Client_IMAGE_TAG}" ports: - "20000:20000" restart: always @@ -165,7 +166,7 @@ services: command: ["bash", "-c", "pipeline init --ip $${FATE_FLOW_IP} --port $${FATE_FLOW_PORT} && flow init --ip $${FATE_FLOW_IP} --port $${FATE_FLOW_PORT} && jupyter notebook --ip=0.0.0.0 --port=20000 --allow-root --debug --NotebookApp.notebook_dir='/data/projects/fate/' 
--no-browser --NotebookApp.token='' --NotebookApp.password=$${NOTEBOOK_HASHED_PASSWORD} "] mysql: - image: "mysql:8.0.28" + image: "${MySQL_IMAGE}:${MySQL_IMAGE_TAG}" expose: - 3306 volumes: diff --git a/docker-deploy/training_template/docker-compose-spark-slim.yml b/docker-deploy/training_template/docker-compose-spark-slim.yml index 690ef7eea..21d7459d0 100644 --- a/docker-deploy/training_template/docker-compose-spark-slim.yml +++ b/docker-deploy/training_template/docker-compose-spark-slim.yml @@ -27,12 +27,12 @@ volumes: type: none o: bind device: /examples - shared_dir_federatedml: + shared_dir_fate: driver: local driver_opts: type: none o: bind - device: /federatedml + device: /fate shared_dir_data: driver: local driver_opts: @@ -47,7 +47,7 @@ services: - "8080:8080" volumes: - ./confs/fateboard/conf:/data/projects/fate/fateboard/conf - - fate_flow_logs:/data/projects/fate/fateflow/logs + - fate_flow_logs:/data/projects/fate/fate_flow/logs - /etc/localtime:/etc/localtime:ro restart: always networks: @@ -63,21 +63,21 @@ services: - 9360:9360 volumes: - ./confs/spark/spark-defaults.conf:/data/projects/spark-3.1.3-bin-hadoop3.2/conf/spark-defaults.conf - - shared_dir_federatedml:/data/projects/fate/fate/python/federatedml + - shared_dir_fate:/data/projects/fate/fate - shared_dir_examples:/data/projects/fate/examples - download_dir:/data/projects/fate/python/download_dir - - fate_flow_logs:/data/projects/fate/fateflow/logs + - fate_flow_logs:/data/projects/fate/fate_flow/logs - ./confs/fate_flow/conf/service_conf.yaml:/data/projects/fate/conf/service_conf.yaml - ./confs/fate_flow/conf/pulsar_route_table.yaml:/data/projects/fate/conf/pulsar_route_table.yaml - ./confs/fate_flow/conf/rabbitmq_route_table.yaml:/data/projects/fate/conf/rabbitmq_route_table.yaml - ./confs/eggroll/conf:/data/projects/fate/eggroll/conf - - ./shared_dir/data/model_local_cache:/data/projects/fate/fateflow/model_local_cache + - ./shared_dir/data/model_local_cache:/data/projects/fate/fate_flow/model_local_cache - /etc/localtime:/etc/localtime:ro networks: fate-network: ipv4_address: 192.167.0.100 healthcheck: - test: ["CMD", "curl", "-f", "-X POST", "http://192.167.0.100:9380/v1/version/get"] + test: ["CMD", "curl", "-f", "-X GET", "http://192.167.0.100:9380/v2/server/fateflow"] interval: 1m30s timeout: 10s retries: 3 @@ -87,6 +87,9 @@ services: - "-c" - | set -x + sed -i "s/int(party.party_id)/str(party.party_id)/g" /data/projects/fate/fate/python/fate/arch/federation/pulsar/_federation.py + cp /data/projects/fate/fate_flow/conf/pulsar_route_table.yaml /data/projects/fate/fate_flow/pulsar_route_table.yaml + cp /data/projects/fate/fate_flow/conf/rabbitmq_route_table.yaml /data/projects/fate/fate_flow/rabbitmq_route_table.yaml sleep 5 && python fateflow/python/fate_flow/fate_flow_server.py environment: FATE_PROJECT_BASE: "/data/projects/fate" @@ -144,14 +147,14 @@ services: - fate-network pulsar: - image: "federatedai/pulsar:2.7.0" + image: "federatedai/pulsar:2.10.2" ports: - "6650:6650" - "6651:6651" - "8001:8080" volumes: - ./confs/pulsar/standalone.conf:/pulsar/conf/standalone.conf - - ./shared_dir/data/pulsar:/pulsar/data + # - ./shared_dir/data/pulsar:/pulsar/data - /etc/localtime:/etc/localtime:ro command: ["/bin/bash", "-c", "bin/pulsar standalone -nss"] diff --git a/docker-deploy/training_template/docker-compose-spark.yml b/docker-deploy/training_template/docker-compose-spark.yml index f4875c4d2..1d43f8648 100644 --- a/docker-deploy/training_template/docker-compose-spark.yml +++ 
b/docker-deploy/training_template/docker-compose-spark.yml @@ -17,6 +17,7 @@ networks: ipam: config: - subnet: 192.167.0.0/16 + volumes: fate_flow_logs: download_dir: @@ -26,12 +27,12 @@ volumes: type: none o: bind device: /examples - shared_dir_federatedml: + shared_dir_fate: driver: local driver_opts: type: none o: bind - device: /federatedml + device: /fate shared_dir_data: driver: local driver_opts: @@ -41,12 +42,12 @@ volumes: services: fateboard: - image: "federatedai/fateboard:${TAG}" + image: "${FATEBoard_IMAGE}:${FATEBoard_IMAGE_TAG}" ports: - "8080:8080" volumes: - ./confs/fateboard/conf:/data/projects/fate/fateboard/conf - - fate_flow_logs:/data/projects/fate/fateflow/logs + - fate_flow_logs:/data/projects/fate/fate_flow/logs - /etc/localtime:/etc/localtime:ro networks: - fate-network @@ -55,28 +56,28 @@ services: - fateflow fateflow: - image: "federatedai/fateflow:${TAG}" + image: "${FATEFlow_IMAGE}:${FATEFlow_IMAGE_TAG}" restart: always ports: - 9380:9380 - 9360:9360 volumes: - ./confs/spark/spark-defaults.conf:/data/projects/spark-3.1.3-bin-hadoop3.2/conf/spark-defaults.conf - - shared_dir_federatedml:/data/projects/fate/fate/python/federatedml + - shared_dir_fate:/data/projects/fate/fate - shared_dir_examples:/data/projects/fate/examples - - download_dir:/data/projects/fate/python/download_dir - - fate_flow_logs:/data/projects/fate/fateflow/logs - - ./confs/fate_flow/conf/service_conf.yaml:/data/projects/fate/conf/service_conf.yaml - - ./confs/fate_flow/conf/pulsar_route_table.yaml:/data/projects/fate/conf/pulsar_route_table.yaml - - ./confs/fate_flow/conf/rabbitmq_route_table.yaml:/data/projects/fate/conf/rabbitmq_route_table.yaml + - download_dir:/data/projects/fate/fate/python/download_dir + - fate_flow_logs:/data/projects/fate/fate_flow/logs + - ./confs/fate_flow/conf/service_conf.yaml:/data/projects/fate/fate_flow/conf/service_conf.yaml + - ./confs/fate_flow/conf/pulsar_route_table.yaml:/data/projects/fate/fate_flow/conf/pulsar_route_table.yaml + - ./confs/fate_flow/conf/rabbitmq_route_table.yaml:/data/projects/fate/fate_flow/conf/rabbitmq_route_table.yaml - ./confs/eggroll/conf:/data/projects/fate/eggroll/conf - - ./shared_dir/data/model_local_cache:/data/projects/fate/fateflow/model_local_cache + - ./shared_dir/data/model_local_cache:/data/projects/fate/fate_flow/model_local_cache - /etc/localtime:/etc/localtime:ro networks: fate-network: ipv4_address: 192.167.0.100 healthcheck: - test: ["CMD", "curl", "-f", "-X POST", "http://192.167.0.100:9380/v1/version/get"] + test: ["CMD", "curl", "-f", "-X GET", "http://192.167.0.100:9380/v2/server/fateflow"] interval: 1m30s timeout: 10s retries: 3 @@ -86,7 +87,10 @@ services: - "-c" - | set -x - sleep 5 && python fateflow/python/fate_flow/fate_flow_server.py + sed -i "s/int(party.party_id)/str(party.party_id)/g" /data/projects/fate/fate/python/fate/arch/federation/pulsar/_federation.py + cp /data/projects/fate/fate_flow/conf/pulsar_route_table.yaml /data/projects/fate/fate_flow/pulsar_route_table.yaml + cp /data/projects/fate/fate_flow/conf/rabbitmq_route_table.yaml /data/projects/fate/fate_flow/rabbitmq_route_table.yaml + sleep 5 && python fate_flow/python/fate_flow/fate_flow_server.py environment: FATE_PROJECT_BASE: "/data/projects/fate" FATE_FLOW_UPLOAD_MAX_NUM: "1000000" @@ -94,7 +98,7 @@ services: FATE_LOG_LEVEL: "INFO" namenode: - image: "federatedai/hadoop-namenode:2.0.0-hadoop3.2.1-java8" + image: "${Hadoop_NameNode_IMAGE}:${Hadoop_NameNode_IMAGE_TAG}" restart: always ports: - 9000:9000 @@ -111,7 +115,7 @@ services: - 
fate-network datanode-0: - image: "federatedai/hadoop-datanode:2.0.0-hadoop3.2.1-java8" + image: "${Hadoop_DataNode_IMAGE}:${Hadoop_DataNode_IMAGE_TAG}" restart: always volumes: - /etc/localtime:/etc/localtime:ro @@ -124,7 +128,7 @@ services: - fate-network datanode-1: - image: "federatedai/hadoop-datanode:2.0.0-hadoop3.2.1-java8" + image: "${Hadoop_DataNode_IMAGE}:${Hadoop_DataNode_IMAGE_TAG}" restart: always volumes: - /etc/localtime:/etc/localtime:ro @@ -137,7 +141,7 @@ services: - fate-network datanode-2: - image: "federatedai/hadoop-datanode:2.0.0-hadoop3.2.1-java8" + image: "${Hadoop_DataNode_IMAGE}:${Hadoop_DataNode_IMAGE_TAG}" restart: always volumes: - /etc/localtime:/etc/localtime:ro @@ -149,9 +153,8 @@ services: networks: - fate-network - spark-master: - image: "federatedai/spark-master:${TAG}" + image: "${Spark_Master_IMAGE}:${Spark_Master_IMAGE_TAG}" restart: always ports: - "8888:8080" @@ -164,7 +167,7 @@ services: - fate-network spark-worker: - image: "federatedai/spark-worker:${TAG}" + image: "${Spark_Worker_IMAGE}:${Spark_Worker_IMAGE_TAG}" restart: always depends_on: - spark-master @@ -179,7 +182,7 @@ services: - fate-network rabbitmq: - image: "federatedai/rabbitmq:3.8.3-management" + image: "${RabbitMQ_IMAGE}:${RabbitMQ_IMAGE_TAG}" ports: - "5672:5672" - "15672:15672" @@ -198,14 +201,15 @@ services: - fate-network pulsar: - image: "federatedai/pulsar:2.7.0" + image: "${Pulsar_IMAGE}:${Pulsar_IMAGE_TAG}" ports: - "6650:6650" - "6651:6651" - "8001:8080" + user: root volumes: - ./confs/pulsar/standalone.conf:/pulsar/conf/standalone.conf - - ./shared_dir/data/pulsar:/pulsar/data + # - ./shared_dir/data/pulsar:/pulsar/data - /etc/localtime:/etc/localtime:ro command: ["/bin/bash", "-c", "bin/pulsar standalone -nss"] @@ -214,7 +218,7 @@ services: - fate-network mysql: - image: "mysql:8.0.28" + image: "${MySQL_IMAGE}:${MySQL_IMAGE_TAG}" expose: - 3306 volumes: @@ -230,7 +234,7 @@ services: - SYS_NICE nginx: - image: "federatedai/nginx:${TAG}" + image: "${Nginx_IMAGE}:${Nginx_IMAGE_TAG}" ports: - 9300:9300 - 9310:9310 @@ -245,7 +249,7 @@ services: - fateflow client: - image: "federatedai/client:${TAG}" + image: "${Client_IMAGE}:${Client_IMAGE_TAG}" ports: - "20000:20000" restart: always diff --git a/docker-deploy/training_template/public/fate_flow/conf/service_conf.yaml b/docker-deploy/training_template/public/fate_flow/conf/service_conf.yaml index b37755c25..4f99c578b 100644 --- a/docker-deploy/training_template/public/fate_flow/conf/service_conf.yaml +++ b/docker-deploy/training_template/public/fate_flow/conf/service_conf.yaml @@ -1,172 +1,124 @@ -use_registry: false -use_deserialize_safe_module: false -dependent_distribution: false -encrypt_password: false -encrypt_module: fate_arch.common.encrypt_utils#pwdecrypt -private_key: -private_key_file: party_id: -hook_module: - client_authentication: fate_flow.hook.flow.client_authentication - site_authentication: fate_flow.hook.flow.site_authentication - permission: fate_flow.hook.flow.permission -hook_server_name: -authentication: - client: - switch: false - http_app_key: - http_secret_key: - site: - switch: false -permission: - switch: false - component: false - dataset: false +use_registry: false +encrypt: + key_0: + module: fate_flow.hub.encrypt.password_encrypt#pwdecrypt + # base on: fate_flow/conf/ + private_path: private_key.pem fateflow: - # you must set real ip address, 127.0.0.1 and 0.0.0.0 is not supported host: 192.167.0.100 http_port: 9380 grpc_port: 9360 - # when you have multiple fateflow server on one party, - # we 
suggest using nginx for load balancing. + proxy_name: rollsite nginx: host: http_port: grpc_port: - # use random instance_id instead of {host}:{http_port} - random_instance_id: false - - # support rollsite/nginx/fateflow as a coordination proxy - # rollsite support fate on eggroll, use grpc protocol - # nginx support fate on eggroll and fate on spark, use http or grpc protocol, default is http - # fateflow support fate on eggroll and fate on spark, use http protocol, but not support exchange network mode - - # format(proxy: rollsite) means rollsite use the rollsite configuration of fate_one_eggroll and nginx use the nginx configuration of fate_one_spark - # you also can customize the config like this(set fateflow of the opposite party as proxy): - # proxy: - # name: fateflow - # host: xx - # http_port: xx - # grpc_port: xx - proxy: rollsite - # support default/http/grpc - protocol: default - # It can also be specified in the job configuration using the federated_status_collect_type parameter - default_federated_status_collect_type: PULL database: - name: - user: - passwd: - host: - port: 3306 - max_connections: 100 - stale_timeout: 30 -zookeeper: - hosts: - - "serving-zookeeper:2181" - # use_acl: false - # user: fate - # password: fate - # engine services + engine: mysql + # encrypt passwd key + decrypt_key: + mysql: + name: + user: + passwd: + host: + port: 3306 + max_connections: 100 + stale_timeout: 30 + sqlite: + # default fate_flow/runtime/system_settings: SQLITE_PATH + # /xxx/xxx.sqlite + path: default_engines: computing: eggroll - federation: eggroll + federation: rollsite storage: eggroll -fate_on_standalone: - standalone: - cores_per_node: 20 - nodes: 1 -fate_on_eggroll: - clustermanager: - cores_per_node: 16 - nodes: 1 - rollsite: - host: rollsite - port: 9370 -fate_on_spark: - spark: - # default use SPARK_HOME environment variable - home: /data/projects/spark-3.1.3-bin-hadoop3.2/ - cores_per_node: 20 - nodes: 2 - linkis_spark: - cores_per_node: 20 - nodes: 2 - host: 127.0.0.1 - port: 9001 - token_code: MLSS - python_path: /data/projects/fate/python - hive: - host: 127.0.0.1 - port: 10000 - auth_mechanism: - username: - password: - linkis_hive: - host: 127.0.0.1 - port: 9001 - hdfs: - name_node: hdfs://namenode:9000 - # default / - path_prefix: - rabbitmq: - host: rabbitmq - mng_port: 15672 - port: 5672 - user: fate - password: fate - # default conf/rabbitmq_route_table.yaml - route_table: - # mode: replication / client, default: replication - mode: replication - max_message_size: 1048576 +default_provider: + name: fate + # version default: fateflow.env + version: + device: local +federation: pulsar: host: pulsar - mng_port: 8080 port: 6650 + mng_port: 8080 cluster: standalone - # all parties should use a same tenant tenant: fl-tenant - # message ttl in minutes - topic_ttl: 5 + topic_ttl: 30 # default conf/pulsar_route_table.yaml - route_table: + route_table: conf/pulsar_route_table.yaml # mode: replication / client, default: replication mode: replication max_message_size: 1048576 nginx: - host: nginx + host: nginx http_port: 9300 grpc_port: 9310 -# external services -fateboard: - host: fateboard - port: 8080 - -enable_model_store: false -model_store_address: - # use mysql as the model store engine -# storage: mysql -# database: fate_model -# user: fate -# password: fate -# host: 127.0.0.1 -# port: 3306 - # other optional configs send to the engine -# max_connections: 10 -# stale_timeout: 10 - - # use tencent cos as model store engine - storage: tencent_cos - Region: - SecretId: 
-  SecretKey:
-  Bucket:
-
-servings:
+    # http or grpc
+    protocol: http
+  rabbitmq:
+    host: rabbitmq
+    mng_port: 15672
+    port: 5672
+    user: fate
+    password: fate
+    # default conf/rabbitmq_route_table.yaml
+    route_table: conf/rabbitmq_route_table.yaml
+    # mode: replication / client, default: replication
+    mode: replication
+    max_message_size: 1048576
+  rollsite:
+    host: rollsite
+    port: 9370
+  osx:
+    host: osx
+    port: 9370
+computing:
+  standalone:
+    cores: 32
+  eggroll:
+    cores: 32
+    nodes: 2
+  spark:
+    # default use SPARK_HOME environment variable
+    home: /data/projects/spark-3.1.3-bin-hadoop3.2/
+    cores: 32
+storage:
+  hdfs:
+    name_node: hdfs://namenode:9000
+hook_module:
+  client_authentication: fate_flow.hook.flow.client_authentication
+  site_authentication: fate_flow.hook.flow.site_authentication
+  permission: fate_flow.hook.flow.permission
+authentication:
+  client: false
+  site: false
+  permission: false
+model_store:
+  engine: file
+  # encrypt passwd key
+  decrypt_key:
+  file:
+    # default fate_flow/runtime/system_settings: MODEL_STORE_PATH
+    path:
+  mysql:
+    name: fate_flow
+    user: fate
+    passwd: fate
+    host: 127.0.0.1
+    port: 3306
+    max_connections: 100
+    stale_timeout: 30
+  tencent_cos:
+    Region:
+    SecretId:
+    SecretKey:
+    Bucket:
+zookeeper:
   hosts:
-    - 127.0.0.1:8000
-fatemanager:
-  host: 127.0.0.1
-  port: 8001
-  federatedId: 0
+    - 127.0.0.1:2181
+  use_acl: true
+  user: fate
+  password: fate
\ No newline at end of file
diff --git a/docs/FATE_On_Spark_With_Pulsar.md b/docs/FATE_On_Spark_With_Pulsar.md
index 8559582ac..6b3b7e7aa 100644
--- a/docs/FATE_On_Spark_With_Pulsar.md
+++ b/docs/FATE_On_Spark_With_Pulsar.md
@@ -2,7 +2,7 @@
 
 ## Overview
 
-FATE supports using [Spark](https://spark.apache.org/) as a computing engine since v1.5.0 Along with Spark, it also requires HDFS and RabbitMQ as storage and transmission service respectively, to compose a functional FATE cluster. In v1.6.0, the FATE also supports to use [Pulsar](https://pulsar.apache.org/admin-rest-api/?version=2.7.0&apiversion=v2#tag/clusters) as the transmission engine, a user can switch the transmission engine easily. Ideally, the Pulsar provides better throughput and scalability, more importantly, organizations can compose FATE clusters of star network using Pulsar. The overall architecture of "FATE on Spark with Pulsar" is as the following diagram:
+FATE has supported using [Spark](https://spark.apache.org/) as a computing engine since v1.5.0. Along with Spark, it also requires HDFS and RabbitMQ as the storage and transmission services, respectively, to compose a functional FATE cluster. Since v1.6.0, FATE also supports using [Pulsar](https://pulsar.apache.org/admin-rest-api/?version=2.10.2&apiversion=v2#tag/clusters) as the transmission engine, and a user can switch the transmission engine easily. Ideally, Pulsar provides better throughput and scalability; more importantly, organizations can compose FATE clusters in a star network using Pulsar. The overall architecture of "FATE on Spark with Pulsar" is shown in the following diagram:
@@ -86,7 +86,7 @@ When submitting a task, the user can declare in the config file to use Pulsar as } ``` -Generally, there is no need to set such a configuration. As for the available parameters, please refer to the [`create_producer`](https://pulsar.apache.org/api/python/2.7.0-SNAPSHOT/#pulsar.Client.create_producer) and [`subscribe`](https://pulsar.apache.org/api/python/2.7.0-SNAPSHOT/#pulsar.Client.subscribe) methods in the Pulsar python client. +Generally, there is no need to set such a configuration. As for the available parameters, please refer to the [`create_producer`](https://pulsar.apache.org/api/python/2.10.2-SNAPSHOT/#pulsar.Client.create_producer) and [`subscribe`](https://pulsar.apache.org/api/python/2.10.2-SNAPSHOT/#pulsar.Client.subscribe) methods in the Pulsar python client. ## Deployment of Star Network diff --git a/docs/Manage_FATE_and_FATE-Serving_Version.md b/docs/Manage_FATE_and_FATE-Serving_Version.md index 5aaaf2e01..c8aaf3485 100644 --- a/docs/Manage_FATE_and_FATE-Serving_Version.md +++ b/docs/Manage_FATE_and_FATE-Serving_Version.md @@ -30,18 +30,18 @@ The chart can be downloaded in each KubeFATE release, with name `fate-{release_v Download it and copy it to the folder to upload. ``` -$ kubefate chart upload -f ./fate-v1.11.2.tgz +$ kubefate chart upload -f ./fate-v2.0.0-beta.tgz Upload file success $ kubefate chart ls UUID NAME VERSION APPVERSION -ca3f7843-749a-4f69-9f6b-4c544a7623ac fate v1.11.2 v1.11.2 +ca3f7843-749a-4f69-9f6b-4c544a7623ac fate v2.0.0-beta v2.0.0-beta ``` -Then, we can deploy the fate cluster of v1.11.2 version. The detail of cluster.yaml please refer to: [FATE Cluster Configuration](./configurations/FATE_cluster_configuration.md) +Then, we can deploy the fate cluster of v2.0.0-beta version. The detail of cluster.yaml please refer to: [FATE Cluster Configuration](./configurations/FATE_cluster_configuration.md) ``` chartName: fate -chartVersion: v1.11.2 +chartVersion: v2.0.0-beta ``` We can delete the chart with: diff --git a/docs/tutorials/Build_Two_Parties_FATE_Cluster_in_One_Linux_Machine_with_MiniKube.md b/docs/tutorials/Build_Two_Parties_FATE_Cluster_in_One_Linux_Machine_with_MiniKube.md index 34868d487..88c4c48f4 100644 --- a/docs/tutorials/Build_Two_Parties_FATE_Cluster_in_One_Linux_Machine_with_MiniKube.md +++ b/docs/tutorials/Build_Two_Parties_FATE_Cluster_in_One_Linux_Machine_with_MiniKube.md @@ -21,14 +21,14 @@ After the tutorial, the deployment architecture looks like the following diagram 5. Network connectivity to dockerhub or 163 Docker Image Registry, and google gcr. 6. Setup the global KubeFATE version using in the tutorial and create a folder for the whole tutorial. ``` -export fate_version=v1.11.2 && export kubefate_version=v1.4.5 && cd ~ && mkdir demo && cd demo +export fate_version=v2.0.0-beta && export kubefate_version=v1.4.5 && cd ~ && mkdir demo && cd demo ``` Notes: * When talking about KubeFATE version, usually there are 3 notions: * The KubeFATE CLI version, in this tutorial, it is v1.4.5. * The KubeFATE service version, in this tutorial, it is v1.4.5. - * The FATE version, in this tutorial, it is v1.11.2, it also means the version of the helm chart of FATE, currently we use this version to tag the KubeFATE GitHub master branch. + * The FATE version, in this tutorial, it is v2.0.0-beta, it also means the version of the helm chart of FATE, currently we use this version to tag the KubeFATE GitHub master branch. * **In this tutorial, the IP of the machine we used is 192.168.100.123. 
Please change it to your machine's IP in all the following commands and config files.
** # Start Tutorial @@ -87,7 +87,7 @@ When all the pods are in the ready state, it means your Kubernetes cluster is re ## Setup Kubefate ### Install KubeFATE CLI Go to [KubeFATE Release](https://github.com/FederatedAI/KubeFATE/releases), and find the latest kubefate-k8s release -pack, which is `v1.11.2` as set to ENVs before. (replace ${fate_version} with the newest version available) +pack, which is `v2.0.0-beta` as set to ENVs before. (replace ${fate_version} with the newest version available) ``` curl -LO https://github.com/FederatedAI/KubeFATE/releases/download/${fate_version}/kubefate-k8s-${fate_version}.tar.gz && tar -xzf ./kubefate-k8s-${fate_version}.tar.gz ``` @@ -256,7 +256,7 @@ For `/kubefate/examples/party-9999/cluster-spark-pulsar.yaml`, modify it as foll name: fate-9999 namespace: fate-9999 chartName: fate -chartVersion: v1.11.2 +chartVersion: v2.0.0-beta partyId: 9999 registry: "" pullPolicy: @@ -340,7 +340,7 @@ and for fate-10000: name: fate-10000 namespace: fate-10000 chartName: fate -chartVersion: v1.11.2 +chartVersion: v2.0.0-beta partyId: 10000 registry: "" pullPolicy: @@ -440,8 +440,8 @@ or watch the clusters till their STATUS changing to `Running`: ``` kubefate@machine:~/kubefate$ watch kubefate cluster ls UUID NAME NAMESPACE REVISION STATUS CHART ChartVERSION AGE -29878fa9-aeee-4ae5-a5b7-fd4e9eb7c1c3 fate-9999 fate-9999 1 Running fate v1.11.2 88s -dacc0549-b9fc-463f-837a-4e7316db2537 fate-10000 fate-10000 1 Running fate v1.11.2 69s +29878fa9-aeee-4ae5-a5b7-fd4e9eb7c1c3 fate-9999 fate-9999 1 Running fate v2.0.0-beta 88s +dacc0549-b9fc-463f-837a-4e7316db2537 fate-10000 fate-10000 1 Running fate v2.0.0-beta 69s ``` We have about 10G Docker images that need to be pulled, this step will take a while for the first time. An alternative way is offline loading the images to the local environment. @@ -479,13 +479,13 @@ UUID 29878fa9-aeee-4ae5-a5b7-fd4e9eb7c1c3 Name fate-9999 NameSpace fate-9999 ChartName fate -ChartVersion v1.11.2 +ChartVersion v2.0.0-beta Revision 1 Age 54m Status Running Spec algorithm: Basic chartName: fate - chartVersion: v1.11.2 + chartVersion: v2.0.0-beta computing: Spark device: CPU federation: Pulsar diff --git a/docs/tutorials/Build_Two_Parties_FATE_Cluster_in_One_Linux_Machine_with_MiniKube_zh.md b/docs/tutorials/Build_Two_Parties_FATE_Cluster_in_One_Linux_Machine_with_MiniKube_zh.md index 3e2ac76e1..9a458998b 100644 --- a/docs/tutorials/Build_Two_Parties_FATE_Cluster_in_One_Linux_Machine_with_MiniKube_zh.md +++ b/docs/tutorials/Build_Two_Parties_FATE_Cluster_in_One_Linux_Machine_with_MiniKube_zh.md @@ -17,14 +17,14 @@ 5. 要保证安装机器可以正常访问Docker Hub或者网易云镜像仓库,以及Google gcr; 6. 
预先创建一个目录,以便整个过程使用该目录作为工作目录,命令如下: ``` -export fate_version=v1.11.2 && export kubefate_version=v1.4.5 && cd ~ && mkdir demo && cd demo +export fate_version=v2.0.0-beta && export kubefate_version=v1.4.5 && cd ~ && mkdir demo && cd demo ``` Notes: * 当我们提到"KubeFATE的版本",通常来讲会有三个概念: * KubeFATE命令行工具的版本,在本教程中为v1.4.5。 * KubeFATE服务版本,在本教程中为v1.4.5。 - * FATE版本,在本教程中v1.11.2,它也意味着FATE的Helm Chart的版本, 值得注意的是我们用这个版本来给GitHub上的KubeFATE的发布打tag。 + * FATE版本,在本教程中v2.0.0-beta,它也意味着FATE的Helm Chart的版本, 值得注意的是我们用这个版本来给GitHub上的KubeFATE的发布打tag。 * **下文介绍的MiniKube机器IP地址是192.168.100.123。请修改为你准备的实验机器IP地址** # 开始安装 @@ -77,7 +77,7 @@ sudo minikube addons enable ingress ## 安装Kubefate ### 下载KubeFATE命令行工具 -我们从Github上 [KubeFATE Release](https://github.com/FederatedAI/KubeFATE/releases)页面找到Kuberetes部署的下载包,并下载对应版本,如前面环境变量设置`v1.11.2`, +我们从Github上 [KubeFATE Release](https://github.com/FederatedAI/KubeFATE/releases)页面找到Kuberetes部署的下载包,并下载对应版本,如前面环境变量设置`v2.0.0-beta`, ``` curl -LO https://github.com/FederatedAI/KubeFATE/releases/download/${fate_version}/kubefate-k8s-${fate_version}.tar.gz && tar -xzf ./kubefate-k8s-${fate_version}.tar.gz ``` @@ -237,7 +237,7 @@ kubectl -n fate-10000 create secret docker-registry myregistrykey \ name: fate-9999 namespace: fate-9999 chartName: fate -chartVersion: v1.11.2 +chartVersion: v2.0.0-beta partyId: 9999 registry: "" pullPolicy: @@ -322,7 +322,7 @@ pulsar: name: fate-10000 namespace: fate-10000 chartName: fate -chartVersion: v1.11.2 +chartVersion: v2.0.0-beta partyId: 10000 registry: "" pullPolicy: @@ -418,8 +418,8 @@ create job success, job id=7752db70-e368-41fa-8827-d39411728d1b ``` kubefate@machine:~/kubefate$ watch kubefate cluster ls UUID NAME NAMESPACE REVISION STATUS CHART ChartVERSION AGE -29878fa9-aeee-4ae5-a5b7-fd4e9eb7c1c3 fate-9999 fate-9999 1 Running fate v1.11.2 88s -dacc0549-b9fc-463f-837a-4e7316db2537 fate-10000 fate-10000 1 Running fate v1.11.2 69s +29878fa9-aeee-4ae5-a5b7-fd4e9eb7c1c3 fate-9999 fate-9999 1 Running fate v2.0.0-beta 88s +dacc0549-b9fc-463f-837a-4e7316db2537 fate-10000 fate-10000 1 Running fate v2.0.0-beta 69s ``` 因为这个步骤需要到网易云镜像仓库去下载约10G的镜像,所以第一次执行视乎你的网络情况需要一定时间。 检查下载的进度可以用 @@ -446,13 +446,13 @@ UUID 29878fa9-aeee-4ae5-a5b7-fd4e9eb7c1c3 Name fate-9999 NameSpace fate-9999 ChartName fate -ChartVersion v1.11.2 +ChartVersion v2.0.0-beta Revision 1 Age 54m Status Running Spec algorithm: Basic chartName: fate - chartVersion: v1.11.2 + chartVersion: v2.0.0-beta computing: Spark device: CPU federation: Pulsar diff --git a/helm-charts/FATE/Chart.yaml b/helm-charts/FATE/Chart.yaml index c4e6e52a1..1f025f22c 100644 --- a/helm-charts/FATE/Chart.yaml +++ b/helm-charts/FATE/Chart.yaml @@ -1,8 +1,8 @@ apiVersion: v1 -appVersion: v1.11.2 +appVersion: v2.0.0-beta description: A Helm chart for fate-training name: fate -version: v1.11.2 +version: v2.0.0-beta home: https://fate.fedai.org icon: https://aisp-1251170195.cos.ap-hongkong.myqcloud.com/wp-content/uploads/sites/12/2019/09/logo.png sources: diff --git a/helm-charts/FATE/templates/backends/eggroll/clustermanager/deployment.yaml b/helm-charts/FATE/templates/backends/eggroll/clustermanager/deployment.yaml index 6b21c3754..6ea2015e8 100644 --- a/helm-charts/FATE/templates/backends/eggroll/clustermanager/deployment.yaml +++ b/helm-charts/FATE/templates/backends/eggroll/clustermanager/deployment.yaml @@ -35,7 +35,7 @@ spec: - env: - name: PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION value: python - image: {{ .Values.image.registry }}/eggroll{{ include "images.eggroll.suffix" . 
}}:{{ .Values.image.tag }} + image: {{ .Values.image.registry }}{{ .Values.modules.clustermanager.image }}{{ include "images.eggroll.suffix" . }}:{{ .Values.modules.clustermanager.imageTag }} imagePullPolicy: {{ .Values.image.pullPolicy }} {{- if .Values.modules.clustermanager.resources}} resources: diff --git a/helm-charts/FATE/templates/backends/eggroll/lb-rollsite/deployment.yaml b/helm-charts/FATE/templates/backends/eggroll/lb-rollsite/deployment.yaml index 5764d6b76..3e21986fd 100644 --- a/helm-charts/FATE/templates/backends/eggroll/lb-rollsite/deployment.yaml +++ b/helm-charts/FATE/templates/backends/eggroll/lb-rollsite/deployment.yaml @@ -37,7 +37,7 @@ spec: - "rollsite" containers: - name: exchange - image: {{ .Values.image.registry }}/nginx:1.17 + image: {{ .Values.image.registry }}nginx:1.17 imagePullPolicy: {{ .Values.image.pullPolicy }} ports: - containerPort: 9390 @@ -52,7 +52,7 @@ spec: env: - name: PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION value: python - image: {{ .Values.image.registry }}/eggroll{{ include "images.eggroll.suffix" . }}:{{ .Values.image.tag }} + image: {{ .Values.image.registry }}{{ .Values.modules.lbrollsite.image }}:{{ .Values.modules.lbrollsite.imageTag }} imagePullPolicy: {{ .Values.image.pullPolicy }} {{- if .Values.modules.lbrollsite.resources}} resources: diff --git a/helm-charts/FATE/templates/backends/eggroll/nodemanager/statefulSet.yaml b/helm-charts/FATE/templates/backends/eggroll/nodemanager/statefulSet.yaml index 810c71ca1..88aabcd65 100644 --- a/helm-charts/FATE/templates/backends/eggroll/nodemanager/statefulSet.yaml +++ b/helm-charts/FATE/templates/backends/eggroll/nodemanager/statefulSet.yaml @@ -46,7 +46,7 @@ spec: - name: fluentd-conf subPath: fluent.conf mountPath: /fluentd/etc/fluent.conf - - image: {{ .Values.image.registry }}/eggroll{{ include "images.eggroll.suffix" . }}:{{ .Values.image.tag }} + - image: {{ .Values.image.registry }}{{ .Values.modules.nodemanager.image }}:{{ .Values.modules.nodemanager.imageTag }} imagePullPolicy: {{ .Values.image.pullPolicy }} {{- if .Values.modules.nodemanager.resources}} resources: diff --git a/helm-charts/FATE/templates/backends/eggroll/rollsite/deployment.yaml b/helm-charts/FATE/templates/backends/eggroll/rollsite/deployment.yaml index 5e611eb69..4ea8c4b87 100644 --- a/helm-charts/FATE/templates/backends/eggroll/rollsite/deployment.yaml +++ b/helm-charts/FATE/templates/backends/eggroll/rollsite/deployment.yaml @@ -40,7 +40,7 @@ spec: env: - name: PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION value: python - image: {{ .Values.image.registry }}/eggroll{{ include "images.eggroll.suffix" . 
}}:{{ .Values.image.tag }} + image: {{ .Values.image.registry }}{{ .Values.modules.rollsite.image }}:{{ .Values.modules.rollsite.imageTag }} imagePullPolicy: {{ .Values.image.pullPolicy }} {{- if .Values.modules.rollsite.resources}} resources: diff --git a/helm-charts/FATE/templates/backends/spark/hdfs/datanode.yaml b/helm-charts/FATE/templates/backends/spark/hdfs/datanode.yaml index 38af1162e..f842a96d8 100644 --- a/helm-charts/FATE/templates/backends/spark/hdfs/datanode.yaml +++ b/helm-charts/FATE/templates/backends/spark/hdfs/datanode.yaml @@ -32,7 +32,7 @@ spec: spec: containers: - name: datanode - image: {{ .Values.image.registry }}/hadoop-datanode:2.0.0-hadoop3.2.1-java8 + image: {{ .Values.image.registry }}{{ .Values.modules.hdfs.datanode.image }}:{{ .Values.modules.hdfs.datanode.imageTag }} imagePullPolicy: {{ .Values.image.pullPolicy }} env: - name: SERVICE_PRECONDITION diff --git a/helm-charts/FATE/templates/backends/spark/hdfs/namenode.yaml b/helm-charts/FATE/templates/backends/spark/hdfs/namenode.yaml index ec5c473fd..248e1d23a 100644 --- a/helm-charts/FATE/templates/backends/spark/hdfs/namenode.yaml +++ b/helm-charts/FATE/templates/backends/spark/hdfs/namenode.yaml @@ -41,7 +41,7 @@ spec: {{ end }} containers: - name: namenode - image: {{ .Values.image.registry }}/hadoop-namenode:2.0.0-hadoop3.2.1-java8 + image: {{ .Values.image.registry }}{{ .Values.modules.hdfs.namenode.image }}:{{ .Values.modules.hdfs.namenode.imageTag }} imagePullPolicy: {{ .Values.image.pullPolicy }} env: - name: CLUSTER_NAME @@ -130,5 +130,5 @@ spec: resources: requests: storage: {{ .Values.modules.hdfs.namenode.size }} - {{- end }} + {{- end }} {{- end }} \ No newline at end of file diff --git a/helm-charts/FATE/templates/backends/spark/nginx/deployment.yaml b/helm-charts/FATE/templates/backends/spark/nginx/deployment.yaml index 1f97865e8..4b3bc8956 100644 --- a/helm-charts/FATE/templates/backends/spark/nginx/deployment.yaml +++ b/helm-charts/FATE/templates/backends/spark/nginx/deployment.yaml @@ -33,7 +33,7 @@ spec: spec: containers: - name: nginx - image: {{ .Values.image.registry }}/nginx:{{ .Values.image.tag }} + image: {{ .Values.image.registry }}{{ .Values.modules.nginx.image }}:{{ .Values.modules.nginx.imageTag }} imagePullPolicy: {{ .Values.image.pullPolicy }} command: - /bin/bash diff --git a/helm-charts/FATE/templates/backends/spark/pulsar/statefulSet.yaml b/helm-charts/FATE/templates/backends/spark/pulsar/statefulSet.yaml index f1e5a99be..7b6172d40 100644 --- a/helm-charts/FATE/templates/backends/spark/pulsar/statefulSet.yaml +++ b/helm-charts/FATE/templates/backends/spark/pulsar/statefulSet.yaml @@ -32,7 +32,7 @@ spec: spec: containers: - name: pulsar - image: {{ .Values.image.registry }}/pulsar:2.10.1 + image: {{ .Values.image.registry }}{{ .Values.modules.pulsar.image }}:{{ .Values.modules.pulsar.imageTag }} imagePullPolicy: {{ .Values.image.pullPolicy }} {{- if .Values.modules.pulsar.resources}} resources: diff --git a/helm-charts/FATE/templates/backends/spark/rabbitmq/deployment.yaml b/helm-charts/FATE/templates/backends/spark/rabbitmq/deployment.yaml index 2ed1ba22a..c735b5677 100644 --- a/helm-charts/FATE/templates/backends/spark/rabbitmq/deployment.yaml +++ b/helm-charts/FATE/templates/backends/spark/rabbitmq/deployment.yaml @@ -33,7 +33,7 @@ spec: spec: containers: - name: rabbitmq - image: {{ .Values.image.registry }}/rabbitmq:3.8.3-management + image: {{ .Values.image.registry }}{{ .Values.modules.rabbitmq.image }}:{{ .Values.modules.rabbitmq.imageTag }} imagePullPolicy: {{ 
.Values.image.pullPolicy }} {{- if .Values.modules.rabbitmq.resources }} resources: diff --git a/helm-charts/FATE/templates/backends/spark/spark/deployment.yaml b/helm-charts/FATE/templates/backends/spark/spark/deployment.yaml index c38db9a6d..47adefbde 100644 --- a/helm-charts/FATE/templates/backends/spark/spark/deployment.yaml +++ b/helm-charts/FATE/templates/backends/spark/spark/deployment.yaml @@ -33,7 +33,7 @@ spec: spec: containers: - name: spark-master - image: {{ if .Values.modules.spark.master.Image }}{{ .Values.modules.spark.master.Image }}{{ else }}{{ .Values.image.registry }}/spark-master{{ end }}:{{ default .Values.image.tag .Values.modules.spark.master.ImageTag }} + image: {{ .Values.image.registry }}{{ .Values.modules.spark.master.image }}:{{ .Values.modules.spark.master.imageTag }} imagePullPolicy: {{ .Values.image.pullPolicy }} {{- if .Values.modules.spark.master.resources }} resources: @@ -121,7 +121,7 @@ spec: spec: containers: - name: spark-worker - image: {{ if .Values.modules.spark.worker.Image }}{{ .Values.modules.spark.worker.Image }}{{ else }}{{ .Values.image.registry }}/spark-worker{{ include "images.spark-worker.suffix" . }}{{ end }}:{{ default .Values.image.tag .Values.modules.spark.worker.ImageTag }} + image: {{ .Values.image.registry }}{{ .Values.modules.spark.worker.image }}{{ include "images.spark-worker.suffix" . }}:{{ .Values.modules.spark.worker.imageTag }} imagePullPolicy: {{ .Values.image.pullPolicy }} {{- if .Values.modules.spark.worker.resources }} resources: diff --git a/helm-charts/FATE/templates/core/client/statefulSet.yaml b/helm-charts/FATE/templates/core/client/statefulSet.yaml index 83f218db0..9675f4417 100644 --- a/helm-charts/FATE/templates/core/client/statefulSet.yaml +++ b/helm-charts/FATE/templates/core/client/statefulSet.yaml @@ -31,7 +31,7 @@ spec: {{ include "fate.labels" . 
| indent 8 }} spec: containers: - - image: {{ .Values.image.registry }}/client:{{ .Values.image.tag }} + - image: {{ .Values.image.registry }}{{ .Values.modules.client.image }}:{{ .Values.modules.client.imageTag }} imagePullPolicy: {{ .Values.image.pullPolicy }} name: client env: diff --git a/helm-charts/FATE/templates/core/fateboard.yaml b/helm-charts/FATE/templates/core/fateboard.yaml index 7c16c9e65..9e4234083 100644 --- a/helm-charts/FATE/templates/core/fateboard.yaml +++ b/helm-charts/FATE/templates/core/fateboard.yaml @@ -37,7 +37,7 @@ spec: spec: containers: {{- if .Values.modules.fateboard.include }} - - image: {{ .Values.image.registry }}/fateboard:{{ .Values.image.tag }} + - image: {{ .Values.image.registry }}{{ .Values.modules.fateboard.image }}:{{ .Values.modules.fateboard.imageTag }} imagePullPolicy: {{ .Values.image.pullPolicy }} name: fateboard ports: diff --git a/helm-charts/FATE/templates/core/fateflow/configmap.yaml b/helm-charts/FATE/templates/core/fateflow/configmap.yaml index 7ee41de3f..d5eec4623 100644 --- a/helm-charts/FATE/templates/core/fateflow/configmap.yaml +++ b/helm-charts/FATE/templates/core/fateflow/configmap.yaml @@ -42,130 +42,61 @@ data: spark.pyspark.driver.python python {{- end }} service_conf.yaml: | - use_registry: {{ .Values.modules.serving.useRegistry | default false }} - use_deserialize_safe_module: false - dependent_distribution: {{ .Values.modules.python.dependent_distribution | default false }} - encrypt_password: false - encrypt_module: fate_arch.common.encrypt_utils#pwdecrypt - private_key: - private_key_file: - party_id: {{ .Values.partyId }} - hook_module: - client_authentication: fate_flow.hook.flow.client_authentication - site_authentication: fate_flow.hook.flow.site_authentication - permission: fate_flow.hook.flow.permission - hook_server_name: - authentication: - client: - switch: false - http_app_key: - http_secret_key: - site: - switch: false - permission: - switch: false - component: false - dataset: false + party_id: {{ .Values.partyId | quote }} + use_registry: false + encrypt: + key_0: + module: fate_flow.hub.encrypt.password_encrypt#pwdecrypt + # base on: fate_flow/conf/ + private_path: private_key.pem fateflow: - # you must set real ip address, 127.0.0.1 and 0.0.0.0 is not supported host: fateflow_ip http_port: 9380 grpc_port: 9360 - # when you have multiple fateflow server on one party, - # we suggest using nginx for load balancing. - nginx: - # under K8s mode, 'fateflow' is the service name, which will be a L4 load balancer. 
- host: fateflow - http_port: 9380 - grpc_port: 9360 - # use random instance_id instead of {host}:{http_port} - random_instance_id: false - - # support rollsite/nginx/fateflow as a coordination proxy - # rollsite support fate on eggroll, use grpc protocol - # nginx support fate on eggroll and fate on spark, use http or grpc protocol, default is http - # fateflow support fate on eggroll and fate on spark, use http protocol, but not support exchange network mode - - # format(proxy: rollsite) means rollsite use the rollsite configuration of fate_one_eggroll and nginx use the nginx configuration of fate_one_spark - # you also can customize the config like this(set fateflow of the opposite party as proxy): - # proxy: - # name: fateflow - # host: xx - # http_port: xx - # grpc_port: xx + # proxy_name: rollsite {{- if eq .Values.computing "Spark" "Spark_local" }} - proxy: nginx + proxy_name: nginx {{- else }} - proxy: rollsite + proxy_name: rollsite {{- end }} - # support default/http/grpc - protocol: default - # It can also be specified in the job configuration using the federated_status_collect_type parameter - default_federated_status_collect_type: PULL + nginx: + host: + http_port: + grpc_port: database: - name: '{{ .Values.externalMysqlDatabase | default .Values.modules.mysql.database | default "eggroll_meta" }}' - user: '{{ .Values.externalMysqlUser | default .Values.modules.mysql.user | default "fate" }}' - passwd: '{{ .Values.externalMysqlPassword | default .Values.modules.mysql.password | default "fate_dev" }}' - host: '{{ .Values.externalMysqlIp | default .Values.modules.mysql.ip | default "mysql" }}' - port: {{ .Values.externalMysqlPort | default .Values.modules.mysql.port | default "3306" }} - max_connections: 100 - stale_timeout: 30 + engine: mysql + # encrypt passwd key + decrypt_key: + mysql: + name: '{{ .Values.externalMysqlDatabase | default .Values.modules.mysql.database | default "eggroll_meta" }}' + user: '{{ .Values.externalMysqlUser | default .Values.modules.mysql.user | default "fate" }}' + passwd: '{{ .Values.externalMysqlPassword | default .Values.modules.mysql.password | default "fate_dev" }}' + host: '{{ .Values.externalMysqlIp | default .Values.modules.mysql.ip | default "mysql" }}' + port: {{ .Values.externalMysqlPort | default .Values.modules.mysql.port | default "3306" }} + max_connections: 100 + stale_timeout: 30 + sqlite: + # default fate_flow/runtime/system_settings: SQLITE_PATH + # /xxx/xxx.sqlite + path: default_engines: {{- if eq .Values.computing "Spark_local" }} computing: "spark" {{- else }} computing: {{ .Values.computing | lower }} {{- end }} + {{- if eq .Values.computing "Eggroll" }} + federation: "rollsite" + {{- else }} federation: {{ .Values.federation | lower }} + {{- end }} storage: {{ .Values.storage | lower }} - fate_on_standalone: - standalone: - cores_per_node: 20 - nodes: 1 - fate_on_eggroll: - clustermanager: - cores_per_node: {{ .Values.modules.python.clustermanager.cores_per_node | default 16 }} - nodes: {{ .Values.modules.python.clustermanager.nodes | default 2 }} - rollsite: - host: {{ .Values.modules.rollsite.ip }} - port: 9370 - fate_on_spark: - spark: - # default use SPARK_HOME environment variable - home: /data/projects/spark-3.1.3-bin-hadoop3.2/ - cores_per_node: {{ .Values.modules.python.spark.cores_per_node }} - nodes: {{ .Values.modules.python.spark.nodes }} - linkis_spark: - cores_per_node: 20 - nodes: 2 - host: 127.0.0.1 - port: 9001 - token_code: MLSS - python_path: /data/projects/fate/python - hive: - host: {{ 
.Values.modules.python.hive.host }} - port: {{ .Values.modules.python.hive.port }} - auth_mechanism: {{ .Values.modules.python.hive.auth_mechanism }} - username: {{ .Values.modules.python.hive.username }} - password: {{ .Values.modules.python.hive.password }} - linkis_hive: - host: 127.0.0.1 - port: 9001 - hdfs: - name_node: {{ .Values.modules.python.hdfs.name_node | default "hdfs://namenode:9000" }} - # default / - path_prefix: {{ .Values.modules.python.hdfs.path_prefix }} - rabbitmq: - host: {{ .Values.modules.python.rabbitmq.host }} - mng_port: {{ .Values.modules.python.rabbitmq.mng_port }} - port: {{ .Values.modules.python.rabbitmq.port }} - user: {{ .Values.modules.python.rabbitmq.user }} - password: {{ .Values.modules.python.rabbitmq.password }} - # default conf/rabbitmq_route_table.yaml - route_table: conf/rabbitmq_route_table/rabbitmq_route_table.yaml - # mode: replication / client, default: replication - mode: replication - max_message_size: 1048576 + default_provider: + name: fate + # version default: fateflow.env + version: + device: local + federation: pulsar: host: {{ .Values.modules.python.pulsar.host }} port: {{ .Values.modules.python.pulsar.port }} @@ -182,28 +113,66 @@ data: host: {{ .Values.modules.python.nginx.host }} http_port: {{ .Values.modules.python.nginx.http_port }} grpc_port: {{ .Values.modules.python.nginx.grpc_port }} - fateboard: - host: fateboard - port: 8080 - - enable_model_store: false - model_store_address: - # use mysql as the model store engine - # storage: mysql - # database: {{ .Values.externalMysqlDatabase | default .Values.modules.mysql.database | default "eggroll_meta" }} - # host: '{{ .Values.externalMysqlIp | default .Values.modules.mysql.ip | default "mysql" }}' - # port: {{ .Values.externalMysqlPort | default .Values.modules.mysql.port | default "3306" }} - # user: '{{ .Values.externalMysqlUser | default .Values.modules.mysql.user | default "fate" }}' - # password: '{{ .Values.externalMysqlPassword | default .Values.modules.mysql.password | default "fate_dev" }}' - # max_connections: 10 - # stale_timeout: 10 - - # use tencent cos as model store engine - storage: tencent_cos - Region: - SecretId: - SecretKey: - Bucket: + # http or grpc + protocol: http + rabbitmq: + host: {{ .Values.modules.python.rabbitmq.host }} + mng_port: {{ .Values.modules.python.rabbitmq.mng_port }} + port: {{ .Values.modules.python.rabbitmq.port }} + user: {{ .Values.modules.python.rabbitmq.user }} + password: {{ .Values.modules.python.rabbitmq.password }} + # default conf/rabbitmq_route_table.yaml + route_table: conf/rabbitmq_route_table/rabbitmq_route_table.yaml + # mode: replication / client, default: replication + mode: replication + max_message_size: 1048576 + rollsite: + host: rollsite + port: 9370 + osx: + host: osx + port: 9370 + computing: + standalone: + cores: 32 + eggroll: + cores: 32 + nodes: 2 + spark: + # default use SPARK_HOME environment variable + home: /data/projects/spark-3.1.3-bin-hadoop3.2/ + cores: 32 + storage: + hdfs: + name_node: {{ .Values.modules.python.hdfs.name_node | default "hdfs://namenode:9000" }} + hook_module: + client_authentication: fate_flow.hook.flow.client_authentication + site_authentication: fate_flow.hook.flow.site_authentication + permission: fate_flow.hook.flow.permission + authentication: + client: false + site: false + permission: false + model_store: + engine: file + # encrypt passwd key + decrypt_key: + file: + # default fate_flow/runtime/system_settings: MODEL_STORE_PATH + path: + mysql: + name: fate_flow + user: fate + 
passwd: fate + host: 127.0.0.1 + port: 3306 + max_connections: 100 + stale_timeout: 30 + tencent_cos: + Region: + SecretId: + SecretKey: + Bucket: {{- with .Values.modules.serving }} servings: hosts: @@ -234,23 +203,29 @@ metadata: {{ include "fate.labels" . | indent 4 }} data: pulsar_route_table.yaml: | - {{- with .Values.modules.pulsar.exchange }} - default: - proxy: "{{ .ip }}:{{ .port }}" - domain: "{{ .domain }}" - {{- end }} - {{- if .Values.modules.pulsar.route_table }} + {{- if .Values.modules.pulsar.route_table }} {{- range $key, $val := .Values.modules.pulsar.route_table }} {{ $key }}: {{ toYaml . | indent 6 }} {{- end }} - {{- else }} + {{- else }} {{ .Values.partyId }}: host: pulsar port: 6650 sslPort: 6651 proxy: "" - {{- end}} + {{- end}} + {{- with .Values.modules.pulsar.exchange }} + default: + proxy: "{{ .ip }}:{{ .port }}" + domain: "{{ .domain }}" + {{- else }} + default: + proxy: "proxy.fate.org:443" + domain: "fate.org" + brokerPort: 6650 + brokerSslPort: 6651 + {{- end }} --- kind: ConfigMap apiVersion: v1 diff --git a/helm-charts/FATE/templates/core/mysql/statefulSet.yaml b/helm-charts/FATE/templates/core/mysql/statefulSet.yaml index 397bd56e5..d6822515b 100644 --- a/helm-charts/FATE/templates/core/mysql/statefulSet.yaml +++ b/helm-charts/FATE/templates/core/mysql/statefulSet.yaml @@ -31,11 +31,7 @@ spec: {{ include "fate.labels" . | indent 8 }} spec: containers: - {{- if .Values.image.isThridParty }} - - image: {{ .Values.image.registry }}/mysql:8.0.28 - {{- else }} - - image: mysql:8.0.28 - {{- end }} + - image: {{ .Values.image.registry }}{{ .Values.modules.mysql.image }}:{{ .Values.modules.mysql.imageTag }} imagePullPolicy: {{ .Values.image.pullPolicy }} name: mysql securityContext: diff --git a/helm-charts/FATE/templates/core/python-spark.yaml b/helm-charts/FATE/templates/core/python-spark.yaml index 893c98e37..777c4ee93 100644 --- a/helm-charts/FATE/templates/core/python-spark.yaml +++ b/helm-charts/FATE/templates/core/python-spark.yaml @@ -37,11 +37,7 @@ spec: {{- if .Values.istio.enabled }} {{- else }} initContainers: - {{- if .Values.image.isThridParty }} - - image: {{ .Values.image.registry }}/mysql:8.0.28 - {{- else }} - - image: mysql:8.0.28 - {{- end }} + - image: {{ .Values.image.registry }}{{ .Values.modules.mysql.image }}:{{ .Values.modules.mysql.imageTag }} imagePullPolicy: {{ .Values.image.pullPolicy }} name: ping-mysql env: @@ -79,7 +75,7 @@ spec: {{- end }} containers: - name: fateflow - image: {{ .Values.image.registry }}/fateflow{{ include "images.fateflow.suffix" . }}:{{ .Values.image.tag }} + image: {{ .Values.image.registry }}{{ .Values.modules.python.image }}{{ include "images.fateflow.suffix" . 
}}:{{ .Values.modules.python.imageTag }} imagePullPolicy: {{ .Values.image.pullPolicy }} {{- if .Values.modules.python.resources}} resources: @@ -118,14 +114,19 @@ spec: - | set -x mkdir -p /data/projects/fate/conf/ - cp /data/projects/fate/conf-tmp/service_conf.yaml /data/projects/fate/conf/service_conf.yaml + cp /data/projects/fate/conf-tmp/service_conf.yaml /data/projects/fate/fate_flow/conf/service_conf.yaml # fix fateflow conf must use IP - sed -i "s/host: fateflow_ip/host: ${POD_IP}/g" /data/projects/fate/conf/service_conf.yaml + sed -i "s/host: fateflow_ip/host: ${POD_IP}/g" /data/projects/fate/fate_flow/conf/service_conf.yaml cp /data/projects/spark-3.1.3-bin-hadoop3.2/conf/spark-defaults-template.conf /data/projects/spark-3.1.3-bin-hadoop3.2/conf/spark-defaults.conf sed -i "s/fateflow/${POD_IP}/g" /data/projects/spark-3.1.3-bin-hadoop3.2/conf/spark-defaults.conf - sleep 5 && python fateflow/python/fate_flow/fate_flow_server.py + sed -i "s/int(party.party_id)/str(party.party_id)/g" /data/projects/fate/fate/python/fate/arch/federation/pulsar/_federation.py + + cp /data/projects/fate/fate_flow/conf/pulsar_route_table/pulsar_route_table.yaml /data/projects/fate/fate_flow/pulsar_route_table.yaml + cp /data/projects/fate/fate_flow/conf/rabbitmq_route_table/rabbitmq_route_table.yaml /data/projects/fate/fate_flow/rabbitmq_route_table.yaml + + sleep 5 && python fate_flow/python/fate_flow/fate_flow_server.py livenessProbe: tcpSocket: port: 9380 @@ -154,7 +155,7 @@ spec: subPath: eggroll.properties {{- end }} - name: python-data - mountPath: /data/projects/fate/fateflow/logs + mountPath: /data/projects/fate/fate_flow/logs subPath: logs - mountPath: /data/projects/fate/conf-tmp/ name: python-confs @@ -162,17 +163,17 @@ spec: name: python-confs subPath: spark-defaults.conf {{- if eq .Values.federation "RabbitMQ" }} - - mountPath: /data/projects/fate/conf/rabbitmq_route_table + - mountPath: /data/projects/fate/fate_flow/conf/rabbitmq_route_table name: rabbitmq-route-table {{- end }} {{- if eq .Values.federation "Pulsar" }} - - mountPath: /data/projects/fate/conf/pulsar_route_table + - mountPath: /data/projects/fate/fate_flow/conf/pulsar_route_table name: pulsar-route-table {{- end }} - - mountPath: /data/projects/fate/fateflow/jobs + - mountPath: /data/projects/fate/fate_flow/jobs name: python-data subPath: jobs - - mountPath: /data/projects/fate/fateflow/model_local_cache + - mountPath: /data/projects/fate/fate_flow/model_local_cache name: python-data subPath: model-local-cache - mountPath: /data/projects/fate/llm diff --git a/helm-charts/FATE/values-template-example.yaml b/helm-charts/FATE/values-template-example.yaml index 66d71788b..c294f634e 100644 --- a/helm-charts/FATE/values-template-example.yaml +++ b/helm-charts/FATE/values-template-example.yaml @@ -1,7 +1,7 @@ name: fate-9999 namespace: fate-9999 chartName: fate -chartVersion: v1.11.2 +chartVersion: v2.0.0-beta partyId: 9999 registry: "" pullPolicy: @@ -65,6 +65,8 @@ skippedKeys: # - name: party9999.pulsar.example.com # rollsite: + # image: "federatedai/eggroll" + # imageTag: "v2.0.0-beta" # type: NodePort # nodePort: 30091 # loadBalancerIP: @@ -103,6 +105,8 @@ skippedKeys: # lbrollsite: + # image: "federatedai/eggroll" + # imageTag: "v2.0.0-beta" # type: NodePort # nodePort: 30091 # loadBalancerIP: @@ -123,6 +127,8 @@ skippedKeys: # memory: "1Gi" # nodemanager: + # image: "federatedai/eggroll" + # imageTag: "v2.0.0-beta" # replicas: 2 # sessionProcessorsPerNode: 4 # nodeSelector: @@ -142,6 +148,8 @@ skippedKeys: # memory: "8Gi" # 
clustermanager: + # image: "federatedai/eggroll" + # imageTag: "v2.0.0-beta" # nodeSelector: # tolerations: # affinity: @@ -156,6 +164,8 @@ skippedKeys: # python: + # image: "federatedai/fateflow" + # imageTag: "v2.0.0-beta" # type: NodePort # replicas: 1 # httpNodePort: 30097 @@ -221,6 +231,8 @@ skippedKeys: # password: # fateboard: + # image: "federatedai/fateboard" + # imageTag: "v2.0.0-beta" # type: ClusterIP # username: admin # password: admin @@ -229,6 +241,8 @@ skippedKeys: # affinity: # client: + # image: "federatedai/client" + # imageTag: "v2.0.0-beta" # nodeSelector: # subPath: "" # existingClaim: "" @@ -238,6 +252,8 @@ skippedKeys: # notebook_hashed_password: "" # mysql: + # image: "mysql" + # imageTag: "8.0.28" # nodeSelector: # tolerations: # affinity: @@ -274,8 +290,8 @@ skippedKeys: # spark: # master: - # Image: "federatedai/spark-master" - # ImageTag: "1.11.2-release" + # image: "federatedai/spark-master" + # imageTag: "v2.0.0-beta" # replicas: 1 # resources: # requests: @@ -290,8 +306,8 @@ skippedKeys: # type: ClusterIP # nodePort: 30977 # worker: - # Image: "federatedai/spark-worker" - # ImageTag: "1.11.2-release" + # image: "federatedai/spark-worker" + # imageTag: "v2.0.0-beta" # replicas: 2 # resources: # requests: @@ -306,6 +322,8 @@ skippedKeys: # type: ClusterIP # hdfs: # namenode: + # image: "federatedai/hadoop-namenode" + # imageTag: "2.0.0-hadoop3.2.1-java8" # nodeSelector: # tolerations: # affinity: @@ -316,6 +334,8 @@ skippedKeys: # accessMode: ReadWriteOnce # size: 1Gi # datanode: + # image: "federatedai/hadoop-datanode" + # imageTag: "2.0.0-hadoop3.2.1-java8" # replicas: # nodeSelector: # tolerations: @@ -326,6 +346,8 @@ skippedKeys: # accessMode: # size: # nginx: + # image: "federatedai/nginx" + # imageTag: "v2.0.0-beta" # nodeSelector: # tolerations: # affinity: @@ -348,6 +370,8 @@ skippedKeys: # http_port: 30107 # grpc_port: 30102 # rabbitmq: + # image: "federatedai/rabbitmq" + # imageTag: "3.8.3-management" # nodeSelector: # tolerations: # affinity: @@ -374,6 +398,8 @@ skippedKeys: # memory: "8Gi" # pulsar: + # image: "federatedai/pulsar" + # imageTag: "2.10.2" # nodeSelector: # tolerations: # affinity: diff --git a/helm-charts/FATE/values-template.yaml b/helm-charts/FATE/values-template.yaml index 9f829ffc7..d516a3d1d 100644 --- a/helm-charts/FATE/values-template.yaml +++ b/helm-charts/FATE/values-template.yaml @@ -1,6 +1,6 @@ image: - registry: {{ .registry | default "federatedai" }} + registry: {{ .registry | default "" }} isThridParty: {{ empty .registry | ternary "false" "true" }} pullPolicy: {{ .pullPolicy | default "IfNotPresent" }} {{- with .imagePullSecrets }} @@ -149,6 +149,8 @@ modules: include: {{ has "rollsite" .modules }} {{- with .rollsite }} ip: rollsite + image: {{ .image | default "federatedai/eggroll" }} + imageTag: {{ .imageTag | default "v2.0.0-beta" }} type: {{ .type | default "ClusterIP" }} nodePort: {{ .nodePort }} loadBalancerIP: {{ .loadBalancerIP }} @@ -191,6 +193,8 @@ modules: include: {{ has "lbrollsite" .modules }} {{- with .lbrollsite }} ip: rollsite + image: {{ .image | default "federatedai/eggroll" }} + imageTag: {{ .imageTag | default "v2.0.0-beta" }} type: {{ .type | default "ClusterIP" }} loadBalancerIP: {{ .loadBalancerIP }} nodePort: {{ .nodePort }} @@ -223,6 +227,8 @@ modules: {{ toYaml . 
| indent 6 }} {{- end }} logLevel: {{ .logLevel | default "INFO" }} + image: {{ .image | default "federatedai/fateflow" }} + imageTag: {{ .imageTag | default "v2.0.0-beta" }} type: {{ .type | default "ClusterIP" }} httpNodePort: {{ .httpNodePort }} grpcNodePort: {{ .grpcNodePort }} @@ -301,6 +307,8 @@ modules: include: {{ has "clustermanager" .modules }} {{- with .clustermanager }} ip: clustermanager + image: {{ .image | default "federatedai/eggroll" }} + imageTag: {{ .imageTag | default "v2.0.0-beta" }} type: "ClusterIP" mysqlServerTimezone: {{ .mysqlServerTimezone }} {{- with .nodeSelector }} @@ -326,6 +334,8 @@ modules: include: {{ has "nodemanager" .modules }} {{- with .nodemanager }} sessionProcessorsPerNode: {{ .sessionProcessorsPerNode }} + image: {{ .image | default "federatedai/eggroll" }} + imageTag: {{ .imageTag | default "v2.0.0-beta" }} replicas: {{ .replicas | default 2 }} subPath: {{ .subPath }} storageClass: {{ .storageClass | default "nodemanager" }} @@ -355,6 +365,8 @@ modules: include: {{ has "client" .modules }} {{- with .client }} subPath: {{ .subPath }} + image: {{ .image | default "federatedai/client" }} + imageTag: {{ .imageTag | default "v2.0.0-beta" }} existingClaim: {{ .existingClaim }} storageClass: {{ .storageClass | default "client" }} accessMode: {{ .accessMode | default "ReadWriteOnce" }} @@ -378,6 +390,8 @@ modules: mysql: include: {{ has "mysql" .modules }} {{- with .mysql }} + image: {{ .image | default "mysql" }} + imageTag: {{ .imageTag | default "8.0.28" }} type: {{ .type | default "ClusterIP" }} {{- with .nodeSelector }} nodeSelector: @@ -417,6 +431,8 @@ modules: include: {{ has "fateboard" .modules }} {{- with .fateboard }} type: {{ .type }} + image: {{ .image | default "federatedai/fateboard" }} + imageTag: {{ .imageTag | default "v2.0.0-beta" }} username: {{ .username }} password: {{ .password }} {{- with .nodeSelector }} @@ -438,8 +454,8 @@ modules: {{- with .spark }} {{- if .master }} master: - Image: "{{ .master.Image }}" - ImageTag: "{{ .master.ImageTag }}" + image: "{{ .master.image | default "federatedai/spark-master" }}" + imageTag: "{{ .master.imageTag | default "v2.0.0-beta" }}" replicas: {{ .master.replicas }} {{- with .master.resources }} resources: @@ -462,8 +478,8 @@ modules: {{- end }} {{- if .worker }} worker: - Image: "{{ .worker.Image }}" - ImageTag: "{{ .worker.ImageTag }}" + image: {{ .worker.image | default "federatedai/spark-worker" | quote }} + imageTag: {{ .worker.imageTag | default "v2.0.0-beta" | quote }} replicas: {{ .worker.replicas }} {{- with .worker.resources }} resources: @@ -503,6 +519,8 @@ modules: {{ toYaml . | indent 8 }} {{- end }} type: {{ .namenode.type | default "ClusterIP" }} + image: {{ .image | default "federatedai/hadoop-namenode" | quote }} + imageTag: {{ .imageTag | default "2.0.0-hadoop3.2.1-java8" | quote }} nodePort: {{ .namenode.nodePort }} existingClaim: {{ .namenode.existingClaim }} storageClass: {{ .namenode.storageClass | default "" }} @@ -523,6 +541,8 @@ modules: {{ toYaml . | indent 8 }} {{- end }} type: {{ .datanode.type | default "ClusterIP" }} + image: {{ .image | default "federatedai/hadoop-datanode" | quote }} + imageTag: {{ .imageTag | default "2.0.0-hadoop3.2.1-java8" | quote }} existingClaim: {{ .datanode.existingClaim }} storageClass: {{ .datanode.storageClass | default "" }} accessMode: {{ .datanode.accessMode | default "ReadWriteOnce" }} @@ -546,6 +566,8 @@ modules: {{ toYaml . 
| indent 6 }} {{- end }} type: {{ .type | default "ClusterIP" }} + image: {{ .image | default "federatedai/nginx" | quote }} + imageTag: {{ .imageTag | default "v2.0.0-beta" | quote }} httpNodePort: {{ .httpNodePort }} grpcNodePort: {{ .grpcNodePort }} loadBalancerIP: {{ .loadBalancerIP }} @@ -583,6 +605,8 @@ modules: {{ toYaml . | indent 6 }} {{- end }} type: {{ .type | default "ClusterIP" }} + image: {{ .image | default "federatedai/rabbitmq" | quote }} + imageTag: {{ .imageTag | default "3.8.3-management" | quote }} nodePort: {{ .nodePort }} default_user: {{ .default_user }} default_pass: {{ .default_pass }} @@ -625,6 +649,8 @@ modules: {{ toYaml . | indent 6 }} {{- end }} type: {{ .type | default "ClusterIP" }} + image: {{ .image | default "federatedai/pulsar" | quote }} + imageTag: {{ .imageTag | default "2.10.2" | quote }} httpNodePort: {{ .httpNodePort }} httpsNodePort: {{ .httpsNodePort }} loadBalancerIP: {{ .loadBalancerIP }} diff --git a/helm-charts/FATE/values.yaml b/helm-charts/FATE/values.yaml index 6eca0e4e8..b9f8d5436 100644 --- a/helm-charts/FATE/values.yaml +++ b/helm-charts/FATE/values.yaml @@ -1,8 +1,8 @@ image: - registry: federatedai + registry: isThridParty: - tag: 1.11.2-release + tag: v2.0.0-beta pullPolicy: IfNotPresent imagePullSecrets: # - name: @@ -88,6 +88,8 @@ modules: rollsite: include: true ip: rollsite + image: "federatedai/eggroll" + imageTag: "v2.0.0-beta" type: ClusterIP nodePort: 30091 loadBalancerIP: @@ -111,6 +113,8 @@ modules: lbrollsite: include: true ip: rollsite + image: "federatedai/eggroll" + imageTag: "v2.0.0-beta" type: ClusterIP nodePort: 30091 loadBalancerIP: @@ -121,6 +125,8 @@ modules: python: include: true replicas: 1 + image: "federatedai/fateflow" + imageTag: "v2.0.0-beta" type: ClusterIP httpNodePort: 30097 grpcNodePort: 30092 @@ -181,6 +187,8 @@ modules: client: include: true ip: client + image: "federatedai/client" + imageTag: "v2.0.0-beta" type: ClusterIP nodeSelector: tolerations: @@ -194,6 +202,8 @@ modules: clustermanager: include: true ip: clustermanager + image: "federatedai/eggroll" + imageTag: "v2.0.0-beta" type: ClusterIP nodeSelector: tolerations: @@ -202,6 +212,8 @@ modules: nodemanager: include: true replicas: 2 + image: "federatedai/eggroll" + imageTag: "v2.0.0-beta" nodeSelector: tolerations: affinity: @@ -219,6 +231,8 @@ modules: mysql: include: true type: ClusterIP + image: "mysql" + imageTag: "8.0.28" nodeSelector: tolerations: affinity: @@ -248,6 +262,8 @@ modules: fateboard: include: true type: ClusterIP + image: "federatedai/fateboard" + imageTag: "v2.0.0-beta" username: admin password: admin nodeSelector: @@ -257,8 +273,8 @@ modules: spark: include: true master: - Image: "" - ImageTag: "" + image: "federatedai/spark-master" + imageTag: "v2.0.0-beta" replicas: 1 nodeSelector: tolerations: @@ -266,8 +282,8 @@ modules: type: ClusterIP nodePort: 30977 worker: - Image: "" - ImageTag: "" + image: "federatedai/spark-worker" + imageTag: "v2.0.0-beta" replicas: 2 nodeSelector: tolerations: @@ -280,6 +296,8 @@ modules: hdfs: include: true namenode: + image: "federatedai/hadoop-namenode" + imageTag: "2.0.0-hadoop3.2.1-java8" nodeSelector: tolerations: affinity: @@ -290,6 +308,8 @@ modules: accessMode: ReadWriteOnce size: 1Gi datanode: + image: "federatedai/hadoop-datanode" + imageTag: "2.0.0-hadoop3.2.1-java8" replicas: 3 nodeSelector: tolerations: @@ -301,6 +321,8 @@ modules: size: 1Gi nginx: include: true + image: "federatedai/nginx" + imageTag: "v2.0.0-beta" nodeSelector: tolerations: affinity: @@ -309,9 +331,9 @@ 
modules: grpcNodePort: 30098 loadBalancerIP: exchange: - ip: 192.168.10.1 - httpPort: 30003 - grpcPort: 30008 + ip: nginx + httpPort: 9300 + grpcPort: 9310 route_table: # 10000: # proxy: @@ -324,6 +346,8 @@ modules: # grpc_port: 30102 rabbitmq: include: true + image: "federatedai/rabbitmq" + imageTag: "3.8.3-management" nodeSelector: tolerations: affinity: @@ -341,6 +365,8 @@ modules: pulsar: include: true + image: "federatedai/pulsar" + imageTag: "2.10.2" nodeSelector: tolerations: env: diff --git a/helm-charts/Images_list.md b/helm-charts/Images_list.md index ce3246e4a..d2f6f787d 100644 --- a/helm-charts/Images_list.md +++ b/helm-charts/Images_list.md @@ -16,7 +16,7 @@ - nginx:1.17 - federatedai/nginx:${version-tag} - federatedai/rabbitmq:3.8.3-management -- federatedai/pulsar:2.7.0 +- federatedai/pulsar:2.10.2 ## FATE-Serving diff --git a/k8s-deploy/README.md b/k8s-deploy/README.md index d92bb3d29..2ea3c458c 100644 --- a/k8s-deploy/README.md +++ b/k8s-deploy/README.md @@ -192,13 +192,13 @@ UUID 24bb75ff-f636-4c64-8c04-1b9073f89a2f Name fate-9999 NameSpace fate-9999 ChartName fate -ChartVersion v1.11.2 +ChartVersion v2.0.0-beta Revision 1 Age 15m Status Running Spec algorithm: Basic chartName: fate - chartVersion: v1.11.2 + chartVersion: v2.0.0-beta computing: Eggroll device: CPU federation: Eggroll diff --git a/k8s-deploy/README_zh.md b/k8s-deploy/README_zh.md index 0c800c010..4ec16de98 100644 --- a/k8s-deploy/README_zh.md +++ b/k8s-deploy/README_zh.md @@ -191,13 +191,13 @@ UUID 24bb75ff-f636-4c64-8c04-1b9073f89a2f Name fate-9999 NameSpace fate-9999 ChartName fate -ChartVersion v1.11.2 +ChartVersion v2.0.0-beta Revision 1 Age 15m Status Running Spec algorithm: Basic chartName: fate - chartVersion: v1.11.2 + chartVersion: v2.0.0-beta computing: Eggroll device: CPU federation: Eggroll diff --git a/k8s-deploy/cluster-spark-pulsar.yaml b/k8s-deploy/cluster-spark-pulsar.yaml index ab41b48e8..b1a25dded 100644 --- a/k8s-deploy/cluster-spark-pulsar.yaml +++ b/k8s-deploy/cluster-spark-pulsar.yaml @@ -1,7 +1,7 @@ name: fate-9999 namespace: fate-9999 chartName: fate -chartVersion: v1.11.2 +chartVersion: v2.0.0-beta partyId: 9999 registry: "" pullPolicy: @@ -53,55 +53,90 @@ skippedKeys: # hosts: # - name: party9999.pulsar.example.com +# fateboard: + # image: "federatedai/fateboard" + # imageTag: "v2.0.0-beta" +# type: ClusterIP +# username: admin +# password: admin +# nodeSelector: +# tolerations: +# affinity: + # Specify the fateflow service's properties # python: - # type: NodePort - # httpNodePort: 30097 - # grpcNodePort: 30092 - # loadBalancerIP: - # serviceAccountName: "" + # image: "federatedai/fateflow" + # imageTag: "v2.0.0-beta" +# type: NodePort +# replicas: 1 +# httpNodePort: 30097 +# grpcNodePort: 30092 +# loadBalancerIP: +# serviceAccountName: "" +# nodeSelector: +# tolerations: +# affinity: +# failedTaskAutoRetryTimes: +# failedTaskAutoRetryDelay: +# logLevel: INFO +# existingClaim: "" +# storageClass: "python" +# accessMode: ReadWriteMany +# dependent_distribution: false +# size: 1Gi +# resources: +# requests: +# cpu: "2" +# memory: "4Gi" +# limits: +# cpu: "4" +# memory: "8Gi" +# clustermanager: +# cores_per_node: 16 +# nodes: 2 +# spark: +# cores_per_node: 20 +# nodes: 2 +# master: spark://spark-master:7077 +# driverHost: +# driverHostType: +# portMaxRetries: +# driverStartPort: +# blockManagerStartPort: +# pysparkPython: +# hdfs: +# name_node: hdfs://namenode:9000 +# path_prefix: +# rabbitmq: +# host: rabbitmq +# mng_port: 15672 +# port: 5672 +# user: fate +# password: fate +# 
pulsar: +# host: pulsar +# mng_port: 8080 +# port: 6650 +# topic_ttl: 3 +# cluster: standalone +# tenant: fl-tenant +# nginx: +# host: nginx +# http_port: 9300 +# grpc_port: 9310 +# hive: +# host: 127.0.0.1 +# port: 10000 +# auth_mechanism: +# username: +# password: +# Specify the mysql properties +# mysql: + # image: "mysql" + # imageTag: "8.0.28" # nodeSelector: # tolerations: # affinity: - # logLevel: INFO - # existingClaim: "" - # storageClass: "python" - # accessMode: ReadWriteMany - # size: 1Gi - # resources: - # requests: - # cpu: "2" - # memory: "4Gi" - # nvidia.com/gpu: 1 - # limits: - # cpu: "4" - # memory: "8Gi" - # nvidia.com/gpu: 1 - # spark: - # cores_per_node: 20 - # nodes: 2 - # master: spark://spark-master:7077 - # driverHost: - # driverHostType: - # portMaxRetries: - # driverStartPort: - # blockManagerStartPort: - # pysparkPython: - # hdfs: - # name_node: hdfs://namenode:9000 - # path_prefix: - # pulsar: - # host: pulsar - # mng_port: 8080 - # port: 6650 - # nginx: - # host: nginx - # http_port: 9300 - # grpc_port: 9310 - -# Specify the mysql properties -# mysql: - # nodeSelector: # ip: mysql # port: 3306 # database: eggroll_meta @@ -113,22 +148,30 @@ skippedKeys: # accessMode: ReadWriteOnce # size: 1Gi -# -# #externalMysqlIp: mysql -# #externalMysqlPort: 3306 -# #externalMysqlDatabase: eggroll_meta -# #externalMysqlUser: fate -# #externalMysqlPassword: fate_dev +# externalMysqlIp: mysql1 +# externalMysqlPort: 33060 +# externalMysqlDatabase: eggroll_meta1 +# externalMysqlUser: fate1 +# externalMysqlPassword: fate_dev1 # servingIp: 192.168.0.1 # servingPort: 30095 +# serving: +# useRegistry: false +# zookeeper: +# hosts: +# - serving-zookeeper.fate-serving-9999:2181 +# use_acl: false +# user: fate +# password: fate # FATE on spark configuration + # spark: # master: - # Image: "federatedai/spark-master" - # ImageTag: "1.11.2-release" + # image: "federatedai/spark-master" + # imageTag: "v2.0.0-beta" # replicas: 1 # resources: # requests: @@ -143,8 +186,8 @@ skippedKeys: # type: ClusterIP # nodePort: 30977 # worker: - # Image: "federatedai/spark-worker" - # ImageTag: "1.11.2-release" + # image: "federatedai/spark-worker" + # imageTag: "v2.0.0-beta" # replicas: 2 # resources: # requests: @@ -159,6 +202,8 @@ skippedKeys: # type: ClusterIP # hdfs: # namenode: + # image: "federatedai/hadoop-namenode" + # imageTag: "2.0.0-hadoop3.2.1-java8" # nodeSelector: # tolerations: # affinity: @@ -169,6 +214,8 @@ skippedKeys: # accessMode: ReadWriteOnce # size: 1Gi # datanode: + # image: "federatedai/hadoop-datanode" + # imageTag: "2.0.0-hadoop3.2.1-java8" # replicas: # nodeSelector: # tolerations: @@ -179,6 +226,8 @@ skippedKeys: # accessMode: # size: # nginx: + # image: "federatedai/nginx" + # imageTag: "v2.0.0-beta" # nodeSelector: # tolerations: # affinity: @@ -200,66 +249,48 @@ skippedKeys: # - host: 192.168.0.1 # http_port: 30107 # grpc_port: 30102 -# rabbitmq: - # nodeSelector: - # tolerations: - # affinity: - # type: ClusterIP - # nodePort: 30094 - # loadBalancerIP: - # default_user: fate - # default_pass: fate - # user: fate - # password: fate - # route_table: - # 9999: - # host: rabbitmq - # port: 5672 - # 10000: - # host: 192.168.0.1 - # port: 30104 - # resources: - # requests: - # cpu: "2" - # memory: "4Gi" - # limits: - # cpu: "4" - # memory: "8Gi" + # pulsar: - # nodeSelector: - # tolerations: - # affinity: - # type: ClusterIP - # httpNodePort: 30094 - # httpsNodePort: 30099 - # loadBalancerIP: - # storageClass: "pulsar" - # existingClaim: - # accessMode: ReadWriteOnce - # 
size: 1Gi - # env: - # confs: - # publicLB: - # enabled: false - # exchange: - # ip: 192.168.10.1 - # port: 30000 - # domain: fate.org - # route_table: - # 9999: - # host: pulsar - # port: 6650 - # sslPort: 6651 - # 10000: - # host: 192.168.10.1 - # port: 30105 - # sslPort: 30109 - # proxy: "" - # resources: - # requests: - # cpu: "2" - # memory: "4Gi" - # limits: - # cpu: "4" - # memory: "8Gi" + # image: "federatedai/pulsar" + # imageTag: "2.10.2" +# nodeSelector: +# tolerations: +# affinity: +# env: +# - name: PULSAR_MEM +# value: "-Xms4g -Xmx4g -XX:MaxDirectMemorySize=8g" +# confs: +# brokerDeleteInactiveTopicsFrequencySeconds: 60 +# backlogQuotaDefaultLimitGB: 10 +# type: ClusterIP +# httpNodePort: 30094 +# httpsNodePort: 30099 +# loadBalancerIP: +# storageClass: "pulsar" +# existingClaim: +# accessMode: ReadWriteOnce +# size: 1Gi +# publicLB: +# enabled: false +# exchange: +# ip: 192.168.10.1 +# port: 30000 +# domain: fate.org +# route_table: +# 9999: +# host: pulsar +# port: 6650 +# sslPort: 6651 +# 10000: +# host: 192.168.10.1 +# port: 30105 +# sslPort: 30109 +# proxy: "" +# resources: +# requests: +# cpu: "2" +# memory: "4Gi" +# limits: +# cpu: "4" +# memory: "8Gi" diff --git a/k8s-deploy/cluster-spark-rabbitmq.yaml b/k8s-deploy/cluster-spark-rabbitmq.yaml index 874e2adfc..f30cc4b7a 100644 --- a/k8s-deploy/cluster-spark-rabbitmq.yaml +++ b/k8s-deploy/cluster-spark-rabbitmq.yaml @@ -1,7 +1,7 @@ name: fate-9999 namespace: fate-9999 chartName: fate -chartVersion: v1.11.2 +chartVersion: v2.0.0-beta partyId: 9999 registry: "" pullPolicy: @@ -53,57 +53,88 @@ skippedKeys: # hosts: # - name: party9999.rabbitmq.example.com +# fateboard: + # image: "federatedai/fateboard" + # imageTag: "v2.0.0-beta" +# type: ClusterIP +# username: admin +# password: admin +# nodeSelector: +# tolerations: +# affinity: + # Specify the fateflow service's properties # python: - # type: NodePort - # httpNodePort: 30097 - # grpcNodePort: 30092 - # loadBalancerIP: - # serviceAccountName: "" - # nodeSelector: - # tolerations: - # affinity: - # logLevel: INFO - # existingClaim: "" - # storageClass: "python" - # accessMode: ReadWriteMany - # size: 1Gi - # resources: - # requests: - # cpu: "2" - # memory: "4Gi" - # nvidia.com/gpu: 1 - # limits: - # cpu: "4" - # memory: "8Gi" - # nvidia.com/gpu: 1 - # spark: - # cores_per_node: 20 - # nodes: 2 - # master: spark://spark-master:7077 - # Only for using external Spark - ## driverHost: - ## driverHostType: - ## portMaxRetries: - ## driverStartPort: - ## blockManagerStartPort: - ## pysparkPython: - # hdfs: - # name_node: hdfs://namenode:9000 - # path_prefix: - # nginx: - # host: nginx - # http_port: 9300 - # grpc_port: 9310 - # rabbitmq: - # host: rabbitmq - # mng_port: 15672 - # port: 5672 - # user: fate - # password: fate + # image: "federatedai/fateflow" + # imageTag: "v2.0.0-beta" +# type: NodePort +# replicas: 1 +# httpNodePort: 30097 +# grpcNodePort: 30092 +# loadBalancerIP: +# serviceAccountName: "" +# nodeSelector: +# tolerations: +# affinity: +# failedTaskAutoRetryTimes: +# failedTaskAutoRetryDelay: +# logLevel: INFO +# existingClaim: "" +# storageClass: "python" +# accessMode: ReadWriteMany +# dependent_distribution: false +# size: 1Gi +# resources: +# requests: +# cpu: "2" +# memory: "4Gi" +# limits: +# cpu: "4" +# memory: "8Gi" +# clustermanager: +# cores_per_node: 16 +# nodes: 2 +# spark: +# cores_per_node: 20 +# nodes: 2 +# master: spark://spark-master:7077 +# driverHost: +# driverHostType: +# portMaxRetries: +# driverStartPort: +# blockManagerStartPort: +# 
pysparkPython: +# hdfs: +# name_node: hdfs://namenode:9000 +# path_prefix: +# rabbitmq: +# host: rabbitmq +# mng_port: 15672 +# port: 5672 +# user: fate +# password: fate +# pulsar: +# host: pulsar +# mng_port: 8080 +# port: 6650 +# topic_ttl: 3 +# cluster: standalone +# tenant: fl-tenant +# nginx: +# host: nginx +# http_port: 9300 +# grpc_port: 9310 +# hive: +# host: 127.0.0.1 +# port: 10000 +# auth_mechanism: +# username: +# password: # Specify the mysql properties -# mysql: +# mysql: + # image: "mysql" + # imageTag: "8.0.28" # nodeSelector: # tolerations: # affinity: @@ -128,12 +159,21 @@ skippedKeys: # servingIp: 192.168.0.1 # servingPort: 30095 +# serving: +# useRegistry: false +# zookeeper: +# hosts: +# - serving-zookeeper.fate-serving-9999:2181 +# use_acl: false +# user: fate +# password: fate # FATE on spark configuration + # spark: # master: - # Image: "federatedai/spark-master" - # ImageTag: "1.6.1-release" + # image: "federatedai/spark-master" + # imageTag: "v2.0.0-beta" # replicas: 1 # resources: # requests: @@ -148,8 +188,8 @@ skippedKeys: # type: ClusterIP # nodePort: 30977 # worker: - # Image: "federatedai/spark-worker" - # ImageTag: "1.6.1-release" + # image: "federatedai/spark-worker" + # imageTag: "v2.0.0-beta" # replicas: 2 # resources: # requests: @@ -164,6 +204,8 @@ skippedKeys: # type: ClusterIP # hdfs: # namenode: + # image: "federatedai/hadoop-namenode" + # imageTag: "2.0.0-hadoop3.2.1-java8" # nodeSelector: # tolerations: # affinity: @@ -174,6 +216,8 @@ skippedKeys: # accessMode: ReadWriteOnce # size: 1Gi # datanode: + # image: "federatedai/hadoop-datanode" + # imageTag: "2.0.0-hadoop3.2.1-java8" # replicas: # nodeSelector: # tolerations: @@ -184,6 +228,8 @@ skippedKeys: # accessMode: # size: # nginx: + # image: "federatedai/nginx" + # imageTag: "v2.0.0-beta" # nodeSelector: # tolerations: # affinity: @@ -206,6 +252,8 @@ skippedKeys: # http_port: 30107 # grpc_port: 30102 # rabbitmq: + # image: "federatedai/rabbitmq" + # imageTag: "3.8.3-management" # nodeSelector: # tolerations: # affinity: diff --git a/k8s-deploy/cluster-spark-slim.yaml b/k8s-deploy/cluster-spark-slim.yaml index 3181514c2..0f4b0901d 100644 --- a/k8s-deploy/cluster-spark-slim.yaml +++ b/k8s-deploy/cluster-spark-slim.yaml @@ -1,7 +1,7 @@ name: fate-9999 namespace: fate-9999 chartName: fate -chartVersion: v1.11.2 +chartVersion: v2.0.0-beta partyId: 9999 registry: "" pullPolicy: @@ -51,55 +51,78 @@ skippedKeys: # hosts: # - name: party9999.pulsar.example.com -# Specify the fateflow service's properties + # python: - # type: NodePort - # httpNodePort: 30097 - # grpcNodePort: 30092 - # loadBalancerIP: - # serviceAccountName: "" - # nodeSelector: - # tolerations: - # affinity: - # logLevel: INFO - # existingClaim: "" - # storageClass: "python" - # accessMode: ReadWriteMany - # size: 1Gi - # resources: - # requests: - # cpu: "2" - # memory: "4Gi" - # nvidia.com/gpu: 1 - # limits: - # cpu: "4" - # memory: "8Gi" - # nvidia.com/gpu: 1 - # spark: - # cores_per_node: 20 - # nodes: 2 - # master: spark://spark-master:7077 - # Only for using external Spark - ## driverHost: - ## driverHostType: - ## portMaxRetries: - ## driverStartPort: - ## blockManagerStartPort: - ## pysparkPython: - # hdfs: - # name_node: hdfs://namenode:9000 - # path_prefix: - # pulsar: - # host: pulsar - # port: 6650 - # mng_port: 8080 - # nginx: - # host: nginx - # http_port: 9300 - # grpc_port: 9310 + # image: "federatedai/fateflow" + # imageTag: "v2.0.0-beta" +# type: NodePort +# replicas: 1 +# httpNodePort: 30097 +# grpcNodePort: 30092 +# 
loadBalancerIP: +# serviceAccountName: "" +# nodeSelector: +# tolerations: +# affinity: +# failedTaskAutoRetryTimes: +# failedTaskAutoRetryDelay: +# logLevel: INFO +# existingClaim: "" +# storageClass: "python" +# accessMode: ReadWriteMany +# dependent_distribution: false +# size: 1Gi +# resources: +# requests: +# cpu: "2" +# memory: "4Gi" +# limits: +# cpu: "4" +# memory: "8Gi" +# clustermanager: +# cores_per_node: 16 +# nodes: 2 +# spark: +# cores_per_node: 20 +# nodes: 2 +# master: spark://spark-master:7077 +# driverHost: +# driverHostType: +# portMaxRetries: +# driverStartPort: +# blockManagerStartPort: +# pysparkPython: +# hdfs: +# name_node: hdfs://namenode:9000 +# path_prefix: +# rabbitmq: +# host: rabbitmq +# mng_port: 15672 +# port: 5672 +# user: fate +# password: fate +# pulsar: +# host: pulsar +# mng_port: 8080 +# port: 6650 +# topic_ttl: 3 +# cluster: standalone +# tenant: fl-tenant +# nginx: +# host: nginx +# http_port: 9300 +# grpc_port: 9310 +# hive: +# host: 127.0.0.1 +# port: 10000 +# auth_mechanism: +# username: +# password: # Specify the mysql properties -# mysql: +# mysql: + # image: "mysql" + # imageTag: "8.0.28" # nodeSelector: # tolerations: # affinity: @@ -114,49 +137,90 @@ skippedKeys: # accessMode: ReadWriteOnce # size: 1Gi -# -# #externalMysqlIp: mysql -# #externalMysqlPort: 3306 -# #externalMysqlDatabase: eggroll_meta -# #externalMysqlUser: fate -# #externalMysqlPassword: fate_dev + +# externalMysqlIp: mysql1 +# externalMysqlPort: 33060 +# externalMysqlDatabase: eggroll_meta1 +# externalMysqlUser: fate1 +# externalMysqlPassword: fate_dev1 # servingIp: 192.168.0.1 # servingPort: 30095 +# serving: +# useRegistry: false +# zookeeper: +# hosts: +# - serving-zookeeper.fate-serving-9999:2181 +# use_acl: false +# user: fate +# password: fate + # nginx: - # nodeSelector: + # image: "federatedai/nginx" + # imageTag: "v2.0.0-beta" + # nodeSelector: + # tolerations: + # affinity: # type: ClusterIP + # loadBalancerIP: # httpNodePort: 30093 # grpcNodePort: 30098 - # route_table: - # 10000: - # proxy: - # - host: 192.168.0.1 + # exchange: + # ip: 192.168.10.1 + # httpPort: 30003 + # grpcPort: 30008 + # route_table: + # 10000: + # proxy: + # - host: 192.168.0.1 # http_port: 30103 - # grpc_port: 30108 - # fateflow: + # grpc_port: 30108 + # fateflow: # - host: 192.168.0.1 # http_port: 30107 # grpc_port: 30102 # pulsar: - # type: ClusterIP - # nodePort: 30094 - # route_table: - # 10000: - # host: 192.168.0.1 - # port: 6650 - # sslPort:6651 - # proxy: "" - # 9999: - # host: pulsar - # port: 6650 - # sslPort:6651 - # resources: - # requests: - # cpu: "2" - # memory: "4Gi" - # limits: - # cpu: "4" - # memory: "8Gi" + # image: "federatedai/pulsar" + # imageTag: "2.10.2" +# nodeSelector: +# tolerations: +# affinity: +# env: +# - name: PULSAR_MEM +# value: "-Xms4g -Xmx4g -XX:MaxDirectMemorySize=8g" +# confs: +# brokerDeleteInactiveTopicsFrequencySeconds: 60 +# backlogQuotaDefaultLimitGB: 10 +# type: ClusterIP +# httpNodePort: 30094 +# httpsNodePort: 30099 +# loadBalancerIP: +# storageClass: "pulsar" +# existingClaim: +# accessMode: ReadWriteOnce +# size: 1Gi +# publicLB: +# enabled: false +# exchange: +# ip: 192.168.10.1 +# port: 30000 +# domain: fate.org +# route_table: +# 9999: +# host: pulsar +# port: 6650 +# sslPort: 6651 +# 10000: +# host: 192.168.10.1 +# port: 30105 +# sslPort: 30109 +# proxy: "" +# resources: +# requests: +# cpu: "2" +# memory: "4Gi" +# limits: +# cpu: "4" +# memory: "8Gi" diff --git a/k8s-deploy/cluster.yaml b/k8s-deploy/cluster.yaml index 3560d5e54..20e4c51b8 
100644 --- a/k8s-deploy/cluster.yaml +++ b/k8s-deploy/cluster.yaml @@ -1,7 +1,7 @@ name: fate-9999 namespace: fate-9999 chartName: fate -chartVersion: v1.11.2 +chartVersion: v2.0.0-beta partyId: 9999 registry: "" pullPolicy: diff --git a/k8s-deploy/examples/party-10000/cluster-gpu.yaml b/k8s-deploy/examples/party-10000/cluster-gpu.yaml index d16b4a66b..d92afbbab 100644 --- a/k8s-deploy/examples/party-10000/cluster-gpu.yaml +++ b/k8s-deploy/examples/party-10000/cluster-gpu.yaml @@ -1,7 +1,7 @@ name: fate-10000 namespace: fate-10000 chartName: fate -chartVersion: v1.11.2 +chartVersion: v2.0.0-beta partyId: 10000 registry: "" pullPolicy: diff --git a/k8s-deploy/examples/party-10000/cluster-spark-local-pulsar.yaml b/k8s-deploy/examples/party-10000/cluster-spark-local-pulsar.yaml index 302657537..74df16aba 100644 --- a/k8s-deploy/examples/party-10000/cluster-spark-local-pulsar.yaml +++ b/k8s-deploy/examples/party-10000/cluster-spark-local-pulsar.yaml @@ -1,7 +1,7 @@ name: fate-10000 namespace: fate-10000 chartName: fate -chartVersion: v1.11.2 +chartVersion: v2.0.0-beta partyId: 10000 registry: "" pullPolicy: diff --git a/k8s-deploy/examples/party-10000/cluster-spark-pulsar.yaml b/k8s-deploy/examples/party-10000/cluster-spark-pulsar.yaml index 2178e4b40..cfce46776 100644 --- a/k8s-deploy/examples/party-10000/cluster-spark-pulsar.yaml +++ b/k8s-deploy/examples/party-10000/cluster-spark-pulsar.yaml @@ -1,7 +1,7 @@ name: fate-10000 namespace: fate-10000 chartName: fate -chartVersion: v1.11.2 +chartVersion: v2.0.0-beta partyId: 10000 registry: "" pullPolicy: diff --git a/k8s-deploy/examples/party-10000/cluster-spark-rabbitmq.yaml b/k8s-deploy/examples/party-10000/cluster-spark-rabbitmq.yaml index 3797cb641..576e392c1 100644 --- a/k8s-deploy/examples/party-10000/cluster-spark-rabbitmq.yaml +++ b/k8s-deploy/examples/party-10000/cluster-spark-rabbitmq.yaml @@ -1,7 +1,7 @@ name: fate-10000 namespace: fate-10000 chartName: fate -chartVersion: v1.11.2 +chartVersion: v2.0.0-beta partyId: 10000 registry: "" pullPolicy: diff --git a/k8s-deploy/examples/party-10000/cluster.yaml b/k8s-deploy/examples/party-10000/cluster.yaml index 85f332997..24c7a1d56 100644 --- a/k8s-deploy/examples/party-10000/cluster.yaml +++ b/k8s-deploy/examples/party-10000/cluster.yaml @@ -1,7 +1,7 @@ name: fate-10000 namespace: fate-10000 chartName: fate -chartVersion: v1.11.2 +chartVersion: v2.0.0-beta partyId: 10000 registry: "" pullPolicy: diff --git a/k8s-deploy/examples/party-9999/cluster-gpu.yaml b/k8s-deploy/examples/party-9999/cluster-gpu.yaml index a45b27d71..4dfa1ade6 100644 --- a/k8s-deploy/examples/party-9999/cluster-gpu.yaml +++ b/k8s-deploy/examples/party-9999/cluster-gpu.yaml @@ -1,7 +1,7 @@ name: fate-9999 namespace: fate-9999 chartName: fate -chartVersion: v1.11.2 +chartVersion: v2.0.0-beta partyId: 9999 registry: "" pullPolicy: diff --git a/k8s-deploy/examples/party-9999/cluster-spark-local-pulsar.yaml b/k8s-deploy/examples/party-9999/cluster-spark-local-pulsar.yaml index f7cb9e570..ffb220a30 100644 --- a/k8s-deploy/examples/party-9999/cluster-spark-local-pulsar.yaml +++ b/k8s-deploy/examples/party-9999/cluster-spark-local-pulsar.yaml @@ -1,7 +1,7 @@ name: fate-9999 namespace: fate-9999 chartName: fate -chartVersion: v1.11.2 +chartVersion: v2.0.0-beta partyId: 9999 registry: "" pullPolicy: diff --git a/k8s-deploy/examples/party-9999/cluster-spark-pulsar.yaml b/k8s-deploy/examples/party-9999/cluster-spark-pulsar.yaml index d608bcc6b..0a55e76e0 100644 --- a/k8s-deploy/examples/party-9999/cluster-spark-pulsar.yaml +++ 
b/k8s-deploy/examples/party-9999/cluster-spark-pulsar.yaml @@ -1,7 +1,7 @@ name: fate-9999 namespace: fate-9999 chartName: fate -chartVersion: v1.11.2 +chartVersion: v2.0.0-beta partyId: 9999 registry: "" pullPolicy: diff --git a/k8s-deploy/examples/party-9999/cluster-spark-rabbitmq.yaml b/k8s-deploy/examples/party-9999/cluster-spark-rabbitmq.yaml index dfb6439bd..317a67f5a 100644 --- a/k8s-deploy/examples/party-9999/cluster-spark-rabbitmq.yaml +++ b/k8s-deploy/examples/party-9999/cluster-spark-rabbitmq.yaml @@ -1,7 +1,7 @@ name: fate-9999 namespace: fate-9999 chartName: fate -chartVersion: v1.11.2 +chartVersion: v2.0.0-beta partyId: 9999 registry: "" pullPolicy: diff --git a/k8s-deploy/examples/party-9999/cluster.yaml b/k8s-deploy/examples/party-9999/cluster.yaml index f5ec6ce56..5a51e0e2e 100644 --- a/k8s-deploy/examples/party-9999/cluster.yaml +++ b/k8s-deploy/examples/party-9999/cluster.yaml @@ -1,7 +1,7 @@ name: fate-9999 namespace: fate-9999 chartName: fate -chartVersion: v1.11.2 +chartVersion: v2.0.0-beta partyId: 9999 registry: "" pullPolicy: diff --git a/k8s-deploy/examples/party.config b/k8s-deploy/examples/party.config index 37c5f56b2..7e5e0bf25 100644 --- a/k8s-deploy/examples/party.config +++ b/k8s-deploy/examples/party.config @@ -1,5 +1,5 @@ -fate_chartVersion=v1.11.2 -fate_imageTAG=1.11.2-release +fate_chartVersion=v2.0.0-beta +fate_imageTAG=v2.0.0-beta fate_serving_chartVersion=v2.1.6 fate_serving_imageTAG=2.1.6-release party_9999_IP=192.168.9.1
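
The chart templates in this patch stop reading a single global `image.tag` and instead take a per-module `image`/`imageTag` pair (see the `values.yaml` and `values-template.yaml` hunks above). The fragment below is a minimal, hypothetical per-party override sketch, not a complete `cluster.yaml`: the module names and default tags are copied from the values files changed in this patch, and everything else (party id, namespace) is an assumed placeholder.

```yaml
# Hypothetical sketch of per-module image overrides (not a complete cluster.yaml).
# Module keys and default tags follow the values introduced in this patch.
name: fate-9999
namespace: fate-9999
chartName: fate
chartVersion: v2.0.0-beta
partyId: 9999
registry: ""
python:
  image: "federatedai/fateflow"
  imageTag: "v2.0.0-beta"
fateboard:
  image: "federatedai/fateboard"
  imageTag: "v2.0.0-beta"
mysql:
  image: "mysql"
  imageTag: "8.0.28"
pulsar:
  image: "federatedai/pulsar"
  imageTag: "2.10.2"
```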
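
The fateflow ConfigMap hunk above also gives the Pulsar route table a fallback `default` entry when `modules.pulsar.exchange` is not set. A rendered `pulsar_route_table.yaml` for a single party would then look roughly like the sketch below; the `proxy.fate.org:443` endpoint, domain, and broker ports mirror the template defaults added in this patch, while party id 9999 and the in-cluster `pulsar` service name are assumptions for illustration.

```yaml
# Sketch of the rendered pulsar_route_table.yaml when no exchange is configured
# (party id 9999 and the "pulsar" service host are assumed values).
9999:
  host: pulsar
  port: 6650
  sslPort: 6651
  proxy: ""
default:
  proxy: "proxy.fate.org:443"
  domain: "fate.org"
  brokerPort: 6650
  brokerSslPort: 6651
```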