From 61190b0f42d0f4e18f9377bbbe728faa61d0c24c Mon Sep 17 00:00:00 2001 From: Chenlong Ma Date: Thu, 25 Jan 2024 12:47:32 +0800 Subject: [PATCH] KubeFATE support FATE v2.0.0 (#927) * docker compose eggroll support Signed-off-by: Chenlong Ma * support spark Signed-off-by: Chenlong Ma * Helm chart support FATE v2.0.0 Signed-off-by: Chenlong Ma * helm-chart fix Signed-off-by: Chenlong Ma * update version tag Signed-off-by: Chenlong Ma --------- Signed-off-by: Chenlong Ma --- docker-deploy/.env | 18 +-- docker-deploy/README.md | 12 +- docker-deploy/README_zh.md | 12 +- docker-deploy/docker_deploy.sh | 2 +- docker-deploy/generate_config.sh | 126 ++++++++---------- docker-deploy/parties.conf | 4 +- .../backends/eggroll/conf/eggroll.properties | 61 +++------ .../eggroll/conf/node-extend-env.properties | 0 .../docker-compose-eggroll.yml | 15 +-- .../docker-compose-spark-slim.yml | 101 +++++++------- .../docker-compose-spark.yml | 2 +- .../public/fate_flow/conf/service_conf.yaml | 55 ++++---- .../mysql/init/create-eggroll-meta-tables.sql | 34 +++-- .../public/osx/conf/broker.properties | 61 +++++++++ .../public/osx/conf/route_table.json | 26 ++++ ...ster_in_One_Linux_Machine_with_MiniKube.md | 18 +-- ...r_in_One_Linux_Machine_with_MiniKube_zh.md | 20 +-- helm-charts/FATE/Chart.yaml | 4 +- .../eggroll/clustermanager/deployment.yaml | 9 +- .../templates/backends/eggroll/configmap.yaml | 93 +++---------- .../eggroll/nodemanager/statefulSet.yaml | 9 +- .../backends/eggroll/rollsite/configmap.yaml | 94 ------------- .../templates/core/fateflow/configmap.yaml | 42 +++--- .../FATE/templates/core/mysql/configmap.yaml | 39 +++--- .../FATE/templates/core/osx/configmap.yaml | 106 +++++++++++++++ .../rollsite => core/osx}/deployment.yaml | 55 +++----- .../rollsite => core/osx}/service.yaml | 20 +-- .../FATE/templates/core/python-spark.yaml | 2 +- helm-charts/FATE/values-template-example.yaml | 32 ++--- helm-charts/FATE/values-template.yaml | 34 ++--- helm-charts/FATE/values.yaml | 32 
++--- k8s-deploy/README.md | 14 +- k8s-deploy/README_zh.md | 14 +- k8s-deploy/cluster-spark-pulsar.yaml | 12 +- k8s-deploy/cluster-spark-rabbitmq.yaml | 12 +- k8s-deploy/cluster-spark-slim.yaml | 6 +- k8s-deploy/cluster.yaml | 10 +- .../examples/party-10000/cluster-gpu.yaml | 2 +- .../cluster-spark-local-pulsar.yaml | 2 +- .../party-10000/cluster-spark-pulsar.yaml | 2 +- .../party-10000/cluster-spark-rabbitmq.yaml | 2 +- k8s-deploy/examples/party-10000/cluster.yaml | 2 +- .../examples/party-9999/cluster-gpu.yaml | 2 +- .../cluster-spark-local-pulsar.yaml | 2 +- .../party-9999/cluster-spark-pulsar.yaml | 2 +- .../party-9999/cluster-spark-rabbitmq.yaml | 2 +- k8s-deploy/examples/party-9999/cluster.yaml | 2 +- k8s-deploy/examples/party.config | 4 +- 48 files changed, 620 insertions(+), 610 deletions(-) create mode 100644 docker-deploy/training_template/backends/eggroll/conf/node-extend-env.properties create mode 100644 docker-deploy/training_template/public/osx/conf/broker.properties create mode 100644 docker-deploy/training_template/public/osx/conf/route_table.json delete mode 100644 helm-charts/FATE/templates/backends/eggroll/rollsite/configmap.yaml create mode 100644 helm-charts/FATE/templates/core/osx/configmap.yaml rename helm-charts/FATE/templates/{backends/eggroll/rollsite => core/osx}/deployment.yaml (61%) rename helm-charts/FATE/templates/{backends/eggroll/rollsite => core/osx}/service.yaml (66%) diff --git a/docker-deploy/.env b/docker-deploy/.env index 810a3c092..4d0c55d34 100644 --- a/docker-deploy/.env +++ b/docker-deploy/.env @@ -8,24 +8,26 @@ SSH_PORT=22 # SSH_PORT: port of SSH, default 22 -KubeFATE_Version=v2.0.0-beta +KubeFATE_Version=v2.0.0-release # components version FATEFlow_IMAGE="federatedai/fateflow" -FATEFlow_IMAGE_TAG="v2.0.0-beta" +FATEFlow_IMAGE_TAG="2.0.0-release" FATEBoard_IMAGE="federatedai/fateboard" -FATEBoard_IMAGE_TAG="v2.0.0-beta" +FATEBoard_IMAGE_TAG="2.0.0-release" MySQL_IMAGE="mysql" MySQL_IMAGE_TAG="8.0.28" 
Client_IMAGE="federatedai/client" -Client_IMAGE_TAG="v2.0.0-beta" +Client_IMAGE_TAG="2.0.0-release" EGGRoll_IMAGE="federatedai/eggroll" -EGGRoll_IMAGE_TAG="v2.0.0-beta" +EGGRoll_IMAGE_TAG="2.0.0-release" +OSX_IMAGE="federatedai/osx" +OSX_IMAGE_TAG="2.0.0-release" Nginx_IMAGE="federatedai/nginx" -Nginx_IMAGE_TAG="v2.0.0-beta" +Nginx_IMAGE_TAG="2.0.0-release" RabbitMQ_IMAGE="federatedai/rabbitmq" RabbitMQ_IMAGE_TAG="3.8.3-management" Pulsar_IMAGE="federatedai/pulsar" @@ -35,6 +37,6 @@ Hadoop_NameNode_IMAGE_TAG="2.0.0-hadoop3.2.1-java8" Hadoop_DataNode_IMAGE="federatedai/hadoop-datanode" Hadoop_DataNode_IMAGE_TAG="2.0.0-hadoop3.2.1-java8" Spark_Master_IMAGE="federatedai/spark-master" -Spark_Master_IMAGE_TAG="v2.0.0-beta" +Spark_Master_IMAGE_TAG="2.0.0-release" Spark_Worker_IMAGE="federatedai/spark-worker" -Spark_Worker_IMAGE_TAG="v2.0.0-beta" \ No newline at end of file +Spark_Worker_IMAGE_TAG="2.0.0-release" \ No newline at end of file diff --git a/docker-deploy/README.md b/docker-deploy/README.md index 22cde8dff..e585c06a0 100644 --- a/docker-deploy/README.md +++ b/docker-deploy/README.md @@ -192,13 +192,13 @@ The output is shown as follows. 
If the status of each component is `Up`, and the ```bash NAME IMAGE COMMAND SERVICE CREATED STATUS PORTS -confs-10000-client-1 federatedai/client:v2.0.0-beta "bash -c 'pipeline i…" client About a minute ago Up About a minute 0.0.0.0:20000->20000/tcp, :::20000->20000/tcp -confs-10000-clustermanager-1 federatedai/eggroll:v2.0.0-beta "/tini -- bash -c 'j…" clustermanager About a minute ago Up About a minute 4670/tcp -confs-10000-fateboard-1 federatedai/fateboard:v2.0.0-beta "/bin/sh -c 'java -D…" fateboard About a minute ago Up About a minute 0.0.0.0:8080->8080/tcp, :::8080->8080/tcp -confs-10000-fateflow-1 federatedai/fateflow:v2.0.0-beta "/bin/bash -c 'set -…" fateflow About a minute ago Up About a minute (healthy) 0.0.0.0:9360->9360/tcp, :::9360->9360/tcp, 0.0.0.0:9380->9380/tcp, :::9380->9380/tcp +confs-10000-client-1 federatedai/client:2.0.0-release "bash -c 'pipeline i…" client About a minute ago Up About a minute 0.0.0.0:20000->20000/tcp, :::20000->20000/tcp +confs-10000-clustermanager-1 federatedai/eggroll:2.0.0-release "/tini -- bash -c 'j…" clustermanager About a minute ago Up About a minute 4670/tcp +confs-10000-fateboard-1 federatedai/fateboard:2.0.0-release "/bin/sh -c 'java -D…" fateboard About a minute ago Up About a minute 0.0.0.0:8080->8080/tcp, :::8080->8080/tcp +confs-10000-fateflow-1 federatedai/fateflow:2.0.0-release "/bin/bash -c 'set -…" fateflow About a minute ago Up About a minute (healthy) 0.0.0.0:9360->9360/tcp, :::9360->9360/tcp, 0.0.0.0:9380->9380/tcp, :::9380->9380/tcp confs-10000-mysql-1 mysql:8.0.28 "docker-entrypoint.s…" mysql About a minute ago Up About a minute 3306/tcp, 33060/tcp -confs-10000-nodemanager-1 federatedai/eggroll:v2.0.0-beta "/tini -- bash -c 'j…" nodemanager About a minute ago Up About a minute 4671/tcp -confs-10000-rollsite-1 federatedai/eggroll:v2.0.0-beta "/tini -- bash -c 'j…" rollsite About a minute ago Up About a minute 0.0.0.0:9370->9370/tcp, :::9370->9370/tcp +confs-10000-nodemanager-1 
federatedai/eggroll:2.0.0-release "/tini -- bash -c 'j…" nodemanager About a minute ago Up About a minute 4671/tcp +confs-10000-osx-1 federatedai/osx:2.0.0-release "/tini -- bash -c 'j…" osx About a minute ago Up About a minute 0.0.0.0:9370->9370/tcp, :::9370->9370/tcp ``` ### Verifying the deployment diff --git a/docker-deploy/README_zh.md b/docker-deploy/README_zh.md index 82b6c2178..eafd25765 100644 --- a/docker-deploy/README_zh.md +++ b/docker-deploy/README_zh.md @@ -231,13 +231,13 @@ docker compose ps ```bash NAME IMAGE COMMAND SERVICE CREATED STATUS PORTS -confs-10000-client-1 federatedai/client:v2.0.0-beta "bash -c 'pipeline i…" client About a minute ago Up About a minute 0.0.0.0:20000->20000/tcp, :::20000->20000/tcp -confs-10000-clustermanager-1 federatedai/eggroll:v2.0.0-beta "/tini -- bash -c 'j…" clustermanager About a minute ago Up About a minute 4670/tcp -confs-10000-fateboard-1 federatedai/fateboard:v2.0.0-beta "/bin/sh -c 'java -D…" fateboard About a minute ago Up About a minute 0.0.0.0:8080->8080/tcp, :::8080->8080/tcp -confs-10000-fateflow-1 federatedai/fateflow:v2.0.0-beta "/bin/bash -c 'set -…" fateflow About a minute ago Up About a minute (healthy) 0.0.0.0:9360->9360/tcp, :::9360->9360/tcp, 0.0.0.0:9380->9380/tcp, :::9380->9380/tcp +confs-10000-client-1 federatedai/client:2.0.0-release "bash -c 'pipeline i…" client About a minute ago Up About a minute 0.0.0.0:20000->20000/tcp, :::20000->20000/tcp +confs-10000-clustermanager-1 federatedai/eggroll:2.0.0-release "/tini -- bash -c 'j…" clustermanager About a minute ago Up About a minute 4670/tcp +confs-10000-fateboard-1 federatedai/fateboard:2.0.0-release "/bin/sh -c 'java -D…" fateboard About a minute ago Up About a minute 0.0.0.0:8080->8080/tcp, :::8080->8080/tcp +confs-10000-fateflow-1 federatedai/fateflow:2.0.0-release "/bin/bash -c 'set -…" fateflow About a minute ago Up About a minute (healthy) 0.0.0.0:9360->9360/tcp, :::9360->9360/tcp, 0.0.0.0:9380->9380/tcp, :::9380->9380/tcp 
confs-10000-mysql-1 mysql:8.0.28 "docker-entrypoint.s…" mysql About a minute ago Up About a minute 3306/tcp, 33060/tcp -confs-10000-nodemanager-1 federatedai/eggroll:v2.0.0-beta "/tini -- bash -c 'j…" nodemanager About a minute ago Up About a minute 4671/tcp -confs-10000-rollsite-1 federatedai/eggroll:v2.0.0-beta "/tini -- bash -c 'j…" rollsite About a minute ago Up About a minute 0.0.0.0:9370->9370/tcp, :::9370->9370/tcp +confs-10000-nodemanager-1 federatedai/eggroll:2.0.0-release "/tini -- bash -c 'j…" nodemanager About a minute ago Up About a minute 4671/tcp +confs-10000-osx-1 federatedai/osx:2.0.0-release "/tini -- bash -c 'j…" osx About a minute ago Up About a minute 0.0.0.0:9370->9370/tcp, :::9370->9370/tcp ``` ### 验证部署 diff --git a/docker-deploy/docker_deploy.sh b/docker-deploy/docker_deploy.sh index d85ad3f23..335b6e9b4 100755 --- a/docker-deploy/docker_deploy.sh +++ b/docker-deploy/docker_deploy.sh @@ -323,7 +323,7 @@ handleLocally() { main() { - if [ "$1" = "" ] || [ "$" = "--help" ]; then + if [ "$1" = "" ] || [ "$1" = "--help" ]; then ShowUsage exit 1 elif [ "$1" = "--delete" ] || [ "$1" = "--del" ]; then diff --git a/docker-deploy/generate_config.sh b/docker-deploy/generate_config.sh index bb5d845ab..f11cc8293 100755 --- a/docker-deploy/generate_config.sh +++ b/docker-deploy/generate_config.sh @@ -56,7 +56,7 @@ function list_include_item { function CheckConfig(){ # Check config start - computing_list="Eggroll Spark Spark_local" + computing_list="Eggroll Spark STANDALONE" spark_federation_list="RabbitMQ Pulsar" algorithm_list="Basic NN ALL" device_list="CPU IPCL GPU" @@ -67,8 +67,8 @@ function CheckConfig(){ fi if [ $computing == "Eggroll" ]; then - if [ $federation != "Eggroll" ] || [ $storage != "Eggroll" ]; then - echo "[ERROR]: Please select the correct engine. When eggroll is selected as the computing engine, both Federation and storage must be eggroll engines!" 
+ if [ $federation != "OSX" ] || [ $storage != "Eggroll" ]; then + echo "[ERROR]: Please select the correct engine. When eggroll is selected as the computing engine, both Federation and Storage must be osx/eggroll engines!" exit 1 fi fi @@ -84,13 +84,13 @@ function CheckConfig(){ fi fi - if [ "$computing" == "Spark_local" ]; then - if ! $(list_include_item "$spark_federation_list" "$federation"); then - echo "[ERROR]: If you choose the Spark_local computing engine, the federation component must be Pulsar or RabbitMQ!" - exit 1 - fi - if [ "$storage" != "LocalFS" ]; then - echo "[ERROR]: If you choose the Spark computing engine, the storage component must be LocalFS!" + if [ "$computing" == "STANDALONE" ]; then + # if ! $(list_include_item "$spark_federation_list" "$federation"); then + # echo "[ERROR]: If you choose the STANDALONE computing engine, the federation component must be Pulsar or RabbitMQ!" + # exit 1 + # fi + if [ "$storage" != "STANDALONE" ]; then + echo "[ERROR]: If you choose the Spark computing engine, the storage component must be STANDALONE!" 
exit 1 fi fi @@ -180,6 +180,7 @@ GenerateConfig() { #clustermanager & nodemanager sed -i "s##${clustermanager_ip}#g" ./confs-$party_id/confs/eggroll/conf/eggroll.properties sed -i "s##${clustermanager_port}#g" ./confs-$party_id/confs/eggroll/conf/eggroll.properties + sed -i "s##${nodemanager_ip}#g" ./confs-$party_id/confs/eggroll/conf/eggroll.properties sed -i "s##${nodemanager_port}#g" ./confs-$party_id/confs/eggroll/conf/eggroll.properties sed -i "s##${party_id}#g" ./confs-$party_id/confs/eggroll/conf/eggroll.properties @@ -206,7 +207,7 @@ GenerateConfig() { if [ "$federation" == "RabbitMQ" ]; then cp -r training_template/backends/spark/rabbitmq confs-$party_id/confs/ # delete Pulsar spec - sed -i '203,217d' confs-"$party_id"/docker-compose.yml + sed -i '203,218d' confs-"$party_id"/docker-compose.yml elif [ "$federation" == "Pulsar" ]; then cp -r training_template/backends/spark/pulsar confs-$party_id/confs/ # delete RabbitMQ spec @@ -215,21 +216,21 @@ GenerateConfig() { fi fi - if [ "$computing" == "Spark_local" ]; then + if [ "$computing" == "STANDALONE" ]; then # computing - cp -r training_template/backends/spark/nginx confs-$party_id/confs/ - cp -r training_template/backends/spark/spark confs-$party_id/confs/ + # cp -r training_template/backends/spark/nginx confs-$party_id/confs/ + # cp -r training_template/backends/spark/spark confs-$party_id/confs/ # storage - if [ "$storage" == "LocalFS" ]; then + if [ "$storage" == "STANDALONE" ]; then cp training_template/docker-compose-spark-slim.yml confs-$party_id/docker-compose.yml # federation - if [ "$federation" == "RabbitMQ" ]; then - cp -r training_template/backends/spark/rabbitmq confs-$party_id/confs/ - sed -i '149,163d' confs-$party_id/docker-compose.yml - elif [ "$federation" == "Pulsar" ]; then - cp -r training_template/backends/spark/pulsar confs-$party_id/confs/ - sed -i '131,147d' confs-$party_id/docker-compose.yml - fi + # if [ "$federation" == "RabbitMQ" ]; then + # cp -r 
training_template/backends/spark/rabbitmq confs-$party_id/confs/ + # sed -i '149,163d' confs-$party_id/docker-compose.yml + # elif [ "$federation" == "Pulsar" ]; then + # cp -r training_template/backends/spark/pulsar confs-$party_id/confs/ + # sed -i '131,147d' confs-$party_id/docker-compose.yml + # fi fi fi @@ -241,7 +242,7 @@ GenerateConfig() { # Images choose Suffix="" # computing - if [ "$computing" == "Spark" ] || [ "$computing" == "Spark_local" ]; then + if [ "$computing" == "Spark" ]; then Suffix=$Suffix"" fi # algorithm @@ -264,7 +265,7 @@ GenerateConfig() { if [ "$computing" == "Eggroll" ]; then sed -i "s#image: \"\${FATEFlow_IMAGE}:\${FATEFlow_IMAGE_TAG}\"#image: \"\${FATEFlow_IMAGE}${Suffix}:\${FATEFlow_IMAGE_TAG}\"#g" ./confs-"$party_id"/docker-compose.yml sed -i "s#image: \"\${EGGRoll_IMAGE}:\${EGGRoll_IMAGE_TAG}\"#image: \"\${EGGRoll_IMAGE}${Suffix}:\${EGGRoll_IMAGE_TAG}\"#g" ./confs-"$party_id"/docker-compose.yml - elif [ "$computing" == "Spark" ] || [ "$computing" == "Spark_local" ]; then + elif [ "$computing" == "Spark" ] ; then sed -i "s#image: \"\${FATEFlow_IMAGE}:\${FATEFlow_IMAGE_TAG}\"#image: \"\${FATEFlow_IMAGE}-spark${Suffix}:\${FATEFlow_IMAGE_TAG}\"#g" ./confs-"$party_id"/docker-compose.yml sed -i "s#image: \"\${Spark_Worker_IMAGE}:\${Spark_Worker_IMAGE_TAG}\"#image: \"\${Spark_Worker_IMAGE}${Suffix}:\${Spark_Worker_IMAGE_TAG}\"#g" ./confs-"$party_id"/docker-compose.yml fi @@ -273,12 +274,12 @@ GenerateConfig() { if [ "$device" == "GPU" ]; then line=0 # line refers to the line number of the fateflow `command` line in docker-compose.yaml if [ "$computing" == "Eggroll" ]; then - line=140 + line=141 fi if [ "$computing" == "Spark" ]; then - line=84 + line=85 fi - if [ "$computing" == "Spark_local" ]; then + if [ "$computing" == "STANDALONE" ]; then line=85 fi sed -i "${line}i\\ @@ -345,7 +346,7 @@ GenerateConfig() { echo mysql module of $party_id done! 
# fate_flow - sed -i "s/party_id:/party_id: \"${party_id}\"/g" ./confs-$party_id/confs/fate_flow/conf/service_conf.yaml + sed -i "s/party_id: .*/party_id: \"${party_id}\"/g" ./confs-$party_id/confs/fate_flow/conf/service_conf.yaml sed -i "s/name: /name: '${db_name}'/g" ./confs-$party_id/confs/fate_flow/conf/service_conf.yaml sed -i "s/user: /user: '${db_user}'/g" ./confs-$party_id/confs/fate_flow/conf/service_conf.yaml sed -i "s/passwd: /passwd: '${db_password}'/g" ./confs-$party_id/confs/fate_flow/conf/service_conf.yaml @@ -353,31 +354,35 @@ GenerateConfig() { sed -i "s/127.0.0.1:8000/${serving_ip}:8000/g" ./confs-$party_id/confs/fate_flow/conf/service_conf.yaml - if [[ "$computing" == "Spark" ]] || [[ "$computing" == "Spark_local" ]] ; then - sed -i "s/proxy_name: rollsite/proxy_name: nginx/g" ./confs-$party_id/confs/fate_flow/conf/service_conf.yaml + if [[ "$computing" == "Spark" ]] ; then + sed -i "s/proxy_name: osx/proxy_name: nginx/g" ./confs-$party_id/confs/fate_flow/conf/service_conf.yaml sed -i "s/computing: eggroll/computing: spark/g" ./confs-$party_id/confs/fate_flow/conf/service_conf.yaml fi + if [[ "$computing" == "STANDALONE" ]] ; then + # sed -i "s/proxy_name: osx/proxy_name: nginx/g" ./confs-$party_id/confs/fate_flow/conf/service_conf.yaml + sed -i "s/computing: eggroll/computing: standalone/g" ./confs-$party_id/confs/fate_flow/conf/service_conf.yaml + fi if [[ "$federation" == "Pulsar" ]]; then - sed -i "s/ federation: rollsite/ federation: pulsar/g" ./confs-$party_id/confs/fate_flow/conf/service_conf.yaml + sed -i "s/ federation: osx/ federation: pulsar/g" ./confs-$party_id/confs/fate_flow/conf/service_conf.yaml elif [[ "$federation" == "RabbitMQ" ]]; then - sed -i "s/ federation: rollsite/ federation: rabbitmq/g" ./confs-$party_id/confs/fate_flow/conf/service_conf.yaml + sed -i "s/ federation: osx/ federation: rabbitmq/g" ./confs-$party_id/confs/fate_flow/conf/service_conf.yaml fi if [[ "$storage" == "HDFS" ]]; then sed -i "s/ storage: eggroll/ 
storage: hdfs/g" ./confs-$party_id/confs/fate_flow/conf/service_conf.yaml - elif [[ "$storage" == "LocalFS" ]]; then - sed -i "s/ storage: eggroll/ storage: localfs/g" ./confs-$party_id/confs/fate_flow/conf/service_conf.yaml + elif [[ "$storage" == "STANDALONE" ]]; then + sed -i "s/ storage: eggroll/ storage: standalone/g" ./confs-$party_id/confs/fate_flow/conf/service_conf.yaml fi - if [[ "$computing" == "Spark_local" ]] ; then - sed -i "s#spark.master .*#spark.master local[*]#g" ./confs-$party_id/confs/spark/spark-defaults.conf - fi + # if [[ "$computing" == "STANDALONE" ]] ; then + # sed -i "s#spark.master .*#spark.master local[*]#g" ./confs-$party_id/confs/spark/spark-defaults.conf + # fi # compute_core sed -i "s/nodes: .*/nodes: 1/g" ./confs-$party_id/confs/fate_flow/conf/service_conf.yaml sed -i "s/cores_per_node: .*/cores_per_node: $compute_core/g" ./confs-$party_id/confs/fate_flow/conf/service_conf.yaml - if [[ "$federation" == "Eggroll" ]]; then + if [[ "$computing" == "Eggroll" ]]; then sed -i "s/eggroll.session.processors.per.node=.*/eggroll.session.processors.per.node=$compute_core/g" ./confs-$party_id/confs/eggroll/conf/eggroll.properties fi if [[ "$computing" == "Spark"* ]]; then @@ -386,29 +391,14 @@ GenerateConfig() { echo fate_flow module of $party_id done! 
# federation config - # eggroll - if [[ "$federation" == "Eggroll" ]]; then - cat >./confs-$party_id/confs/eggroll/conf/route_table.json <./confs-$party_id/confs/osx/conf/route_table.json < eggroll.resourcemanager.clustermanager.jdbc.username= eggroll.resourcemanager.clustermanager.jdbc.password= - -eggroll.data.dir=data/ -eggroll.logs.dir=logs/ eggroll.resourcemanager.clustermanager.host= eggroll.resourcemanager.clustermanager.port= +eggroll.resourcemanager.nodemanager.host= eggroll.resourcemanager.nodemanager.port= eggroll.resourcemanager.process.tag= +# dashboard +eggroll.dashboard.server.port=8083 +eggroll.security.session.expired.time=30 +eggroll.security.login.username=admin +eggroll.security.login.password=admin +eggroll.security.encrypt.private_key= +eggroll.security.encrypt.enable=false + +eggroll.data.dir=/data/projects/fate/eggroll/data/ +eggroll.logs.dir=/data/projects/fate/eggroll/logs/ + eggroll.bootstrap.root.script=bin/eggroll_boot.sh eggroll.resourcemanager.bootstrap.egg_pair.exepath=bin/roll_pair/egg_pair_bootstrap.sh eggroll.resourcemanager.bootstrap.egg_pair.venv= eggroll.resourcemanager.bootstrap.egg_pair.pythonpath=python -eggroll.resourcemanager.bootstrap.egg_pair.filepath=python/eggroll/roll_pair/egg_pair.py +eggroll.resourcemanager.bootstrap.egg_pair.filepath=python/eggroll/computing/egg_pair/egg_pair.py eggroll.resourcemanager.bootstrap.egg_pair.ld_library_path= -eggroll.resourcemanager.bootstrap.egg_frame.exepath=bin/roll_frame/egg_frame_bootstrap.sh -eggroll.resourcemanager.bootstrap.egg_frame.javahome= -eggroll.resourcemanager.bootstrap.egg_frame.classpath= -eggroll.resourcemanager.bootstrap.egg_frame.mainclass=com.webank.eggroll.rollframe.EggFrameBootstrap -eggroll.resourcemanager.bootstrap.egg_frame.jvm.options= - -eggroll.core.grpc.channel.keepalive.timeout.sec=20 - -# roll_frame -arrow.enable_unsafe_memory_access=true - -# hadoop -hadoop.fs.defaultFS=file:/// - -# hadoop HA mode -hadoop.dfs.nameservices= 
-hadoop.dfs.namenode.rpc-address.nn1= -hadoop.dfs.namenode.rpc-address.nn2= - # session eggroll.session.processors.per.node=4 -eggroll.session.start.timeout.ms=180000 - -# rollpair -eggroll.rollpair.transferpair.sendbuf.size=250000 -# rollsite -eggroll.rollsite.coordinator=vmware -eggroll.rollsite.host= -eggroll.rollsite.port= -eggroll.rollsite.party.id= -eggroll.rollsite.route.table.path=conf/route_table.json -eggroll.rollsite.route.table.key= -eggroll.rollsite.route.table.whitelist=127.0.0.1 -eggroll.rollsite.jvm.options=-XX:+UseG1GC -XX:+PrintGCDetails -XX:+PrintGCDateStamps -Xloggc:logs/eggroll/rollsite.gc.log -eggroll.rollsite.push.max.retry=3 -eggroll.rollsite.push.long.retry=2 -eggroll.rollsite.push.batches.per.stream=10 -eggroll.rollsite.adapter.sendbuf.size=100000 # deepspeed ## where deepspeed containers locate, required for deepspeed #eggroll.resourcemanager.nodemanager.containers.data.dir= ## which python exec that deepspeed container used, fallback to eggpair venv/bin/python -#eggroll.container.deepspeed.python.exec= +#eggroll.container.python.exec= ## provide by submit option for now #eggroll.container.deepspeed.script.path= eggroll.container.deepspeed.distributed.backend=nccl ## defaults to cluster manager endpoint #eggroll.container.deepspeed.distributed.store.host= #eggroll.container.deepspeed.distributed.store.port= + + + + diff --git a/docker-deploy/training_template/backends/eggroll/conf/node-extend-env.properties b/docker-deploy/training_template/backends/eggroll/conf/node-extend-env.properties new file mode 100644 index 000000000..e69de29bb diff --git a/docker-deploy/training_template/docker-compose-eggroll.yml b/docker-deploy/training_template/docker-compose-eggroll.yml index 8bdbf18da..46d35bd47 100644 --- a/docker-deploy/training_template/docker-compose-eggroll.yml +++ b/docker-deploy/training_template/docker-compose-eggroll.yml @@ -40,20 +40,19 @@ volumes: device: /data services: - rollsite: - image: 
"${RegistryURI}${EGGRoll_IMAGE}:${EGGRoll_IMAGE_TAG}" + osx: + image: "${RegistryURI}${OSX_IMAGE}:${OSX_IMAGE_TAG}" restart: always ports: - "9370:9370" environment: PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION: python volumes: - - ./confs/eggroll/conf:/data/projects/fate/eggroll/conf/ + - ./confs/osx/conf/:/data/projects/fate/osx/conf/broker/ - /etc/localtime:/etc/localtime:ro - - shared_dir_fate:/data/projects/fate/fate networks: - fate-network - command: ["bash", "-c", "java -Dlog4j.configurationFile=$${EGGROLL_HOME}/conf/log4j2.properties -cp $${EGGROLL_HOME}/lib/*:$${EGGROLL_HOME}/conf/ com.webank.eggroll.rollsite.EggSiteBootstrap -c $${EGGROLL_HOME}/conf/eggroll.properties"] + command: ["sh", "-c", "java -XX:+UseG1GC -XX:G1HeapRegionSize=16m -XX:G1ReservePercent=25 -XX:InitiatingHeapOccupancyPercent=30 -XX:SoftRefLRUPolicyMSPerMB=0 -verbose:gc -Xloggc:/dev/shm/rmq_srv_gc_%p_%t.log -XX:+PrintGCDetails -XX:+PrintGCDateStamps -XX:+PrintGCApplicationStoppedTime -XX:+PrintAdaptiveSizePolicy -XX:+UseGCLogFileRotation -XX:NumberOfGCLogFiles=5 -XX:GCLogFileSize=30m -XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=/oom/heapdump.hprof -server -Xms4g -Xmx4g -XX:-OmitStackTraceInFastThrow -XX:+AlwaysPreTouch -XX:MaxDirectMemorySize=15g -XX:-UseLargePages -XX:-UseBiasedLocking -cp conf/broker/:lib/*:extension/*:/data/projects/fate/osx/lib/osx-broker-1.0.0.jar org.fedai.osx.broker.Bootstrap -c /data/projects/fate/osx/conf"] fateboard: image: "${FATEBoard_IMAGE}:${FATEBoard_IMAGE_TAG}" @@ -82,7 +81,7 @@ services: PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION: python networks: - fate-network - command: ["bash", "-c", "java -Dlog4j.configurationFile=$${EGGROLL_HOME}/conf/log4j2.properties -cp $${EGGROLL_HOME}/lib/*: com.webank.eggroll.core.Bootstrap --bootstraps com.webank.eggroll.core.resourcemanager.ClusterManagerBootstrap -c $${EGGROLL_HOME}/conf/eggroll.properties -p 4670 -s 'EGGROLL_DEAMON'"] + command: ["bash", "-c", "java -server 
-Dlog4j.configurationFile=$${EGGROLL_HOME}/conf/log4j2.xml -Dmodule=clustermanager -cp $${EGGROLL_HOME}/lib/*: org.fedai.eggroll.clustermanager.Bootstrap -p 4670 -s EGGROLL_DAEMON"] nodemanager: image: "${EGGRoll_IMAGE}:${EGGRoll_IMAGE_TAG}" @@ -103,7 +102,7 @@ services: PYTHONPATH: /data/projects/fate/fate/python:/data/projects/fate/fate_flow/python:/data/projects/fate/fate_client/python:/data/projects/fate/eggroll/python cap_add: - SYS_PTRACE - command: ["bash", "-c", "java -Dlog4j.configurationFile=$${EGGROLL_HOME}/conf/log4j2.properties -cp $${EGGROLL_HOME}/lib/*: com.webank.eggroll.core.Bootstrap --bootstraps com.webank.eggroll.core.resourcemanager.NodeManagerBootstrap -c $${EGGROLL_HOME}/conf/eggroll.properties -p 4671 -s 'EGGROLL_DEAMON'"] + command: ["bash", "-c", "java -server -Dlog4j.configurationFile=$${EGGROLL_HOME}/conf/log4j2.xml -Dmodule=nodemanager -cp $${EGGROLL_HOME}/lib/*: org.fedai.eggroll.nodemanager.Bootstrap -p 4671 -s EGGROLL_DAEMON"] fateflow: image: "${FATEFlow_IMAGE}:${FATEFlow_IMAGE_TAG}" @@ -127,7 +126,7 @@ services: - /etc/localtime:/etc/localtime:ro depends_on: - mysql - - rollsite + - osx - clustermanager - nodemanager networks: diff --git a/docker-deploy/training_template/docker-compose-spark-slim.yml b/docker-deploy/training_template/docker-compose-spark-slim.yml index 21d7459d0..d31694367 100644 --- a/docker-deploy/training_template/docker-compose-spark-slim.yml +++ b/docker-deploy/training_template/docker-compose-spark-slim.yml @@ -42,7 +42,7 @@ volumes: services: fateboard: - image: "federatedai/fateboard:${TAG}" + image: "${FATEBoard_IMAGE}:${FATEBoard_IMAGE_TAG}" ports: - "8080:8080" volumes: @@ -56,7 +56,7 @@ services: - fateflow fateflow: - image: "federatedai/fateflow:${TAG}" + image: "${FATEFlow_IMAGE}:${FATEFlow_IMAGE_TAG}" restart: always ports: - 9380:9380 @@ -65,11 +65,11 @@ services: - ./confs/spark/spark-defaults.conf:/data/projects/spark-3.1.3-bin-hadoop3.2/conf/spark-defaults.conf - 
shared_dir_fate:/data/projects/fate/fate - shared_dir_examples:/data/projects/fate/examples - - download_dir:/data/projects/fate/python/download_dir + - download_dir:/data/projects/fate/fate/python/download_dir - fate_flow_logs:/data/projects/fate/fate_flow/logs - - ./confs/fate_flow/conf/service_conf.yaml:/data/projects/fate/conf/service_conf.yaml - - ./confs/fate_flow/conf/pulsar_route_table.yaml:/data/projects/fate/conf/pulsar_route_table.yaml - - ./confs/fate_flow/conf/rabbitmq_route_table.yaml:/data/projects/fate/conf/rabbitmq_route_table.yaml + - ./confs/fate_flow/conf/service_conf.yaml:/data/projects/fate/fate_flow/conf/service_conf.yaml + - ./confs/fate_flow/conf/pulsar_route_table.yaml:/data/projects/fate/fate_flow/conf/pulsar_route_table.yaml + - ./confs/fate_flow/conf/rabbitmq_route_table.yaml:/data/projects/fate/fate_flow/conf/rabbitmq_route_table.yaml - ./confs/eggroll/conf:/data/projects/fate/eggroll/conf - ./shared_dir/data/model_local_cache:/data/projects/fate/fate_flow/model_local_cache - /etc/localtime:/etc/localtime:ro @@ -87,10 +87,10 @@ services: - "-c" - | set -x - sed -i "s/int(party.party_id)/str(party.party_id)/g" /data/projects/fate/fate/python/fate/arch/federation/pulsar/_federation.py + sed -i "s/int(party.party_id)/str(party.party_id)/g" /data/projects/fate/fate/python/fate/arch/federation/backends/pulsar/_federation.py cp /data/projects/fate/fate_flow/conf/pulsar_route_table.yaml /data/projects/fate/fate_flow/pulsar_route_table.yaml cp /data/projects/fate/fate_flow/conf/rabbitmq_route_table.yaml /data/projects/fate/fate_flow/rabbitmq_route_table.yaml - sleep 5 && python fateflow/python/fate_flow/fate_flow_server.py + sleep 5 && python fate_flow/python/fate_flow/fate_flow_server.py environment: FATE_PROJECT_BASE: "/data/projects/fate" FATE_FLOW_UPLOAD_MAX_NUM: "1000000" @@ -98,7 +98,7 @@ services: FATE_LOG_LEVEL: "INFO" mysql: - image: "mysql:8.0.28" + image: "${MySQL_IMAGE}:${MySQL_IMAGE_TAG}" expose: - 3306 volumes: @@ -113,57 +113,56 
@@ services: cap_add: - SYS_NICE - nginx: - image: "federatedai/nginx:${TAG}" - ports: - - 9300:9300 - - 9310:9310 - volumes: - - ./confs/nginx/route_table.yaml:/data/projects/fate/proxy/nginx/conf/route_table.yaml - - ./confs/nginx/nginx.conf:/data/projects/fate/proxy/nginx/conf/nginx.conf - - /etc/localtime:/etc/localtime:ro + osx: + image: "${RegistryURI}${OSX_IMAGE}:${OSX_IMAGE_TAG}" restart: always - networks: - - fate-network - depends_on: - - fateflow - - rabbitmq: - image: federatedai/rabbitmq:3.8.3-management ports: - - "5672:5672" - - "15672:15672" + - "9370:9370" environment: - RABBITMQ_DEFAULT_USER: fate - RABBITMQ_DEFAULT_PASS: fate - RABBITMQ_USER: fate - RABBITMQ_PASSWORD: fate - RABBITMQ_SERVER_ADDITIONAL_ERL_ARGS: "-rabbit max_message_size 536870912" - volumes: - - ./confs/rabbitmq/enabled_plugins:/etc/rabbitmq/enabled_plugins - - ./shared_dir/data/rabbitmq:/var/lib/rabbitmq - restart: always - networks: - - fate-network - - pulsar: - image: "federatedai/pulsar:2.10.2" - ports: - - "6650:6650" - - "6651:6651" - - "8001:8080" + PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION: python volumes: - - ./confs/pulsar/standalone.conf:/pulsar/conf/standalone.conf - # - ./shared_dir/data/pulsar:/pulsar/data + - ./confs/osx/conf/:/data/projects/fate/osx/conf/broker/ - /etc/localtime:/etc/localtime:ro - command: - ["/bin/bash", "-c", "bin/pulsar standalone -nss"] - restart: always networks: - fate-network + command: ["sh", "-c", "java -XX:+UseG1GC -XX:G1HeapRegionSize=16m -XX:G1ReservePercent=25 -XX:InitiatingHeapOccupancyPercent=30 -XX:SoftRefLRUPolicyMSPerMB=0 -verbose:gc -Xloggc:/dev/shm/rmq_srv_gc_%p_%t.log -XX:+PrintGCDetails -XX:+PrintGCDateStamps -XX:+PrintGCApplicationStoppedTime -XX:+PrintAdaptiveSizePolicy -XX:+UseGCLogFileRotation -XX:NumberOfGCLogFiles=5 -XX:GCLogFileSize=30m -XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=/oom/heapdump.hprof -server -Xms4g -Xmx4g -XX:-OmitStackTraceInFastThrow -XX:+AlwaysPreTouch -XX:MaxDirectMemorySize=15g 
-XX:-UseLargePages -XX:-UseBiasedLocking -cp conf/broker/:lib/*:extension/*:/data/projects/fate/osx/lib/osx-broker-1.0.0.jar org.fedai.osx.broker.Bootstrap -c /data/projects/fate/osx/conf"] + + # rabbitmq: + # image: "${RabbitMQ_IMAGE}:${RabbitMQ_IMAGE_TAG}" + # ports: + # - "5672:5672" + # - "15672:15672" + # environment: + # RABBITMQ_DEFAULT_USER: fate + # RABBITMQ_DEFAULT_PASS: fate + # RABBITMQ_USER: fate + # RABBITMQ_PASSWORD: fate + # RABBITMQ_SERVER_ADDITIONAL_ERL_ARGS: "-rabbit max_message_size 536870912" + # volumes: + # - ./confs/rabbitmq/enabled_plugins:/etc/rabbitmq/enabled_plugins + # - ./shared_dir/data/rabbitmq:/var/lib/rabbitmq + # restart: always + # networks: + # - fate-network + + # pulsar: + # image: "${Pulsar_IMAGE}:${Pulsar_IMAGE_TAG}" + # ports: + # - "6650:6650" + # - "6651:6651" + # - "8001:8080" + # volumes: + # - ./confs/pulsar/standalone.conf:/pulsar/conf/standalone.conf + # # - ./shared_dir/data/pulsar:/pulsar/data + # - /etc/localtime:/etc/localtime:ro + # command: + # ["/bin/bash", "-c", "bin/pulsar standalone -nss"] + # restart: always + # networks: + # - fate-network client: - image: "federatedai/client:${TAG}" + image: "${Client_IMAGE}:${Client_IMAGE_TAG}" ports: - "20000:20000" restart: always diff --git a/docker-deploy/training_template/docker-compose-spark.yml b/docker-deploy/training_template/docker-compose-spark.yml index 1d43f8648..196918035 100644 --- a/docker-deploy/training_template/docker-compose-spark.yml +++ b/docker-deploy/training_template/docker-compose-spark.yml @@ -87,7 +87,7 @@ services: - "-c" - | set -x - sed -i "s/int(party.party_id)/str(party.party_id)/g" /data/projects/fate/fate/python/fate/arch/federation/pulsar/_federation.py + sed -i "s/int(party.party_id)/str(party.party_id)/g" /data/projects/fate/fate/python/fate/arch/federation/backends/pulsar/_federation.py cp /data/projects/fate/fate_flow/conf/pulsar_route_table.yaml /data/projects/fate/fate_flow/pulsar_route_table.yaml cp 
/data/projects/fate/fate_flow/conf/rabbitmq_route_table.yaml /data/projects/fate/fate_flow/rabbitmq_route_table.yaml sleep 5 && python fate_flow/python/fate_flow/fate_flow_server.py diff --git a/docker-deploy/training_template/public/fate_flow/conf/service_conf.yaml b/docker-deploy/training_template/public/fate_flow/conf/service_conf.yaml index 4f99c578b..fbe429582 100644 --- a/docker-deploy/training_template/public/fate_flow/conf/service_conf.yaml +++ b/docker-deploy/training_template/public/fate_flow/conf/service_conf.yaml @@ -1,5 +1,7 @@ -party_id: +party_id: "9999" use_registry: false +# DEBUG 10/INFO 20 +log_level: 20 encrypt: key_0: module: fate_flow.hub.encrypt.password_encrypt#pwdecrypt @@ -9,11 +11,11 @@ fateflow: host: 192.167.0.100 http_port: 9380 grpc_port: 9360 - proxy_name: rollsite - nginx: - host: - http_port: - grpc_port: + proxy_name: osx +# nginx: +# host: +# http_port: +# grpc_port: database: engine: mysql # encrypt passwd key @@ -32,14 +34,32 @@ database: path: default_engines: computing: eggroll - federation: rollsite + federation: osx storage: eggroll default_provider: name: fate # version default: fateflow.env version: device: local +computing: + standalone: + cores: 32 + eggroll: + cores: 32 + nodes: 1 + # cluster manager host and port + host: clustermanager + port: 4670 + spark: + # default use SPARK_HOME environment variable + home: /data/projects/spark-3.1.3-bin-hadoop3.2/ + cores: 32 federation: + osx: + host: osx + port: 9370 + # stream or queue + mode: stream pulsar: host: pulsar port: 6650 @@ -68,23 +88,6 @@ federation: route_table: conf/pulsar_route_table.yaml # mode: replication / client, default: replication mode: replication - max_message_size: 1048576 - rollsite: - host: rollsite - port: 9370 - osx: - host: osx - port: 9370 -computing: - standalone: - cores: 32 - eggroll: - cores: 32 - nodes: 2 - spark: - # default use SPARK_HOME environment variable - home: /data/projects/spark-3.1.3-bin-hadoop3.2/ - cores: 32 storage: hdfs: 
name_node: hdfs://namenode:9000 @@ -102,7 +105,7 @@ model_store: decrypt_key: file: # default fate_flow/runtime/system_settings: MODEL_STORE_PATH - path: + path: mysql: name: fate_flow user: fate @@ -121,4 +124,4 @@ zookeeper: - 127.0.0.1:2181 use_acl: true user: fate - password: fate \ No newline at end of file + password: fate diff --git a/docker-deploy/training_template/public/mysql/init/create-eggroll-meta-tables.sql b/docker-deploy/training_template/public/mysql/init/create-eggroll-meta-tables.sql index 1549c6c96..9e674c77f 100644 --- a/docker-deploy/training_template/public/mysql/init/create-eggroll-meta-tables.sql +++ b/docker-deploy/training_template/public/mysql/init/create-eggroll-meta-tables.sql @@ -7,18 +7,19 @@ USE `eggroll_meta`; -- store_locator CREATE TABLE IF NOT EXISTS `store_locator` ( - `store_locator_id` SERIAL PRIMARY KEY, - `store_type` VARCHAR(255) NOT NULL, - `namespace` VARCHAR(2000) NOT NULL DEFAULT 'DEFAULT', - `name` VARCHAR(2000) NOT NULL, - `path` VARCHAR(2000) NOT NULL DEFAULT '', - `total_partitions` INT UNSIGNED NOT NULL, - `partitioner` VARCHAR(2000) NOT NULL DEFAULT 'BYTESTRING_HASH', - `serdes` VARCHAR(2000) NOT NULL DEFAULT '', - `version` INT UNSIGNED NOT NULL DEFAULT 0, - `status` VARCHAR(255) NOT NULL, - `created_at` DATETIME DEFAULT CURRENT_TIMESTAMP, - `updated_at` DATETIME DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP + `store_locator_id` SERIAL PRIMARY KEY, + `store_type` VARCHAR(255) NOT NULL, + `namespace` VARCHAR(2000) NOT NULL DEFAULT 'DEFAULT', + `name` VARCHAR(2000) NOT NULL, + `path` VARCHAR(2000) NOT NULL DEFAULT '', + `total_partitions` INT UNSIGNED NOT NULL, + `key_serdes_type` INT NOT NULL DEFAULT 0, + `value_serdes_type` INT NOT NULL DEFAULT 0, + `partitioner_type` INT NOT NULL DEFAULT 0, + `version` INT UNSIGNED NOT NULL DEFAULT 0, + `status` VARCHAR(255) NOT NULL, + `created_at` DATETIME DEFAULT CURRENT_TIMESTAMP, + `updated_at` DATETIME DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP ) 
DEFAULT CHARACTER SET latin1 COLLATE latin1_swedish_ci; @@ -93,6 +94,8 @@ CREATE TABLE IF NOT EXISTS `session_main` `session_id` VARCHAR(767) PRIMARY KEY, `name` VARCHAR(2000) NOT NULL DEFAULT '', `status` VARCHAR(255) NOT NULL, + `status_reason` VARCHAR(255), + `before_status` VARCHAR(255), `tag` VARCHAR(255), `total_proc_count` INT, `active_proc_count` INT, @@ -127,6 +130,7 @@ CREATE TABLE IF NOT EXISTS `session_processor` `server_node_id` INT NOT NULL, `processor_type` VARCHAR(255) NOT NULL, `status` VARCHAR(255), + `before_status` VARCHAR(255), `tag` VARCHAR(255), `command_endpoint` VARCHAR(255), `transfer_endpoint` VARCHAR(255), @@ -193,3 +197,9 @@ CREATE TABLE IF NOT EXISTS `session_ranks` COLLATE latin1_swedish_ci; CREATE INDEX `idx_session_id_session_ranks` ON `session_ranks` (`session_id`); + + + + + + diff --git a/docker-deploy/training_template/public/osx/conf/broker.properties b/docker-deploy/training_template/public/osx/conf/broker.properties new file mode 100644 index 000000000..9d537b997 --- /dev/null +++ b/docker-deploy/training_template/public/osx/conf/broker.properties @@ -0,0 +1,61 @@ +grpc.port= 9370 +# Http switch for the server. +# If set to True, the server will open the http port. +# http port configuration can be set through http.port +open.http.server=false +# port of http +http.port=8087 +https.port=8088 +# whether the http server uses TLS +#http.use.tls = false +# whether the grpc server uses TLS? +# If true, a grpc port will be specially opened to listen for TLS requests +# grpc tls port configuration can be set through grpc.tls.port +open.grpc.tls.server=false +grpc.tls.port=9883 +# the partyId of self, multiple partyIds can be set. 
+# eg: 9999,10000,10001 +self.party=9999 +# deployment mode, including cluster/standalone, +# respectively representing cluster mode and standalone mode , +# and standalone is used by default +deploy.mode=standalone +# the zookeeper address needs to be configured when the deployment mode is cluster +zk.url=127.0.0.1:2181 +stream.limit.mode=LOCAL + +# the IP of the cluster manager component of eggroll +eggroll.cluster.manager.ip = clustermanager +# the port of the cluster manager component of eggroll +eggroll.cluster.manager.port = 4670 +# maximum number of message retries +produce.msg.max.try.time =3 + +http.client.method.config = {"UNARY_CALL":{"reqTimeout":0,"connectionTimeout":0,"socketTimeout":0}} + +http.use.tls=false + +http.ssl.trust.store.type=PKCS12 + +http.ssl.key.store.alias=22 + +http.ssl.key.store.password=123456 + + +mapped.file.size=134217728 + +#http.ssl.trust.store.path=D:\\44\\127.0.0.1.pfx + +server.ca.file= +server.cert.chain.file= +server.private.key.file= + + + + + + + + + + diff --git a/docker-deploy/training_template/public/osx/conf/route_table.json b/docker-deploy/training_template/public/osx/conf/route_table.json new file mode 100644 index 000000000..abe60b8c5 --- /dev/null +++ b/docker-deploy/training_template/public/osx/conf/route_table.json @@ -0,0 +1,26 @@ +{ + "route_table": + { + "9999": + { + "fateflow":[ + { + "port": 9360, + "ip": "127.0.0.1" + } + ] + }, + "10000":{ + "default":[{ + "protocol":"http", + "url": "http://127.0.0.1:8087/osx/inbound", + "ip": "127.0.0.1", + "port": 9370 + }] + } + }, + "permission": + { + "default_allow": true + } +} \ No newline at end of file diff --git a/docs/tutorials/Build_Two_Parties_FATE_Cluster_in_One_Linux_Machine_with_MiniKube.md b/docs/tutorials/Build_Two_Parties_FATE_Cluster_in_One_Linux_Machine_with_MiniKube.md index 88c4c48f4..22e595f1a 100644 --- a/docs/tutorials/Build_Two_Parties_FATE_Cluster_in_One_Linux_Machine_with_MiniKube.md +++ 
b/docs/tutorials/Build_Two_Parties_FATE_Cluster_in_One_Linux_Machine_with_MiniKube.md @@ -21,14 +21,14 @@ After the tutorial, the deployment architecture looks like the following diagram 5. Network connectivity to dockerhub or 163 Docker Image Registry, and google gcr. 6. Setup the global KubeFATE version using in the tutorial and create a folder for the whole tutorial. ``` -export fate_version=v2.0.0-beta && export kubefate_version=v1.4.5 && cd ~ && mkdir demo && cd demo +export fate_version=v2.0.0 && export kubefate_version=v1.4.5 && cd ~ && mkdir demo && cd demo ``` Notes: * When talking about KubeFATE version, usually there are 3 notions: * The KubeFATE CLI version, in this tutorial, it is v1.4.5. * The KubeFATE service version, in this tutorial, it is v1.4.5. - * The FATE version, in this tutorial, it is v2.0.0-beta, it also means the version of the helm chart of FATE, currently we use this version to tag the KubeFATE GitHub master branch. + * The FATE version, in this tutorial, it is v2.0.0, it also means the version of the helm chart of FATE, currently we use this version to tag the KubeFATE GitHub master branch. * **In this tutorial, the IP of the machine we used is 192.168.100.123. Please change it to your machine's IP in all the following commands and config files.** # Start Tutorial @@ -87,7 +87,7 @@ When all the pods are in the ready state, it means your Kubernetes cluster is re ## Setup Kubefate ### Install KubeFATE CLI Go to [KubeFATE Release](https://github.com/FederatedAI/KubeFATE/releases), and find the latest kubefate-k8s release -pack, which is `v2.0.0-beta` as set to ENVs before. (replace ${fate_version} with the newest version available) +pack, which is `v2.0.0` as set to ENVs before. 
(replace ${fate_version} with the newest version available) ``` curl -LO https://github.com/FederatedAI/KubeFATE/releases/download/${fate_version}/kubefate-k8s-${fate_version}.tar.gz && tar -xzf ./kubefate-k8s-${fate_version}.tar.gz ``` @@ -256,7 +256,7 @@ For `/kubefate/examples/party-9999/cluster-spark-pulsar.yaml`, modify it as foll name: fate-9999 namespace: fate-9999 chartName: fate -chartVersion: v2.0.0-beta +chartVersion: v2.0.0 partyId: 9999 registry: "" pullPolicy: @@ -340,7 +340,7 @@ and for fate-10000: name: fate-10000 namespace: fate-10000 chartName: fate -chartVersion: v2.0.0-beta +chartVersion: v2.0.0 partyId: 10000 registry: "" pullPolicy: @@ -440,8 +440,8 @@ or watch the clusters till their STATUS changing to `Running`: ``` kubefate@machine:~/kubefate$ watch kubefate cluster ls UUID NAME NAMESPACE REVISION STATUS CHART ChartVERSION AGE -29878fa9-aeee-4ae5-a5b7-fd4e9eb7c1c3 fate-9999 fate-9999 1 Running fate v2.0.0-beta 88s -dacc0549-b9fc-463f-837a-4e7316db2537 fate-10000 fate-10000 1 Running fate v2.0.0-beta 69s +29878fa9-aeee-4ae5-a5b7-fd4e9eb7c1c3 fate-9999 fate-9999 1 Running fate v2.0.0 88s +dacc0549-b9fc-463f-837a-4e7316db2537 fate-10000 fate-10000 1 Running fate v2.0.0 69s ``` We have about 10G Docker images that need to be pulled, this step will take a while for the first time. An alternative way is offline loading the images to the local environment. 
@@ -479,13 +479,13 @@ UUID 29878fa9-aeee-4ae5-a5b7-fd4e9eb7c1c3 Name fate-9999 NameSpace fate-9999 ChartName fate -ChartVersion v2.0.0-beta +ChartVersion v2.0.0 Revision 1 Age 54m Status Running Spec algorithm: Basic chartName: fate - chartVersion: v2.0.0-beta + chartVersion: v2.0.0 computing: Spark device: CPU federation: Pulsar diff --git a/docs/tutorials/Build_Two_Parties_FATE_Cluster_in_One_Linux_Machine_with_MiniKube_zh.md b/docs/tutorials/Build_Two_Parties_FATE_Cluster_in_One_Linux_Machine_with_MiniKube_zh.md index 9a458998b..84589d71c 100644 --- a/docs/tutorials/Build_Two_Parties_FATE_Cluster_in_One_Linux_Machine_with_MiniKube_zh.md +++ b/docs/tutorials/Build_Two_Parties_FATE_Cluster_in_One_Linux_Machine_with_MiniKube_zh.md @@ -17,14 +17,14 @@ 5. 要保证安装机器可以正常访问Docker Hub或者网易云镜像仓库,以及Google gcr; 6. 预先创建一个目录,以便整个过程使用该目录作为工作目录,命令如下: ``` -export fate_version=v2.0.0-beta && export kubefate_version=v1.4.5 && cd ~ && mkdir demo && cd demo +export fate_version=v2.0.0 && export kubefate_version=v1.4.5 && cd ~ && mkdir demo && cd demo ``` Notes: * 当我们提到"KubeFATE的版本",通常来讲会有三个概念: * KubeFATE命令行工具的版本,在本教程中为v1.4.5。 * KubeFATE服务版本,在本教程中为v1.4.5。 - * FATE版本,在本教程中v2.0.0-beta,它也意味着FATE的Helm Chart的版本, 值得注意的是我们用这个版本来给GitHub上的KubeFATE的发布打tag。 + * FATE版本,在本教程中为v2.0.0,它也意味着FATE的Helm Chart的版本, 值得注意的是我们用这个版本来给GitHub上的KubeFATE的发布打tag。 * **下文介绍的MiniKube机器IP地址是192.168.100.123。请修改为你准备的实验机器IP地址** # 开始安装 @@ -77,7 +77,7 @@ sudo minikube addons enable ingress ## 安装Kubefate ### 下载KubeFATE命令行工具 -我们从Github上 [KubeFATE Release](https://github.com/FederatedAI/KubeFATE/releases)页面找到Kuberetes部署的下载包,并下载对应版本,如前面环境变量设置`v2.0.0-beta`, +我们从Github上 [KubeFATE Release](https://github.com/FederatedAI/KubeFATE/releases)页面找到Kubernetes部署的下载包,并下载对应版本,如前面环境变量设置`v2.0.0`, ``` curl -LO https://github.com/FederatedAI/KubeFATE/releases/download/${fate_version}/kubefate-k8s-${fate_version}.tar.gz && tar -xzf ./kubefate-k8s-${fate_version}.tar.gz ``` @@ -237,7 +237,7 @@ kubectl -n fate-10000 create secret docker-registry 
myregistrykey \ name: fate-9999 namespace: fate-9999 chartName: fate -chartVersion: v2.0.0-beta +chartVersion: v2.0.0 partyId: 9999 registry: "" pullPolicy: @@ -322,7 +322,7 @@ pulsar: name: fate-10000 namespace: fate-10000 chartName: fate -chartVersion: v2.0.0-beta +chartVersion: v2.0.0 partyId: 10000 registry: "" pullPolicy: @@ -418,8 +418,8 @@ create job success, job id=7752db70-e368-41fa-8827-d39411728d1b ``` kubefate@machine:~/kubefate$ watch kubefate cluster ls UUID NAME NAMESPACE REVISION STATUS CHART ChartVERSION AGE -29878fa9-aeee-4ae5-a5b7-fd4e9eb7c1c3 fate-9999 fate-9999 1 Running fate v2.0.0-beta 88s -dacc0549-b9fc-463f-837a-4e7316db2537 fate-10000 fate-10000 1 Running fate v2.0.0-beta 69s +29878fa9-aeee-4ae5-a5b7-fd4e9eb7c1c3 fate-9999 fate-9999 1 Running fate v2.0.0 88s +dacc0549-b9fc-463f-837a-4e7316db2537 fate-10000 fate-10000 1 Running fate v2.0.0 69s ``` 因为这个步骤需要到网易云镜像仓库去下载约10G的镜像,所以第一次执行视乎你的网络情况需要一定时间。 检查下载的进度可以用 @@ -435,7 +435,7 @@ mysql-c77b7b94b-zblt5 1/1 Running 0 12m nodemanager-0-5599db57f4-2khcg 2/2 Running 0 12m nodemanager-1-7c986f9454-qcscd 2/2 Running 0 12m python-57b66d96bd-vj8kq 3/3 Running 0 12m -rollsite-7846898d6d-j2gb9 1/1 Running 0 12m +osx-7846898d6d-j2gb9 1/1 Running 0 12m ``` ### 验证FATE的部署 @@ -446,13 +446,13 @@ UUID 29878fa9-aeee-4ae5-a5b7-fd4e9eb7c1c3 Name fate-9999 NameSpace fate-9999 ChartName fate -ChartVersion v2.0.0-beta +ChartVersion v2.0.0 Revision 1 Age 54m Status Running Spec algorithm: Basic chartName: fate - chartVersion: v2.0.0-beta + chartVersion: v2.0.0 computing: Spark device: CPU federation: Pulsar diff --git a/helm-charts/FATE/Chart.yaml b/helm-charts/FATE/Chart.yaml index 1f025f22c..be8a3bf80 100644 --- a/helm-charts/FATE/Chart.yaml +++ b/helm-charts/FATE/Chart.yaml @@ -1,8 +1,8 @@ apiVersion: v1 -appVersion: v2.0.0-beta +appVersion: v2.0.0 description: A Helm chart for fate-training name: fate -version: v2.0.0-beta +version: v2.0.0 home: https://fate.fedai.org icon: 
https://aisp-1251170195.cos.ap-hongkong.myqcloud.com/wp-content/uploads/sites/12/2019/09/logo.png sources: diff --git a/helm-charts/FATE/templates/backends/eggroll/clustermanager/deployment.yaml b/helm-charts/FATE/templates/backends/eggroll/clustermanager/deployment.yaml index 6ea2015e8..6397fffc8 100644 --- a/helm-charts/FATE/templates/backends/eggroll/clustermanager/deployment.yaml +++ b/helm-charts/FATE/templates/backends/eggroll/clustermanager/deployment.yaml @@ -50,14 +50,7 @@ spec: - -c - | set -x - mkdir -p /data/projects/fate/eggroll/logs/eggroll/ - touch /data/projects/fate/eggroll/logs/eggroll/eggroll-audit.log - ln -sf /dev/stdout /data/projects/fate/eggroll/logs/eggroll/eggroll-audit.log - touch /data/projects/fate/eggroll/logs/eggroll/eggroll.jvm.log - ln -sf /dev/stdout /data/projects/fate/eggroll/logs/eggroll/eggroll.jvm.log - touch /data/projects/fate/eggroll/logs/eggroll/eggroll.jvm.err.log - ln -sf /dev/stderr /data/projects/fate/eggroll/logs/eggroll/eggroll.jvm.err.log - java -Dlog4j.configurationFile=$${EGGROLL_HOME}/conf/log4j2.properties -cp $${EGGROLL_HOME}/lib/*: com.webank.eggroll.core.Bootstrap --bootstraps com.webank.eggroll.core.resourcemanager.ClusterManagerBootstrap -c $${EGGROLL_HOME}/conf/eggroll.properties -p 4670 -s 'EGGROLL_DEAMON' + java -server -Dlog4j.configurationFile=$${EGGROLL_HOME}/conf/log4j2.xml -Dmodule=clustermanager -cp $${EGGROLL_HOME}/lib/*: org.fedai.eggroll.clustermanager.Bootstrap -p 4670 -s EGGROLL_DAEMON ports: - containerPort: 4670 livenessProbe: diff --git a/helm-charts/FATE/templates/backends/eggroll/configmap.yaml b/helm-charts/FATE/templates/backends/eggroll/configmap.yaml index 7d59888ae..4767072e4 100644 --- a/helm-charts/FATE/templates/backends/eggroll/configmap.yaml +++ b/helm-charts/FATE/templates/backends/eggroll/configmap.yaml @@ -24,101 +24,52 @@ data: eggroll.properties: | [eggroll] # core + eggroll.resourcemanager.nodemanager.net.device=eth0 + 
eggroll.resourcemanager.nodemanager.gpu.num.shell=nvidia.sh #eggroll.resourcemanager.clustermanager.jdbc.driver.class.name=org.h2.Driver eggroll.resourcemanager.clustermanager.jdbc.driver.class.name=com.mysql.cj.jdbc.Driver #eggroll.resourcemanager.clustermanager.jdbc.url=jdbc:h2:./data/meta_h2/eggroll_meta.h2;AUTO_SERVER=TRUE;MODE=MySQL;DATABASE_TO_LOWER=TRUE;SCHEMA=eggroll_meta; eggroll.resourcemanager.clustermanager.jdbc.url=jdbc:mysql://{{ .Values.externalMysqlIp | default .Values.modules.mysql.ip | default "mysql" }}:{{ .Values.externalMysqlPort | default .Values.modules.mysql.port | default "3306" }}/{{ .Values.externalMysqlDatabase | default .Values.modules.mysql.database | default "eggroll_meta" }}?useSSL=false&serverTimezone={{ .Values.modules.clustermanager.mysqlServerTimezone | default "UTC" }}&characterEncoding=utf8&allowPublicKeyRetrieval=true eggroll.resourcemanager.clustermanager.jdbc.username={{ .Values.externalMysqlUser | default .Values.modules.mysql.user | default "fate" }} eggroll.resourcemanager.clustermanager.jdbc.password={{ .Values.externalMysqlPassword | default .Values.modules.mysql.password | default "fate_dev" }} - - eggroll.data.dir=data/ - eggroll.logs.dir=logs/ + eggroll.resourcemanager.clustermanager.host=clustermanager eggroll.resourcemanager.clustermanager.port=4670 + eggroll.resourcemanager.nodemanager.host=nodemanager eggroll.resourcemanager.nodemanager.port=4671 eggroll.resourcemanager.process.tag={{ .Values.partyId }} + # dashboard + eggroll.dashboard.server.port=8083 + eggroll.security.session.expired.time=30 + eggroll.security.login.username=admin + eggroll.security.login.password=admin + eggroll.security.encrypt.private_key= + eggroll.security.encrypt.enable=false + + eggroll.data.dir=/data/projects/fate/eggroll/data/ + eggroll.logs.dir=/data/projects/fate/eggroll/logs/ + eggroll.bootstrap.root.script=bin/eggroll_boot.sh - + eggroll.resourcemanager.bootstrap.egg_pair.exepath=bin/roll_pair/egg_pair_bootstrap.sh 
eggroll.resourcemanager.bootstrap.egg_pair.venv= eggroll.resourcemanager.bootstrap.egg_pair.pythonpath=python - eggroll.resourcemanager.bootstrap.egg_pair.filepath=python/eggroll/roll_pair/egg_pair.py + eggroll.resourcemanager.bootstrap.egg_pair.filepath=python/eggroll/computing/egg_pair/egg_pair.py eggroll.resourcemanager.bootstrap.egg_pair.ld_library_path= - - eggroll.resourcemanager.bootstrap.egg_frame.exepath=bin/roll_pair/roll_pair_master_bootstrap.sh - eggroll.resourcemanager.bootstrap.egg_frame.javahome=/usr/lib/jvm/java-1.8.0-openjdk - eggroll.resourcemanager.bootstrap.egg_frame.classpath=conf/:lib/* - eggroll.resourcemanager.bootstrap.egg_frame.mainclass=com.webank.eggroll.rollframe.EggFrameBootstrap - eggroll.resourcemanager.bootstrap.egg_frame.jvm.options= - - eggroll.core.grpc.channel.keepalive.timeout.sec=20 - - # roll_frame - arrow.enable_unsafe_memory_access=true - - # hadoop - hadoop.fs.defaultFS=file:/// - - # hadoop HA mode - hadoop.dfs.nameservices= - hadoop.dfs.namenode.rpc-address.nn1= - hadoop.dfs.namenode.rpc-address.nn2= - + # session - eggroll.session.processors.per.node={{ .Values.modules.nodemanager.sessionProcessorsPerNode | default 2 }} - eggroll.session.start.timeout.ms=180000 - - # rollpair - eggroll.rollpair.transferpair.sendbuf.size=250000 - - # rollsite - eggroll.rollsite.coordinator={{ .Values.partyName }} - eggroll.rollsite.host=rollsite - eggroll.rollsite.port=9370 - eggroll.rollsite.party.id={{ .Values.partyId }} - eggroll.rollsite.route.table.path=conf/route_table/route_table.json - eggroll.rollsite.route.table.key= - eggroll.rollsite.route.table.whitelist=127.0.0.1 - eggroll.rollsite.jvm.options=-XX:+UseG1GC -XX:+PrintGCDetails -XX:+PrintGCDateStamps -Xloggc:logs/eggroll/rollsite.gc.log - - eggroll.rollsite.push.max.retry=3 - eggroll.rollsite.push.long.retry=2 - eggroll.rollsite.push.batches.per.stream=10 - eggroll.rollsite.adapter.sendbuf.size=100000 - # polling - # {{ .Values.modules.rollsite.polling.enabled }} - # {{ 
.Values.modules.rollsite.polling.type }} - # - {{- if and .Values.modules.rollsite.polling.enabled ( eq (print .Values.modules.rollsite.polling.type) "client" ) }} - eggroll.rollsite.polling.client.enabled=true - {{- end }} - - {{- if and .Values.modules.rollsite.polling.enabled ( eq (print .Values.modules.rollsite.polling.type) "server" ) }} - eggroll.rollsite.polling.server.enabled=true - eggroll.rollsite.polling.concurrency= {{ .Values.modules.rollsite.polling.concurrency | default 50 }} - {{- end }} - + eggroll.session.processors.per.node={{ .Values.modules.nodemanager.sessionProcessorsPerNode | default 4 }} + # deepspeed ## where deepspeed containers locate, required for deepspeed - eggroll.resourcemanager.nodemanager.containers.data.dir=/data/projects/fate/eggroll/containers + #eggroll.resourcemanager.nodemanager.containers.data.dir= ## which python exec that deepspeed container used, fallback to eggpair venv/bin/python - eggroll.container.deepspeed.python.exec=/data/projects/python/venv/bin/python + #eggroll.container.python.exec= ## provide by submit option for now #eggroll.container.deepspeed.script.path= eggroll.container.deepspeed.distributed.backend=nccl ## defaults to cluster manager endpoint #eggroll.container.deepspeed.distributed.store.host= #eggroll.container.deepspeed.distributed.store.port= - {{- if .Values.modules.rollsite.enableTLS }} - cert_configs: | - eggroll.core.security.secure.cluster.enabled=true - eggroll.core.security.secure.client.auth.enabled=true - eggroll.core.security.ca.crt.path=conf/cert/ca.pem - eggroll.core.security.crt.path=conf/cert/server.crt - eggroll.core.security.key.path=conf/cert/server.key - eggroll.core.security.client.ca.crt.path=conf/cert/ca.pem - eggroll.core.security.client.crt.path=conf/cert/client.crt - eggroll.core.security.client.key.path=conf/cert/client.key - {{- end }} {{ end }} \ No newline at end of file diff --git a/helm-charts/FATE/templates/backends/eggroll/nodemanager/statefulSet.yaml 
b/helm-charts/FATE/templates/backends/eggroll/nodemanager/statefulSet.yaml index 88aabcd65..f2d39caa9 100644 --- a/helm-charts/FATE/templates/backends/eggroll/nodemanager/statefulSet.yaml +++ b/helm-charts/FATE/templates/backends/eggroll/nodemanager/statefulSet.yaml @@ -67,14 +67,7 @@ spec: - -c - | set -x - mkdir -p /data/projects/fate/eggroll/logs/eggroll/ - touch /data/projects/fate/eggroll/logs/eggroll/eggroll-audit.log - ln -sf /dev/stdout /data/projects/fate/eggroll/logs/eggroll/eggroll-audit.log - touch /data/projects/fate/eggroll/logs/eggroll/eggroll.jvm.log - ln -sf /dev/stdout /data/projects/fate/eggroll/logs/eggroll/eggroll.jvm.log - touch /data/projects/fate/eggroll/logs/eggroll/eggroll.jvm.err.log - ln -sf /dev/stderr /data/projects/fate/eggroll/logs/eggroll/eggroll.jvm.err.log - /tini -- java -Dlog4j.configurationFile=$${EGGROLL_HOME}/conf/log4j2.properties -cp $${EGGROLL_HOME}/lib/*: com.webank.eggroll.core.Bootstrap --bootstraps com.webank.eggroll.core.resourcemanager.NodeManagerBootstrap -c $${EGGROLL_HOME}/conf/eggroll.properties -p 4671 -s 'EGGROLL_DEAMON' + /tini -- java -server -Dlog4j.configurationFile=$${EGGROLL_HOME}/conf/log4j2.xml -Dmodule=nodemanager -cp $${EGGROLL_HOME}/lib/*: org.fedai.eggroll.nodemanager.Bootstrap -p 4671 -s EGGROLL_DAEMON ports: - containerPort: 4671 livenessProbe: diff --git a/helm-charts/FATE/templates/backends/eggroll/rollsite/configmap.yaml b/helm-charts/FATE/templates/backends/eggroll/rollsite/configmap.yaml deleted file mode 100644 index f25ba0bec..000000000 --- a/helm-charts/FATE/templates/backends/eggroll/rollsite/configmap.yaml +++ /dev/null @@ -1,94 +0,0 @@ -# Copyright 2019-2022 VMware, Inc. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -{{ if .Values.modules.rollsite.include }} -kind: ConfigMap -apiVersion: v1 -metadata: - name: rollsite-config - labels: - fateMoudle: rollsite -{{ include "fate.labels" . | indent 4 }} -data: - route_table.json: | - { - "route_table": { - "default": { - "default": [ - { - {{- if .Values.exchange }} - "ip": "{{ .Values.exchange.partyIp }}", - "port": {{ .Values.exchange.partyPort }} - {{- else }} - - {{- if and .Values.modules.rollsite.polling.enabled ( eq (print .Values.modules.rollsite.polling.type) "client" ) }} - - {{- with .Values.modules.rollsite.polling.server }} - "ip": "{{ .ip }}", - "port": {{ .port }} - {{- end }} - - {{- else }} - "ip": "rollsite", - "port": 9370 - {{- end }} - - {{- end }} - } - ] - }, - {{- range .Values.partyList }} - "{{ .partyId }}": { - "default": [ - { - "ip": "{{ .partyIp }}", - "port": {{ .partyPort }} - } - ] - }, - {{- end }} - - {{- if and .Values.modules.rollsite.polling.enabled ( eq (print .Values.modules.rollsite.polling.type) "server" ) }} - - {{- range .Values.modules.rollsite.polling.clientList}} - "{{ .partID }}": - { - "default": [ - { - "port": -1, - "ip": "", - "is_polling": true - } - ] - }, - {{- end }} - - {{- end }} - "{{ .Values.partyId }}": { - "fateflow": [ - { - "ip": "fateflow", - "port": 9360 - } - ], - "default": [ - { - "ip": "rollsite", - "port": 9370 - } - ] - } - }, - "permission": { - "default_allow": true - } - } -{{ end }} \ No newline at end of file diff --git a/helm-charts/FATE/templates/core/fateflow/configmap.yaml b/helm-charts/FATE/templates/core/fateflow/configmap.yaml index 
d5eec4623..1df81440b 100644 --- a/helm-charts/FATE/templates/core/fateflow/configmap.yaml +++ b/helm-charts/FATE/templates/core/fateflow/configmap.yaml @@ -44,6 +44,8 @@ data: service_conf.yaml: | party_id: {{ .Values.partyId | quote }} use_registry: false + # DEBUG 10/INFO 20 + log_level: 20 encrypt: key_0: module: fate_flow.hub.encrypt.password_encrypt#pwdecrypt @@ -57,7 +59,7 @@ data: {{- if eq .Values.computing "Spark" "Spark_local" }} proxy_name: nginx {{- else }} - proxy_name: rollsite + proxy_name: osx {{- end }} nginx: host: @@ -85,18 +87,32 @@ data: {{- else }} computing: {{ .Values.computing | lower }} {{- end }} - {{- if eq .Values.computing "Eggroll" }} - federation: "rollsite" - {{- else }} federation: {{ .Values.federation | lower }} - {{- end }} storage: {{ .Values.storage | lower }} default_provider: name: fate # version default: fateflow.env version: device: local + computing: + standalone: + cores: 32 + eggroll: + cores: 16 + nodes: 2 + # cluster manager host and port + host: clustermanager + port: 4670 + spark: + # default use SPARK_HOME environment variable + home: /data/projects/spark-3.1.3-bin-hadoop3.2/ + cores: 32 federation: + osx: + host: osx + port: 9370 + # stream or queue + mode: stream pulsar: host: {{ .Values.modules.python.pulsar.host }} port: {{ .Values.modules.python.pulsar.port }} @@ -126,22 +142,6 @@ data: # mode: replication / client, default: replication mode: replication max_message_size: 1048576 - rollsite: - host: rollsite - port: 9370 - osx: - host: osx - port: 9370 - computing: - standalone: - cores: 32 - eggroll: - cores: 32 - nodes: 2 - spark: - # default use SPARK_HOME environment variable - home: /data/projects/spark-3.1.3-bin-hadoop3.2/ - cores: 32 storage: hdfs: name_node: {{ .Values.modules.python.hdfs.name_node | default "hdfs://namenode:9000" }} diff --git a/helm-charts/FATE/templates/core/mysql/configmap.yaml b/helm-charts/FATE/templates/core/mysql/configmap.yaml index 35af166ba..ae82aa472 100644 --- 
a/helm-charts/FATE/templates/core/mysql/configmap.yaml +++ b/helm-charts/FATE/templates/core/mysql/configmap.yaml @@ -22,26 +22,27 @@ data: {{- else }} create-eggroll-meta-tables.sql: | -- create database if not exists, default database is eggroll_meta - CREATE DATABASE IF NOT EXISTS `{{ .Values.modules.mysql.database }}`; - + CREATE DATABASE IF NOT EXISTS `eggroll_meta`; + -- all operation under this database - USE `{{ .Values.modules.mysql.database }}`; - + USE `eggroll_meta`; + -- store_locator CREATE TABLE IF NOT EXISTS `store_locator` ( - `store_locator_id` SERIAL PRIMARY KEY, - `store_type` VARCHAR(255) NOT NULL, - `namespace` VARCHAR(2000) NOT NULL DEFAULT 'DEFAULT', - `name` VARCHAR(2000) NOT NULL, - `path` VARCHAR(2000) NOT NULL DEFAULT '', - `total_partitions` INT UNSIGNED NOT NULL, - `partitioner` VARCHAR(2000) NOT NULL DEFAULT 'BYTESTRING_HASH', - `serdes` VARCHAR(2000) NOT NULL DEFAULT '', - `version` INT UNSIGNED NOT NULL DEFAULT 0, - `status` VARCHAR(255) NOT NULL, - `created_at` DATETIME DEFAULT CURRENT_TIMESTAMP, - `updated_at` DATETIME DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP + `store_locator_id` SERIAL PRIMARY KEY, + `store_type` VARCHAR(255) NOT NULL, + `namespace` VARCHAR(2000) NOT NULL DEFAULT 'DEFAULT', + `name` VARCHAR(2000) NOT NULL, + `path` VARCHAR(2000) NOT NULL DEFAULT '', + `total_partitions` INT UNSIGNED NOT NULL, + `key_serdes_type` INT NOT NULL DEFAULT 0, + `value_serdes_type` INT NOT NULL DEFAULT 0, + `partitioner_type` INT NOT NULL DEFAULT 0, + `version` INT UNSIGNED NOT NULL DEFAULT 0, + `status` VARCHAR(255) NOT NULL, + `created_at` DATETIME DEFAULT CURRENT_TIMESTAMP, + `updated_at` DATETIME DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP ) DEFAULT CHARACTER SET latin1 COLLATE latin1_swedish_ci; @@ -116,6 +117,8 @@ data: `session_id` VARCHAR(767) PRIMARY KEY, `name` VARCHAR(2000) NOT NULL DEFAULT '', `status` VARCHAR(255) NOT NULL, + `status_reason` VARCHAR(255), + `before_status` VARCHAR(255), `tag` 
VARCHAR(255), `total_proc_count` INT, `active_proc_count` INT, @@ -150,6 +153,7 @@ data: `server_node_id` INT NOT NULL, `processor_type` VARCHAR(255) NOT NULL, `status` VARCHAR(255), + `before_status` VARCHAR(255), `tag` VARCHAR(255), `command_endpoint` VARCHAR(255), `transfer_endpoint` VARCHAR(255), @@ -216,7 +220,6 @@ data: COLLATE latin1_swedish_ci; CREATE INDEX `idx_session_id_session_ranks` ON `session_ranks` (`session_id`); - - {{- end }} + {{- end }} --- {{- end }} \ No newline at end of file diff --git a/helm-charts/FATE/templates/core/osx/configmap.yaml b/helm-charts/FATE/templates/core/osx/configmap.yaml new file mode 100644 index 000000000..87ff26cbe --- /dev/null +++ b/helm-charts/FATE/templates/core/osx/configmap.yaml @@ -0,0 +1,106 @@ +# Copyright 2019-2022 VMware, Inc. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +{{ if .Values.modules.osx.include }} +kind: ConfigMap +apiVersion: v1 +metadata: + name: osx-config + labels: + fateMoudle: osx +{{ include "fate.labels" . | indent 4 }} +data: + route_table.json: | + { + "route_table": + { + + {{- range .Values.partyList }} + "{{ .partyId }}": { + "default": [ + { + "ip": "{{ .partyIp }}", + "port": {{ .partyPort }} + } + ] + }, + {{- end }} + "{{ .Values.partyId }}": { + "fateflow": [ + { + "ip": "fateflow", + "port": 9360 + } + ] + } + }, + "self_party":[ + "{{ .Values.partyId }}" + ], + "permission": + { + "default_allow": true + } + } + broker.properties: | + grpc.port= 9370 + # Http switch for the server. 
+ # If set to True, the server will open the http port. + # http port configuration can be set through http.port + open.http.server=false + # port of http + http.port=8087 + https.port=8088 + # whether the http server uses TLS + #http.use.tls = false + # whether the grpc server uses TLS + # If true, a grpc port will be specially opened to listen for TLS requests + # grpc tls port configuration can be set through grpc.tls.port + open.grpc.tls.server=false + grpc.tls.port=9883 + # the partyId of self; multiple partyIds can be set. + # eg: 9999,10000,10001 + self.party=9999 + # deployment mode, including cluster/standalone, + # respectively representing cluster mode and standalone mode, + # and standalone is used by default + deploy.mode=standalone + # the zookeeper address needs to be configured when the deployment mode is cluster + zk.url=127.0.0.1:2181 + stream.limit.mode=LOCAL + + # the IP of the cluster manager component of eggroll + eggroll.cluster.manager.ip = clustermanager + # the port of the cluster manager component of eggroll + eggroll.cluster.manager.port = 4670 + # maximum number of message retries + produce.msg.max.try.time =3 + + http.client.method.config = {"UNARY_CALL":{"reqTimeout":0,"connectionTimeout":0,"socketTimeout":0}} + + http.use.tls=false + + http.ssl.trust.store.type=PKCS12 + + http.ssl.key.store.alias=22 + + http.ssl.key.store.password=123456 + + + mapped.file.size=134217728 + + #http.ssl.trust.store.path=D:\\44\\127.0.0.1.pfx + + server.ca.file= + server.cert.chain.file= + server.private.key.file= + +{{ end }} \ No newline at end of file diff --git a/helm-charts/FATE/templates/backends/eggroll/rollsite/deployment.yaml b/helm-charts/FATE/templates/core/osx/deployment.yaml similarity index 61% rename from helm-charts/FATE/templates/backends/eggroll/rollsite/deployment.yaml rename to helm-charts/FATE/templates/core/osx/deployment.yaml index 4ea8c4b87..97373f5bd 100644 ---
a/helm-charts/FATE/templates/backends/eggroll/rollsite/deployment.yaml +++ b/helm-charts/FATE/templates/core/osx/deployment.yaml @@ -9,13 +9,13 @@ # See the License for the specific language governing permissions and # limitations under the License. -{{ if .Values.modules.rollsite.include }} +{{ if .Values.modules.osx.include }} apiVersion: apps/v1 kind: Deployment metadata: - name: rollsite + name: osx labels: - fateMoudle: rollsite + fateMoudle: osx {{ include "fate.labels" . | indent 4 }} spec: replicas: 1 @@ -23,49 +23,41 @@ spec: type: Recreate selector: matchLabels: - fateMoudle: rollsite + fateMoudle: osx {{ include "fate.matchLabels" . | indent 6 }} template: metadata: labels: - fateMoudle: rollsite + fateMoudle: osx {{ include "fate.labels" . | indent 8 }} spec: hostAliases: - ip: "127.0.0.1" hostnames: - - "rollsite" + - "osx" containers: - - name: rollsite + - name: osx env: - name: PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION value: python - image: {{ .Values.image.registry }}{{ .Values.modules.rollsite.image }}:{{ .Values.modules.rollsite.imageTag }} + image: {{ .Values.image.registry }}{{ .Values.modules.osx.image }}:{{ .Values.modules.osx.imageTag }} imagePullPolicy: {{ .Values.image.pullPolicy }} - {{- if .Values.modules.rollsite.resources}} + {{- if .Values.modules.osx.resources}} resources: - {{- range $key, $val := .Values.modules.rollsite.resources }} + {{- range $key, $val := .Values.modules.osx.resources }} {{ $key }}: {{ toYaml $val | indent 14 }} {{- end }} {{- end }} command: - - bash + - sh - -c - | set -x - mkdir -p /data/projects/fate/eggroll/logs/eggroll/ - touch /data/projects/fate/eggroll/logs/eggroll/eggroll-audit.log - ln -sf /dev/stdout /data/projects/fate/eggroll/logs/eggroll/eggroll-audit.log - touch /data/projects/fate/eggroll/logs/eggroll/eggroll.jvm.log - ln -sf /dev/stdout /data/projects/fate/eggroll/logs/eggroll/eggroll.jvm.log - touch /data/projects/fate/eggroll/logs/eggroll/eggroll.jvm.err.log - ln -sf /dev/stderr 
/data/projects/fate/eggroll/logs/eggroll/eggroll.jvm.err.log - cp /data/projects/fate/eggroll/conf/temp_eggroll.properties /data/projects/fate/eggroll/conf/eggroll.properties - {{- if .Values.modules.rollsite.enableTLS }} + {{- if .Values.modules.osx.enableTLS }} cat /data/projects/fate/eggroll/conf/cert_configs >> /data/projects/fate/eggroll/conf/eggroll.properties {{- end}} - java -Dlog4j.configurationFile=$${EGGROLL_HOME}/conf/log4j2.properties -cp $${EGGROLL_HOME}/lib/*:$${EGGROLL_HOME}/conf/ com.webank.eggroll.rollsite.EggSiteBootstrap -c $${EGGROLL_HOME}/conf/eggroll.properties + java -XX:+UseG1GC -XX:G1HeapRegionSize=16m -XX:G1ReservePercent=25 -XX:InitiatingHeapOccupancyPercent=30 -XX:SoftRefLRUPolicyMSPerMB=0 -verbose:gc -Xloggc:/dev/shm/rmq_srv_gc_%p_%t.log -XX:+PrintGCDetails -XX:+PrintGCDateStamps -XX:+PrintGCApplicationStoppedTime -XX:+PrintAdaptiveSizePolicy -XX:+UseGCLogFileRotation -XX:NumberOfGCLogFiles=5 -XX:GCLogFileSize=30m -XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=/oom/heapdump.hprof -server -Xms4g -Xmx4g -XX:-OmitStackTraceInFastThrow -XX:+AlwaysPreTouch -XX:MaxDirectMemorySize=15g -XX:-UseLargePages -XX:-UseBiasedLocking -cp conf/broker/:lib/*:extension/*:/data/projects/fate/osx/lib/osx-broker-1.0.0.jar org.fedai.osx.broker.Bootstrap -c /data/projects/fate/osx/conf ports: - containerPort: 9370 livenessProbe: @@ -90,27 +82,24 @@ spec: failureThreshold: 12 periodSeconds: 10 volumeMounts: - - mountPath: /data/projects/fate/eggroll/conf/route_table/ - name: rollsite-confs - - mountPath: /data/projects/fate/eggroll/conf/eggroll.properties - name: eggroll-confs - subPath: eggroll.properties - {{- if .Values.modules.rollsite.enableTLS }} + - mountPath: /data/projects/fate/osx/conf/broker/ + name: osx-confs + {{- if .Values.modules.osx.enableTLS }} - mountPath: /data/projects/fate/eggroll/conf/cert_configs name: eggroll-confs subPath: cert_configs - mountPath: /data/projects/fate/eggroll/conf/cert/ name: eggroll-certs {{- end }} - {{- with 
.Values.modules.rollsite.nodeSelector }} + {{- with .Values.modules.osx.nodeSelector }} nodeSelector: {{ toYaml . | indent 8 }} {{- end }} - {{- with .Values.modules.rollsite.tolerations }} + {{- with .Values.modules.osx.tolerations }} tolerations: {{ toYaml . | indent 8 }} {{- end }} - {{- with .Values.modules.rollsite.affinity }} + {{- with .Values.modules.osx.affinity }} affinity: {{ toYaml . | indent 8 }} {{- end }} @@ -121,13 +110,13 @@ spec: serviceAccountName: {{ template "serviceAccountName" . }} restartPolicy: Always volumes: - - name: rollsite-confs + - name: osx-confs configMap: - name: rollsite-config + name: osx-config - name: eggroll-confs configMap: name: eggroll-config - {{- if .Values.modules.rollsite.enableTLS }} + {{- if .Values.modules.osx.enableTLS }} - name: eggroll-certs secret: secretName: eggroll-certs diff --git a/helm-charts/FATE/templates/backends/eggroll/rollsite/service.yaml b/helm-charts/FATE/templates/core/osx/service.yaml similarity index 66% rename from helm-charts/FATE/templates/backends/eggroll/rollsite/service.yaml rename to helm-charts/FATE/templates/core/osx/service.yaml index 0216d4dd2..119db3371 100644 --- a/helm-charts/FATE/templates/backends/eggroll/rollsite/service.yaml +++ b/helm-charts/FATE/templates/core/osx/service.yaml @@ -9,28 +9,28 @@ # See the License for the specific language governing permissions and # limitations under the License. -{{ if .Values.modules.rollsite.include }} +{{ if .Values.modules.osx.include }} apiVersion: v1 kind: Service metadata: - name: rollsite + name: osx labels: - fateMoudle: rollsite + fateMoudle: osx {{ include "fate.labels" . 
| indent 4 }} spec: ports: - - name: "tcp-rollsite" + - name: "tcp-osx" port: 9370 targetPort: 9370 - {{- if eq .Values.modules.rollsite.type "NodePort" "LoadBalancer" }} - nodePort: {{ .Values.modules.rollsite.nodePort }} + {{- if eq .Values.modules.osx.type "NodePort" "LoadBalancer" }} + nodePort: {{ .Values.modules.osx.nodePort }} {{- end }} protocol: TCP - type: {{ .Values.modules.rollsite.type }} - {{- if .Values.modules.rollsite.loadBalancerIP }} - loadBalancerIP: "{{ .Values.modules.rollsite.loadBalancerIP }}" + type: {{ .Values.modules.osx.type }} + {{- if .Values.modules.osx.loadBalancerIP }} + loadBalancerIP: "{{ .Values.modules.osx.loadBalancerIP }}" {{- end }} selector: - fateMoudle: rollsite + fateMoudle: osx {{ include "fate.matchLabels" . | indent 4 }} {{ end }} \ No newline at end of file diff --git a/helm-charts/FATE/templates/core/python-spark.yaml b/helm-charts/FATE/templates/core/python-spark.yaml index 777c4ee93..7cfecbe7f 100644 --- a/helm-charts/FATE/templates/core/python-spark.yaml +++ b/helm-charts/FATE/templates/core/python-spark.yaml @@ -126,7 +126,7 @@ spec: cp /data/projects/fate/fate_flow/conf/pulsar_route_table/pulsar_route_table.yaml /data/projects/fate/fate_flow/pulsar_route_table.yaml cp /data/projects/fate/fate_flow/conf/rabbitmq_route_table/rabbitmq_route_table.yaml /data/projects/fate/fate_flow/rabbitmq_route_table.yaml - sleep 5 && python fate_flow/python/fate_flow/fate_flow_server.py + pip install cryptography && sleep 5 && python fate_flow/python/fate_flow/fate_flow_server.py --debug livenessProbe: tcpSocket: port: 9380 diff --git a/helm-charts/FATE/values-template-example.yaml b/helm-charts/FATE/values-template-example.yaml index c294f634e..1e06f9f94 100644 --- a/helm-charts/FATE/values-template-example.yaml +++ b/helm-charts/FATE/values-template-example.yaml @@ -1,7 +1,7 @@ name: fate-9999 namespace: fate-9999 chartName: fate -chartVersion: v2.0.0-beta +chartVersion: v2.0.0 partyId: 9999 registry: "" pullPolicy: @@ -14,7 
+14,7 @@ podSecurityPolicy: enabled: false ingressClassName: nginx modules: - - rollsite + - osx - clustermanager - nodemanager - mysql @@ -29,8 +29,8 @@ modules: # Computing : [Eggroll, Spark, Spark_local] computing: Eggroll -# Federation: [Eggroll(computing: Eggroll), Pulsar/RabbitMQ(computing: Spark/Spark_local)] -federation: Eggroll +# Federation: [OSX(computing: Eggroll), Pulsar/RabbitMQ(computing: Spark/Spark_local)] +federation: OSX # Storage: [Eggroll(computing: Eggroll), HDFS(computing: Spark), LocalFS(computing: Spark_local)] storage: Eggroll # Algorithm: [Basic, NN, ALL] @@ -64,9 +64,9 @@ skippedKeys: # hosts: # - name: party9999.pulsar.example.com -# rollsite: - # image: "federatedai/eggroll" - # imageTag: "v2.0.0-beta" +# osx: + # image: "federatedai/osx" + # imageTag: "2.0.0-release" # type: NodePort # nodePort: 30091 # loadBalancerIP: @@ -106,7 +106,7 @@ skippedKeys: # lbrollsite: # image: "federatedai/eggroll" - # imageTag: "v2.0.0-beta" + # imageTag: "2.0.0-release" # type: NodePort # nodePort: 30091 # loadBalancerIP: @@ -128,7 +128,7 @@ skippedKeys: # nodemanager: # image: "federatedai/eggroll" - # imageTag: "v2.0.0-beta" + # imageTag: "2.0.0-release" # replicas: 2 # sessionProcessorsPerNode: 4 # nodeSelector: @@ -149,7 +149,7 @@ skippedKeys: # clustermanager: # image: "federatedai/eggroll" - # imageTag: "v2.0.0-beta" + # imageTag: "2.0.0-release" # nodeSelector: # tolerations: # affinity: @@ -165,7 +165,7 @@ skippedKeys: # python: # image: "federatedai/fateflow" - # imageTag: "v2.0.0-beta" + # imageTag: "2.0.0-release" # type: NodePort # replicas: 1 # httpNodePort: 30097 @@ -232,7 +232,7 @@ skippedKeys: # fateboard: # image: "federatedai/fateboard" - # imageTag: "v2.0.0-beta" + # imageTag: "2.0.0-release" # type: ClusterIP # username: admin # password: admin @@ -242,7 +242,7 @@ skippedKeys: # client: # image: "federatedai/client" - # imageTag: "v2.0.0-beta" + # imageTag: "2.0.0-release" # nodeSelector: # subPath: "" # existingClaim: "" @@ -291,7 
+291,7 @@ skippedKeys: # spark: # master: # image: "federatedai/spark-master" - # imageTag: "v2.0.0-beta" + # imageTag: "2.0.0-release" # replicas: 1 # resources: # requests: @@ -307,7 +307,7 @@ skippedKeys: # nodePort: 30977 # worker: # image: "federatedai/spark-worker" - # imageTag: "v2.0.0-beta" + # imageTag: "2.0.0-release" # replicas: 2 # resources: # requests: @@ -347,7 +347,7 @@ skippedKeys: # size: # nginx: # image: "federatedai/nginx" - # imageTag: "v2.0.0-beta" + # imageTag: "2.0.0-release" # nodeSelector: # tolerations: # affinity: diff --git a/helm-charts/FATE/values-template.yaml b/helm-charts/FATE/values-template.yaml index d516a3d1d..a03d51b3d 100644 --- a/helm-charts/FATE/values-template.yaml +++ b/helm-charts/FATE/values-template.yaml @@ -116,7 +116,7 @@ podSecurityPolicy: ingressClassName: {{ .ingressClassName | default "nginx"}} exchange: -{{- with .rollsite }} +{{- with .osx }} {{- with .exchange }} partyIp: {{ .ip }} partyPort: {{ .port }} @@ -133,7 +133,7 @@ exchangeList: {{- end }} partyList: -{{- with .rollsite }} +{{- with .osx }} {{- range .partyList }} - partyId: {{ .partyId }} partyIp: {{ .partyIp }} @@ -145,12 +145,12 @@ persistence: enabled: {{ .persistence | default "false" }} modules: - rollsite: - include: {{ has "rollsite" .modules }} - {{- with .rollsite }} - ip: rollsite - image: {{ .image | default "federatedai/eggroll" }} - imageTag: {{ .imageTag | default "v2.0.0-beta" }} + osx: + include: {{ has "osx" .modules }} + {{- with .osx }} + ip: osx + image: {{ .image | default "federatedai/osx" }} + imageTag: {{ .imageTag | default "2.0.0-release" }} type: {{ .type | default "ClusterIP" }} nodePort: {{ .nodePort }} loadBalancerIP: {{ .loadBalancerIP }} @@ -194,7 +194,7 @@ modules: {{- with .lbrollsite }} ip: rollsite image: {{ .image | default "federatedai/eggroll" }} - imageTag: {{ .imageTag | default "v2.0.0-beta" }} + imageTag: {{ .imageTag | default "2.0.0-release" }} type: {{ .type | default "ClusterIP" }} loadBalancerIP: {{ 
.loadBalancerIP }} nodePort: {{ .nodePort }} @@ -228,7 +228,7 @@ modules: {{- end }} logLevel: {{ .logLevel | default "INFO" }} image: {{ .image | default "federatedai/fateflow" }} - imageTag: {{ .imageTag | default "v2.0.0-beta" }} + imageTag: {{ .imageTag | default "2.0.0-release" }} type: {{ .type | default "ClusterIP" }} httpNodePort: {{ .httpNodePort }} grpcNodePort: {{ .grpcNodePort }} @@ -308,7 +308,7 @@ modules: {{- with .clustermanager }} ip: clustermanager image: {{ .image | default "federatedai/eggroll" }} - imageTag: {{ .imageTag | default "v2.0.0-beta" }} + imageTag: {{ .imageTag | default "2.0.0-release" }} type: "ClusterIP" mysqlServerTimezone: {{ .mysqlServerTimezone }} {{- with .nodeSelector }} @@ -335,7 +335,7 @@ modules: {{- with .nodemanager }} sessionProcessorsPerNode: {{ .sessionProcessorsPerNode }} image: {{ .image | default "federatedai/eggroll" }} - imageTag: {{ .imageTag | default "v2.0.0-beta" }} + imageTag: {{ .imageTag | default "2.0.0-release" }} replicas: {{ .replicas | default 2 }} subPath: {{ .subPath }} storageClass: {{ .storageClass | default "nodemanager" }} @@ -366,7 +366,7 @@ modules: {{- with .client }} subPath: {{ .subPath }} image: {{ .image | default "federatedai/client" }} - imageTag: {{ .imageTag | default "v2.0.0-beta" }} + imageTag: {{ .imageTag | default "2.0.0-release" }} existingClaim: {{ .existingClaim }} storageClass: {{ .storageClass | default "client" }} accessMode: {{ .accessMode | default "ReadWriteOnce" }} @@ -432,7 +432,7 @@ modules: {{- with .fateboard }} type: {{ .type }} image: {{ .image | default "federatedai/fateboard" }} - imageTag: {{ .imageTag | default "v2.0.0-beta" }} + imageTag: {{ .imageTag | default "2.0.0-release" }} username: {{ .username }} password: {{ .password }} {{- with .nodeSelector }} @@ -455,7 +455,7 @@ modules: {{- if .master }} master: image: "{{ .master.image | default "federatedai/spark-master" }}" - imageTag: "{{ .master.imageTag | default "v2.0.0-beta" }}" + imageTag: "{{ 
.master.imageTag | default "2.0.0-release" }}" replicas: {{ .master.replicas }} {{- with .master.resources }} resources: @@ -479,7 +479,7 @@ modules: {{- if .worker }} worker: image: {{ .worker.image | default "federatedai/spark-worker" | quote }} - imageTag: {{ .worker.imageTag | default "v2.0.0-beta" | quote }} + imageTag: {{ .worker.imageTag | default "2.0.0-release" | quote }} replicas: {{ .worker.replicas }} {{- with .worker.resources }} resources: @@ -567,7 +567,7 @@ modules: {{- end }} type: {{ .type | default "ClusterIP" }} image: {{ .image | default "federatedai/nginx" | quote }} - imageTag: {{ .imageTag | default "v2.0.0-beta" | quote }} + imageTag: {{ .imageTag | default "2.0.0-release" | quote }} httpNodePort: {{ .httpNodePort }} grpcNodePort: {{ .grpcNodePort }} loadBalancerIP: {{ .loadBalancerIP }} diff --git a/helm-charts/FATE/values.yaml b/helm-charts/FATE/values.yaml index b9f8d5436..720d3099b 100644 --- a/helm-charts/FATE/values.yaml +++ b/helm-charts/FATE/values.yaml @@ -2,7 +2,7 @@ image: registry: isThridParty: - tag: v2.0.0-beta + tag: 2.0.0-release pullPolicy: IfNotPresent imagePullSecrets: # - name: @@ -12,8 +12,8 @@ partyName: fate-9999 # Computing : Eggroll, Spark, Spark_local computing: Eggroll -# Federation: Eggroll(computing: Eggroll), Pulsar/RabbitMQ(computing: Spark/Spark_local) -federation: Eggroll +# Federation: OSX(computing: Eggroll), Pulsar/RabbitMQ(computing: Spark/Spark_local) +federation: OSX # Storage: Eggroll(computing: Eggroll), HDFS(computing: Spark), LocalFS(computing: Spark_local) storage: Eggroll # Algorithm: Basic, NN, ALL @@ -85,11 +85,11 @@ persistence: enabled: false modules: - rollsite: + osx: include: true - ip: rollsite - image: "federatedai/eggroll" - imageTag: "v2.0.0-beta" + ip: osx + image: "federatedai/osx" + imageTag: "2.0.0-release" type: ClusterIP nodePort: 30091 loadBalancerIP: @@ -114,7 +114,7 @@ modules: include: true ip: rollsite image: "federatedai/eggroll" - imageTag: "v2.0.0-beta" + imageTag: 
"2.0.0-release" type: ClusterIP nodePort: 30091 loadBalancerIP: @@ -126,7 +126,7 @@ modules: include: true replicas: 1 image: "federatedai/fateflow" - imageTag: "v2.0.0-beta" + imageTag: "2.0.0-release" type: ClusterIP httpNodePort: 30097 grpcNodePort: 30092 @@ -188,7 +188,7 @@ modules: include: true ip: client image: "federatedai/client" - imageTag: "v2.0.0-beta" + imageTag: "2.0.0-release" type: ClusterIP nodeSelector: tolerations: @@ -203,7 +203,7 @@ modules: include: true ip: clustermanager image: "federatedai/eggroll" - imageTag: "v2.0.0-beta" + imageTag: "2.0.0-release" type: ClusterIP nodeSelector: tolerations: @@ -213,7 +213,7 @@ modules: include: true replicas: 2 image: "federatedai/eggroll" - imageTag: "v2.0.0-beta" + imageTag: "2.0.0-release" nodeSelector: tolerations: affinity: @@ -263,7 +263,7 @@ modules: include: true type: ClusterIP image: "federatedai/fateboard" - imageTag: "v2.0.0-beta" + imageTag: "2.0.0-release" username: admin password: admin nodeSelector: @@ -274,7 +274,7 @@ modules: include: true master: image: "federatedai/spark-master" - imageTag: "v2.0.0-beta" + imageTag: "2.0.0-release" replicas: 1 nodeSelector: tolerations: @@ -283,7 +283,7 @@ modules: nodePort: 30977 worker: image: "federatedai/spark-worker" - imageTag: "v2.0.0-beta" + imageTag: "2.0.0-release" replicas: 2 nodeSelector: tolerations: @@ -322,7 +322,7 @@ modules: nginx: include: true image: "federatedai/nginx" - imageTag: "v2.0.0-beta" + imageTag: "2.0.0-release" nodeSelector: tolerations: affinity: diff --git a/k8s-deploy/README.md b/k8s-deploy/README.md index 2ea3c458c..14fb00bfe 100644 --- a/k8s-deploy/README.md +++ b/k8s-deploy/README.md @@ -138,7 +138,7 @@ Deploying FATE with KubeFATE can support many different engine combinations. For We support such definition for: -1. Eggroll components: cluster manager, node manager and rollsite. +1. Eggroll components: cluster manager, node manager and osx. 2. Spark components: master and worker. 3. Rabbitmq. 4. Pulsar. 
@@ -172,7 +172,7 @@ SubJobs nodemanager-0 ModuleStatus: Available, SubJobStatus: Success, D 2022-04-12 07:34:09, EndTime: 2022-04-12 07:47:18 python ModuleStatus: Available, SubJobStatus: Success, Duration: 14m, StartTime: 2022-04-12 07:34:09, EndTime: 2022-04-12 07:48:14 - rollsite ModuleStatus: Available, SubJobStatus: Success, Duration: 13m, StartTime: + osx ModuleStatus: Available, SubJobStatus: Success, Duration: 13m, StartTime: 2022-04-12 07:34:09, EndTime: 2022-04-12 07:47:24 client ModuleStatus: Available, SubJobStatus: Success, Duration: 11m, StartTime: 2022-04-12 07:34:09, EndTime: 2022-04-12 07:45:22 @@ -192,13 +192,13 @@ UUID 24bb75ff-f636-4c64-8c04-1b9073f89a2f Name fate-9999 NameSpace fate-9999 ChartName fate -ChartVersion v2.0.0-beta +ChartVersion v2.0.0 Revision 1 Age 15m Status Running Spec algorithm: Basic chartName: fate - chartVersion: v2.0.0-beta + chartVersion: v2.0.0 computing: Eggroll device: CPU federation: Eggroll @@ -208,7 +208,7 @@ Spec algorithm: Basic istio: enabled: false modules: - - rollsite + - osx - clustermanager - nodemanager - mysql @@ -238,10 +238,10 @@ Info dashboard: mysql: Running nodemanager: Running nodemanager-eggrollpair: Running - rollsite: Running + osx: Running deployments: clustermanager: Available - rollsite: Available + osx: Available ``` ### Access the UI of FATEBoard and Notebook diff --git a/k8s-deploy/README_zh.md b/k8s-deploy/README_zh.md index 4ec16de98..a7fa69588 100644 --- a/k8s-deploy/README_zh.md +++ b/k8s-deploy/README_zh.md @@ -137,7 +137,7 @@ create job success, job id=d92d7a56-7002-46a4-9363-da9c7346e05a 我们支持定义如下部件的资源需求: -1. Eggroll部件:包括cluster manager,node manager和rollsite。 +1. Eggroll部件:包括cluster manager,node manager和osx。 2. Spark components:包括master和worker。 3. Rabbitmq。 4. 
Pulsar。 @@ -171,7 +171,7 @@ SubJobs nodemanager-0 ModuleStatus: Available, SubJobStatus: Success, D 2022-04-12 07:34:09, EndTime: 2022-04-12 07:47:18 python ModuleStatus: Available, SubJobStatus: Success, Duration: 14m, StartTime: 2022-04-12 07:34:09, EndTime: 2022-04-12 07:48:14 - rollsite ModuleStatus: Available, SubJobStatus: Success, Duration: 13m, StartTime: + osx ModuleStatus: Available, SubJobStatus: Success, Duration: 13m, StartTime: 2022-04-12 07:34:09, EndTime: 2022-04-12 07:47:24 client ModuleStatus: Available, SubJobStatus: Success, Duration: 11m, StartTime: 2022-04-12 07:34:09, EndTime: 2022-04-12 07:45:22 @@ -191,13 +191,13 @@ UUID 24bb75ff-f636-4c64-8c04-1b9073f89a2f Name fate-9999 NameSpace fate-9999 ChartName fate -ChartVersion v2.0.0-beta +ChartVersion v2.0.0 Revision 1 Age 15m Status Running Spec algorithm: Basic chartName: fate - chartVersion: v2.0.0-beta + chartVersion: v2.0.0 computing: Eggroll device: CPU federation: Eggroll @@ -207,7 +207,7 @@ Spec algorithm: Basic istio: enabled: false modules: - - rollsite + - osx - clustermanager - nodemanager - mysql @@ -237,10 +237,10 @@ Info dashboard: mysql: Running nodemanager: Running nodemanager-eggrollpair: Running - rollsite: Running + osx: Running deployments: clustermanager: Available - rollsite: Available + osx: Available ``` ### 访问 FATEBoard 和 Notebook UI diff --git a/k8s-deploy/cluster-spark-pulsar.yaml b/k8s-deploy/cluster-spark-pulsar.yaml index b1a25dded..fc9d8a835 100644 --- a/k8s-deploy/cluster-spark-pulsar.yaml +++ b/k8s-deploy/cluster-spark-pulsar.yaml @@ -1,7 +1,7 @@ name: fate-9999 namespace: fate-9999 chartName: fate -chartVersion: v2.0.0-beta +chartVersion: v2.0.0 partyId: 9999 registry: "" pullPolicy: @@ -55,7 +55,7 @@ skippedKeys: # fateboard: # image: "federatedai/fateboard" - # imageTag: "v2.0.0-beta" + # imageTag: "2.0.0-release" # type: ClusterIP # username: admin # password: admin @@ -66,7 +66,7 @@ skippedKeys: # Specify the fateflow service's properties # python: # image: 
"federatedai/fateflow" - # imageTag: "v2.0.0-beta" + # imageTag: "2.0.0-release" # type: NodePort # replicas: 1 # httpNodePort: 30097 @@ -171,7 +171,7 @@ skippedKeys: # spark: # master: # image: "federatedai/spark-master" - # imageTag: "v2.0.0-beta" + # imageTag: "2.0.0-release" # replicas: 1 # resources: # requests: @@ -187,7 +187,7 @@ skippedKeys: # nodePort: 30977 # worker: # image: "federatedai/spark-worker" - # imageTag: "v2.0.0-beta" + # imageTag: "2.0.0-release" # replicas: 2 # resources: # requests: @@ -227,7 +227,7 @@ skippedKeys: # size: # nginx: # image: "federatedai/nginx" - # imageTag: "v2.0.0-beta" + # imageTag: "2.0.0-release" # nodeSelector: # tolerations: # affinity: diff --git a/k8s-deploy/cluster-spark-rabbitmq.yaml b/k8s-deploy/cluster-spark-rabbitmq.yaml index f30cc4b7a..86ce4ed39 100644 --- a/k8s-deploy/cluster-spark-rabbitmq.yaml +++ b/k8s-deploy/cluster-spark-rabbitmq.yaml @@ -1,7 +1,7 @@ name: fate-9999 namespace: fate-9999 chartName: fate -chartVersion: v2.0.0-beta +chartVersion: v2.0.0 partyId: 9999 registry: "" pullPolicy: @@ -55,7 +55,7 @@ skippedKeys: # fateboard: # image: "federatedai/fateboard" - # imageTag: "v2.0.0-beta" + # imageTag: "2.0.0-release" # type: ClusterIP # username: admin # password: admin @@ -66,7 +66,7 @@ skippedKeys: # Specify the fateflow service's properties # python: # image: "federatedai/fateflow" - # imageTag: "v2.0.0-beta" + # imageTag: "2.0.0-release" # type: NodePort # replicas: 1 # httpNodePort: 30097 @@ -173,7 +173,7 @@ skippedKeys: # spark: # master: # image: "federatedai/spark-master" - # imageTag: "v2.0.0-beta" + # imageTag: "2.0.0-release" # replicas: 1 # resources: # requests: @@ -189,7 +189,7 @@ skippedKeys: # nodePort: 30977 # worker: # image: "federatedai/spark-worker" - # imageTag: "v2.0.0-beta" + # imageTag: "2.0.0-release" # replicas: 2 # resources: # requests: @@ -229,7 +229,7 @@ skippedKeys: # size: # nginx: # image: "federatedai/nginx" - # imageTag: "v2.0.0-beta" + # imageTag: "2.0.0-release" 
# nodeSelector: # tolerations: # affinity: diff --git a/k8s-deploy/cluster-spark-slim.yaml b/k8s-deploy/cluster-spark-slim.yaml index 0f4b0901d..af2816d3d 100644 --- a/k8s-deploy/cluster-spark-slim.yaml +++ b/k8s-deploy/cluster-spark-slim.yaml @@ -1,7 +1,7 @@ name: fate-9999 namespace: fate-9999 chartName: fate -chartVersion: v2.0.0-beta +chartVersion: v2.0.0 partyId: 9999 registry: "" pullPolicy: @@ -54,7 +54,7 @@ skippedKeys: # python: # image: "federatedai/fateflow" - # imageTag: "v2.0.0-beta" + # imageTag: "2.0.0-release" # type: NodePort # replicas: 1 # httpNodePort: 30097 @@ -159,7 +159,7 @@ skippedKeys: # nginx: # image: "federatedai/nginx" - # imageTag: "v2.0.0-beta" + # imageTag: "2.0.0-release" # nodeSelector: # tolerations: # affinity: diff --git a/k8s-deploy/cluster.yaml b/k8s-deploy/cluster.yaml index 20e4c51b8..fc2a1865d 100644 --- a/k8s-deploy/cluster.yaml +++ b/k8s-deploy/cluster.yaml @@ -1,7 +1,7 @@ name: fate-9999 namespace: fate-9999 chartName: fate -chartVersion: v2.0.0-beta +chartVersion: v2.0.0 partyId: 9999 registry: "" pullPolicy: @@ -14,7 +14,7 @@ podSecurityPolicy: enabled: false ingressClassName: nginx modules: - - rollsite + - osx - clustermanager - nodemanager - mysql @@ -24,8 +24,8 @@ modules: # Computing : Eggroll, Spark, Spark_local computing: Eggroll -# Federation: Eggroll(computing: Eggroll), Pulsar/RabbitMQ(computing: Spark/Spark_local) -federation: Eggroll +# Federation: OSX(computing: Eggroll), Pulsar/RabbitMQ(computing: Spark/Spark_local) +federation: OSX # Storage: Eggroll(computing: Eggroll), HDFS(computing: Spark), LocalFS(computing: Spark_local) storage: Eggroll # Algorithm: Basic, NN, ALL @@ -56,7 +56,7 @@ skippedKeys: # - name: party9999.pulsar.example.com # Specify rollsite properties -# rollsite: +# osx: # type: NodePort # nodePort: 30091 # loadBalancerIP: diff --git a/k8s-deploy/examples/party-10000/cluster-gpu.yaml b/k8s-deploy/examples/party-10000/cluster-gpu.yaml index d92afbbab..734e00185 100644 --- 
a/k8s-deploy/examples/party-10000/cluster-gpu.yaml +++ b/k8s-deploy/examples/party-10000/cluster-gpu.yaml @@ -1,7 +1,7 @@ name: fate-10000 namespace: fate-10000 chartName: fate -chartVersion: v2.0.0-beta +chartVersion: v2.0.0 partyId: 10000 registry: "" pullPolicy: diff --git a/k8s-deploy/examples/party-10000/cluster-spark-local-pulsar.yaml b/k8s-deploy/examples/party-10000/cluster-spark-local-pulsar.yaml index 74df16aba..0b5f465b9 100644 --- a/k8s-deploy/examples/party-10000/cluster-spark-local-pulsar.yaml +++ b/k8s-deploy/examples/party-10000/cluster-spark-local-pulsar.yaml @@ -1,7 +1,7 @@ name: fate-10000 namespace: fate-10000 chartName: fate -chartVersion: v2.0.0-beta +chartVersion: v2.0.0 partyId: 10000 registry: "" pullPolicy: diff --git a/k8s-deploy/examples/party-10000/cluster-spark-pulsar.yaml b/k8s-deploy/examples/party-10000/cluster-spark-pulsar.yaml index cfce46776..72c5cb0d3 100644 --- a/k8s-deploy/examples/party-10000/cluster-spark-pulsar.yaml +++ b/k8s-deploy/examples/party-10000/cluster-spark-pulsar.yaml @@ -1,7 +1,7 @@ name: fate-10000 namespace: fate-10000 chartName: fate -chartVersion: v2.0.0-beta +chartVersion: v2.0.0 partyId: 10000 registry: "" pullPolicy: diff --git a/k8s-deploy/examples/party-10000/cluster-spark-rabbitmq.yaml b/k8s-deploy/examples/party-10000/cluster-spark-rabbitmq.yaml index 576e392c1..c7be21866 100644 --- a/k8s-deploy/examples/party-10000/cluster-spark-rabbitmq.yaml +++ b/k8s-deploy/examples/party-10000/cluster-spark-rabbitmq.yaml @@ -1,7 +1,7 @@ name: fate-10000 namespace: fate-10000 chartName: fate -chartVersion: v2.0.0-beta +chartVersion: v2.0.0 partyId: 10000 registry: "" pullPolicy: diff --git a/k8s-deploy/examples/party-10000/cluster.yaml b/k8s-deploy/examples/party-10000/cluster.yaml index 24c7a1d56..20f4724d2 100644 --- a/k8s-deploy/examples/party-10000/cluster.yaml +++ b/k8s-deploy/examples/party-10000/cluster.yaml @@ -1,7 +1,7 @@ name: fate-10000 namespace: fate-10000 chartName: fate -chartVersion: v2.0.0-beta 
+chartVersion: v2.0.0 partyId: 10000 registry: "" pullPolicy: diff --git a/k8s-deploy/examples/party-9999/cluster-gpu.yaml b/k8s-deploy/examples/party-9999/cluster-gpu.yaml index 4dfa1ade6..ac664b267 100644 --- a/k8s-deploy/examples/party-9999/cluster-gpu.yaml +++ b/k8s-deploy/examples/party-9999/cluster-gpu.yaml @@ -1,7 +1,7 @@ name: fate-9999 namespace: fate-9999 chartName: fate -chartVersion: v2.0.0-beta +chartVersion: v2.0.0 partyId: 9999 registry: "" pullPolicy: diff --git a/k8s-deploy/examples/party-9999/cluster-spark-local-pulsar.yaml b/k8s-deploy/examples/party-9999/cluster-spark-local-pulsar.yaml index ffb220a30..3feb4328c 100644 --- a/k8s-deploy/examples/party-9999/cluster-spark-local-pulsar.yaml +++ b/k8s-deploy/examples/party-9999/cluster-spark-local-pulsar.yaml @@ -1,7 +1,7 @@ name: fate-9999 namespace: fate-9999 chartName: fate -chartVersion: v2.0.0-beta +chartVersion: v2.0.0 partyId: 9999 registry: "" pullPolicy: diff --git a/k8s-deploy/examples/party-9999/cluster-spark-pulsar.yaml b/k8s-deploy/examples/party-9999/cluster-spark-pulsar.yaml index 0a55e76e0..2bcca6d9f 100644 --- a/k8s-deploy/examples/party-9999/cluster-spark-pulsar.yaml +++ b/k8s-deploy/examples/party-9999/cluster-spark-pulsar.yaml @@ -1,7 +1,7 @@ name: fate-9999 namespace: fate-9999 chartName: fate -chartVersion: v2.0.0-beta +chartVersion: v2.0.0 partyId: 9999 registry: "" pullPolicy: diff --git a/k8s-deploy/examples/party-9999/cluster-spark-rabbitmq.yaml b/k8s-deploy/examples/party-9999/cluster-spark-rabbitmq.yaml index 317a67f5a..c0be61895 100644 --- a/k8s-deploy/examples/party-9999/cluster-spark-rabbitmq.yaml +++ b/k8s-deploy/examples/party-9999/cluster-spark-rabbitmq.yaml @@ -1,7 +1,7 @@ name: fate-9999 namespace: fate-9999 chartName: fate -chartVersion: v2.0.0-beta +chartVersion: v2.0.0 partyId: 9999 registry: "" pullPolicy: diff --git a/k8s-deploy/examples/party-9999/cluster.yaml b/k8s-deploy/examples/party-9999/cluster.yaml index 5a51e0e2e..dc2ad70fc 100644 --- 
a/k8s-deploy/examples/party-9999/cluster.yaml +++ b/k8s-deploy/examples/party-9999/cluster.yaml @@ -1,7 +1,7 @@ name: fate-9999 namespace: fate-9999 chartName: fate -chartVersion: v2.0.0-beta +chartVersion: v2.0.0 partyId: 9999 registry: "" pullPolicy: diff --git a/k8s-deploy/examples/party.config b/k8s-deploy/examples/party.config index 7e5e0bf25..3979ca09b 100644 --- a/k8s-deploy/examples/party.config +++ b/k8s-deploy/examples/party.config @@ -1,5 +1,5 @@ -fate_chartVersion=v2.0.0-beta -fate_imageTAG=v2.0.0-beta +fate_chartVersion=v2.0.0 +fate_imageTAG=2.0.0-release fate_serving_chartVersion=v2.1.6 fate_serving_imageTAG=2.1.6-release party_9999_IP=192.168.9.1