diff --git a/README.md b/README.md index 9096b0895..dfd40a203 100644 --- a/README.md +++ b/README.md @@ -20,7 +20,7 @@ KubeFATE |-- docker-deploy |-- k8s-deploy ``` -`docker-deploy`: The primary objective is to swiftly establish a federated learning environment. Docker Compose allows for the deployment of FATE components on a single host. Leveraging Docker Compose, FATE can be configured for multi-party environments, facilitating collaborative federated setups. For further information, kindly refer to the [Docker Compose Deployment](./docker-deploy/README.md) documentation. +`docker-deploy`: The primary objective is to swiftly establish a federated learning environment. Docker Compose allows for the deployment of FATE components on a single host. Leveraging Docker Compose, FATE can be configured for multi-party environments, facilitating collaborative federated setups. For further information, kindly refer to the [Docker Compose Deployment](./docker-deploy/README_old.md) documentation. `k8s-deploy`: The deployment approach is specifically tailored for production environments, providing a robust and scalable solution. Its design offers exceptional flexibility, enabling seamless operation of FATE clusters across various environments with ease and efficiency. 
diff --git a/docker-deploy/.env b/docker-deploy/.env index 4d0c55d34..9ac4a2900 100644 --- a/docker-deploy/.env +++ b/docker-deploy/.env @@ -8,35 +8,35 @@ SSH_PORT=22 # SSH_PORT: port of SSH, default 22 -KubeFATE_Version=v2.0.0-release +KubeFATE_Version=v2.2.0-release # components version FATEFlow_IMAGE="federatedai/fateflow" -FATEFlow_IMAGE_TAG="2.0.0-release" +FATEFlow_IMAGE_TAG="2.2.0-release" FATEBoard_IMAGE="federatedai/fateboard" -FATEBoard_IMAGE_TAG="2.0.0-release" +FATEBoard_IMAGE_TAG="2.1.1-release" MySQL_IMAGE="mysql" MySQL_IMAGE_TAG="8.0.28" Client_IMAGE="federatedai/client" -Client_IMAGE_TAG="2.0.0-release" +Client_IMAGE_TAG="2.1.0-release" EGGRoll_IMAGE="federatedai/eggroll" -EGGRoll_IMAGE_TAG="2.0.0-release" +EGGRoll_IMAGE_TAG="3.2.0-release" OSX_IMAGE="federatedai/osx" -OSX_IMAGE_TAG="2.0.0-release" +OSX_IMAGE_TAG="2.2.0-release" Nginx_IMAGE="federatedai/nginx" -Nginx_IMAGE_TAG="2.0.0-release" +Nginx_IMAGE_TAG="2.1.0-release" RabbitMQ_IMAGE="federatedai/rabbitmq" RabbitMQ_IMAGE_TAG="3.8.3-management" Pulsar_IMAGE="federatedai/pulsar" Pulsar_IMAGE_TAG="2.10.2" Hadoop_NameNode_IMAGE="federatedai/hadoop-namenode" -Hadoop_NameNode_IMAGE_TAG="2.0.0-hadoop3.2.1-java8" +Hadoop_NameNode_IMAGE_TAG="2.1.0-hadoop3.2.1-java8" Hadoop_DataNode_IMAGE="federatedai/hadoop-datanode" -Hadoop_DataNode_IMAGE_TAG="2.0.0-hadoop3.2.1-java8" +Hadoop_DataNode_IMAGE_TAG="2.1.0-hadoop3.2.1-java8" Spark_Master_IMAGE="federatedai/spark-master" -Spark_Master_IMAGE_TAG="2.0.0-release" +Spark_Master_IMAGE_TAG="2.1.0-release" Spark_Worker_IMAGE="federatedai/spark-worker" -Spark_Worker_IMAGE_TAG="2.0.0-release" \ No newline at end of file +Spark_Worker_IMAGE_TAG="2.1.0-release" \ No newline at end of file diff --git a/docker-deploy/README.md b/docker-deploy/README_old.md similarity index 100% rename from docker-deploy/README.md rename to docker-deploy/README_old.md diff --git a/docker-deploy/README_zh.md b/docker-deploy/README_zh.md index eafd25765..74d391afe 100644 --- 
a/docker-deploy/README_zh.md +++ b/docker-deploy/README_zh.md @@ -45,13 +45,10 @@ RegistryURI=hub.c.163.com 如果运行机没有FATE组件的镜像,可以通过以下命令从Docker Hub获取镜像。FATE镜像的版本``可在[release页面](https://github.com/FederatedAI/FATE/releases)上查看,其中serving镜像的版本信息在[这个页面](https://github.com/FederatedAI/FATE-Serving/releases): ```bash -docker pull federatedai/eggroll:-release -docker pull federatedai/fateboard:-release -docker pull federatedai/fateflow:-release -docker pull federatedai/serving-server:-release -docker pull federatedai/serving-proxy:-release -docker pull federatedai/serving-admin:-release -docker pull bitnami/zookeeper:3.7.0 +docker pull federatedai/eggroll:3.2.0-release +docker pull federatedai/fateflow:2.2.0-release +docker pull federatedai/osx:2.2.0-release +docker pull federatedai/fateboard:2.1.1-release docker pull mysql:8.0.28 ``` @@ -60,14 +57,10 @@ docker pull mysql:8.0.28 ```bash $ docker images REPOSITORY TAG -federatedai/eggroll -release -federatedai/fateboard -release -federatedai/fateflow -release -federatedai/client -release -federatedai/serving-server -release -federatedai/serving-proxy -release -federatedai/serving-admin -release -bitnami/zookeeper 3.7.0 +federatedai/fateflow 2.2.0-release +federatedai/eggroll 3.2.0-release +federatedai/osx 2.2.0-release +federatedai/fateboard 2.1.1-release mysql 8.0.28 ``` @@ -105,23 +98,39 @@ party_list=(10000 9999) party_ip_list=(192.168.7.1 192.168.7.2) serving_ip_list=(192.168.7.1 192.168.7.2) +# Engines: +# Computing : Eggroll, Spark, Spark_local computing=Eggroll -federation=Eggroll +# Federation: OSX(computing: Eggroll/Spark/Spark_local), Pulsar/RabbitMQ(computing: Spark/Spark_local) +federation=OSX +# Storage: Eggroll(computing: Eggroll), HDFS(computing: Spark), LocalFS(computing: Spark_local) storage=Eggroll - +# Algorithm: Basic, NN, ALL algorithm=Basic -device=IPCL - -compute_core=4 - -...... 
+# Device: CPU, IPCL, GPU +device=CPU + +# spark and eggroll +compute_core=16 + +# You only need to configure this parameter when you want to use the GPU, the default value is 1 +gpu_count=0 + +# modify if you are going to use an external db +mysql_ip=mysql +mysql_user=fate +mysql_password=fate_dev +mysql_db=fate_flow +serverTimezone=UTC + +name_node=hdfs://namenode:9000 + +# Define fateboard login information +fateboard_username=admin +fateboard_password=admin ``` -* 使用Spark+Rabbitmq的部署方式的文档可以参考[这里](../docs/FATE_On_Spark.md). -* 使用Spark+Pulsar的部署方式的文档可以参考[这里](../docs/FATE_On_Spark_With_Pulsar.md). -* 使用Spark+local Pulsar的部署方式的文档可以参考[这里](TBD) - 使用Docker-compose部署FATE可以支持多种种不同的类型引擎的组合(对computing federation storage的选择),关于不同类型的FATE的更多细节查看: [不同类型FATE的架构介绍](../docs/Introduction_to_Engine_Architecture_zh.md)。 `algorithm`和`device`的配置可以查看这里[FATE_Algorithm_and_Computational_Acceleration_Selection.md](../docs/FATE_Algorithm_and_Computational_Acceleration_Selection.md) @@ -152,23 +161,6 @@ total 0 drwxr-xr-x. 
2 fate docker 6 May 27 00:51 fate ``` -### GPU支持 - -从v1.11.1开始docker compose部署支持使用GPU的FATE部署,如果要使用GPU,你需要先搞定GPU的docker环境。可以参考docker的官方文档()。 - -要使用GPU需要修改配置,这两个都需要修改 - -```sh -algorithm=NN -device=GPU - -gpu_count=1 -``` - -FATE GPU的使用只有fateflow组件,所以每个Party最少需要有一个GPU。 - -*gpu_count会映射为count,参考 [Docker compose GPU support](https://docs.docker.com/compose/gpu-support/)* - ### 执行部署脚本 **注意:**在运行以下命令之前,所有目标主机必须 @@ -186,9 +178,9 @@ FATE GPU的使用只有fateflow组件,所以每个Party最少需要有一个GP bash ./generate_config.sh # 生成部署文件 ``` -脚本将会生成10000、9999两个组织(Party)的部署文件,然后打包成tar文件。接着把tar文件`confs-<party-id>.tar`、`serving-<party-id>.tar`分别复制到party对应的主机上并解包,解包后的文件默认在`/data/projects/fate`目录下。然后脚本将远程登录到这些主机并使用docker compose命令启动FATE实例。 +脚本将会生成10000、9999两个组织(Party)的部署文件,然后打包成tar文件。接着把tar文件`confs-<party-id>.tar`复制到party对应的主机上并解包,解包后的文件默认在`/data/projects/fate`目录下。然后脚本将远程登录到这些主机并使用docker compose命令启动FATE实例。 -默认情况下,脚本会同时启动训练和服务集群。 如果您需要单独启动它们,请将 `--training` 或 `--serving` 添加到 `docker_deploy.sh` 中,如下所示。 +默认情况下,脚本会启动训练集群。如果您需要显式指定,请将 `--training` 添加到 `docker_deploy.sh` 中,如下所示。 (可选)要部署各方训练集群,请使用以下命令: @@ -196,12 +188,6 @@ bash ./generate_config.sh # 生成部署文件 bash ./docker_deploy.sh all --training ``` -(可选)要部署各方服务集群,请使用以下命令: - -```bash -bash ./docker_deploy.sh all --serving -``` - (可选)要将 FATE 部署到单个目标主机,请使用以下命令和参与方的 ID(下例中为 10000): ```bash @@ -224,20 +210,19 @@ ssh fate@192.168.7.1 ```bash cd /data/projects/fate/confs-10000 -docker compose ps +docker-compose ps ``` 输出显示如下,若各个组件状态都是`Up`状态,并且fateflow的状态还是(healthy),说明部署成功。 ```bash NAME IMAGE COMMAND SERVICE CREATED STATUS PORTS -confs-10000-client-1 federatedai/client:2.0.0-release "bash -c 'pipeline i…" client About a minute ago Up About a minute 0.0.0.0:20000->20000/tcp, :::20000->20000/tcp -confs-10000-clustermanager-1 federatedai/eggroll:2.0.0-release "/tini -- bash -c 'j…" clustermanager About a minute ago Up About a minute 4670/tcp -confs-10000-fateboard-1 federatedai/fateboard:2.0.0-release "/bin/sh -c 'java -D…" fateboard About a minute ago Up About a minute 0.0.0.0:8080->8080/tcp, 
:::8080->8080/tcp -confs-10000-fateflow-1 federatedai/fateflow:2.0.0-release "/bin/bash -c 'set -…" fateflow About a minute ago Up About a minute (healthy) 0.0.0.0:9360->9360/tcp, :::9360->9360/tcp, 0.0.0.0:9380->9380/tcp, :::9380->9380/tcp -confs-10000-mysql-1 mysql:8.0.28 "docker-entrypoint.s…" mysql About a minute ago Up About a minute 3306/tcp, 33060/tcp -confs-10000-nodemanager-1 federatedai/eggroll:2.0.0-release "/tini -- bash -c 'j…" nodemanager About a minute ago Up About a minute 4671/tcp -confs-10000-osx-1 federatedai/osx:2.0.0-release "/tini -- bash -c 'j…" osx About a minute ago Up About a minute 0.0.0.0:9370->9370/tcp, :::9370->9370/tcp +confs-10000-clustermanager-1 federatedai/eggroll:3.2.0-release "/tini -- bash -c 'j…" clustermanager About a minute ago Up About a minute 4670/tcp +confs-10000-fateflow-1 federatedai/fateflow:2.2.0-release "/bin/bash -c 'set -…" fateflow About a minute ago Up About a minute (healthy) 192.168.7.1:9360->9360/tcp, :::9360->9360/tcp, 192.168.7.1:9380->9380/tcp, :::9380->9380/tcp +confs-10000-mysql-1 mysql:8.0.28 "docker-entrypoint.s…" mysql About a minute ago Up About a minute 3306/tcp, 33060/tcp +confs-10000-nodemanager-1 federatedai/eggroll:3.2.0-release "/tini -- bash -c 'j…" nodemanager About a minute ago Up About a minute 4671/tcp +confs-10000-osx-1 federatedai/osx:2.2.0-release "/tini -- bash -c 'j…" osx About a minute ago Up About a minute 192.168.7.1:9370->9370/tcp, :::9370->9370/tcp +confs-10000-fateboard-1 federatedai/fateboard:2.1.1-release "sh -c 'java -Dsprin…" fateboard About a minute ago Up About a minute 192.168.7.1:8080->8080/tcp ``` ### 验证部署 @@ -249,8 +234,8 @@ docker-compose上的FATE启动成功之后需要验证各个服务是否都正 ```bash # 在192.168.7.1上执行下列命令 -# 进入client组件容器内部 -$ docker compose exec client bash +# 进入fateflow组件容器内部 +$ docker-compose exec fateflow bash # toy 验证 $ flow test toy --guest-party-id 10000 --host-party-id 9999 ``` @@ -258,281 +243,148 @@ $ flow test toy --guest-party-id 10000 --host-party-id 9999 
如果测试通过,屏幕将显示类似如下消息: ```bash -"2019-08-29 07:21:25,353 - secure_add_guest.py[line:96] - INFO: begin to init parameters of secure add example guest" -"2019-08-29 07:21:25,354 - secure_add_guest.py[line:99] - INFO: begin to make guest data" -"2019-08-29 07:21:26,225 - secure_add_guest.py[line:102] - INFO: split data into two random parts" -"2019-08-29 07:21:29,140 - secure_add_guest.py[line:105] - INFO: share one random part data to host" -"2019-08-29 07:21:29,237 - secure_add_guest.py[line:108] - INFO: get share of one random part data from host" -"2019-08-29 07:21:33,073 - secure_add_guest.py[line:111] - INFO: begin to get sum of guest and host" -"2019-08-29 07:21:33,920 - secure_add_guest.py[line:114] - INFO: receive host sum from guest" -"2019-08-29 07:21:34,118 - secure_add_guest.py[line:121] - INFO: success to calculate secure_sum, it is 2000.0000000000002" +toy test job xxxxx is success ``` -### 验证Serving-Service功能 +### 上传数据,发起任务 #### Host方操作 -##### 进入party10000 client容器 +##### 进入party10000 fateflow容器 ```bash cd /data/projects/fate/confs-10000 -docker compose exec client bash +docker-compose exec fateflow bash ``` ##### 上传host数据 - +执行python脚本,上传数据 ```bash -flow data upload -c fateflow/examples/upload/upload_host.json +# 上传数据(单边的, 双边需要在另一方再次执行) +from fate_client.pipeline import FateFlowPipeline + +guest_data_path="/data/projects/fate/examples/data/breast_hetero_guest.csv" +host_data_path="/data/projects/fate/examples/data/breast_hetero_host.csv" + +data_pipeline = FateFlowPipeline().set_parties(local="0") +guest_meta = { + "delimiter": ",", "dtype": "float64", "label_type": "int64","label_name": "y", "match_id_name": "id" + } +host_meta = { + "delimiter": ",", "input_format": "dense", "match_id_name": "id" + } +data_pipeline.transform_local_file_to_dataframe(file=guest_data_path, namespace="experiment", name="breast_hetero_guest", + meta=guest_meta, head=True, extend_sid=True) +data_pipeline.transform_local_file_to_dataframe(file=host_data_path, 
namespace="experiment", name="breast_hetero_host", + meta=host_meta, head=True, extend_sid=True) ``` #### Guest方操作 -##### 进入party9999 client容器 +##### 进入party9999 fateflow容器 ```bash cd /data/projects/fate/confs-9999 -docker compose exec client bash +docker-compose exec fateflow bash ``` ##### 上传guest数据 - +执行python脚本,上传数据 ```bash -flow data upload -c fateflow/examples/upload/upload_guest.json +# 上传数据(单边的, 双边需要在另一方再次执行) +from fate_client.pipeline import FateFlowPipeline + +guest_data_path="/data/projects/fate/examples/data/breast_hetero_guest.csv" +host_data_path="/data/projects/fate/examples/data/breast_hetero_host.csv" + +data_pipeline = FateFlowPipeline().set_parties(local="0") +guest_meta = { + "delimiter": ",", "dtype": "float64", "label_type": "int64","label_name": "y", "match_id_name": "id" + } +host_meta = { + "delimiter": ",", "input_format": "dense", "match_id_name": "id" + } +data_pipeline.transform_local_file_to_dataframe(file=guest_data_path, namespace="experiment", name="breast_hetero_guest", + meta=guest_meta, head=True, extend_sid=True) +data_pipeline.transform_local_file_to_dataframe(file=host_data_path, namespace="experiment", name="breast_hetero_host", + meta=host_meta, head=True, extend_sid=True) ``` ##### 提交任务 - -```bash -flow job submit -d fateflow/examples/lr/test_hetero_lr_job_dsl.json -c fateflow/examples/lr/test_hetero_lr_job_conf.json -``` - -output: - -```json -{ - "data": { - "board_url": "http://fateboard:8080/index.html#/dashboard?job_id=202111230933232084530&role=guest&party_id=9999", - "code": 0, - "dsl_path": "/data/projects/fate/fate_flow/jobs/202111230933232084530/job_dsl.json", - "job_id": "202111230933232084530", - "logs_directory": "/data/projects/fate/fate_flow/logs/202111230933232084530", - "message": "success", - "model_info": { - "model_id": "arbiter-10000#guest-9999#host-10000#model", - "model_version": "202111230933232084530" - }, - "pipeline_dsl_path": 
"/data/projects/fate/fate_flow/jobs/202111230933232084530/pipeline_dsl.json", - "runtime_conf_on_party_path": "/data/projects/fate/fate_flow/jobs/202111230933232084530/guest/9999/job_runtime_on_party_conf.json", - "runtime_conf_path": "/data/projects/fate/fate_flow/jobs/202111230933232084530/job_runtime_conf.json", - "train_runtime_conf_path": "/data/projects/fate/fate_flow/jobs/202111230933232084530/train_runtime_conf.json" - }, - "jobId": "202111230933232084530", - "retcode": 0, - "retmsg": "success" -} -``` - -##### 查看训练任务状态 - -```bash -flow task query -r guest -j 202111230933232084530 | grep -w f_status -``` - -output: - -```bash - "f_status": "success", - "f_status": "waiting", - "f_status": "running", - "f_status": "waiting", - "f_status": "waiting", - "f_status": "success", - "f_status": "success", -``` - -等到所有的`waiting`状态变为`success`. - -##### 部署模型 - -```bash -flow model deploy --model-id arbiter-10000#guest-9999#host-10000#model --model-version 202111230933232084530 -``` - -```json -{ - "data": { - "arbiter": { - "10000": 0 - }, - "detail": { - "arbiter": { - "10000": { - "retcode": 0, - "retmsg": "deploy model of role arbiter 10000 success" - } - }, - "guest": { - "9999": { - "retcode": 0, - "retmsg": "deploy model of role guest 9999 success" - } - }, - "host": { - "10000": { - "retcode": 0, - "retmsg": "deploy model of role host 10000 success" - } - } - }, - "guest": { - "9999": 0 - }, - "host": { - "10000": 0 - }, - "model_id": "arbiter-10000#guest-9999#host-10000#model", - "model_version": "202111230954255210490" - }, - "retcode": 0, - "retmsg": "success" -} -``` - -*后面需要用到的`model_version`都是这一步得到的`"model_version": "202111230954255210490"`* - -##### 修改加载模型的配置 - -```bash -cat > fateflow/examples/model/publish_load_model.json < fateflow/examples/model/bind_model_service.json </ # 删除docker-compose部署文件 #### CPU指令集问题 -解决办法:查看[wiki](https://github.com/FederatedAI/KubeFATE/wiki/KubeFATE)页面的storage-service部分 
+解决办法:查看[wiki](https://github.com/FederatedAI/KubeFATE/wiki/KubeFATE)页面的storage-service部分。 diff --git a/docker-deploy/docker_deploy.sh b/docker-deploy/docker_deploy.sh index 335b6e9b4..b7949fe14 100755 --- a/docker-deploy/docker_deploy.sh +++ b/docker-deploy/docker_deploy.sh @@ -20,7 +20,6 @@ WORKINGDIR=$(pwd) # fetch fate-python image source ${WORKINGDIR}/.env source ${WORKINGDIR}/parties.conf - cd ${WORKINGDIR} Deploy() { @@ -143,7 +142,6 @@ DeployPartyInternal() { echo "Unable to find Party: $target_party_id, please check you input." return 1 fi - if [ "$3" != "" ]; then user=$3 fi @@ -156,20 +154,20 @@ DeployPartyInternal() { scp -P ${SSH_PORT} ${WORKINGDIR}/outputs/confs-$target_party_id.tar $user@$target_party_ip:~/ #rm -f ${WORKINGDIR}/outputs/confs-$target_party_id.tar echo "$target_party_ip training cluster copy is ok!" - ssh -p ${SSH_PORT} -tt $user@$target_party_ip <#${fateboard_username}#g" ./confs-"$party_id"/confs/fateboard/conf/application.properties - sed -i "s##${fateboard_password}#g" ./confs-"$party_id"/confs/fateboard/conf/application.properties + sed -i "s##${fateboard_username}#g" ./confs-"$party_id"/confs/fateboard/conf/application.properties + sed -i "s##${fateboard_password}#g" ./confs-"$party_id"/confs/fateboard/conf/application.properties + echo fateboard module of "$party_id" done! 
# mysql @@ -521,26 +522,27 @@ EOF module_name=exchange cd ${WORKINGDIR} rm -rf confs-exchange/ - mkdir -p confs-exchange/conf/ + mkdir -p confs-exchange/conf/eggroll + mkdir -p confs-exchange/conf/osx cp ${WORKINGDIR}/.env confs-exchange/ cp training_template/docker-compose-exchange.yml confs-exchange/docker-compose.yml - cp -r training_template/backends/eggroll/conf/* confs-exchange/conf/ - + cp -r training_template/backends/eggroll/conf/* confs-exchange/conf/eggroll + cp -r training_template/backends/osx/conf/* confs-exchange/conf/osx if [ "$RegistryURI" != "" ]; then sed -i 's#federatedai#${RegistryURI}/federatedai#g' ./confs-exchange/docker-compose.yml fi - sed -i "s##${proxy_ip}#g" ./confs-exchange/conf/eggroll.properties - sed -i "s##${proxy_port}#g" ./confs-exchange/conf/eggroll.properties - sed -i "s##exchange#g" ./confs-exchange/conf/eggroll.properties - sed -i "s/coordinator=.*/coordinator=exchange/g" ./confs-exchange/conf/eggroll.properties - sed -i "s/ip=.*/ip=0.0.0.0/g" ./confs-exchange/conf/eggroll.properties - - cat >./confs-exchange/conf/route_table.json <#${proxy_ip}#g" ./confs-exchange/conf/eggroll/eggroll.properties + sed -i "s##${proxy_port}#g" ./confs-exchange/conf/eggroll/eggroll.properties + sed -i "s##exchange#g" ./confs-exchange/conf/eggroll/eggroll.properties + sed -i "s/coordinator=.*/coordinator=exchange/g" ./confs-exchange/conf/eggroll/eggroll.properties + sed -i "s/ip=.*/ip=0.0.0.0/g" ./confs-exchange/conf/eggroll/eggroll.properties + cat >./confs-exchange/conf/osx/broker/route_table.json <