diff --git a/PULL_REQUEST_TEMPLATE.md b/PULL_REQUEST_TEMPLATE.md index 4f3872671..53f378d8c 100644 --- a/PULL_REQUEST_TEMPLATE.md +++ b/PULL_REQUEST_TEMPLATE.md @@ -3,6 +3,7 @@ Fixes ISSUE #xxx ## Description 1. Tell the story why you need to make this change from the user's perspective. 2. What will be the pain point if you don't make this change? +3. In summary, what did you change to reach your goal? ## Tests ### Before fix diff --git a/docker-deploy/.env b/docker-deploy/.env index bb1227020..cef87147c 100644 --- a/docker-deploy/.env +++ b/docker-deploy/.env @@ -1,5 +1,5 @@ RegistryURI= -TAG=1.9.2-release +TAG=1.10.0-release SERVING_TAG=2.1.6-release SSH_PORT=22 diff --git a/docker-deploy/README.md b/docker-deploy/README.md index 467c55792..2a2ab5c36 100644 --- a/docker-deploy/README.md +++ b/docker-deploy/README.md @@ -76,7 +76,7 @@ compute_core=4 ``` -* For more details about FATE on Spark with Rebbitmq please refer to this [document](../docs/FATE_On_Spark.md). +* For more details about FATE on Spark with RabbitMQ please refer to this [document](../docs/FATE_On_Spark.md). 
* For more details about FATE on Spark with Pulsar, refer to this [document](../docs/FATE_On_Spark_With_Pulsar.md) * For more details about FATE on Spark with local pulsar, refer to this [document](placeholder) @@ -166,12 +166,12 @@ CONTAINER ID IMAGE COMMAND 3dca43f3c9d5 federatedai/serving-admin:2.1.5-release "/bin/sh -c 'java -c…" 5 minutes ago Up 5 minutes 0.0.0.0:8350->8350/tcp, :::8350->8350/tcp serving-9999_serving-admin_1 fe924918509b federatedai/serving-proxy:2.1.5-release "/bin/sh -c 'java -D…" 5 minutes ago Up 5 minutes 0.0.0.0:8059->8059/tcp, :::8059->8059/tcp, 0.0.0.0:8869->8869/tcp, :::8869->8869/tcp, 8879/tcp serving-9999_serving-proxy_1 b62ed8ba42b7 bitnami/zookeeper:3.7.0 "/opt/bitnami/script…" 5 minutes ago Up 5 minutes 0.0.0.0:2181->2181/tcp, :::2181->2181/tcp, 8080/tcp, 0.0.0.0:49226->2888/tcp, :::49226->2888/tcp, 0.0.0.0:49225->3888/tcp, :::49225->3888/tcp serving-9999_serving-zookeeper_1 -3c643324066f federatedai/client:1.9.2-release "/bin/sh -c 'flow in…" 5 minutes ago Up 5 minutes 0.0.0.0:20000->20000/tcp, :::20000->20000/tcp confs-9999_client_1 -3fe0af1ebd71 federatedai/fateboard:1.9.2-release "/bin/sh -c 'java -D…" 5 minutes ago Up 5 minutes 0.0.0.0:8080->8080/tcp, :::8080->8080/tcp confs-9999_fateboard_1 -635b7d99357e federatedai/fateflow:1.9.2-release "container-entrypoin…" 5 minutes ago Up 5 minutes (healthy) 0.0.0.0:9360->9360/tcp, :::9360->9360/tcp, 8080/tcp, 0.0.0.0:9380->9380/tcp, :::9380->9380/tcp confs-9999_fateflow_1 -8b515f08add3 federatedai/eggroll:1.9.2-release "/tini -- bash -c 'j…" 5 minutes ago Up 5 minutes 8080/tcp, 0.0.0.0:9370->9370/tcp, :::9370->9370/tcp confs-9999_rollsite_1 -108cc061c191 federatedai/eggroll:1.9.2-release "/tini -- bash -c 'j…" 5 minutes ago Up 5 minutes 4670/tcp, 8080/tcp confs-9999_clustermanager_1 -f10575e76899 federatedai/eggroll:1.9.2-release "/tini -- bash -c 'j…" 5 minutes ago Up 5 minutes 4671/tcp, 8080/tcp confs-9999_nodemanager_1 +3c643324066f federatedai/client:1.10.0-release "/bin/sh -c 
'flow in…" 5 minutes ago Up 5 minutes 0.0.0.0:20000->20000/tcp, :::20000->20000/tcp confs-9999_client_1 +3fe0af1ebd71 federatedai/fateboard:1.10.0-release "/bin/sh -c 'java -D…" 5 minutes ago Up 5 minutes 0.0.0.0:8080->8080/tcp, :::8080->8080/tcp confs-9999_fateboard_1 +635b7d99357e federatedai/fateflow:1.10.0-release "container-entrypoin…" 5 minutes ago Up 5 minutes (healthy) 0.0.0.0:9360->9360/tcp, :::9360->9360/tcp, 8080/tcp, 0.0.0.0:9380->9380/tcp, :::9380->9380/tcp confs-9999_fateflow_1 +8b515f08add3 federatedai/eggroll:1.10.0-release "/tini -- bash -c 'j…" 5 minutes ago Up 5 minutes 8080/tcp, 0.0.0.0:9370->9370/tcp, :::9370->9370/tcp confs-9999_rollsite_1 +108cc061c191 federatedai/eggroll:1.10.0-release "/tini -- bash -c 'j…" 5 minutes ago Up 5 minutes 4670/tcp, 8080/tcp confs-9999_clustermanager_1 +f10575e76899 federatedai/eggroll:1.10.0-release "/tini -- bash -c 'j…" 5 minutes ago Up 5 minutes 4671/tcp, 8080/tcp confs-9999_nodemanager_1 aa0a0002de93 mysql:8.0.28 "docker-entrypoint.s…" 5 minutes ago Up 5 minutes 3306/tcp, 33060/tcp confs-9999_mysql_1 ``` @@ -207,21 +207,6 @@ If the test passed, the output may look like the following: docker exec -it confs-10000_client_1 bash ``` -##### Modifying examples/upload_host.json - -```bash -cat > fateflow/examples/upload/upload_host.json < fateflow/examples/upload/upload_guest.json < fateflow/examples/lr/test_hetero_lr_job_conf.json < fateflow/examples/lr/test_hetero_lr_job_dsl.json <8350/tcp, :::8350->8350/tcp serving-9999_serving-admin_1 fe924918509b federatedai/serving-proxy:2.1.5-release "/bin/sh -c 'java -D…" 5 minutes ago Up 5 minutes 0.0.0.0:8059->8059/tcp, :::8059->8059/tcp, 0.0.0.0:8869->8869/tcp, :::8869->8869/tcp, 8879/tcp serving-9999_serving-proxy_1 b62ed8ba42b7 bitnami/zookeeper:3.7.0 "/opt/bitnami/script…" 5 minutes ago Up 5 minutes 0.0.0.0:2181->2181/tcp, :::2181->2181/tcp, 8080/tcp, 0.0.0.0:49226->2888/tcp, :::49226->2888/tcp, 0.0.0.0:49225->3888/tcp, :::49225->3888/tcp 
serving-9999_serving-zookeeper_1 -3c643324066f federatedai/client:1.9.2-release "/bin/sh -c 'flow in…" 5 minutes ago Up 5 minutes 0.0.0.0:20000->20000/tcp, :::20000->20000/tcp confs-9999_client_1 -3fe0af1ebd71 federatedai/fateboard:1.9.2-release "/bin/sh -c 'java -D…" 5 minutes ago Up 5 minutes 0.0.0.0:8080->8080/tcp, :::8080->8080/tcp confs-9999_fateboard_1 -635b7d99357e federatedai/fateflow:1.9.2-release "container-entrypoin…" 5 minutes ago Up 5 minutes (healthy) 0.0.0.0:9360->9360/tcp, :::9360->9360/tcp, 8080/tcp, 0.0.0.0:9380->9380/tcp, :::9380->9380/tcp confs-9999_fateflow_1 -8b515f08add3 federatedai/eggroll:1.9.2-release "/tini -- bash -c 'j…" 5 minutes ago Up 5 minutes 8080/tcp, 0.0.0.0:9370->9370/tcp, :::9370->9370/tcp confs-9999_rollsite_1 -108cc061c191 federatedai/eggroll:1.9.2-release "/tini -- bash -c 'j…" 5 minutes ago Up 5 minutes 4670/tcp, 8080/tcp confs-9999_clustermanager_1 -f10575e76899 federatedai/eggroll:1.9.2-release "/tini -- bash -c 'j…" 5 minutes ago Up 5 minutes 4671/tcp, 8080/tcp confs-9999_nodemanager_1 +3c643324066f federatedai/client:1.10.0-release "/bin/sh -c 'flow in…" 5 minutes ago Up 5 minutes 0.0.0.0:20000->20000/tcp, :::20000->20000/tcp confs-9999_client_1 +3fe0af1ebd71 federatedai/fateboard:1.10.0-release "/bin/sh -c 'java -D…" 5 minutes ago Up 5 minutes 0.0.0.0:8080->8080/tcp, :::8080->8080/tcp confs-9999_fateboard_1 +635b7d99357e federatedai/fateflow:1.10.0-release "container-entrypoin…" 5 minutes ago Up 5 minutes (healthy) 0.0.0.0:9360->9360/tcp, :::9360->9360/tcp, 8080/tcp, 0.0.0.0:9380->9380/tcp, :::9380->9380/tcp confs-9999_fateflow_1 +8b515f08add3 federatedai/eggroll:1.10.0-release "/tini -- bash -c 'j…" 5 minutes ago Up 5 minutes 8080/tcp, 0.0.0.0:9370->9370/tcp, :::9370->9370/tcp confs-9999_rollsite_1 +108cc061c191 federatedai/eggroll:1.10.0-release "/tini -- bash -c 'j…" 5 minutes ago Up 5 minutes 4670/tcp, 8080/tcp confs-9999_clustermanager_1 +f10575e76899 federatedai/eggroll:1.10.0-release "/tini -- bash -c 'j…" 5 
minutes ago Up 5 minutes 4671/tcp, 8080/tcp confs-9999_nodemanager_1 aa0a0002de93 mysql:8.0.28 "docker-entrypoint.s…" 5 minutes ago Up 5 minutes 3306/tcp, 33060/tcp confs-9999_mysql_1 ``` @@ -229,21 +229,6 @@ $ flow test toy --guest-party-id 10000 --host-party-id 9999 #验证 docker exec -it confs-10000_client_1 bash ``` -##### 修改examples/upload_host.json - -```bash -cat > fateflow/examples/upload/upload_host.json < fateflow/examples/upload/upload_guest.json < fateflow/examples/lr/test_hetero_lr_job_conf.json < fateflow/examples/lr/test_hetero_lr_job_dsl.json <> ./confs-$party_id/.env # Modify the configuration file @@ -236,7 +237,7 @@ GenerateConfig() { Suffix="" # computing if [ "$computing" == "Spark" ] || [ "$computing" == "Spark_local" ]; then - Suffix=$Suffix"-spark" + Suffix=$Suffix"" fi # algorithm if [ "$algorithm" == "NN" ]; then @@ -248,12 +249,13 @@ GenerateConfig() { fi # federatedai/fateflow-${computing}-${algorithm}-${device}:${version} - sed -i "s#image: \"federatedai/fateflow:\${TAG}\"#image: \"federatedai/fateflow${Suffix}:\${TAG}\"#g" ./confs-$party_id/docker-compose.yml # eggroll or spark-worker if [ "$computing" == "Eggroll" ]; then + sed -i "s#image: \"federatedai/fateflow:\${TAG}\"#image: \"federatedai/fateflow${Suffix}:\${TAG}\"#g" ./confs-$party_id/docker-compose.yml sed -i "s#image: \"federatedai/eggroll:\${TAG}\"#image: \"federatedai/eggroll${Suffix}:\${TAG}\"#g" ./confs-$party_id/docker-compose.yml - elif [ "$computing" == "Spark" ]; then + elif [ "$computing" == "Spark" ] || [ "$computing" == "Spark_local" ]; then + sed -i "s#image: \"federatedai/fateflow:\${TAG}\"#image: \"federatedai/fateflow-spark${Suffix}:\${TAG}\"#g" ./confs-$party_id/docker-compose.yml sed -i "s#image: \"federatedai/spark-worker:\${TAG}\"#image: \"federatedai/spark-worker${Suffix}:\${TAG}\"#g" ./confs-$party_id/docker-compose.yml fi @@ -494,6 +496,7 @@ EOF rm -rf confs-exchange/ mkdir -p confs-exchange/conf/ cp ${WORKINGDIR}/.env confs-exchange/ + cp 
training_template/docker-compose-exchange.yml confs-exchange/docker-compose.yml cp -r training_template/backends/eggroll/conf/* confs-exchange/conf/ diff --git a/docker-deploy/parties.conf b/docker-deploy/parties.conf index 8812093fe..0afbdef40 100644 --- a/docker-deploy/parties.conf +++ b/docker-deploy/parties.conf @@ -39,3 +39,6 @@ fateboard_password=admin # Define serving admin login information serving_admin_username=admin serving_admin_password=admin + +# Define notebook login information +notebook_hashed_password= \ No newline at end of file diff --git a/docker-deploy/training_template/docker-compose-eggroll.yml b/docker-deploy/training_template/docker-compose-eggroll.yml index 49ef770f9..4d6b0fb59 100644 --- a/docker-deploy/training_template/docker-compose-eggroll.yml +++ b/docker-deploy/training_template/docker-compose-eggroll.yml @@ -153,6 +153,7 @@ services: FATE_FLOW_IP: "fateflow" FATE_FLOW_PORT: "9380" FATE_SERVING_HOST: "fate-serving:8059" + NOTEBOOK_HASHED_PASSWORD: "${NOTEBOOK_HASHED_PASSWORD}" volumes: - download_dir:/data/projects/fate/download_dir - shared_dir_examples:/data/projects/fate/examples @@ -161,6 +162,7 @@ services: - fateflow networks: - fate-network + command: ["bash", "-c", "pipeline init --ip $${FATE_FLOW_IP} --port $${FATE_FLOW_PORT} && flow init --ip $${FATE_FLOW_IP} --port $${FATE_FLOW_PORT} && jupyter notebook --ip=0.0.0.0 --port=20000 --allow-root --debug --NotebookApp.notebook_dir='/data/projects/fate/' --no-browser --NotebookApp.token='' --NotebookApp.password=$${NOTEBOOK_HASHED_PASSWORD} "] mysql: image: "mysql:8.0.28" diff --git a/docker-deploy/training_template/docker-compose-spark-slim.yml b/docker-deploy/training_template/docker-compose-spark-slim.yml index 2fadb725d..fae0c679c 100644 --- a/docker-deploy/training_template/docker-compose-spark-slim.yml +++ b/docker-deploy/training_template/docker-compose-spark-slim.yml @@ -169,6 +169,7 @@ services: FATE_FLOW_IP: "fateflow" FATE_FLOW_PORT: "9380" FATE_SERVING_HOST: 
"fate-serving:8059" + NOTEBOOK_HASHED_PASSWORD: "${NOTEBOOK_HASHED_PASSWORD}" volumes: - download_dir:/data/projects/fate/download_dir - shared_dir_examples:/data/projects/fate/examples @@ -176,4 +177,5 @@ services: depends_on: - fateflow networks: - - fate-network \ No newline at end of file + - fate-network + command: ["bash", "-c", "pipeline init --ip $${FATE_FLOW_IP} --port $${FATE_FLOW_PORT} && flow init --ip $${FATE_FLOW_IP} --port $${FATE_FLOW_PORT} && jupyter notebook --ip=0.0.0.0 --port=20000 --allow-root --debug --NotebookApp.notebook_dir='/data/projects/fate/' --no-browser --NotebookApp.token='' --NotebookApp.password=$${NOTEBOOK_HASHED_PASSWORD} "] diff --git a/docker-deploy/training_template/docker-compose-spark.yml b/docker-deploy/training_template/docker-compose-spark.yml index cef47153c..01e20a27a 100644 --- a/docker-deploy/training_template/docker-compose-spark.yml +++ b/docker-deploy/training_template/docker-compose-spark.yml @@ -97,7 +97,7 @@ services: FATE_LOG_LEVEL: "INFO" namenode: - image: federatedai/hadoop-namenode:2.0.0-hadoop3.2.1-java8 + image: "federatedai/hadoop-namenode:2.0.0-hadoop3.2.1-java8" restart: always ports: - 9000:9000 @@ -114,9 +114,10 @@ services: - fate-network datanode-0: - image: federatedai/hadoop-datanode:2.0.0-hadoop3.2.1-java8 + image: "federatedai/hadoop-datanode:2.0.0-hadoop3.2.1-java8" restart: always volumes: + - /etc/localtime:/etc/localtime:ro - ./shared_dir/data/datanode-0:/hadoop/dfs/data environment: SERVICE_PRECONDITION: "namenode:9000" @@ -126,9 +127,10 @@ services: - fate-network datanode-1: - image: federatedai/hadoop-datanode:2.0.0-hadoop3.2.1-java8 + image: "federatedai/hadoop-datanode:2.0.0-hadoop3.2.1-java8" restart: always volumes: + - /etc/localtime:/etc/localtime:ro - ./shared_dir/data/datanode-1:/hadoop/dfs/data environment: SERVICE_PRECONDITION: "namenode:9000" @@ -138,10 +140,9 @@ services: - fate-network datanode-2: - image: federatedai/hadoop-datanode:2.0.0-hadoop3.2.1-java8 + image: 
"federatedai/hadoop-datanode:2.0.0-hadoop3.2.1-java8" restart: always volumes: - - ./shared_dir/data/datanode:/hadoop/dfs/data - /etc/localtime:/etc/localtime:ro - ./shared_dir/data/datanode-2:/hadoop/dfs/data environment: @@ -151,8 +152,9 @@ services: networks: - fate-network + spark-master: - image: federatedai/spark-master:${TAG} + image: "federatedai/spark-master:${TAG}" restart: always ports: - "8888:8080" @@ -165,7 +167,7 @@ services: - fate-network spark-worker: - image: federatedai/spark-worker:${TAG} + image: "federatedai/spark-worker:${TAG}" restart: always depends_on: - spark-master @@ -180,7 +182,7 @@ services: - fate-network rabbitmq: - image: federatedai/rabbitmq:3.8.3-management + image: "federatedai/rabbitmq:3.8.3-management" ports: - "5672:5672" - "15672:15672" @@ -252,6 +254,7 @@ services: FATE_FLOW_IP: "fateflow" FATE_FLOW_PORT: "9380" FATE_SERVING_HOST: "fate-serving:8059" + NOTEBOOK_HASHED_PASSWORD: "${NOTEBOOK_HASHED_PASSWORD}" volumes: - download_dir:/data/projects/fate/download_dir - shared_dir_examples:/data/projects/fate/examples @@ -260,3 +263,4 @@ services: - fateflow networks: - fate-network + command: ["bash", "-c", "pipeline init --ip $${FATE_FLOW_IP} --port $${FATE_FLOW_PORT} && flow init --ip $${FATE_FLOW_IP} --port $${FATE_FLOW_PORT} && jupyter notebook --ip=0.0.0.0 --port=20000 --allow-root --debug --NotebookApp.notebook_dir='/data/projects/fate/' --no-browser --NotebookApp.token='' --NotebookApp.password=$${NOTEBOOK_HASHED_PASSWORD} "] diff --git a/docs/Customize_KubeFATE_Chart.md b/docs/Customize_KubeFATE_Chart.md index 1ddcf8afb..14c6ba1ef 100644 --- a/docs/Customize_KubeFATE_Chart.md +++ b/docs/Customize_KubeFATE_Chart.md @@ -28,7 +28,7 @@ Unzip one KubeFATE's Chart, you can find a `templates` folder and 4 files: ## `templates` folder In `templates` folder, the template yaml file combined with values will generate valid Kubernetes manifest files for each `FATE` or `FATE-Serving` component. -e.g. 
For `FATE` v1.9.2, there are following templates locating in `template` folder: +e.g. For `FATE` v1.10.0, there are following templates locating in `template` folder: 1. eggroll: eggroll module, including 3 eggroll related components: clustermanager, nodemanager and rollsite/lb-rollsite. 2. spark: spark module, including spark, hdfs, nginx, pulsar/rabbitmq. People just need to pick one module from spark and eggroll. 3. client: the module for the jupyter notebook client. diff --git a/docs/Deploy_FedML_Agent_to_Kubernetes.md b/docs/Deploy_FedML_Agent_to_Kubernetes.md new file mode 100644 index 000000000..1e916ffc3 --- /dev/null +++ b/docs/Deploy_FedML_Agent_to_Kubernetes.md @@ -0,0 +1,243 @@ +## Deploy FedML Agent to Kubernetes Clusters via KubeFATE + +### Overview +This document provides guide on how to run FedML client and server agent on Kubernetes using KubeFATE. KubeFATE uses Helm charts to facilitate the deployment process, with which we can get all the benefits provided by Kubernetes natively. And it provides additional values including: + +* Declarative deployment definitions for application installing and upgrading. +* Easy management via the KubeFATE CLI. +* Additional status check making sure the deployment is successful. +* Log aggregation for convenient debugging and monitoring. + +The overall architecture for FedML with KubeFATE is shown in the below diagram + +
+ +
+ +The high-level steps are: + +1. The user install KubeFATE service in the Kubernetes cluster. +2. The user install KubeFATE CLI program in the local machine. +3. The user define the FedML client deployment yaml and use KubeFATE to deploy it. +4. The FedML client will register to the FedML public cloud MLOps platform. +5. Optionally, the user can deploy more client instances and server instances in the same Kubernetes cluster. +6. The user can now use the MLOps platform to orchestrate federated learning trainings. + +### Prerequisites + +* A running Kubernetes cluster and permissions to create KubeFATE services and namespaces required by future deployments. For the permissions KubeFATE needs, check its [rbac yaml](https://github.com/FederatedAI/KubeFATE/blob/master/k8s-deploy/rbac-config.yaml) file. +* Basic knowledge of [FedML](https://github.com/FedML-AI/FedML) and [KubeFATE](https://github.com/FederatedAI/KubeFATE). + +### Deploy KubeFATE Service and CLI +Download the KubeFATE "k8s" deployment package from the KubeFATE [release page](https://github.com/FederatedAI/KubeFATE/releases). Follow the KubeFATE K8s deploy guide ([service guide](https://github.com/FederatedAI/KubeFATE/blob/master/k8s-deploy/README.md), [cli guide](https://github.com/FederatedAI/KubeFATE/blob/master/docs/KubeFATE_CLI_user_guide.md)) to install KubeFATE service. For the service deployment, we can stop after the "Preparing domain name and deploying KubeFATE in Kubernetes" section. For the cli guide, we can stop after the "Verify the KubeFATE CLI works properly" section. + +In general this includes the following steps: + +1. Deploy KubeFATE service by applying the resources defined in the "rbac-config" and "kubefate" yaml files. +2. Make sure an ingress controller is deployed that can expose KubeFATE service. Otherwise we need to change the kubefate service type to NodePort or LoadBalancer to expose it. +3. 
Download and install the KubeFATE cli program and configure the local config yaml file to set the server address to the exposed KubeFATE service address. +4. Use `kubefate version` to verify the cli can work with the service. + +### (Optional) Build and Upload FedML Charts +KubeFATE internally uses Helm chart to deploy FML application. By default, KubeFATE will download the FedML charts automatically from its GitHub repo. Alternatively, we can build the chart packages locally and upload them to KubeFATE service, which is useful in scenarios like air-gapped environment. To support deploying FedML client, use the following command to package and upload FedML client charts to the KubeFATE service: + +```bash +$ helm package /helm-charts/FedML-Client +$ kubefate chart upload -f fedml-client-.tgz +``` + +After uploading the chart, use the `kubefate chart list` command to verify the chart is successfully imported. The output should contain an item looks like + +```bash +UUID NAME VERSION APPVERSION + + fedml-client 0.7.355 release +``` + +If we want to deploy the FedML server too, we need to package the server chart and use `kubefate chart upload -f fedml-server-.tgz` to upload it. 
+ +### Deploy the FedML Edge Client +Prepare a yaml file, for example, "fedml_client.yaml", as below: + +```yaml +name: edge-client-1 +namespace: fedml-edge-client-1 +chartName: fedml-client +chartVersion: 0.7.355 + # registry: "" + # pullPolicy: IfNotPresent + # imagePullSecrets: +# - name: myregistrykey +# ingressClassName: nginx +modules: + - client + + # ingress: + # client: + # annotations: {} + # hosts: + # - host: chart-example.local + # paths: + # - path: / + # pathType: ImplementationSpecific + # tls: + # - secretName: chart-example-tls + # hosts: + # - chart-example.local + +client: + fedmlAccountId: + # fedmlVersion: "release" + # fedmlClientOsName: "Linux" + # replicaCount: 1 + # volume: + # clientHomeDirHostPath: "/home/fedml-client" + # clientHomeDirMountPath: "/home/fedml/fedml-client" + # nameOverride: "" + # fullnameOverride: "" + # serviceAccount: + # create: true + # annotations: {} + # name: "" + # type: ClusterIP + # port: 9988 + # podAnnotations: + # nodeSelector: + # tolerations: + # affinity: + # resources: + # autoscaling: + # enabled: false + # minReplicas: 1 + # maxReplicas: 10 + # targetCPUUtilizationPercentage: 80 + # targetMemoryUtilizationPercentage: 80 + # podSecurityContext: {} + # fsGroup: 2000 + # securityContext: { } + # capabilities: + # drop: + # - ALL + # readOnlyRootFilesystem: true + # runAsNonRoot: true + # runAsUser: 1000 +``` + +The layout of the file is the typically called the "cluster yaml" for KubeFATE. You can change the name and namespace fields according to your needs. And the `fedmlAccountId` id field should be set to your account id in the FedML MLOps platform. For the meanings of the other fields, please refer to the helm chart values file. Typically, we don’t need to change the default settings. 
+ +And, if the Kubernetes namespace doesn't exist, we need to create it beforehand: +```bash +$ kubectl create ns +``` + +Now we can install the above defined FedML client: + +```bash +$ kubefate cluster install -f fedml_client.yaml +``` + +The above command will give a `job_UUID` we can use to check the status: + +```bash +# View deployment status +$ kubefate job describe ${job_UUID} +``` + +When the job status is `Success`, it indicates that the deployment succeeded. Login to your MLOps account and the client should be listed under the "Edge Device" page. + +#### Check Deployed Client Status + +After the deployment, we can use the follow command to get the UUID of the deployed instance: + +```bash +$ kubefate cluster list +``` + +Locate the UUID of the "cluster" and then we can get the detailed status of it by: + +```bash +$ kubefate cluster describe +``` + +And check the logs of this instance: +```bash +$ kubefate cluster logs +``` + +The `logs` subcommand can be useful during the future training process. + +### (Optional) Deploy FedML Server + +We can optionally deploy FedML server instance that can be used in a FedML "Run". As described above, we can optionally package and upload the FedML server chart. 
And then we prepare the "cluster yaml" for server deployment as: + +```yaml +name: edge-server-1 +namespace: fedml-edge-server-1 +chartName: fedml-server +chartVersion: 0.7.355 + # registry: "" + # pullPolicy: IfNotPresent + # imagePullSecrets: +# - name: myregistrykey +# ingressClassName: nginx +modules: + - server + + # ingress: + # server: + # annotations: {} + # hosts: + # - host: chart-example.local + # paths: + # - path: / + # pathType: ImplementationSpecific + # tls: + # - secretName: chart-example-tls + # hosts: + # - chart-example.local + +server: + fedmlAccountId: + # fedmlVersion: "release" + # fedmlServerOsName: "Linux" + # replicaCount: 1 + # volume: + # serverHomeDirHostPath: "/home/fedml-server" + # serverHomeDirMountPath: "/home/fedml/fedml-server" + # nameOverride: "" + # fullnameOverride: "" + # serviceAccount: + # create: true + # annotations: {} + # name: "" + # type: ClusterIP + # port: 9999 + # podAnnotations: + # nodeSelector: + # tolerations: + # affinity: + # resources: + # autoscaling: + # enabled: false + # minReplicas: 1 + # maxReplicas: 10 + # targetCPUUtilizationPercentage: 80 + # targetMemoryUtilizationPercentage: 80 + # podSecurityContext: {} + # fsGroup: 2000 + # securityContext: { } + # capabilities: + # drop: + # - ALL + # readOnlyRootFilesystem: true + # runAsNonRoot: true + # runAsUser: 1000 +``` + +The configurable parameters are almost the same as the client deployment. And we can use the same `kubefate chart install`, `kubefate cluster describe`, `kubefate cluster logs` commands to check its status. + +After it has been successfully deployed, we can choose this server instance when creating new FedML "Run"s. + +### Next Steps + +We can follow same steps to deploy FedML clients and servers to other namespaces in same Kubernetes cluster, or to other Kubernetes clusters (which requires installing KubeFATE in those clusters too). 
Once we and our collaborators have deployed all the instances we need, we can start FedML training in the MLOps platform. This can be done by following the MLOps platform's [official document](https://open.fedml.ai/octopus/userGuides/index). We can now skip the "2. Install FedML Agent: fedml login $account_id" section as now we have used KubeFATE to achieve that. diff --git a/docs/Eggroll_with_TLS.md b/docs/Eggroll_with_TLS.md index 53d34d735..0ff3b817f 100644 --- a/docs/Eggroll_with_TLS.md +++ b/docs/Eggroll_with_TLS.md @@ -146,7 +146,7 @@ Then in the cluster.yaml file of FATE-Exchange, turn on the ```enableTLS``` swit ## Docker-Compose mode -In KubeFATE release v1.9.2, we will not provide a switch for enabling TLS for rollsite. This can be done in below manual steps: +In KubeFATE release v1.10.0, we will not provide a switch for enabling TLS for rollsite. This can be done in below manual steps: 1. Generate the certs, as above documents shows, for every FATE cluster and for the FATE Exchange if needed. 2. Run `docker ps` to get the container id of the rollsite. diff --git a/docs/Manage_FATE_and_FATE-Serving_Version.md b/docs/Manage_FATE_and_FATE-Serving_Version.md index c8ef7599a..084688989 100644 --- a/docs/Manage_FATE_and_FATE-Serving_Version.md +++ b/docs/Manage_FATE_and_FATE-Serving_Version.md @@ -30,18 +30,18 @@ The chart can be downloaded in each KubeFATE release, with name `fate-{release_v Download it and copy it to the folder to upload. ``` -$ kubefate chart upload -f ./fate-v1.9.2.tgz +$ kubefate chart upload -f ./fate-v1.10.0.tgz Upload file success $ kubefate chart ls UUID NAME VERSION APPVERSION -ca3f7843-749a-4f69-9f6b-4c544a7623ac fate v1.9.2 v1.9.2 +ca3f7843-749a-4f69-9f6b-4c544a7623ac fate v1.10.0 v1.10.0 ``` -Then, we can deploy the fate cluster of v1.9.2 version. The detail of cluster.yaml please refer to: [FATE Cluster Configuration](./configurations/FATE_cluster_configuration.md) +Then, we can deploy the fate cluster of v1.10.0 version. 
The detail of cluster.yaml please refer to: [FATE Cluster Configuration](./configurations/FATE_cluster_configuration.md) ``` chartName: fate -chartVersion: v1.9.2 +chartVersion: v1.10.0 ``` We can delete the chart with: diff --git a/docs/configurations/FATE_cluster_configuration.md b/docs/configurations/FATE_cluster_configuration.md index 74aab32d2..34d83c5c2 100644 --- a/docs/configurations/FATE_cluster_configuration.md +++ b/docs/configurations/FATE_cluster_configuration.md @@ -2,48 +2,48 @@ `cluster.yaml` declares information about the FATE cluster to be deployed, which KubeFATE CLI uses to deploy the FATE cluster. ## cluster.yaml -| Name | Type | Description | -|---------------------------|--------------------|--------------------------------------------------------------------------------------------------------| -| * name | scalars | FATE cluster name. | -| * namespace | scalars | Kubernetes namespace for FATE cluster. | -| * chartName | scalars | FATE chart name. (fate/fate-serving) | -| * chartVersion | scalars | FATE chart corresponding version. | -| * partyId | scalars | FATE cluster party id. | -| registry | scalars | Other fate images sources. | -| pullPolicy | scalars | kubernetes images pull policy | -| imagePullSecrets | sequences | The imagePullSecrets names for all deployments | -| * persistence | bool | mysql and nodemanager data persistence. | -| istio.enable | bool | enable istio | -| podSecurityPolicy.enabled | bool | if `true`, create & use Pod Security Policy resources | -| ingressClassName | scalars | The Ingress class name, such as "nginx". | -| * modules | sequences | Modules to be deployed in the FATE cluster. 
| -| computing | set(Eggroll, Spark, Spark_local) | Configure cluster computing engine( eggroll, spark or spark_local) | -| federation | set(Eggroll,Pulsar,RabbitMQ) | Configure cluster federation engine( eggroll,pulsar or rabbitmq) | -| storage | set(Eggroll,HDFS,LocalFS) | Configure cluster storage engine( eggroll, hdfs or spark) | -| algorithm | set(Basic, NN) | Configure cluster algorithm ( basic or NeuralNetwork) | -| device | set(IPCL, CPU) | Configure cluster device( ipcl or cpu) | -| ingress | mappings | Custom domain of FATE UI component | -| rollsite | mappings | Configuration of FATE cluster `rollsite` module. | -| nodemanager | mappings | Configuration of FATE cluster `nodemanager` module. | -| python | mappings | Configuration of FATE cluster `python` module. | -| fateboard | mappings | Configuration of FATE cluster `fateboard` module. | -| client | mappings | Configuration of FATE cluster `client` module. | -| mysql | mappings | Configuration of FATE cluster `mysql` module.
If you use your own redis, please delete this item. | -| externalMysqlIp | scalars | Access your own MySQL. | -| externalMysqlPort | scalars | Access your own MySQL. | -| externalMysqlDatabase | scalars | Access your own MySQL. | -| externalMysqlUser | scalars | Access your own MySQL. | -| externalMysqlPassword | scalars | Access your own MySQL. | -| servingIp | scalars | Serving cluster connected to fate. | -| servingPort | scalars | Serving cluster connected to fate. | -| spark | mappings | Configuration of FATE cluster `spark` module. | -| hdfs | mappings | Configuration of FATE cluster `hdfs` module. | -| nginx | mappings | Configuration of FATE cluster `nginx` module. | -| rabbitmq | mappings | Configuration of FATE cluster `rabbitmq` module. | -| pulsar | mappings | Configuration of FATE cluster `pulsar` module. | -| skippedKeys | sequences | you can customize some keys which will be ignored in yaml validation | - -***Computing federation storage algorithm device configuration introduction reference [Introduction to Engine Architecture](../Introduction_to_Engine_Architecture.md) 和 [FATE Algorithm and Computational Acceleration Selection](../FATE_Algorithm_and_Computational_Acceleration_Selection.md)*** +| Name | Type | Description | +| ------------------------- | -------------------------------- | ------------------------------------------------------------------------------------------------------ | +| * name | scalars | FATE cluster name. | +| * namespace | scalars | Kubernetes namespace for FATE cluster. | +| * chartName | scalars | FATE chart name. (fate/fate-serving) | +| * chartVersion | scalars | FATE chart corresponding version. | +| * partyId | scalars | FATE cluster party id. | +| registry | scalars | Other fate images sources. | +| pullPolicy | scalars | kubernetes images pull policy | +| imagePullSecrets | sequences | The imagePullSecrets names for all deployments | +| * persistence | bool | mysql and nodemanager data persistence. 
| +| istio.enable | bool | enable istio | +| podSecurityPolicy.enabled | bool | if `true`, create & use Pod Security Policy resources | +| ingressClassName | scalars | The Ingress class name, such as "nginx". | +| * modules | sequences | Modules to be deployed in the FATE cluster. | +| computing | set(Eggroll, Spark, Spark_local) | Configure cluster computing engine( eggroll, spark or spark_local) | +| federation | set(Eggroll,Pulsar,RabbitMQ) | Configure cluster federation engine( eggroll,pulsar or rabbitmq) | +| storage | set(Eggroll,HDFS,LocalFS,Hive) | Configure cluster storage engine( eggroll, hdfs, spark or hive) | +| algorithm | set(Basic, NN) | Configure cluster algorithm ( basic or NeuralNetwork) | +| device | set(IPCL, CPU) | Configure cluster device( ipcl or cpu) | +| ingress | mappings | Custom domain of FATE UI component | +| rollsite | mappings | Configuration of FATE cluster `rollsite` module. | +| nodemanager | mappings | Configuration of FATE cluster `nodemanager` module. | +| python | mappings | Configuration of FATE cluster `python` module. | +| fateboard | mappings | Configuration of FATE cluster `fateboard` module. | +| client | mappings | Configuration of FATE cluster `client` module. | +| mysql | mappings | Configuration of FATE cluster `mysql` module.
If you use your own redis, please delete this item. | +| externalMysqlIp | scalars | Access your own MySQL. | +| externalMysqlPort | scalars | Access your own MySQL. | +| externalMysqlDatabase | scalars | Access your own MySQL. | +| externalMysqlUser | scalars | Access your own MySQL. | +| externalMysqlPassword | scalars | Access your own MySQL. | +| servingIp | scalars | Serving cluster connected to fate. | +| servingPort | scalars | Serving cluster connected to fate. | +| spark | mappings | Configuration of FATE cluster `spark` module. | +| hdfs | mappings | Configuration of FATE cluster `hdfs` module. | +| nginx | mappings | Configuration of FATE cluster `nginx` module. | +| rabbitmq | mappings | Configuration of FATE cluster `rabbitmq` module. | +| pulsar | mappings | Configuration of FATE cluster `pulsar` module. | +| skippedKeys | sequences | you can customize some keys which will be ignored in yaml validation | + +***Computing federation storage algorithm device configuration introduction reference [Introduction to Engine Architecture](../Introduction_to_Engine_Architecture.md) and [FATE Algorithm and Computational Acceleration Selection](../FATE_Algorithm_and_Computational_Acceleration_Selection.md)*** ### list of modules @@ -66,7 +66,7 @@ ### ingress mappings | Name | Type | Description | -|-------------------------|-------------------|------------------------------------------------------------------| +| ----------------------- | ----------------- | ---------------------------------------------------------------- | | `fateboard` | mappings | Configuration of Fateboard UI domain | | `fateboard.annotations` | mappings | The annotations used commonly for ingresses | | `fateboard.hosts` | sequences | Set hosts list of ingress record | @@ -77,7 +77,7 @@ | `client.tls` | sequences | Set this to enable TLS on the ingress record | | `spark` | mappings | Configuration of spark UI domain | | `rabbitmq` | mappings | Configuration of Rabbitmq UI domain | -| `pulsar` 
| mappings | Configuration of Pulsar UI domain | +| `pulsar` | mappings | Configuration of Pulsar UI domain | @@ -85,7 +85,7 @@ It is used to declare the `rollsite ` module in the FATE cluster to be deployed. | Name | subitem | Type | Description | -|--------------|-------------|-----------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| ------------ | ----------- | --------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | type | | scalars | Kubernetes ServiceTypes, default is NodePort. | | nodePort | | scalars | The port used by `proxy` module's kubernetes service, default range: 30000-32767. | | partyList | | sequences | If this FATE cluster is exchange cluster, partyList is all party's sequences of all parties proxy address. If this FATE cluster is one of participants, delete this configuration item. | @@ -117,7 +117,7 @@ The parties are directly connected. ### nodemanager mappings | Name | SubItem | Type | Description | -|----------------------------|----------------------------|-----------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------| +| -------------------------- | -------------------------- | --------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------- | | count | | scalars | Number of nodes deployed nodemanager. | | session-Processors-PerNode | | scalars | Configuration of FATE cluster `nodemanager` module. | | list | | sequences | List of nodemanager nodes. | @@ -133,7 +133,7 @@ The parties are directly connected. 
### python mappings | Name | Type | Description | -|-----------------------------|----------|----------------------------------------------------------------------------------------------| +| --------------------------- | -------- | -------------------------------------------------------------------------------------------- | | type | scalars | Kubernetes ServiceTypes, default is NodePort.
Other modules can connect to the fateflow | | nodePort | scalars | The port used by `proxy` module's kubernetes service, default range: 30000-32767. | | nodeSelector | mappings | kubernetes nodeSelector. | @@ -152,13 +152,15 @@ The parties are directly connected. | rabbitmq | mappings | If you use the existing rabbitmq, you can set this configuration | | nginx | mappings | If you use the existing nginx, you can set this configuration | | logLevel | scalars | The log level of the Python process, default level is Info | +| hive | mappings | If you use the existing hive, you can set this configuration | +| dependent_distribution | mappings | Distribute dependencies with spark | ### fateboard mappings Configuration of kubernetes deployment fateboard. | Name | Type | Description | -|----------|----------|-----------------------------| +| -------- | -------- | --------------------------- | | type | mappings | Kubernetes nodeSelector. | | username | scalars | Login username of fateboard | | password | scalars | Login password of fateboard | @@ -167,21 +169,22 @@ Configuration of kubernetes deployment fateboard. Configuration of kubernetes deployment client. -| Name | Type | Description | -|---------------|----------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------| -| nodeSelector | mappings | kubernetes nodeSelector. | -| subPath | scalars | Path of data persistence, specify the "subPath" if the PVC is shared with other components. | -| existingClaim | scalars | Use the existing PVC which must be created manually before bound. | -| storageClass | scalars | Specify the "storageClass" used to provision the volume. Or the default. StorageClass will be used(the default). Set it to "-" to disable dynamic provisioning. | -| accessMode | scalars | Kubernetes Persistent Volume Access Modes:
ReadWriteOnce
ReadOnlyMany
ReadWriteMany. | -| size | scalars | Match the volume size of PVC. | +| Name | Type | Description | +| ------------------------ | -------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| nodeSelector | mappings | kubernetes nodeSelector. | +| subPath | scalars | Path of data persistence, specify the "subPath" if the PVC is shared with other components. | +| existingClaim | scalars | Use the existing PVC which must be created manually before bound. | +| storageClass | scalars | Specify the "storageClass" used to provision the volume. Or the default. StorageClass will be used(the default). Set it to "-" to disable dynamic provisioning. | +| accessMode | scalars | Kubernetes Persistent Volume Access Modes:
ReadWriteOnce
ReadOnlyMany
ReadWriteMany. | +| size | scalars | Match the volume size of PVC. | +| notebook_hashed_password | scalars | hashed password for jupyter notebook. | ### Mysql mappings Configuration of kubernetes deployment mysql. | Name | Type | Description | -|---------------|----------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------| +| ------------- | -------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------- | | nodeSelector | mappings | kubernetes nodeSelector. | | ip | scalars | Allow other modules to connect to MySQL. | | port | scalars | Mysql port. | @@ -201,7 +204,7 @@ Configuration of kubernetes deployment mysql. Configuration of kubernetes deployment spark. | Name | SubItem | Type | Description | -|-------------------|--------------|----------|------------------------------| +| ----------------- | ------------ | -------- | ---------------------------- | | master/
worker | Image | scalars | Image of spark components | | | ImageTag | scalars | ImageTag of spark components | | | replicas | scalars | Number of copies of pod | @@ -213,18 +216,18 @@ Configuration of kubernetes deployment spark. Configuration of kubernetes deployment hdfs. -| Name | SubItem | Type | Description | -|------------------------|--------------|----------|--------------------------------------------------| -| namenode/
datanode | nodeSelector | mappings | kubernetes nodeSelector. | -| | type | scalars | Kubernetes ServiceTypes, default is `ClusterIp`. | -| datanode | replicas | scalars | The replicas of the HDFS datanode pods | +| Name | SubItem | Type | Description | +| --------------------- | ------------ | -------- | ------------------------------------------------ | +| namenode/
datanode | nodeSelector | mappings | kubernetes nodeSelector. | +| | type | scalars | Kubernetes ServiceTypes, default is `ClusterIp`. | +| datanode | replicas | scalars | The replicas of the HDFS datanode pods | ### nginx mappings Configuration of kubernetes deployment hdfs. | Name | Type | Description | -|--------------|----------|------------------------------| +| ------------ | -------- | ---------------------------- | | nodeSelector | mappings | kubernetes nodeSelector. | | type | scalars | Kubernetes ServiceTypes. | | nodePort | scalars | Kubernetes Service NodePort. | @@ -263,7 +266,7 @@ Configuration of kubernetes deployment hdfs. Configuration of kubernetes deployment rabbitmq . | Name | Type | Description | -|--------------|----------|--------------------------------------------------| +| ------------ | -------- | ------------------------------------------------ | | nodeSelector | mappings | kubernetes nodeSelector. | | type | scalars | Kubernetes ServiceTypes, default is `ClusterIp`. | | nodePort | scalars | Kubernetes Service NodePort. | @@ -292,27 +295,27 @@ Configuration of kubernetes deployment rabbitmq . Configuration of kubernetes deployment pulsar . -| Name | Type | Description | -| ---------------- | --------- | ------------------------------------------------------------ | -| nodeSelector | mappings | kubernetes nodeSelector. | -| type | scalars | Kubernetes ServiceTypes, default is `ClusterIp`. | -| nodePort | scalars | Kubernetes Service NodePort. | -| skippedKeys | sequences | you can customize some keys which will be ignored in yaml validation | -| tolerations | | Kubernetes tolerations | -| affinity | | Kubernetes affinity | -| env | mappings | env of pulsar. | -| confs | mappings | configuration of pulsar. 
| -| httpNodePort | scalars | Pulsar HttpNodePort | -| httpsNodePort | scalars | Pulsar HttpsNodePort | -| loadBalancerIP | scalars | Ip of loadBalancer | +| Name | Type | Description | +| ---------------- | --------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| nodeSelector | mappings | kubernetes nodeSelector. | +| type | scalars | Kubernetes ServiceTypes, default is `ClusterIp`. | +| nodePort | scalars | Kubernetes Service NodePort. | +| skippedKeys | sequences | you can customize some keys which will be ignored in yaml validation | +| tolerations | | Kubernetes tolerations | +| affinity | | Kubernetes affinity | +| env | mappings | env of pulsar. | +| confs | mappings | configuration of pulsar. | +| httpNodePort | scalars | Pulsar HttpNodePort | +| httpsNodePort | scalars | Pulsar HttpsNodePort | +| loadBalancerIP | scalars | Ip of loadBalancer | | storageClass | scalars | Specify the "storageClass" used to provision the volume. Or the default. StorageClass will be used(the default). Set it to "-" to disable dynamic provisioning. | -| existingClaim | scalars | Kubernetes existingClaim | -| accessMode | scalars | Kubernetes Persistent Volume Access Modes:
ReadWriteOnce
ReadOnlyMany
ReadWriteMany. | -| size | scalars | Match the volume size of PVC. | -| publicLB.enabled | bool | if `true`, enable publicLB | -| exchange | mappings | FATE cluster `exchange` module's ip and port. | -| resources | mappings | resources of Pod | -| route_table | mappings | route table of pulsar. | +| existingClaim | scalars | Kubernetes existingClaim | +| accessMode | scalars | Kubernetes Persistent Volume Access Modes:
ReadWriteOnce
ReadOnlyMany
ReadWriteMany. | +| size | scalars | Match the volume size of PVC. | +| publicLB.enabled | bool | if `true`, enable publicLB | +| exchange | mappings | FATE cluster `exchange` module's ip and port. | +| resources | mappings | resources of Pod | +| route_table | mappings | route table of pulsar. | *example of route_table*: @@ -335,3 +338,16 @@ Configuration of kubernetes deployment pulsar . - backlogQuotaDefaultLimitGB - brokerDeleteInactiveTopicsFrequencySeconds + +### hive mappings + +Configuration of existing hive. +Please note: you have to run hive cluster by your self, then set below configurations used for storage engine. + +| Name | Type | Description | +| -------------- | ------- | ---------------------- | +| host | scalars | configuration of hive. | +| port | scalars | configuration of hive. | +| auth_mechanism | scalars | configuration of hive. | +| user | scalars | configuration of hive. | +| password | scalars | configuration of hive. | \ No newline at end of file diff --git a/docs/images/fedml_with_kubefate.png b/docs/images/fedml_with_kubefate.png new file mode 100644 index 000000000..84d821df7 Binary files /dev/null and b/docs/images/fedml_with_kubefate.png differ diff --git a/docs/tutorials/Build_Two_Parties_FATE_Cluster_in_One_Linux_Machine_with_MiniKube.md b/docs/tutorials/Build_Two_Parties_FATE_Cluster_in_One_Linux_Machine_with_MiniKube.md index a927718d6..10b8e6b10 100644 --- a/docs/tutorials/Build_Two_Parties_FATE_Cluster_in_One_Linux_Machine_with_MiniKube.md +++ b/docs/tutorials/Build_Two_Parties_FATE_Cluster_in_One_Linux_Machine_with_MiniKube.md @@ -21,14 +21,14 @@ After the tutorial, the deployment architecture looks like the following diagram 5. Network connectivity to dockerhub or 163 Docker Image Registry, and google gcr. 6. Setup the global KubeFATE version using in the tutorial and create a folder for the whole tutorial. 
``` -export fate_version=v1.9.2 && export kubefate_version=v1.4.5 && cd ~ && mkdir demo && cd demo +export fate_version=v1.10.0 && export kubefate_version=v1.4.5 && cd ~ && mkdir demo && cd demo ``` Notes: * When talking about KubeFATE version, usually there are 3 notions: * The KubeFATE CLI version, in this tutorial, it is v1.4.5. * The KubeFATE service version, in this tutorial, it is v1.4.5. - * The FATE version, in this tutorial, it is v1.9.2, it also means the version of the helm chart of FATE, currently we use this version to tag the KubeFATE GitHub master branch. + * The FATE version, in this tutorial, it is v1.10.0, it also means the version of the helm chart of FATE, currently we use this version to tag the KubeFATE GitHub master branch. * **In this tutorial, the IP of the machine we used is 192.168.100.123. Please change it to your machine's IP in all the following commands and config files.** # Start Tutorial @@ -87,7 +87,7 @@ When all the pods are in the ready state, it means your Kubernetes cluster is re ## Setup Kubefate ### Install KubeFATE CLI Go to [KubeFATE Release](https://github.com/FederatedAI/KubeFATE/releases), and find the latest kubefate-k8s release -pack, which is `v1.9.2` as set to ENVs before. (replace ${fate_version} with the newest version available) +pack, which is `v1.10.0` as set to ENVs before. 
(replace ${fate_version} with the newest version available) ``` curl -LO https://github.com/FederatedAI/KubeFATE/releases/download/${fate_version}/kubefate-k8s-${fate_version}.tar.gz && tar -xzf ./kubefate-k8s-${fate_version}.tar.gz ``` @@ -256,7 +256,7 @@ For `/kubefate/examples/party-9999/cluster-spark-pulsar.yaml`, modify it as foll name: fate-9999 namespace: fate-9999 chartName: fate -chartVersion: v1.9.2 +chartVersion: v1.10.0 partyId: 9999 registry: "" pullPolicy: @@ -340,7 +340,7 @@ and for fate-10000: name: fate-10000 namespace: fate-10000 chartName: fate -chartVersion: v1.9.2 +chartVersion: v1.10.0 partyId: 10000 registry: "" pullPolicy: @@ -440,8 +440,8 @@ or watch the clusters till their STATUS changing to `Running`: ``` kubefate@machine:~/kubefate$ watch kubefate cluster ls UUID NAME NAMESPACE REVISION STATUS CHART ChartVERSION AGE -29878fa9-aeee-4ae5-a5b7-fd4e9eb7c1c3 fate-9999 fate-9999 1 Running fate v1.9.2 88s -dacc0549-b9fc-463f-837a-4e7316db2537 fate-10000 fate-10000 1 Running fate v1.9.2 69s +29878fa9-aeee-4ae5-a5b7-fd4e9eb7c1c3 fate-9999 fate-9999 1 Running fate v1.10.0 88s +dacc0549-b9fc-463f-837a-4e7316db2537 fate-10000 fate-10000 1 Running fate v1.10.0 69s ``` We have about 10G Docker images that need to be pulled, this step will take a while for the first time. An alternative way is offline loading the images to the local environment. 
@@ -479,13 +479,13 @@ UUID 29878fa9-aeee-4ae5-a5b7-fd4e9eb7c1c3 Name fate-9999 NameSpace fate-9999 ChartName fate -ChartVersion v1.9.2 +ChartVersion v1.10.0 Revision 1 Age 54m Status Running Spec algorithm: Basic chartName: fate - chartVersion: v1.9.2 + chartVersion: v1.10.0 computing: Spark device: CPU federation: Pulsar diff --git a/docs/tutorials/Build_Two_Parties_FATE_Cluster_in_One_Linux_Machine_with_MiniKube_zh.md b/docs/tutorials/Build_Two_Parties_FATE_Cluster_in_One_Linux_Machine_with_MiniKube_zh.md index 523fe01a6..de43ed832 100644 --- a/docs/tutorials/Build_Two_Parties_FATE_Cluster_in_One_Linux_Machine_with_MiniKube_zh.md +++ b/docs/tutorials/Build_Two_Parties_FATE_Cluster_in_One_Linux_Machine_with_MiniKube_zh.md @@ -17,14 +17,14 @@ 5. 要保证安装机器可以正常访问Docker Hub或者网易云镜像仓库,以及Google gcr; 6. 预先创建一个目录,以便整个过程使用该目录作为工作目录,命令如下: ``` -export fate_version=v1.9.2 && export kubefate_version=v1.4.5 && cd ~ && mkdir demo && cd demo +export fate_version=v1.10.0 && export kubefate_version=v1.4.5 && cd ~ && mkdir demo && cd demo ``` Notes: * 当我们提到"KubeFATE的版本",通常来讲会有三个概念: * KubeFATE命令行工具的版本,在本教程中为v1.4.5。 * KubeFATE服务版本,在本教程中为v1.4.5。 - * FATE版本,在本教程中v1.9.2,它也意味着FATE的Helm Chart的版本, 值得注意的是我们用这个版本来给GitHub上的KubeFATE的发布打tag。 + * FATE版本,在本教程中v1.10.0,它也意味着FATE的Helm Chart的版本, 值得注意的是我们用这个版本来给GitHub上的KubeFATE的发布打tag。 * **下文介绍的MiniKube机器IP地址是192.168.100.123。请修改为你准备的实验机器IP地址** # 开始安装 @@ -77,7 +77,7 @@ sudo minikube addons enable ingress ## 安装Kubefate ### 下载KubeFATE命令行工具 -我们从Github上 [KubeFATE Release](https://github.com/FederatedAI/KubeFATE/releases)页面找到Kuberetes部署的下载包,并下载对应版本,如前面环境变量设置`v1.9.2`, +我们从Github上 [KubeFATE Release](https://github.com/FederatedAI/KubeFATE/releases)页面找到Kuberetes部署的下载包,并下载对应版本,如前面环境变量设置`v1.10.0`, ``` curl -LO https://github.com/FederatedAI/KubeFATE/releases/download/${fate_version}/kubefate-k8s-${fate_version}.tar.gz && tar -xzf ./kubefate-k8s-${fate_version}.tar.gz ``` @@ -237,7 +237,7 @@ kubectl -n fate-10000 create secret docker-registry myregistrykey \ name: 
fate-9999 namespace: fate-9999 chartName: fate -chartVersion: v1.9.2 +chartVersion: v1.10.0 partyId: 9999 registry: "" pullPolicy: @@ -322,7 +322,7 @@ pulsar: name: fate-10000 namespace: fate-10000 chartName: fate -chartVersion: v1.9.2 +chartVersion: v1.10.0 partyId: 10000 registry: "" pullPolicy: @@ -418,8 +418,8 @@ create job success, job id=7752db70-e368-41fa-8827-d39411728d1b ``` kubefate@machine:~/kubefate$ watch kubefate cluster ls UUID NAME NAMESPACE REVISION STATUS CHART ChartVERSION AGE -29878fa9-aeee-4ae5-a5b7-fd4e9eb7c1c3 fate-9999 fate-9999 1 Running fate v1.9.2 88s -dacc0549-b9fc-463f-837a-4e7316db2537 fate-10000 fate-10000 1 Running fate v1.9.2 69s +29878fa9-aeee-4ae5-a5b7-fd4e9eb7c1c3 fate-9999 fate-9999 1 Running fate v1.10.0 88s +dacc0549-b9fc-463f-837a-4e7316db2537 fate-10000 fate-10000 1 Running fate v1.10.0 69s ``` 因为这个步骤需要到网易云镜像仓库去下载约10G的镜像,所以第一次执行视乎你的网络情况需要一定时间。 检查下载的进度可以用 @@ -446,13 +446,13 @@ UUID 29878fa9-aeee-4ae5-a5b7-fd4e9eb7c1c3 Name fate-9999 NameSpace fate-9999 ChartName fate -ChartVersion v1.9.2 +ChartVersion v1.10.0 Revision 1 Age 54m Status Running Spec algorithm: Basic chartName: fate - chartVersion: v1.9.2 + chartVersion: v1.10.0 computing: Spark device: CPU federation: Pulsar diff --git a/helm-charts/FATE-Exchange/Chart.yaml b/helm-charts/FATE-Exchange/Chart.yaml index 24c642d73..253a79e81 100644 --- a/helm-charts/FATE-Exchange/Chart.yaml +++ b/helm-charts/FATE-Exchange/Chart.yaml @@ -1,5 +1,5 @@ apiVersion: v1 -appVersion: v1.9.2 +appVersion: v1.10.0 description: A Helm chart for fate exchange name: fate-exchange -version: v1.9.2 +version: v1.10.0 diff --git a/helm-charts/FATE-Exchange/values-template-example.yaml b/helm-charts/FATE-Exchange/values-template-example.yaml index 2fc0fef16..c711d9ac4 100644 --- a/helm-charts/FATE-Exchange/values-template-example.yaml +++ b/helm-charts/FATE-Exchange/values-template-example.yaml @@ -1,9 +1,10 @@ name: fate-exchange namespace: fate-exchange chartName: fate-exchange -chartVersion: 
v1.9.2 +chartVersion: v1.10.0 partyId: 1 registry: "" +pullPolicy: imagePullSecrets: - name: myregistrykey persistence: false diff --git a/helm-charts/FATE-Exchange/values.yaml b/helm-charts/FATE-Exchange/values.yaml index 8d81142f5..1c61523d8 100644 --- a/helm-charts/FATE-Exchange/values.yaml +++ b/helm-charts/FATE-Exchange/values.yaml @@ -4,7 +4,7 @@ partyName: fate-exchange image: registry: federatedai isThridParty: - tag: 1.9.2-release + tag: 1.10.0-release pullPolicy: IfNotPresent imagePullSecrets: # - name: diff --git a/helm-charts/FATE/Chart.yaml b/helm-charts/FATE/Chart.yaml index 5cb4682d2..85a11ebf7 100644 --- a/helm-charts/FATE/Chart.yaml +++ b/helm-charts/FATE/Chart.yaml @@ -1,8 +1,8 @@ apiVersion: v1 -appVersion: v1.9.2 +appVersion: v1.10.0 description: A Helm chart for fate-training name: fate -version: v1.9.2 +version: v1.10.0 home: https://fate.fedai.org icon: https://aisp-1251170195.cos.ap-hongkong.myqcloud.com/wp-content/uploads/sites/12/2019/09/logo.png sources: diff --git a/helm-charts/FATE/templates/core/client/statefulSet.yaml b/helm-charts/FATE/templates/core/client/statefulSet.yaml index bb0ef7dbf..83f218db0 100644 --- a/helm-charts/FATE/templates/core/client/statefulSet.yaml +++ b/helm-charts/FATE/templates/core/client/statefulSet.yaml @@ -43,8 +43,11 @@ spec: value: "9380" - name: FATE_SERVING_HOST value: "{{.Values.modules.serving.ip}}:{{.Values.modules.serving.port}}" + - name: NOTEBOOK_HASHED_PASSWORD + value: {{ .Values.modules.client.notebook_hashed_password }} ports: - containerPort: 20000 + command: ["bash", "-c", "pipeline init --ip ${FATE_FLOW_IP} --port ${FATE_FLOW_PORT} && flow init --ip ${FATE_FLOW_IP} --port ${FATE_FLOW_PORT} && jupyter notebook --ip=0.0.0.0 --port=20000 --allow-root --debug --NotebookApp.notebook_dir='/data/projects/fate/' --no-browser --NotebookApp.token='' --NotebookApp.password=${NOTEBOOK_HASHED_PASSWORD}"] livenessProbe: httpGet: path: / @@ -122,4 +125,4 @@ spec: requests: storage: {{ 
.Values.modules.client.size }} {{- end }} -{{- end }} \ No newline at end of file +{{- end }} diff --git a/helm-charts/FATE/templates/core/fateboard.yaml b/helm-charts/FATE/templates/core/fateboard.yaml new file mode 100644 index 000000000..7c16c9e65 --- /dev/null +++ b/helm-charts/FATE/templates/core/fateboard.yaml @@ -0,0 +1,106 @@ +# Copyright 2019-2022 VMware, Inc. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +{{- if .Values.modules.fateboard.include }} +apiVersion: apps/v1 +kind: Deployment +metadata: + name: fateboard + labels: + fateMoudle: fateboard +{{ include "fate.labels" . | indent 4 }} +spec: + replicas: 1 + strategy: + type: Recreate + selector: + matchLabels: + fateMoudle: fateboard +{{ include "fate.matchLabels" . | indent 6 }} + template: + metadata: + annotations: + {{- if .Values.istio.enabled }} + sidecar.istio.io/rewriteAppHTTPProbers: "false" + {{- end }} + labels: + fateMoudle: fateboard +{{ include "fate.labels" . 
| indent 8 }} + spec: + containers: + {{- if .Values.modules.fateboard.include }} + - image: {{ .Values.image.registry }}/fateboard:{{ .Values.image.tag }} + imagePullPolicy: {{ .Values.image.pullPolicy }} + name: fateboard + ports: + - containerPort: 8080 + livenessProbe: + httpGet: + path: / + port: 8080 + httpHeaders: + - name: X-Custom-Header + value: livenessProbe + initialDelaySeconds: 1 + periodSeconds: 10 + timeoutSeconds: 3 + successThreshold: 1 + failureThreshold: 3 + readinessProbe: + httpGet: + path: / + port: 8080 + httpHeaders: + - name: X-Custom-Header + value: readinessProbe + initialDelaySeconds: 1 + periodSeconds: 10 + timeoutSeconds: 3 + successThreshold: 1 + failureThreshold: 3 + startupProbe: + httpGet: + path: / + port: 8080 + httpHeaders: + - name: X-Custom-Header + value: startupProbe + failureThreshold: 12 + periodSeconds: 10 + volumeMounts: + - mountPath: /data/projects/fate/fateboard/conf/application.properties + name: fateboard-confs + subPath: application.properties + {{- end }} + {{- with .Values.modules.fateboard.nodeSelector }} + nodeSelector: +{{ toYaml . | indent 8 }} + {{- end }} + {{- with .Values.modules.fateboard.tolerations }} + tolerations: +{{ toYaml . | indent 8 }} + {{- end }} + {{- with .Values.modules.fateboard.affinity }} + affinity: +{{ toYaml . | indent 8 }} + {{- end }} + {{- with .Values.image.imagePullSecrets }} + imagePullSecrets: +{{ toYaml . 
| indent 6 }} + {{- end }} + restartPolicy: Always + volumes: + {{- if .Values.modules.fateboard.include }} + - name: fateboard-confs + configMap: + name: fateboard-config + {{- end }} +{{- end }} diff --git a/helm-charts/FATE/templates/core/fateboard/configmap.yaml b/helm-charts/FATE/templates/core/fateboard/configmap.yaml index a7bd717f8..51098d141 100644 --- a/helm-charts/FATE/templates/core/fateboard/configmap.yaml +++ b/helm-charts/FATE/templates/core/fateboard/configmap.yaml @@ -24,6 +24,11 @@ data: #priority is higher than {fateflow.url}, split by ; #below config can support configuring more than one fate flow for this fate board fateflow.url-list= + {{- $replicaCount := .Values.modules.python.replicas | int -}} + {{- range $index0 := until $replicaCount }} + {{- $index1 := $index0 | add1 -}} + http://python-{{ $index0 }}.fateflow:9380{{ if ne $index1 $replicaCount }};{{ end }} + {{- end }} fateflow.http_app_key= fateflow.http_secret_key= server.servlet.encoding.charset=UTF-8 diff --git a/helm-charts/FATE/templates/core/fateboard/service.yaml b/helm-charts/FATE/templates/core/fateboard/service.yaml index 5059f5bc0..0920ff8d0 100644 --- a/helm-charts/FATE/templates/core/fateboard/service.yaml +++ b/helm-charts/FATE/templates/core/fateboard/service.yaml @@ -15,7 +15,7 @@ kind: Service metadata: name: fateboard labels: - fateMoudle: python + fateMoudle: fateboard {{ include "fate.labels" . | indent 4 }} spec: ports: @@ -25,6 +25,6 @@ spec: protocol: TCP type: {{ .Values.modules.fateboard.type }} selector: - fateMoudle: python + fateMoudle: fateboard {{ include "fate.matchLabels" . 
| indent 4 }} {{- end }} \ No newline at end of file diff --git a/helm-charts/FATE/templates/core/fateflow/configmap.yaml b/helm-charts/FATE/templates/core/fateflow/configmap.yaml index 298218dc4..3488c7a23 100644 --- a/helm-charts/FATE/templates/core/fateflow/configmap.yaml +++ b/helm-charts/FATE/templates/core/fateflow/configmap.yaml @@ -44,7 +44,7 @@ data: service_conf.yaml: | use_registry: {{ .Values.modules.serving.useRegistry | default false }} use_deserialize_safe_module: false - dependent_distribution: false + dependent_distribution: {{ .Values.modules.python.dependent_distribution | default false }} encrypt_password: false encrypt_module: fate_arch.common.encrypt_utils#pwdecrypt private_key: @@ -67,9 +67,19 @@ data: dataset: false fateflow: # you must set real ip address, 127.0.0.1 and 0.0.0.0 is not supported - host: fateflow + host: fateflow_ip http_port: 9380 grpc_port: 9360 + # when you have multiple fateflow server on one party, + # we suggest using nginx for load balancing. + nginx: + # under K8s mode, 'fateflow' is the service name, which will be a L4 load balancer. 
+ host: fateflow + http_port: 9380 + grpc_port: 9360 + # use random instance_id instead of {host}:{http_port} + random_instance_id: false + # support rollsite/nginx/fateflow as a coordination proxy # rollsite support fate on eggroll, use grpc protocol # nginx support fate on eggroll and fate on spark, use http or grpc protocol, default is http @@ -99,13 +109,6 @@ data: port: {{ .Values.externalMysqlPort | default .Values.modules.mysql.port | default "3306" }} max_connections: 100 stale_timeout: 30 - zookeeper: - hosts: - - "serving-zookeeper:2181" - # use_acl: false - # user: fate - # password: fate - # engine services default_engines: {{- if eq .Values.computing "Spark_local" }} computing: "spark" @@ -139,11 +142,11 @@ data: token_code: MLSS python_path: /data/projects/fate/python hive: - host: 127.0.0.1 - port: 10000 - auth_mechanism: - username: - password: + host: {{ .Values.modules.python.hive.host }} + port: {{ .Values.modules.python.hive.port }} + auth_mechanism: {{ .Values.modules.python.hive.auth_mechanism }} + username: {{ .Values.modules.python.hive.username }} + password: {{ .Values.modules.python.hive.password }} linkis_hive: host: 127.0.0.1 port: 9001 @@ -166,7 +169,9 @@ data: host: {{ .Values.modules.python.pulsar.host }} port: {{ .Values.modules.python.pulsar.port }} mng_port: {{ .Values.modules.python.pulsar.mng_port }} - topic_ttl: 3 + topic_ttl: {{ .Values.modules.python.pulsar.topic_ttl }} + cluster: {{ .Values.modules.python.pulsar.cluster }} + tenant: {{ .Values.modules.python.pulsar.tenant }} # default conf/pulsar_route_table.yaml route_table: conf/pulsar_route_table/pulsar_route_table.yaml # mode: replication / client, default: replication @@ -179,14 +184,14 @@ data: fateboard: host: fateboard port: 8080 - enable_model_store: false + enable_model_store: true model_store_address: storage: mysql - name: {{ .Values.externalMysqlDatabase | default .Values.modules.mysql.database | default "eggroll_meta" }} + database: {{ 
.Values.externalMysqlDatabase | default .Values.modules.mysql.database | default "eggroll_meta" }} host: '{{ .Values.externalMysqlIp | default .Values.modules.mysql.ip | default "mysql" }}' port: {{ .Values.externalMysqlPort | default .Values.modules.mysql.port | default "3306" }} user: '{{ .Values.externalMysqlUser | default .Values.modules.mysql.user | default "fate" }}' - passwd: '{{ .Values.externalMysqlPassword | default .Values.modules.mysql.password | default "fate_dev" }}' + password: '{{ .Values.externalMysqlPassword | default .Values.modules.mysql.password | default "fate_dev" }}' max_connections: 10 stale_timeout: 10 {{- with .Values.modules.serving }} @@ -197,9 +202,15 @@ data: {{- else }} - '' {{- end }} - {{- if and .useRegistry .zookeeper }} zookeeper: + {{- if .zookeeper }} {{ toYaml .zookeeper | indent 6 }} + {{- else}} + hosts: + - serving-zookeeper.fate-serving-9999:2181 + use_acl: false + user: fate + password: fate {{- end }} {{- end }} transfer_conf.yaml: | @@ -246,8 +257,8 @@ data: federated_command_trys: 3 end_status_job_scheduling_time_limit: 300000 # ms end_status_job_scheduling_updates: 1 - auto_retries: 0 - auto_retry_delay: 1 #seconds + auto_retries: {{ .Values.modules.python.failedTaskAutoRetryTimes }} + auto_retry_delay: {{ .Values.modules.python.failedTaskAutoRetryDelay }} #seconds # It can also be specified in the job configuration using the federated_status_collect_type parameter federated_status_collect_type: PUSH detect_connect_max_retry_count: 3 diff --git a/helm-charts/FATE/templates/core/fateflow/service.yaml b/helm-charts/FATE/templates/core/fateflow/service.yaml index e2d7bce37..a94757b56 100644 --- a/helm-charts/FATE/templates/core/fateflow/service.yaml +++ b/helm-charts/FATE/templates/core/fateflow/service.yaml @@ -57,11 +57,9 @@ spec: {{- end }} protocol: TCP type: {{ .Values.modules.python.type }} - {{- if .Values.modules.python.loadBalancerIP }} loadBalancerIP: "{{ .Values.modules.python.loadBalancerIP }}" {{- end }} - 
selector: fateMoudle: python {{ include "fate.matchLabels" . | indent 4 }} diff --git a/helm-charts/FATE/templates/core/python-spark.yaml b/helm-charts/FATE/templates/core/python-spark.yaml index ca14a14be..ce597d5f6 100644 --- a/helm-charts/FATE/templates/core/python-spark.yaml +++ b/helm-charts/FATE/templates/core/python-spark.yaml @@ -19,7 +19,7 @@ metadata: {{ include "fate.labels" . | indent 4 }} spec: serviceName: fateflow - replicas: 1 + replicas: {{ .Values.modules.python.replicas }} selector: matchLabels: fateMoudle: python @@ -123,7 +123,7 @@ spec: cp /data/projects/fate/conf-tmp/component_registry.json /data/projects/fate/fateflow/conf/component_registry.json cp /data/projects/fate/conf-tmp/job_default_config.yaml /data/projects/fate/fateflow/conf/job_default_config.yaml # fix fateflow conf must use IP - sed -i "s/host: fateflow/host: ${POD_IP}/g" /data/projects/fate/conf/service_conf.yaml + sed -i "s/host: fateflow_ip/host: ${POD_IP}/g" /data/projects/fate/conf/service_conf.yaml cp /data/projects/spark-3.1.3-bin-hadoop3.2/conf/spark-defaults-template.conf /data/projects/spark-3.1.3-bin-hadoop3.2/conf/spark-defaults.conf sed -i "s/fateflow/${POD_IP}/g" /data/projects/spark-3.1.3-bin-hadoop3.2/conf/spark-defaults.conf @@ -178,53 +178,6 @@ spec: - mountPath: /data/projects/fate/fateflow/model_local_cache name: python-data subPath: model-local-cache - {{- if .Values.modules.fateboard.include }} - - image: {{ .Values.image.registry }}/fateboard:{{ .Values.image.tag }} - imagePullPolicy: {{ .Values.image.pullPolicy }} - name: fateboard - ports: - - containerPort: 8080 - livenessProbe: - httpGet: - path: / - port: 8080 - httpHeaders: - - name: X-Custom-Header - value: livenessProbe - initialDelaySeconds: 1 - periodSeconds: 10 - timeoutSeconds: 3 - successThreshold: 1 - failureThreshold: 3 - readinessProbe: - httpGet: - path: / - port: 8080 - httpHeaders: - - name: X-Custom-Header - value: readinessProbe - initialDelaySeconds: 1 - periodSeconds: 10 - 
timeoutSeconds: 3 - successThreshold: 1 - failureThreshold: 3 - startupProbe: - httpGet: - path: / - port: 8080 - httpHeaders: - - name: X-Custom-Header - value: startupProbe - failureThreshold: 12 - periodSeconds: 10 - volumeMounts: - - mountPath: /data/projects/fate/fateboard/conf/application.properties - name: fateboard-confs - subPath: application.properties - - name: python-data - mountPath: /data/projects/fate/fateflow/logs - subPath: logs - {{- end }} {{- with .Values.modules.python.nodeSelector }} nodeSelector: {{ toYaml . | indent 8 }} @@ -266,11 +219,6 @@ spec: configMap: name: pulsar-route-table {{- end }} - {{- if .Values.modules.fateboard.include }} - - name: fateboard-confs - configMap: - name: fateboard-config - {{- end }} {{- if not .Values.persistence.enabled }} - name: python-data emptyDir: {} diff --git a/helm-charts/FATE/values-template-example.yaml b/helm-charts/FATE/values-template-example.yaml index c2d39e492..3e4770d0e 100644 --- a/helm-charts/FATE/values-template-example.yaml +++ b/helm-charts/FATE/values-template-example.yaml @@ -1,11 +1,11 @@ name: fate-9999 namespace: fate-9999 chartName: fate -chartVersion: v1.9.2 +chartVersion: v1.10.0 partyId: 9999 registry: "" -pullPolicy: -imagePullSecrets: +pullPolicy: +imagePullSecrets: - name: myregistrykey persistence: false istio: @@ -47,27 +47,27 @@ skippedKeys: # hosts: # - name: party9999.fateboard.example.com # path: / - # tls: + # tls: # - secretName: my-tls-secret # hosts: # - party9999.fateboard.example.com - # client: + # client: # hosts: # - name: party9999.notebook.example.com - # spark: + # spark: # hosts: # - name: party9999.spark.example.com - # rabbitmq: + # rabbitmq: # hosts: # - name: party9999.rabbitmq.example.com - # pulsar: + # pulsar: # hosts: # - name: party9999.pulsar.example.com - -# rollsite: + +# rollsite: # type: NodePort # nodePort: 30091 - # loadBalancerIP: + # loadBalancerIP: # exchange: # ip: 192.168.0.1 # port: 30000 @@ -155,72 +155,88 @@ skippedKeys: # python: - 
# type: NodePort - # httpNodePort: 30097 - # grpcNodePort: 30092 - # loadBalancerIP: - # serviceAccountName: "" - # nodeSelector: - # tolerations: - # affinity: - # enabledNN: false - # logLevel: INFO - # existingClaim: "" - # storageClass: "python" - # accessMode: ReadWriteMany - # size: 1Gi - # resources: - # requests: - # cpu: "2" - # memory: "4Gi" - # limits: - # cpu: "4" - # memory: "8Gi" - # clustermanager: - # cores_per_node: 16 - # nodes: 2 - # spark: - # cores_per_node: 20 - # nodes: 2 - # master: spark://spark-master:7077 - # driverHost: - # driverHostType: - # portMaxRetries: - # driverStartPort: - # blockManagerStartPort: - # pysparkPython: - # hdfs: - # name_node: hdfs://namenode:9000 - # path_prefix: - # rabbitmq: - # host: rabbitmq - # mng_port: 15672 - # port: 5672 - # user: fate - # password: fate - # pulsar: - # host: pulsar - # mng_port: 8080 - # port: 6650 - # nginx: - # host: nginx - # http_port: 9300 - # grpc_port: 9310 +# type: NodePort +# replicas: 1 +# httpNodePort: 30097 +# grpcNodePort: 30092 +# loadBalancerIP: +# serviceAccountName: "" +# nodeSelector: +# tolerations: +# affinity: +# failedTaskAutoRetryTimes: +# failedTaskAutoRetryDelay: +# logLevel: INFO +# existingClaim: "" +# storageClass: "python" +# accessMode: ReadWriteMany +# dependent_distribution: false +# size: 1Gi +# resources: +# requests: +# cpu: "2" +# memory: "4Gi" +# limits: +# cpu: "4" +# memory: "8Gi" +# clustermanager: +# cores_per_node: 16 +# nodes: 2 +# spark: +# cores_per_node: 20 +# nodes: 2 +# master: spark://spark-master:7077 +# driverHost: +# driverHostType: +# portMaxRetries: +# driverStartPort: +# blockManagerStartPort: +# pysparkPython: +# hdfs: +# name_node: hdfs://namenode:9000 +# path_prefix: +# rabbitmq: +# host: rabbitmq +# mng_port: 15672 +# port: 5672 +# user: fate +# password: fate +# pulsar: +# host: pulsar +# mng_port: 8080 +# port: 6650 +# topic_ttl: 3 +# cluster: standalone +# tenant: fl-tenant +# nginx: +# host: nginx +# http_port: 9300 +# 
grpc_port: 9310 +# hive: +# host: 127.0.0.1 +# port: 10000 +# auth_mechanism: +# username: +# password: + +# fateboard: +# type: ClusterIP +# username: admin +# password: admin +# nodeSelector: +# tolerations: +# affinity: -# fateboard: - # type: ClusterIP - # username: admin - # password: admin - # client: - # nodeSelector: + # nodeSelector: # subPath: "" # existingClaim: "" # storageClass: "client" # accessMode: ReadWriteOnce # size: 1Gi + # notebook_hashed_password: "" -# mysql: +# mysql: # nodeSelector: # tolerations: # affinity: @@ -246,17 +262,19 @@ skippedKeys: # servingIp: 192.168.0.1 # servingPort: 30095 # serving: - # useRegistry: false - # zookeeper: - # hosts: - # - serving-zookeeper.fate-serving-9999:2181 - # use_acl: false +# useRegistry: false +# zookeeper: +# hosts: +# - serving-zookeeper.fate-serving-9999:2181 +# use_acl: false +# user: fate +# password: fate # spark: # master: # Image: "federatedai/spark-master" - # ImageTag: "1.9.2-release" + # ImageTag: "1.10.0-release" # replicas: 1 # resources: # requests: @@ -272,7 +290,7 @@ skippedKeys: # nodePort: 30977 # worker: # Image: "federatedai/spark-worker" - # ImageTag: "1.9.2-release" + # ImageTag: "1.10.0-release" # replicas: 2 # resources: # requests: @@ -318,13 +336,13 @@ skippedKeys: # ip: 192.168.10.1 # httpPort: 30003 # grpcPort: 30008 - # route_table: - # 10000: - # proxy: - # - host: 192.168.0.1 + # route_table: + # 10000: + # proxy: + # - host: 192.168.0.1 # http_port: 30103 - # grpc_port: 30108 - # fateflow: + # grpc_port: 30108 + # fateflow: # - host: 192.168.0.1 # http_port: 30107 # grpc_port: 30102 diff --git a/helm-charts/FATE/values-template.yaml b/helm-charts/FATE/values-template.yaml index cdd68942e..8a5ae4611 100644 --- a/helm-charts/FATE/values-template.yaml +++ b/helm-charts/FATE/values-template.yaml @@ -217,6 +217,7 @@ modules: python: include: {{ has "python" .modules }} {{- with .python }} + replicas: {{ .replicas | default 1 }} {{- with .resources }} resources: {{ toYaml . 
| indent 6 }} @@ -226,6 +227,7 @@ modules: httpNodePort: {{ .httpNodePort }} grpcNodePort: {{ .grpcNodePort }} loadBalancerIP: {{ .loadBalancerIP }} + dependent_distribution: {{ .dependent_distribution }} serviceAccountName: {{ .serviceAccountName }} {{- with .nodeSelector }} nodeSelector: @@ -239,6 +241,8 @@ modules: affinity: {{ toYaml . | indent 6 }} {{- end }} + failedTaskAutoRetryTimes: {{ .failedTaskAutoRetryTimes | default 5 }} + failedTaskAutoRetryDelay: {{ .failedTaskAutoRetryDelay | default 60 }} existingClaim: {{ .existingClaim }} claimName: {{ .claimName | default "python-data" }} storageClass: {{ .storageClass | default "python" }} @@ -264,6 +268,9 @@ modules: host: {{ .host }} mng_port: {{ .mng_port }} port: {{ .port }} + topic_ttl: {{ .topic_ttl }} + cluster: {{ .cluster }} + tenant: {{ .tenant }} {{- end }} {{- with .rabbitmq }} rabbitmq: @@ -279,6 +286,14 @@ modules: http_port: {{ .http_port }} grpc_port: {{ .grpc_port }} {{- end }} + {{- with .hive }} + hive: + host: {{ .host }} + port: {{ .port }} + auth_mechanism: {{ .auth_mechanism }} + username: {{ .username }} + password: {{ .password }} + {{- end }} {{- end }} @@ -313,7 +328,8 @@ modules: sessionProcessorsPerNode: {{ .sessionProcessorsPerNode }} replicas: {{ .replicas | default 2 }} subPath: {{ .subPath }} - storageClass: {{ .storageClass | default "client" }} + storageClass: {{ .storageClass | default "nodemanager" }} + existingClaim: {{ .existingClaim }} accessMode: {{ .accessMode | default "ReadWriteOnce" }} size: {{ .size | default "1Gi" }} {{- with .nodeSelector }} @@ -355,6 +371,7 @@ modules: affinity: {{ toYaml . | indent 6 }} {{- end }} + notebook_hashed_password: {{ .notebook_hashed_password | default "" }} {{- end }} @@ -402,6 +419,18 @@ modules: type: {{ .type }} username: {{ .username }} password: {{ .password }} + {{- with .nodeSelector }} + nodeSelector: +{{ toYaml . | indent 6 }} + {{- end }} + {{- with .tolerations }} + tolerations: +{{ toYaml . 
| indent 6 }} + {{- end }} + {{- with .affinity }} + affinity: +{{ toYaml . | indent 6 }} + {{- end }} {{- end}} spark: diff --git a/helm-charts/FATE/values.yaml b/helm-charts/FATE/values.yaml index abc4378d6..84515aec4 100644 --- a/helm-charts/FATE/values.yaml +++ b/helm-charts/FATE/values.yaml @@ -2,7 +2,7 @@ image: registry: federatedai isThridParty: - tag: 1.9.2-release + tag: 1.10.0-release pullPolicy: IfNotPresent imagePullSecrets: # - name: @@ -120,6 +120,7 @@ modules: affinity: python: include: true + replicas: 1 type: ClusterIP httpNodePort: 30097 grpcNodePort: 30092 @@ -128,9 +129,12 @@ modules: nodeSelector: tolerations: affinity: + failedTaskAutoRetryTimes: + failedTaskAutoRetryDelay: logLevel: INFO # subPath: "" existingClaim: + dependent_distribution: false claimName: python-data storageClass: accessMode: ReadWriteOnce @@ -159,12 +163,21 @@ modules: password: fate pulsar: host: pulsar - mng_port: 8080 port: 6650 + mng_port: 8080 + topic_ttl: 3 + cluster: standalone + tenant: fl-tenant nginx: host: nginx http_port: 9300 grpc_port: 9310 + hive: + host: + port: + auth_mechanism: + username: + password: client: include: true ip: client @@ -177,6 +190,7 @@ modules: storageClass: accessMode: ReadWriteOnce size: 1Gi + notebook_hashed_password: clustermanager: include: true ip: clustermanager @@ -201,19 +215,6 @@ modules: cpu: "2" memory: "4Gi" - client: - include: true - ip: client - type: ClusterIP - nodeSelector: - tolerations: - affinity: - subPath: "client" - existingClaim: - storageClass: - accessMode: ReadWriteOnce - size: 1Gi - mysql: include: true type: ClusterIP @@ -231,6 +232,7 @@ modules: storageClass: accessMode: ReadWriteOnce size: 1Gi + serving: ip: 192.168.9.1 port: 30095 @@ -238,12 +240,18 @@ modules: zookeeper: hosts: - serving-zookeeper.fate-serving-9999:2181 - use_acl: false + use_acl: false + user: fate + password: fate + fateboard: include: true type: ClusterIP username: admin password: admin + nodeSelector: + tolerations: + affinity: 
spark: include: true diff --git a/helm-charts/FedML-Client/Chart.yaml b/helm-charts/FedML-Client/Chart.yaml new file mode 100644 index 000000000..2b6066dde --- /dev/null +++ b/helm-charts/FedML-Client/Chart.yaml @@ -0,0 +1,6 @@ +apiVersion: v2 +appVersion: release +description: A Helm chart for FedML Edge Client +name: fedml-client +type: application +version: 0.7.355 diff --git a/helm-charts/FedML-Client/templates/NOTES.txt b/helm-charts/FedML-Client/templates/NOTES.txt new file mode 100644 index 000000000..10ed1b5e1 --- /dev/null +++ b/helm-charts/FedML-Client/templates/NOTES.txt @@ -0,0 +1 @@ +You may visit fedml documents by the URL https://doc.fedml.ai. \ No newline at end of file diff --git a/helm-charts/FedML-Client/templates/_helpers.tpl b/helm-charts/FedML-Client/templates/_helpers.tpl new file mode 100644 index 000000000..2e8cc341c --- /dev/null +++ b/helm-charts/FedML-Client/templates/_helpers.tpl @@ -0,0 +1,68 @@ +{{/* +Expand the name of the chart. +*/}} +{{- define "fedml-edge-client.name" -}} +{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Create a default fully qualified app name. +We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). +If release name contains chart name it will be used as a full name. +*/}} +{{- define "fedml-edge-client.fullname" -}} +{{- if .Values.fullnameOverride }} +{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- $name := default .Chart.Name .Values.nameOverride }} +{{- if contains $name .Release.Name }} +{{- .Release.Name | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }} +{{- end }} +{{- end }} +{{- end }} + +{{/* +Create chart name and version as used by the chart label. 
+*/}} +{{- define "fedml-edge-client.chart" -}} +{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Common labels +*/}} +{{- define "fedml-edge-client.labels" -}} +helm.sh/chart: {{ include "fedml-edge-client.chart" . }} +{{ include "fedml-edge-client.selectorLabels" . }} +{{- if .Chart.AppVersion }} +app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} +{{- end }} +app.kubernetes.io/managed-by: {{ .Release.Service }} +owner: kubefate +cluster: fedml-client +heritage: {{ .Release.Service }} +release: {{ .Release.Name }} +chart: {{ .Chart.Name }} +{{- end }} + +{{/* +Selector labels +*/}} +{{- define "fedml-edge-client.selectorLabels" -}} +app.kubernetes.io/name: {{ include "fedml-edge-client.name" . }} +app.kubernetes.io/instance: {{ .Release.Name }} +name: {{ .Release.Name | quote }} +{{- end }} + +{{/* +Create the name of the service account to use +*/}} +{{- define "fedml-edge-client.serviceAccountName" -}} +{{- if .Values.serviceAccount.create }} +{{- default (include "fedml-edge-client.fullname" .) .Values.serviceAccount.name }} +{{- else }} +{{- default "default" .Values.serviceAccount.name }} +{{- end }} +{{- end }} diff --git a/helm-charts/FedML-Client/templates/deployment.yaml b/helm-charts/FedML-Client/templates/deployment.yaml new file mode 100644 index 000000000..85f8cb79d --- /dev/null +++ b/helm-charts/FedML-Client/templates/deployment.yaml @@ -0,0 +1,70 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ include "fedml-edge-client.fullname" . }} + labels: + {{- include "fedml-edge-client.labels" . | nindent 4 }} +spec: + {{- if not .Values.autoscaling.enabled }} + replicas: {{ .Values.replicaCount }} + {{- end }} + selector: + matchLabels: + {{- include "fedml-edge-client.selectorLabels" . | nindent 6 }} + template: + metadata: + {{- with .Values.podAnnotations }} + annotations: + {{- toYaml . 
| nindent 8 }} + {{- end }} + labels: + {{- include "fedml-edge-client.selectorLabels" . | nindent 8 }} + spec: + {{- with .Values.imagePullSecrets }} + imagePullSecrets: + {{- toYaml . | nindent 8 }} + {{- end }} + serviceAccountName: {{ include "fedml-edge-client.serviceAccountName" . }} + securityContext: + {{- toYaml .Values.podSecurityContext | nindent 8 }} + containers: + - name: {{ .Chart.Name }} + securityContext: + {{- toYaml .Values.securityContext | nindent 12 }} + image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}" + imagePullPolicy: {{ .Values.image.pullPolicy }} + env: + - name: ACCOUNT_ID + value: "{{ .Values.env.fedmlAccountId }}" + - name: FEDML_VERSION + value: "{{ .Values.env.fedmlVersion }}" + - name: CLIENT_OS_NAME + value: "{{ .Values.env.fedmlClientOsName }}" + - name: CLIENT_DEVICE_ID + value: {{ include "fedml-edge-client.fullname" . }} + ports: + - name: http + containerPort: {{ .Values.service.port }} + protocol: TCP + resources: + {{- toYaml .Values.resources | nindent 12 }} + volumeMounts: + - name: home-dir + mountPath: {{ .Values.volume.clientHomeDirMountPath }} + {{- with .Values.nodeSelector }} + nodeSelector: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.affinity }} + affinity: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.tolerations }} + tolerations: + {{- toYaml . | nindent 8 }} + {{- end }} + volumes: + - name: home-dir + hostPath: + type: DirectoryOrCreate + path: {{ .Values.volume.clientHomeDirHostPath }} diff --git a/helm-charts/FedML-Client/templates/hpa.yaml b/helm-charts/FedML-Client/templates/hpa.yaml new file mode 100644 index 000000000..344aee18e --- /dev/null +++ b/helm-charts/FedML-Client/templates/hpa.yaml @@ -0,0 +1,28 @@ +{{- if .Values.autoscaling.enabled }} +apiVersion: autoscaling/v2beta1 +kind: HorizontalPodAutoscaler +metadata: + name: {{ include "fedml-edge-client.fullname" . }} + labels: + {{- include "fedml-edge-client.labels" . 
| nindent 4 }} +spec: + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + name: {{ include "fedml-edge-client.fullname" . }} + minReplicas: {{ .Values.autoscaling.minReplicas }} + maxReplicas: {{ .Values.autoscaling.maxReplicas }} + metrics: + {{- if .Values.autoscaling.targetCPUUtilizationPercentage }} + - type: Resource + resource: + name: cpu + targetAverageUtilization: {{ .Values.autoscaling.targetCPUUtilizationPercentage }} + {{- end }} + {{- if .Values.autoscaling.targetMemoryUtilizationPercentage }} + - type: Resource + resource: + name: memory + targetAverageUtilization: {{ .Values.autoscaling.targetMemoryUtilizationPercentage }} + {{- end }} +{{- end }} diff --git a/helm-charts/FedML-Client/templates/ingress.yaml b/helm-charts/FedML-Client/templates/ingress.yaml new file mode 100644 index 000000000..f8c7bf1ff --- /dev/null +++ b/helm-charts/FedML-Client/templates/ingress.yaml @@ -0,0 +1,61 @@ +{{- if .Values.ingress.enabled -}} +{{- $fullName := include "fedml-edge-client.fullname" . -}} +{{- $svcPort := .Values.service.port -}} +{{- if and .Values.ingress.className (not (semverCompare ">=1.18-0" .Capabilities.KubeVersion.GitVersion)) }} + {{- if not (hasKey .Values.ingress.annotations "kubernetes.io/ingress.class") }} + {{- $_ := set .Values.ingress.annotations "kubernetes.io/ingress.class" .Values.ingress.className}} + {{- end }} +{{- end }} +{{- if semverCompare ">=1.19-0" .Capabilities.KubeVersion.GitVersion -}} +apiVersion: networking.k8s.io/v1 +{{- else if semverCompare ">=1.14-0" .Capabilities.KubeVersion.GitVersion -}} +apiVersion: networking.k8s.io/v1beta1 +{{- else -}} +apiVersion: extensions/v1beta1 +{{- end }} +kind: Ingress +metadata: + name: {{ $fullName }} + labels: + {{- include "fedml-edge-client.labels" . | nindent 4 }} + {{- with .Values.ingress.annotations }} + annotations: + {{- toYaml . 
| nindent 4 }} + {{- end }} +spec: + {{- if and .Values.ingress.className (semverCompare ">=1.18-0" .Capabilities.KubeVersion.GitVersion) }} + ingressClassName: {{ .Values.ingress.className }} + {{- end }} + {{- if .Values.ingress.tls }} + tls: + {{- range .Values.ingress.tls }} + - hosts: + {{- range .hosts }} + - {{ . | quote }} + {{- end }} + secretName: {{ .secretName }} + {{- end }} + {{- end }} + rules: + {{- range .Values.ingress.hosts }} + - host: {{ .host | quote }} + http: + paths: + {{- range .paths }} + - path: {{ .path }} + {{- if and .pathType (semverCompare ">=1.18-0" $.Capabilities.KubeVersion.GitVersion) }} + pathType: {{ .pathType }} + {{- end }} + backend: + {{- if semverCompare ">=1.19-0" $.Capabilities.KubeVersion.GitVersion }} + service: + name: {{ $fullName }} + port: + number: {{ $svcPort }} + {{- else }} + serviceName: {{ $fullName }} + servicePort: {{ $svcPort }} + {{- end }} + {{- end }} + {{- end }} +{{- end }} diff --git a/helm-charts/FedML-Client/templates/service.yaml b/helm-charts/FedML-Client/templates/service.yaml new file mode 100644 index 000000000..08debca12 --- /dev/null +++ b/helm-charts/FedML-Client/templates/service.yaml @@ -0,0 +1,15 @@ +apiVersion: v1 +kind: Service +metadata: + name: {{ include "fedml-edge-client.fullname" . }} + labels: + {{- include "fedml-edge-client.labels" . | nindent 4 }} +spec: + type: {{ .Values.service.type }} + ports: + - port: {{ .Values.service.port }} + targetPort: http + protocol: TCP + name: http + selector: + {{- include "fedml-edge-client.selectorLabels" . | nindent 4 }} diff --git a/helm-charts/FedML-Client/templates/serviceaccount.yaml b/helm-charts/FedML-Client/templates/serviceaccount.yaml new file mode 100644 index 000000000..8c5442e4d --- /dev/null +++ b/helm-charts/FedML-Client/templates/serviceaccount.yaml @@ -0,0 +1,12 @@ +{{- if .Values.serviceAccount.create -}} +apiVersion: v1 +kind: ServiceAccount +metadata: + name: {{ include "fedml-edge-client.serviceAccountName" . 
}} + labels: + {{- include "fedml-edge-client.labels" . | nindent 4 }} + {{- with .Values.serviceAccount.annotations }} + annotations: + {{- toYaml . | nindent 4 }} + {{- end }} +{{- end }} diff --git a/helm-charts/FedML-Client/templates/tests/test-connection.yaml b/helm-charts/FedML-Client/templates/tests/test-connection.yaml new file mode 100644 index 000000000..027f95340 --- /dev/null +++ b/helm-charts/FedML-Client/templates/tests/test-connection.yaml @@ -0,0 +1,15 @@ +apiVersion: v1 +kind: Pod +metadata: + name: "{{ include "fedml-edge-client.fullname" . }}-test-connection" + labels: + {{- include "fedml-edge-client.labels" . | nindent 4 }} + annotations: + "helm.sh/hook": test +spec: + containers: + - name: wget + image: busybox + command: ['wget'] + args: ['{{ include "fedml-edge-client.fullname" . }}:{{ .Values.service.port }}'] + restartPolicy: Never diff --git a/helm-charts/FedML-Client/values-template-example.yaml b/helm-charts/FedML-Client/values-template-example.yaml new file mode 100644 index 000000000..771710352 --- /dev/null +++ b/helm-charts/FedML-Client/values-template-example.yaml @@ -0,0 +1,62 @@ +name: edge-client-1 +namespace: fedml-edge-client-1 +chartName: fedml-client +chartVersion: 0.7.355 +# registry: "" +# pullPolicy: IfNotPresent +# imagePullSecrets: + # - name: myregistrykey +# ingressClassName: nginx +modules: + - client + +# ingress: + # client: + # annotations: {} + # hosts: + # - host: chart-example.local + # paths: + # - path: / + # pathType: ImplementationSpecific + # tls: + # - secretName: chart-example-tls + # hosts: + # - chart-example.local + +client: + fedmlAccountId: "1183" + # fedmlVersion: "release" + # fedmlClientOsName: "Linux" + # replicaCount: 1 + # volume: + # clientHomeDirHostPath: "/home/fedml-client" + # clientHomeDirMountPath: "/home/fedml/fedml-client" + # nameOverride: "" + # fullnameOverride: "" + # serviceAccount: + # create: true + # annotations: {} + # name: "" + # type: ClusterIP + # port: 9988 + # 
podAnnotations: + # nodeSelector: + # tolerations: + # affinity: + # resources: + # autoscaling: + # enabled: false + # minReplicas: 1 + # maxReplicas: 10 + # targetCPUUtilizationPercentage: 80 + # targetMemoryUtilizationPercentage: 80 + # podSecurityContext: {} + # fsGroup: 2000 + # securityContext: { } + # capabilities: + # drop: + # - ALL + # readOnlyRootFilesystem: true + # runAsNonRoot: true + # runAsUser: 1000 + diff --git a/helm-charts/FedML-Client/values-template.yaml b/helm-charts/FedML-Client/values-template.yaml new file mode 100644 index 000000000..775aa11e5 --- /dev/null +++ b/helm-charts/FedML-Client/values-template.yaml @@ -0,0 +1,115 @@ +image: + repository: {{ .registry | default "public.ecr.aws/x6k8q1x9" }}/fedml-client-agent + pullPolicy: {{ .pullPolicy | default "IfNotPresent" }} + tag: "" + +{{- with .imagePullSecrets }} +imagePullSecrets: + {{- toYaml . | nindent 2 }} +{{- end }} + +{{- with .ingress }} +{{- if hasKey . "client" }} +ingress: + enabled: true + className: {{ $.ingressClassName }} + {{- with .client }} + {{- with .annotations }} + annotations: + {{- toYaml . | nindent 4 }} + {{- end }} + {{- with .hosts }} + hosts: + {{- toYaml . | nindent 4 }} + {{- end }} + {{- with .tls }} + tls: + {{- toYaml . | nindent 4 }} + {{- end }} + {{- end }} +{{- end }} +{{- end }} + +{{- with .client }} +replicaCount: {{ .replicaCount | default 1 }} +{{- if .nameOverride }} +nameOverride: {{ .nameOverride }} +{{- end }} +{{- if .fullnameOverride }} +fullnameOverride: {{ .fullnameOverride }} +{{- end }} + +{{- with .serviceAccount }} +serviceAccount: + create: {{ .create | default true }} + {{- with .annotations }} + annotations: + {{- toYaml . | nindent 4 }} + {{- end }} + name: {{ .name }} +{{- end }} + +{{- with .podAnnotations }} +podAnnotations: + {{- toYaml . | nindent 2 }} +{{- end }} + +{{- with .podSecurityContext }} +podSecurityContext: + {{- toYaml . 
 | nindent 2 }}
+{{- end }}
+
+{{- with .securityContext }}
+securityContext:
+  {{- toYaml . | nindent 2 }}
+{{- end }}
+
+service:
+  type: {{ .type | default "ClusterIP" }}
+  port: {{ .port | default 9988 }}
+
+{{- with .resources }}
+resources:
+  {{- toYaml . | nindent 2 }}
+{{- end }}
+
+{{- with .autoscaling }}
+autoscaling:
+  enabled: false
+  minReplicas: {{ .minReplicas | default 1 }}
+  maxReplicas: {{ .maxReplicas | default 10 }}
+  {{- if .targetCPUUtilizationPercentage }}
+  targetCPUUtilizationPercentage: {{ .targetCPUUtilizationPercentage }}
+  {{- end }}
+  {{- if .targetMemoryUtilizationPercentage }}
+  targetMemoryUtilizationPercentage: {{ .targetMemoryUtilizationPercentage }}
+  {{- end }}
+{{- end }}
+
+{{- with .nodeSelector }}
+nodeSelector:
+  {{- toYaml . | nindent 2 }}
+{{- end }}
+
+{{- with .tolerations }}
+tolerations:
+  {{- toYaml . | nindent 2 }}
+{{- end }}
+
+{{- with .affinity }}
+affinity:
+  {{- toYaml . | nindent 2 }}
+{{- end }}
+
+env:
+  fedmlAccountId: {{ .fedmlAccountId }}
+  fedmlVersion: {{ .fedmlVersion | default "release" }}
+  fedmlClientOsName: {{ .fedmlClientOsName | default "Linux" }}
+
+{{- with .volume }}
+volume:
+  clientHomeDirHostPath: {{ .clientHomeDirHostPath | default "/home/fedml-client" }}
+  clientHomeDirMountPath: {{ .clientHomeDirMountPath | default "/home/fedml/fedml-client" }}
+{{- end }}
+
+{{- end -}}
diff --git a/helm-charts/FedML-Client/values.yaml b/helm-charts/FedML-Client/values.yaml
new file mode 100644
index 000000000..585ff01a2
--- /dev/null
+++ b/helm-charts/FedML-Client/values.yaml
@@ -0,0 +1,91 @@
+# Default values for fedml-edge-client.
+# This is a YAML-formatted file.
+# Declare variables to be passed into your templates.
+
+replicaCount: 1
+
+image:
+  repository: public.ecr.aws/x6k8q1x9/fedml-client-agent
+  pullPolicy: IfNotPresent
+  # Overrides the image tag whose default is the chart appVersion.
+ tag: "" + +imagePullSecrets: [] +nameOverride: "" +fullnameOverride: "" + +serviceAccount: + # Specifies whether a service account should be created + create: true + # Annotations to add to the service account + annotations: {} + # The name of the service account to use. + # If not set and create is true, a name is generated using the fullname template + name: "" + +podAnnotations: {} + +podSecurityContext: {} + # fsGroup: 2000 + +securityContext: {} + # capabilities: + # drop: + # - ALL + # readOnlyRootFilesystem: true + # runAsNonRoot: true + # runAsUser: 1000 + +service: + type: ClusterIP + port: 9988 + +ingress: + enabled: false + className: "" + annotations: {} + # kubernetes.io/ingress.class: nginx + # kubernetes.io/tls-acme: "true" + hosts: + - host: chart-example.local + paths: + - path: / + pathType: ImplementationSpecific + tls: [] + # - secretName: chart-example-tls + # hosts: + # - chart-example.local + +resources: {} + # We usually recommend not to specify default resources and to leave this as a conscious + # choice for the user. This also increases chances charts run on environments with little + # resources, such as Minikube. If you do want to specify resources, uncomment the following + # lines, adjust them as necessary, and remove the curly braces after 'resources:'. 
+ # limits: + # cpu: 100m + # memory: 128Mi + # requests: + # cpu: 100m + # memory: 128Mi + +autoscaling: + enabled: false + minReplicas: 1 + maxReplicas: 10 + targetCPUUtilizationPercentage: 80 + # targetMemoryUtilizationPercentage: 80 + +nodeSelector: {} + +tolerations: [] + +affinity: {} + +env: + fedmlAccountId: "1183" + fedmlVersion: "release" + fedmlClientOsName: "Linux" + +volume: + clientHomeDirHostPath: "/home/fedml-client" + clientHomeDirMountPath: "/home/fedml/fedml-client" diff --git a/helm-charts/FedML-Server/Chart.yaml b/helm-charts/FedML-Server/Chart.yaml new file mode 100644 index 000000000..8ffd69eeb --- /dev/null +++ b/helm-charts/FedML-Server/Chart.yaml @@ -0,0 +1,6 @@ +apiVersion: v2 +appVersion: release +description: A Helm chart for FedML Edge Server +name: fedml-server +type: application +version: 0.7.355 diff --git a/helm-charts/FedML-Server/templates/NOTES.txt b/helm-charts/FedML-Server/templates/NOTES.txt new file mode 100644 index 000000000..10ed1b5e1 --- /dev/null +++ b/helm-charts/FedML-Server/templates/NOTES.txt @@ -0,0 +1 @@ +You may visit fedml documents by the URL https://doc.fedml.ai. \ No newline at end of file diff --git a/helm-charts/FedML-Server/templates/_helpers.tpl b/helm-charts/FedML-Server/templates/_helpers.tpl new file mode 100644 index 000000000..06735dd1f --- /dev/null +++ b/helm-charts/FedML-Server/templates/_helpers.tpl @@ -0,0 +1,68 @@ +{{/* +Expand the name of the chart. +*/}} +{{- define "fedml-edge-server.name" -}} +{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Create a default fully qualified app name. +We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). +If release name contains chart name it will be used as a full name. 
+*/}} +{{- define "fedml-edge-server.fullname" -}} +{{- if .Values.fullnameOverride }} +{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- $name := default .Chart.Name .Values.nameOverride }} +{{- if contains $name .Release.Name }} +{{- .Release.Name | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }} +{{- end }} +{{- end }} +{{- end }} + +{{/* +Create chart name and version as used by the chart label. +*/}} +{{- define "fedml-edge-server.chart" -}} +{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Common labels +*/}} +{{- define "fedml-edge-server.labels" -}} +helm.sh/chart: {{ include "fedml-edge-server.chart" . }} +{{ include "fedml-edge-server.selectorLabels" . }} +{{- if .Chart.AppVersion }} +app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} +{{- end }} +app.kubernetes.io/managed-by: {{ .Release.Service }} +owner: kubefate +cluster: fedml-server +heritage: {{ .Release.Service }} +release: {{ .Release.Name }} +chart: {{ .Chart.Name }} +{{- end }} + +{{/* +Selector labels +*/}} +{{- define "fedml-edge-server.selectorLabels" -}} +app.kubernetes.io/name: {{ include "fedml-edge-server.name" . }} +app.kubernetes.io/instance: {{ .Release.Name }} +name: {{ .Release.Name | quote }} +{{- end }} + +{{/* +Create the name of the service account to use +*/}} +{{- define "fedml-edge-server.serviceAccountName" -}} +{{- if .Values.serviceAccount.create }} +{{- default (include "fedml-edge-server.fullname" .) 
.Values.serviceAccount.name }} +{{- else }} +{{- default "default" .Values.serviceAccount.name }} +{{- end }} +{{- end }} diff --git a/helm-charts/FedML-Server/templates/deployment.yaml b/helm-charts/FedML-Server/templates/deployment.yaml new file mode 100644 index 000000000..1f12d29ac --- /dev/null +++ b/helm-charts/FedML-Server/templates/deployment.yaml @@ -0,0 +1,70 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ include "fedml-edge-server.fullname" . }} + labels: + {{- include "fedml-edge-server.labels" . | nindent 4 }} +spec: + {{- if not .Values.autoscaling.enabled }} + replicas: {{ .Values.replicaCount }} + {{- end }} + selector: + matchLabels: + {{- include "fedml-edge-server.selectorLabels" . | nindent 6 }} + template: + metadata: + {{- with .Values.podAnnotations }} + annotations: + {{- toYaml . | nindent 8 }} + {{- end }} + labels: + {{- include "fedml-edge-server.selectorLabels" . | nindent 8 }} + spec: + {{- with .Values.imagePullSecrets }} + imagePullSecrets: + {{- toYaml . | nindent 8 }} + {{- end }} + serviceAccountName: {{ include "fedml-edge-server.serviceAccountName" . }} + securityContext: + {{- toYaml .Values.podSecurityContext | nindent 8 }} + containers: + - name: {{ .Chart.Name }} + securityContext: + {{- toYaml .Values.securityContext | nindent 12 }} + image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}" + imagePullPolicy: {{ .Values.image.pullPolicy }} + env: + - name: ACCOUNT_ID + value: "{{ .Values.env.fedmlAccountId }}" + - name: FEDML_VERSION + value: "{{ .Values.env.fedmlVersion }}" + - name: SERVER_OS_NAME + value: "{{ .Values.env.fedmlServerOsName }}" + - name: SERVER_DEVICE_ID + value: {{ include "fedml-edge-server.fullname" . 
}} + ports: + - name: http + containerPort: {{ .Values.service.port }} + protocol: TCP + resources: + {{- toYaml .Values.resources | nindent 12 }} + volumeMounts: + - name: home-dir + mountPath: {{ .Values.volume.serverHomeDirMountPath }} + {{- with .Values.nodeSelector }} + nodeSelector: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.affinity }} + affinity: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.tolerations }} + tolerations: + {{- toYaml . | nindent 8 }} + {{- end }} + volumes: + - name: home-dir + hostPath: + type: DirectoryOrCreate + path: {{ .Values.volume.serverHomeDirHostPath }} diff --git a/helm-charts/FedML-Server/templates/hpa.yaml b/helm-charts/FedML-Server/templates/hpa.yaml new file mode 100644 index 000000000..6fa213341 --- /dev/null +++ b/helm-charts/FedML-Server/templates/hpa.yaml @@ -0,0 +1,28 @@ +{{- if .Values.autoscaling.enabled }} +apiVersion: autoscaling/v2beta1 +kind: HorizontalPodAutoscaler +metadata: + name: {{ include "fedml-edge-server.fullname" . }} + labels: + {{- include "fedml-edge-server.labels" . | nindent 4 }} +spec: + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + name: {{ include "fedml-edge-server.fullname" . 
}} + minReplicas: {{ .Values.autoscaling.minReplicas }} + maxReplicas: {{ .Values.autoscaling.maxReplicas }} + metrics: + {{- if .Values.autoscaling.targetCPUUtilizationPercentage }} + - type: Resource + resource: + name: cpu + targetAverageUtilization: {{ .Values.autoscaling.targetCPUUtilizationPercentage }} + {{- end }} + {{- if .Values.autoscaling.targetMemoryUtilizationPercentage }} + - type: Resource + resource: + name: memory + targetAverageUtilization: {{ .Values.autoscaling.targetMemoryUtilizationPercentage }} + {{- end }} +{{- end }} diff --git a/helm-charts/FedML-Server/templates/ingress.yaml b/helm-charts/FedML-Server/templates/ingress.yaml new file mode 100644 index 000000000..24edd68ee --- /dev/null +++ b/helm-charts/FedML-Server/templates/ingress.yaml @@ -0,0 +1,61 @@ +{{- if .Values.ingress.enabled -}} +{{- $fullName := include "fedml-edge-server.fullname" . -}} +{{- $svcPort := .Values.service.port -}} +{{- if and .Values.ingress.className (not (semverCompare ">=1.18-0" .Capabilities.KubeVersion.GitVersion)) }} + {{- if not (hasKey .Values.ingress.annotations "kubernetes.io/ingress.class") }} + {{- $_ := set .Values.ingress.annotations "kubernetes.io/ingress.class" .Values.ingress.className}} + {{- end }} +{{- end }} +{{- if semverCompare ">=1.19-0" .Capabilities.KubeVersion.GitVersion -}} +apiVersion: networking.k8s.io/v1 +{{- else if semverCompare ">=1.14-0" .Capabilities.KubeVersion.GitVersion -}} +apiVersion: networking.k8s.io/v1beta1 +{{- else -}} +apiVersion: extensions/v1beta1 +{{- end }} +kind: Ingress +metadata: + name: {{ $fullName }} + labels: + {{- include "fedml-edge-server.labels" . | nindent 4 }} + {{- with .Values.ingress.annotations }} + annotations: + {{- toYaml . 
| nindent 4 }} + {{- end }} +spec: + {{- if and .Values.ingress.className (semverCompare ">=1.18-0" .Capabilities.KubeVersion.GitVersion) }} + ingressClassName: {{ .Values.ingress.className }} + {{- end }} + {{- if .Values.ingress.tls }} + tls: + {{- range .Values.ingress.tls }} + - hosts: + {{- range .hosts }} + - {{ . | quote }} + {{- end }} + secretName: {{ .secretName }} + {{- end }} + {{- end }} + rules: + {{- range .Values.ingress.hosts }} + - host: {{ .host | quote }} + http: + paths: + {{- range .paths }} + - path: {{ .path }} + {{- if and .pathType (semverCompare ">=1.18-0" $.Capabilities.KubeVersion.GitVersion) }} + pathType: {{ .pathType }} + {{- end }} + backend: + {{- if semverCompare ">=1.19-0" $.Capabilities.KubeVersion.GitVersion }} + service: + name: {{ $fullName }} + port: + number: {{ $svcPort }} + {{- else }} + serviceName: {{ $fullName }} + servicePort: {{ $svcPort }} + {{- end }} + {{- end }} + {{- end }} +{{- end }} diff --git a/helm-charts/FedML-Server/templates/service.yaml b/helm-charts/FedML-Server/templates/service.yaml new file mode 100644 index 000000000..676b29063 --- /dev/null +++ b/helm-charts/FedML-Server/templates/service.yaml @@ -0,0 +1,15 @@ +apiVersion: v1 +kind: Service +metadata: + name: {{ include "fedml-edge-server.fullname" . }} + labels: + {{- include "fedml-edge-server.labels" . | nindent 4 }} +spec: + type: {{ .Values.service.type }} + ports: + - port: {{ .Values.service.port }} + targetPort: http + protocol: TCP + name: http + selector: + {{- include "fedml-edge-server.selectorLabels" . | nindent 4 }} diff --git a/helm-charts/FedML-Server/templates/serviceaccount.yaml b/helm-charts/FedML-Server/templates/serviceaccount.yaml new file mode 100644 index 000000000..eadca78f6 --- /dev/null +++ b/helm-charts/FedML-Server/templates/serviceaccount.yaml @@ -0,0 +1,12 @@ +{{- if .Values.serviceAccount.create -}} +apiVersion: v1 +kind: ServiceAccount +metadata: + name: {{ include "fedml-edge-server.serviceAccountName" . 
}} + labels: + {{- include "fedml-edge-server.labels" . | nindent 4 }} + {{- with .Values.serviceAccount.annotations }} + annotations: + {{- toYaml . | nindent 4 }} + {{- end }} +{{- end }} diff --git a/helm-charts/FedML-Server/templates/tests/test-connection.yaml b/helm-charts/FedML-Server/templates/tests/test-connection.yaml new file mode 100644 index 000000000..2a35a226f --- /dev/null +++ b/helm-charts/FedML-Server/templates/tests/test-connection.yaml @@ -0,0 +1,15 @@ +apiVersion: v1 +kind: Pod +metadata: + name: "{{ include "fedml-edge-server.fullname" . }}-test-connection" + labels: + {{- include "fedml-edge-server.labels" . | nindent 4 }} + annotations: + "helm.sh/hook": test +spec: + containers: + - name: wget + image: busybox + command: ['wget'] + args: ['{{ include "fedml-edge-server.fullname" . }}:{{ .Values.service.port }}'] + restartPolicy: Never diff --git a/helm-charts/FedML-Server/values-template-example.yaml b/helm-charts/FedML-Server/values-template-example.yaml new file mode 100644 index 000000000..0433d2440 --- /dev/null +++ b/helm-charts/FedML-Server/values-template-example.yaml @@ -0,0 +1,62 @@ +name: edge-server-1 +namespace: fedml-edge-server-1 +chartName: fedml-server +chartVersion: 0.7.355 +# registry: "" +# pullPolicy: IfNotPresent +# imagePullSecrets: + # - name: myregistrykey +# ingressClassName: nginx +modules: + - server + +# ingress: + # server: + # annotations: {} + # hosts: + # - host: chart-example.local + # paths: + # - path: / + # pathType: ImplementationSpecific + # tls: + # - secretName: chart-example-tls + # hosts: + # - chart-example.local + +server: + fedmlAccountId: "1183" + # fedmlVersion: "release" + # fedmlServerOsName: "Linux" + # replicaCount: 1 + # volume: + # serverHomeDirHostPath: "/home/fedml-server" + # serverHomeDirMountPath: "/home/fedml/fedml-server" + # nameOverride: "" + # fullnameOverride: "" + # serviceAccount: + # create: true + # annotations: {} + # name: "" + # type: ClusterIP + # port: 9999 + # 
podAnnotations: + # nodeSelector: + # tolerations: + # affinity: + # resources: + # autoscaling: + # enabled: false + # minReplicas: 1 + # maxReplicas: 10 + # targetCPUUtilizationPercentage: 80 + # targetMemoryUtilizationPercentage: 80 + # podSecurityContext: {} + # fsGroup: 2000 + # securityContext: { } + # capabilities: + # drop: + # - ALL + # readOnlyRootFilesystem: true + # runAsNonRoot: true + # runAsUser: 1000 + diff --git a/helm-charts/FedML-Server/values-template.yaml b/helm-charts/FedML-Server/values-template.yaml new file mode 100644 index 000000000..559124015 --- /dev/null +++ b/helm-charts/FedML-Server/values-template.yaml @@ -0,0 +1,115 @@ +image: + repository: {{ .registry | default "public.ecr.aws/x6k8q1x9" }}/fedml-edge-server + pullPolicy: {{ .pullPolicy | default "IfNotPresent" }} + tag: "" + +{{- with .imagePullSecrets }} +imagePullSecrets: + {{- toYaml . | nindent 2 }} +{{- end }} + +{{- with .ingress }} +{{- if hasKey . "server" }} +ingress: + enabled: true + className: {{ $.ingressClassName }} + {{- with .server }} + {{- with .annotations }} + annotations: + {{- toYaml . | nindent 4 }} + {{- end }} + {{- with .hosts }} + hosts: + {{- toYaml . | nindent 4 }} + {{- end }} + {{- with .tls }} + tls: + {{- toYaml . | nindent 4 }} + {{- end }} + {{- end }} +{{- end }} +{{- end }} + +{{- with .server }} +replicaCount: {{ .replicaCount | default 1 }} +{{- if .nameOverride }} +nameOverride: {{ .nameOverride }} +{{- end }} +{{- if .fullnameOverride }} +fullnameOverride: {{ .fullnameOverride }} +{{- end }} + +{{- with .serviceAccount }} +serviceAccount: + create: {{ .create | default true }} + {{- with .annotations }} + annotations: + {{- toYaml . | nindent 4 }} + {{- end }} + name: {{ .name }} +{{- end }} + +{{- with .podAnnotations }} +podAnnotations: + {{- toYaml . | nindent 2 }} +{{- end }} + +{{- with .podSecurityContext }} +podSecurityContext: + {{- toYaml . | nindent 2 }} +{{- end }} + +{{- with .securityContext }} +securityContext: + {{- toYaml . 
| nindent 2 }}
+{{- end }}
+
+service:
+  type: {{ .type | default "ClusterIP" }}
+  port: {{ .port | default 9999 }}
+
+{{- with .resources }}
+resources:
+  {{- toYaml . | nindent 2 }}
+{{- end }}
+
+{{- with .autoscaling }}
+autoscaling:
+  enabled: false
+  minReplicas: {{ .minReplicas | default 1 }}
+  maxReplicas: {{ .maxReplicas | default 10 }}
+  {{- if .targetCPUUtilizationPercentage }}
+  targetCPUUtilizationPercentage: {{ .targetCPUUtilizationPercentage }}
+  {{- end }}
+  {{- if .targetMemoryUtilizationPercentage }}
+  targetMemoryUtilizationPercentage: {{ .targetMemoryUtilizationPercentage }}
+  {{- end }}
+{{- end }}
+
+{{- with .nodeSelector }}
+nodeSelector:
+  {{- toYaml . | nindent 2 }}
+{{- end }}
+
+{{- with .tolerations }}
+tolerations:
+  {{- toYaml . | nindent 2 }}
+{{- end }}
+
+{{- with .affinity }}
+affinity:
+  {{- toYaml . | nindent 2 }}
+{{- end }}
+
+env:
+  fedmlAccountId: {{ .fedmlAccountId }}
+  fedmlVersion: {{ .fedmlVersion | default "release" }}
+  fedmlServerOsName: {{ .fedmlServerOsName | default "Linux" }}
+
+{{- with .volume }}
+volume:
+  serverHomeDirHostPath: {{ .serverHomeDirHostPath | default "/home/fedml-server" }}
+  serverHomeDirMountPath: {{ .serverHomeDirMountPath | default "/home/fedml/fedml-server" }}
+{{- end }}
+
+{{- end -}}
diff --git a/helm-charts/FedML-Server/values.yaml b/helm-charts/FedML-Server/values.yaml
new file mode 100644
index 000000000..576bafae6
--- /dev/null
+++ b/helm-charts/FedML-Server/values.yaml
@@ -0,0 +1,91 @@
+# Default values for fedml-edge-server.
+# This is a YAML-formatted file.
+# Declare variables to be passed into your templates.
+
+replicaCount: 1
+
+image:
+  repository: public.ecr.aws/x6k8q1x9/fedml-edge-server
+  pullPolicy: IfNotPresent
+  # Overrides the image tag whose default is the chart appVersion.
+ tag: "" + +imagePullSecrets: [] +nameOverride: "" +fullnameOverride: "" + +serviceAccount: + # Specifies whether a service account should be created + create: true + # Annotations to add to the service account + annotations: {} + # The name of the service account to use. + # If not set and create is true, a name is generated using the fullname template + name: "" + +podAnnotations: {} + +podSecurityContext: {} + # fsGroup: 2000 + +securityContext: {} + # capabilities: + # drop: + # - ALL + # readOnlyRootFilesystem: true + # runAsNonRoot: true + # runAsUser: 1000 + +service: + type: ClusterIP + port: 9999 + +ingress: + enabled: false + className: "" + annotations: {} + # kubernetes.io/ingress.class: nginx + # kubernetes.io/tls-acme: "true" + hosts: + - host: chart-example.local + paths: + - path: / + pathType: ImplementationSpecific + tls: [] + # - secretName: chart-example-tls + # hosts: + # - chart-example.local + +resources: {} + # We usually recommend not to specify default resources and to leave this as a conscious + # choice for the user. This also increases chances charts run on environments with little + # resources, such as Minikube. If you do want to specify resources, uncomment the following + # lines, adjust them as necessary, and remove the curly braces after 'resources:'. 
+ # limits: + # cpu: 100m + # memory: 128Mi + # requests: + # cpu: 100m + # memory: 128Mi + +autoscaling: + enabled: false + minReplicas: 1 + maxReplicas: 10 + targetCPUUtilizationPercentage: 80 + # targetMemoryUtilizationPercentage: 80 + +nodeSelector: {} + +tolerations: [] + +affinity: {} + +env: + fedmlAccountId: "1183" + fedmlVersion: "release" + fedmlServerOsName: "Linux" + +volume: + serverHomeDirHostPath: "/home/fedml-server" + serverHomeDirMountPath: "/home/fedml/fedml-server" diff --git a/helm-charts/Makefile b/helm-charts/Makefile index 6a32dc6b4..3451b3eea 100644 --- a/helm-charts/Makefile +++ b/helm-charts/Makefile @@ -2,7 +2,9 @@ release: helm package ./FATE helm package ./FATE-Serving helm package ./FATE-Exchange + helm package ./UpgradeManager lint: helm lint ./FATE helm lint ./FATE-Serving - helm lint ./FATE-Exchange \ No newline at end of file + helm lint ./FATE-Exchange + helm lint ./UpgradeManager \ No newline at end of file diff --git a/helm-charts/UpgradeManager/values.yaml b/helm-charts/UpgradeManager/values.yaml index fda03b93a..b49d24dbb 100644 --- a/helm-charts/UpgradeManager/values.yaml +++ b/helm-charts/UpgradeManager/values.yaml @@ -1,4 +1,4 @@ username: fate password: fate_dev -start: 1.9.2 -target: 1.9.2 \ No newline at end of file +start: 1.10.0 +target: 1.10.0 \ No newline at end of file diff --git a/k8s-deploy/Dockerfile b/k8s-deploy/Dockerfile index 9f3777a01..4ef5ddd3a 100644 --- a/k8s-deploy/Dockerfile +++ b/k8s-deploy/Dockerfile @@ -1,3 +1,5 @@ +ARG ARCH=amd64 + FROM golang:1.17 as builder WORKDIR /workspace @@ -13,9 +15,10 @@ COPY docs/docs.go docs/docs.go COPY config.yaml config.yaml ARG LDFLAGS -RUN CGO_ENABLED=0 GOOS=linux GOARCH=amd64 GO111MODULE=on go build -a -ldflags '-s' -installsuffix cgo -o kubefate kubefate.go +ARG ARCH +RUN CGO_ENABLED=0 GOOS=linux GOARCH=${ARCH} GO111MODULE=on go build -a -ldflags '-s' -installsuffix cgo -o kubefate kubefate.go -FROM gcr.io/distroless/static:nonroot +FROM 
gcr.io/distroless/static:nonroot-${ARCH} WORKDIR / COPY --from=builder /workspace/kubefate . COPY --from=builder /workspace/config.yaml . diff --git a/k8s-deploy/Makefile b/k8s-deploy/Makefile index 0b65dd552..5e1312531 100644 --- a/k8s-deploy/Makefile +++ b/k8s-deploy/Makefile @@ -1,6 +1,8 @@ NAME ?= federatedai/kubefate VERSION ?= v1.4.5 IMG ?= ${NAME}:${VERSION} +ARCH ?= amd64 +GOOS ?= linux ifeq (,$(shell go env GOBIN)) GOBIN=$(shell go env GOPATH)/bin @@ -26,10 +28,10 @@ test: fmt vet # Build manager binary kubefate: fmt vet swag - CGO_ENABLED=0 go build -a --ldflags '-extldflags "-static"' -o ${OUTPUT_FILE} ${BUILD_MODE} kubefate.go + GOOS=${GOOS} GOARCH=${ARCH} CGO_ENABLED=0 go build -a --ldflags '-extldflags "-static"' -o ${OUTPUT_FILE} ${BUILD_MODE} kubefate.go kubefate-without-swag: fmt vet - CGO_ENABLED=0 go build -a --ldflags '-extldflags "-static"' -o ${OUTPUT_FILE} ${BUILD_MODE} kubefate.go + GOOS=${GOOS} GOARCH=${ARCH} CGO_ENABLED=0 go build -a --ldflags '-extldflags "-static"' -o ${OUTPUT_FILE} ${BUILD_MODE} kubefate.go run: fmt vet go run ./kubefate.go service @@ -43,16 +45,16 @@ uninstall: kubectl delete -f rbac-config.yaml docker-build: test - docker build . -t ${IMG} + docker build --build-arg ARCH=${ARCH} . -t ${IMG} docker-build-without-test: - docker build . -t ${IMG} + docker build --build-arg ARCH=${ARCH} . 
-t ${IMG} docker-push: docker push ${IMG} docker-save: docker-build-without-test - docker save -o kubefate-${VERSION}.docker ${IMG} + docker save -o kubefate-${VERSION}-${ARCH}.docker ${IMG} # Run go fmt against code fmt: @@ -68,12 +70,12 @@ swag: swag-bin package: kubefate-without-swag mkdir -p tmp/kubefate; cp -r bin/kubefate *.yaml examples tmp/kubefate; - tar -czvf kubefate-k8s-${RELEASE_VERSION}.tar.gz -C tmp/ kubefate; + tar -czvf kubefate-k8s-${RELEASE_VERSION}-${ARCH}.tar.gz -C tmp/ kubefate; rm -r tmp; release: package docker-save mkdir -p release; - mv kubefate-k8s-${RELEASE_VERSION}.tar.gz kubefate-${VERSION}.docker release/; + mv kubefate-k8s-${RELEASE_VERSION}-${ARCH}.tar.gz kubefate-${VERSION}-${ARCH}.docker release/; clean: rm -r release diff --git a/k8s-deploy/README.md b/k8s-deploy/README.md index 41dea4f2c..77293e88e 100644 --- a/k8s-deploy/README.md +++ b/k8s-deploy/README.md @@ -186,13 +186,13 @@ UUID 24bb75ff-f636-4c64-8c04-1b9073f89a2f Name fate-9999 NameSpace fate-9999 ChartName fate -ChartVersion v1.9.2 +ChartVersion v1.10.0 Revision 1 Age 15m Status Running Spec algorithm: Basic chartName: fate - chartVersion: v1.9.2 + chartVersion: v1.10.0 computing: Eggroll device: CPU federation: Eggroll diff --git a/k8s-deploy/README_zh.md b/k8s-deploy/README_zh.md index 4cb67601b..9486ae68c 100644 --- a/k8s-deploy/README_zh.md +++ b/k8s-deploy/README_zh.md @@ -185,13 +185,13 @@ UUID 24bb75ff-f636-4c64-8c04-1b9073f89a2f Name fate-9999 NameSpace fate-9999 ChartName fate -ChartVersion v1.9.2 +ChartVersion v1.10.0 Revision 1 Age 15m Status Running Spec algorithm: Basic chartName: fate - chartVersion: v1.9.2 + chartVersion: v1.10.0 computing: Eggroll device: CPU federation: Eggroll diff --git a/k8s-deploy/cluster-spark-pulsar.yaml b/k8s-deploy/cluster-spark-pulsar.yaml index 63e78317a..df9b1ad16 100644 --- a/k8s-deploy/cluster-spark-pulsar.yaml +++ b/k8s-deploy/cluster-spark-pulsar.yaml @@ -1,7 +1,7 @@ name: fate-9999 namespace: fate-9999 chartName: fate 
-chartVersion: v1.9.2 +chartVersion: v1.10.0 partyId: 9999 registry: "" pullPolicy: @@ -63,7 +63,6 @@ skippedKeys: # nodeSelector: # tolerations: # affinity: - # enabledNN: false # logLevel: INFO # existingClaim: "" # storageClass: "python" diff --git a/k8s-deploy/cluster-spark-rabbitmq.yaml b/k8s-deploy/cluster-spark-rabbitmq.yaml index 71e89e0c4..b4037cb12 100644 --- a/k8s-deploy/cluster-spark-rabbitmq.yaml +++ b/k8s-deploy/cluster-spark-rabbitmq.yaml @@ -1,7 +1,7 @@ name: fate-9999 namespace: fate-9999 chartName: fate -chartVersion: v1.9.2 +chartVersion: v1.10.0 partyId: 9999 registry: "" pullPolicy: diff --git a/k8s-deploy/cluster-spark-slim.yaml b/k8s-deploy/cluster-spark-slim.yaml index 4c988537f..653c78065 100644 --- a/k8s-deploy/cluster-spark-slim.yaml +++ b/k8s-deploy/cluster-spark-slim.yaml @@ -1,7 +1,7 @@ name: fate-9999 namespace: fate-9999 chartName: fate -chartVersion: v1.9.2 +chartVersion: v1.10.0 partyId: 9999 registry: "" pullPolicy: diff --git a/k8s-deploy/cluster.yaml b/k8s-deploy/cluster.yaml index 5106df8ee..e3858d754 100644 --- a/k8s-deploy/cluster.yaml +++ b/k8s-deploy/cluster.yaml @@ -1,7 +1,7 @@ name: fate-9999 namespace: fate-9999 chartName: fate -chartVersion: v1.9.2 +chartVersion: v1.10.0 partyId: 9999 registry: "" pullPolicy: @@ -129,7 +129,6 @@ skippedKeys: # nodeSelector: # tolerations: # affinity: - # enabledNN: false # logLevel: INFO # existingClaim: "" # storageClass: "python" diff --git a/k8s-deploy/examples/party-10000/cluster-spark-local-pulsar.yaml b/k8s-deploy/examples/party-10000/cluster-spark-local-pulsar.yaml index 411459038..df69d5caf 100644 --- a/k8s-deploy/examples/party-10000/cluster-spark-local-pulsar.yaml +++ b/k8s-deploy/examples/party-10000/cluster-spark-local-pulsar.yaml @@ -1,7 +1,7 @@ name: fate-10000 namespace: fate-10000 chartName: fate -chartVersion: v1.9.2 +chartVersion: v1.10.0 partyId: 10000 registry: "" pullPolicy: diff --git a/k8s-deploy/examples/party-10000/cluster-spark-pulsar.yaml 
b/k8s-deploy/examples/party-10000/cluster-spark-pulsar.yaml index b8a2e0b20..e798aadbe 100644 --- a/k8s-deploy/examples/party-10000/cluster-spark-pulsar.yaml +++ b/k8s-deploy/examples/party-10000/cluster-spark-pulsar.yaml @@ -1,7 +1,7 @@ name: fate-10000 namespace: fate-10000 chartName: fate -chartVersion: v1.9.2 +chartVersion: v1.10.0 partyId: 10000 registry: "" pullPolicy: diff --git a/k8s-deploy/examples/party-10000/cluster-spark-rabbitmq.yaml b/k8s-deploy/examples/party-10000/cluster-spark-rabbitmq.yaml index 0c9a87443..d1d4028a8 100644 --- a/k8s-deploy/examples/party-10000/cluster-spark-rabbitmq.yaml +++ b/k8s-deploy/examples/party-10000/cluster-spark-rabbitmq.yaml @@ -1,7 +1,7 @@ name: fate-10000 namespace: fate-10000 chartName: fate -chartVersion: v1.9.2 +chartVersion: v1.10.0 partyId: 10000 registry: "" pullPolicy: diff --git a/k8s-deploy/examples/party-10000/cluster.yaml b/k8s-deploy/examples/party-10000/cluster.yaml index 6f4380d6d..fc90e7033 100644 --- a/k8s-deploy/examples/party-10000/cluster.yaml +++ b/k8s-deploy/examples/party-10000/cluster.yaml @@ -1,7 +1,7 @@ name: fate-10000 namespace: fate-10000 chartName: fate -chartVersion: v1.9.2 +chartVersion: v1.10.0 partyId: 10000 registry: "" pullPolicy: diff --git a/k8s-deploy/examples/party-9999/cluster-spark-local-pulsar.yaml b/k8s-deploy/examples/party-9999/cluster-spark-local-pulsar.yaml index 29736bfec..8d83d6099 100644 --- a/k8s-deploy/examples/party-9999/cluster-spark-local-pulsar.yaml +++ b/k8s-deploy/examples/party-9999/cluster-spark-local-pulsar.yaml @@ -1,7 +1,7 @@ name: fate-9999 namespace: fate-9999 chartName: fate -chartVersion: v1.9.2 +chartVersion: v1.10.0 partyId: 9999 registry: "" pullPolicy: diff --git a/k8s-deploy/examples/party-9999/cluster-spark-pulsar.yaml b/k8s-deploy/examples/party-9999/cluster-spark-pulsar.yaml index 101525d0d..4c4912b76 100644 --- a/k8s-deploy/examples/party-9999/cluster-spark-pulsar.yaml +++ b/k8s-deploy/examples/party-9999/cluster-spark-pulsar.yaml @@ -1,7 +1,7 
@@ name: fate-9999 namespace: fate-9999 chartName: fate -chartVersion: v1.9.2 +chartVersion: v1.10.0 partyId: 9999 registry: "" pullPolicy: diff --git a/k8s-deploy/examples/party-9999/cluster-spark-rabbitmq.yaml b/k8s-deploy/examples/party-9999/cluster-spark-rabbitmq.yaml index 60f75c1ac..a744eee42 100644 --- a/k8s-deploy/examples/party-9999/cluster-spark-rabbitmq.yaml +++ b/k8s-deploy/examples/party-9999/cluster-spark-rabbitmq.yaml @@ -1,7 +1,7 @@ name: fate-9999 namespace: fate-9999 chartName: fate -chartVersion: v1.9.2 +chartVersion: v1.10.0 partyId: 9999 registry: "" pullPolicy: diff --git a/k8s-deploy/examples/party-9999/cluster.yaml b/k8s-deploy/examples/party-9999/cluster.yaml index 1e2e14727..69f6fc00b 100644 --- a/k8s-deploy/examples/party-9999/cluster.yaml +++ b/k8s-deploy/examples/party-9999/cluster.yaml @@ -1,7 +1,7 @@ name: fate-9999 namespace: fate-9999 chartName: fate -chartVersion: v1.9.2 +chartVersion: v1.10.0 partyId: 9999 registry: "" pullPolicy: diff --git a/k8s-deploy/examples/party-exchange/rollsite.yaml b/k8s-deploy/examples/party-exchange/rollsite.yaml index ae73dcec7..d67a60fb8 100644 --- a/k8s-deploy/examples/party-exchange/rollsite.yaml +++ b/k8s-deploy/examples/party-exchange/rollsite.yaml @@ -1,7 +1,7 @@ name: fate-exchange namespace: fate-exchange chartName: fate-exchange -chartVersion: v1.9.2 +chartVersion: v1.10.0 partyId: 1 registry: "" pullPolicy: diff --git a/k8s-deploy/examples/party-exchange/trafficServer.yaml b/k8s-deploy/examples/party-exchange/trafficServer.yaml index 8da89524c..c497cd02e 100644 --- a/k8s-deploy/examples/party-exchange/trafficServer.yaml +++ b/k8s-deploy/examples/party-exchange/trafficServer.yaml @@ -1,7 +1,7 @@ name: fate-exchange namespace: fate-exchange chartName: fate-exchange -chartVersion: v1.9.2 +chartVersion: v1.10.0 partyId: 1 registry: "" pullPolicy: diff --git a/k8s-deploy/examples/party.config b/k8s-deploy/examples/party.config index 0a54025ea..89fc3c3ec 100644 --- 
a/k8s-deploy/examples/party.config +++ b/k8s-deploy/examples/party.config @@ -1,5 +1,5 @@ -fate_chartVersion=v1.9.2 -fate_imageTAG=1.9.2-release +fate_chartVersion=v1.10.0 +fate_imageTAG=1.10.0-release fate_serving_chartVersion=v2.1.6 fate_serving_imageTAG=2.1.6-release party_9999_IP=192.168.9.1 diff --git a/k8s-deploy/pkg/job/cluster_install.go b/k8s-deploy/pkg/job/cluster_install.go index 50f4bd5a8..63d2c5e15 100644 --- a/k8s-deploy/pkg/job/cluster_install.go +++ b/k8s-deploy/pkg/job/cluster_install.go @@ -321,17 +321,19 @@ func updateLastJobEvent(job *modules.Job, Event string) { log.Error().Err(dbErr).Msg("job.SetStatus error") } } + +// checkStatus returns a bool to indicate if the job is finished successfully func checkStatus(job *modules.Job, cluster *modules.Cluster) bool { // update subJobs - ClusterStatus, err := service.GetClusterDeployStatus(cluster.Name, cluster.NameSpace) + clusterComponentStatus, err := getClusterComponentsStatus(cluster.Name, cluster.NameSpace) if err != nil { - log.Error().Err(err).Msg("GetClusterDeployStatus error") + log.Error().Err(err).Msg("getClusterComponentsStatus error") return false } - log.Debug().Interface("ClusterStatus", ClusterStatus).Msg("GetClusterDeployStatus()") - subJobs := generateSubJobs(job, ClusterStatus) + log.Debug().Interface("ClusterStatus", clusterComponentStatus).Msg("the cluster component status") + subJobs := generateSubJobs(job, clusterComponentStatus) dbErr := job.SetSubJobs(subJobs) if dbErr != nil { @@ -339,7 +341,7 @@ func checkStatus(job *modules.Job, cluster *modules.Cluster) bool { return false } - if service.CheckClusterStatus(ClusterStatus) { + if service.CheckClusterStatus(clusterComponentStatus) { dbErr := job.SetStatus(modules.JobStatusSuccess) if dbErr != nil { log.Error().Err(dbErr).Msg("job setStatus error") diff --git a/k8s-deploy/pkg/job/fate_upgrade_manager.go b/k8s-deploy/pkg/job/fate_upgrade_manager.go index 04fa7f26d..39cd4a0cb 100644 --- 
a/k8s-deploy/pkg/job/fate_upgrade_manager.go +++ b/k8s-deploy/pkg/job/fate_upgrade_manager.go @@ -105,21 +105,21 @@ func getMysqlCredFromSpec(clusterSpec modules.MapStringInterface) (username, pas defaultUsername := "fate" defaultPassword := "fate_dev" if clusterSpec["mysql"] == nil { - return defaultUsername, defaultPassword + return defaultUsername, defaultPassword } mysqlSpec := clusterSpec["mysql"].(map[string]interface{}) if mysqlSpec["user"] == nil { - username = defaultUsername + username = defaultUsername } else { - username = mysqlSpec["user"].(string) + username = mysqlSpec["user"].(string) } if mysqlSpec["password"] == nil { - password = defaultPassword + password = defaultPassword } else { - password = mysqlSpec["password"].(string) + password = mysqlSpec["password"].(string) } return - } +} func constructFumSpec(oldSpec, newSpec modules.MapStringInterface) (fumSpec modules.MapStringInterface) { oldVersion := strings.ReplaceAll(oldSpec["chartVersion"].(string), "v", "") diff --git a/k8s-deploy/pkg/job/job.go b/k8s-deploy/pkg/job/job.go index 2c07596c8..bc329f364 100644 --- a/k8s-deploy/pkg/job/job.go +++ b/k8s-deploy/pkg/job/job.go @@ -41,57 +41,32 @@ func stopJob(job *modules.Job, cluster *modules.Cluster) bool { return false } -func generateSubJobs_b(job *modules.Job, ClusterStatus map[string]string) modules.SubJobs { - - subJobs := make(modules.SubJobs) - if job.SubJobs != nil { - subJobs = job.SubJobs +func getClusterComponentsStatus(clusterName, clusterNamespace string) (map[string]string, error) { + deploymentStatus, err := service.GetClusterDeployStatus(clusterName, clusterNamespace) + if err != nil { + log.Error().Err(err).Msg("GetClusterDeployStatus error") + return deploymentStatus, err } - - for k, v := range ClusterStatus { - var subJobStatus string - if v == "Running" { - subJobStatus = "Success" - } else if v == "Failed" || v == "Unknown" || v == "Pending" { - subJobStatus = v - } else { - subJobStatus = "Running" - } - - var subJob 
modules.SubJob - if _, ok := subJobs[k]; !ok { - subJob = modules.SubJob{ - ModuleName: k, - Status: subJobStatus, - ModulesStatus: v, - StartTime: job.StartTime, - } - } else { - subJob = subJobs[k] - subJob.Status = subJobStatus - subJob.ModulesStatus = v - } - - if subJobStatus == "Success" && subJob.EndTime.IsZero() { - subJob.EndTime = time.Now() - } - - subJobs[k] = subJob - log.Debug().Interface("subJob", subJob).Msg("generate SubJobs") + stsStatus, err := service.GetClusterStsStatus(clusterName, clusterNamespace) + if err != nil { + log.Error().Err(err).Msg("GetClusterStsStatus error") + return deploymentStatus, err } - - job.SubJobs = subJobs - return subJobs + for k, v := range stsStatus { + deploymentStatus[k] = v + } + return deploymentStatus, nil } -func generateSubJobs(job *modules.Job, ClusterDeployStatus map[string]string) modules.SubJobs { +func generateSubJobs(job *modules.Job, clusterComponentStatus map[string]string) modules.SubJobs { subJobs := make(modules.SubJobs) if job.SubJobs != nil { subJobs = job.SubJobs } - for k, v := range ClusterDeployStatus { + // The cluster component status includes deployments and statefulSets + for k, v := range clusterComponentStatus { var subJobStatus string = "Running" if service.CheckStatus(v) { subJobStatus = "Success" @@ -154,6 +129,9 @@ func ClusterUpdate(clusterArgs *modules.ClusterArgs, creator string) (*modules.J um = &FateUpgradeManager{ namespace: clusterArgs.Namespace, } + default: + um = &FallbackUpgradeManager{} + log.Info().Msgf("no upgrade manager is available for %s", cluster.Name) } err = um.validate(specOld, specNew) if err != nil { @@ -182,9 +160,8 @@ func ClusterUpdate(clusterArgs *modules.ClusterArgs, creator string) (*modules.J if dbErr != nil { log.Error().Err(dbErr).Msg("Cluster.SetStatus error") } - - if specOld["chartVersion"].(string) != specNew["chartVersion"].(string) { - umCluster := um.getCluster(specOld, specNew) + umCluster := um.getCluster(specOld, specNew) + if umCluster.Name 
!= "fallbackUM" && specOld["chartVersion"].(string) != specNew["chartVersion"].(string) { // We will implicitly install a new cluster for the upgrade manager, and delete it after it finishes its job err := umCluster.HelmInstall() if err != nil { @@ -281,19 +258,19 @@ func ClusterUpdate(clusterArgs *modules.ClusterArgs, creator string) (*modules.J } // update subJobs - ClusterStatus, err := service.GetClusterDeployStatus(clusterArgs.Name, clusterArgs.Namespace) + clusterComponentStatus, err := getClusterComponentsStatus(clusterArgs.Name, clusterArgs.Namespace) if err != nil { log.Error().Err(err).Msg("GetClusterDeployStatus error") } - subJobs := generateSubJobs(job, ClusterStatus) + subJobs := generateSubJobs(job, clusterComponentStatus) dbErr = job.SetSubJobs(subJobs) if dbErr != nil { log.Error().Err(dbErr).Msg("job.SetSubJobs error") } - if service.CheckClusterStatus(ClusterStatus) { + if service.CheckClusterStatus(clusterComponentStatus) { dbErr := job.SetStatus(modules.JobStatusSuccess) if dbErr != nil { log.Error().Err(dbErr).Msg("job.SetStatus error") @@ -380,21 +357,21 @@ func ClusterUpdate(clusterArgs *modules.ClusterArgs, creator string) (*modules.J } // update subJobs - ClusterStatus, err := service.GetClusterDeployStatus(clusterArgs.Name, clusterArgs.Namespace) + clusterComponentStatus, err := getClusterComponentsStatus(clusterArgs.Name, clusterArgs.Namespace) if err != nil { - log.Error().Err(err).Msg("GetClusterDeployStatus error") + log.Error().Err(err).Msg("clusterComponentStatus error") } - log.Debug().Interface("ClusterStatus", ClusterStatus).Msg("GetClusterDeployStatus()") + log.Debug().Interface("clusterComponentStatus", clusterComponentStatus).Msg("clusterComponentStatus()") - subJobs := generateSubJobs(job, ClusterStatus) + subJobs := generateSubJobs(job, clusterComponentStatus) dbErr = job.SetSubJobs(subJobs) if dbErr != nil { log.Error().Err(dbErr).Msg("job.SetSubJobs error") } - if service.CheckClusterStatus(ClusterStatus) { + if 
service.CheckClusterStatus(clusterComponentStatus) { dbErr := job.SetStatus(modules.JobStatusSuccess) if dbErr != nil { log.Error().Err(dbErr).Msg("job.SetStatus error") diff --git a/k8s-deploy/pkg/job/upgrade_manager.go b/k8s-deploy/pkg/job/upgrade_manager.go index 9611b39da..fbd75b58b 100644 --- a/k8s-deploy/pkg/job/upgrade_manager.go +++ b/k8s-deploy/pkg/job/upgrade_manager.go @@ -22,3 +22,21 @@ type UpgradeManager interface { getCluster(specOld, specNew modules.MapStringInterface) modules.Cluster waitFinish(interval, round int) bool } + +type FallbackUpgradeManager struct { + UpgradeManager +} + +func (um *FallbackUpgradeManager) validate(specold, specNew modules.MapStringInterface) error { + return nil +} + +func (um *FallbackUpgradeManager) getCluster(specold, specNew modules.MapStringInterface) modules.Cluster { + return modules.Cluster{ + Name: "fallbackUM", + } +} + +func (um *FallbackUpgradeManager) waitFinish(interval, round int) bool { + return true +} diff --git a/k8s-deploy/pkg/modules/cluster_kube.go b/k8s-deploy/pkg/modules/cluster_kube.go deleted file mode 100644 index 41d63115c..000000000 --- a/k8s-deploy/pkg/modules/cluster_kube.go +++ /dev/null @@ -1,48 +0,0 @@ -/* - * Copyright 2019-2021 VMware, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * http://www.apache.org/licenses/LICENSE-2.0 - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- * - */ - -package modules - -import ( - "github.com/FederatedAI/KubeFATE/k8s-deploy/pkg/service" - "github.com/rs/zerolog/log" -) - -func (e *Cluster) GetClusterStatus() (map[string]map[string]string, error) { - - ClusterPodStatus, err := service.GetClusterPodStatus(e.Name, e.NameSpace) - if err != nil { - log.Error().Err(err).Msg("GetClusterPodStatus error") - return nil, err - } - - //ClusterServiceStatus, err := service.GetClusterServiceStatus(e.Name, e.NameSpace) - //if err != nil { - // log.Error().Err(err).Msg("GetClusterServiceStatus error") - // return nil, err - //} - // - //ClusterIngressStatus, err := service.GetClusterIngressStatus(e.Name, e.NameSpace) - //if err != nil { - // log.Error().Err(err).Msg("ClusterIngressStatus error") - // return nil, err - //} - - return map[string]map[string]string{ - "modules": ClusterPodStatus, - //"service": ClusterServiceStatus, - //"ingress": ClusterIngressStatus, - }, nil -} diff --git a/k8s-deploy/pkg/service/info.go b/k8s-deploy/pkg/service/info.go index 83d6f7b81..63936a005 100644 --- a/k8s-deploy/pkg/service/info.go +++ b/k8s-deploy/pkg/service/info.go @@ -1,5 +1,5 @@ /* - * Copyright 2019-2021 VMware, Inc. + * Copyright 2019-2022 VMware, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -30,22 +30,29 @@ func GetClusterInfo(name, namespace string) (map[string]interface{}, error) { return nil, err } - containerList, err := GetPodContainersStatus(name, getDefaultNamespace(namespace)) + containerStatus, err := GetPodContainersStatus(name, getDefaultNamespace(namespace)) if err != nil { log.Error().Str("func", "GetPodContainersStatus()").Err(err).Msg("GetPodContainersStatus error") return nil, err } - deploymentList, err := GetClusterDeployStatus(name, getDefaultNamespace(namespace)) + deploymentStatus, err := GetClusterDeployStatus(name, getDefaultNamespace(namespace)) if err != nil { log.Error().Str("func", "GetClusterDeployStatus()").Err(err).Msg("GetClusterDeployStatus error") return nil, err } + stsStatus, err := GetClusterStsStatus(name, getDefaultNamespace(namespace)) + if err != nil { + log.Error().Str("func", "GetClusterStsStatus()").Err(err).Msg("GetClusterStsStatus error") + return nil, err + } + status := make(map[string]interface{}) - status["containers"] = containerList - status["deployments"] = deploymentList + status["containers"] = containerStatus + status["deployments"] = deploymentStatus + status["statefulSets"] = stsStatus ingressURLList, err := GetIngressURLList(name, getDefaultNamespace(namespace)) if err != nil { @@ -71,11 +78,6 @@ func GetClusterInfo(name, namespace string) (map[string]interface{}, error) { return info, nil } -//GetClusterStatus GetClusterStatus -func GetClusterStatus(name, namespace string) (map[string]string, error) { - return GetClusterDeployStatus(name, namespace) -} - // CheckClusterStatus CheckClusterStatus func CheckClusterStatus(ClusterStatus map[string]string) bool { if len(ClusterStatus) == 0 { diff --git a/k8s-deploy/pkg/service/kube.go b/k8s-deploy/pkg/service/kube.go index 3d66dac78..10335228d 100644 --- a/k8s-deploy/pkg/service/kube.go +++ b/k8s-deploy/pkg/service/kube.go @@ -35,6 +35,7 @@ type kubeClient interface { kube.Log kube.Deployment kube.Job + kube.Sts } var KubeClient kubeClient = 
&kube.KUBE diff --git a/k8s-deploy/pkg/service/kube/deployment.go b/k8s-deploy/pkg/service/kube/deployment.go index 89984408d..c0e31e39c 100644 --- a/k8s-deploy/pkg/service/kube/deployment.go +++ b/k8s-deploy/pkg/service/kube/deployment.go @@ -1,5 +1,5 @@ /* - * Copyright 2019-2021 VMware, Inc. + * Copyright 2019-2022 VMware, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/k8s-deploy/pkg/service/kube/sts.go b/k8s-deploy/pkg/service/kube/sts.go new file mode 100644 index 000000000..e38bfa748 --- /dev/null +++ b/k8s-deploy/pkg/service/kube/sts.go @@ -0,0 +1,41 @@ +/* + * Copyright 2019-2022 VMware, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * 
+ */
+
+package kube
+
+import (
+	"context"
+
+	v1 "k8s.io/api/apps/v1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+)
+
+// Sts StatefulSet
+type Sts interface {
+	GetSts(namespace, stsName string) (*v1.StatefulSet, error)
+	GetStsList(namespace, LabelSelector string) (*v1.StatefulSetList, error)
+}
+
+// GetSts gets a StatefulSet
+func (e *Kube) GetSts(namespace, stsName string) (*v1.StatefulSet, error) {
+	sts, err := e.client.AppsV1().StatefulSets(namespace).Get(context.Background(), stsName, metav1.GetOptions{})
+	return sts, err
+}
+
+// GetStsList gets a StatefulSet list
+func (e *Kube) GetStsList(namespace, LabelSelector string) (*v1.StatefulSetList, error) {
+	stsList, err := e.client.AppsV1().StatefulSets(namespace).List(context.Background(), metav1.ListOptions{LabelSelector: LabelSelector})
+	return stsList, err
+}
diff --git a/k8s-deploy/pkg/service/kube_deploy.go b/k8s-deploy/pkg/service/kube_deploy.go
index 321836fc1..f435dcbdc 100644
--- a/k8s-deploy/pkg/service/kube_deploy.go
+++ b/k8s-deploy/pkg/service/kube_deploy.go
@@ -34,43 +34,6 @@ func GetDeployList(clusterName, namespace string) (*v1.DeploymentList, error) {
 	return list, nil
 }
 
-// GetDeploy GetDeploy
-func GetDeploy(deploymentName, namespace string) (*v1.Deployment, error) {
-
-	deploy, err := KubeClient.GetDeployment(namespace, deploymentName)
-	if err != nil {
-		return nil, err
-	}
-
-	return deploy, nil
-}
-
-// CheckDeploy CheckDeploy
-func CheckDeploy(deploy *v1.Deployment) bool {
-	if deploy == nil {
-		return false
-	}
-	for _, v := range deploy.Status.Conditions {
-		if v.Type == v1.DeploymentAvailable && v.Status == corev1.ConditionTrue {
-			return true
-		}
-	}
-	return false
-}
-
-// CheckDeploys CheckDeploys
-func CheckDeploys(deploys *v1.DeploymentList) bool {
-	if deploys == nil || len(deploys.Items) == 0 {
-		return false
-	}
-	for _, v := range deploys.Items {
-		if !CheckDeploy(&v) {
-			return false
-		}
-	}
-	return true
-}
-
 // GetDeployStatus GetDeployStatus
 func
GetDeployStatus(deploy *v1.Deployment) (string, string) { @@ -92,16 +55,6 @@ func GetDeployStatus(deploy *v1.Deployment) (string, string) { return "Undefined", fmt.Sprintf("please use kubectl cli check deploy status of %s", deploy.Name) } -//GetDeploymentStatus GetDeploymentStatus -func GetDeploymentStatusInfo(deploys *v1.DeploymentList) (map[string]string, error) { - status := make(map[string]string) - for _, v := range deploys.Items { - Type, message := GetDeployStatus(&v) - status[v.Name] = fmt.Sprintf("%s, %s", Type, message) - } - return status, nil -} - func GetDeploymentStatus(deploys *v1.DeploymentList) (map[string]string, error) { status := make(map[string]string) for _, v := range deploys.Items { diff --git a/k8s-deploy/pkg/service/kube_deploy_test.go b/k8s-deploy/pkg/service/kube_deploy_test.go deleted file mode 100644 index 148006d30..000000000 --- a/k8s-deploy/pkg/service/kube_deploy_test.go +++ /dev/null @@ -1,193 +0,0 @@ -/* - * Copyright 2019-2021 VMware, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * http://www.apache.org/licenses/LICENSE-2.0 - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- * - */ - -package service - -import ( - "testing" - - v1 "k8s.io/api/apps/v1" - corev1 "k8s.io/api/core/v1" -) - -func TestCheckDeploy(t *testing.T) { - type args struct { - deploy *v1.Deployment - } - tests := []struct { - name string - args args - want bool - }{ - { - name: "True", - args: args{ - deploy: &v1.Deployment{ - Status: v1.DeploymentStatus{ - Conditions: []v1.DeploymentCondition{ - { - Type: v1.DeploymentAvailable, - Status: corev1.ConditionTrue, - }, - }, - }, - }, - }, - want: true, - }, - { - name: "nil", - args: args{ - deploy: &v1.Deployment{}, - }, - want: false, - }, - { - name: "Zero", - args: args{ - deploy: &v1.Deployment{}, - }, - want: false, - }, - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - if got := CheckDeploy(tt.args.deploy); got != tt.want { - t.Errorf("CheckDeploy() = %v, want %v", got, tt.want) - } - }) - } -} - -func TestCheckDeploys(t *testing.T) { - type args struct { - deploys *v1.DeploymentList - } - tests := []struct { - name string - args args - want bool - }{ - { - name: "nil", - args: args{}, - want: false, - }, - { - name: "count-0", - args: args{ - deploys: &v1.DeploymentList{}, - }, - want: false, - }, - { - name: "one-false", - args: args{ - deploys: &v1.DeploymentList{ - Items: []v1.Deployment{ - { - Status: v1.DeploymentStatus{ - Conditions: []v1.DeploymentCondition{ - { - Type: v1.DeploymentAvailable, - Status: corev1.ConditionTrue, - }, - }, - }, - }, - { - Status: v1.DeploymentStatus{ - Conditions: []v1.DeploymentCondition{ - { - Type: v1.DeploymentAvailable, - Status: corev1.ConditionFalse, - }, - }, - }, - }, - }, - }, - }, - want: false, - }, - { - name: "one-not-Available", - args: args{ - deploys: &v1.DeploymentList{ - Items: []v1.Deployment{ - { - Status: v1.DeploymentStatus{ - Conditions: []v1.DeploymentCondition{ - { - Type: v1.DeploymentAvailable, - Status: corev1.ConditionTrue, - }, - }, - }, - }, - { - Status: v1.DeploymentStatus{ - Conditions: []v1.DeploymentCondition{ - { - 
Type: v1.DeploymentProgressing, - Status: corev1.ConditionTrue, - }, - }, - }, - }, - }, - }, - }, - want: false, - }, - { - name: "all-Available", - args: args{ - deploys: &v1.DeploymentList{ - Items: []v1.Deployment{ - { - Status: v1.DeploymentStatus{ - Conditions: []v1.DeploymentCondition{ - { - Type: v1.DeploymentAvailable, - Status: corev1.ConditionTrue, - }, - }, - }, - }, - { - Status: v1.DeploymentStatus{ - Conditions: []v1.DeploymentCondition{ - { - Type: v1.DeploymentAvailable, - Status: corev1.ConditionTrue, - }, - }, - }, - }, - }, - }, - }, - want: true, - }, - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - if got := CheckDeploys(tt.args.deploys); got != tt.want { - t.Errorf("CheckDeploys() = %v, want %v", got, tt.want) - } - }) - } -} diff --git a/k8s-deploy/pkg/service/kube_sts.go b/k8s-deploy/pkg/service/kube_sts.go new file mode 100644 index 000000000..caeb2cb47 --- /dev/null +++ b/k8s-deploy/pkg/service/kube_sts.go @@ -0,0 +1,61 @@ +/* + * Copyright 2019-2021 VMware, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * 
+ */
+
+package service
+
+import (
+	"fmt"
+
+	v1 "k8s.io/api/apps/v1"
+)
+
+// GetStsList gets the statefulSets list under the namespace
+func GetStsList(clusterName, namespace string) (*v1.StatefulSetList, error) {
+
+	list, err := KubeClient.GetStsList(namespace, getLabelSelector(namespace, clusterName))
+	if err != nil {
+		return nil, err
+	}
+
+	return list, nil
+}
+
+// GetStsStatus gets the status of a certain statefulSet
+func GetStsStatus(sts *v1.StatefulSet) (string, string) {
+	if sts.Status.ReadyReplicas >= sts.Status.Replicas {
+		return "Available", "all the replicas are in the ready state"
+	} else {
+		return "Progressing", "Detailed status need to be checked by kubectl CLI"
+	}
+}
+
+// GetStssStatus gets the status of a list of statefulSets
+func GetStssStatus(stss *v1.StatefulSetList) (map[string]string, error) {
+	status := make(map[string]string)
+	for _, v := range stss.Items {
+		Type, _ := GetStsStatus(&v)
+		status[v.Name] = fmt.Sprintf("%s", Type)
+	}
+	return status, nil
+}
+
+// GetClusterStsStatus gets all the statefulSet related information with the cluster name and namespace
+func GetClusterStsStatus(name, namespace string) (map[string]string, error) {
+	stsList, err := GetStsList(name, namespace)
+	if err != nil {
+		return nil, err
+	}
+	return GetStssStatus(stsList)
+}
diff --git a/k8s-deploy/rbac-config.yaml b/k8s-deploy/rbac-config.yaml
index 8aef0a393..ecd26bc76 100644
--- a/k8s-deploy/rbac-config.yaml
+++ b/k8s-deploy/rbac-config.yaml
@@ -36,24 +36,6 @@ stringData:
   mariadbUsername: kubefate
   mariadbPassword: kubefate
 ---
-apiVersion: policy/v1beta1
-kind: PodSecurityPolicy
-metadata:
-  name: kubefate-psp
-  namespace: kube-fate
-spec:
-  privileged: false
-  seLinux:
-    rule: RunAsAny
-  supplementalGroups:
-    rule: RunAsAny
-  runAsUser:
-    rule: RunAsAny
-  fsGroup:
-    rule: RunAsAny
-  volumes:
-  - '*'
----
 apiVersion: rbac.authorization.k8s.io/v1
 kind: ClusterRole
 metadata:
@@ -121,17 +103,6 @@ rules:
   - delete
   - update
   - patch
-
apiGroups: - - policy - resources: - - podsecuritypolicies - verbs: - - get - - use - - create - - delete - - update - - patch - apiGroups: - rbac.authorization.k8s.io resources: