diff --git a/.github/workflows/codesee-arch-diagram.yml b/.github/workflows/codesee-arch-diagram.yml new file mode 100644 index 000000000..806d41d12 --- /dev/null +++ b/.github/workflows/codesee-arch-diagram.yml @@ -0,0 +1,23 @@ +# This workflow was added by CodeSee. Learn more at https://codesee.io/ +# This is v2.0 of this workflow file +on: + push: + branches: + - main + pull_request_target: + types: [opened, synchronize, reopened] + +name: CodeSee + +permissions: read-all + +jobs: + codesee: + runs-on: ubuntu-latest + continue-on-error: true + name: Analyze the repo with CodeSee + steps: + - uses: Codesee-io/codesee-action@v2 + with: + codesee-token: ${{ secrets.CODESEE_ARCH_DIAG_API_TOKEN }} + codesee-url: https://app.codesee.io diff --git a/api/api/version_endpoints_api.go b/api/api/version_endpoints_api.go index 531848794..98022671b 100644 --- a/api/api/version_endpoints_api.go +++ b/api/api/version_endpoints_api.go @@ -392,9 +392,13 @@ func validateUpdateRequest(prev *models.VersionEndpoint, new *models.VersionEndp return fmt.Errorf("Updating environment is not allowed, previous: %s, new: %s", prev.EnvironmentName, new.EnvironmentName) } + if prev.Status == models.EndpointPending { + return fmt.Errorf("Updating endpoint status to %s is not allowed when the endpoint is currently in the pending state", new.Status) + } + if new.Status != prev.Status { if prev.Status == models.EndpointServing { - return fmt.Errorf("Updating endpoint status to %s is not allowed when the endpoint is in serving state", new.Status) + return fmt.Errorf("Updating endpoint status to %s is not allowed when the endpoint is currently in the serving state", new.Status) } if new.Status != models.EndpointRunning && new.Status != models.EndpointTerminated { diff --git a/api/api/version_endpoints_api_test.go b/api/api/version_endpoints_api_test.go index 01088c9bf..e61dffade 100644 --- a/api/api/version_endpoints_api_test.go +++ b/api/api/version_endpoints_api_test.go @@ -3394,7 +3394,7 @@ func TestUpdateEndpoint(t *testing.T) { ID: uuid, VersionID: models.ID(1), VersionModelID: models.ID(1), - Status: models.EndpointPending, + Status: models.EndpointRunning, ServiceName: "sample", InferenceServiceName: "sample", Namespace: "sample", @@ -3648,7 +3648,120 @@ func TestUpdateEndpoint(t *testing.T) { }, expected: &Response{ code: http.StatusBadRequest, - data: Error{Message: "Error validating request: Updating endpoint status to running is not allowed when the endpoint is in serving state"}, + data: Error{Message: "Error validating request: Updating endpoint status to running is not allowed when the endpoint is currently in the serving state"}, + }, + }, + { + desc: "Should 400 if endpoint status is in the pending state", + vars: map[string]string{ + "model_id": "1", + "version_id": "1", + "endpoint_id": uuid.String(), + }, + requestBody: &models.VersionEndpoint{ + ID: uuid, + VersionID: models.ID(1), + VersionModelID: models.ID(1), + Status: models.EndpointRunning, + ServiceName: "sample", + Namespace: "sample", + EnvironmentName: "dev", + Message: "", + ResourceRequest: &models.ResourceRequest{ + MinReplica: 1, + MaxReplica: 4, + CPURequest: resource.MustParse("1"), + MemoryRequest: resource.MustParse("1Gi"), + }, + EnvVars: models.EnvVars([]models.EnvVar{ + { + Name: "WORKER", + Value: "1", + }, + }), + }, + modelService: func() *mocks.ModelsService { + svc := &mocks.ModelsService{} + svc.On("FindByID", context.Background(), models.ID(1)).Return(&models.Model{ + ID: models.ID(1), + Name: "model-1", + ProjectID: 
models.ID(1), + Project: mlp.Project{}, + ExperimentID: 1, + Type: "pyfunc", + MlflowURL: "", + Endpoints: nil, + }, nil) + return svc + }, + versionService: func() *mocks.VersionsService { + svc := &mocks.VersionsService{} + svc.On("FindByID", context.Background(), models.ID(1), models.ID(1), mock.Anything).Return(&models.Version{ + ID: models.ID(1), + ModelID: models.ID(1), + Model: &models.Model{ + ID: models.ID(1), + Name: "model-1", + ProjectID: models.ID(1), + Project: mlp.Project{}, + ExperimentID: 1, + Type: "pyfunc", + MlflowURL: "", + Endpoints: nil, + }, + }, nil) + return svc + }, + envService: func() *mocks.EnvironmentService { + svc := &mocks.EnvironmentService{} + svc.On("GetEnvironment", "dev").Return(&models.Environment{ + ID: models.ID(1), + Name: "dev", + Cluster: "dev", + IsDefault: &trueBoolean, + Region: "id", + GcpProject: "dev-proj", + MaxCPU: "1", + MaxMemory: "1Gi", + }, nil) + return svc + }, + endpointService: func() *mocks.EndpointsService { + svc := &mocks.EndpointsService{} + svc.On("FindByID", context.Background(), uuid).Return(&models.VersionEndpoint{ + ID: uuid, + VersionID: models.ID(1), + VersionModelID: models.ID(1), + Status: models.EndpointPending, + ServiceName: "sample", + InferenceServiceName: "sample", + Namespace: "sample", + URL: "http://endpoint.svc", + MonitoringURL: "http://monitoring.com", + Environment: &models.Environment{ + ID: models.ID(1), + Name: "dev", + Cluster: "dev", + IsDefault: &trueBoolean, + Region: "id", + GcpProject: "dev-proj", + MaxCPU: "1", + MaxMemory: "1Gi", + }, EnvironmentName: "dev", + Message: "", + ResourceRequest: nil, + EnvVars: models.EnvVars([]models.EnvVar{ + { + Name: "WORKER", + Value: "1", + }, + }), + }, nil) + return svc + }, + expected: &Response{ + code: http.StatusBadRequest, + data: Error{Message: "Error validating request: Updating endpoint status to running is not allowed when the endpoint is currently in the pending state"}, }, }, { @@ -3949,7 +4062,7 @@ func TestUpdateEndpoint(t *testing.T) { ID: uuid, VersionID: models.ID(1), VersionModelID: models.ID(1), - Status: models.EndpointPending, + Status: models.EndpointRunning, ServiceName: "sample", InferenceServiceName: "sample", Namespace: "sample", @@ -4062,7 +4175,7 @@ func TestUpdateEndpoint(t *testing.T) { ID: uuid, VersionID: models.ID(1), VersionModelID: models.ID(1), - Status: models.EndpointPending, + Status: models.EndpointRunning, ServiceName: "sample", InferenceServiceName: "sample", Namespace: "sample", @@ -4239,7 +4352,7 @@ func TestUpdateEndpoint(t *testing.T) { ID: uuid, VersionID: models.ID(1), VersionModelID: models.ID(1), - Status: models.EndpointPending, + Status: models.EndpointRunning, ServiceName: "sample", InferenceServiceName: "sample", Namespace: "sample", @@ -4927,7 +5040,7 @@ func TestUpdateEndpoint(t *testing.T) { }, expected: &Response{ code: http.StatusBadRequest, - data: Error{Message: "Changing deployment type of a pending model is not allowed, please terminate it first."}, + data: Error{Message: "Error validating request: Updating endpoint status to running is not allowed when the endpoint is currently in the pending state"}, }, }, } diff --git a/api/client/model_environment.go b/api/client/model_environment.go index 7bc65812b..1159a31f0 100644 --- a/api/client/model_environment.go +++ b/api/client/model_environment.go @@ -22,7 +22,7 @@ type Environment struct { DefaultResourceRequest *ResourceRequest `json:"default_resource_request,omitempty"` DefaultTransformerResourceRequest *ResourceRequest 
`json:"default_transformer_resource_request,omitempty"` DefaultPredictionJobResourceRequest *PredictionJobResourceRequest `json:"default_prediction_job_resource_request,omitempty"` - Gpus []Gpu `json:"gpus,omitempty"` + Gpus []GpuConfig `json:"gpus,omitempty"` CreatedAt time.Time `json:"created_at,omitempty"` UpdatedAt time.Time `json:"updated_at,omitempty"` } diff --git a/api/client/model_gpu.go b/api/client/model_gpu.go deleted file mode 100644 index 4139b012b..000000000 --- a/api/client/model_gpu.go +++ /dev/null @@ -1,17 +0,0 @@ -/* - * Merlin - * - * API Guide for accessing Merlin's model management, deployment, and serving functionalities - * - * API version: 0.14.0 - * Generated by: Swagger Codegen (https://github.com/swagger-api/swagger-codegen.git) - */ -package client - -type Gpu struct { - Values []string `json:"values,omitempty"` - DisplayName string `json:"display_name,omitempty"` - ResourceType string `json:"resource_type,omitempty"` - NodeSelector map[string]string `json:"node_selector,omitempty"` - MonthlyCostPerGpu float64 `json:"monthly_cost_per_gpu,omitempty"` -} diff --git a/api/client/model_gpu_config.go b/api/client/model_gpu_config.go new file mode 100644 index 000000000..0e74d1dfa --- /dev/null +++ b/api/client/model_gpu_config.go @@ -0,0 +1,19 @@ +/* + * Merlin + * + * API Guide for accessing Merlin's model management, deployment, and serving functionalities + * + * API version: 0.14.0 + * Generated by: Swagger Codegen (https://github.com/swagger-api/swagger-codegen.git) + */ +package client + +type GpuConfig struct { + Name string `json:"name,omitempty"` + Values []string `json:"values,omitempty"` + ResourceType string `json:"resource_type,omitempty"` + NodeSelector map[string]string `json:"node_selector,omitempty"` + Tolerations []GpuToleration `json:"tolerations,omitempty"` + MinMonthlyCostPerGpu float64 `json:"min_monthly_cost_per_gpu,omitempty"` + MaxMonthlyCostPerGpu float64 `json:"max_monthly_cost_per_gpu,omitempty"` +} diff --git a/api/client/model_gpu_toleration.go b/api/client/model_gpu_toleration.go new file mode 100644 index 000000000..c0c7a9c29 --- /dev/null +++ b/api/client/model_gpu_toleration.go @@ -0,0 +1,17 @@ +/* + * Merlin + * + * API Guide for accessing Merlin's model management, deployment, and serving functionalities + * + * API version: 0.14.0 + * Generated by: Swagger Codegen (https://github.com/swagger-api/swagger-codegen.git) + */ +package client + +type GpuToleration struct { + Key string `json:"key,omitempty"` + Operator string `json:"operator,omitempty"` + Value string `json:"value,omitempty"` + Effect string `json:"effect,omitempty"` + TolerationSeconds int64 `json:"toleration_seconds,omitempty"` +} diff --git a/api/client/model_resource_request.go b/api/client/model_resource_request.go index 985fc5f23..5c9383b9a 100644 --- a/api/client/model_resource_request.go +++ b/api/client/model_resource_request.go @@ -9,11 +9,10 @@ package client type ResourceRequest struct { - MinReplica int32 `json:"min_replica,omitempty"` - MaxReplica int32 `json:"max_replica,omitempty"` - CpuRequest string `json:"cpu_request,omitempty"` - MemoryRequest string `json:"memory_request,omitempty"` - GpuResourceType string `json:"gpu_resource_type,omitempty"` - GpuRequest string `json:"gpu_request,omitempty"` - GpuNodeSelector map[string]string `json:"gpu_node_selector,omitempty"` + MinReplica int32 `json:"min_replica,omitempty"` + MaxReplica int32 `json:"max_replica,omitempty"` + CpuRequest string `json:"cpu_request,omitempty"` + MemoryRequest string 
`json:"memory_request,omitempty"` + GpuName string `json:"gpu_name,omitempty"` + GpuRequest string `json:"gpu_request,omitempty"` } diff --git a/api/cluster/resource/templater.go b/api/cluster/resource/templater.go index 622419757..41f995240 100644 --- a/api/cluster/resource/templater.go +++ b/api/cluster/resource/templater.go @@ -256,16 +256,23 @@ func createPredictorSpec(modelService *models.Service, config *config.Deployment } nodeSelector := map[string]string{} - if !modelService.ResourceRequest.GPURequest.IsZero() { - // Declare and initialize resourceType and resourceQuantity variables - resourceType := corev1.ResourceName(modelService.ResourceRequest.GPUResourceType) - resourceQuantity := modelService.ResourceRequest.GPURequest - - // Set the resourceType as the key in the maps, with resourceQuantity as the value - resources.Requests[resourceType] = resourceQuantity - resources.Limits[resourceType] = resourceQuantity - - nodeSelector = modelService.ResourceRequest.GPUNodeSelector + tolerations := []corev1.Toleration{} + if modelService.ResourceRequest.GPUName != "" && !modelService.ResourceRequest.GPURequest.IsZero() { + // Look up the GPU resource type and quantity from the DeploymentConfig + for _, gpuConfig := range config.GPUs { + if gpuConfig.Name == modelService.ResourceRequest.GPUName { + // Declare and initialize resourceType and resourceQuantity variables + resourceType := corev1.ResourceName(gpuConfig.ResourceType) + resourceQuantity := modelService.ResourceRequest.GPURequest + + // Set the resourceType as the key in the maps, with resourceQuantity as the value + resources.Requests[resourceType] = resourceQuantity + resources.Limits[resourceType] = resourceQuantity + + nodeSelector = gpuConfig.NodeSelector + tolerations = gpuConfig.Tolerations + } + } } // liveness probe config. 
if env var to disable != true or not set, it will default to enabled @@ -360,13 +367,17 @@ func createPredictorSpec(modelService *models.Service, config *config.Deployment }, } case models.ModelTypeCustom: - predictorSpec = createCustomPredictorSpec(modelService, resources, nodeSelector) + predictorSpec = createCustomPredictorSpec(modelService, resources, nodeSelector, tolerations) } if len(nodeSelector) > 0 { predictorSpec.NodeSelector = nodeSelector } + if len(tolerations) > 0 { + predictorSpec.Tolerations = tolerations + } + var loggerSpec *kservev1beta1.LoggerSpec if modelService.Logger != nil && modelService.Logger.Model != nil && modelService.Logger.Model.Enabled { logger := modelService.Logger @@ -802,7 +813,7 @@ func createDefaultPredictorEnvVars(modelService *models.Service) models.EnvVars return defaultEnvVars } -func createCustomPredictorSpec(modelService *models.Service, resources corev1.ResourceRequirements, nodeSelector map[string]string) kservev1beta1.PredictorSpec { +func createCustomPredictorSpec(modelService *models.Service, resources corev1.ResourceRequirements, nodeSelector map[string]string, tolerations []corev1.Toleration) kservev1beta1.PredictorSpec { envVars := modelService.EnvVars // Add default env var (Overwrite by user not allowed) @@ -846,6 +857,10 @@ func createCustomPredictorSpec(modelService *models.Service, resources corev1.Re spec.NodeSelector = nodeSelector } + if len(tolerations) > 0 { + spec.Tolerations = tolerations + } + return spec } diff --git a/api/cluster/resource/templater_gpu_test.go b/api/cluster/resource/templater_gpu_test.go index cf840de06..ed6c05b2f 100644 --- a/api/cluster/resource/templater_gpu_test.go +++ b/api/cluster/resource/templater_gpu_test.go @@ -22,6 +22,35 @@ import ( ) var ( + defaultGPUNodeSelector = map[string]string{"cloud.google.com/gke-accelerator": "nvidia-tesla-p4"} + + defaultGPUTolerations = []corev1.Toleration{ + { + Key: "caraml/nvidia-tesla-p4", + Operator: corev1.TolerationOpEqual, + Value: "enabled", + Effect: corev1.TaintEffectNoSchedule, + }, + { + Key: "nvidia.com/gpu", + Operator: corev1.TolerationOpEqual, + Value: "present", + Effect: corev1.TaintEffectNoSchedule, + }, + } + + defaultGPUsConfig = []config.GPUConfig{ + { + Name: "NVIDIA P4", + Values: []string{"None", "1", "2", "4"}, + ResourceType: "nvidia.com/gpu", + NodeSelector: defaultGPUNodeSelector, + Tolerations: defaultGPUTolerations, + MinMonthlyCostPerGPU: 332.15, + MaxMonthlyCostPerGPU: 332.15, + }, + } + expDefaultModelResourceRequestsWithGPU = corev1.ResourceRequirements{ Requests: corev1.ResourceList{ corev1.ResourceCPU: defaultModelResourceRequests.CPURequest, @@ -67,13 +96,12 @@ func TestCreateInferenceServiceSpecWithGPU(t *testing.T) { }, Protocol: protocol.HttpJson, ResourceRequest: &models.ResourceRequest{ - MinReplica: 1, - MaxReplica: 2, - CPURequest: resource.MustParse("500m"), - MemoryRequest: resource.MustParse("500Mi"), - GPURequest: resource.MustParse("1"), - GPUResourceType: "nvidia.com/gpu", - GPUNodeSelector: map[string]string{"cloud.google.com/gke-accelerator": "nvidia-tesla-p4"}, + MinReplica: 1, + MaxReplica: 2, + CPURequest: resource.MustParse("500m"), + MemoryRequest: resource.MustParse("500Mi"), + GPUName: "NVIDIA P4", + GPURequest: resource.MustParse("1"), }, } @@ -142,7 +170,8 @@ func TestCreateInferenceServiceSpecWithGPU(t *testing.T) { MaxReplicas: defaultModelResourceRequests.MaxReplica, }, PodSpec: kservev1beta1.PodSpec{ - NodeSelector: modelSvc.ResourceRequest.GPUNodeSelector, + NodeSelector: defaultGPUNodeSelector, + 
Tolerations: defaultGPUTolerations, }, }, }, @@ -214,7 +243,8 @@ func TestCreateInferenceServiceSpecWithGPU(t *testing.T) { MaxReplicas: defaultModelResourceRequests.MaxReplica, }, PodSpec: kservev1beta1.PodSpec{ - NodeSelector: modelSvc.ResourceRequest.GPUNodeSelector, + NodeSelector: defaultGPUNodeSelector, + Tolerations: defaultGPUTolerations, }, }, }, @@ -272,7 +302,8 @@ func TestCreateInferenceServiceSpecWithGPU(t *testing.T) { MaxReplicas: defaultModelResourceRequests.MaxReplica, }, PodSpec: kservev1beta1.PodSpec{ - NodeSelector: modelSvc.ResourceRequest.GPUNodeSelector, + NodeSelector: defaultGPUNodeSelector, + Tolerations: defaultGPUTolerations, }, }, }, @@ -330,7 +361,8 @@ func TestCreateInferenceServiceSpecWithGPU(t *testing.T) { MaxReplicas: defaultModelResourceRequests.MaxReplica, }, PodSpec: kservev1beta1.PodSpec{ - NodeSelector: modelSvc.ResourceRequest.GPUNodeSelector, + NodeSelector: defaultGPUNodeSelector, + Tolerations: defaultGPUTolerations, }, }, }, @@ -385,7 +417,8 @@ func TestCreateInferenceServiceSpecWithGPU(t *testing.T) { MaxReplicas: defaultModelResourceRequests.MaxReplica, }, PodSpec: kservev1beta1.PodSpec{ - NodeSelector: modelSvc.ResourceRequest.GPUNodeSelector, + NodeSelector: defaultGPUNodeSelector, + Tolerations: defaultGPUTolerations, }, }, }, @@ -442,7 +475,8 @@ func TestCreateInferenceServiceSpecWithGPU(t *testing.T) { MaxReplicas: defaultModelResourceRequests.MaxReplica, }, PodSpec: kservev1beta1.PodSpec{ - NodeSelector: modelSvc.ResourceRequest.GPUNodeSelector, + NodeSelector: defaultGPUNodeSelector, + Tolerations: defaultGPUTolerations, }, }, }, @@ -499,7 +533,8 @@ func TestCreateInferenceServiceSpecWithGPU(t *testing.T) { MaxReplicas: defaultModelResourceRequests.MaxReplica, }, PodSpec: kservev1beta1.PodSpec{ - NodeSelector: modelSvc.ResourceRequest.GPUNodeSelector, + NodeSelector: defaultGPUNodeSelector, + Tolerations: defaultGPUTolerations, }, }, }, @@ -556,7 +591,8 @@ func TestCreateInferenceServiceSpecWithGPU(t *testing.T) { MaxReplicas: defaultModelResourceRequests.MaxReplica, }, PodSpec: kservev1beta1.PodSpec{ - NodeSelector: modelSvc.ResourceRequest.GPUNodeSelector, + NodeSelector: defaultGPUNodeSelector, + Tolerations: defaultGPUTolerations, }, }, }, @@ -612,7 +648,8 @@ func TestCreateInferenceServiceSpecWithGPU(t *testing.T) { Resources: expDefaultModelResourceRequestsWithGPU, }, }, - NodeSelector: modelSvc.ResourceRequest.GPUNodeSelector, + NodeSelector: defaultGPUNodeSelector, + Tolerations: defaultGPUTolerations, }, ComponentExtensionSpec: kservev1beta1.ComponentExtensionSpec{ MinReplicas: &defaultModelResourceRequests.MinReplica, @@ -672,7 +709,8 @@ func TestCreateInferenceServiceSpecWithGPU(t *testing.T) { Resources: expDefaultModelResourceRequestsWithGPU, }, }, - NodeSelector: modelSvc.ResourceRequest.GPUNodeSelector, + NodeSelector: defaultGPUNodeSelector, + Tolerations: defaultGPUTolerations, }, ComponentExtensionSpec: kservev1beta1.ComponentExtensionSpec{ MinReplicas: &defaultModelResourceRequests.MinReplica, @@ -738,7 +776,8 @@ func TestCreateInferenceServiceSpecWithGPU(t *testing.T) { Resources: expDefaultModelResourceRequestsWithGPU, }, }, - NodeSelector: modelSvc.ResourceRequest.GPUNodeSelector, + NodeSelector: defaultGPUNodeSelector, + Tolerations: defaultGPUTolerations, }, ComponentExtensionSpec: kservev1beta1.ComponentExtensionSpec{ MinReplicas: &defaultModelResourceRequests.MinReplica, @@ -805,7 +844,8 @@ func TestCreateInferenceServiceSpecWithGPU(t *testing.T) { }, }, }, - NodeSelector: 
modelSvc.ResourceRequest.GPUNodeSelector, + NodeSelector: defaultGPUNodeSelector, + Tolerations: defaultGPUTolerations, }, ComponentExtensionSpec: kservev1beta1.ComponentExtensionSpec{ MinReplicas: &modelSvc.ResourceRequest.MinReplica, @@ -874,7 +914,8 @@ func TestCreateInferenceServiceSpecWithGPU(t *testing.T) { MaxReplicas: defaultModelResourceRequests.MaxReplica, }, PodSpec: kservev1beta1.PodSpec{ - NodeSelector: modelSvc.ResourceRequest.GPUNodeSelector, + NodeSelector: defaultGPUNodeSelector, + Tolerations: defaultGPUTolerations, }, }, }, @@ -961,7 +1002,8 @@ func TestCreateInferenceServiceSpecWithGPU(t *testing.T) { MaxReplicas: defaultModelResourceRequests.MaxReplica, }, PodSpec: kservev1beta1.PodSpec{ - NodeSelector: modelSvc.ResourceRequest.GPUNodeSelector, + NodeSelector: defaultGPUNodeSelector, + Tolerations: defaultGPUTolerations, }, }, }, @@ -1026,7 +1068,8 @@ func TestCreateInferenceServiceSpecWithGPU(t *testing.T) { MaxReplicas: defaultModelResourceRequests.MaxReplica, }, PodSpec: kservev1beta1.PodSpec{ - NodeSelector: modelSvc.ResourceRequest.GPUNodeSelector, + NodeSelector: defaultGPUNodeSelector, + Tolerations: defaultGPUTolerations, }, }, }, @@ -1091,7 +1134,8 @@ func TestCreateInferenceServiceSpecWithGPU(t *testing.T) { MaxReplicas: modelSvc.ResourceRequest.MaxReplica, }, PodSpec: kservev1beta1.PodSpec{ - NodeSelector: modelSvc.ResourceRequest.GPUNodeSelector, + NodeSelector: defaultGPUNodeSelector, + Tolerations: defaultGPUTolerations, }, }, }, @@ -1156,7 +1200,8 @@ func TestCreateInferenceServiceSpecWithGPU(t *testing.T) { MaxReplicas: defaultModelResourceRequests.MaxReplica, }, PodSpec: kservev1beta1.PodSpec{ - NodeSelector: modelSvc.ResourceRequest.GPUNodeSelector, + NodeSelector: defaultGPUNodeSelector, + Tolerations: defaultGPUTolerations, }, }, }, @@ -1221,7 +1266,8 @@ func TestCreateInferenceServiceSpecWithGPU(t *testing.T) { MaxReplicas: defaultModelResourceRequests.MaxReplica, }, PodSpec: kservev1beta1.PodSpec{ - NodeSelector: modelSvc.ResourceRequest.GPUNodeSelector, + NodeSelector: defaultGPUNodeSelector, + Tolerations: defaultGPUTolerations, }, }, }, @@ -1279,7 +1325,8 @@ func TestCreateInferenceServiceSpecWithGPU(t *testing.T) { MaxReplicas: defaultModelResourceRequests.MaxReplica, }, PodSpec: kservev1beta1.PodSpec{ - NodeSelector: modelSvc.ResourceRequest.GPUNodeSelector, + NodeSelector: defaultGPUNodeSelector, + Tolerations: defaultGPUTolerations, }, }, }, @@ -1336,7 +1383,8 @@ func TestCreateInferenceServiceSpecWithGPU(t *testing.T) { LivenessProbe: probeConfigUPI, }, }, - NodeSelector: modelSvc.ResourceRequest.GPUNodeSelector, + NodeSelector: defaultGPUNodeSelector, + Tolerations: defaultGPUTolerations, }, ComponentExtensionSpec: kservev1beta1.ComponentExtensionSpec{ MinReplicas: &defaultModelResourceRequests.MinReplica, @@ -1398,7 +1446,8 @@ func TestCreateInferenceServiceSpecWithGPU(t *testing.T) { MaxReplicas: defaultModelResourceRequests.MaxReplica, }, PodSpec: kservev1beta1.PodSpec{ - NodeSelector: modelSvc.ResourceRequest.GPUNodeSelector, + NodeSelector: defaultGPUNodeSelector, + Tolerations: defaultGPUTolerations, }, }, }, @@ -1460,7 +1509,8 @@ func TestCreateInferenceServiceSpecWithGPU(t *testing.T) { Ports: grpcContainerPorts, }, }, - NodeSelector: modelSvc.ResourceRequest.GPUNodeSelector, + NodeSelector: defaultGPUNodeSelector, + Tolerations: defaultGPUTolerations, }, ComponentExtensionSpec: kservev1beta1.ComponentExtensionSpec{ MinReplicas: &defaultModelResourceRequests.MinReplica, @@ -1479,6 +1529,7 @@ func 
TestCreateInferenceServiceSpecWithGPU(t *testing.T) { DefaultTransformerResourceRequests: defaultTransformerResourceRequests, QueueResourcePercentage: tt.resourcePercentage, PyfuncGRPCOptions: "{}", + GPUs: defaultGPUsConfig, } tpl := NewInferenceServiceTemplater(standardTransformerConfig) diff --git a/api/config/environment.go b/api/config/environment.go index d7658cd05..b19cc3e30 100644 --- a/api/config/environment.go +++ b/api/config/environment.go @@ -113,19 +113,26 @@ type ResourceRequestConfig struct { } type GPUConfig struct { + // Name is used as the key to identify the GPU configuration. + // It also specifies how the accelerator type will be written in the UI. + // Example: "NVIDIA T4" + Name string `yaml:"name"` // Values limits how many GPUs can be requested by users. - // Example: "none", "1", "2", "4" + // Example: "None", "1", "2", "4" Values []string `yaml:"values"` - // Specifies how the accelerator type will be written in the UI. - // Example: "NVIDIA T4" - DisplayName string `yaml:"display_name"` // Specifies how the accelerator type will be translated to // K8s resource type. Example: nvidia.com/gpu ResourceType string `yaml:"resource_type"` // To deploy the models on a specific GPU node. NodeSelector map[string]string `yaml:"node_selector"` + // To deploy the models on a specific GPU node via taints and tolerations. + Tolerations []corev1.Toleration `yaml:"tolerations"` + // MinMonthlyCostPerGPU is the minimum monthly cost per GPU, for example, if you enable time-sharing GPUs with 8 max shared clients, + // the minimum monthly cost per GPU is max_monthly_cost_per_gpu divided by 8. + // MaxMonthlyCostPerGPU is the maximum monthly cost if you use the whole GPU. // https://cloud.google.com/compute/gpus-pricing#other-gpu-models - MonthlyCostPerGPU float64 `yaml:"monthly_cost_per_gpu"` + MinMonthlyCostPerGPU float64 `yaml:"min_monthly_cost_per_gpu"` + MaxMonthlyCostPerGPU float64 `yaml:"max_monthly_cost_per_gpu"` } func InitEnvironmentConfigs(path string) ([]*EnvironmentConfig, error) { diff --git a/api/config/environment_test.go b/api/config/environment_test.go index fec59719f..24db43b33 100644 --- a/api/config/environment_test.go +++ b/api/config/environment_test.go @@ -186,13 +186,50 @@ func TestGPUsConfig(t *testing.T) { envConfigPath: "./testdata/valid-environment-1.yaml", expectedGPUsConfig: []GPUConfig{ { - Values: []string{"none", "1"}, - DisplayName: "NVIDIA T4", + Name: "NVIDIA T4", + Values: []string{"None", "1"}, ResourceType: "nvidia.com/gpu", NodeSelector: map[string]string{ "cloud.google.com/gke-accelerator": "nvidia-tesla-t4", }, - MonthlyCostPerGPU: 189.07, + MinMonthlyCostPerGPU: 189.07, + MaxMonthlyCostPerGPU: 189.07, + }, + { + Name: "NVIDIA T4 with Time Sharing", + Values: []string{"None", "1"}, + ResourceType: "nvidia.com/gpu", + NodeSelector: map[string]string{ + "cloud.google.com/gke-accelerator": "nvidia-tesla-t4", + "cloud.google.com/gke-max-shared-clients-per-gpu": "8", + "cloud.google.com/gke-gpu-sharing-strategy": "time-sharing", + }, + MinMonthlyCostPerGPU: 23.63, + MaxMonthlyCostPerGPU: 189.07, + }, + { + Name: "NVIDIA P4", + Values: []string{"None", "1", "2"}, + ResourceType: "nvidia.com/gpu", + NodeSelector: map[string]string{ + "cloud.google.com/gke-accelerator": "nvidia-tesla-p4", + }, + Tolerations: []corev1.Toleration{ + { + Key: "caraml/nvidia-tesla-p4", + Operator: corev1.TolerationOpEqual, + Value: "enabled", + Effect: corev1.TaintEffectNoSchedule, + }, + { + Key: "nvidia.com/gpu", + Operator: corev1.TolerationOpEqual, + Value: 
"present", + Effect: corev1.TaintEffectNoSchedule, + }, + }, + MinMonthlyCostPerGPU: 332.15, + MaxMonthlyCostPerGPU: 332.15, }, }, }, diff --git a/api/config/testdata/valid-environment-1.yaml b/api/config/testdata/valid-environment-1.yaml index 0fbf8d83c..1ea25a60c 100644 --- a/api/config/testdata/valid-environment-1.yaml +++ b/api/config/testdata/valid-environment-1.yaml @@ -41,9 +41,35 @@ interactiveMode: IfAvailable provideClusterInfo: true gpus: - - values: ["none", "1"] - display_name: "NVIDIA T4" + - name: "NVIDIA T4" + values: ["None", "1"] resource_type: "nvidia.com/gpu" node_selector: "cloud.google.com/gke-accelerator": "nvidia-tesla-t4" - monthly_cost_per_gpu: 189.07 + min_monthly_cost_per_gpu: 189.07 + max_monthly_cost_per_gpu: 189.07 + - name: "NVIDIA T4 with Time Sharing" + values: ["None", "1"] + resource_type: "nvidia.com/gpu" + node_selector: + "cloud.google.com/gke-accelerator": "nvidia-tesla-t4" + "cloud.google.com/gke-max-shared-clients-per-gpu": "8" + "cloud.google.com/gke-gpu-sharing-strategy": "time-sharing" + min_monthly_cost_per_gpu: 23.63 + max_monthly_cost_per_gpu: 189.07 + - name: "NVIDIA P4" + values: ["None", "1", "2"] + resource_type: "nvidia.com/gpu" + node_selector: + "cloud.google.com/gke-accelerator": "nvidia-tesla-p4" + tolerations: + - key: "caraml/nvidia-tesla-p4" + operator: "Equal" + value: "enabled" + effect: "NoSchedule" + - key: "nvidia.com/gpu" + operator: "Equal" + value: "present" + effect: "NoSchedule" + min_monthly_cost_per_gpu: 332.15 + max_monthly_cost_per_gpu: 332.15 diff --git a/api/models/gpu.go b/api/models/gpu.go index ffa15c49e..26b9f3915 100644 --- a/api/models/gpu.go +++ b/api/models/gpu.go @@ -5,23 +5,32 @@ import ( "encoding/json" "errors" + corev1 "k8s.io/api/core/v1" + "github.com/caraml-dev/merlin/config" ) type GPU struct { + // Name is used as the key to identify the GPU configuration. + // It also specifies how the accelerator type will be written in the UI. + // Example: "NVIDIA T4" + Name string `json:"name"` // Values limits how many GPUs can be requested by users. // Example: "none", "1", "2", "4" Values []string `json:"values"` - // Specifies how the accelerator type will be written in the UI. - // Example: "NVIDIA T4" - DisplayName string `json:"display_name"` // Specifies how the accelerator type will be translated to // K8s resource type. Example: nvidia.com/gpu ResourceType string `json:"resource_type"` // To deploy the models on a specific GPU node. - NodeSelector map[string]string `json:"node_selector"` + NodeSelector map[string]string `json:"node_selector,omitempty"` + // To deploy the models on a specific GPU node via taints and tolerations. + Tolerations []corev1.Toleration `json:"tolerations,omitempty"` + // MinMonthlyCostPerGPU is the minimum monthly cost per GPU, for example, if you enable time-sharing GPUs with 8 max shared clients, + // the minimum monthly cost per GPU is max_monthly_cost_per_gpu divided by 8. + // MaxMonthlyCostPerGPU is the maximum monthly cost if you use the whole GPU. 
// https://cloud.google.com/compute/gpus-pricing#other-gpu-models - MonthlyCostPerGPU float64 `json:"monthly_cost_per_gpu"` + MinMonthlyCostPerGPU float64 `json:"min_monthly_cost_per_gpu"` + MaxMonthlyCostPerGPU float64 `json:"max_monthly_cost_per_gpu"` } type GPUs []GPU @@ -45,11 +54,13 @@ func ParseGPUsConfig(configGPUs []config.GPUConfig) GPUs { for _, configGPU := range configGPUs { gpu := GPU{ - Values: configGPU.Values, - DisplayName: configGPU.DisplayName, - ResourceType: configGPU.ResourceType, - NodeSelector: configGPU.NodeSelector, - MonthlyCostPerGPU: configGPU.MonthlyCostPerGPU, + Name: configGPU.Name, + Values: configGPU.Values, + ResourceType: configGPU.ResourceType, + NodeSelector: configGPU.NodeSelector, + Tolerations: configGPU.Tolerations, + MinMonthlyCostPerGPU: configGPU.MinMonthlyCostPerGPU, + MaxMonthlyCostPerGPU: configGPU.MaxMonthlyCostPerGPU, } gpus = append(gpus, gpu) } diff --git a/api/models/gpu_test.go b/api/models/gpu_test.go new file mode 100644 index 000000000..179f59b9b --- /dev/null +++ b/api/models/gpu_test.go @@ -0,0 +1,86 @@ +package models + +import ( + "reflect" + "testing" + + corev1 "k8s.io/api/core/v1" + + "github.com/caraml-dev/merlin/config" +) + +func TestParseGPUsConfig(t *testing.T) { + type args struct { + configGPUs []config.GPUConfig + } + tests := []struct { + name string + args args + want GPUs + }{ + { + name: "successful parsing", + args: args{ + configGPUs: []config.GPUConfig{ + { + Name: "NVIDIA P4", + Values: []string{"None", "1", "2"}, + ResourceType: "nvidia.com/gpu", + NodeSelector: map[string]string{ + "cloud.google.com/gke-accelerator": "nvidia-tesla-p4", + }, + Tolerations: []corev1.Toleration{ + { + Key: "caraml/nvidia-tesla-p4", + Operator: corev1.TolerationOpEqual, + Value: "enabled", + Effect: "NoSchedule", + }, + { + Key: "nvidia.com/gpu", + Operator: corev1.TolerationOpEqual, + Value: "present", + Effect: "NoSchedule", + }, + }, + MinMonthlyCostPerGPU: 332.15, + MaxMonthlyCostPerGPU: 332.15, + }, + }, + }, + want: GPUs{ + { + Name: "NVIDIA P4", + Values: []string{"None", "1", "2"}, + ResourceType: "nvidia.com/gpu", + NodeSelector: map[string]string{ + "cloud.google.com/gke-accelerator": "nvidia-tesla-p4", + }, + Tolerations: []corev1.Toleration{ + { + Key: "caraml/nvidia-tesla-p4", + Operator: corev1.TolerationOpEqual, + Value: "enabled", + Effect: "NoSchedule", + }, + { + Key: "nvidia.com/gpu", + Operator: corev1.TolerationOpEqual, + Value: "present", + Effect: "NoSchedule", + }, + }, + MinMonthlyCostPerGPU: 332.15, + MaxMonthlyCostPerGPU: 332.15, + }, + }, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if got := ParseGPUsConfig(tt.args.configGPUs); !reflect.DeepEqual(got, tt.want) { + t.Errorf("ParseGPUsConfig() = %v, want %v", got, tt.want) + } + }) + } +} diff --git a/api/models/resource_request.go b/api/models/resource_request.go index b537f1914..222c4c2da 100644 --- a/api/models/resource_request.go +++ b/api/models/resource_request.go @@ -31,12 +31,11 @@ type ResourceRequest struct { CPURequest resource.Quantity `json:"cpu_request"` // Memory request of inference service MemoryRequest resource.Quantity `json:"memory_request"` - // GPU resource type (nvidia.com/gpu or amd.com/gpu) - GPUResourceType string `json:"gpu_resource_type"` + + // GPU name + GPUName string `json:"gpu_name,omitempty"` // GPU Quantity requests - GPURequest resource.Quantity `json:"gpu_request"` - // GPU Node selector - GPUNodeSelector map[string]string `json:"gpu_node_selector"` + GPURequest resource.Quantity 
`json:"gpu_request,omitempty"` } func (r ResourceRequest) Value() (driver.Value, error) { diff --git a/api/service/environment_service_test.go b/api/service/environment_service_test.go index 6a2cb2ae5..46db6da95 100644 --- a/api/service/environment_service_test.go +++ b/api/service/environment_service_test.go @@ -64,13 +64,14 @@ func TestSave(t *testing.T) { }, GPUs: models.GPUs{ { + Name: "NVIDIA T4", Values: []string{"none", "1"}, - DisplayName: "NVIDIA T4", ResourceType: "nvidia.com/gpu", NodeSelector: map[string]string{ "cloud.google.com/gke-accelerator": "nvidia-tesla-t4", }, - MonthlyCostPerGPU: 189.07, + MinMonthlyCostPerGPU: 189.07, + MaxMonthlyCostPerGPU: 189.07, }, }, }, diff --git a/docs/connecting-to-merlin/python-sdk.md b/docs/connecting-to-merlin/python-sdk.md index 150944f8c..d0be7bb7f 100644 --- a/docs/connecting-to-merlin/python-sdk.md +++ b/docs/connecting-to-merlin/python-sdk.md @@ -15,6 +15,10 @@ from merlin.model import ModelType # Connect to an existing Merlin deployment merlin.set_url("merlin.example.com") +# Set the active model to the given name. If no model with that name exists, a new model will +# be created. +merlin.set_model("example-model", ModelType.PYFUNC) + # Ensure that you're connected by printing out some Model Endpoints merlin.list_model_endpoints() ``` diff --git a/python/sdk/client/__init__.py b/python/sdk/client/__init__.py index ebba7064b..9aa7d5af9 100644 --- a/python/sdk/client/__init__.py +++ b/python/sdk/client/__init__.py @@ -42,7 +42,8 @@ from client.models.environment import Environment from client.models.file_format import FileFormat from client.models.free_form_object import FreeFormObject -from client.models.gpu import GPU +from client.models.gpu_config import GPUConfig +from client.models.gpu_toleration import GPUToleration from client.models.label import Label from client.models.logger import Logger from client.models.logger_config import LoggerConfig diff --git a/python/sdk/client/models/__init__.py b/python/sdk/client/models/__init__.py index 597e406ab..d4dcc14f2 100644 --- a/python/sdk/client/models/__init__.py +++ b/python/sdk/client/models/__init__.py @@ -26,7 +26,8 @@ from client.models.environment import Environment from client.models.file_format import FileFormat from client.models.free_form_object import FreeFormObject -from client.models.gpu import GPU +from client.models.gpu_config import GPUConfig +from client.models.gpu_toleration import GPUToleration from client.models.label import Label from client.models.logger import Logger from client.models.logger_config import LoggerConfig diff --git a/python/sdk/client/models/environment.py b/python/sdk/client/models/environment.py index 18d1d07c0..74fe16193 100644 --- a/python/sdk/client/models/environment.py +++ b/python/sdk/client/models/environment.py @@ -37,7 +37,7 @@ class Environment(object): 'default_resource_request': 'ResourceRequest', 'default_transformer_resource_request': 'ResourceRequest', 'default_prediction_job_resource_request': 'PredictionJobResourceRequest', - 'gpus': 'list[GPU]', + 'gpus': 'list[GPUConfig]', 'created_at': 'datetime', 'updated_at': 'datetime' } @@ -293,7 +293,7 @@ def gpus(self): :return: The gpus of this Environment. # noqa: E501 - :rtype: list[GPU] + :rtype: list[GPUConfig] """ return self._gpus @@ -303,7 +303,7 @@ def gpus(self, gpus): :param gpus: The gpus of this Environment. 
# noqa: E501 - :type: list[GPU] + :type: list[GPUConfig] """ self._gpus = gpus diff --git a/python/sdk/client/models/gpu.py b/python/sdk/client/models/gpu.py deleted file mode 100644 index 632320a9c..000000000 --- a/python/sdk/client/models/gpu.py +++ /dev/null @@ -1,214 +0,0 @@ -# coding: utf-8 - -""" - Merlin - - API Guide for accessing Merlin's model management, deployment, and serving functionalities # noqa: E501 - - OpenAPI spec version: 0.14.0 - - Generated by: https://github.com/swagger-api/swagger-codegen.git -""" - -import pprint -import re # noqa: F401 - -import six - -class GPU(object): - """NOTE: This class is auto generated by the swagger code generator program. - - Do not edit the class manually. - """ - """ - Attributes: - swagger_types (dict): The key is attribute name - and the value is attribute type. - attribute_map (dict): The key is attribute name - and the value is json key in definition. - """ - swagger_types = { - 'values': 'list[str]', - 'display_name': 'str', - 'resource_type': 'str', - 'node_selector': 'dict(str, str)', - 'monthly_cost_per_gpu': 'float' - } - - attribute_map = { - 'values': 'values', - 'display_name': 'display_name', - 'resource_type': 'resource_type', - 'node_selector': 'node_selector', - 'monthly_cost_per_gpu': 'monthly_cost_per_gpu' - } - - def __init__(self, values=None, display_name=None, resource_type=None, node_selector=None, monthly_cost_per_gpu=None): # noqa: E501 - """GPU - a model defined in Swagger""" # noqa: E501 - self._values = None - self._display_name = None - self._resource_type = None - self._node_selector = None - self._monthly_cost_per_gpu = None - self.discriminator = None - if values is not None: - self.values = values - if display_name is not None: - self.display_name = display_name - if resource_type is not None: - self.resource_type = resource_type - if node_selector is not None: - self.node_selector = node_selector - if monthly_cost_per_gpu is not None: - self.monthly_cost_per_gpu = monthly_cost_per_gpu - - @property - def values(self): - """Gets the values of this GPU. # noqa: E501 - - - :return: The values of this GPU. # noqa: E501 - :rtype: list[str] - """ - return self._values - - @values.setter - def values(self, values): - """Sets the values of this GPU. - - - :param values: The values of this GPU. # noqa: E501 - :type: list[str] - """ - - self._values = values - - @property - def display_name(self): - """Gets the display_name of this GPU. # noqa: E501 - - - :return: The display_name of this GPU. # noqa: E501 - :rtype: str - """ - return self._display_name - - @display_name.setter - def display_name(self, display_name): - """Sets the display_name of this GPU. - - - :param display_name: The display_name of this GPU. # noqa: E501 - :type: str - """ - - self._display_name = display_name - - @property - def resource_type(self): - """Gets the resource_type of this GPU. # noqa: E501 - - - :return: The resource_type of this GPU. # noqa: E501 - :rtype: str - """ - return self._resource_type - - @resource_type.setter - def resource_type(self, resource_type): - """Sets the resource_type of this GPU. - - - :param resource_type: The resource_type of this GPU. # noqa: E501 - :type: str - """ - - self._resource_type = resource_type - - @property - def node_selector(self): - """Gets the node_selector of this GPU. # noqa: E501 - - - :return: The node_selector of this GPU. 
# noqa: E501 - :rtype: dict(str, str) - """ - return self._node_selector - - @node_selector.setter - def node_selector(self, node_selector): - """Sets the node_selector of this GPU. - - - :param node_selector: The node_selector of this GPU. # noqa: E501 - :type: dict(str, str) - """ - - self._node_selector = node_selector - - @property - def monthly_cost_per_gpu(self): - """Gets the monthly_cost_per_gpu of this GPU. # noqa: E501 - - - :return: The monthly_cost_per_gpu of this GPU. # noqa: E501 - :rtype: float - """ - return self._monthly_cost_per_gpu - - @monthly_cost_per_gpu.setter - def monthly_cost_per_gpu(self, monthly_cost_per_gpu): - """Sets the monthly_cost_per_gpu of this GPU. - - - :param monthly_cost_per_gpu: The monthly_cost_per_gpu of this GPU. # noqa: E501 - :type: float - """ - - self._monthly_cost_per_gpu = monthly_cost_per_gpu - - def to_dict(self): - """Returns the model properties as a dict""" - result = {} - - for attr, _ in six.iteritems(self.swagger_types): - value = getattr(self, attr) - if isinstance(value, list): - result[attr] = list(map( - lambda x: x.to_dict() if hasattr(x, "to_dict") else x, - value - )) - elif hasattr(value, "to_dict"): - result[attr] = value.to_dict() - elif isinstance(value, dict): - result[attr] = dict(map( - lambda item: (item[0], item[1].to_dict()) - if hasattr(item[1], "to_dict") else item, - value.items() - )) - else: - result[attr] = value - if issubclass(GPU, dict): - for key, value in self.items(): - result[key] = value - - return result - - def to_str(self): - """Returns the string representation of the model""" - return pprint.pformat(self.to_dict()) - - def __repr__(self): - """For `print` and `pprint`""" - return self.to_str() - - def __eq__(self, other): - """Returns true if both objects are equal""" - if not isinstance(other, GPU): - return False - - return self.__dict__ == other.__dict__ - - def __ne__(self, other): - """Returns true if both objects are not equal""" - return not self == other diff --git a/python/sdk/client/models/gpu_config.py b/python/sdk/client/models/gpu_config.py new file mode 100644 index 000000000..a75ecf4ab --- /dev/null +++ b/python/sdk/client/models/gpu_config.py @@ -0,0 +1,266 @@ +# coding: utf-8 + +""" + Merlin + + API Guide for accessing Merlin's model management, deployment, and serving functionalities # noqa: E501 + + OpenAPI spec version: 0.14.0 + + Generated by: https://github.com/swagger-api/swagger-codegen.git +""" + +import pprint +import re # noqa: F401 + +import six + +class GPUConfig(object): + """NOTE: This class is auto generated by the swagger code generator program. + + Do not edit the class manually. + """ + """ + Attributes: + swagger_types (dict): The key is attribute name + and the value is attribute type. + attribute_map (dict): The key is attribute name + and the value is json key in definition. 
+ """ + swagger_types = { + 'name': 'str', + 'values': 'list[str]', + 'resource_type': 'str', + 'node_selector': 'dict(str, str)', + 'tolerations': 'list[GPUToleration]', + 'min_monthly_cost_per_gpu': 'float', + 'max_monthly_cost_per_gpu': 'float' + } + + attribute_map = { + 'name': 'name', + 'values': 'values', + 'resource_type': 'resource_type', + 'node_selector': 'node_selector', + 'tolerations': 'tolerations', + 'min_monthly_cost_per_gpu': 'min_monthly_cost_per_gpu', + 'max_monthly_cost_per_gpu': 'max_monthly_cost_per_gpu' + } + + def __init__(self, name=None, values=None, resource_type=None, node_selector=None, tolerations=None, min_monthly_cost_per_gpu=None, max_monthly_cost_per_gpu=None): # noqa: E501 + """GPUConfig - a model defined in Swagger""" # noqa: E501 + self._name = None + self._values = None + self._resource_type = None + self._node_selector = None + self._tolerations = None + self._min_monthly_cost_per_gpu = None + self._max_monthly_cost_per_gpu = None + self.discriminator = None + if name is not None: + self.name = name + if values is not None: + self.values = values + if resource_type is not None: + self.resource_type = resource_type + if node_selector is not None: + self.node_selector = node_selector + if tolerations is not None: + self.tolerations = tolerations + if min_monthly_cost_per_gpu is not None: + self.min_monthly_cost_per_gpu = min_monthly_cost_per_gpu + if max_monthly_cost_per_gpu is not None: + self.max_monthly_cost_per_gpu = max_monthly_cost_per_gpu + + @property + def name(self): + """Gets the name of this GPUConfig. # noqa: E501 + + + :return: The name of this GPUConfig. # noqa: E501 + :rtype: str + """ + return self._name + + @name.setter + def name(self, name): + """Sets the name of this GPUConfig. + + + :param name: The name of this GPUConfig. # noqa: E501 + :type: str + """ + + self._name = name + + @property + def values(self): + """Gets the values of this GPUConfig. # noqa: E501 + + + :return: The values of this GPUConfig. # noqa: E501 + :rtype: list[str] + """ + return self._values + + @values.setter + def values(self, values): + """Sets the values of this GPUConfig. + + + :param values: The values of this GPUConfig. # noqa: E501 + :type: list[str] + """ + + self._values = values + + @property + def resource_type(self): + """Gets the resource_type of this GPUConfig. # noqa: E501 + + + :return: The resource_type of this GPUConfig. # noqa: E501 + :rtype: str + """ + return self._resource_type + + @resource_type.setter + def resource_type(self, resource_type): + """Sets the resource_type of this GPUConfig. + + + :param resource_type: The resource_type of this GPUConfig. # noqa: E501 + :type: str + """ + + self._resource_type = resource_type + + @property + def node_selector(self): + """Gets the node_selector of this GPUConfig. # noqa: E501 + + + :return: The node_selector of this GPUConfig. # noqa: E501 + :rtype: dict(str, str) + """ + return self._node_selector + + @node_selector.setter + def node_selector(self, node_selector): + """Sets the node_selector of this GPUConfig. + + + :param node_selector: The node_selector of this GPUConfig. # noqa: E501 + :type: dict(str, str) + """ + + self._node_selector = node_selector + + @property + def tolerations(self): + """Gets the tolerations of this GPUConfig. # noqa: E501 + + + :return: The tolerations of this GPUConfig. # noqa: E501 + :rtype: list[GPUToleration] + """ + return self._tolerations + + @tolerations.setter + def tolerations(self, tolerations): + """Sets the tolerations of this GPUConfig. 
+ + + :param tolerations: The tolerations of this GPUConfig. # noqa: E501 + :type: list[GPUToleration] + """ + + self._tolerations = tolerations + + @property + def min_monthly_cost_per_gpu(self): + """Gets the min_monthly_cost_per_gpu of this GPUConfig. # noqa: E501 + + + :return: The min_monthly_cost_per_gpu of this GPUConfig. # noqa: E501 + :rtype: float + """ + return self._min_monthly_cost_per_gpu + + @min_monthly_cost_per_gpu.setter + def min_monthly_cost_per_gpu(self, min_monthly_cost_per_gpu): + """Sets the min_monthly_cost_per_gpu of this GPUConfig. + + + :param min_monthly_cost_per_gpu: The min_monthly_cost_per_gpu of this GPUConfig. # noqa: E501 + :type: float + """ + + self._min_monthly_cost_per_gpu = min_monthly_cost_per_gpu + + @property + def max_monthly_cost_per_gpu(self): + """Gets the max_monthly_cost_per_gpu of this GPUConfig. # noqa: E501 + + + :return: The max_monthly_cost_per_gpu of this GPUConfig. # noqa: E501 + :rtype: float + """ + return self._max_monthly_cost_per_gpu + + @max_monthly_cost_per_gpu.setter + def max_monthly_cost_per_gpu(self, max_monthly_cost_per_gpu): + """Sets the max_monthly_cost_per_gpu of this GPUConfig. + + + :param max_monthly_cost_per_gpu: The max_monthly_cost_per_gpu of this GPUConfig. # noqa: E501 + :type: float + """ + + self._max_monthly_cost_per_gpu = max_monthly_cost_per_gpu + + def to_dict(self): + """Returns the model properties as a dict""" + result = {} + + for attr, _ in six.iteritems(self.swagger_types): + value = getattr(self, attr) + if isinstance(value, list): + result[attr] = list(map( + lambda x: x.to_dict() if hasattr(x, "to_dict") else x, + value + )) + elif hasattr(value, "to_dict"): + result[attr] = value.to_dict() + elif isinstance(value, dict): + result[attr] = dict(map( + lambda item: (item[0], item[1].to_dict()) + if hasattr(item[1], "to_dict") else item, + value.items() + )) + else: + result[attr] = value + if issubclass(GPUConfig, dict): + for key, value in self.items(): + result[key] = value + + return result + + def to_str(self): + """Returns the string representation of the model""" + return pprint.pformat(self.to_dict()) + + def __repr__(self): + """For `print` and `pprint`""" + return self.to_str() + + def __eq__(self, other): + """Returns true if both objects are equal""" + if not isinstance(other, GPUConfig): + return False + + return self.__dict__ == other.__dict__ + + def __ne__(self, other): + """Returns true if both objects are not equal""" + return not self == other diff --git a/python/sdk/client/models/gpu_toleration.py b/python/sdk/client/models/gpu_toleration.py new file mode 100644 index 000000000..69de71a0b --- /dev/null +++ b/python/sdk/client/models/gpu_toleration.py @@ -0,0 +1,214 @@ +# coding: utf-8 + +""" + Merlin + + API Guide for accessing Merlin's model management, deployment, and serving functionalities # noqa: E501 + + OpenAPI spec version: 0.14.0 + + Generated by: https://github.com/swagger-api/swagger-codegen.git +""" + +import pprint +import re # noqa: F401 + +import six + +class GPUToleration(object): + """NOTE: This class is auto generated by the swagger code generator program. + + Do not edit the class manually. + """ + """ + Attributes: + swagger_types (dict): The key is attribute name + and the value is attribute type. + attribute_map (dict): The key is attribute name + and the value is json key in definition. 
+ """ + swagger_types = { + 'key': 'str', + 'operator': 'str', + 'value': 'str', + 'effect': 'str', + 'toleration_seconds': 'int' + } + + attribute_map = { + 'key': 'key', + 'operator': 'operator', + 'value': 'value', + 'effect': 'effect', + 'toleration_seconds': 'toleration_seconds' + } + + def __init__(self, key=None, operator=None, value=None, effect=None, toleration_seconds=None): # noqa: E501 + """GPUToleration - a model defined in Swagger""" # noqa: E501 + self._key = None + self._operator = None + self._value = None + self._effect = None + self._toleration_seconds = None + self.discriminator = None + if key is not None: + self.key = key + if operator is not None: + self.operator = operator + if value is not None: + self.value = value + if effect is not None: + self.effect = effect + if toleration_seconds is not None: + self.toleration_seconds = toleration_seconds + + @property + def key(self): + """Gets the key of this GPUToleration. # noqa: E501 + + + :return: The key of this GPUToleration. # noqa: E501 + :rtype: str + """ + return self._key + + @key.setter + def key(self, key): + """Sets the key of this GPUToleration. + + + :param key: The key of this GPUToleration. # noqa: E501 + :type: str + """ + + self._key = key + + @property + def operator(self): + """Gets the operator of this GPUToleration. # noqa: E501 + + + :return: The operator of this GPUToleration. # noqa: E501 + :rtype: str + """ + return self._operator + + @operator.setter + def operator(self, operator): + """Sets the operator of this GPUToleration. + + + :param operator: The operator of this GPUToleration. # noqa: E501 + :type: str + """ + + self._operator = operator + + @property + def value(self): + """Gets the value of this GPUToleration. # noqa: E501 + + + :return: The value of this GPUToleration. # noqa: E501 + :rtype: str + """ + return self._value + + @value.setter + def value(self, value): + """Sets the value of this GPUToleration. + + + :param value: The value of this GPUToleration. # noqa: E501 + :type: str + """ + + self._value = value + + @property + def effect(self): + """Gets the effect of this GPUToleration. # noqa: E501 + + + :return: The effect of this GPUToleration. # noqa: E501 + :rtype: str + """ + return self._effect + + @effect.setter + def effect(self, effect): + """Sets the effect of this GPUToleration. + + + :param effect: The effect of this GPUToleration. # noqa: E501 + :type: str + """ + + self._effect = effect + + @property + def toleration_seconds(self): + """Gets the toleration_seconds of this GPUToleration. # noqa: E501 + + + :return: The toleration_seconds of this GPUToleration. # noqa: E501 + :rtype: int + """ + return self._toleration_seconds + + @toleration_seconds.setter + def toleration_seconds(self, toleration_seconds): + """Sets the toleration_seconds of this GPUToleration. + + + :param toleration_seconds: The toleration_seconds of this GPUToleration. 
# noqa: E501 + :type: int + """ + + self._toleration_seconds = toleration_seconds + + def to_dict(self): + """Returns the model properties as a dict""" + result = {} + + for attr, _ in six.iteritems(self.swagger_types): + value = getattr(self, attr) + if isinstance(value, list): + result[attr] = list(map( + lambda x: x.to_dict() if hasattr(x, "to_dict") else x, + value + )) + elif hasattr(value, "to_dict"): + result[attr] = value.to_dict() + elif isinstance(value, dict): + result[attr] = dict(map( + lambda item: (item[0], item[1].to_dict()) + if hasattr(item[1], "to_dict") else item, + value.items() + )) + else: + result[attr] = value + if issubclass(GPUToleration, dict): + for key, value in self.items(): + result[key] = value + + return result + + def to_str(self): + """Returns the string representation of the model""" + return pprint.pformat(self.to_dict()) + + def __repr__(self): + """For `print` and `pprint`""" + return self.to_str() + + def __eq__(self, other): + """Returns true if both objects are equal""" + if not isinstance(other, GPUToleration): + return False + + return self.__dict__ == other.__dict__ + + def __ne__(self, other): + """Returns true if both objects are not equal""" + return not self == other diff --git a/python/sdk/client/models/resource_request.py b/python/sdk/client/models/resource_request.py index 9aae0c14f..38480f1ac 100644 --- a/python/sdk/client/models/resource_request.py +++ b/python/sdk/client/models/resource_request.py @@ -32,9 +32,8 @@ class ResourceRequest(object): 'max_replica': 'int', 'cpu_request': 'str', 'memory_request': 'str', - 'gpu_resource_type': 'str', - 'gpu_request': 'str', - 'gpu_node_selector': 'dict(str, str)' + 'gpu_name': 'str', + 'gpu_request': 'str' } attribute_map = { @@ -42,20 +41,18 @@ class ResourceRequest(object): 'max_replica': 'max_replica', 'cpu_request': 'cpu_request', 'memory_request': 'memory_request', - 'gpu_resource_type': 'gpu_resource_type', - 'gpu_request': 'gpu_request', - 'gpu_node_selector': 'gpu_node_selector' + 'gpu_name': 'gpu_name', + 'gpu_request': 'gpu_request' } - def __init__(self, min_replica=None, max_replica=None, cpu_request=None, memory_request=None, gpu_resource_type=None, gpu_request=None, gpu_node_selector=None): # noqa: E501 + def __init__(self, min_replica=None, max_replica=None, cpu_request=None, memory_request=None, gpu_name=None, gpu_request=None): # noqa: E501 """ResourceRequest - a model defined in Swagger""" # noqa: E501 self._min_replica = None self._max_replica = None self._cpu_request = None self._memory_request = None - self._gpu_resource_type = None + self._gpu_name = None self._gpu_request = None - self._gpu_node_selector = None self.discriminator = None if min_replica is not None: self.min_replica = min_replica @@ -65,12 +62,10 @@ def __init__(self, min_replica=None, max_replica=None, cpu_request=None, memory_ self.cpu_request = cpu_request if memory_request is not None: self.memory_request = memory_request - if gpu_resource_type is not None: - self.gpu_resource_type = gpu_resource_type + if gpu_name is not None: + self.gpu_name = gpu_name if gpu_request is not None: self.gpu_request = gpu_request - if gpu_node_selector is not None: - self.gpu_node_selector = gpu_node_selector @property def min_replica(self): @@ -157,25 +152,25 @@ def memory_request(self, memory_request): self._memory_request = memory_request @property - def gpu_resource_type(self): - """Gets the gpu_resource_type of this ResourceRequest. # noqa: E501 + def gpu_name(self): + """Gets the gpu_name of this ResourceRequest. 
# noqa: E501 - :return: The gpu_resource_type of this ResourceRequest. # noqa: E501 + :return: The gpu_name of this ResourceRequest. # noqa: E501 :rtype: str """ - return self._gpu_resource_type + return self._gpu_name - @gpu_resource_type.setter - def gpu_resource_type(self, gpu_resource_type): - """Sets the gpu_resource_type of this ResourceRequest. + @gpu_name.setter + def gpu_name(self, gpu_name): + """Sets the gpu_name of this ResourceRequest. - :param gpu_resource_type: The gpu_resource_type of this ResourceRequest. # noqa: E501 + :param gpu_name: The gpu_name of this ResourceRequest. # noqa: E501 :type: str """ - self._gpu_resource_type = gpu_resource_type + self._gpu_name = gpu_name @property def gpu_request(self): @@ -198,27 +193,6 @@ def gpu_request(self, gpu_request): self._gpu_request = gpu_request - @property - def gpu_node_selector(self): - """Gets the gpu_node_selector of this ResourceRequest. # noqa: E501 - - - :return: The gpu_node_selector of this ResourceRequest. # noqa: E501 - :rtype: dict(str, str) - """ - return self._gpu_node_selector - - @gpu_node_selector.setter - def gpu_node_selector(self, gpu_node_selector): - """Sets the gpu_node_selector of this ResourceRequest. - - - :param gpu_node_selector: The gpu_node_selector of this ResourceRequest. # noqa: E501 - :type: dict(str, str) - """ - - self._gpu_node_selector = gpu_node_selector - def to_dict(self): """Returns the model properties as a dict""" result = {} diff --git a/python/sdk/merlin/model.py b/python/sdk/merlin/model.py index 4e611bb21..e727d66e8 100644 --- a/python/sdk/merlin/model.py +++ b/python/sdk/merlin/model.py @@ -33,6 +33,7 @@ from docker import APIClient from docker.errors import BuildError from docker.models.containers import Container +from merlin import pyfunc from merlin.autoscaling import (RAW_DEPLOYMENT_DEFAULT_AUTOSCALING_POLICY, SERVERLESS_DEFAULT_AUTOSCALING_POLICY, AutoscalingPolicy) @@ -57,7 +58,6 @@ from mlflow.pyfunc import PythonModel import mlflow -from merlin import pyfunc # Ensure backward compatibility after moving PyFuncModel and PyFuncV2Model to pyfunc.py # This allows users to do following import statement @@ -1045,7 +1045,7 @@ def deploy(self, environment_name: str = None, if resource_request is None: env_api = EnvironmentApi(self._api_client) env_list = env_api.environments_get() - + for env in env_list: if env.name == target_env_name: resource_request = ResourceRequest( @@ -1054,7 +1054,7 @@ def deploy(self, environment_name: str = None, env.default_resource_request.cpu_request, env.default_resource_request.memory_request, ) - + # This case is when the default resource request is not specified in the environment config if resource_request is None: raise ValueError("resource request must be specified") @@ -1064,20 +1064,19 @@ def deploy(self, environment_name: str = None, target_resource_request = client.ResourceRequest( resource_request.min_replica, resource_request.max_replica, resource_request.cpu_request, resource_request.memory_request) - + if resource_request.gpu_request is not None and resource_request.gpu_name is not None: env_api = EnvironmentApi(self._api_client) env_list = env_api.environments_get() for env in env_list: for gpu in env.gpus: - if resource_request.gpu_name == gpu.display_name: + if resource_request.gpu_name == gpu.name: if resource_request.gpu_request not in gpu.values: raise ValueError(f"Invalid GPU request count. 
Supported GPUs count for {resource_request.gpu_name} is {gpu.values}") - + + target_resource_request.gpu_name = resource_request.gpu_name target_resource_request.gpu_request = resource_request.gpu_request - target_resource_request.gpu_resource_type = gpu.resource_type - target_resource_request.gpu_node_selector = gpu.node_selector break target_env_vars = [] @@ -1119,12 +1118,23 @@ def deploy(self, environment_name: str = None, autoscaling_policy.target_value), protocol=protocol.value ) - - endpoint = endpoint_api \ - .models_model_id_versions_version_id_endpoint_post(int(model.id), - int(self.id), - body=endpoint.to_dict()) - + current_endpoint = self.endpoint + if current_endpoint is not None: + # This allows a serving deployment to be updated while it is serving + if current_endpoint.status == Status.SERVING: + endpoint.status = Status.SERVING.value + else: + endpoint.status = Status.RUNNING.value + endpoint = endpoint_api \ + .models_model_id_versions_version_id_endpoint_endpoint_id_put(int(model.id), + int(self.id), + current_endpoint.id, + body=endpoint.to_dict()) + else: + endpoint = endpoint_api \ + .models_model_id_versions_version_id_endpoint_post(int(model.id), + int(self.id), + body=endpoint.to_dict()) bar = pyprind.ProgBar(100, track_time=True, title=f"Deploying model {model.name} version " f"{self.id}") @@ -1139,7 +1149,7 @@ def deploy(self, environment_name: str = None, sleep(5) bar.stop() - if endpoint.status != "running": + if endpoint.status != "running" and endpoint.status != "serving": raise ModelEndpointDeploymentError(model.name, self.id, endpoint.message) log_url = f"{self.url}/{self.id}/endpoints/{endpoint.id}/logs" @@ -1147,7 +1157,7 @@ def deploy(self, environment_name: str = None, f"\nView model version logs: {log_url}") self._version_endpoints = self.list_endpoint() - + return VersionEndpoint(endpoint, log_url) def create_transformer_spec(self, transformer: Transformer, target_env_name: str) -> client.Transformer: @@ -1514,7 +1524,7 @@ def _wait_build_complete(self, logs): if image_id: return raise BuildError('Unknown', logs) - + def delete_model_version(self) -> int: """ Delete this model version. Please note that any inactive related entity (endpoints and prediction jobs) will get deleted by this process. diff --git a/python/sdk/merlin/resource_request.py b/python/sdk/merlin/resource_request.py index 52d3df81e..2ab0ce761 100644 --- a/python/sdk/merlin/resource_request.py +++ b/python/sdk/merlin/resource_request.py @@ -14,6 +14,7 @@ from typing import Optional + class ResourceRequest: """ The resource requirement and replicas requests for model version endpoint. @@ -63,15 +64,15 @@ def memory_request(self, memory_request): @property def gpu_request(self) -> Optional[str]: return self._gpu_request - + @gpu_request.setter def gpu_request(self, gpu_request): self._gpu_request = gpu_request - + @property def gpu_name(self) -> Optional[str]: return self._gpu_name - + @gpu_name.setter def gpu_name(self, gpu_name): self._gpu_name = gpu_name diff --git a/python/sdk/merlin/version.py b/python/sdk/merlin/version.py index dd7993990..ff1f7b009 100644 --- a/python/sdk/merlin/version.py +++ b/python/sdk/merlin/version.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License.
-VERSION = "0.25.0" +VERSION = "0.0.0" diff --git a/python/sdk/test/integration_test.py b/python/sdk/test/integration_test.py index 529e65b09..2cf30ea0e 100644 --- a/python/sdk/test/integration_test.py +++ b/python/sdk/test/integration_test.py @@ -18,7 +18,6 @@ import pandas as pd import pytest -from merlin.endpoint import Status from merlin.logger import Logger, LoggerConfig, LoggerMode from merlin.model import ModelType from merlin.resource_request import ResourceRequest @@ -26,7 +25,7 @@ from recursive_diff import recursive_eq import merlin -from merlin import DeploymentMode +from merlin import DeploymentMode, MetricsType request_json = {"instances": [[2.8, 1.0, 6.8, 0.4], [3.1, 1.4, 4.5, 1.6]]} tensorflow_request_json = { @@ -412,13 +411,25 @@ def test_resource_request( merlin.undeploy(v) + @pytest.mark.gpu @pytest.mark.integration -@pytest.mark.parametrize("deployment_mode", [DeploymentMode.RAW_DEPLOYMENT, DeploymentMode.SERVERLESS]) -def test_resource_request_with_gpu(integration_test_url, project_name, deployment_mode, use_google_oauth, requests, gpu_config): +@pytest.mark.parametrize( + "deployment_mode", [DeploymentMode.RAW_DEPLOYMENT, DeploymentMode.SERVERLESS] +) +def test_resource_request_with_gpu( + integration_test_url, + project_name, + deployment_mode, + use_google_oauth, + requests, + gpu_config, +): merlin.set_url(integration_test_url, use_google_oauth=use_google_oauth) merlin.set_project(project_name) - merlin.set_model(f"resource-request-with-gpu-{deployment_mode_suffix(deployment_mode)}", ModelType.XGBOOST) + merlin.set_model( + f"gpu-{deployment_mode_suffix(deployment_mode)}", ModelType.XGBOOST + ) model_dir = "test/xgboost-model" @@ -435,7 +446,10 @@ def test_resource_request_with_gpu(integration_test_url, project_name, deploymen resource_request = ResourceRequest(1, 1, "100m", "200Mi", **gpu_config) endpoint = merlin.deploy( - v, environment_name=default_env.name, resource_request=resource_request, deployment_mode=deployment_mode + v, + environment_name=default_env.name, + resource_request=resource_request, + deployment_mode=deployment_mode, ) resp = requests.post(f"{endpoint.url}", json=request_json) @@ -1046,5 +1060,63 @@ def test_deployment_mode_for_serving_model( undeploy_all_version() + +@pytest.mark.integration +def test_redeploy_model(integration_test_url, project_name, use_google_oauth, requests): + """ + Validate that calling 'merlin.deploy' twice in a row redeploys the existing model version endpoint + """ + + merlin.set_url(integration_test_url, use_google_oauth=use_google_oauth) + merlin.set_project(project_name) + merlin.set_model("model-sdk-redeploy", ModelType.TENSORFLOW) + model_dir = "test/tensorflow-model" + + undeploy_all_version() + + # Upload new model version: v1 + with merlin.new_model_version() as v1: + merlin.log_model(model_dir=model_dir) + + # Deploy using serverless with RPS autoscaling policy + endpoint = merlin.deploy( + v1, + autoscaling_policy=merlin.AutoscalingPolicy( + metrics_type=merlin.MetricsType.RPS, target_value=20 + ), + ) + + resp = requests.post(f"{endpoint.url}", json=tensorflow_request_json) + + assert resp.status_code == 200 + assert resp.json() is not None + assert len(resp.json()["predictions"]) == len(tensorflow_request_json["instances"]) + + # Check the autoscaling policy of v1 + assert endpoint.autoscaling_policy.metrics_type == MetricsType.RPS + assert endpoint.autoscaling_policy.target_value == 20 + + # Second deployment (v2): redeploy the same version with a CPU utilization autoscaling policy + new_endpoint = merlin.deploy( + v1,
autoscaling_policy=merlin.AutoscalingPolicy( + metrics_type=merlin.MetricsType.CPU_UTILIZATION, target_value=10 + ), + ) + + resp = requests.post(f"{new_endpoint.url}", json=tensorflow_request_json) + + assert resp.status_code == 200 + assert resp.json() is not None + assert len(resp.json()["predictions"]) == len(tensorflow_request_json["instances"]) + + # Check that the endpoint remains the same + assert endpoint.url == new_endpoint.url + # Check the autoscaling policy of v2 + assert new_endpoint.autoscaling_policy.metrics_type == MetricsType.CPU_UTILIZATION + assert new_endpoint.autoscaling_policy.target_value == 10 + + undeploy_all_version() + + def deployment_mode_suffix(deployment_mode: DeploymentMode): return deployment_mode.value.lower()[0:1] diff --git a/python/sdk/test/model_test.py b/python/sdk/test/model_test.py index d9c66ed92..c5ac44e17 100644 --- a/python/sdk/test/model_test.py +++ b/python/sdk/test/model_test.py @@ -18,11 +18,11 @@ import client import client as cl +import merlin import pytest -from merlin.autoscaling import ( - RAW_DEPLOYMENT_DEFAULT_AUTOSCALING_POLICY, - SERVERLESS_DEFAULT_AUTOSCALING_POLICY, -) +from merlin import AutoscalingPolicy, DeploymentMode, MetricsType +from merlin.autoscaling import (RAW_DEPLOYMENT_DEFAULT_AUTOSCALING_POLICY, + SERVERLESS_DEFAULT_AUTOSCALING_POLICY) from merlin.batch.config import PredictionJobConfig, ResultType from merlin.batch.job import JobStatus from merlin.batch.sink import BigQuerySink, SaveMode @@ -32,17 +32,18 @@ from merlin.protocol import Protocol from urllib3_mock import Responses -import merlin -from merlin import AutoscalingPolicy, DeploymentMode, MetricsType - responses = Responses("requests.packages.urllib3") default_resource_request = cl.ResourceRequest(1, 1, "100m", "128Mi") -gpu = cl.GPU( +gpu = cl.GPUConfig( + name="nvidia-tesla-p4", values=["1", "4", "8"], - display_name="nvidia-tesla-p4", resource_type="nvidia.com/gpu", node_selector={"cloud.google.com/gke-accelerator": "nvidia-tesla-p4"}, + tolerations=[ + cl.GPUToleration(key="caraml/nvidia-tesla-p4", operator="Equal", value="enabled", effect="NoSchedule"), + cl.GPUToleration(key="nvidia.com/gpu", operator="Equal", value="present", effect="NoSchedule"), + ], ) env_1 = cl.Environment( @@ -96,9 +97,8 @@ max_replica=1, cpu_request="100m", memory_request="128Mi", + gpu_name="nvidia-tesla-p4", gpu_request="1", - gpu_resource_type="nvidia.com/gpu", - gpu_node_selector={"cloud.google.com/gke-accelerator": "nvidia-tesla-p4"}, ) ep5 = cl.VersionEndpoint( "789", @@ -346,6 +346,14 @@ def test_deploy(self, version): status=200, content_type="application/json", ) + # This is the additional check which deploy makes to determine if there are any existing endpoints associated + responses.add( + "GET", + "/v1/models/1/versions/1/endpoint", + body=json.dumps([]), + status=200, + content_type="application/json", + ) responses.add( "POST", "/v1/models/1/versions/1/endpoint", @@ -381,6 +389,14 @@ def test_deploy_upiv1(self, version): status=200, content_type="application/json", ) + # This is the additional check which deploy makes to determine if there are any existing endpoints associated + responses.add( + "GET", + "/v1/models/1/versions/1/endpoint", + body=json.dumps([]), + status=200, + content_type="application/json", + ) responses.add( "POST", "/v1/models/1/versions/1/endpoint", @@ -416,6 +432,14 @@ def test_deploy_using_raw_deployment_mode(self, version): status=200, content_type="application/json", ) + # This is the additional check which deploy makes to determine if 
there are any existing endpoints associated + responses.add( + "GET", + "/v1/models/1/versions/1/endpoint", + body=json.dumps([]), + status=200, + content_type="application/json", + ) responses.add( "POST", "/v1/models/1/versions/1/endpoint", @@ -452,6 +476,13 @@ def test_deploy_with_autoscaling_policy(self, version): status=200, content_type="application/json", ) + # This is the additional check which deploy makes to determine if there are any existing endpoints associated + responses.add( + "GET", "/v1/models/1/versions/1/endpoint", + body=json.dumps([]), + status=200, + content_type="application/json", + ) responses.add( "POST", "/v1/models/1/versions/1/endpoint", @@ -505,6 +536,14 @@ def test_deploy_default_env(self, version): status=200, content_type="application/json", ) + # This is the additional check which deploy makes to determine if there are any existing endpoints associated + responses.add( + "GET", + "/v1/models/1/versions/1/endpoint", + body=json.dumps([]), + status=200, + content_type="application/json", + ) responses.add( "POST", "/v1/models/1/versions/1/endpoint", @@ -528,6 +567,52 @@ def test_deploy_default_env(self, version): assert endpoint.environment.cluster == env_1.cluster assert endpoint.environment.name == env_1.name + @responses.activate + def test_redeploy_model(self, version): + responses.add( + "GET", + "/v1/environments", + body=json.dumps([env_1.to_dict(), env_2.to_dict()]), + status=200, + content_type="application/json", + ) + # This is the additional check which deploy makes to determine if there are any existing endpoints associated + responses.add( + "GET", + "/v1/models/1/versions/1/endpoint", + body=json.dumps([ep3.to_dict()]), + status=200, + content_type="application/json", + ) + responses.add( + "PUT", + "/v1/models/1/versions/1/endpoint/1234", + body=json.dumps(ep4.to_dict()), + status=200, + content_type="application/json", + ) + responses.add( + "GET", + "/v1/models/1/versions/1/endpoint", + body=json.dumps([ep4.to_dict()]), + status=200, + content_type="application/json", + ) + + # Redeployment (add autoscaling policy and change deployment mode) + endpoint = version.deploy(environment_name=env_1.name, + autoscaling_policy=AutoscalingPolicy(metrics_type=MetricsType.CPU_UTILIZATION, + target_value=10)) + + assert endpoint.id == ep4.id + assert endpoint.status.value == ep4.status + assert endpoint.environment_name == ep4.environment_name + assert endpoint.environment.cluster == env_1.cluster + assert endpoint.environment.name == env_1.name + assert endpoint.deployment_mode == DeploymentMode.SERVERLESS + assert endpoint.autoscaling_policy.metrics_type == MetricsType.CPU_UTILIZATION + assert endpoint.autoscaling_policy.target_value == 10 + @responses.activate def test_deploy_with_gpu(self, version): responses.add( @@ -561,16 +646,12 @@ def test_deploy_with_gpu(self, version): assert endpoint.environment.name == env_3.name assert endpoint.deployment_mode == DeploymentMode.SERVERLESS assert ( - endpoint.resource_request.gpu_request - == resource_request_with_gpu.gpu_request - ) - assert ( - endpoint.resource_request.gpu_resource_type - == resource_request_with_gpu.gpu_resource_type + endpoint.resource_request.gpu_name + == resource_request_with_gpu.gpu_name ) assert ( - endpoint.resource_request.gpu_node_selector - == resource_request_with_gpu.gpu_node_selector + endpoint.resource_request.gpu_request + == resource_request_with_gpu.gpu_request ) @responses.activate diff --git a/swagger.yaml b/swagger.yaml index c7a2dc167..346039576 100644 --- 
a/swagger.yaml +++ b/swagger.yaml @@ -1056,7 +1056,7 @@ definitions: gpus: type: "array" items: - $ref: "#/definitions/GPU" + $ref: "#/definitions/GPUConfig" created_at: type: "string" format: "date-time" @@ -1436,14 +1436,10 @@ definitions: type: "string" memory_request: type: "string" - gpu_resource_type: + gpu_name: type: "string" gpu_request: type: "string" - gpu_node_selector: - type: "object" - additionalProperties: - type: "string" AutoscalingPolicy: type: "object" @@ -1538,23 +1534,44 @@ definitions: items: $ref: "#/definitions/EnvVar" - GPU: + GPUConfig: type: "object" properties: + name: + type: "string" values: type: "array" items: type: string - display_name: - type: "string" resource_type: type: "string" node_selector: type: "object" additionalProperties: type: "string" - monthly_cost_per_gpu: + tolerations: + type: "array" + items: + $ref: "#/definitions/GPUToleration" + min_monthly_cost_per_gpu: type: "number" + max_monthly_cost_per_gpu: + type: "number" + + GPUToleration: + type: "object" + properties: + key: + type: "string" + operator: + type: "string" + value: + type: "string" + effect: + type: "string" + toleration_seconds: + type: "integer" + format: "int64" PredictionJobResourceRequest: type: "object" diff --git a/ui/src/components/ResourcesConfigTable.js b/ui/src/components/ResourcesConfigTable.js index 1a0c98b79..cce0d4e9e 100644 --- a/ui/src/components/ResourcesConfigTable.js +++ b/ui/src/components/ResourcesConfigTable.js @@ -14,32 +14,53 @@ * limitations under the License. */ -import React from "react"; -import PropTypes from "prop-types"; import { EuiDescriptionList } from "@elastic/eui"; +import PropTypes from "prop-types"; +import React from "react"; export const ResourcesConfigTable = ({ - resourceRequest: { cpu_request, memory_request, min_replica, max_replica } + resourceRequest: { + cpu_request, + memory_request, + min_replica, + max_replica, + gpu_name, + gpu_request, + }, }) => { const items = [ { title: "CPU Request", - description: cpu_request + description: cpu_request, }, { title: "Memory Request", - description: memory_request + description: memory_request, }, { title: "Min Replicas", - description: min_replica + description: min_replica, }, { title: "Max Replicas", - description: max_replica - } + description: max_replica, + }, ]; + if (gpu_name !== undefined && gpu_name !== "") { + items.push({ + title: "GPU Name", + description: gpu_name, + }); + } + + if (gpu_request !== undefined && gpu_request !== "0") { + items.push({ + title: "GPU Request", + description: gpu_request, + }); + } + return ( { const modelMinCost = calculateCost( versionEndpoint.resource_request.min_replica, versionEndpoint.resource_request.cpu_request, - versionEndpoint.resource_request.memory_request + versionEndpoint.resource_request.memory_request, + versionEndpoint.resource_request.gpu_request, + versionEndpoint.resource_request.min_monthly_cost_per_gpu ); const modelMaxCost = calculateCost( versionEndpoint.resource_request.max_replica, versionEndpoint.resource_request.cpu_request, - versionEndpoint.resource_request.memory_request + versionEndpoint.resource_request.memory_request, + versionEndpoint.resource_request.gpu_request, + versionEndpoint.resource_request.max_monthly_cost_per_gpu ); const transformerMinCost = @@ -73,6 +77,9 @@ export const CostEstimationPanel = ({ versionEndpoint }) => { {versionEndpoint.resource_request.memory_request} Memory + {versionEndpoint.resource_request.gpu_request !== "0" && ( + {versionEndpoint.resource_request.gpu_request} GPU + )} diff 
--git a/ui/src/pages/version/components/forms/components/ResourcesPanel.js b/ui/src/pages/version/components/forms/components/ResourcesPanel.js index 20907c5a1..f9d908e03 100644 --- a/ui/src/pages/version/components/forms/components/ResourcesPanel.js +++ b/ui/src/pages/version/components/forms/components/ResourcesPanel.js @@ -1,11 +1,6 @@ -import React, { - useMemo, - useContext, - useState, - useEffect, - useCallback, -} from "react"; +import { FormLabelWithToolTip, useOnChangeHandler } from "@caraml-dev/ui-lib"; import { + EuiCallOut, EuiDualRange, EuiFieldText, EuiFlexGroup, @@ -13,30 +8,40 @@ import { EuiForm, EuiFormRow, EuiSpacer, - EuiCallOut, EuiSuperSelect, } from "@elastic/eui"; -import { FormLabelWithToolTip, useOnChangeHandler } from "@caraml-dev/ui-lib"; -import { Panel } from "./Panel"; -import { calculateCost } from "../../../../../utils/costEstimation"; +import React, { + useCallback, + useContext, + useEffect, + useMemo, + useState, +} from "react"; import EnvironmentsContext from "../../../../../providers/environments/context"; +import { calculateCost } from "../../../../../utils/costEstimation"; +import { Panel } from "./Panel"; const maxTicks = 20; export const ResourcesPanel = ({ - environment, + environment: initEnvironment, + isGPUEnabled, resourcesConfig, onChangeHandler, errors = {}, maxAllowedReplica, }) => { + const environment = initEnvironment; const environments = useContext(EnvironmentsContext); + + const { onChange } = useOnChangeHandler(onChangeHandler); + const gpus = useMemo(() => { const dict = {}; environments.forEach((env) => { if (env.name === environment) { env.gpus.forEach((gpu) => { - dict[gpu.display_name] = gpu; + dict[gpu.name] = gpu; }); } }); @@ -47,37 +52,42 @@ export const ResourcesPanel = ({ useEffect(() => { if ( resourcesConfig && - resourcesConfig.gpu_display_name && - resourcesConfig.gpu_display_name !== "" && - resourcesConfig.gpu_display_name !== "None" && + resourcesConfig.gpu_name && + resourcesConfig.gpu_name !== "" && + resourcesConfig.gpu_name !== "None" && Object.keys(gpus).length > 0 ) { - const gpu = gpus[resourcesConfig.gpu_display_name]; - const gpuValues = gpu.values.map((value) => ({ - value: value, - inputDisplay: value, - })); - setGpuValueOptions(gpuValues); + const gpu = gpus[resourcesConfig.gpu_name]; + if (!!gpu) { + const gpuValues = gpu.values.map((value) => ({ + value: value, + inputDisplay: value, + })); + setGpuValueOptions(gpuValues); + } } else { setGpuValueOptions([{ value: "None", inputDisplay: "None" }]); } - }, [resourcesConfig, resourcesConfig.gpu_display_name, gpus]); + }, [resourcesConfig, resourcesConfig.gpu_name, gpus]); - const { onChange } = useOnChangeHandler(onChangeHandler); const replicasError = useMemo( () => [...(errors.min_replica || []), ...(errors.max_replica || [])], [errors.min_replica, errors.max_replica] ); - const onGPUTypeChange = (gpu_display_name) => { - if (gpu_display_name === "None") { + const onGPUTypeChange = (gpu_name) => { + if (gpu_name === "None") { resetGPU(); return; } - onChange("gpu_display_name")(gpu_display_name); - onChange("gpu_resource_type")(gpus[gpu_display_name].resource_type); - onChange("gpu_node_selector")(gpus[gpu_display_name].node_selector); + onChange("gpu_name")(gpu_name); onChange("gpu_request")(undefined); + onChange("min_monthly_cost_per_gpu")( + gpus[gpu_name].min_monthly_cost_per_gpu + ); + onChange("max_monthly_cost_per_gpu")( + gpus[gpu_name].max_monthly_cost_per_gpu + ); }; const onGPUValueChange = (value) => { @@ -85,15 +95,17 @@ export const 
ResourcesPanel = ({ }; const resetGPU = useCallback(() => { - onChange("gpu_display_name")(undefined); - onChange("gpu_resource_type")(undefined); - onChange("gpu_node_selector")(undefined); + onChange("gpu_name")(undefined); onChange("gpu_request")(undefined); + onChange("min_monthly_cost_per_gpu")(undefined); + onChange("max_monthly_cost_per_gpu")(undefined); }, [onChange]); useEffect(() => { - resetGPU(); - }, [environment, resetGPU, onChange]); + if (environment !== initEnvironment) { + resetGPU(); + } + }, [environment, initEnvironment, resetGPU, onChange]); return ( @@ -146,15 +158,15 @@ export const ResourcesPanel = ({ - {Object.keys(gpus).length > 0 && ( + {isGPUEnabled && Object.keys(gpus).length > 0 && ( <> } fullWidth @@ -166,12 +178,12 @@ export const ResourcesPanel = ({ value: "None", inputDisplay: "None", }, - ...Object.keys(gpus).map((display_name) => ({ - value: display_name, - inputDisplay: display_name, + ...Object.keys(gpus).map((name) => ({ + value: name, + inputDisplay: name, })), ]} - valueOfSelected={resourcesConfig.gpu_display_name || "None"} + valueOfSelected={resourcesConfig.gpu_name || "None"} hasDividers /> @@ -192,7 +204,12 @@ export const ResourcesPanel = ({ @@ -241,13 +258,17 @@ export const ResourcesPanel = ({ {calculateCost( resourcesConfig.min_replica, resourcesConfig.cpu_request, - resourcesConfig.memory_request + resourcesConfig.memory_request, + resourcesConfig.gpu_request, + resourcesConfig.min_monthly_cost_per_gpu ).toFixed(2)} - {calculateCost( resourcesConfig.max_replica, resourcesConfig.cpu_request, - resourcesConfig.memory_request + resourcesConfig.memory_request, + resourcesConfig.gpu_request, + resourcesConfig.max_monthly_cost_per_gpu ).toFixed(2)}{" "} / Month

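For reference, a minimal SDK-side sketch of how the renamed GPU fields above (gpu_name, gpu_request) are meant to be used when deploying. The URL, project, model name and GPU values are illustrative only and must match whatever the target environment's gpus configuration actually exposes:

import merlin
from merlin.model import ModelType
from merlin.resource_request import ResourceRequest

merlin.set_url("http://merlin.example.com")  # illustrative URL
merlin.set_project("sample")
merlin.set_model("sample-gpu-model", ModelType.XGBOOST)

with merlin.new_model_version() as v:
    merlin.log_model(model_dir="test/xgboost-model")

# gpu_name must match a GPU name configured for the environment,
# and gpu_request must be one of the values allowed for that GPU.
resource_request = ResourceRequest(1, 2, "500m", "512Mi")
resource_request.gpu_name = "nvidia-tesla-p4"
resource_request.gpu_request = "1"

endpoint = merlin.deploy(v, resource_request=resource_request)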
diff --git a/ui/src/pages/version/components/forms/steps/ModelStep.js b/ui/src/pages/version/components/forms/steps/ModelStep.js index 4343f3afc..5c941344e 100644 --- a/ui/src/pages/version/components/forms/steps/ModelStep.js +++ b/ui/src/pages/version/components/forms/steps/ModelStep.js @@ -1,17 +1,17 @@ -import React, { useContext } from "react"; -import { EuiFlexGroup, EuiFlexItem } from "@elastic/eui"; import { FormContext, FormValidationContext, get, useOnChangeHandler, } from "@caraml-dev/ui-lib"; +import { EuiFlexGroup, EuiFlexItem } from "@elastic/eui"; +import React, { useContext } from "react"; import { appConfig } from "../../../../../config"; +import { PROTOCOL } from "../../../../../services/version_endpoint/VersionEndpoint"; import { DeploymentConfigPanel } from "../components/DeploymentConfigPanel"; import { EnvVariablesPanel } from "../components/EnvVariablesPanel"; import { LoggerPanel } from "../components/LoggerPanel"; import { ResourcesPanel } from "../components/ResourcesPanel"; -import { PROTOCOL } from "../../../../../services/version_endpoint/VersionEndpoint"; export const ModelStep = ({ version, isEnvironmentDisabled = false }) => { const { data, onChangeHandler } = useContext(FormContext); @@ -34,6 +34,7 @@ export const ModelStep = ({ version, isEnvironmentDisabled = false }) => { { const { - data: { transformer, logger, protocol}, - onChangeHandler + data: { transformer, logger, protocol }, + onChangeHandler, } = useContext(FormContext); const { onChange } = useOnChangeHandler(onChangeHandler); const { errors } = useContext(FormValidationContext); @@ -35,22 +35,22 @@ export const TransformerStep = () => { <> - { protocol !== PROTOCOL.UPI_V1 && ( - - - - ) - } + {protocol !== PROTOCOL.UPI_V1 && ( + + + + )} { +export const calculateCost = ( + replica, + cpu, + memory, + gpu = 0, + monthlyCostPerGPU = 0 +) => { const parsed_cpu = parseCpu(cpu); const parsed_memory_gb = parseMemoryInGi(memory); + const parsed_gpu = parseFloat(gpu) || 0; return ( replica * (parsed_cpu * costEstimationConfig.cpuCost + - parsed_memory_gb * costEstimationConfig.memoryCost) + parsed_memory_gb * costEstimationConfig.memoryCost + + parsed_gpu * monthlyCostPerGPU) ); };
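To make the extended cost estimate above concrete, here is a small worked example of the formula replica * (cpu * cpuCost + memoryGi * memoryCost + gpu * monthlyCostPerGPU). The unit costs are made-up placeholders; the real values come from costEstimationConfig and the environment's per-GPU pricing:

# Assumed monthly unit costs (placeholders, not the platform's actual rates)
cpu_cost, memory_cost, monthly_cost_per_gpu = 10.0, 5.0, 300.0

# 2 replicas, each requesting 1 vCPU, 2Gi of memory and 1 GPU
replica, cpu, memory_gi, gpu = 2, 1.0, 2.0, 1

monthly_cost = replica * (cpu * cpu_cost + memory_gi * memory_cost + gpu * monthly_cost_per_gpu)
# 2 * (10 + 10 + 300) = 640 per month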